TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
51 #include <blaze/math/Exception.h>
57 #include <blaze/math/Functions.h>
58 #include <blaze/math/shims/Reset.h>
60 #include <blaze/math/SIMD.h>
102 #include <blaze/system/BLAS.h>
103 #include <blaze/system/Blocking.h>
105 #include <blaze/system/Thresholds.h>
106 #include <blaze/util/Assert.h>
107 #include <blaze/util/Complex.h>
111 #include <blaze/util/DisableIf.h>
112 #include <blaze/util/EnableIf.h>
114 #include <blaze/util/InvalidType.h>
116 #include <blaze/util/mpl/And.h>
117 #include <blaze/util/mpl/If.h>
118 #include <blaze/util/mpl/Not.h>
119 #include <blaze/util/mpl/Or.h>
120 #include <blaze/util/Types.h>
129 
130 
131 namespace blaze {
132 
133 //=================================================================================================
134 //
135 // CLASS TDMATDMATMULTEXPR
136 //
137 //=================================================================================================
138 
139 //*************************************************************************************************
146 template< typename MT1 // Type of the left-hand side dense matrix
147  , typename MT2 > // Type of the right-hand side dense matrix
148 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
149  , private MatMatMultExpr
150  , private Computation
151 {
152  private:
153  //**Type definitions****************************************************************************
160  //**********************************************************************************************
161 
162  //**********************************************************************************************
164  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
165  //**********************************************************************************************
166 
167  //**********************************************************************************************
169  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
170  //**********************************************************************************************
171 
172  //**********************************************************************************************
174 
178  template< typename T1, typename T2, typename T3 >
179  struct IsEvaluationRequired {
180  enum : bool { value = ( evaluateLeft || evaluateRight ) };
181  };
183  //**********************************************************************************************
184 
185  //**********************************************************************************************
187 
190  template< typename T1, typename T2, typename T3 >
191  struct UseBlasKernel {
193  HasMutableDataAccess<T1>::value &&
194  HasConstDataAccess<T2>::value &&
195  HasConstDataAccess<T3>::value &&
196  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
197  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
198  IsBLASCompatible< ElementType_<T1> >::value &&
199  IsBLASCompatible< ElementType_<T2> >::value &&
200  IsBLASCompatible< ElementType_<T3> >::value &&
201  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
202  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
203  };
205  //**********************************************************************************************
206 
207  //**********************************************************************************************
209 
212  template< typename T1, typename T2, typename T3 >
213  struct UseVectorizedDefaultKernel {
214  enum : bool { value = useOptimizedKernels &&
215  !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
216  !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
217  !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
218  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
219  AreSIMDCombinable< ElementType_<T1>
220  , ElementType_<T2>
221  , ElementType_<T3> >::value &&
222  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
223  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
224  };
226  //**********************************************************************************************
227 
228  public:
229  //**Type definitions****************************************************************************
236  typedef const ElementType ReturnType;
237  typedef const ResultType CompositeType;
238 
240  typedef If_< IsExpression<MT1>, const MT1, const MT1& > LeftOperand;
241 
243  typedef If_< IsExpression<MT2>, const MT2, const MT2& > RightOperand;
244 
247 
250  //**********************************************************************************************
251 
252  //**Compilation flags***************************************************************************
254  enum : bool { simdEnabled = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
255  MT1::simdEnabled && MT2::simdEnabled &&
258 
260  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
261  !evaluateRight && MT2::smpAssignable };
262  //**********************************************************************************************
263 
264  //**SIMD properties*****************************************************************************
266  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
267  //**********************************************************************************************
268 
269  //**Constructor*********************************************************************************
275  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
276  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
277  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
278  {
279  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
280  }
281  //**********************************************************************************************
282 
283  //**Access operator*****************************************************************************
290  inline ReturnType operator()( size_t i, size_t j ) const {
291  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
292  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
293 
294  if( IsDiagonal<MT1>::value ) {
295  return lhs_(i,i) * rhs_(i,j);
296  }
297  else if( IsDiagonal<MT2>::value ) {
298  return lhs_(i,j) * rhs_(j,j);
299  }
301  const size_t begin( ( IsUpper<MT1>::value )
302  ?( ( IsLower<MT2>::value )
303  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
304  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
305  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
306  :( ( IsLower<MT2>::value )
307  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
308  :( 0UL ) ) );
309  const size_t end( ( IsLower<MT1>::value )
310  ?( ( IsUpper<MT2>::value )
311  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
312  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
313  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
314  :( ( IsUpper<MT2>::value )
315  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
316  :( lhs_.columns() ) ) );
317 
318  if( begin >= end ) return ElementType();
319 
320  const size_t n( end - begin );
321 
322  return subvector( row( lhs_, i ), begin, n ) * subvector( column( rhs_, j ), begin, n );
323  }
324  else {
325  return row( lhs_, i ) * column( rhs_, j );
326  }
327  }
328  //**********************************************************************************************
329 
330  //**At function*********************************************************************************
338  inline ReturnType at( size_t i, size_t j ) const {
339  if( i >= lhs_.rows() ) {
340  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
341  }
342  if( j >= rhs_.columns() ) {
343  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
344  }
345  return (*this)(i,j);
346  }
347  //**********************************************************************************************
348 
349  //**Rows function*******************************************************************************
354  inline size_t rows() const noexcept {
355  return lhs_.rows();
356  }
357  //**********************************************************************************************
358 
359  //**Columns function****************************************************************************
364  inline size_t columns() const noexcept {
365  return rhs_.columns();
366  }
367  //**********************************************************************************************
368 
369  //**Left operand access*************************************************************************
374  inline LeftOperand leftOperand() const noexcept {
375  return lhs_;
376  }
377  //**********************************************************************************************
378 
379  //**Right operand access************************************************************************
384  inline RightOperand rightOperand() const noexcept {
385  return rhs_;
386  }
387  //**********************************************************************************************
388 
389  //**********************************************************************************************
395  template< typename T >
396  inline bool canAlias( const T* alias ) const noexcept {
397  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
398  }
399  //**********************************************************************************************
400 
401  //**********************************************************************************************
407  template< typename T >
408  inline bool isAliased( const T* alias ) const noexcept {
409  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
410  }
411  //**********************************************************************************************
412 
413  //**********************************************************************************************
418  inline bool isAligned() const noexcept {
419  return lhs_.isAligned() && rhs_.isAligned();
420  }
421  //**********************************************************************************************
422 
423  //**********************************************************************************************
428  inline bool canSMPAssign() const noexcept {
429  return ( !BLAZE_BLAS_IS_PARALLEL ||
430  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
431  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD );
432  }
433  //**********************************************************************************************
434 
435  private:
436  //**Member variables****************************************************************************
437  LeftOperand lhs_;
438  RightOperand rhs_;
439  //**********************************************************************************************
440 
441  //**Assignment to dense matrices****************************************************************
454  template< typename MT // Type of the target dense matrix
455  , bool SO > // Storage order of the target dense matrix
456  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
457  {
459 
460  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
461  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
462 
463  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
464  return;
465  }
466  else if( rhs.lhs_.columns() == 0UL ) {
467  reset( ~lhs );
468  return;
469  }
470 
471  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
472  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
473 
474  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
475  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
476  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
477  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
478  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
479  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
480 
481  TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
482  }
484  //**********************************************************************************************
485 
486  //**Assignment to dense matrices (kernel selection)*********************************************
497  template< typename MT3 // Type of the left-hand side target matrix
498  , typename MT4 // Type of the left-hand side matrix operand
499  , typename MT5 > // Type of the right-hand side matrix operand
500  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
501  {
503  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
504  selectSmallAssignKernel( C, A, B );
505  else
506  selectBlasAssignKernel( C, A, B );
507  }
509  //**********************************************************************************************
510 
511  //**Default assignment to row-major dense matrices (general/general)****************************
525  template< typename MT3 // Type of the left-hand side target matrix
526  , typename MT4 // Type of the left-hand side matrix operand
527  , typename MT5 > // Type of the right-hand side matrix operand
528  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
529  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
530  {
531  const size_t M( A.rows() );
532  const size_t N( B.columns() );
533  const size_t K( A.columns() );
534 
535  for( size_t i=0UL; i<M; ++i )
536  {
537  const size_t kbegin( ( IsUpper<MT4>::value )
538  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
539  :( 0UL ) );
540  const size_t kend( ( IsLower<MT4>::value )
541  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
542  :( K ) );
543  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
544 
545  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
546  for( size_t j=0UL; j<N; ++j ) {
547  reset( (~C)(i,j) );
548  }
549  continue;
550  }
551 
552  {
553  const size_t jbegin( ( IsUpper<MT5>::value )
554  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
555  :( 0UL ) );
556  const size_t jend( ( IsLower<MT5>::value )
557  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
558  :( N ) );
559  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
560 
561  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
562  for( size_t j=0UL; j<jbegin; ++j ) {
563  reset( (~C)(i,j) );
564  }
565  }
566  else if( IsStrictlyUpper<MT5>::value ) {
567  reset( (~C)(i,0UL) );
568  }
569  for( size_t j=jbegin; j<jend; ++j ) {
570  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
571  }
572  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
573  for( size_t j=jend; j<N; ++j ) {
574  reset( (~C)(i,j) );
575  }
576  }
577  else if( IsStrictlyLower<MT5>::value ) {
578  reset( (~C)(i,N-1UL) );
579  }
580  }
581 
582  for( size_t k=kbegin+1UL; k<kend; ++k )
583  {
584  const size_t jbegin( ( IsUpper<MT5>::value )
585  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
586  :( 0UL ) );
587  const size_t jend( ( IsLower<MT5>::value )
588  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
589  :( N ) );
590  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
591 
592  for( size_t j=jbegin; j<jend; ++j ) {
593  (~C)(i,j) += A(i,k) * B(k,j);
594  }
595  if( IsLower<MT5>::value ) {
596  (~C)(i,jend) = A(i,k) * B(k,jend);
597  }
598  }
599  }
600  }
602  //**********************************************************************************************
603 
604  //**Default assignment to column-major dense matrices (general/general)*************************
618  template< typename MT3 // Type of the left-hand side target matrix
619  , typename MT4 // Type of the left-hand side matrix operand
620  , typename MT5 > // Type of the right-hand side matrix operand
621  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
622  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
623  {
624  const size_t M( A.rows() );
625  const size_t N( B.columns() );
626  const size_t K( A.columns() );
627 
628  for( size_t j=0UL; j<N; ++j )
629  {
630  const size_t kbegin( ( IsLower<MT5>::value )
631  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
632  :( 0UL ) );
633  const size_t kend( ( IsUpper<MT5>::value )
634  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
635  :( K ) );
636  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
637 
638  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
639  for( size_t i=0UL; i<M; ++i ) {
640  reset( (~C)(i,j) );
641  }
642  continue;
643  }
644 
645  {
646  const size_t ibegin( ( IsLower<MT4>::value )
647  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
648  :( 0UL ) );
649  const size_t iend( ( IsUpper<MT4>::value )
650  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
651  :( M ) );
652  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
653 
654  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
655  for( size_t i=0UL; i<ibegin; ++i ) {
656  reset( (~C)(i,j) );
657  }
658  }
659  else if( IsStrictlyLower<MT4>::value ) {
660  reset( (~C)(0UL,j) );
661  }
662  for( size_t i=ibegin; i<iend; ++i ) {
663  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
664  }
665  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
666  for( size_t i=iend; i<M; ++i ) {
667  reset( (~C)(i,j) );
668  }
669  }
670  else if( IsStrictlyUpper<MT4>::value ) {
671  reset( (~C)(M-1UL,j) );
672  }
673  }
674 
675  for( size_t k=kbegin+1UL; k<kend; ++k )
676  {
677  const size_t ibegin( ( IsLower<MT4>::value )
678  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
679  :( 0UL ) );
680  const size_t iend( ( IsUpper<MT4>::value )
681  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
682  :( M ) );
683  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
684 
685  for( size_t i=ibegin; i<iend; ++i ) {
686  (~C)(i,j) += A(i,k) * B(k,j);
687  }
688  if( IsUpper<MT4>::value ) {
689  (~C)(iend,j) = A(iend,k) * B(k,j);
690  }
691  }
692  }
693  }
695  //**********************************************************************************************
696 
697  //**Default assignment to row-major dense matrices (general/diagonal)***************************
711  template< typename MT3 // Type of the left-hand side target matrix
712  , typename MT4 // Type of the left-hand side matrix operand
713  , typename MT5 > // Type of the right-hand side matrix operand
714  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
715  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
716  {
717  const size_t M( A.rows() );
718  const size_t N( B.columns() );
719 
720  const size_t block( BLOCK_SIZE );
721 
722  for( size_t ii=0UL; ii<M; ii+=block ) {
723  const size_t iend( min( M, ii+block ) );
724  for( size_t jj=0UL; jj<N; jj+=block ) {
725  const size_t jend( min( N, jj+block ) );
726  for( size_t i=ii; i<iend; ++i )
727  {
728  const size_t jbegin( ( IsUpper<MT4>::value )
729  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
730  :( jj ) );
731  const size_t jpos( ( IsLower<MT4>::value )
732  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
733  :( jend ) );
734 
735  if( IsUpper<MT4>::value ) {
736  for( size_t j=jj; j<jbegin; ++j ) {
737  reset( (~C)(i,j) );
738  }
739  }
740  for( size_t j=jbegin; j<jpos; ++j ) {
741  (~C)(i,j) = A(i,j) * B(j,j);
742  }
743  if( IsLower<MT4>::value ) {
744  for( size_t j=jpos; j<jend; ++j ) {
745  reset( (~C)(i,j) );
746  }
747  }
748  }
749  }
750  }
751  }
753  //**********************************************************************************************
754 
755  //**Default assignment to column-major dense matrices (general/diagonal)************************
769  template< typename MT3 // Type of the left-hand side target matrix
770  , typename MT4 // Type of the left-hand side matrix operand
771  , typename MT5 > // Type of the right-hand side matrix operand
772  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
773  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
774  {
775  const size_t M( A.rows() );
776  const size_t N( B.columns() );
777 
778  for( size_t j=0UL; j<N; ++j )
779  {
780  const size_t ibegin( ( IsLower<MT4>::value )
781  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
782  :( 0UL ) );
783  const size_t iend( ( IsUpper<MT4>::value )
784  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
785  :( M ) );
786  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
787 
788  if( IsLower<MT4>::value ) {
789  for( size_t i=0UL; i<ibegin; ++i ) {
790  reset( (~C)(i,j) );
791  }
792  }
793  for( size_t i=ibegin; i<iend; ++i ) {
794  (~C)(i,j) = A(i,j) * B(j,j);
795  }
796  if( IsUpper<MT4>::value ) {
797  for( size_t i=iend; i<M; ++i ) {
798  reset( (~C)(i,j) );
799  }
800  }
801  }
802  }
804  //**********************************************************************************************
805 
806  //**Default assignment to row-major dense matrices (diagonal/general)***************************
820  template< typename MT3 // Type of the left-hand side target matrix
821  , typename MT4 // Type of the left-hand side matrix operand
822  , typename MT5 > // Type of the right-hand side matrix operand
823  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
824  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
825  {
826  const size_t M( A.rows() );
827  const size_t N( B.columns() );
828 
829  for( size_t i=0UL; i<M; ++i )
830  {
831  const size_t jbegin( ( IsUpper<MT5>::value )
832  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
833  :( 0UL ) );
834  const size_t jend( ( IsLower<MT5>::value )
835  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
836  :( N ) );
837  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
838 
839  if( IsUpper<MT5>::value ) {
840  for( size_t j=0UL; j<jbegin; ++j ) {
841  reset( (~C)(i,j) );
842  }
843  }
844  for( size_t j=jbegin; j<jend; ++j ) {
845  (~C)(i,j) = A(i,i) * B(i,j);
846  }
847  if( IsLower<MT5>::value ) {
848  for( size_t j=jend; j<N; ++j ) {
849  reset( (~C)(i,j) );
850  }
851  }
852  }
853  }
855  //**********************************************************************************************
856 
857  //**Default assignment to column-major dense matrices (diagonal/general)************************
871  template< typename MT3 // Type of the left-hand side target matrix
872  , typename MT4 // Type of the left-hand side matrix operand
873  , typename MT5 > // Type of the right-hand side matrix operand
874  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
875  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
876  {
877  const size_t M( A.rows() );
878  const size_t N( B.columns() );
879 
880  const size_t block( BLOCK_SIZE );
881 
882  for( size_t jj=0UL; jj<N; jj+=block ) {
883  const size_t jend( min( N, jj+block ) );
884  for( size_t ii=0UL; ii<M; ii+=block ) {
885  const size_t iend( min( M, ii+block ) );
886  for( size_t j=jj; j<jend; ++j )
887  {
888  const size_t ibegin( ( IsLower<MT5>::value )
889  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
890  :( ii ) );
891  const size_t ipos( ( IsUpper<MT5>::value )
892  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
893  :( iend ) );
894 
895  if( IsLower<MT5>::value ) {
896  for( size_t i=ii; i<ibegin; ++i ) {
897  reset( (~C)(i,j) );
898  }
899  }
900  for( size_t i=ibegin; i<ipos; ++i ) {
901  (~C)(i,j) = A(i,i) * B(i,j);
902  }
903  if( IsUpper<MT5>::value ) {
904  for( size_t i=ipos; i<iend; ++i ) {
905  reset( (~C)(i,j) );
906  }
907  }
908  }
909  }
910  }
911  }
913  //**********************************************************************************************
914 
915  //**Default assignment to dense matrices (diagonal/diagonal)************************************
929  template< typename MT3 // Type of the left-hand side target matrix
930  , typename MT4 // Type of the left-hand side matrix operand
931  , typename MT5 > // Type of the right-hand side matrix operand
932  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
933  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
934  {
935  reset( C );
936 
937  for( size_t i=0UL; i<A.rows(); ++i ) {
938  C(i,i) = A(i,i) * B(i,i);
939  }
940  }
942  //**********************************************************************************************
943 
944  //**Default assignment to dense matrices (small matrices)***************************************
958  template< typename MT3 // Type of the left-hand side target matrix
959  , typename MT4 // Type of the left-hand side matrix operand
960  , typename MT5 > // Type of the right-hand side matrix operand
961  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
962  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
963  {
964  selectDefaultAssignKernel( ~C, A, B );
965  }
967  //**********************************************************************************************
968 
969  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
984  template< typename MT3 // Type of the left-hand side target matrix
985  , typename MT4 // Type of the left-hand side matrix operand
986  , typename MT5 > // Type of the right-hand side matrix operand
987  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
988  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
989  {
990  const size_t M( A.rows() );
991  const size_t N( B.columns() );
992  const size_t K( A.columns() );
993 
994  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
995 
996  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
997  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
998 
999  size_t j( 0UL );
1000 
1001  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1002  for( size_t i=0UL; i<M; ++i )
1003  {
1004  const size_t kbegin( ( IsUpper<MT4>::value )
1005  ?( ( IsLower<MT5>::value )
1006  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1007  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1008  :( IsLower<MT5>::value ? j : 0UL ) );
1009  const size_t kend( ( IsLower<MT4>::value )
1010  ?( ( IsUpper<MT5>::value )
1011  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1012  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1013  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
1014 
1015  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1016 
1017  for( size_t k=kbegin; k<kend; ++k ) {
1018  const SIMDType a1( set( A(i,k) ) );
1019  xmm1 = xmm1 + a1 * B.load(k,j );
1020  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
1021  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
1022  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
1023  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
1024  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
1025  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
1026  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
1027  }
1028 
1029  (~C).store( i, j , xmm1 );
1030  (~C).store( i, j+SIMDSIZE , xmm2 );
1031  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1032  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1033  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1034  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1035  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1036  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
1037  }
1038  }
1039 
1040  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1041  {
1042  size_t i( 0UL );
1043 
1044  for( ; (i+2UL) <= M; i+=2UL )
1045  {
1046  const size_t kbegin( ( IsUpper<MT4>::value )
1047  ?( ( IsLower<MT5>::value )
1048  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1049  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1050  :( IsLower<MT5>::value ? j : 0UL ) );
1051  const size_t kend( ( IsLower<MT4>::value )
1052  ?( ( IsUpper<MT5>::value )
1053  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1054  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1055  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
1056 
1057  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1058 
1059  for( size_t k=kbegin; k<kend; ++k ) {
1060  const SIMDType a1( set( A(i ,k) ) );
1061  const SIMDType a2( set( A(i+1UL,k) ) );
1062  const SIMDType b1( B.load(k,j ) );
1063  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1064  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1065  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1066  xmm1 = xmm1 + a1 * b1;
1067  xmm2 = xmm2 + a1 * b2;
1068  xmm3 = xmm3 + a1 * b3;
1069  xmm4 = xmm4 + a1 * b4;
1070  xmm5 = xmm5 + a2 * b1;
1071  xmm6 = xmm6 + a2 * b2;
1072  xmm7 = xmm7 + a2 * b3;
1073  xmm8 = xmm8 + a2 * b4;
1074  }
1075 
1076  (~C).store( i , j , xmm1 );
1077  (~C).store( i , j+SIMDSIZE , xmm2 );
1078  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1079  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1080  (~C).store( i+1UL, j , xmm5 );
1081  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1082  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1083  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1084  }
1085 
1086  if( i < M )
1087  {
1088  const size_t kbegin( ( IsUpper<MT4>::value )
1089  ?( ( IsLower<MT5>::value )
1090  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1091  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1092  :( IsLower<MT5>::value ? j : 0UL ) );
1093  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1094 
1095  SIMDType xmm1, xmm2, xmm3, xmm4;
1096 
1097  for( size_t k=kbegin; k<kend; ++k ) {
1098  const SIMDType a1( set( A(i,k) ) );
1099  xmm1 = xmm1 + a1 * B.load(k,j );
1100  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
1101  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
1102  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
1103  }
1104 
1105  (~C).store( i, j , xmm1 );
1106  (~C).store( i, j+SIMDSIZE , xmm2 );
1107  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1108  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1109  }
1110  }
1111 
1112  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1113  {
1114  size_t i( 0UL );
1115 
1116  for( ; (i+2UL) <= M; i+=2UL )
1117  {
1118  const size_t kbegin( ( IsUpper<MT4>::value )
1119  ?( ( IsLower<MT5>::value )
1120  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1121  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1122  :( IsLower<MT5>::value ? j : 0UL ) );
1123  const size_t kend( ( IsLower<MT4>::value )
1124  ?( ( IsUpper<MT5>::value )
1125  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1126  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1127  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1128 
1129  SIMDType xmm1, xmm2, xmm3, xmm4;
1130 
1131  for( size_t k=kbegin; k<kend; ++k ) {
1132  const SIMDType a1( set( A(i ,k) ) );
1133  const SIMDType a2( set( A(i+1UL,k) ) );
1134  const SIMDType b1( B.load(k,j ) );
1135  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1136  xmm1 = xmm1 + a1 * b1;
1137  xmm2 = xmm2 + a1 * b2;
1138  xmm3 = xmm3 + a2 * b1;
1139  xmm4 = xmm4 + a2 * b2;
1140  }
1141 
1142  (~C).store( i , j , xmm1 );
1143  (~C).store( i , j+SIMDSIZE, xmm2 );
1144  (~C).store( i+1UL, j , xmm3 );
1145  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1146  }
1147 
1148  if( i < M )
1149  {
1150  const size_t kbegin( ( IsUpper<MT4>::value )
1151  ?( ( IsLower<MT5>::value )
1152  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1153  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1154  :( IsLower<MT5>::value ? j : 0UL ) );
1155  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1156 
1157  SIMDType xmm1, xmm2;
1158 
1159  for( size_t k=kbegin; k<kend; ++k ) {
1160  const SIMDType a1( set( A(i,k) ) );
1161  xmm1 = xmm1 + a1 * B.load(k,j );
1162  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
1163  }
1164 
1165  (~C).store( i, j , xmm1 );
1166  (~C).store( i, j+SIMDSIZE, xmm2 );
1167  }
1168  }
1169 
1170  for( ; j<jpos; j+=SIMDSIZE )
1171  {
1172  size_t i( 0UL );
1173 
1174  for( ; (i+2UL) <= M; i+=2UL )
1175  {
1176  const size_t kbegin( ( IsUpper<MT4>::value )
1177  ?( ( IsLower<MT5>::value )
1178  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1179  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1180  :( IsLower<MT5>::value ? j : 0UL ) );
1181  const size_t kend( ( IsLower<MT4>::value )
1182  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1183  :( K ) );
1184 
1185  SIMDType xmm1, xmm2;
1186 
1187  for( size_t k=kbegin; k<kend; ++k ) {
1188  const SIMDType b1( B.load(k,j) );
1189  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1190  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1191  }
1192 
1193  (~C).store( i , j, xmm1 );
1194  (~C).store( i+1UL, j, xmm2 );
1195  }
1196 
1197  if( i < M )
1198  {
1199  const size_t kbegin( ( IsUpper<MT4>::value )
1200  ?( ( IsLower<MT5>::value )
1201  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1202  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1203  :( IsLower<MT5>::value ? j : 0UL ) );
1204 
1205  SIMDType xmm1;
1206 
1207  for( size_t k=kbegin; k<K; ++k ) {
1208  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
1209  }
1210 
1211  (~C).store( i, j, xmm1 );
1212  }
1213  }
1214 
1215  for( ; remainder && j<N; ++j )
1216  {
1217  size_t i( 0UL );
1218 
1219  for( ; (i+2UL) <= M; i+=2UL )
1220  {
1221  const size_t kbegin( ( IsUpper<MT4>::value )
1222  ?( ( IsLower<MT5>::value )
1223  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1224  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1225  :( IsLower<MT5>::value ? j : 0UL ) );
1226  const size_t kend( ( IsLower<MT4>::value )
1227  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1228  :( K ) );
1229 
1230  ElementType value1 = ElementType();
1231  ElementType value2 = ElementType();
1232 
1233  for( size_t k=kbegin; k<kend; ++k ) {
1234  value1 += A(i ,k) * B(k,j);
1235  value2 += A(i+1UL,k) * B(k,j);
1236  }
1237 
1238  (~C)(i ,j) = value1;
1239  (~C)(i+1UL,j) = value2;
1240  }
1241 
1242  if( i < M )
1243  {
1244  const size_t kbegin( ( IsUpper<MT4>::value )
1245  ?( ( IsLower<MT5>::value )
1246  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1247  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1248  :( IsLower<MT5>::value ? j : 0UL ) );
1249 
1250  ElementType value = ElementType();
1251 
1252  for( size_t k=kbegin; k<K; ++k ) {
1253  value += A(i,k) * B(k,j);
1254  }
1255 
1256  (~C)(i,j) = value;
1257  }
1258  }
1259  }
1261  //**********************************************************************************************
1262 
1263  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized default assignment kernel for small matrices and a column-major target.
// Evaluates C(i,j) = sum_k A(i,k) * B(k,j): SIMD vectors run along the rows i (loaded from A),
// while the elements B(k,j) are broadcast via set(). Rows are handled in panels of 8, 4, 2 and
// 1 SIMD vectors, followed by a scalar pass for the rows at and beyond ipos.
//
// \param C The target left-hand side dense matrix (unwrapped via operator~).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Only selected when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds (EnableIf_).
1278  template< typename MT3 // Type of the left-hand side target matrix
1279  , typename MT4 // Type of the left-hand side matrix operand
1280  , typename MT5 > // Type of the right-hand side matrix operand
1281  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1282  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1283  {
1284  const size_t M( A.rows() );
1285  const size_t N( B.columns() );
1286  const size_t K( A.columns() );
1287 
// A scalar clean-up pass over the trailing rows is required unless both C and A are padded.
1288  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1289 
// ipos is the end of the vectorized row range: M rounded down to a multiple of SIMDSIZE if a
// remainder pass is needed (this is what the assertion below checks), M otherwise.
1290  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
1291  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1292 
1293  size_t i( 0UL );
1294 
// Panel of 8 SIMD vectors of rows, one column j per iteration.
1295  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1296  for( size_t j=0UL; j<N; ++j )
1297  {
// The k range is narrowed by the compile-time IsUpper/IsLower/IsStrictly* traits of A and B
// (presumably skipping products that are structurally zero -- confirm against the traits).
1298  const size_t kbegin( ( IsLower<MT5>::value )
1299  ?( ( IsUpper<MT4>::value )
1300  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1301  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1302  :( IsUpper<MT4>::value ? i : 0UL ) );
1303  const size_t kend( ( IsUpper<MT5>::value )
1304  ?( ( IsLower<MT4>::value )
1305  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1306  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1307  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
1308 
// NOTE(review): the accumulators are default-constructed and then summed into; this relies on
// SIMDType's default constructor yielding zero -- confirm.
1309  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1310 
1311  for( size_t k=kbegin; k<kend; ++k ) {
1312  const SIMDType b1( set( B(k,j) ) );
1313  xmm1 = xmm1 + A.load(i ,k) * b1;
1314  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
1315  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
1316  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
1317  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
1318  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
1319  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
1320  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
1321  }
1322 
1323  (~C).store( i , j, xmm1 );
1324  (~C).store( i+SIMDSIZE , j, xmm2 );
1325  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1326  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1327  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1328  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1329  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1330  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
1331  }
1332  }
1333 
// Panel of 4 SIMD vectors of rows, two columns (j, j+1) per iteration.
1334  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1335  {
1336  size_t j( 0UL );
1337 
1338  for( ; (j+2UL) <= N; j+=2UL )
1339  {
1340  const size_t kbegin( ( IsLower<MT5>::value )
1341  ?( ( IsUpper<MT4>::value )
1342  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1343  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1344  :( IsUpper<MT4>::value ? i : 0UL ) );
1345  const size_t kend( ( IsUpper<MT5>::value )
1346  ?( ( IsLower<MT4>::value )
1347  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1348  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1349  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
1350 
// xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
1351  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1352 
1353  for( size_t k=kbegin; k<kend; ++k ) {
1354  const SIMDType a1( A.load(i ,k) );
1355  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1356  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1357  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1358  const SIMDType b1( set( B(k,j ) ) );
1359  const SIMDType b2( set( B(k,j+1UL) ) );
1360  xmm1 = xmm1 + a1 * b1;
1361  xmm2 = xmm2 + a2 * b1;
1362  xmm3 = xmm3 + a3 * b1;
1363  xmm4 = xmm4 + a4 * b1;
1364  xmm5 = xmm5 + a1 * b2;
1365  xmm6 = xmm6 + a2 * b2;
1366  xmm7 = xmm7 + a3 * b2;
1367  xmm8 = xmm8 + a4 * b2;
1368  }
1369 
1370  (~C).store( i , j , xmm1 );
1371  (~C).store( i+SIMDSIZE , j , xmm2 );
1372  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1373  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1374  (~C).store( i , j+1UL, xmm5 );
1375  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1376  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1377  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1378  }
1379 
// Single trailing column left over by the two-column loop (N odd).
1380  if( j < N )
1381  {
1382  const size_t kbegin( ( IsLower<MT5>::value )
1383  ?( ( IsUpper<MT4>::value )
1384  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1385  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1386  :( IsUpper<MT4>::value ? i : 0UL ) );
1387  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1388 
1389  SIMDType xmm1, xmm2, xmm3, xmm4;
1390 
1391  for( size_t k=kbegin; k<kend; ++k ) {
1392  const SIMDType b1( set( B(k,j) ) );
1393  xmm1 = xmm1 + A.load(i ,k) * b1;
1394  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
1395  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
1396  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
1397  }
1398 
1399  (~C).store( i , j, xmm1 );
1400  (~C).store( i+SIMDSIZE , j, xmm2 );
1401  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1402  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1403  }
1404  }
1405 
// Panel of 2 SIMD vectors of rows, two columns (j, j+1) per iteration.
1406  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1407  {
1408  size_t j( 0UL );
1409 
1410  for( ; (j+2UL) <= N; j+=2UL )
1411  {
1412  const size_t kbegin( ( IsLower<MT5>::value )
1413  ?( ( IsUpper<MT4>::value )
1414  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1415  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1416  :( IsUpper<MT4>::value ? i : 0UL ) );
1417  const size_t kend( ( IsUpper<MT5>::value )
1418  ?( ( IsLower<MT4>::value )
1419  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1420  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1421  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
1422 
1423  SIMDType xmm1, xmm2, xmm3, xmm4;
1424 
1425  for( size_t k=kbegin; k<kend; ++k ) {
1426  const SIMDType a1( A.load(i ,k) );
1427  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1428  const SIMDType b1( set( B(k,j ) ) );
1429  const SIMDType b2( set( B(k,j+1UL) ) );
1430  xmm1 = xmm1 + a1 * b1;
1431  xmm2 = xmm2 + a2 * b1;
1432  xmm3 = xmm3 + a1 * b2;
1433  xmm4 = xmm4 + a2 * b2;
1434  }
1435 
1436  (~C).store( i , j , xmm1 );
1437  (~C).store( i+SIMDSIZE, j , xmm2 );
1438  (~C).store( i , j+1UL, xmm3 );
1439  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1440  }
1441 
// Single trailing column left over by the two-column loop (N odd).
1442  if( j < N )
1443  {
1444  const size_t kbegin( ( IsLower<MT5>::value )
1445  ?( ( IsUpper<MT4>::value )
1446  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1447  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1448  :( IsUpper<MT4>::value ? i : 0UL ) );
1449  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
1450 
1451  SIMDType xmm1, xmm2;
1452 
1453  for( size_t k=kbegin; k<kend; ++k ) {
1454  const SIMDType b1( set( B(k,j) ) );
1455  xmm1 = xmm1 + A.load(i ,k) * b1;
1456  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
1457  }
1458 
1459  (~C).store( i , j, xmm1 );
1460  (~C).store( i+SIMDSIZE, j, xmm2 );
1461  }
1462  }
1463 
// Single SIMD vector of rows, two columns (j, j+1) per iteration.
1464  for( ; i<ipos; i+=SIMDSIZE )
1465  {
1466  size_t j( 0UL );
1467 
1468  for( ; (j+2UL) <= N; j+=2UL )
1469  {
1470  const size_t kbegin( ( IsLower<MT5>::value )
1471  ?( ( IsUpper<MT4>::value )
1472  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1473  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1474  :( IsUpper<MT4>::value ? i : 0UL ) );
1475  const size_t kend( ( IsUpper<MT5>::value )
1476  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1477  :( K ) );
1478 
1479  SIMDType xmm1, xmm2;
1480 
1481  for( size_t k=kbegin; k<kend; ++k ) {
1482  const SIMDType a1( A.load(i,k) );
1483  xmm1 = xmm1 + a1 * set( B(k,j ) );
1484  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1485  }
1486 
1487  (~C).store( i, j , xmm1 );
1488  (~C).store( i, j+1UL, xmm2 );
1489  }
1490 
// Single trailing column; here the k loop always runs up to K.
1491  if( j < N )
1492  {
1493  const size_t kbegin( ( IsLower<MT5>::value )
1494  ?( ( IsUpper<MT4>::value )
1495  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1496  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1497  :( IsUpper<MT4>::value ? i : 0UL ) );
1498 
1499  SIMDType xmm1;
1500 
1501  for( size_t k=kbegin; k<K; ++k ) {
1502  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
1503  }
1504 
1505  (~C).store( i, j, xmm1 );
1506  }
1507  }
1508 
// Scalar clean-up for the rows in [ipos, M); only entered when 'remainder' is true.
1509  for( ; remainder && i<M; ++i )
1510  {
1511  size_t j( 0UL );
1512 
1513  for( ; (j+2UL) <= N; j+=2UL )
1514  {
1515  const size_t kbegin( ( IsLower<MT5>::value )
1516  ?( ( IsUpper<MT4>::value )
1517  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1518  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1519  :( IsUpper<MT4>::value ? i : 0UL ) );
1520  const size_t kend( ( IsUpper<MT5>::value )
1521  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1522  :( K ) );
1523 
1524  ElementType value1 = ElementType();
1525  ElementType value2 = ElementType();
1526 
1527  for( size_t k=kbegin; k<kend; ++k ) {
1528  value1 += A(i,k) * B(k,j );
1529  value2 += A(i,k) * B(k,j+1UL);
1530  }
1531 
1532  (~C)(i,j ) = value1;
1533  (~C)(i,j+1UL) = value2;
1534  }
1535 
1536  if( j < N )
1537  {
1538  const size_t kbegin( ( IsLower<MT5>::value )
1539  ?( ( IsUpper<MT4>::value )
1540  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1541  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1542  :( IsUpper<MT4>::value ? i : 0UL ) );
1543 
1544  ElementType value = ElementType();
1545 
1546  for( size_t k=kbegin; k<K; ++k ) {
1547  value += A(i,k) * B(k,j);
1548  }
1549 
1550  (~C)(i,j) = value;
1551  }
1552  }
1553  }
1555  //**********************************************************************************************
1556 
1557  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix kernel fallback: this overload is the one enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf_). It performs no work of
// its own and simply forwards the three operands to selectDefaultAssignKernel().
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
1571  template< typename MT3 // Type of the left-hand side target matrix
1572  , typename MT4 // Type of the left-hand side matrix operand
1573  , typename MT5 > // Type of the right-hand side matrix operand
1574  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1575  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1576  {
1577  selectDefaultAssignKernel( C, A, B );
1578  }
1580  //**********************************************************************************************
1581 
1582  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
// Vectorized default assignment kernel for large matrices and a row-major target.
// The computation C(i,j) = sum_k A(i,k) * B(k,j) is blocked: C is traversed in tiles of
// DMATDMATMULT_DEFAULT_IBLOCK_SIZE rows by DMATDMATMULT_DEFAULT_JBLOCK_SIZE columns, each tile
// is reset once and then updated with the partial products of successive k blocks of size
// DMATDMATMULT_DEFAULT_KBLOCK_SIZE. SIMD vectors run along the columns j (loaded from B and C),
// while the elements A(i,k) are broadcast via set().
//
// \param C The target left-hand side dense matrix (unwrapped via operator~).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Only selected when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds (EnableIf_).
1597  template< typename MT3 // Type of the left-hand side target matrix
1598  , typename MT4 // Type of the left-hand side matrix operand
1599  , typename MT5 > // Type of the right-hand side matrix operand
1600  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1601  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1602  {
1603  const size_t M( A.rows() );
1604  const size_t N( B.columns() );
1605  const size_t K( A.columns() );
1606 
// A scalar clean-up pass over the trailing columns is required unless both C and B are padded.
1607  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1608 
1609  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
1610  {
1611  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
1612 
// jpos is the end of the vectorized column range of this block: jend rounded down to a
// multiple of SIMDSIZE if a remainder pass is needed (checked by the assertion), else jend.
1613  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1614  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1615 
1616  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
1617  {
1618  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
1619 
// Reset the current tile of C; the k-block loop below accumulates into it.
1620  for( size_t i=ii; i<iend; ++i ) {
1621  for( size_t j=jj; j<jend; ++j ) {
1622  reset( (~C)(i,j) );
1623  }
1624  }
1625 
1626  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
1627  {
1628  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
1629 
1630  size_t j( jj );
1631 
// Panel of 4 SIMD vectors of columns, two rows (i, i+1) per iteration.
1632  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1633  {
1634  const size_t j1( j+SIMDSIZE );
1635  const size_t j2( j+SIMDSIZE*2UL );
1636  const size_t j3( j+SIMDSIZE*3UL );
1637 
1638  size_t i( ii );
1639 
1640  for( ; (i+2UL) <= iend; i+=2UL )
1641  {
// k range of this block [kk,ktmp), further narrowed by the compile-time IsUpper/IsLower traits
// of A and B (presumably skipping structurally zero products -- confirm against the traits).
1642  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1643  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1644  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1645  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
1646 
// Start from the partial sums already stored in C by earlier k blocks.
1647  SIMDType xmm1( (~C).load(i ,j ) );
1648  SIMDType xmm2( (~C).load(i ,j1) );
1649  SIMDType xmm3( (~C).load(i ,j2) );
1650  SIMDType xmm4( (~C).load(i ,j3) );
1651  SIMDType xmm5( (~C).load(i+1UL,j ) );
1652  SIMDType xmm6( (~C).load(i+1UL,j1) );
1653  SIMDType xmm7( (~C).load(i+1UL,j2) );
1654  SIMDType xmm8( (~C).load(i+1UL,j3) );
1655 
1656  for( size_t k=kbegin; k<kend; ++k ) {
1657  const SIMDType a1( set( A(i ,k) ) );
1658  const SIMDType a2( set( A(i+1UL,k) ) );
1659  const SIMDType b1( B.load(k,j ) );
1660  const SIMDType b2( B.load(k,j1) );
1661  const SIMDType b3( B.load(k,j2) );
1662  const SIMDType b4( B.load(k,j3) );
1663  xmm1 = xmm1 + a1 * b1;
1664  xmm2 = xmm2 + a1 * b2;
1665  xmm3 = xmm3 + a1 * b3;
1666  xmm4 = xmm4 + a1 * b4;
1667  xmm5 = xmm5 + a2 * b1;
1668  xmm6 = xmm6 + a2 * b2;
1669  xmm7 = xmm7 + a2 * b3;
1670  xmm8 = xmm8 + a2 * b4;
1671  }
1672 
1673  (~C).store( i , j , xmm1 );
1674  (~C).store( i , j1, xmm2 );
1675  (~C).store( i , j2, xmm3 );
1676  (~C).store( i , j3, xmm4 );
1677  (~C).store( i+1UL, j , xmm5 );
1678  (~C).store( i+1UL, j1, xmm6 );
1679  (~C).store( i+1UL, j2, xmm7 );
1680  (~C).store( i+1UL, j3, xmm8 );
1681  }
1682 
// Single trailing row of the tile left over by the two-row loop.
1683  if( i < iend )
1684  {
1685  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1686  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1687  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1688  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
1689 
1690  SIMDType xmm1( (~C).load(i,j ) );
1691  SIMDType xmm2( (~C).load(i,j1) );
1692  SIMDType xmm3( (~C).load(i,j2) );
1693  SIMDType xmm4( (~C).load(i,j3) );
1694 
1695  for( size_t k=kbegin; k<kend; ++k ) {
1696  const SIMDType a1( set( A(i,k) ) );
1697  xmm1 = xmm1 + a1 * B.load(k,j );
1698  xmm2 = xmm2 + a1 * B.load(k,j1);
1699  xmm3 = xmm3 + a1 * B.load(k,j2);
1700  xmm4 = xmm4 + a1 * B.load(k,j3);
1701  }
1702 
1703  (~C).store( i, j , xmm1 );
1704  (~C).store( i, j1, xmm2 );
1705  (~C).store( i, j2, xmm3 );
1706  (~C).store( i, j3, xmm4 );
1707  }
1708  }
1709 
// Panel of 2 SIMD vectors of columns; rows are handled four at a time, then two, then one.
1710  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1711  {
1712  const size_t j1( j+SIMDSIZE );
1713 
1714  size_t i( ii );
1715 
1716  for( ; (i+4UL) <= iend; i+=4UL )
1717  {
1718  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1719  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1720  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1721  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1722 
1723  SIMDType xmm1( (~C).load(i ,j ) );
1724  SIMDType xmm2( (~C).load(i ,j1) );
1725  SIMDType xmm3( (~C).load(i+1UL,j ) );
1726  SIMDType xmm4( (~C).load(i+1UL,j1) );
1727  SIMDType xmm5( (~C).load(i+2UL,j ) );
1728  SIMDType xmm6( (~C).load(i+2UL,j1) );
1729  SIMDType xmm7( (~C).load(i+3UL,j ) );
1730  SIMDType xmm8( (~C).load(i+3UL,j1) );
1731 
1732  for( size_t k=kbegin; k<kend; ++k ) {
1733  const SIMDType a1( set( A(i ,k) ) );
1734  const SIMDType a2( set( A(i+1UL,k) ) );
1735  const SIMDType a3( set( A(i+2UL,k) ) );
1736  const SIMDType a4( set( A(i+3UL,k) ) );
1737  const SIMDType b1( B.load(k,j ) );
1738  const SIMDType b2( B.load(k,j1) );
1739  xmm1 = xmm1 + a1 * b1;
1740  xmm2 = xmm2 + a1 * b2;
1741  xmm3 = xmm3 + a2 * b1;
1742  xmm4 = xmm4 + a2 * b2;
1743  xmm5 = xmm5 + a3 * b1;
1744  xmm6 = xmm6 + a3 * b2;
1745  xmm7 = xmm7 + a4 * b1;
1746  xmm8 = xmm8 + a4 * b2;
1747  }
1748 
1749  (~C).store( i , j , xmm1 );
1750  (~C).store( i , j1, xmm2 );
1751  (~C).store( i+1UL, j , xmm3 );
1752  (~C).store( i+1UL, j1, xmm4 );
1753  (~C).store( i+2UL, j , xmm5 );
1754  (~C).store( i+2UL, j1, xmm6 );
1755  (~C).store( i+3UL, j , xmm7 );
1756  (~C).store( i+3UL, j1, xmm8 );
1757  }
1758 
1759  for( ; (i+2UL) <= iend; i+=2UL )
1760  {
1761  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1762  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1763  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1764  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1765 
1766  SIMDType xmm1( (~C).load(i ,j ) );
1767  SIMDType xmm2( (~C).load(i ,j1) );
1768  SIMDType xmm3( (~C).load(i+1UL,j ) );
1769  SIMDType xmm4( (~C).load(i+1UL,j1) );
1770 
1771  for( size_t k=kbegin; k<kend; ++k ) {
1772  const SIMDType a1( set( A(i ,k) ) );
1773  const SIMDType a2( set( A(i+1UL,k) ) );
1774  const SIMDType b1( B.load(k,j ) );
1775  const SIMDType b2( B.load(k,j1) );
1776  xmm1 = xmm1 + a1 * b1;
1777  xmm2 = xmm2 + a1 * b2;
1778  xmm3 = xmm3 + a2 * b1;
1779  xmm4 = xmm4 + a2 * b2;
1780  }
1781 
1782  (~C).store( i , j , xmm1 );
1783  (~C).store( i , j1, xmm2 );
1784  (~C).store( i+1UL, j , xmm3 );
1785  (~C).store( i+1UL, j1, xmm4 );
1786  }
1787 
1788  if( i < iend )
1789  {
1790  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1791  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1792  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1793  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1794 
1795  SIMDType xmm1( (~C).load(i,j ) );
1796  SIMDType xmm2( (~C).load(i,j1) );
1797 
1798  for( size_t k=kbegin; k<kend; ++k ) {
1799  const SIMDType a1( set( A(i,k) ) );
1800  xmm1 = xmm1 + a1 * B.load(k,j );
1801  xmm2 = xmm2 + a1 * B.load(k,j1);
1802  }
1803 
1804  (~C).store( i, j , xmm1 );
1805  (~C).store( i, j1, xmm2 );
1806  }
1807  }
1808 
// Single SIMD vector of columns, one row per iteration.
1809  for( ; j<jpos; j+=SIMDSIZE )
1810  {
1811  for( size_t i=ii; i<iend; ++i )
1812  {
1813  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1814  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1815  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1816  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
1817 
1818  SIMDType xmm1( (~C).load(i,j) );
1819 
1820  for( size_t k=kbegin; k<kend; ++k ) {
1821  const SIMDType a1( set( A(i,k) ) );
1822  xmm1 = xmm1 + a1 * B.load(k,j);
1823  }
1824 
1825  (~C).store( i, j, xmm1 );
1826  }
1827  }
1828 
// Scalar clean-up for the columns in [jpos, jend); only entered when 'remainder' is true.
1829  for( ; remainder && j<jend; ++j )
1830  {
1831  for( size_t i=ii; i<iend; ++i )
1832  {
1833  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1834  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1835  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1836  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
1837 
1838  ElementType value( (~C)(i,j) );
1839 
1840  for( size_t k=kbegin; k<kend; ++k ) {
1841  value += A(i,k) * B(k,j);
1842  }
1843 
1844  (~C)(i,j) = value;
1845  }
1846  }
1847  }
1848  }
1849  }
1850  }
1852  //**********************************************************************************************
1853 
1854  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
// Vectorized default assignment kernel for large matrices and a column-major target.
// The computation C(i,j) = sum_k A(i,k) * B(k,j) is blocked: C is traversed in tiles of
// TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE rows by TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE columns, each
// tile is reset once and then updated with the partial products of successive k blocks of size
// TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE. SIMD vectors run along the rows i (loaded from A and C),
// while the elements B(k,j) are broadcast via set().
//
// \param C The target left-hand side dense matrix (unwrapped via operator~).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Only selected when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds (EnableIf_).
1869  template< typename MT3 // Type of the left-hand side target matrix
1870  , typename MT4 // Type of the left-hand side matrix operand
1871  , typename MT5 > // Type of the right-hand side matrix operand
1872  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1873  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1874  {
1875  const size_t M( A.rows() );
1876  const size_t N( B.columns() );
1877  const size_t K( A.columns() );
1878 
// A scalar clean-up pass over the trailing rows is required unless both C and A are padded.
1879  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1880 
1881  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
1882  {
1883  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
1884 
// ipos is the end of the vectorized row range of this block: iend rounded down to a multiple
// of SIMDSIZE if a remainder pass is needed (checked by the assertion), else iend.
1885  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1886  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1887 
1888  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
1889  {
1890  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
1891 
// Reset the current tile of C; the k-block loop below accumulates into it.
1892  for( size_t j=jj; j<jend; ++j ) {
1893  for( size_t i=ii; i<iend; ++i ) {
1894  reset( (~C)(i,j) );
1895  }
1896  }
1897 
1898  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
1899  {
1900  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
1901 
1902  size_t i( ii );
1903 
// Panel of 4 SIMD vectors of rows, two columns (j, j+1) per iteration.
1904  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1905  {
1906  const size_t i1( i+SIMDSIZE );
1907  const size_t i2( i+SIMDSIZE*2UL );
1908  const size_t i3( i+SIMDSIZE*3UL );
1909 
1910  size_t j( jj );
1911 
1912  for( ; (j+2UL) <= jend; j+=2UL )
1913  {
// k range of this block [kk,ktmp), further narrowed by the compile-time IsUpper/IsLower traits
// of A and B (presumably skipping structurally zero products -- confirm against the traits).
1914  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1915  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1916  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
1917  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1918 
// Start from the partial sums already stored in C by earlier k blocks.
1919  SIMDType xmm1( (~C).load(i ,j ) );
1920  SIMDType xmm2( (~C).load(i1,j ) );
1921  SIMDType xmm3( (~C).load(i2,j ) );
1922  SIMDType xmm4( (~C).load(i3,j ) );
1923  SIMDType xmm5( (~C).load(i ,j+1UL) );
1924  SIMDType xmm6( (~C).load(i1,j+1UL) );
1925  SIMDType xmm7( (~C).load(i2,j+1UL) );
1926  SIMDType xmm8( (~C).load(i3,j+1UL) );
1927 
1928  for( size_t k=kbegin; k<kend; ++k ) {
1929  const SIMDType a1( A.load(i ,k) );
1930  const SIMDType a2( A.load(i1,k) );
1931  const SIMDType a3( A.load(i2,k) );
1932  const SIMDType a4( A.load(i3,k) );
1933  const SIMDType b1( set( B(k,j ) ) );
1934  const SIMDType b2( set( B(k,j+1UL) ) );
1935  xmm1 = xmm1 + a1 * b1;
1936  xmm2 = xmm2 + a2 * b1;
1937  xmm3 = xmm3 + a3 * b1;
1938  xmm4 = xmm4 + a4 * b1;
1939  xmm5 = xmm5 + a1 * b2;
1940  xmm6 = xmm6 + a2 * b2;
1941  xmm7 = xmm7 + a3 * b2;
1942  xmm8 = xmm8 + a4 * b2;
1943  }
1944 
1945  (~C).store( i , j , xmm1 );
1946  (~C).store( i1, j , xmm2 );
1947  (~C).store( i2, j , xmm3 );
1948  (~C).store( i3, j , xmm4 );
1949  (~C).store( i , j+1UL, xmm5 );
1950  (~C).store( i1, j+1UL, xmm6 );
1951  (~C).store( i2, j+1UL, xmm7 );
1952  (~C).store( i3, j+1UL, xmm8 );
1953  }
1954 
// Single trailing column of the tile left over by the two-column loop.
1955  if( j < jend )
1956  {
1957  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1958  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1959  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
1960  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1961 
1962  SIMDType xmm1( (~C).load(i ,j) );
1963  SIMDType xmm2( (~C).load(i1,j) );
1964  SIMDType xmm3( (~C).load(i2,j) );
1965  SIMDType xmm4( (~C).load(i3,j) );
1966 
1967  for( size_t k=kbegin; k<kend; ++k ) {
1968  const SIMDType b1( set( B(k,j) ) );
1969  xmm1 = xmm1 + A.load(i ,k) * b1;
1970  xmm2 = xmm2 + A.load(i1,k) * b1;
1971  xmm3 = xmm3 + A.load(i2,k) * b1;
1972  xmm4 = xmm4 + A.load(i3,k) * b1;
1973  }
1974 
1975  (~C).store( i , j, xmm1 );
1976  (~C).store( i1, j, xmm2 );
1977  (~C).store( i2, j, xmm3 );
1978  (~C).store( i3, j, xmm4 );
1979  }
1980  }
1981 
// Panel of 2 SIMD vectors of rows; columns are handled four at a time, then two, then one.
1982  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1983  {
1984  const size_t i1( i+SIMDSIZE );
1985 
1986  size_t j( jj );
1987 
1988  for( ; (j+4UL) <= jend; j+=4UL )
1989  {
1990  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1991  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1992  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
1993  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
1994 
1995  SIMDType xmm1( (~C).load(i ,j ) );
1996  SIMDType xmm2( (~C).load(i1,j ) );
1997  SIMDType xmm3( (~C).load(i ,j+1UL) );
1998  SIMDType xmm4( (~C).load(i1,j+1UL) );
1999  SIMDType xmm5( (~C).load(i ,j+2UL) );
2000  SIMDType xmm6( (~C).load(i1,j+2UL) );
2001  SIMDType xmm7( (~C).load(i ,j+3UL) );
2002  SIMDType xmm8( (~C).load(i1,j+3UL) );
2003 
2004  for( size_t k=kbegin; k<kend; ++k ) {
2005  const SIMDType a1( A.load(i ,k) );
2006  const SIMDType a2( A.load(i1,k) );
2007  const SIMDType b1( set( B(k,j ) ) );
2008  const SIMDType b2( set( B(k,j+1UL) ) );
2009  const SIMDType b3( set( B(k,j+2UL) ) );
2010  const SIMDType b4( set( B(k,j+3UL) ) );
2011  xmm1 = xmm1 + a1 * b1;
2012  xmm2 = xmm2 + a2 * b1;
2013  xmm3 = xmm3 + a1 * b2;
2014  xmm4 = xmm4 + a2 * b2;
2015  xmm5 = xmm5 + a1 * b3;
2016  xmm6 = xmm6 + a2 * b3;
2017  xmm7 = xmm7 + a1 * b4;
2018  xmm8 = xmm8 + a2 * b4;
2019  }
2020 
2021  (~C).store( i , j , xmm1 );
2022  (~C).store( i1, j , xmm2 );
2023  (~C).store( i , j+1UL, xmm3 );
2024  (~C).store( i1, j+1UL, xmm4 );
2025  (~C).store( i , j+2UL, xmm5 );
2026  (~C).store( i1, j+2UL, xmm6 );
2027  (~C).store( i , j+3UL, xmm7 );
2028  (~C).store( i1, j+3UL, xmm8 );
2029  }
2030 
2031  for( ; (j+2UL) <= jend; j+=2UL )
2032  {
2033  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2034  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2035  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2036  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2037 
2038  SIMDType xmm1( (~C).load(i ,j ) );
2039  SIMDType xmm2( (~C).load(i1,j ) );
2040  SIMDType xmm3( (~C).load(i ,j+1UL) );
2041  SIMDType xmm4( (~C).load(i1,j+1UL) );
2042 
2043  for( size_t k=kbegin; k<kend; ++k ) {
2044  const SIMDType a1( A.load(i ,k) );
2045  const SIMDType a2( A.load(i1,k) );
2046  const SIMDType b1( set( B(k,j ) ) );
2047  const SIMDType b2( set( B(k,j+1UL) ) );
2048  xmm1 = xmm1 + a1 * b1;
2049  xmm2 = xmm2 + a2 * b1;
2050  xmm3 = xmm3 + a1 * b2;
2051  xmm4 = xmm4 + a2 * b2;
2052  }
2053 
2054  (~C).store( i , j , xmm1 );
2055  (~C).store( i1, j , xmm2 );
2056  (~C).store( i , j+1UL, xmm3 );
2057  (~C).store( i1, j+1UL, xmm4 );
2058  }
2059 
2060  if( j < jend )
2061  {
2062  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2063  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2064  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2065  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2066 
2067  SIMDType xmm1( (~C).load(i ,j) );
2068  SIMDType xmm2( (~C).load(i1,j) );
2069 
2070  for( size_t k=kbegin; k<kend; ++k ) {
2071  const SIMDType b1( set( B(k,j) ) );
2072  xmm1 = xmm1 + A.load(i ,k) * b1;
2073  xmm2 = xmm2 + A.load(i1,k) * b1;
2074  }
2075 
2076  (~C).store( i , j, xmm1 );
2077  (~C).store( i1, j, xmm2 );
2078  }
2079  }
2080 
// Single SIMD vector of rows, one column per iteration.
2081  for( ; i<ipos; i+=SIMDSIZE )
2082  {
2083  for( size_t j=jj; j<jend; ++j )
2084  {
2085  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2086  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2087  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
2088  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2089 
2090  SIMDType xmm1( (~C).load(i,j) );
2091 
2092  for( size_t k=kbegin; k<kend; ++k ) {
2093  const SIMDType b1( set( B(k,j) ) );
2094  xmm1 = xmm1 + A.load(i,k) * b1;
2095  }
2096 
2097  (~C).store( i, j, xmm1 );
2098  }
2099  }
2100 
// Scalar clean-up for the rows in [ipos, iend); only entered when 'remainder' is true.
2101  for( ; remainder && i<iend; ++i )
2102  {
2103  for( size_t j=jj; j<jend; ++j )
2104  {
2105  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2106  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2107  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
2108  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2109 
2110  ElementType value( (~C)(i,j) );
2111 
2112  for( size_t k=kbegin; k<kend; ++k ) {
2113  value += A(i,k) * B(k,j);
2114  }
2115 
2116  (~C)(i,j) = value;
2117  }
2118  }
2119  }
2120  }
2121  }
2122  }
2124  //**********************************************************************************************
2125 
2126  //**BLAS-based assignment to dense matrices (default)*******************************************
// BLAS kernel fallback: this overload is the one enabled when UseBlasKernel<MT3,MT4,MT5> does
// NOT hold (DisableIf_). It performs no work of its own and simply forwards the three operands
// to selectLargeAssignKernel().
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
2140  template< typename MT3 // Type of the left-hand side target matrix
2141  , typename MT4 // Type of the left-hand side matrix operand
2142  , typename MT5 > // Type of the right-hand side matrix operand
2143  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
2144  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2145  {
2146  selectLargeAssignKernel( C, A, B );
2147  }
2149  //**********************************************************************************************
2150 
2151  //**BLAS-based assignment to dense matrices*****************************************************
2152 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2153 
2166  template< typename MT3 // Type of the left-hand side target matrix
2167  , typename MT4 // Type of the left-hand side matrix operand
2168  , typename MT5 > // Type of the right-hand side matrix operand
2169  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
2170  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2171  {
2172  typedef ElementType_<MT3> ET;
2173 
2174  if( IsTriangular<MT4>::value ) {
2175  assign( C, B );
2176  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2177  }
2178  else if( IsTriangular<MT5>::value ) {
2179  assign( C, A );
2180  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2181  }
2182  else {
2183  gemm( C, A, B, ET(1), ET(0) );
2184  }
2185  }
2187 #endif
2188  //**********************************************************************************************
2189 
2190  //**Assignment to sparse matrices***************************************************************
2203  template< typename MT // Type of the target sparse matrix
2204  , bool SO > // Storage order of the target sparse matrix
2205  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2206  {
2208 
2209  typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
2210 
2216  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
2217 
2218  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2219  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2220 
2221  const TmpType tmp( serial( rhs ) );
2222  assign( ~lhs, tmp );
2223  }
2225  //**********************************************************************************************
2226 
2227  //**Addition assignment to dense matrices*******************************************************
2240  template< typename MT // Type of the target dense matrix
2241  , bool SO > // Storage order of the target dense matrix
2242  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2243  {
2245 
2246  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2247  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2248 
2249  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2250  return;
2251  }
2252 
2253  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2254  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2255 
2256  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2257  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2258  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2259  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2260  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2261  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2262 
2263  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2264  }
2266  //**********************************************************************************************
2267 
2268  //**Addition assignment to dense matrices (kernel selection)************************************
2279  template< typename MT3 // Type of the left-hand side target matrix
2280  , typename MT4 // Type of the left-hand side matrix operand
2281  , typename MT5 > // Type of the right-hand side matrix operand
2282  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2283  {
2284  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
2285  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2286  selectSmallAddAssignKernel( C, A, B );
2287  else
2288  selectBlasAddAssignKernel( C, A, B );
2289  }
2291  //**********************************************************************************************
2292 
2293  //**Default addition assignment to row-major dense matrices (general/general)*******************
2307  template< typename MT3 // Type of the left-hand side target matrix
2308  , typename MT4 // Type of the left-hand side matrix operand
2309  , typename MT5 > // Type of the right-hand side matrix operand
2310  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2311  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2312  {
2313  const size_t M( A.rows() );
2314  const size_t N( B.columns() );
2315  const size_t K( A.columns() );
2316 
2317  for( size_t i=0UL; i<M; ++i )
2318  {
2319  const size_t kbegin( ( IsUpper<MT4>::value )
2320  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2321  :( 0UL ) );
2322  const size_t kend( ( IsLower<MT4>::value )
2323  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2324  :( K ) );
2325  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2326 
2327  for( size_t k=kbegin; k<kend; ++k )
2328  {
2329  const size_t jbegin( ( IsUpper<MT5>::value )
2330  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2331  :( 0UL ) );
2332  const size_t jend( ( IsLower<MT5>::value )
2333  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
2334  :( N ) );
2335  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2336 
2337  const size_t jnum( jend - jbegin );
2338  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2339 
2340  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2341  (~C)(i,j ) += A(i,k) * B(k,j );
2342  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2343  }
2344  if( jpos < jend ) {
2345  (~C)(i,jpos) += A(i,k) * B(k,jpos);
2346  }
2347  }
2348  }
2349  }
2351  //**********************************************************************************************
2352 
2353  //**Default addition assignment to column-major dense matrices (general/general)****************
2367  template< typename MT3 // Type of the left-hand side target matrix
2368  , typename MT4 // Type of the left-hand side matrix operand
2369  , typename MT5 > // Type of the right-hand side matrix operand
2370  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2371  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2372  {
2373  const size_t M( A.rows() );
2374  const size_t N( B.columns() );
2375  const size_t K( A.columns() );
2376 
2377  for( size_t j=0UL; j<N; ++j )
2378  {
2379  const size_t kbegin( ( IsLower<MT5>::value )
2380  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2381  :( 0UL ) );
2382  const size_t kend( ( IsUpper<MT5>::value )
2383  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2384  :( K ) );
2385  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2386 
2387  for( size_t k=kbegin; k<kend; ++k )
2388  {
2389  const size_t ibegin( ( IsLower<MT4>::value )
2390  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2391  :( 0UL ) );
2392  const size_t iend( ( IsUpper<MT4>::value )
2393  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
2394  :( M ) );
2395  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2396 
2397  const size_t inum( iend - ibegin );
2398  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2399 
2400  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2401  (~C)(i ,j) += A(i ,k) * B(k,j);
2402  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2403  }
2404  if( ipos < iend ) {
2405  (~C)(ipos,j) += A(ipos,k) * B(k,j);
2406  }
2407  }
2408  }
2409  }
2411  //**********************************************************************************************
2412 
2413  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
2427  template< typename MT3 // Type of the left-hand side target matrix
2428  , typename MT4 // Type of the left-hand side matrix operand
2429  , typename MT5 > // Type of the right-hand side matrix operand
2430  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2431  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2432  {
2433  const size_t M( A.rows() );
2434  const size_t N( B.columns() );
2435 
2436  const size_t block( BLOCK_SIZE );
2437 
2438  for( size_t ii=0UL; ii<M; ii+=block ) {
2439  const size_t iend( min( M, ii+block ) );
2440  for( size_t jj=0UL; jj<N; jj+=block ) {
2441  const size_t jend( min( N, jj+block ) );
2442  for( size_t i=ii; i<iend; ++i )
2443  {
2444  const size_t jbegin( ( IsUpper<MT4>::value )
2445  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
2446  :( jj ) );
2447  const size_t jpos( ( IsLower<MT4>::value )
2448  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
2449  :( jend ) );
2450 
2451  for( size_t j=jbegin; j<jpos; ++j ) {
2452  (~C)(i,j) += A(i,j) * B(j,j);
2453  }
2454  }
2455  }
2456  }
2457  }
2459  //**********************************************************************************************
2460 
2461  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
2475  template< typename MT3 // Type of the left-hand side target matrix
2476  , typename MT4 // Type of the left-hand side matrix operand
2477  , typename MT5 > // Type of the right-hand side matrix operand
2478  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2479  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2480  {
2481  const size_t M( A.rows() );
2482  const size_t N( B.columns() );
2483 
2484  for( size_t j=0UL; j<N; ++j )
2485  {
2486  const size_t ibegin( ( IsLower<MT4>::value )
2487  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2488  :( 0UL ) );
2489  const size_t iend( ( IsUpper<MT4>::value )
2490  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
2491  :( M ) );
2492  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2493 
2494  const size_t inum( iend - ibegin );
2495  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2496 
2497  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2498  (~C)(i ,j) += A(i ,j) * B(j,j);
2499  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2500  }
2501  if( ipos < iend ) {
2502  (~C)(ipos,j) += A(ipos,j) * B(j,j);
2503  }
2504  }
2505  }
2507  //**********************************************************************************************
2508 
2509  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
2523  template< typename MT3 // Type of the left-hand side target matrix
2524  , typename MT4 // Type of the left-hand side matrix operand
2525  , typename MT5 > // Type of the right-hand side matrix operand
2526  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2527  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2528  {
2529  const size_t M( A.rows() );
2530  const size_t N( B.columns() );
2531 
2532  for( size_t i=0UL; i<M; ++i )
2533  {
2534  const size_t jbegin( ( IsUpper<MT5>::value )
2535  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2536  :( 0UL ) );
2537  const size_t jend( ( IsLower<MT5>::value )
2538  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
2539  :( N ) );
2540  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2541 
2542  const size_t jnum( jend - jbegin );
2543  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2544 
2545  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2546  (~C)(i,j ) += A(i,i) * B(i,j );
2547  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2548  }
2549  if( jpos < jend ) {
2550  (~C)(i,jpos) += A(i,i) * B(i,jpos);
2551  }
2552  }
2553  }
2555  //**********************************************************************************************
2556 
2557  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
2571  template< typename MT3 // Type of the left-hand side target matrix
2572  , typename MT4 // Type of the left-hand side matrix operand
2573  , typename MT5 > // Type of the right-hand side matrix operand
2574  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2575  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2576  {
2577  const size_t M( A.rows() );
2578  const size_t N( B.columns() );
2579 
2580  const size_t block( BLOCK_SIZE );
2581 
2582  for( size_t jj=0UL; jj<N; jj+=block ) {
2583  const size_t jend( min( N, jj+block ) );
2584  for( size_t ii=0UL; ii<M; ii+=block ) {
2585  const size_t iend( min( M, ii+block ) );
2586  for( size_t j=jj; j<jend; ++j )
2587  {
2588  const size_t ibegin( ( IsLower<MT5>::value )
2589  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
2590  :( ii ) );
2591  const size_t ipos( ( IsUpper<MT5>::value )
2592  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
2593  :( iend ) );
2594 
2595  for( size_t i=ibegin; i<ipos; ++i ) {
2596  (~C)(i,j) += A(i,i) * B(i,j);
2597  }
2598  }
2599  }
2600  }
2601  }
2603  //**********************************************************************************************
2604 
2605  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2619  template< typename MT3 // Type of the left-hand side target matrix
2620  , typename MT4 // Type of the left-hand side matrix operand
2621  , typename MT5 > // Type of the right-hand side matrix operand
2622  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2623  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2624  {
2625  for( size_t i=0UL; i<A.rows(); ++i ) {
2626  C(i,i) += A(i,i) * B(i,i);
2627  }
2628  }
2630  //**********************************************************************************************
2631 
2632  //**Default addition assignment to dense matrices (small matrices)******************************
2646  template< typename MT3 // Type of the left-hand side target matrix
2647  , typename MT4 // Type of the left-hand side matrix operand
2648  , typename MT5 > // Type of the right-hand side matrix operand
2649  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2650  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2651  {
2652  selectDefaultAddAssignKernel( C, A, B );
2653  }
2655  //**********************************************************************************************
2656 
2657  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized small-matrix kernel for the addition assignment C += A * B to a row-major target.
//
// Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. The columns of C are
// processed in panels of 8, 4, 2 and finally 1 SIMD vector(s) of SIMDSIZE elements each. For
// every panel the accumulators are seeded with the current values of C, updated with
// set( A(i,k) ) * B.load(k,...) over the inner index k, and stored back to C. Columns that
// are not covered by full SIMD vectors are finished by a scalar loop at the end.
//
// In all sections the IsUpper/IsLower/IsStrictly* traits of A (MT4) and B (MT5) are only used
// to narrow the inner index range [kbegin,kend).
// NOTE(review): presumably this skips elements of triangular operands that are known to be
// zero -- confirm against the trait definitions.
2672  template< typename MT3 // Type of the left-hand side target matrix
2673  , typename MT4 // Type of the left-hand side matrix operand
2674  , typename MT5 > // Type of the right-hand side matrix operand
2675  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2676  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2677  {
2678  const size_t M( A.rows() );
2679  const size_t N( B.columns() );
2680  const size_t K( A.columns() );
2681 
// A scalar tail loop is required unless both C and B are padded
2682  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
2683 
// End of the columns handled by SIMD code: N rounded down to a multiple of SIMDSIZE if a
// scalar tail is required (checked by the assertion below), N itself otherwise
2684  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2685  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2686 
// Column index shared by all following sections; each section continues where the previous
// one stopped
2687  size_t j( 0UL );
2688 
// Section 1: panels of 8 SIMD vectors, one row of C per iteration
2689  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2690  for( size_t i=0UL; i<M; ++i )
2691  {
2692  const size_t kbegin( ( IsUpper<MT4>::value )
2693  ?( ( IsLower<MT5>::value )
2694  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2695  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2696  :( IsLower<MT5>::value ? j : 0UL ) );
2697  const size_t kend( ( IsLower<MT4>::value )
2698  ?( ( IsUpper<MT5>::value )
2699  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
2700  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2701  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
2702 
// Seed the accumulators with the current content of C (addition assignment)
2703  SIMDType xmm1( (~C).load(i,j ) );
2704  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2705  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2706  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2707  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2708  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
2709  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
2710  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
2711 
2712  for( size_t k=kbegin; k<kend; ++k ) {
2713  const SIMDType a1( set( A(i,k) ) );
2714  xmm1 = xmm1 + a1 * B.load(k,j );
2715  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
2716  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
2717  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
2718  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
2719  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
2720  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
2721  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
2722  }
2723 
2724  (~C).store( i, j , xmm1 );
2725  (~C).store( i, j+SIMDSIZE , xmm2 );
2726  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2727  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2728  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2729  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
2730  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
2731  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
2732  }
2733  }
2734 
// Section 2: panels of 4 SIMD vectors, two rows of C per iteration
2735  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2736  {
2737  size_t i( 0UL );
2738 
2739  for( ; (i+2UL) <= M; i+=2UL )
2740  {
2741  const size_t kbegin( ( IsUpper<MT4>::value )
2742  ?( ( IsLower<MT5>::value )
2743  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2744  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2745  :( IsLower<MT5>::value ? j : 0UL ) );
2746  const size_t kend( ( IsLower<MT4>::value )
2747  ?( ( IsUpper<MT5>::value )
2748  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
2749  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2750  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
2751 
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1
2752  SIMDType xmm1( (~C).load(i ,j ) );
2753  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2754  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2755  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
2756  SIMDType xmm5( (~C).load(i+1UL,j ) );
2757  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
2758  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2759  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2760 
2761  for( size_t k=kbegin; k<kend; ++k ) {
2762  const SIMDType a1( set( A(i ,k) ) );
2763  const SIMDType a2( set( A(i+1UL,k) ) );
2764  const SIMDType b1( B.load(k,j ) );
2765  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2766  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2767  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2768  xmm1 = xmm1 + a1 * b1;
2769  xmm2 = xmm2 + a1 * b2;
2770  xmm3 = xmm3 + a1 * b3;
2771  xmm4 = xmm4 + a1 * b4;
2772  xmm5 = xmm5 + a2 * b1;
2773  xmm6 = xmm6 + a2 * b2;
2774  xmm7 = xmm7 + a2 * b3;
2775  xmm8 = xmm8 + a2 * b4;
2776  }
2777 
2778  (~C).store( i , j , xmm1 );
2779  (~C).store( i , j+SIMDSIZE , xmm2 );
2780  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2781  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2782  (~C).store( i+1UL, j , xmm5 );
2783  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
2784  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2785  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
2786  }
2787 
// Last row when M is odd
2788  if( i < M )
2789  {
2790  const size_t kbegin( ( IsUpper<MT4>::value )
2791  ?( ( IsLower<MT5>::value )
2792  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2793  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2794  :( IsLower<MT5>::value ? j : 0UL ) );
2795  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
2796 
2797  SIMDType xmm1( (~C).load(i,j ) );
2798  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2799  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2800  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2801 
2802  for( size_t k=kbegin; k<kend; ++k ) {
2803  const SIMDType a1( set( A(i,k) ) );
2804  xmm1 = xmm1 + a1 * B.load(k,j );
2805  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
2806  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
2807  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
2808  }
2809 
2810  (~C).store( i, j , xmm1 );
2811  (~C).store( i, j+SIMDSIZE , xmm2 );
2812  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2813  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2814  }
2815  }
2816 
// Section 3: panels of 2 SIMD vectors, two rows of C per iteration
2817  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2818  {
2819  size_t i( 0UL );
2820 
2821  for( ; (i+2UL) <= M; i+=2UL )
2822  {
2823  const size_t kbegin( ( IsUpper<MT4>::value )
2824  ?( ( IsLower<MT5>::value )
2825  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2826  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2827  :( IsLower<MT5>::value ? j : 0UL ) );
2828  const size_t kend( ( IsLower<MT4>::value )
2829  ?( ( IsUpper<MT5>::value )
2830  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
2831  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2832  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
2833 
// xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1
2834  SIMDType xmm1( (~C).load(i ,j ) );
2835  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2836  SIMDType xmm3( (~C).load(i+1UL,j ) );
2837  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2838 
2839  for( size_t k=kbegin; k<kend; ++k ) {
2840  const SIMDType a1( set( A(i ,k) ) );
2841  const SIMDType a2( set( A(i+1UL,k) ) );
2842  const SIMDType b1( B.load(k,j ) );
2843  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2844  xmm1 = xmm1 + a1 * b1;
2845  xmm2 = xmm2 + a1 * b2;
2846  xmm3 = xmm3 + a2 * b1;
2847  xmm4 = xmm4 + a2 * b2;
2848  }
2849 
2850  (~C).store( i , j , xmm1 );
2851  (~C).store( i , j+SIMDSIZE, xmm2 );
2852  (~C).store( i+1UL, j , xmm3 );
2853  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
2854  }
2855 
// Last row when M is odd
2856  if( i < M )
2857  {
2858  const size_t kbegin( ( IsUpper<MT4>::value )
2859  ?( ( IsLower<MT5>::value )
2860  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2861  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2862  :( IsLower<MT5>::value ? j : 0UL ) );
2863  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
2864 
2865  SIMDType xmm1( (~C).load(i,j ) );
2866  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
2867 
2868  for( size_t k=kbegin; k<kend; ++k ) {
2869  const SIMDType a1( set( A(i,k) ) );
2870  xmm1 = xmm1 + a1 * B.load(k,j );
2871  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
2872  }
2873 
2874  (~C).store( i, j , xmm1 );
2875  (~C).store( i, j+SIMDSIZE, xmm2 );
2876  }
2877  }
2878 
// Section 4: single SIMD vectors, two rows of C per iteration
2879  for( ; j<jpos; j+=SIMDSIZE )
2880  {
2881  size_t i( 0UL );
2882 
2883  for( ; (i+2UL) <= M; i+=2UL )
2884  {
2885  const size_t kbegin( ( IsUpper<MT4>::value )
2886  ?( ( IsLower<MT5>::value )
2887  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2888  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2889  :( IsLower<MT5>::value ? j : 0UL ) );
// Unlike the wider panels, the upper bound here depends on the structure of A only
2890  const size_t kend( ( IsLower<MT4>::value )
2891  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2892  :( K ) );
2893 
2894  SIMDType xmm1( (~C).load(i ,j) );
2895  SIMDType xmm2( (~C).load(i+1UL,j) );
2896 
2897  for( size_t k=kbegin; k<kend; ++k ) {
2898  const SIMDType b1( B.load(k,j) );
2899  xmm1 = xmm1 + set( A(i ,k) ) * b1;
2900  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
2901  }
2902 
2903  (~C).store( i , j, xmm1 );
2904  (~C).store( i+1UL, j, xmm2 );
2905  }
2906 
// Last row when M is odd; the inner loop runs up to K
2907  if( i < M )
2908  {
2909  const size_t kbegin( ( IsUpper<MT4>::value )
2910  ?( ( IsLower<MT5>::value )
2911  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2912  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2913  :( IsLower<MT5>::value ? j : 0UL ) );
2914 
2915  SIMDType xmm1( (~C).load(i,j) );
2916 
2917  for( size_t k=kbegin; k<K; ++k ) {
2918  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
2919  }
2920 
2921  (~C).store( i, j, xmm1 );
2922  }
2923  }
2924 
// Section 5: scalar tail for the columns [jpos,N), only executed if 'remainder' is set;
// two rows of C per iteration
2925  for( ; remainder && j<N; ++j )
2926  {
2927  size_t i( 0UL );
2928 
2929  for( ; (i+2UL) <= M; i+=2UL )
2930  {
2931  const size_t kbegin( ( IsUpper<MT4>::value )
2932  ?( ( IsLower<MT5>::value )
2933  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2934  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2935  :( IsLower<MT5>::value ? j : 0UL ) );
2936  const size_t kend( ( IsLower<MT4>::value )
2937  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2938  :( K ) );
2939 
2940  ElementType value1( (~C)(i ,j) );
2941  ElementType value2( (~C)(i+1UL,j) );;
2942 
2943  for( size_t k=kbegin; k<kend; ++k ) {
2944  value1 += A(i ,k) * B(k,j);
2945  value2 += A(i+1UL,k) * B(k,j);
2946  }
2947 
2948  (~C)(i ,j) = value1;
2949  (~C)(i+1UL,j) = value2;
2950  }
2951 
// Last row when M is odd; the inner loop runs up to K
2952  if( i < M )
2953  {
2954  const size_t kbegin( ( IsUpper<MT4>::value )
2955  ?( ( IsLower<MT5>::value )
2956  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2957  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2958  :( IsLower<MT5>::value ? j : 0UL ) );
2959 
2960  ElementType value( (~C)(i,j) );
2961 
2962  for( size_t k=kbegin; k<K; ++k ) {
2963  value += A(i,k) * B(k,j);
2964  }
2965 
2966  (~C)(i,j) = value;
2967  }
2968  }
2969  }
2971  //**********************************************************************************************
2972 
2973  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2988  template< typename MT3 // Type of the left-hand side target matrix
2989  , typename MT4 // Type of the left-hand side matrix operand
2990  , typename MT5 > // Type of the right-hand side matrix operand
2991  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2992  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2993  {
2994  const size_t M( A.rows() );
2995  const size_t N( B.columns() );
2996  const size_t K( A.columns() );
2997 
2998  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
2999 
3000  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3001  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3002 
3003  size_t i( 0UL );
3004 
3005  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3006  for( size_t j=0UL; j<N; ++j )
3007  {
3008  const size_t kbegin( ( IsLower<MT5>::value )
3009  ?( ( IsUpper<MT4>::value )
3010  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3011  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3012  :( IsUpper<MT4>::value ? i : 0UL ) );
3013  const size_t kend( ( IsUpper<MT5>::value )
3014  ?( ( IsLower<MT4>::value )
3015  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3016  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3017  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
3018 
3019  SIMDType xmm1( (~C).load(i ,j) );
3020  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3021  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3022  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3023  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3024  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3025  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3026  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3027 
3028  for( size_t k=kbegin; k<kend; ++k ) {
3029  const SIMDType b1( set( B(k,j) ) );
3030  xmm1 = xmm1 + A.load(i ,k) * b1;
3031  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
3032  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
3033  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
3034  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
3035  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
3036  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
3037  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
3038  }
3039 
3040  (~C).store( i , j, xmm1 );
3041  (~C).store( i+SIMDSIZE , j, xmm2 );
3042  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3043  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3044  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3045  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3046  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3047  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
3048  }
3049  }
3050 
3051  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3052  {
3053  size_t j( 0UL );
3054 
3055  for( ; (j+2UL) <= N; j+=2UL )
3056  {
3057  const size_t kbegin( ( IsLower<MT5>::value )
3058  ?( ( IsUpper<MT4>::value )
3059  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3060  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3061  :( IsUpper<MT4>::value ? i : 0UL ) );
3062  const size_t kend( ( IsUpper<MT5>::value )
3063  ?( ( IsLower<MT4>::value )
3064  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3065  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3066  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
3067 
3068  SIMDType xmm1( (~C).load(i ,j ) );
3069  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3070  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3071  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3072  SIMDType xmm5( (~C).load(i ,j+1UL) );
3073  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3074  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3075  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3076 
3077  for( size_t k=kbegin; k<kend; ++k ) {
3078  const SIMDType a1( A.load(i ,k) );
3079  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3080  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3081  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3082  const SIMDType b1( set( B(k,j ) ) );
3083  const SIMDType b2( set( B(k,j+1UL) ) );
3084  xmm1 = xmm1 + a1 * b1;
3085  xmm2 = xmm2 + a2 * b1;
3086  xmm3 = xmm3 + a3 * b1;
3087  xmm4 = xmm4 + a4 * b1;
3088  xmm5 = xmm5 + a1 * b2;
3089  xmm6 = xmm6 + a2 * b2;
3090  xmm7 = xmm7 + a3 * b2;
3091  xmm8 = xmm8 + a4 * b2;
3092  }
3093 
3094  (~C).store( i , j , xmm1 );
3095  (~C).store( i+SIMDSIZE , j , xmm2 );
3096  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3097  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3098  (~C).store( i , j+1UL, xmm5 );
3099  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3100  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3101  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3102  }
3103 
3104  if( j < N )
3105  {
3106  const size_t kbegin( ( IsLower<MT5>::value )
3107  ?( ( IsUpper<MT4>::value )
3108  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3109  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3110  :( IsUpper<MT4>::value ? i : 0UL ) );
3111  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3112 
3113  SIMDType xmm1( (~C).load(i ,j) );
3114  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3115  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3116  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3117 
3118  for( size_t k=kbegin; k<kend; ++k ) {
3119  const SIMDType b1( set( B(k,j) ) );
3120  xmm1 = xmm1 + A.load(i ,k) * b1;
3121  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
3122  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
3123  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
3124  }
3125 
3126  (~C).store( i , j, xmm1 );
3127  (~C).store( i+SIMDSIZE , j, xmm2 );
3128  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3129  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3130  }
3131  }
3132 
3133  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3134  {
3135  size_t j( 0UL );
3136 
3137  for( ; (j+2UL) <= N; j+=2UL )
3138  {
3139  const size_t kbegin( ( IsLower<MT5>::value )
3140  ?( ( IsUpper<MT4>::value )
3141  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3142  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3143  :( IsUpper<MT4>::value ? i : 0UL ) );
3144  const size_t kend( ( IsUpper<MT5>::value )
3145  ?( ( IsLower<MT4>::value )
3146  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3147  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3148  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
3149 
3150  SIMDType xmm1( (~C).load(i ,j ) );
3151  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3152  SIMDType xmm3( (~C).load(i ,j+1UL) );
3153  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3154 
3155  for( size_t k=kbegin; k<kend; ++k ) {
3156  const SIMDType a1( A.load(i ,k) );
3157  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3158  const SIMDType b1( set( B(k,j ) ) );
3159  const SIMDType b2( set( B(k,j+1UL) ) );
3160  xmm1 = xmm1 + a1 * b1;
3161  xmm2 = xmm2 + a2 * b1;
3162  xmm3 = xmm3 + a1 * b2;
3163  xmm4 = xmm4 + a2 * b2;
3164  }
3165 
3166  (~C).store( i , j , xmm1 );
3167  (~C).store( i+SIMDSIZE, j , xmm2 );
3168  (~C).store( i , j+1UL, xmm3 );
3169  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3170  }
3171 
3172  if( j < N )
3173  {
3174  const size_t kbegin( ( IsLower<MT5>::value )
3175  ?( ( IsUpper<MT4>::value )
3176  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3177  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3178  :( IsUpper<MT4>::value ? i : 0UL ) );
3179  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
3180 
3181  SIMDType xmm1( (~C).load(i ,j) );
3182  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3183 
3184  for( size_t k=kbegin; k<kend; ++k ) {
3185  const SIMDType b1( set( B(k,j) ) );
3186  xmm1 = xmm1 + A.load(i ,k) * b1;
3187  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
3188  }
3189 
3190  (~C).store( i , j, xmm1 );
3191  (~C).store( i+SIMDSIZE, j, xmm2 );
3192  }
3193  }
3194 
3195  for( ; i<ipos; i+=SIMDSIZE )
3196  {
3197  size_t j( 0UL );
3198 
3199  for( ; (j+2UL) <= N; j+=2UL )
3200  {
3201  const size_t kbegin( ( IsLower<MT5>::value )
3202  ?( ( IsUpper<MT4>::value )
3203  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3204  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3205  :( IsUpper<MT4>::value ? i : 0UL ) );
3206  const size_t kend( ( IsUpper<MT5>::value )
3207  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3208  :( K ) );
3209 
3210  SIMDType xmm1( (~C).load(i,j ) );
3211  SIMDType xmm2( (~C).load(i,j+1UL) );
3212 
3213  for( size_t k=kbegin; k<kend; ++k ) {
3214  const SIMDType a1( A.load(i,k) );
3215  xmm1 = xmm1 + a1 * set( B(k,j ) );
3216  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3217  }
3218 
3219  (~C).store( i, j , xmm1 );
3220  (~C).store( i, j+1UL, xmm2 );
3221  }
3222 
3223  if( j < N )
3224  {
3225  const size_t kbegin( ( IsLower<MT5>::value )
3226  ?( ( IsUpper<MT4>::value )
3227  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3228  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3229  :( IsUpper<MT4>::value ? i : 0UL ) );
3230 
3231  SIMDType xmm1( (~C).load(i,j) );
3232 
3233  for( size_t k=kbegin; k<K; ++k ) {
3234  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
3235  }
3236 
3237  (~C).store( i, j, xmm1 );
3238  }
3239  }
3240 
3241  for( ; remainder && i<M; ++i )
3242  {
3243  size_t j( 0UL );
3244 
3245  for( ; (j+2UL) <= N; j+=2UL )
3246  {
3247  const size_t kbegin( ( IsLower<MT5>::value )
3248  ?( ( IsUpper<MT4>::value )
3249  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3250  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3251  :( IsUpper<MT4>::value ? i : 0UL ) );
3252  const size_t kend( ( IsUpper<MT5>::value )
3253  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3254  :( K ) );
3255 
3256  ElementType value1( (~C)(i,j ) );
3257  ElementType value2( (~C)(i,j+1UL) );
3258 
3259  for( size_t k=kbegin; k<kend; ++k ) {
3260  value1 += A(i,k) * B(k,j );
3261  value2 += A(i,k) * B(k,j+1UL);
3262  }
3263 
3264  (~C)(i,j ) = value1;
3265  (~C)(i,j+1UL) = value2;
3266  }
3267 
3268  if( j < N )
3269  {
3270  const size_t kbegin( ( IsLower<MT5>::value )
3271  ?( ( IsUpper<MT4>::value )
3272  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3273  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3274  :( IsUpper<MT4>::value ? i : 0UL ) );
3275 
3276  ElementType value( (~C)(i,j) );
3277 
3278  for( size_t k=kbegin; k<K; ++k ) {
3279  value += A(i,k) * B(k,j);
3280  }
3281 
3282  (~C)(i,j) = value;
3283  }
3284  }
3285  }
3287  //**********************************************************************************************
3288 
3289  //**Default addition assignment to dense matrices (large matrices)******************************
3303  template< typename MT3 // Type of the left-hand side target matrix
3304  , typename MT4 // Type of the left-hand side matrix operand
3305  , typename MT5 > // Type of the right-hand side matrix operand
3306  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3307  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3308  {
3309  selectDefaultAddAssignKernel( C, A, B );
3310  }
3312  //**********************************************************************************************
3313 
3314  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
// Vectorized large-matrix kernel for the addition assignment C += A * B with a row-major
// target matrix. The overload is only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// is satisfied.
//
//   C  Target matrix; accessed via ~C. Its elements are loaded, accumulated into and stored back.
//   A  Left-hand side operand; single elements A(i,k) are broadcast with set().
//   B  Right-hand side operand; read with SIMD loads B.load(k,j).
//
// The j/i/k iteration space is tiled with the DMATDMATMULT_DEFAULT_{J,I,K}BLOCK_SIZE constants.
// Inside a tile the columns are consumed in strips of 4, 2 and 1 SIMD vectors; a scalar loop
// handles the columns left over after the SIMD strips when 'remainder' is set.
// For every (i,j) strip the k range is clipped to the current k block and additionally
// narrowed via the IsUpper/IsLower traits of A and B.
3329  template< typename MT3 // Type of the left-hand side target matrix
3330  , typename MT4 // Type of the left-hand side matrix operand
3331  , typename MT5 > // Type of the right-hand side matrix operand
3332  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3333  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3334  {
3335  const size_t M( A.rows() );
3336  const size_t N( B.columns() );
3337  const size_t K( A.columns() );
3338 
      // The scalar clean-up loop below only runs if C or B is not padded.
3339  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
3340 
      // Outermost tiling: blocks of columns [jj,jend).
3341  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
3342  {
3343  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
3344 
      // jpos: end of the columns handled by the SIMD strips. With 'remainder' set, jend is
      // rounded down to a multiple of SIMDSIZE (this is what the assertion checks).
3345  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3346  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3347 
      // Blocks of rows [ii,iend).
3348  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
3349  {
3350  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
3351 
      // Blocks of the inner dimension [kk,ktmp).
3352  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
3353  {
3354  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
3355 
      // j is shared by all strip loops below; each loop continues where the previous one stopped.
3356  size_t j( jj );
3357 
      // Strips of four SIMD vectors (column offsets j, j1, j2, j3).
3358  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3359  {
3360  const size_t j1( j+SIMDSIZE );
3361  const size_t j2( j+SIMDSIZE*2UL );
3362  const size_t j3( j+SIMDSIZE*3UL );
3363 
3364  size_t i( ii );
3365 
      // Two rows per iteration: eight accumulators xmm1..xmm8.
3366  for( ; (i+2UL) <= iend; i+=2UL )
3367  {
      // [kbegin,kend): current k block, further narrowed by the IsUpper/IsLower traits of A and B.
3368  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3369  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3370  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3371  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
3372 
      // Start from the current contents of C (addition assignment).
3373  SIMDType xmm1( (~C).load(i ,j ) );
3374  SIMDType xmm2( (~C).load(i ,j1) );
3375  SIMDType xmm3( (~C).load(i ,j2) );
3376  SIMDType xmm4( (~C).load(i ,j3) );
3377  SIMDType xmm5( (~C).load(i+1UL,j ) );
3378  SIMDType xmm6( (~C).load(i+1UL,j1) );
3379  SIMDType xmm7( (~C).load(i+1UL,j2) );
3380  SIMDType xmm8( (~C).load(i+1UL,j3) );
3381 
3382  for( size_t k=kbegin; k<kend; ++k ) {
3383  const SIMDType a1( set( A(i ,k) ) );
3384  const SIMDType a2( set( A(i+1UL,k) ) );
3385  const SIMDType b1( B.load(k,j ) );
3386  const SIMDType b2( B.load(k,j1) );
3387  const SIMDType b3( B.load(k,j2) );
3388  const SIMDType b4( B.load(k,j3) );
3389  xmm1 = xmm1 + a1 * b1;
3390  xmm2 = xmm2 + a1 * b2;
3391  xmm3 = xmm3 + a1 * b3;
3392  xmm4 = xmm4 + a1 * b4;
3393  xmm5 = xmm5 + a2 * b1;
3394  xmm6 = xmm6 + a2 * b2;
3395  xmm7 = xmm7 + a2 * b3;
3396  xmm8 = xmm8 + a2 * b4;
3397  }
3398 
3399  (~C).store( i , j , xmm1 );
3400  (~C).store( i , j1, xmm2 );
3401  (~C).store( i , j2, xmm3 );
3402  (~C).store( i , j3, xmm4 );
3403  (~C).store( i+1UL, j , xmm5 );
3404  (~C).store( i+1UL, j1, xmm6 );
3405  (~C).store( i+1UL, j2, xmm7 );
3406  (~C).store( i+1UL, j3, xmm8 );
3407  }
3408 
      // Single leftover row of the row block.
3409  if( i < iend )
3410  {
3411  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3412  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3413  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3414  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
3415 
3416  SIMDType xmm1( (~C).load(i,j ) );
3417  SIMDType xmm2( (~C).load(i,j1) );
3418  SIMDType xmm3( (~C).load(i,j2) );
3419  SIMDType xmm4( (~C).load(i,j3) );
3420 
3421  for( size_t k=kbegin; k<kend; ++k ) {
3422  const SIMDType a1( set( A(i,k) ) );
3423  xmm1 = xmm1 + a1 * B.load(k,j );
3424  xmm2 = xmm2 + a1 * B.load(k,j1);
3425  xmm3 = xmm3 + a1 * B.load(k,j2);
3426  xmm4 = xmm4 + a1 * B.load(k,j3);
3427  }
3428 
3429  (~C).store( i, j , xmm1 );
3430  (~C).store( i, j1, xmm2 );
3431  (~C).store( i, j2, xmm3 );
3432  (~C).store( i, j3, xmm4 );
3433  }
3434  }
3435 
      // Strips of two SIMD vectors (column offsets j, j1).
3436  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3437  {
3438  const size_t j1( j+SIMDSIZE );
3439 
3440  size_t i( ii );
3441 
      // Four rows per iteration.
3442  for( ; (i+4UL) <= iend; i+=4UL )
3443  {
3444  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3445  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3446  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3447  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3448 
3449  SIMDType xmm1( (~C).load(i ,j ) );
3450  SIMDType xmm2( (~C).load(i ,j1) );
3451  SIMDType xmm3( (~C).load(i+1UL,j ) );
3452  SIMDType xmm4( (~C).load(i+1UL,j1) );
3453  SIMDType xmm5( (~C).load(i+2UL,j ) );
3454  SIMDType xmm6( (~C).load(i+2UL,j1) );
3455  SIMDType xmm7( (~C).load(i+3UL,j ) );
3456  SIMDType xmm8( (~C).load(i+3UL,j1) );
3457 
3458  for( size_t k=kbegin; k<kend; ++k ) {
3459  const SIMDType a1( set( A(i ,k) ) );
3460  const SIMDType a2( set( A(i+1UL,k) ) );
3461  const SIMDType a3( set( A(i+2UL,k) ) );
3462  const SIMDType a4( set( A(i+3UL,k) ) );
3463  const SIMDType b1( B.load(k,j ) );
3464  const SIMDType b2( B.load(k,j1) );
3465  xmm1 = xmm1 + a1 * b1;
3466  xmm2 = xmm2 + a1 * b2;
3467  xmm3 = xmm3 + a2 * b1;
3468  xmm4 = xmm4 + a2 * b2;
3469  xmm5 = xmm5 + a3 * b1;
3470  xmm6 = xmm6 + a3 * b2;
3471  xmm7 = xmm7 + a4 * b1;
3472  xmm8 = xmm8 + a4 * b2;
3473  }
3474 
3475  (~C).store( i , j , xmm1 );
3476  (~C).store( i , j1, xmm2 );
3477  (~C).store( i+1UL, j , xmm3 );
3478  (~C).store( i+1UL, j1, xmm4 );
3479  (~C).store( i+2UL, j , xmm5 );
3480  (~C).store( i+2UL, j1, xmm6 );
3481  (~C).store( i+3UL, j , xmm7 );
3482  (~C).store( i+3UL, j1, xmm8 );
3483  }
3484 
      // Two rows per iteration.
3485  for( ; (i+2UL) <= iend; i+=2UL )
3486  {
3487  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3488  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3489  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3490  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3491 
3492  SIMDType xmm1( (~C).load(i ,j ) );
3493  SIMDType xmm2( (~C).load(i ,j1) );
3494  SIMDType xmm3( (~C).load(i+1UL,j ) );
3495  SIMDType xmm4( (~C).load(i+1UL,j1) );
3496 
3497  for( size_t k=kbegin; k<kend; ++k ) {
3498  const SIMDType a1( set( A(i ,k) ) );
3499  const SIMDType a2( set( A(i+1UL,k) ) );
3500  const SIMDType b1( B.load(k,j ) );
3501  const SIMDType b2( B.load(k,j1) );
3502  xmm1 = xmm1 + a1 * b1;
3503  xmm2 = xmm2 + a1 * b2;
3504  xmm3 = xmm3 + a2 * b1;
3505  xmm4 = xmm4 + a2 * b2;
3506  }
3507 
3508  (~C).store( i , j , xmm1 );
3509  (~C).store( i , j1, xmm2 );
3510  (~C).store( i+1UL, j , xmm3 );
3511  (~C).store( i+1UL, j1, xmm4 );
3512  }
3513 
      // Single leftover row of the row block.
3514  if( i < iend )
3515  {
3516  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3517  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3518  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3519  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3520 
3521  SIMDType xmm1( (~C).load(i,j ) );
3522  SIMDType xmm2( (~C).load(i,j1) );
3523 
3524  for( size_t k=kbegin; k<kend; ++k ) {
3525  const SIMDType a1( set( A(i,k) ) );
3526  xmm1 = xmm1 + a1 * B.load(k,j );
3527  xmm2 = xmm2 + a1 * B.load(k,j1);
3528  }
3529 
3530  (~C).store( i, j , xmm1 );
3531  (~C).store( i, j1, xmm2 );
3532  }
3533  }
3534 
      // Strips of a single SIMD vector, one row at a time.
3535  for( ; j<jpos; j+=SIMDSIZE )
3536  {
3537  for( size_t i=ii; i<iend; ++i )
3538  {
3539  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3540  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3541  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3542  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
3543 
3544  SIMDType xmm1( (~C).load(i,j) );
3545 
3546  for( size_t k=kbegin; k<kend; ++k ) {
3547  const SIMDType a1( set( A(i,k) ) );
3548  xmm1 = xmm1 + a1 * B.load(k,j);
3549  }
3550 
3551  (~C).store( i, j, xmm1 );
3552  }
3553  }
3554 
      // Scalar processing of the columns left over after the SIMD strips; skipped entirely
      // unless 'remainder' is set.
3555  for( ; remainder && j<jend; ++j )
3556  {
3557  for( size_t i=ii; i<iend; ++i )
3558  {
3559  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3560  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3561  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3562  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
3563 
3564  ElementType value( (~C)(i,j) );
3565 
3566  for( size_t k=kbegin; k<kend; ++k ) {
3567  value += A(i,k) * B(k,j);
3568  }
3569 
3570  (~C)(i,j) = value;
3571  }
3572  }
3573  }
3574  }
3575  }
3576  }
3578  //**********************************************************************************************
3579 
3580  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
// Vectorized large-matrix kernel for the addition assignment C += A * B with a column-major
// target matrix. The overload is only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// is satisfied.
//
//   C  Target matrix; accessed via ~C. Its elements are loaded, accumulated into and stored back.
//   A  Left-hand side operand; read with SIMD loads A.load(i,k).
//   B  Right-hand side operand; single elements B(k,j) are broadcast with set().
//
// The i/j/k iteration space is tiled with the TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE constants.
// Inside a tile the rows are consumed in strips of 4, 2 and 1 SIMD vectors; a scalar loop
// handles the rows left over after the SIMD strips when 'remainder' is set.
// For every (i,j) strip the k range is clipped to the current k block and additionally
// narrowed via the IsUpper/IsLower traits of A and B.
3595  template< typename MT3 // Type of the left-hand side target matrix
3596  , typename MT4 // Type of the left-hand side matrix operand
3597  , typename MT5 > // Type of the right-hand side matrix operand
3598  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3599  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3600  {
3601  const size_t M( A.rows() );
3602  const size_t N( B.columns() );
3603  const size_t K( A.columns() );
3604 
      // The scalar clean-up loop below only runs if C or A is not padded.
3605  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3606 
      // Outermost tiling: blocks of rows [ii,iend).
3607  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
3608  {
3609  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
3610 
      // ipos: end of the rows handled by the SIMD strips. With 'remainder' set, iend is
      // rounded down to a multiple of SIMDSIZE (this is what the assertion checks).
3611  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3612  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3613 
      // Blocks of columns [jj,jend).
3614  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
3615  {
3616  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
3617 
      // Blocks of the inner dimension [kk,ktmp).
3618  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
3619  {
3620  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
3621 
      // i is shared by all strip loops below; each loop continues where the previous one stopped.
3622  size_t i( ii );
3623 
      // Strips of four SIMD vectors (row offsets i, i1, i2, i3).
3624  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3625  {
3626  const size_t i1( i+SIMDSIZE );
3627  const size_t i2( i+SIMDSIZE*2UL );
3628  const size_t i3( i+SIMDSIZE*3UL );
3629 
3630  size_t j( jj );
3631 
      // Two columns per iteration: eight accumulators xmm1..xmm8.
3632  for( ; (j+2UL) <= jend; j+=2UL )
3633  {
      // [kbegin,kend): current k block, further narrowed by the IsUpper/IsLower traits of A and B.
3634  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3635  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3636  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
3637  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3638 
      // Start from the current contents of C (addition assignment).
3639  SIMDType xmm1( (~C).load(i ,j ) );
3640  SIMDType xmm2( (~C).load(i1,j ) );
3641  SIMDType xmm3( (~C).load(i2,j ) );
3642  SIMDType xmm4( (~C).load(i3,j ) );
3643  SIMDType xmm5( (~C).load(i ,j+1UL) );
3644  SIMDType xmm6( (~C).load(i1,j+1UL) );
3645  SIMDType xmm7( (~C).load(i2,j+1UL) );
3646  SIMDType xmm8( (~C).load(i3,j+1UL) );
3647 
3648  for( size_t k=kbegin; k<kend; ++k ) {
3649  const SIMDType a1( A.load(i ,k) );
3650  const SIMDType a2( A.load(i1,k) );
3651  const SIMDType a3( A.load(i2,k) );
3652  const SIMDType a4( A.load(i3,k) );
3653  const SIMDType b1( set( B(k,j ) ) );
3654  const SIMDType b2( set( B(k,j+1UL) ) );
3655  xmm1 = xmm1 + a1 * b1;
3656  xmm2 = xmm2 + a2 * b1;
3657  xmm3 = xmm3 + a3 * b1;
3658  xmm4 = xmm4 + a4 * b1;
3659  xmm5 = xmm5 + a1 * b2;
3660  xmm6 = xmm6 + a2 * b2;
3661  xmm7 = xmm7 + a3 * b2;
3662  xmm8 = xmm8 + a4 * b2;
3663  }
3664 
3665  (~C).store( i , j , xmm1 );
3666  (~C).store( i1, j , xmm2 );
3667  (~C).store( i2, j , xmm3 );
3668  (~C).store( i3, j , xmm4 );
3669  (~C).store( i , j+1UL, xmm5 );
3670  (~C).store( i1, j+1UL, xmm6 );
3671  (~C).store( i2, j+1UL, xmm7 );
3672  (~C).store( i3, j+1UL, xmm8 );
3673  }
3674 
      // Single leftover column of the column block.
3675  if( j < jend )
3676  {
3677  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3678  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3679  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
3680  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3681 
3682  SIMDType xmm1( (~C).load(i ,j) );
3683  SIMDType xmm2( (~C).load(i1,j) );
3684  SIMDType xmm3( (~C).load(i2,j) );
3685  SIMDType xmm4( (~C).load(i3,j) );
3686 
3687  for( size_t k=kbegin; k<kend; ++k ) {
3688  const SIMDType b1( set( B(k,j) ) );
3689  xmm1 = xmm1 + A.load(i ,k) * b1;
3690  xmm2 = xmm2 + A.load(i1,k) * b1;
3691  xmm3 = xmm3 + A.load(i2,k) * b1;
3692  xmm4 = xmm4 + A.load(i3,k) * b1;
3693  }
3694 
3695  (~C).store( i , j, xmm1 );
3696  (~C).store( i1, j, xmm2 );
3697  (~C).store( i2, j, xmm3 );
3698  (~C).store( i3, j, xmm4 );
3699  }
3700  }
3701 
      // Strips of two SIMD vectors (row offsets i, i1).
3702  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3703  {
3704  const size_t i1( i+SIMDSIZE );
3705 
3706  size_t j( jj );
3707 
      // Four columns per iteration.
3708  for( ; (j+4UL) <= jend; j+=4UL )
3709  {
3710  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3711  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3712  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3713  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3714 
3715  SIMDType xmm1( (~C).load(i ,j ) );
3716  SIMDType xmm2( (~C).load(i1,j ) );
3717  SIMDType xmm3( (~C).load(i ,j+1UL) );
3718  SIMDType xmm4( (~C).load(i1,j+1UL) );
3719  SIMDType xmm5( (~C).load(i ,j+2UL) );
3720  SIMDType xmm6( (~C).load(i1,j+2UL) );
3721  SIMDType xmm7( (~C).load(i ,j+3UL) );
3722  SIMDType xmm8( (~C).load(i1,j+3UL) );
3723 
3724  for( size_t k=kbegin; k<kend; ++k ) {
3725  const SIMDType a1( A.load(i ,k) );
3726  const SIMDType a2( A.load(i1,k) );
3727  const SIMDType b1( set( B(k,j ) ) );
3728  const SIMDType b2( set( B(k,j+1UL) ) );
3729  const SIMDType b3( set( B(k,j+2UL) ) );
3730  const SIMDType b4( set( B(k,j+3UL) ) );
3731  xmm1 = xmm1 + a1 * b1;
3732  xmm2 = xmm2 + a2 * b1;
3733  xmm3 = xmm3 + a1 * b2;
3734  xmm4 = xmm4 + a2 * b2;
3735  xmm5 = xmm5 + a1 * b3;
3736  xmm6 = xmm6 + a2 * b3;
3737  xmm7 = xmm7 + a1 * b4;
3738  xmm8 = xmm8 + a2 * b4;
3739  }
3740 
3741  (~C).store( i , j , xmm1 );
3742  (~C).store( i1, j , xmm2 );
3743  (~C).store( i , j+1UL, xmm3 );
3744  (~C).store( i1, j+1UL, xmm4 );
3745  (~C).store( i , j+2UL, xmm5 );
3746  (~C).store( i1, j+2UL, xmm6 );
3747  (~C).store( i , j+3UL, xmm7 );
3748  (~C).store( i1, j+3UL, xmm8 );
3749  }
3750 
      // Two columns per iteration.
3751  for( ; (j+2UL) <= jend; j+=2UL )
3752  {
3753  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3754  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3755  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3756  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3757 
3758  SIMDType xmm1( (~C).load(i ,j ) );
3759  SIMDType xmm2( (~C).load(i1,j ) );
3760  SIMDType xmm3( (~C).load(i ,j+1UL) );
3761  SIMDType xmm4( (~C).load(i1,j+1UL) );
3762 
3763  for( size_t k=kbegin; k<kend; ++k ) {
3764  const SIMDType a1( A.load(i ,k) );
3765  const SIMDType a2( A.load(i1,k) );
3766  const SIMDType b1( set( B(k,j ) ) );
3767  const SIMDType b2( set( B(k,j+1UL) ) );
3768  xmm1 = xmm1 + a1 * b1;
3769  xmm2 = xmm2 + a2 * b1;
3770  xmm3 = xmm3 + a1 * b2;
3771  xmm4 = xmm4 + a2 * b2;
3772  }
3773 
3774  (~C).store( i , j , xmm1 );
3775  (~C).store( i1, j , xmm2 );
3776  (~C).store( i , j+1UL, xmm3 );
3777  (~C).store( i1, j+1UL, xmm4 );
3778  }
3779 
      // Single leftover column of the column block.
3780  if( j < jend )
3781  {
3782  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3783  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3784  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3785  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3786 
3787  SIMDType xmm1( (~C).load(i ,j) );
3788  SIMDType xmm2( (~C).load(i1,j) );
3789 
3790  for( size_t k=kbegin; k<kend; ++k ) {
3791  const SIMDType b1( set( B(k,j) ) );
3792  xmm1 = xmm1 + A.load(i ,k) * b1;
3793  xmm2 = xmm2 + A.load(i1,k) * b1;
3794  }
3795 
3796  (~C).store( i , j, xmm1 );
3797  (~C).store( i1, j, xmm2 );
3798  }
3799  }
3800 
      // Strips of a single SIMD vector, one column at a time.
3801  for( ; i<ipos; i+=SIMDSIZE )
3802  {
3803  for( size_t j=jj; j<jend; ++j )
3804  {
3805  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3806  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3807  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
3808  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3809 
3810  SIMDType xmm1( (~C).load(i,j) );
3811 
3812  for( size_t k=kbegin; k<kend; ++k ) {
3813  const SIMDType b1( set( B(k,j) ) );
3814  xmm1 = xmm1 + A.load(i,k) * b1;
3815  }
3816 
3817  (~C).store( i, j, xmm1 );
3818  }
3819  }
3820 
      // Scalar processing of the rows left over after the SIMD strips; skipped entirely
      // unless 'remainder' is set.
3821  for( ; remainder && i<iend; ++i )
3822  {
3823  for( size_t j=jj; j<jend; ++j )
3824  {
3825  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3826  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3827  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
3828  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3829 
3830  ElementType value( (~C)(i,j) );
3831 
3832  for( size_t k=kbegin; k<kend; ++k ) {
3833  value += A(i,k) * B(k,j);
3834  }
3835 
3836  (~C)(i,j) = value;
3837  }
3838  }
3839  }
3840  }
3841  }
3842  }
3844  //**********************************************************************************************
3845 
3846  //**BLAS-based addition assignment to dense matrices (default)**********************************
3860  template< typename MT3 // Type of the left-hand side target matrix
3861  , typename MT4 // Type of the left-hand side matrix operand
3862  , typename MT5 > // Type of the right-hand side matrix operand
3863  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
3864  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3865  {
3866  selectLargeAddAssignKernel( C, A, B );
3867  }
3869  //**********************************************************************************************
3870 
3871  //**BLAS-based addition assignment to dense matrices********************************************
3872 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
3873 
3886  template< typename MT3 // Type of the left-hand side target matrix
3887  , typename MT4 // Type of the left-hand side matrix operand
3888  , typename MT5 > // Type of the right-hand side matrix operand
3889  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
3890  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3891  {
3892  typedef ElementType_<MT3> ET;
3893 
3894  if( IsTriangular<MT4>::value ) {
3895  ResultType_<MT3> tmp( serial( B ) );
3896  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3897  addAssign( C, tmp );
3898  }
3899  else if( IsTriangular<MT5>::value ) {
3900  ResultType_<MT3> tmp( serial( A ) );
3901  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3902  addAssign( C, tmp );
3903  }
3904  else {
3905  gemm( C, A, B, ET(1), ET(1) );
3906  }
3907  }
3909 #endif
3910  //**********************************************************************************************
3911 
3912  //**Addition assignment to sparse matrices******************************************************
3913  // No special implementation for the addition assignment to sparse matrices.
3914  //**********************************************************************************************
3915 
3916  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of the multiplication expression 'rhs' to the dense matrix 'lhs'.
//
//   lhs  The target dense matrix (accessed via ~lhs); it must have the dimensions of 'rhs'.
//   rhs  The multiplication expression whose operands are rhs.lhs_ and rhs.rhs_.
//
// Both operands are evaluated with serial() into A and B, and the actual work is delegated to
// selectSubAssignKernel(). The function returns early without touching 'lhs' if the target
// has no rows or no columns, or if the left operand has no columns.
3929  template< typename MT // Type of the target dense matrix
3930  , bool SO > // Storage order of the target dense matrix
3931  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
3932  {
3934 
      // The target must already have the dimensions of the expression.
3935  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3936  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3937 
      // Nothing to do for an empty target or an empty inner dimension.
3938  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3939  return;
3940  }
3941 
3942  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3943  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3944 
      // The evaluated operands must keep the dimensions of the original operands and match the target.
3945  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3946  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3947  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3948  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3949  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3950  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3951 
      // Dispatch to the kernel selection for the subtraction assignment.
3952  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3953  }
3955  //**********************************************************************************************
3956 
3957  //**Subtraction assignment to dense matrices (kernel selection)*********************************
3968  template< typename MT3 // Type of the left-hand side target matrix
3969  , typename MT4 // Type of the left-hand side matrix operand
3970  , typename MT5 > // Type of the right-hand side matrix operand
3971  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3972  {
3973  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
3974  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
3975  selectSmallSubAssignKernel( C, A, B );
3976  else
3977  selectBlasSubAssignKernel( C, A, B );
3978  }
3980  //**********************************************************************************************
3981 
3982  //**Default subtraction assignment to row-major dense matrices (general/general)****************
3996  template< typename MT3 // Type of the left-hand side target matrix
3997  , typename MT4 // Type of the left-hand side matrix operand
3998  , typename MT5 > // Type of the right-hand side matrix operand
3999  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4000  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4001  {
4002  const size_t M( A.rows() );
4003  const size_t N( B.columns() );
4004  const size_t K( A.columns() );
4005 
4006  for( size_t i=0UL; i<M; ++i )
4007  {
4008  const size_t kbegin( ( IsUpper<MT4>::value )
4009  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4010  :( 0UL ) );
4011  const size_t kend( ( IsLower<MT4>::value )
4012  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4013  :( K ) );
4014  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4015 
4016  for( size_t k=kbegin; k<kend; ++k )
4017  {
4018  const size_t jbegin( ( IsUpper<MT5>::value )
4019  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4020  :( 0UL ) );
4021  const size_t jend( ( IsLower<MT5>::value )
4022  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
4023  :( N ) );
4024  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4025 
4026  const size_t jnum( jend - jbegin );
4027  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4028 
4029  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4030  (~C)(i,j ) -= A(i,k) * B(k,j );
4031  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
4032  }
4033  if( jpos < jend ) {
4034  (~C)(i,jpos) -= A(i,k) * B(k,jpos);
4035  }
4036  }
4037  }
4038  }
4040  //**********************************************************************************************
4041 
4042  //**Default subtraction assignment to column-major dense matrices (general/general)*************
4056  template< typename MT3 // Type of the left-hand side target matrix
4057  , typename MT4 // Type of the left-hand side matrix operand
4058  , typename MT5 > // Type of the right-hand side matrix operand
4059  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4060  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4061  {
4062  const size_t M( A.rows() );
4063  const size_t N( B.columns() );
4064  const size_t K( A.columns() );
4065 
4066  for( size_t j=0UL; j<N; ++j )
4067  {
4068  const size_t kbegin( ( IsLower<MT5>::value )
4069  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4070  :( 0UL ) );
4071  const size_t kend( ( IsUpper<MT5>::value )
4072  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4073  :( K ) );
4074  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4075 
4076  for( size_t k=kbegin; k<kend; ++k )
4077  {
4078  const size_t ibegin( ( IsLower<MT4>::value )
4079  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4080  :( 0UL ) );
4081  const size_t iend( ( IsUpper<MT4>::value )
4082  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
4083  :( M ) );
4084  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4085 
4086  const size_t inum( iend - ibegin );
4087  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4088 
4089  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4090  (~C)(i ,j) -= A(i ,k) * B(k,j);
4091  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
4092  }
4093  if( ipos < iend ) {
4094  (~C)(ipos,j) -= A(ipos,k) * B(k,j);
4095  }
4096  }
4097  }
4098  }
4100  //**********************************************************************************************
4101 
4102  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
4116  template< typename MT3 // Type of the left-hand side target matrix
4117  , typename MT4 // Type of the left-hand side matrix operand
4118  , typename MT5 > // Type of the right-hand side matrix operand
4119  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4120  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4121  {
4122  const size_t M( A.rows() );
4123  const size_t N( B.columns() );
4124 
4125  const size_t block( BLOCK_SIZE );
4126 
4127  for( size_t ii=0UL; ii<M; ii+=block ) {
4128  const size_t iend( min( M, ii+block ) );
4129  for( size_t jj=0UL; jj<N; jj+=block ) {
4130  const size_t jend( min( N, jj+block ) );
4131  for( size_t i=ii; i<iend; ++i )
4132  {
4133  const size_t jbegin( ( IsUpper<MT4>::value )
4134  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
4135  :( jj ) );
4136  const size_t jpos( ( IsLower<MT4>::value )
4137  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
4138  :( jend ) );
4139 
4140  for( size_t j=jbegin; j<jpos; ++j ) {
4141  (~C)(i,j) -= A(i,j) * B(j,j);
4142  }
4143  }
4144  }
4145  }
4146  }
4148  //**********************************************************************************************
4149 
4150  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
4164  template< typename MT3 // Type of the left-hand side target matrix
4165  , typename MT4 // Type of the left-hand side matrix operand
4166  , typename MT5 > // Type of the right-hand side matrix operand
4167  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4168  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4169  {
      // Default (scalar) kernel for C -= A * B with a column-major target C, enabled only when A
      // is not diagonal and B is diagonal. Each column j of C is updated with
      // C(i,j) -= A(i,j) * B(j,j).
      //   C  target matrix, modified in place
      //   A  left-hand side operand (non-diagonal)
      //   B  right-hand side operand (diagonal)
4170  const size_t M( A.rows() );
4171  const size_t N( B.columns() );
4172 
4173  for( size_t j=0UL; j<N; ++j )
4174  {
      // Row range of column j: restricted to the diagonal side for a lower/upper A (the strict
      // variants exclude the diagonal element), otherwise the full range [0,M).
4175  const size_t ibegin( ( IsLower<MT4>::value )
4176  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4177  :( 0UL ) );
4178  const size_t iend( ( IsUpper<MT4>::value )
4179  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
4180  :( M ) );
4181  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4182 
      // ipos marks the end of the part of the range that can be processed two rows at a time:
      // masking with size_t(-2) clears the lowest bit, i.e. rounds inum down to an even count.
4183  const size_t inum( iend - ibegin );
4184  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4185 
      // Main loop, manually unrolled by two.
4186  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4187  (~C)(i ,j) -= A(i ,j) * B(j,j);
4188  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
4189  }
      // Single leftover row in case of an odd number of rows.
4190  if( ipos < iend ) {
4191  (~C)(ipos,j) -= A(ipos,j) * B(j,j);
4192  }
4193  }
4194  }
4196  //**********************************************************************************************
4197 
4198  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
4212  template< typename MT3 // Type of the left-hand side target matrix
4213  , typename MT4 // Type of the left-hand side matrix operand
4214  , typename MT5 > // Type of the right-hand side matrix operand
4215  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4216  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4217  {
      // Default (scalar) kernel for C -= A * B with a row-major target C, enabled only when A is
      // diagonal and B is not diagonal. Each row i of C is updated with
      // C(i,j) -= A(i,i) * B(i,j).
      //   C  target matrix, modified in place
      //   A  left-hand side operand (diagonal)
      //   B  right-hand side operand (non-diagonal)
4218  const size_t M( A.rows() );
4219  const size_t N( B.columns() );
4220 
4221  for( size_t i=0UL; i<M; ++i )
4222  {
      // Column range of row i: restricted to the diagonal side for an upper/lower B (the strict
      // variants exclude the diagonal element), otherwise the full range [0,N).
4223  const size_t jbegin( ( IsUpper<MT5>::value )
4224  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4225  :( 0UL ) );
4226  const size_t jend( ( IsLower<MT5>::value )
4227  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
4228  :( N ) );
4229  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4230 
      // jpos marks the end of the part of the range that can be processed two columns at a time:
      // masking with size_t(-2) clears the lowest bit, i.e. rounds jnum down to an even count.
4231  const size_t jnum( jend - jbegin );
4232  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4233 
      // Main loop, manually unrolled by two.
4234  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4235  (~C)(i,j ) -= A(i,i) * B(i,j );
4236  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
4237  }
      // Single leftover column in case of an odd number of columns.
4238  if( jpos < jend ) {
4239  (~C)(i,jpos) -= A(i,i) * B(i,jpos);
4240  }
4241  }
4242  }
4244  //**********************************************************************************************
4245 
4246  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
4260  template< typename MT3 // Type of the left-hand side target matrix
4261  , typename MT4 // Type of the left-hand side matrix operand
4262  , typename MT5 > // Type of the right-hand side matrix operand
4263  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4264  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4265  {
      // Default (scalar) kernel for C -= A * B with a column-major target C, enabled only when A
      // is diagonal and B is not diagonal. Since only A(i,i) is used from A, every update reduces
      // to C(i,j) -= A(i,i) * B(i,j).
      //   C  target matrix, modified in place
      //   A  left-hand side operand (diagonal)
      //   B  right-hand side operand (non-diagonal)
      // The iteration space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements
      // (presumably for cache locality -- TODO confirm against the blocking configuration).
4266  const size_t M( A.rows() );
4267  const size_t N( B.columns() );
4268 
4269  const size_t block( BLOCK_SIZE );
4270 
4271  for( size_t jj=0UL; jj<N; jj+=block ) {
4272  const size_t jend( min( N, jj+block ) );
4273  for( size_t ii=0UL; ii<M; ii+=block ) {
4274  const size_t iend( min( M, ii+block ) );
4275  for( size_t j=jj; j<jend; ++j )
4276  {
      // Clip the row range of column j to the current tile and, for a lower/upper B, to the
      // diagonal (the strict variants also exclude the diagonal element itself). The skipped
      // entries of B are presumably structurally zero.
4277  const size_t ibegin( ( IsLower<MT5>::value )
4278  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
4279  :( ii ) );
4280  const size_t ipos( ( IsUpper<MT5>::value )
4281  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
4282  :( iend ) );
4283 
      // The loop body is skipped entirely when the clipped range is empty (ibegin >= ipos).
4284  for( size_t i=ibegin; i<ipos; ++i ) {
4285  (~C)(i,j) -= A(i,i) * B(i,j);
4286  }
4287  }
4288  }
4289  }
4290  }
4292  //**********************************************************************************************
4293 
4294  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
4308  template< typename MT3 // Type of the left-hand side target matrix
4309  , typename MT4 // Type of the left-hand side matrix operand
4310  , typename MT5 > // Type of the right-hand side matrix operand
4311  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4312  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4313  {
      // Default (scalar) kernel for C -= A * B, enabled only when both A and B are diagonal.
      // Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) for every row i of A.
      // Unlike the sibling overloads, C is taken as MT3& and accessed directly, so this single
      // overload is not tied to a storage order of the target.
4314  for( size_t i=0UL; i<A.rows(); ++i ) {
4315  C(i,i) -= A(i,i) * B(i,i);
4316  }
4317  }
4319  //**********************************************************************************************
4320 
4321  //**Default subtraction assignment to dense matrices (small matrices)***************************
4335  template< typename MT3 // Type of the left-hand side target matrix
4336  , typename MT4 // Type of the left-hand side matrix operand
4337  , typename MT5 > // Type of the right-hand side matrix operand
4338  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
4339  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4340  {
      // Non-vectorized small-matrix kernel for C -= A * B: participates in overload resolution
      // only when UseVectorizedDefaultKernel<MT3,MT4,MT5> is false and simply forwards to the
      // default (scalar) subtraction assignment kernel.
4341  selectDefaultSubAssignKernel( C, A, B );
4342  }
4344  //**********************************************************************************************
4345 
4346  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
4361  template< typename MT3 // Type of the left-hand side target matrix
4362  , typename MT4 // Type of the left-hand side matrix operand
4363  , typename MT5 > // Type of the right-hand side matrix operand
4364  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
4365  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4366  {
      // Vectorized small-matrix kernel for C -= A * B with a row-major target C, enabled only when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
      //   C  target matrix, updated in place via SIMD load()/store() along its rows
      //   A  left-hand side operand, read element-wise and broadcast with set()
      //   B  right-hand side operand, read with SIMD load() along its rows
      // The columns of C are processed in panels of 8, 4, 2 and finally 1 SIMD vector(s); a scalar
      // loop handles columns that do not fill a whole SIMD vector. Within each panel the k-range
      // [kbegin,kend) is narrowed according to the IsUpper/IsLower/IsStrictly* traits of A and B
      // (presumably to skip structurally zero entries of triangular operands).
4367  const size_t M( A.rows() );
4368  const size_t N( B.columns() );
4369  const size_t K( A.columns() );
4370 
      // A scalar tail loop is required unless both C and B are padded (presumably padding makes
      // full-width loads/stores beyond column N safe -- TODO confirm).
4371  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
4372 
      // End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE when a
      // scalar tail is required, otherwise N itself. The bit mask relies on SIMDSIZE being a power
      // of two; the assertion cross-checks the result against the modulo formulation.
4373  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
4374  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4375 
4376  size_t j( 0UL );
4377 
      // Panel of 8 SIMD vectors, one row of C at a time.
4378  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
4379  for( size_t i=0UL; i<M; ++i )
4380  {
      // k-range for row i and columns [j,j+SIMDSIZE*8): bounded below by A's upper / B's lower
      // structure and above by A's lower / B's upper structure (and by K).
4381  const size_t kbegin( ( IsUpper<MT4>::value )
4382  ?( ( IsLower<MT5>::value )
4383  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4384  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4385  :( IsLower<MT5>::value ? j : 0UL ) );
4386  const size_t kend( ( IsLower<MT4>::value )
4387  ?( ( IsUpper<MT5>::value )
4388  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
4389  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4390  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
4391 
      // Accumulators start from the current values of C since this is a subtraction assignment.
4392  SIMDType xmm1( (~C).load(i,j ) );
4393  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4394  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4395  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4396  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
4397  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
4398  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
4399  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
4400 
4401  for( size_t k=kbegin; k<kend; ++k ) {
      // Broadcast A(i,k) to all SIMD lanes and subtract its product with row k of B.
4402  const SIMDType a1( set( A(i,k) ) );
4403  xmm1 = xmm1 - a1 * B.load(k,j );
4404  xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE );
4405  xmm3 = xmm3 - a1 * B.load(k,j+SIMDSIZE*2UL);
4406  xmm4 = xmm4 - a1 * B.load(k,j+SIMDSIZE*3UL);
4407  xmm5 = xmm5 - a1 * B.load(k,j+SIMDSIZE*4UL);
4408  xmm6 = xmm6 - a1 * B.load(k,j+SIMDSIZE*5UL);
4409  xmm7 = xmm7 - a1 * B.load(k,j+SIMDSIZE*6UL);
4410  xmm8 = xmm8 - a1 * B.load(k,j+SIMDSIZE*7UL);
4411  }
4412 
4413  (~C).store( i, j , xmm1 );
4414  (~C).store( i, j+SIMDSIZE , xmm2 );
4415  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4416  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
4417  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
4418  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
4419  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
4420  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
4421  }
4422  }
4423 
      // Panel of 4 SIMD vectors, two rows of C at a time (plus one leftover row).
4424  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4425  {
4426  size_t i( 0UL );
4427 
4428  for( ; (i+2UL) <= M; i+=2UL )
4429  {
      // The k-range covers both rows i and i+1: kbegin is taken from row i, kend from row i+1.
4430  const size_t kbegin( ( IsUpper<MT4>::value )
4431  ?( ( IsLower<MT5>::value )
4432  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4433  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4434  :( IsLower<MT5>::value ? j : 0UL ) );
4435  const size_t kend( ( IsLower<MT4>::value )
4436  ?( ( IsUpper<MT5>::value )
4437  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
4438  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4439  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
4440 
      // xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1.
4441  SIMDType xmm1( (~C).load(i ,j ) );
4442  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
4443  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
4444  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
4445  SIMDType xmm5( (~C).load(i+1UL,j ) );
4446  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
4447  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
4448  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
4449 
4450  for( size_t k=kbegin; k<kend; ++k ) {
      // Each loaded vector of B is reused for both rows.
4451  const SIMDType a1( set( A(i ,k) ) );
4452  const SIMDType a2( set( A(i+1UL,k) ) );
4453  const SIMDType b1( B.load(k,j ) );
4454  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4455  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4456  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4457  xmm1 = xmm1 - a1 * b1;
4458  xmm2 = xmm2 - a1 * b2;
4459  xmm3 = xmm3 - a1 * b3;
4460  xmm4 = xmm4 - a1 * b4;
4461  xmm5 = xmm5 - a2 * b1;
4462  xmm6 = xmm6 - a2 * b2;
4463  xmm7 = xmm7 - a2 * b3;
4464  xmm8 = xmm8 - a2 * b4;
4465  }
4466 
4467  (~C).store( i , j , xmm1 );
4468  (~C).store( i , j+SIMDSIZE , xmm2 );
4469  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
4470  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
4471  (~C).store( i+1UL, j , xmm5 );
4472  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
4473  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
4474  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
4475  }
4476 
      // Leftover row in case M is odd. Here kend only accounts for B's upper structure; A's
      // lower structure is not used to shorten the range.
4477  if( i < M )
4478  {
4479  const size_t kbegin( ( IsUpper<MT4>::value )
4480  ?( ( IsLower<MT5>::value )
4481  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4482  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4483  :( IsLower<MT5>::value ? j : 0UL ) );
4484  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
4485 
4486  SIMDType xmm1( (~C).load(i,j ) );
4487  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4488  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4489  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4490 
4491  for( size_t k=kbegin; k<kend; ++k ) {
4492  const SIMDType a1( set( A(i,k) ) );
4493  xmm1 = xmm1 - a1 * B.load(k,j );
4494  xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE );
4495  xmm3 = xmm3 - a1 * B.load(k,j+SIMDSIZE*2UL);
4496  xmm4 = xmm4 - a1 * B.load(k,j+SIMDSIZE*3UL);
4497  }
4498 
4499  (~C).store( i, j , xmm1 );
4500  (~C).store( i, j+SIMDSIZE , xmm2 );
4501  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4502  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
4503  }
4504  }
4505 
      // Panel of 2 SIMD vectors, two rows of C at a time (plus one leftover row).
4506  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4507  {
4508  size_t i( 0UL );
4509 
4510  for( ; (i+2UL) <= M; i+=2UL )
4511  {
4512  const size_t kbegin( ( IsUpper<MT4>::value )
4513  ?( ( IsLower<MT5>::value )
4514  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4515  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4516  :( IsLower<MT5>::value ? j : 0UL ) );
4517  const size_t kend( ( IsLower<MT4>::value )
4518  ?( ( IsUpper<MT5>::value )
4519  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
4520  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4521  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
4522 
      // xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1.
4523  SIMDType xmm1( (~C).load(i ,j ) );
4524  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
4525  SIMDType xmm3( (~C).load(i+1UL,j ) );
4526  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
4527 
4528  for( size_t k=kbegin; k<kend; ++k ) {
4529  const SIMDType a1( set( A(i ,k) ) );
4530  const SIMDType a2( set( A(i+1UL,k) ) );
4531  const SIMDType b1( B.load(k,j ) );
4532  const SIMDType b2( B.load(k,j+SIMDSIZE) );
4533  xmm1 = xmm1 - a1 * b1;
4534  xmm2 = xmm2 - a1 * b2;
4535  xmm3 = xmm3 - a2 * b1;
4536  xmm4 = xmm4 - a2 * b2;
4537  }
4538 
4539  (~C).store( i , j , xmm1 );
4540  (~C).store( i , j+SIMDSIZE, xmm2 );
4541  (~C).store( i+1UL, j , xmm3 );
4542  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
4543  }
4544 
      // Leftover row in case M is odd; kend again only accounts for B's upper structure.
4545  if( i < M )
4546  {
4547  const size_t kbegin( ( IsUpper<MT4>::value )
4548  ?( ( IsLower<MT5>::value )
4549  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4550  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4551  :( IsLower<MT5>::value ? j : 0UL ) );
4552  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
4553 
4554  SIMDType xmm1( (~C).load(i,j ) );
4555  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
4556 
4557  for( size_t k=kbegin; k<kend; ++k ) {
4558  const SIMDType a1( set( A(i,k) ) );
4559  xmm1 = xmm1 - a1 * B.load(k,j );
4560  xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE);
4561  }
4562 
4563  (~C).store( i, j , xmm1 );
4564  (~C).store( i, j+SIMDSIZE, xmm2 );
4565  }
4566  }
4567 
      // Single SIMD vector, two rows of C at a time (plus one leftover row).
4568  for( ; j<jpos; j+=SIMDSIZE )
4569  {
4570  size_t i( 0UL );
4571 
4572  for( ; (i+2UL) <= M; i+=2UL )
4573  {
      // In this panel kend depends on A's lower structure only; B's upper structure is not used.
4574  const size_t kbegin( ( IsUpper<MT4>::value )
4575  ?( ( IsLower<MT5>::value )
4576  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4577  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4578  :( IsLower<MT5>::value ? j : 0UL ) );
4579  const size_t kend( ( IsLower<MT4>::value )
4580  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4581  :( K ) );
4582 
4583  SIMDType xmm1( (~C).load(i ,j) );
4584  SIMDType xmm2( (~C).load(i+1UL,j) );
4585 
4586  for( size_t k=kbegin; k<kend; ++k ) {
4587  const SIMDType b1( B.load(k,j) );
4588  xmm1 = xmm1 - set( A(i ,k) ) * b1;
4589  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
4590  }
4591 
4592  (~C).store( i , j, xmm1 );
4593  (~C).store( i+1UL, j, xmm2 );
4594  }
4595 
      // Leftover row in case M is odd; the k-loop runs up to K without an upper restriction.
4596  if( i < M )
4597  {
4598  const size_t kbegin( ( IsUpper<MT4>::value )
4599  ?( ( IsLower<MT5>::value )
4600  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4601  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4602  :( IsLower<MT5>::value ? j : 0UL ) );
4603 
4604  SIMDType xmm1( (~C).load(i,j) );
4605 
4606  for( size_t k=kbegin; k<K; ++k ) {
4607  xmm1 = xmm1 - set( A(i,k) ) * B.load(k,j);
4608  }
4609 
4610  (~C).store( i, j, xmm1 );
4611  }
4612  }
4613 
      // Scalar tail: remaining columns [jpos,N) that do not fill a SIMD vector. Only executed
      // when 'remainder' is set; processes two rows at a time plus one leftover row.
4614  for( ; remainder && j<N; ++j )
4615  {
4616  size_t i( 0UL );
4617 
4618  for( ; (i+2UL) <= M; i+=2UL )
4619  {
4620  const size_t kbegin( ( IsUpper<MT4>::value )
4621  ?( ( IsLower<MT5>::value )
4622  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4623  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4624  :( IsLower<MT5>::value ? j : 0UL ) );
4625  const size_t kend( ( IsLower<MT4>::value )
4626  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4627  :( K ) );
4628 
      // Accumulate in local copies of the target elements and write them back once.
4629  ElementType value1( (~C)(i ,j) );
4630  ElementType value2( (~C)(i+1UL,j) );
4631 
4632  for( size_t k=kbegin; k<kend; ++k ) {
4633  value1 -= A(i ,k) * B(k,j);
4634  value2 -= A(i+1UL,k) * B(k,j);
4635  }
4636 
4637  (~C)(i ,j) = value1;
4638  (~C)(i+1UL,j) = value2;
4639  }
4640 
      // Leftover row in case M is odd.
4641  if( i < M )
4642  {
4643  const size_t kbegin( ( IsUpper<MT4>::value )
4644  ?( ( IsLower<MT5>::value )
4645  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4646  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4647  :( IsLower<MT5>::value ? j : 0UL ) );
4648 
4649  ElementType value( (~C)(i,j) );
4650 
4651  for( size_t k=kbegin; k<K; ++k ) {
4652  value -= A(i,k) * B(k,j);
4653  }
4654 
4655  (~C)(i,j) = value;
4656  }
4657  }
4658  }
4660  //**********************************************************************************************
4661 
4662  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
4677  template< typename MT3 // Type of the left-hand side target matrix
4678  , typename MT4 // Type of the left-hand side matrix operand
4679  , typename MT5 > // Type of the right-hand side matrix operand
4680  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
4681  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4682  {
      // Vectorized small-matrix kernel for C -= A * B with a column-major target C, enabled only
      // when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
      //   C  target matrix, updated in place via SIMD load()/store() along its columns
      //   A  left-hand side operand, read with SIMD load() along its columns
      //   B  right-hand side operand, read element-wise and broadcast with set()
      // The rows of C are processed in panels of 8, 4, 2 and finally 1 SIMD vector(s); a scalar
      // loop handles rows that do not fill a whole SIMD vector. Within each panel the k-range
      // [kbegin,kend) is narrowed according to the IsUpper/IsLower/IsStrictly* traits of A and B
      // (presumably to skip structurally zero entries of triangular operands).
4683  const size_t M( A.rows() );
4684  const size_t N( B.columns() );
4685  const size_t K( A.columns() );
4686 
      // A scalar tail loop is required unless both C and A are padded (presumably padding makes
      // full-width loads/stores beyond row M safe -- TODO confirm).
4687  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
4688 
      // End of the SIMD-processed row range: M rounded down to a multiple of SIMDSIZE when a
      // scalar tail is required, otherwise M itself. The bit mask relies on SIMDSIZE being a power
      // of two; the assertion cross-checks the result against the modulo formulation.
4689  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
4690  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
4691 
4692  size_t i( 0UL );
4693 
      // Panel of 8 SIMD vectors, one column of C at a time.
4694  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
4695  for( size_t j=0UL; j<N; ++j )
4696  {
      // k-range for column j and rows [i,i+SIMDSIZE*8): bounded below by B's lower / A's upper
      // structure and above by B's upper / A's lower structure (and by K).
4697  const size_t kbegin( ( IsLower<MT5>::value )
4698  ?( ( IsUpper<MT4>::value )
4699  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4700  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4701  :( IsUpper<MT4>::value ? i : 0UL ) );
4702  const size_t kend( ( IsUpper<MT5>::value )
4703  ?( ( IsLower<MT4>::value )
4704  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4705  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4706  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
4707 
      // Accumulators start from the current values of C since this is a subtraction assignment.
4708  SIMDType xmm1( (~C).load(i ,j) );
4709  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4710  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4711  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4712  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
4713  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
4714  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
4715  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
4716 
4717  for( size_t k=kbegin; k<kend; ++k ) {
      // Broadcast B(k,j) to all SIMD lanes and subtract its product with column k of A.
4718  const SIMDType b1( set( B(k,j) ) );
4719  xmm1 = xmm1 - A.load(i ,k) * b1;
4720  xmm2 = xmm2 - A.load(i+SIMDSIZE ,k) * b1;
4721  xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,k) * b1;
4722  xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,k) * b1;
4723  xmm5 = xmm5 - A.load(i+SIMDSIZE*4UL,k) * b1;
4724  xmm6 = xmm6 - A.load(i+SIMDSIZE*5UL,k) * b1;
4725  xmm7 = xmm7 - A.load(i+SIMDSIZE*6UL,k) * b1;
4726  xmm8 = xmm8 - A.load(i+SIMDSIZE*7UL,k) * b1;
4727  }
4728 
4729  (~C).store( i , j, xmm1 );
4730  (~C).store( i+SIMDSIZE , j, xmm2 );
4731  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4732  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
4733  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
4734  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
4735  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
4736  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
4737  }
4738  }
4739 
      // Panel of 4 SIMD vectors, two columns of C at a time (plus one leftover column).
4740  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4741  {
4742  size_t j( 0UL );
4743 
4744  for( ; (j+2UL) <= N; j+=2UL )
4745  {
      // The k-range covers both columns j and j+1: kbegin is taken from column j, kend from
      // column j+1.
4746  const size_t kbegin( ( IsLower<MT5>::value )
4747  ?( ( IsUpper<MT4>::value )
4748  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4749  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4750  :( IsUpper<MT4>::value ? i : 0UL ) );
4751  const size_t kend( ( IsUpper<MT5>::value )
4752  ?( ( IsLower<MT4>::value )
4753  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4754  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4755  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
4756 
      // xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1.
4757  SIMDType xmm1( (~C).load(i ,j ) );
4758  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4759  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4760  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
4761  SIMDType xmm5( (~C).load(i ,j+1UL) );
4762  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
4763  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4764  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
4765 
4766  for( size_t k=kbegin; k<kend; ++k ) {
      // Each loaded vector of A is reused for both columns.
4767  const SIMDType a1( A.load(i ,k) );
4768  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4769  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4770  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4771  const SIMDType b1( set( B(k,j ) ) );
4772  const SIMDType b2( set( B(k,j+1UL) ) );
4773  xmm1 = xmm1 - a1 * b1;
4774  xmm2 = xmm2 - a2 * b1;
4775  xmm3 = xmm3 - a3 * b1;
4776  xmm4 = xmm4 - a4 * b1;
4777  xmm5 = xmm5 - a1 * b2;
4778  xmm6 = xmm6 - a2 * b2;
4779  xmm7 = xmm7 - a3 * b2;
4780  xmm8 = xmm8 - a4 * b2;
4781  }
4782 
4783  (~C).store( i , j , xmm1 );
4784  (~C).store( i+SIMDSIZE , j , xmm2 );
4785  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4786  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
4787  (~C).store( i , j+1UL, xmm5 );
4788  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
4789  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
4790  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
4791  }
4792 
      // Leftover column in case N is odd. Here kend only accounts for A's lower structure; B's
      // upper structure is not used to shorten the range.
4793  if( j < N )
4794  {
4795  const size_t kbegin( ( IsLower<MT5>::value )
4796  ?( ( IsUpper<MT4>::value )
4797  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4798  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4799  :( IsUpper<MT4>::value ? i : 0UL ) );
4800  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
4801 
4802  SIMDType xmm1( (~C).load(i ,j) );
4803  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4804  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4805  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4806 
4807  for( size_t k=kbegin; k<kend; ++k ) {
4808  const SIMDType b1( set( B(k,j) ) );
4809  xmm1 = xmm1 - A.load(i ,k) * b1;
4810  xmm2 = xmm2 - A.load(i+SIMDSIZE ,k) * b1;
4811  xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,k) * b1;
4812  xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,k) * b1;
4813  }
4814 
4815  (~C).store( i , j, xmm1 );
4816  (~C).store( i+SIMDSIZE , j, xmm2 );
4817  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4818  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
4819  }
4820  }
4821 
      // Panel of 2 SIMD vectors, two columns of C at a time (plus one leftover column).
4822  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4823  {
4824  size_t j( 0UL );
4825 
4826  for( ; (j+2UL) <= N; j+=2UL )
4827  {
4828  const size_t kbegin( ( IsLower<MT5>::value )
4829  ?( ( IsUpper<MT4>::value )
4830  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4831  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4832  :( IsUpper<MT4>::value ? i : 0UL ) );
4833  const size_t kend( ( IsUpper<MT5>::value )
4834  ?( ( IsLower<MT4>::value )
4835  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4836  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4837  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
4838 
      // xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
4839  SIMDType xmm1( (~C).load(i ,j ) );
4840  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4841  SIMDType xmm3( (~C).load(i ,j+1UL) );
4842  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4843 
4844  for( size_t k=kbegin; k<kend; ++k ) {
4845  const SIMDType a1( A.load(i ,k) );
4846  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4847  const SIMDType b1( set( B(k,j ) ) );
4848  const SIMDType b2( set( B(k,j+1UL) ) );
4849  xmm1 = xmm1 - a1 * b1;
4850  xmm2 = xmm2 - a2 * b1;
4851  xmm3 = xmm3 - a1 * b2;
4852  xmm4 = xmm4 - a2 * b2;
4853  }
4854 
4855  (~C).store( i , j , xmm1 );
4856  (~C).store( i+SIMDSIZE, j , xmm2 );
4857  (~C).store( i , j+1UL, xmm3 );
4858  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
4859  }
4860 
      // Leftover column in case N is odd; kend again only accounts for A's lower structure.
4861  if( j < N )
4862  {
4863  const size_t kbegin( ( IsLower<MT5>::value )
4864  ?( ( IsUpper<MT4>::value )
4865  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4866  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4867  :( IsUpper<MT4>::value ? i : 0UL ) );
4868  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
4869 
4870  SIMDType xmm1( (~C).load(i ,j) );
4871  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
4872 
4873  for( size_t k=kbegin; k<kend; ++k ) {
4874  const SIMDType b1( set( B(k,j) ) );
4875  xmm1 = xmm1 - A.load(i ,k) * b1;
4876  xmm2 = xmm2 - A.load(i+SIMDSIZE,k) * b1;
4877  }
4878 
4879  (~C).store( i , j, xmm1 );
4880  (~C).store( i+SIMDSIZE, j, xmm2 );
4881  }
4882  }
4883 
      // Single SIMD vector, two columns of C at a time (plus one leftover column).
4884  for( ; i<ipos; i+=SIMDSIZE )
4885  {
4886  size_t j( 0UL );
4887 
4888  for( ; (j+2UL) <= N; j+=2UL )
4889  {
      // In this panel kend depends on B's upper structure only; A's lower structure is not used.
4890  const size_t kbegin( ( IsLower<MT5>::value )
4891  ?( ( IsUpper<MT4>::value )
4892  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4893  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4894  :( IsUpper<MT4>::value ? i : 0UL ) );
4895  const size_t kend( ( IsUpper<MT5>::value )
4896  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4897  :( K ) );
4898 
4899  SIMDType xmm1( (~C).load(i,j ) );
4900  SIMDType xmm2( (~C).load(i,j+1UL) );
4901 
4902  for( size_t k=kbegin; k<kend; ++k ) {
4903  const SIMDType a1( A.load(i,k) );
4904  xmm1 = xmm1 - a1 * set( B(k,j ) );
4905  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
4906  }
4907 
4908  (~C).store( i, j , xmm1 );
4909  (~C).store( i, j+1UL, xmm2 );
4910  }
4911 
      // Leftover column in case N is odd; the k-loop runs up to K without an upper restriction.
4912  if( j < N )
4913  {
4914  const size_t kbegin( ( IsLower<MT5>::value )
4915  ?( ( IsUpper<MT4>::value )
4916  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4917  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4918  :( IsUpper<MT4>::value ? i : 0UL ) );
4919 
4920  SIMDType xmm1( (~C).load(i,j) );
4921 
4922  for( size_t k=kbegin; k<K; ++k ) {
4923  xmm1 = xmm1 - A.load(i,k) * set( B(k,j) );
4924  }
4925 
4926  (~C).store( i, j, xmm1 );
4927  }
4928  }
4929 
      // Scalar tail: remaining rows [ipos,M) that do not fill a SIMD vector. Only executed when
      // 'remainder' is set; processes two columns at a time plus one leftover column.
4930  for( ; remainder && i<M; ++i )
4931  {
4932  size_t j( 0UL );
4933 
4934  for( ; (j+2UL) <= N; j+=2UL )
4935  {
4936  const size_t kbegin( ( IsLower<MT5>::value )
4937  ?( ( IsUpper<MT4>::value )
4938  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4939  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4940  :( IsUpper<MT4>::value ? i : 0UL ) );
4941  const size_t kend( ( IsUpper<MT5>::value )
4942  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4943  :( K ) );
4944 
      // Accumulate in local copies of the target elements and write them back once.
4945  ElementType value1( (~C)(i,j ) );
4946  ElementType value2( (~C)(i,j+1UL) );
4947 
4948  for( size_t k=kbegin; k<kend; ++k ) {
4949  value1 -= A(i,k) * B(k,j );
4950  value2 -= A(i,k) * B(k,j+1UL);
4951  }
4952 
4953  (~C)(i,j ) = value1;
4954  (~C)(i,j+1UL) = value2;
4955  }
4956 
      // Leftover column in case N is odd.
4957  if( j < N )
4958  {
4959  const size_t kbegin( ( IsLower<MT5>::value )
4960  ?( ( IsUpper<MT4>::value )
4961  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4962  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4963  :( IsUpper<MT4>::value ? i : 0UL ) );
4964 
4965  ElementType value( (~C)(i,j) );
4966 
4967  for( size_t k=kbegin; k<K; ++k ) {
4968  value -= A(i,k) * B(k,j);
4969  }
4970 
4971  (~C)(i,j) = value;
4972  }
4973  }
4974  }
4976  //**********************************************************************************************
4977 
4978  //**Default subtraction assignment to dense matrices (large matrices)***************************
4992  template< typename MT3 // Type of the left-hand side target matrix
4993  , typename MT4 // Type of the left-hand side matrix operand
4994  , typename MT5 > // Type of the right-hand side matrix operand
4995  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
4996  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4997  {
      // Non-vectorized large-matrix kernel for C -= A * B: participates in overload resolution
      // only when UseVectorizedDefaultKernel<MT3,MT4,MT5> is false and simply forwards to the
      // default (scalar) subtraction assignment kernel.
4998  selectDefaultSubAssignKernel( C, A, B );
4999  }
5001  //**********************************************************************************************
5002 
5003  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
5018  template< typename MT3 // Type of the left-hand side target matrix
5019  , typename MT4 // Type of the left-hand side matrix operand
5020  , typename MT5 > // Type of the right-hand side matrix operand
5021  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
5022  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
5023  {
5024  const size_t M( A.rows() );
5025  const size_t N( B.columns() );
5026  const size_t K( A.columns() );
5027 
5028  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5029 
5030  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
5031  {
5032  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
5033 
5034  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
5035  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5036 
5037  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
5038  {
5039  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
5040 
5041  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
5042  {
5043  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
5044 
5045  size_t j( jj );
5046 
5047  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5048  {
5049  const size_t j1( j+SIMDSIZE );
5050  const size_t j2( j+SIMDSIZE*2UL );
5051  const size_t j3( j+SIMDSIZE*3UL );
5052 
5053  size_t i( ii );
5054 
5055  for( ; (i+2UL) <= iend; i+=2UL )
5056  {
5057  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5058  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5059  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5060  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
5061 
5062  SIMDType xmm1( (~C).load(i ,j ) );
5063  SIMDType xmm2( (~C).load(i ,j1) );
5064  SIMDType xmm3( (~C).load(i ,j2) );
5065  SIMDType xmm4( (~C).load(i ,j3) );
5066  SIMDType xmm5( (~C).load(i+1UL,j ) );
5067  SIMDType xmm6( (~C).load(i+1UL,j1) );
5068  SIMDType xmm7( (~C).load(i+1UL,j2) );
5069  SIMDType xmm8( (~C).load(i+1UL,j3) );
5070 
5071  for( size_t k=kbegin; k<kend; ++k ) {
5072  const SIMDType a1( set( A(i ,k) ) );
5073  const SIMDType a2( set( A(i+1UL,k) ) );
5074  const SIMDType b1( B.load(k,j ) );
5075  const SIMDType b2( B.load(k,j1) );
5076  const SIMDType b3( B.load(k,j2) );
5077  const SIMDType b4( B.load(k,j3) );
5078  xmm1 = xmm1 - a1 * b1;
5079  xmm2 = xmm2 - a1 * b2;
5080  xmm3 = xmm3 - a1 * b3;
5081  xmm4 = xmm4 - a1 * b4;
5082  xmm5 = xmm5 - a2 * b1;
5083  xmm6 = xmm6 - a2 * b2;
5084  xmm7 = xmm7 - a2 * b3;
5085  xmm8 = xmm8 - a2 * b4;
5086  }
5087 
5088  (~C).store( i , j , xmm1 );
5089  (~C).store( i , j1, xmm2 );
5090  (~C).store( i , j2, xmm3 );
5091  (~C).store( i , j3, xmm4 );
5092  (~C).store( i+1UL, j , xmm5 );
5093  (~C).store( i+1UL, j1, xmm6 );
5094  (~C).store( i+1UL, j2, xmm7 );
5095  (~C).store( i+1UL, j3, xmm8 );
5096  }
5097 
5098  if( i < iend )
5099  {
5100  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5101  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5102  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5103  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
5104 
5105  SIMDType xmm1( (~C).load(i,j ) );
5106  SIMDType xmm2( (~C).load(i,j1) );
5107  SIMDType xmm3( (~C).load(i,j2) );
5108  SIMDType xmm4( (~C).load(i,j3) );
5109 
5110  for( size_t k=kbegin; k<kend; ++k ) {
5111  const SIMDType a1( set( A(i,k) ) );
5112  xmm1 = xmm1 - a1 * B.load(k,j );
5113  xmm2 = xmm2 - a1 * B.load(k,j1);
5114  xmm3 = xmm3 - a1 * B.load(k,j2);
5115  xmm4 = xmm4 - a1 * B.load(k,j3);
5116  }
5117 
5118  (~C).store( i, j , xmm1 );
5119  (~C).store( i, j1, xmm2 );
5120  (~C).store( i, j2, xmm3 );
5121  (~C).store( i, j3, xmm4 );
5122  }
5123  }
5124 
5125  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5126  {
5127  const size_t j1( j+SIMDSIZE );
5128 
5129  size_t i( ii );
5130 
5131  for( ; (i+4UL) <= iend; i+=4UL )
5132  {
5133  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5134  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5135  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5136  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5137 
5138  SIMDType xmm1( (~C).load(i ,j ) );
5139  SIMDType xmm2( (~C).load(i ,j1) );
5140  SIMDType xmm3( (~C).load(i+1UL,j ) );
5141  SIMDType xmm4( (~C).load(i+1UL,j1) );
5142  SIMDType xmm5( (~C).load(i+2UL,j ) );
5143  SIMDType xmm6( (~C).load(i+2UL,j1) );
5144  SIMDType xmm7( (~C).load(i+3UL,j ) );
5145  SIMDType xmm8( (~C).load(i+3UL,j1) );
5146 
5147  for( size_t k=kbegin; k<kend; ++k ) {
5148  const SIMDType a1( set( A(i ,k) ) );
5149  const SIMDType a2( set( A(i+1UL,k) ) );
5150  const SIMDType a3( set( A(i+2UL,k) ) );
5151  const SIMDType a4( set( A(i+3UL,k) ) );
5152  const SIMDType b1( B.load(k,j ) );
5153  const SIMDType b2( B.load(k,j1) );
5154  xmm1 = xmm1 - a1 * b1;
5155  xmm2 = xmm2 - a1 * b2;
5156  xmm3 = xmm3 - a2 * b1;
5157  xmm4 = xmm4 - a2 * b2;
5158  xmm5 = xmm5 - a3 * b1;
5159  xmm6 = xmm6 - a3 * b2;
5160  xmm7 = xmm7 - a4 * b1;
5161  xmm8 = xmm8 - a4 * b2;
5162  }
5163 
5164  (~C).store( i , j , xmm1 );
5165  (~C).store( i , j1, xmm2 );
5166  (~C).store( i+1UL, j , xmm3 );
5167  (~C).store( i+1UL, j1, xmm4 );
5168  (~C).store( i+2UL, j , xmm5 );
5169  (~C).store( i+2UL, j1, xmm6 );
5170  (~C).store( i+3UL, j , xmm7 );
5171  (~C).store( i+3UL, j1, xmm8 );
5172  }
5173 
5174  for( ; (i+2UL) <= iend; i+=2UL )
5175  {
5176  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5177  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5178  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5179  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5180 
5181  SIMDType xmm1( (~C).load(i ,j ) );
5182  SIMDType xmm2( (~C).load(i ,j1) );
5183  SIMDType xmm3( (~C).load(i+1UL,j ) );
5184  SIMDType xmm4( (~C).load(i+1UL,j1) );
5185 
5186  for( size_t k=kbegin; k<kend; ++k ) {
5187  const SIMDType a1( set( A(i ,k) ) );
5188  const SIMDType a2( set( A(i+1UL,k) ) );
5189  const SIMDType b1( B.load(k,j ) );
5190  const SIMDType b2( B.load(k,j1) );
5191  xmm1 = xmm1 - a1 * b1;
5192  xmm2 = xmm2 - a1 * b2;
5193  xmm3 = xmm3 - a2 * b1;
5194  xmm4 = xmm4 - a2 * b2;
5195  }
5196 
5197  (~C).store( i , j , xmm1 );
5198  (~C).store( i , j1, xmm2 );
5199  (~C).store( i+1UL, j , xmm3 );
5200  (~C).store( i+1UL, j1, xmm4 );
5201  }
5202 
5203  if( i < iend )
5204  {
5205  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5206  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5207  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5208  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5209 
5210  SIMDType xmm1( (~C).load(i,j ) );
5211  SIMDType xmm2( (~C).load(i,j1) );
5212 
5213  for( size_t k=kbegin; k<kend; ++k ) {
5214  const SIMDType a1( set( A(i,k) ) );
5215  xmm1 = xmm1 - a1 * B.load(k,j );
5216  xmm2 = xmm2 - a1 * B.load(k,j1);
5217  }
5218 
5219  (~C).store( i, j , xmm1 );
5220  (~C).store( i, j1, xmm2 );
5221  }
5222  }
5223 
5224  for( ; j<jpos; j+=SIMDSIZE )
5225  {
5226  for( size_t i=ii; i<iend; ++i )
5227  {
5228  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5229  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5230  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5231  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
5232 
5233  SIMDType xmm1( (~C).load(i,j) );
5234 
5235  for( size_t k=kbegin; k<kend; ++k ) {
5236  const SIMDType a1( set( A(i,k) ) );
5237  xmm1 = xmm1 - a1 * B.load(k,j);
5238  }
5239 
5240  (~C).store( i, j, xmm1 );
5241  }
5242  }
5243 
5244  for( ; remainder && j<jend; ++j )
5245  {
5246  for( size_t i=ii; i<iend; ++i )
5247  {
5248  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5249  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5250  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5251  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
5252 
5253  ElementType value( (~C)(i,j) );
5254 
5255  for( size_t k=kbegin; k<kend; ++k ) {
5256  value -= A(i,k) * B(k,j);
5257  }
5258 
5259  (~C)(i,j) = value;
5260  }
5261  }
5262  }
5263  }
5264  }
5265  }
5267  //**********************************************************************************************
5268 
5269  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
5284  template< typename MT3 // Type of the left-hand side target matrix
5285  , typename MT4 // Type of the left-hand side matrix operand
5286  , typename MT5 > // Type of the right-hand side matrix operand
5287  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
5288  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
5289  {
5290  const size_t M( A.rows() );
5291  const size_t N( B.columns() );
5292  const size_t K( A.columns() );
5293 
5294  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5295 
5296  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
5297  {
5298  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
5299 
5300  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
5301  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5302 
5303  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
5304  {
5305  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
5306 
5307  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
5308  {
5309  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
5310 
5311  size_t i( ii );
5312 
5313  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5314  {
5315  const size_t i1( i+SIMDSIZE );
5316  const size_t i2( i+SIMDSIZE*2UL );
5317  const size_t i3( i+SIMDSIZE*3UL );
5318 
5319  size_t j( jj );
5320 
5321  for( ; (j+2UL) <= jend; j+=2UL )
5322  {
5323  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5324  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5325  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
5326  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5327 
5328  SIMDType xmm1( (~C).load(i ,j ) );
5329  SIMDType xmm2( (~C).load(i1,j ) );
5330  SIMDType xmm3( (~C).load(i2,j ) );
5331  SIMDType xmm4( (~C).load(i3,j ) );
5332  SIMDType xmm5( (~C).load(i ,j+1UL) );
5333  SIMDType xmm6( (~C).load(i1,j+1UL) );
5334  SIMDType xmm7( (~C).load(i2,j+1UL) );
5335  SIMDType xmm8( (~C).load(i3,j+1UL) );
5336 
5337  for( size_t k=kbegin; k<kend; ++k ) {
5338  const SIMDType a1( A.load(i ,k) );
5339  const SIMDType a2( A.load(i1,k) );
5340  const SIMDType a3( A.load(i2,k) );
5341  const SIMDType a4( A.load(i3,k) );
5342  const SIMDType b1( set( B(k,j ) ) );
5343  const SIMDType b2( set( B(k,j+1UL) ) );
5344  xmm1 = xmm1 - a1 * b1;
5345  xmm2 = xmm2 - a2 * b1;
5346  xmm3 = xmm3 - a3 * b1;
5347  xmm4 = xmm4 - a4 * b1;
5348  xmm5 = xmm5 - a1 * b2;
5349  xmm6 = xmm6 - a2 * b2;
5350  xmm7 = xmm7 - a3 * b2;
5351  xmm8 = xmm8 - a4 * b2;
5352  }
5353 
5354  (~C).store( i , j , xmm1 );
5355  (~C).store( i1, j , xmm2 );
5356  (~C).store( i2, j , xmm3 );
5357  (~C).store( i3, j , xmm4 );
5358  (~C).store( i , j+1UL, xmm5 );
5359  (~C).store( i1, j+1UL, xmm6 );
5360  (~C).store( i2, j+1UL, xmm7 );
5361  (~C).store( i3, j+1UL, xmm8 );
5362  }
5363 
5364  if( j < jend )
5365  {
5366  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5367  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5368  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
5369  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5370 
5371  SIMDType xmm1( (~C).load(i ,j) );
5372  SIMDType xmm2( (~C).load(i1,j) );
5373  SIMDType xmm3( (~C).load(i2,j) );
5374  SIMDType xmm4( (~C).load(i3,j) );
5375 
5376  for( size_t k=kbegin; k<kend; ++k ) {
5377  const SIMDType b1( set( B(k,j) ) );
5378  xmm1 = xmm1 - A.load(i ,k) * b1;
5379  xmm2 = xmm2 - A.load(i1,k) * b1;
5380  xmm3 = xmm3 - A.load(i2,k) * b1;
5381  xmm4 = xmm4 - A.load(i3,k) * b1;
5382  }
5383 
5384  (~C).store( i , j, xmm1 );
5385  (~C).store( i1, j, xmm2 );
5386  (~C).store( i2, j, xmm3 );
5387  (~C).store( i3, j, xmm4 );
5388  }
5389  }
5390 
5391  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5392  {
5393  const size_t i1( i+SIMDSIZE );
5394 
5395  size_t j( jj );
5396 
5397  for( ; (j+4UL) <= jend; j+=4UL )
5398  {
5399  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5400  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5401  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5402  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5403 
5404  SIMDType xmm1( (~C).load(i ,j ) );
5405  SIMDType xmm2( (~C).load(i1,j ) );
5406  SIMDType xmm3( (~C).load(i ,j+1UL) );
5407  SIMDType xmm4( (~C).load(i1,j+1UL) );
5408  SIMDType xmm5( (~C).load(i ,j+2UL) );
5409  SIMDType xmm6( (~C).load(i1,j+2UL) );
5410  SIMDType xmm7( (~C).load(i ,j+3UL) );
5411  SIMDType xmm8( (~C).load(i1,j+3UL) );
5412 
5413  for( size_t k=kbegin; k<kend; ++k ) {
5414  const SIMDType a1( A.load(i ,k) );
5415  const SIMDType a2( A.load(i1,k) );
5416  const SIMDType b1( set( B(k,j ) ) );
5417  const SIMDType b2( set( B(k,j+1UL) ) );
5418  const SIMDType b3( set( B(k,j+2UL) ) );
5419  const SIMDType b4( set( B(k,j+3UL) ) );
5420  xmm1 = xmm1 - a1 * b1;
5421  xmm2 = xmm2 - a2 * b1;
5422  xmm3 = xmm3 - a1 * b2;
5423  xmm4 = xmm4 - a2 * b2;
5424  xmm5 = xmm5 - a1 * b3;
5425  xmm6 = xmm6 - a2 * b3;
5426  xmm7 = xmm7 - a1 * b4;
5427  xmm8 = xmm8 - a2 * b4;
5428  }
5429 
5430  (~C).store( i , j , xmm1 );
5431  (~C).store( i1, j , xmm2 );
5432  (~C).store( i , j+1UL, xmm3 );
5433  (~C).store( i1, j+1UL, xmm4 );
5434  (~C).store( i , j+2UL, xmm5 );
5435  (~C).store( i1, j+2UL, xmm6 );
5436  (~C).store( i , j+3UL, xmm7 );
5437  (~C).store( i1, j+3UL, xmm8 );
5438  }
5439 
5440  for( ; (j+2UL) <= jend; j+=2UL )
5441  {
5442  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5443  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5444  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5445  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5446 
5447  SIMDType xmm1( (~C).load(i ,j ) );
5448  SIMDType xmm2( (~C).load(i1,j ) );
5449  SIMDType xmm3( (~C).load(i ,j+1UL) );
5450  SIMDType xmm4( (~C).load(i1,j+1UL) );
5451 
5452  for( size_t k=kbegin; k<kend; ++k ) {
5453  const SIMDType a1( A.load(i ,k) );
5454  const SIMDType a2( A.load(i1,k) );
5455  const SIMDType b1( set( B(k,j ) ) );
5456  const SIMDType b2( set( B(k,j+1UL) ) );
5457  xmm1 = xmm1 - a1 * b1;
5458  xmm2 = xmm2 - a2 * b1;
5459  xmm3 = xmm3 - a1 * b2;
5460  xmm4 = xmm4 - a2 * b2;
5461  }
5462 
5463  (~C).store( i , j , xmm1 );
5464  (~C).store( i1, j , xmm2 );
5465  (~C).store( i , j+1UL, xmm3 );
5466  (~C).store( i1, j+1UL, xmm4 );
5467  }
5468 
5469  if( j < jend )
5470  {
5471  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5472  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5473  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5474  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5475 
5476  SIMDType xmm1( (~C).load(i ,j) );
5477  SIMDType xmm2( (~C).load(i1,j) );
5478 
5479  for( size_t k=kbegin; k<kend; ++k ) {
5480  const SIMDType b1( set( B(k,j) ) );
5481  xmm1 = xmm1 - A.load(i ,k) * b1;
5482  xmm2 = xmm2 - A.load(i1,k) * b1;
5483  }
5484 
5485  (~C).store( i , j, xmm1 );
5486  (~C).store( i1, j, xmm2 );
5487  }
5488  }
5489 
5490  for( ; i<ipos; i+=SIMDSIZE )
5491  {
5492  for( size_t j=jj; j<jend; ++j )
5493  {
5494  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5495  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5496  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
5497  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5498 
5499  SIMDType xmm1( (~C).load(i,j) );
5500 
5501  for( size_t k=kbegin; k<kend; ++k ) {
5502  const SIMDType b1( set( B(k,j) ) );
5503  xmm1 = xmm1 - A.load(i,k) * b1;
5504  }
5505 
5506  (~C).store( i, j, xmm1 );
5507  }
5508  }
5509 
5510  for( ; remainder && i<iend; ++i )
5511  {
5512  for( size_t j=jj; j<jend; ++j )
5513  {
5514  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5515  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5516  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
5517  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5518 
5519  ElementType value( (~C)(i,j) );
5520 
5521  for( size_t k=kbegin; k<kend; ++k ) {
5522  value -= A(i,k) * B(k,j);
5523  }
5524 
5525  (~C)(i,j) = value;
5526  }
5527  }
5528  }
5529  }
5530  }
5531  }
5533  //**********************************************************************************************
5534 
5535  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Default overload of the BLAS-based subtraction assignment: selected when
// UseBlasKernel<MT3,MT4,MT5> does not apply (DisableIf_). It simply forwards the
// operation C -= A * B to selectLargeSubAssignKernel().
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
5549  template< typename MT3 // Type of the left-hand side target matrix
5550  , typename MT4 // Type of the left-hand side matrix operand
5551  , typename MT5 > // Type of the right-hand side matrix operand
5552  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
5553  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5554  {
5555  selectLargeSubAssignKernel( C, A, B );
5556  }
5558  //**********************************************************************************************
5559 
5560  //**BLAS-based subtraction assignment to dense matrices*****************************************
5561 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
5562 
// BLAS-based overload of the subtraction assignment C -= A * B, selected when
// UseBlasKernel<MT3,MT4,MT5> applies (EnableIf_).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Three cases are distinguished at compile time:
//  - A is triangular: B is copied into a temporary of type ResultType_<MT3>, the temporary is
//    passed to trmm() together with A (CblasLeft), and the temporary is subtracted from C.
//  - otherwise B is triangular: same scheme with A copied and B passed with CblasRight.
//  - otherwise: a single gemm() call with the scalars ET(-1) and ET(1).
// NOTE(review): presumably trmm() multiplies the temporary in place by the triangular operand
// and gemm() computes C = -1*A*B + 1*C; confirm against the blas/trmm.h and blas/gemm.h wrappers.
5575  template< typename MT3 // Type of the left-hand side target matrix
5576  , typename MT4 // Type of the left-hand side matrix operand
5577  , typename MT5 > // Type of the right-hand side matrix operand
5578  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
5579  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5580  {
5581  typedef ElementType_<MT3> ET;
5582 
5583  if( IsTriangular<MT4>::value ) {
5584  ResultType_<MT3> tmp( serial( B ) );
5585  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5586  subAssign( C, tmp );
5587  }
5588  else if( IsTriangular<MT5>::value ) {
5589  ResultType_<MT3> tmp( serial( A ) );
5590  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5591  subAssign( C, tmp );
5592  }
5593  else {
5594  gemm( C, A, B, ET(-1), ET(1) );
5595  }
5596  }
5598 #endif
5599  //**********************************************************************************************
5600 
5601  //**Subtraction assignment to sparse matrices***************************************************
5602  // No special implementation for the subtraction assignment to sparse matrices.
5603  //**********************************************************************************************
5604 
5605  //**Multiplication assignment to dense matrices*************************************************
5606  // No special implementation for the multiplication assignment to dense matrices.
5607  //**********************************************************************************************
5608 
5609  //**Multiplication assignment to sparse matrices************************************************
5610  // No special implementation for the multiplication assignment to sparse matrices.
5611  //**********************************************************************************************
5612 
5613  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the multiplication expression to a dense matrix. The overload is only
// enabled if IsEvaluationRequired<MT,MT1,MT2> applies.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// An empty target returns immediately; if the inner dimension (rhs.lhs_.columns()) is zero the
// target is reset. Otherwise both operands are evaluated into A and B and the assignment is
// forwarded to smpAssign( ~lhs, A * B ).
5629  template< typename MT // Type of the target dense matrix
5630  , bool SO > // Storage order of the target dense matrix
5631  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
5632  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5633  {
5635 
5636  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5637  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5638 
// Nothing to compute for an empty target; a zero inner dimension yields a reset target
5639  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
5640  return;
5641  }
5642  else if( rhs.lhs_.columns() == 0UL ) {
5643  reset( ~lhs );
5644  return;
5645  }
5646 
5647  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5648  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5649 
5650  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5651  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5652  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5653  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5654  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5655  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5656 
// Re-dispatch on the product of the evaluated operands
5657  smpAssign( ~lhs, A * B );
5658  }
5660  //**********************************************************************************************
5661 
5662  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the multiplication expression to a sparse matrix. The overload is only
// enabled if IsEvaluationRequired<MT,MT1,MT2> applies.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// The expression is first evaluated into a temporary of type TmpType (ResultType or
// OppositeType, selected via the storage order flag SO), which is then assigned via smpAssign().
5678  template< typename MT // Type of the target sparse matrix
5679  , bool SO > // Storage order of the target sparse matrix
5680  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
5681  smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5682  {
5684 
5685  typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
5686 
5692  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
5693 
5694  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5695  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5696 
// Evaluate the complete expression into the temporary, then assign the temporary
5697  const TmpType tmp( rhs );
5698  smpAssign( ~lhs, tmp );
5699  }
5701  //**********************************************************************************************
5702 
5703  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the multiplication expression to a dense matrix. The overload is
// only enabled if IsEvaluationRequired<MT,MT1,MT2> applies.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be added.
//
// Returns without modifying the target if the target is empty or the inner dimension
// (rhs.lhs_.columns()) is zero. Otherwise both operands are evaluated into A and B and the
// operation is forwarded to smpAddAssign( ~lhs, A * B ).
5719  template< typename MT // Type of the target dense matrix
5720  , bool SO > // Storage order of the target dense matrix
5721  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
5722  smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5723  {
5725 
5726  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5727  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5728 
5729  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5730  return;
5731  }
5732 
5733  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5734  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5735 
5736  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5737  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5738  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5739  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5740  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5741  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5742 
// Re-dispatch on the product of the evaluated operands
5743  smpAddAssign( ~lhs, A * B );
5744  }
5746  //**********************************************************************************************
5747 
5748  //**SMP addition assignment to sparse matrices**************************************************
5749  // No special implementation for the SMP addition assignment to sparse matrices.
5750  //**********************************************************************************************
5751 
5752  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the multiplication expression to a dense matrix. The overload
// is only enabled if IsEvaluationRequired<MT,MT1,MT2> applies.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
//
// Returns without modifying the target if the target is empty or the inner dimension
// (rhs.lhs_.columns()) is zero. Otherwise both operands are evaluated into A and B and the
// operation is forwarded to smpSubAssign( ~lhs, A * B ).
5768  template< typename MT // Type of the target dense matrix
5769  , bool SO > // Storage order of the target dense matrix
5770  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
5771  smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5772  {
5774 
5775  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5776  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5777 
5778  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5779  return;
5780  }
5781 
5782  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5783  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5784 
5785  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5786  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5787  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5788  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5789  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5790  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5791 
// Re-dispatch on the product of the evaluated operands
5792  smpSubAssign( ~lhs, A * B );
5793  }
5795  //**********************************************************************************************
5796 
5797  //**SMP subtraction assignment to sparse matrices***********************************************
5798  // No special implementation for the SMP subtraction assignment to sparse matrices.
5799  //**********************************************************************************************
5800 
5801  //**SMP multiplication assignment to dense matrices*********************************************
5802  // No special implementation for the SMP multiplication assignment to dense matrices.
5803  //**********************************************************************************************
5804 
5805  //**SMP multiplication assignment to sparse matrices********************************************
5806  // No special implementation for the SMP multiplication assignment to sparse matrices.
5807  //**********************************************************************************************
5808 
5809  //**Compile time checks*************************************************************************
5817  //**********************************************************************************************
5818 };
5819 //*************************************************************************************************
5820 
5821 
5822 
5823 
5824 //=================================================================================================
5825 //
5826 // DMATSCALARMULTEXPR SPECIALIZATION
5827 //
5828 //=================================================================================================
5829 
5830 //*************************************************************************************************
5838 template< typename MT1 // Type of the left-hand side dense matrix
5839  , typename MT2 // Type of the right-hand side dense matrix
5840  , typename ST > // Type of the right-hand side scalar value
5841 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >
5842  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
5843  , private MatScalarMultExpr
5844  , private Computation
5845 {
5846  private:
5847  //**Type definitions****************************************************************************
// Private shorthand types for the wrapped multiplication expression and its two operands
5848  typedef TDMatDMatMultExpr<MT1,MT2> MMM; // Type of the wrapped matrix/matrix multiplication expression
5849  typedef ResultType_<MMM> RES; // Result type of the multiplication expression
5850  typedef ResultType_<MT1> RT1; // Result type of the left-hand side matrix operand
5851  typedef ResultType_<MT2> RT2; // Result type of the right-hand side matrix operand
5852  typedef ElementType_<RT1> ET1; // Element type of RT1
5853  typedef ElementType_<RT2> ET2; // Element type of RT2
5854  typedef CompositeType_<MT1> CT1; // Composite type of the left-hand side matrix operand
5855  typedef CompositeType_<MT2> CT2; // Composite type of the right-hand side matrix operand
5856  //**********************************************************************************************
5857 
5858  //**********************************************************************************************
// Compile-time flag: set if MT1 is a computation (IsComputation) or requires an evaluation
// (RequiresEvaluation). It selects the LT typedef of this class.
5860  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
5861  //**********************************************************************************************
5862 
5863  //**********************************************************************************************
// Compile-time flag: set if MT2 is a computation (IsComputation) or requires an evaluation
// (RequiresEvaluation). It selects the RT typedef of this class.
5865  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
5866  //**********************************************************************************************
5867 
5868  //**********************************************************************************************
5870 
// Helper trait whose 'value' is set if either operand has to be evaluated (evaluateLeft or
// evaluateRight). The template parameters T1, T2 and T3 are not used by the visible definition.
5873  template< typename T1, typename T2, typename T3 >
5874  struct IsEvaluationRequired {
5875  enum : bool { value = ( evaluateLeft || evaluateRight ) };
5876  };
5877  //**********************************************************************************************
5878 
5879  //**********************************************************************************************
5881 
// Helper trait for the selection of the BLAS kernel. The visible conditions require mutable
// data access for T1, constant data access for T2 and T3, non-diagonal operands, enabled SIMD
// for all three matrix types, BLAS-compatible and identical element types, and exclude the
// combination of a built-in element type of T1 with a complex scalar type T4.
// NOTE(review): this listing skips line 5885, which must hold the opening of the enum
// (presumably 'enum : bool { value = ... &&') closed by '};' on line 5896; restore it from
// the original header before compiling.
5883  template< typename T1, typename T2, typename T3, typename T4 >
5884  struct UseBlasKernel {
5886  HasMutableDataAccess<T1>::value &&
5887  HasConstDataAccess<T2>::value &&
5888  HasConstDataAccess<T3>::value &&
5889  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5890  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
5891  IsBLASCompatible< ElementType_<T1> >::value &&
5892  IsBLASCompatible< ElementType_<T2> >::value &&
5893  IsBLASCompatible< ElementType_<T3> >::value &&
5894  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
5895  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
5896  !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
5897  };
5898  //**********************************************************************************************
5899 
5900  //**********************************************************************************************
5902 
// Helper trait for the selection of the vectorized default kernel. 'value' requires
// useOptimizedKernels, excludes the listed diagonal-operand combinations (both operands
// diagonal; diagonal T2 with a column-major T1; diagonal T3 with a row-major T1), and requires
// enabled SIMD for all three matrix types, SIMD-combinable element types (including the scalar
// type T4) and available SIMD addition and multiplication for the operand element types.
5904  template< typename T1, typename T2, typename T3, typename T4 >
5905  struct UseVectorizedDefaultKernel {
5906  enum : bool { value = useOptimizedKernels &&
5907  !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
5908  !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
5909  !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
5910  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
5911  AreSIMDCombinable< ElementType_<T1>
5912  , ElementType_<T2>
5913  , ElementType_<T3>
5914  , T4 >::value &&
5915  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
5916  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
5917  };
5918  //**********************************************************************************************
5919 
5920  public:
5921  //**Type definitions****************************************************************************
// Public type definitions of the scaled matrix/matrix multiplication expression
5922  typedef DMatScalarMultExpr<MMM,ST,true> This; // Type of this DMatScalarMultExpr specialization
5923  typedef MultTrait_<RES,ST> ResultType; // Result type: MultTrait_ of RES and the scalar type ST
5924  typedef OppositeType_<ResultType> OppositeType; // Opposite type of ResultType
5925  typedef TransposeType_<ResultType> TransposeType; // Transpose type of ResultType
5926  typedef ElementType_<ResultType> ElementType; // Element type of ResultType
5927  typedef SIMDTrait_<ElementType> SIMDType; // SIMD type associated with ElementType
5928  typedef const ElementType ReturnType; // Return type of the element access functions
5929  typedef const ResultType CompositeType; // Composite type of this expression
5930 
// Type of the left-hand side operand: the wrapped multiplication expression
5932  typedef const TDMatDMatMultExpr<MT1,MT2> LeftOperand;
5933 
// Type of the right-hand side operand: the scalar
5935  typedef ST RightOperand;
5936 
// Type used for the left matrix operand: const RT1 if evaluateLeft is set, CT1 otherwise
5938  typedef IfTrue_< evaluateLeft, const RT1, CT1 > LT;
5939 
// Type used for the right matrix operand: const RT2 if evaluateRight is set, CT2 otherwise
5941  typedef IfTrue_< evaluateRight, const RT2, CT2 > RT;
5942  //**********************************************************************************************
5943 
5944  //**Compilation flags***************************************************************************
// SIMD flag: set unless both operands are diagonal, provided both operand types are SIMD
// enabled, ET1, ET2 and ST are SIMD combinable, and SIMD addition and multiplication are
// available for ET1 and ET2.
5946  enum : bool { simdEnabled = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
5947  MT1::simdEnabled && MT2::simdEnabled &&
5948  AreSIMDCombinable<ET1,ET2,ST>::value &&
5949  HasSIMDAdd<ET1,ET2>::value &&
5950  HasSIMDMult<ET1,ET2>::value };
5951 
// SMP flag: set only if neither operand has to be evaluated and both operand types are
// SMP assignable.
5953  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
5954  !evaluateRight && MT2::smpAssignable };
5955  //**********************************************************************************************
5956 
5957  //**SIMD properties*****************************************************************************
// SIMDSIZE is taken from SIMDTrait<ElementType>::size
5959  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
5960  //**********************************************************************************************
5961 
5962  //**Constructor*********************************************************************************
// Constructor: stores the given multiplication expression and the scalar in the members
// matrix_ and scalar_.
//
// \param matrix The left-hand side matrix/matrix multiplication expression.
// \param scalar The right-hand side scalar of the multiplication expression.
5968  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
5969  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
5970  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
5971  {}
5972  //**********************************************************************************************
5973 
5974  //**Access operator*****************************************************************************
// 2D element access: returns the element (i,j) of the wrapped multiplication expression
// multiplied by the scalar. The indices are only checked via BLAZE_INTERNAL_ASSERT; at()
// provides the checked access.
//
// \param i Row index; expected to be smaller than matrix_.rows().
// \param j Column index; expected to be smaller than matrix_.columns().
// \return The value matrix_(i,j) * scalar_.
//
// The return type is ReturnType (const ElementType): the function yields a single element,
// not a matrix of type ResultType, and at() forwards this result as ReturnType.
5981  inline ReturnType operator()( size_t i, size_t j ) const {
5982  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
5983  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
5984  return matrix_(i,j) * scalar_;
5985  }
5986  //**********************************************************************************************
5987 
5988  //**At function*********************************************************************************
// Checked element access.
//
// \param i Row index; must be smaller than matrix_.rows().
// \param j Column index; must be smaller than matrix_.columns().
// \return The element (i,j), obtained via the function call operator.
//
// An invalid row index is reported via BLAZE_THROW_OUT_OF_RANGE before the column index is
// inspected; an invalid column index is reported the same way.
5996  inline ReturnType at( size_t i, size_t j ) const {
5997  if( !( i < matrix_.rows() ) ) {
5998  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
5999  }
6000  if( !( j < matrix_.columns() ) ) {
6001  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
6002  }
6003  return this->operator()( i, j );
6004  }
6005  //**********************************************************************************************
6006 
6007  //**Rows function*******************************************************************************
// Returns the number of rows, as reported by the wrapped multiplication expression.
6012  inline size_t rows() const {
6013  return matrix_.rows();
6014  }
6015  //**********************************************************************************************
6016 
6017  //**Columns function****************************************************************************
// Returns the number of columns, as reported by the wrapped multiplication expression.
6022  inline size_t columns() const {
6023  return matrix_.columns();
6024  }
6025  //**********************************************************************************************
6026 
6027  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the wrapped matrix/matrix multiplication expression.
6032  inline LeftOperand leftOperand() const {
6033  return matrix_;
6034  }
6035  //**********************************************************************************************
6036 
6037  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the scalar.
6042  inline RightOperand rightOperand() const {
6043  return scalar_;
6044  }
6045  //**********************************************************************************************
6046 
6047  //**********************************************************************************************
// Returns whether the expression can alias with the given address; the query is forwarded to
// the wrapped multiplication expression.
//
// \param alias The alias to be checked.
6053  template< typename T >
6054  inline bool canAlias( const T* alias ) const {
6055  return matrix_.canAlias( alias );
6056  }
6057  //**********************************************************************************************
6058 
6059  //**********************************************************************************************
// Returns whether the expression is aliased with the given address; the query is forwarded to
// the wrapped multiplication expression.
//
// \param alias The alias to be checked.
6065  template< typename T >
6066  inline bool isAliased( const T* alias ) const {
6067  return matrix_.isAliased( alias );
6068  }
6069  //**********************************************************************************************
6070 
6071  //**********************************************************************************************
// Returns the alignment state reported by the wrapped multiplication expression.
6076  inline bool isAligned() const {
6077  return matrix_.isAligned();
6078  }
6079  //**********************************************************************************************
6080 
6081  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments. The element count
// rows() * columns() has to reach SMP_TDMATDMATMULT_THRESHOLD and, if BLAZE_BLAS_IS_PARALLEL
// is set, additionally has to stay below TDMATDMATMULT_THRESHOLD.
6086  inline bool canSMPAssign() const noexcept {
6087  const size_t elements( rows() * columns() );
6088  const bool blasCheckPassed( !BLAZE_BLAS_IS_PARALLEL || ( elements < TDMATDMATMULT_THRESHOLD ) );
6089  return blasCheckPassed && ( elements >= SMP_TDMATDMATMULT_THRESHOLD );
6090  }
6091  //**********************************************************************************************
6092 
6093  private:
6094  //**Member variables****************************************************************************
6095  LeftOperand matrix_;  //!< Left-hand side operand: the matrix product expression whose own operands are read via leftOperand()/rightOperand().
6096  RightOperand scalar_;  //!< Right-hand side operand: the scalar handed to the assignment kernels as scaling factor.
6097  //**********************************************************************************************
6098 
6099  //**Assignment to dense matrices****************************************************************
/*!\brief Assignment of a scaled matrix product (\f$ C=s*A*B \f$) to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// The function fetches the two operands of the wrapped product \a rhs.matrix_, handles the
// degenerate sizes (empty target: nothing to do; inner dimension of zero: the target is reset),
// evaluates both operands via serial() into objects of type \a LT and \a RT, and forwards them
// together with \a rhs.scalar_ to selectAssignKernel().
// NOTE(review): \a MMM, \a LT and \a RT are declared outside this block; presumably MMM is the
// wrapped multiplication expression type and LT/RT the evaluated operand types - confirm.
*/
6111  template< typename MT // Type of the target dense matrix
6112  , bool SO > // Storage order of the target dense matrix
6113  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6114  {
6116 
6117  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6118  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6119 
      // Operands of the wrapped matrix product
6120  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6121  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6122 
      // Empty target: nothing to assign. Inner dimension of zero: the target is reset.
6123  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
6124  return;
6125  }
6126  else if( left.columns() == 0UL ) {
6127  reset( ~lhs );
6128  return;
6129  }
6130 
6131  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6132  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6133 
6134  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6135  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6136  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6137  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6138  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6139  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6140 
      // Hand the evaluated operands and the scalar to the kernel dispatcher
6141  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
6142  }
6143  //**********************************************************************************************
6144 
6145  //**Assignment to dense matrices (kernel selection)*********************************************
6156  template< typename MT3 // Type of the left-hand side target matrix
6157  , typename MT4 // Type of the left-hand side matrix operand
6158  , typename MT5 // Type of the right-hand side matrix operand
6159  , typename ST2 > // Type of the scalar value
6160  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6161  {
6162  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
6163  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
6164  selectSmallAssignKernel( C, A, B, scalar );
6165  else
6166  selectBlasAssignKernel( C, A, B, scalar );
6167  }
6168  //**********************************************************************************************
6169 
6170  //**Default assignment to row-major dense matrices (general/general)****************************
/*!\brief Default assignment of a scaled matrix product to a row-major target (general/general).
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side multiplication operand (not diagonal).
// \param B The right-hand side multiplication operand (not diagonal).
// \param scalar The scaling factor.
// \return void
//
// For every row \a i of \a C the kernel works in three phases: the first relevant \a k
// initializes the row by plain assignment (columns outside the computed range are reset where
// the triangular flags of the operands require it), all further \a k accumulate onto it, and a
// final pass multiplies the computed columns by \a scalar. The ranges of \a k and \a j are
// narrowed via the IsUpper/IsLower/IsStrictlyUpper/IsStrictlyLower flags of \a MT4 and \a MT5.
*/
6184  template< typename MT3 // Type of the left-hand side target matrix
6185  , typename MT4 // Type of the left-hand side matrix operand
6186  , typename MT5 // Type of the right-hand side matrix operand
6187  , typename ST2 > // Type of the scalar value
6188  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6189  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6190  {
6191  const size_t M( A.rows() );
6192  const size_t N( B.columns() );
6193  const size_t K( A.columns() );
6194 
6195  for( size_t i=0UL; i<M; ++i )
6196  {
      // Range of k used for row i of A, narrowed by the triangular flags of MT4
6197  const size_t kbegin( ( IsUpper<MT4>::value )
6198  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6199  :( 0UL ) );
6200  const size_t kend( ( IsLower<MT4>::value )
6201  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
6202  :( K ) );
6203  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
6204 
      // Empty k range for a strictly triangular A: the whole result row is reset
6205  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
6206  for( size_t j=0UL; j<N; ++j ) {
6207  reset( (~C)(i,j) );
6208  }
6209  continue;
6210  }
6211 
      // Phase 1: initialize row i from the first k (k == kbegin) by assignment
6212  {
6213  const size_t jbegin( ( IsUpper<MT5>::value )
6214  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
6215  :( 0UL ) );
6216  const size_t jend( ( IsLower<MT5>::value )
6217  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
6218  :( N ) );
6219  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6220 
6221  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6222  for( size_t j=0UL; j<jbegin; ++j ) {
6223  reset( (~C)(i,j) );
6224  }
6225  }
6226  else if( IsStrictlyUpper<MT5>::value ) {
6227  reset( (~C)(i,0UL) );
6228  }
6229  for( size_t j=jbegin; j<jend; ++j ) {
6230  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6231  }
6232  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6233  for( size_t j=jend; j<N; ++j ) {
6234  reset( (~C)(i,j) );
6235  }
6236  }
6237  else if( IsStrictlyLower<MT5>::value ) {
6238  reset( (~C)(i,N-1UL) );
6239  }
6240  }
6241 
      // Phase 2: accumulate the contributions of the remaining k
6242  for( size_t k=kbegin+1UL; k<kend; ++k )
6243  {
6244  const size_t jbegin( ( IsUpper<MT5>::value )
6245  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
6246  :( 0UL ) );
6247  const size_t jend( ( IsLower<MT5>::value )
6248  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
6249  :( N ) );
6250  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6251 
6252  for( size_t j=jbegin; j<jend; ++j ) {
6253  (~C)(i,j) += A(i,k) * B(k,j);
6254  }
      // For a lower B, column jend is assigned (not accumulated) in this iteration
6255  if( IsLower<MT5>::value ) {
6256  (~C)(i,jend) = A(i,k) * B(k,jend);
6257  }
6258  }
6259 
      // Phase 3: scale the columns of row i selected by the combined triangular flags
6260  {
6261  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6262  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
6263  :( 0UL ) );
6264  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
6265  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
6266  :( N ) );
6267  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6268 
6269  for( size_t j=jbegin; j<jend; ++j ) {
6270  (~C)(i,j) *= scalar;
6271  }
6272  }
6273  }
6274  }
6275  //**********************************************************************************************
6276 
6277  //**Default assignment to column-major dense matrices (general/general)*************************
/*!\brief Default assignment of a scaled matrix product to a column-major target (general/general).
//
// \param C The target left-hand side dense matrix (column-major).
// \param A The left-hand side multiplication operand (not diagonal).
// \param B The right-hand side multiplication operand (not diagonal).
// \param scalar The scaling factor.
// \return void
//
// Column-wise counterpart of the row-major kernel: for every column \a j of \a C the first
// relevant \a k initializes the column by plain assignment (rows outside the computed range are
// reset where the triangular flags of the operands require it), all further \a k accumulate
// onto it, and a final pass multiplies the computed rows by \a scalar. The ranges of \a k and
// \a i are narrowed via the triangular flags of \a MT5 and \a MT4.
*/
6291  template< typename MT3 // Type of the left-hand side target matrix
6292  , typename MT4 // Type of the left-hand side matrix operand
6293  , typename MT5 // Type of the right-hand side matrix operand
6294  , typename ST2 > // Type of the scalar value
6295  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6296  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6297  {
6298  const size_t M( A.rows() );
6299  const size_t N( B.columns() );
6300  const size_t K( A.columns() );
6301 
6302  for( size_t j=0UL; j<N; ++j )
6303  {
      // Range of k used for column j of B, narrowed by the triangular flags of MT5
6304  const size_t kbegin( ( IsLower<MT5>::value )
6305  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6306  :( 0UL ) );
6307  const size_t kend( ( IsUpper<MT5>::value )
6308  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
6309  :( K ) );
6310  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
6311 
      // Empty k range for a strictly triangular B: the whole result column is reset
6312  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
6313  for( size_t i=0UL; i<M; ++i ) {
6314  reset( (~C)(i,j) );
6315  }
6316  continue;
6317  }
6318 
      // Phase 1: initialize column j from the first k (k == kbegin) by assignment
6319  {
6320  const size_t ibegin( ( IsLower<MT4>::value )
6321  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
6322  :( 0UL ) );
6323  const size_t iend( ( IsUpper<MT4>::value )
6324  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
6325  :( M ) );
6326  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6327 
6328  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6329  for( size_t i=0UL; i<ibegin; ++i ) {
6330  reset( (~C)(i,j) );
6331  }
6332  }
6333  else if( IsStrictlyLower<MT4>::value ) {
6334  reset( (~C)(0UL,j) );
6335  }
6336  for( size_t i=ibegin; i<iend; ++i ) {
6337  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6338  }
6339  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6340  for( size_t i=iend; i<M; ++i ) {
6341  reset( (~C)(i,j) );
6342  }
6343  }
6344  else if( IsStrictlyUpper<MT4>::value ) {
6345  reset( (~C)(M-1UL,j) );
6346  }
6347  }
6348 
      // Phase 2: accumulate the contributions of the remaining k
6349  for( size_t k=kbegin+1UL; k<kend; ++k )
6350  {
6351  const size_t ibegin( ( IsLower<MT4>::value )
6352  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
6353  :( 0UL ) );
6354  const size_t iend( ( IsUpper<MT4>::value )
6355  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
6356  :( M ) );
6357  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6358 
6359  for( size_t i=ibegin; i<iend; ++i ) {
6360  (~C)(i,j) += A(i,k) * B(k,j);
6361  }
      // For an upper A, row iend is assigned (not accumulated) in this iteration
6362  if( IsUpper<MT4>::value ) {
6363  (~C)(iend,j) = A(iend,k) * B(k,j);
6364  }
6365  }
6366 
      // Phase 3: scale the rows of column j selected by the combined triangular flags
6367  {
6368  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
6369  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
6370  :( 0UL ) );
6371  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6372  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
6373  :( M ) );
6374  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6375 
6376  for( size_t i=ibegin; i<iend; ++i ) {
6377  (~C)(i,j) *= scalar;
6378  }
6379  }
6380  }
6381  }
6382  //**********************************************************************************************
6383 
6384  //**Default assignment to row-major dense matrices (general/diagonal)***************************
6398  template< typename MT3 // Type of the left-hand side target matrix
6399  , typename MT4 // Type of the left-hand side matrix operand
6400  , typename MT5 // Type of the right-hand side matrix operand
6401  , typename ST2 > // Type of the scalar value
6402  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6403  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6404  {
6405  const size_t M( A.rows() );
6406  const size_t N( B.columns() );
6407 
6408  const size_t block( BLOCK_SIZE );
6409 
6410  for( size_t ii=0UL; ii<M; ii+=block ) {
6411  const size_t iend( min( M, ii+block ) );
6412  for( size_t jj=0UL; jj<N; jj+=block ) {
6413  const size_t jend( min( N, jj+block ) );
6414  for( size_t i=ii; i<iend; ++i )
6415  {
6416  const size_t jbegin( ( IsUpper<MT4>::value )
6417  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
6418  :( jj ) );
6419  const size_t jpos( ( IsLower<MT4>::value )
6420  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
6421  :( jend ) );
6422 
6423  if( IsUpper<MT4>::value ) {
6424  for( size_t j=jj; j<jbegin; ++j ) {
6425  reset( (~C)(i,j) );
6426  }
6427  }
6428  for( size_t j=jbegin; j<jpos; ++j ) {
6429  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6430  }
6431  if( IsLower<MT4>::value ) {
6432  for( size_t j=jpos; j<jend; ++j ) {
6433  reset( (~C)(i,j) );
6434  }
6435  }
6436  }
6437  }
6438  }
6439  }
6440  //**********************************************************************************************
6441 
6442  //**Default assignment to column-major dense matrices (general/diagonal)************************
6456  template< typename MT3 // Type of the left-hand side target matrix
6457  , typename MT4 // Type of the left-hand side matrix operand
6458  , typename MT5 // Type of the right-hand side matrix operand
6459  , typename ST2 > // Type of the scalar value
6460  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6461  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6462  {
6463  const size_t M( A.rows() );
6464  const size_t N( B.columns() );
6465 
6466  for( size_t j=0UL; j<N; ++j )
6467  {
6468  const size_t ibegin( ( IsLower<MT4>::value )
6469  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6470  :( 0UL ) );
6471  const size_t iend( ( IsUpper<MT4>::value )
6472  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
6473  :( M ) );
6474  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6475 
6476  if( IsLower<MT4>::value ) {
6477  for( size_t i=0UL; i<ibegin; ++i ) {
6478  reset( (~C)(i,j) );
6479  }
6480  }
6481  for( size_t i=ibegin; i<iend; ++i ) {
6482  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6483  }
6484  if( IsUpper<MT4>::value ) {
6485  for( size_t i=iend; i<M; ++i ) {
6486  reset( (~C)(i,j) );
6487  }
6488  }
6489  }
6490  }
6491  //**********************************************************************************************
6492 
6493  //**Default assignment to row-major dense matrices (diagonal/general)***************************
6507  template< typename MT3 // Type of the left-hand side target matrix
6508  , typename MT4 // Type of the left-hand side matrix operand
6509  , typename MT5 // Type of the right-hand side matrix operand
6510  , typename ST2 > // Type of the scalar value
6511  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6512  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6513  {
6514  const size_t M( A.rows() );
6515  const size_t N( B.columns() );
6516 
6517  for( size_t i=0UL; i<M; ++i )
6518  {
6519  const size_t jbegin( ( IsUpper<MT5>::value )
6520  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6521  :( 0UL ) );
6522  const size_t jend( ( IsLower<MT5>::value )
6523  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
6524  :( N ) );
6525  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6526 
6527  if( IsUpper<MT5>::value ) {
6528  for( size_t j=0UL; j<jbegin; ++j ) {
6529  reset( (~C)(i,j) );
6530  }
6531  }
6532  for( size_t j=jbegin; j<jend; ++j ) {
6533  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6534  }
6535  if( IsLower<MT5>::value ) {
6536  for( size_t j=jend; j<N; ++j ) {
6537  reset( (~C)(i,j) );
6538  }
6539  }
6540  }
6541  }
6542  //**********************************************************************************************
6543 
6544  //**Default assignment to column-major dense matrices (diagonal/general)************************
6558  template< typename MT3 // Type of the left-hand side target matrix
6559  , typename MT4 // Type of the left-hand side matrix operand
6560  , typename MT5 // Type of the right-hand side matrix operand
6561  , typename ST2 > // Type of the scalar value
6562  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6563  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6564  {
6565  const size_t M( A.rows() );
6566  const size_t N( B.columns() );
6567 
6568  const size_t block( BLOCK_SIZE );
6569 
6570  for( size_t jj=0UL; jj<N; jj+=block ) {
6571  const size_t jend( min( N, jj+block ) );
6572  for( size_t ii=0UL; ii<M; ii+=block ) {
6573  const size_t iend( min( M, ii+block ) );
6574  for( size_t j=jj; j<jend; ++j )
6575  {
6576  const size_t ibegin( ( IsLower<MT5>::value )
6577  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
6578  :( ii ) );
6579  const size_t ipos( ( IsUpper<MT5>::value )
6580  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
6581  :( iend ) );
6582 
6583  if( IsLower<MT5>::value ) {
6584  for( size_t i=ii; i<ibegin; ++i ) {
6585  reset( (~C)(i,j) );
6586  }
6587  }
6588  for( size_t i=ibegin; i<ipos; ++i ) {
6589  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6590  }
6591  if( IsUpper<MT5>::value ) {
6592  for( size_t i=ipos; i<iend; ++i ) {
6593  reset( (~C)(i,j) );
6594  }
6595  }
6596  }
6597  }
6598  }
6599  }
6600  //**********************************************************************************************
6601 
6602  //**Default assignment to dense matrices (diagonal/diagonal)************************************
6616  template< typename MT3 // Type of the left-hand side target matrix
6617  , typename MT4 // Type of the left-hand side matrix operand
6618  , typename MT5 // Type of the right-hand side matrix operand
6619  , typename ST2 > // Type of the scalar value
6620  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6621  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6622  {
6623  reset( C );
6624 
6625  for( size_t i=0UL; i<A.rows(); ++i ) {
6626  C(i,i) = A(i,i) * B(i,i) * scalar;
6627  }
6628  }
6629  //**********************************************************************************************
6630 
6631  //**Default assignment to dense matrices (small matrices)***************************************
/*!\brief Assignment of a small scaled matrix product when the vectorized kernel is not usable.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not fulfilled
// and simply forwards all arguments to selectDefaultAssignKernel().
*/
6645  template< typename MT3 // Type of the left-hand side target matrix
6646  , typename MT4 // Type of the left-hand side matrix operand
6647  , typename MT5 // Type of the right-hand side matrix operand
6648  , typename ST2 > // Type of the scalar value
6649  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6650  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6651  {
6652  selectDefaultAssignKernel( C, A, B, scalar );
6653  }
6654  //**********************************************************************************************
6655 
6656  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
/*!\brief Vectorized assignment of a small scaled matrix product to a row-major target.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is fulfilled. The
// columns of \a C are processed in panels of decreasing width: 8, 4, 2 and 1 SIMD vectors,
// followed by a scalar loop over the columns beyond \a jpos if \a remainder is set. In the
// panels of width 4, 2 and 1 and in the scalar loop the rows are handled in pairs, with a
// separate step for a single leftover row. For each row (pair) the range of \a k is narrowed
// via the IsUpper/IsLower/IsStrictlyUpper/IsStrictlyLower flags of \a MT4 and \a MT5. The
// accumulated sums are multiplied by the scalar (broadcast into \a factor) when they are stored.
// NOTE(review): the SIMD accumulators (xmm1, ...) are only default-constructed before being
// accumulated into; this presumably relies on SIMDType's default constructor yielding zero -
// confirm against the SIMD type definitions.
*/
6671  template< typename MT3 // Type of the left-hand side target matrix
6672  , typename MT4 // Type of the left-hand side matrix operand
6673  , typename MT5 // Type of the right-hand side matrix operand
6674  , typename ST2 > // Type of the scalar value
6675  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6676  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6677  {
6678  const size_t M( A.rows() );
6679  const size_t N( B.columns() );
6680  const size_t K( A.columns() );
6681 
      // A scalar remainder loop is required unless both the target and B are padded
6682  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
6683 
      // With remainder handling, jpos is N rounded down to a multiple of SIMDSIZE (see assert)
6684  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
6685  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
6686 
      // Scaling factor broadcast via set() into a SIMD value
6687  const SIMDType factor( set( scalar ) );
6688 
6689  size_t j( 0UL );
6690 
      // Panels of 8 SIMD vectors, one row at a time
6691  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6692  for( size_t i=0UL; i<M; ++i )
6693  {
6694  const size_t kbegin( ( IsUpper<MT4>::value )
6695  ?( ( IsLower<MT5>::value )
6696  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6697  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6698  :( IsLower<MT5>::value ? j : 0UL ) );
6699  const size_t kend( ( IsLower<MT4>::value )
6700  ?( ( IsUpper<MT5>::value )
6701  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
6702  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6703  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
6704 
6705  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6706 
6707  for( size_t k=kbegin; k<kend; ++k ) {
6708  const SIMDType a1( set( A(i,k) ) );
6709  xmm1 = xmm1 + a1 * B.load(k,j );
6710  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
6711  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
6712  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
6713  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
6714  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
6715  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
6716  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
6717  }
6718 
6719  (~C).store( i, j , xmm1 * factor );
6720  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6721  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6722  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
6723  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
6724  (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
6725  (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
6726  (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
6727  }
6728  }
6729 
      // Panels of 4 SIMD vectors, two rows at a time
6730  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6731  {
6732  size_t i( 0UL );
6733 
6734  for( ; (i+2UL) <= M; i+=2UL )
6735  {
6736  const size_t kbegin( ( IsUpper<MT4>::value )
6737  ?( ( IsLower<MT5>::value )
6738  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6739  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6740  :( IsLower<MT5>::value ? j : 0UL ) );
6741  const size_t kend( ( IsLower<MT4>::value )
6742  ?( ( IsUpper<MT5>::value )
6743  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
6744  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6745  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
6746 
      // xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1
6747  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6748 
6749  for( size_t k=kbegin; k<kend; ++k ) {
6750  const SIMDType a1( set( A(i ,k) ) );
6751  const SIMDType a2( set( A(i+1UL,k) ) );
6752  const SIMDType b1( B.load(k,j ) );
6753  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6754  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6755  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6756  xmm1 = xmm1 + a1 * b1;
6757  xmm2 = xmm2 + a1 * b2;
6758  xmm3 = xmm3 + a1 * b3;
6759  xmm4 = xmm4 + a1 * b4;
6760  xmm5 = xmm5 + a2 * b1;
6761  xmm6 = xmm6 + a2 * b2;
6762  xmm7 = xmm7 + a2 * b3;
6763  xmm8 = xmm8 + a2 * b4;
6764  }
6765 
6766  (~C).store( i , j , xmm1 * factor );
6767  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
6768  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
6769  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
6770  (~C).store( i+1UL, j , xmm5 * factor );
6771  (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
6772  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
6773  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
6774  }
6775 
      // Single leftover row (odd M); its kend is not narrowed by the lower flag of MT4
6776  if( i < M )
6777  {
6778  const size_t kbegin( ( IsUpper<MT4>::value )
6779  ?( ( IsLower<MT5>::value )
6780  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6781  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6782  :( IsLower<MT5>::value ? j : 0UL ) );
6783  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
6784 
6785  SIMDType xmm1, xmm2, xmm3, xmm4;
6786 
6787  for( size_t k=kbegin; k<kend; ++k ) {
6788  const SIMDType a1( set( A(i,k) ) );
6789  xmm1 = xmm1 + a1 * B.load(k,j );
6790  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
6791  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
6792  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
6793  }
6794 
6795  (~C).store( i, j , xmm1 * factor );
6796  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6797  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6798  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
6799  }
6800  }
6801 
      // Panels of 2 SIMD vectors, two rows at a time
6802  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6803  {
6804  size_t i( 0UL );
6805 
6806  for( ; (i+2UL) <= M; i+=2UL )
6807  {
6808  const size_t kbegin( ( IsUpper<MT4>::value )
6809  ?( ( IsLower<MT5>::value )
6810  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6811  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6812  :( IsLower<MT5>::value ? j : 0UL ) );
6813  const size_t kend( ( IsLower<MT4>::value )
6814  ?( ( IsUpper<MT5>::value )
6815  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
6816  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6817  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
6818 
      // xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1
6819  SIMDType xmm1, xmm2, xmm3, xmm4;
6820 
6821  for( size_t k=kbegin; k<kend; ++k ) {
6822  const SIMDType a1( set( A(i ,k) ) );
6823  const SIMDType a2( set( A(i+1UL,k) ) );
6824  const SIMDType b1( B.load(k,j ) );
6825  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6826  xmm1 = xmm1 + a1 * b1;
6827  xmm2 = xmm2 + a1 * b2;
6828  xmm3 = xmm3 + a2 * b1;
6829  xmm4 = xmm4 + a2 * b2;
6830  }
6831 
6832  (~C).store( i , j , xmm1 * factor );
6833  (~C).store( i , j+SIMDSIZE, xmm2 * factor );
6834  (~C).store( i+1UL, j , xmm3 * factor );
6835  (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
6836  }
6837 
      // Single leftover row (odd M)
6838  if( i < M )
6839  {
6840  const size_t kbegin( ( IsUpper<MT4>::value )
6841  ?( ( IsLower<MT5>::value )
6842  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6843  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6844  :( IsLower<MT5>::value ? j : 0UL ) );
6845  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
6846 
6847  SIMDType xmm1, xmm2;
6848 
6849  for( size_t k=kbegin; k<kend; ++k ) {
6850  const SIMDType a1( set( A(i,k) ) );
6851  xmm1 = xmm1 + a1 * B.load(k,j );
6852  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
6853  }
6854 
6855  (~C).store( i, j , xmm1 * factor );
6856  (~C).store( i, j+SIMDSIZE, xmm2 * factor );
6857  }
6858  }
6859 
      // Panels of a single SIMD vector, two rows at a time
6860  for( ; j<jpos; j+=SIMDSIZE )
6861  {
6862  size_t i( 0UL );
6863 
6864  for( ; (i+2UL) <= M; i+=2UL )
6865  {
6866  const size_t kbegin( ( IsUpper<MT4>::value )
6867  ?( ( IsLower<MT5>::value )
6868  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6869  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6870  :( IsLower<MT5>::value ? j : 0UL ) );
6871  const size_t kend( ( IsLower<MT4>::value )
6872  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6873  :( K ) );
6874 
6875  SIMDType xmm1, xmm2;
6876 
6877  for( size_t k=kbegin; k<kend; ++k ) {
6878  const SIMDType b1( B.load(k,j) );
6879  xmm1 = xmm1 + set( A(i ,k) ) * b1;
6880  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
6881  }
6882 
6883  (~C).store( i , j, xmm1 * factor );
6884  (~C).store( i+1UL, j, xmm2 * factor );
6885  }
6886 
      // Single leftover row (odd M); k runs up to K
6887  if( i < M )
6888  {
6889  const size_t kbegin( ( IsUpper<MT4>::value )
6890  ?( ( IsLower<MT5>::value )
6891  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6892  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6893  :( IsLower<MT5>::value ? j : 0UL ) );
6894 
6895  SIMDType xmm1;
6896 
6897  for( size_t k=kbegin; k<K; ++k ) {
6898  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
6899  }
6900 
6901  (~C).store( i, j, xmm1 * factor );
6902  }
6903  }
6904 
      // Scalar loop over the columns beyond jpos (only executed if remainder is set)
6905  for( ; remainder && j<N; ++j )
6906  {
6907  size_t i( 0UL );
6908 
6909  for( ; (i+2UL) <= M; i+=2UL )
6910  {
6911  const size_t kbegin( ( IsUpper<MT4>::value )
6912  ?( ( IsLower<MT5>::value )
6913  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6914  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6915  :( IsLower<MT5>::value ? j : 0UL ) );
6916  const size_t kend( ( IsLower<MT4>::value )
6917  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6918  :( K ) );
6919 
6920  ElementType value1 = ElementType();
6921  ElementType value2 = ElementType();
6922 
6923  for( size_t k=kbegin; k<kend; ++k ) {
6924  value1 += A(i ,k) * B(k,j);
6925  value2 += A(i+1UL,k) * B(k,j);
6926  }
6927 
6928  (~C)(i ,j) = value1 * scalar;
6929  (~C)(i+1UL,j) = value2 * scalar;
6930  }
6931 
      // Single leftover row (odd M)
6932  if( i < M )
6933  {
6934  const size_t kbegin( ( IsUpper<MT4>::value )
6935  ?( ( IsLower<MT5>::value )
6936  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6937  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6938  :( IsLower<MT5>::value ? j : 0UL ) );
6939 
6940  ElementType value = ElementType();
6941 
6942  for( size_t k=kbegin; k<K; ++k ) {
6943  value += A(i,k) * B(k,j);
6944  }
6945 
6946  (~C)(i,j) = value * scalar;
6947  }
6948  }
6949  }
6950  //**********************************************************************************************
6951 
6952  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
6967  template< typename MT3 // Type of the left-hand side target matrix
6968  , typename MT4 // Type of the left-hand side matrix operand
6969  , typename MT5 // Type of the right-hand side matrix operand
6970  , typename ST2 > // Type of the scalar value
6971  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6972  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6973  {
6974  const size_t M( A.rows() );
6975  const size_t N( B.columns() );
6976  const size_t K( A.columns() );
6977 
6978  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
6979 
6980  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
6981  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
6982 
6983  const SIMDType factor( set( scalar ) );
6984 
6985  size_t i( 0UL );
6986 
6987  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6988  for( size_t j=0UL; j<N; ++j )
6989  {
6990  const size_t kbegin( ( IsLower<MT5>::value )
6991  ?( ( IsUpper<MT4>::value )
6992  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6993  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6994  :( IsUpper<MT4>::value ? i : 0UL ) );
6995  const size_t kend( ( IsUpper<MT5>::value )
6996  ?( ( IsLower<MT4>::value )
6997  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6998  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6999  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
7000 
7001  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7002 
7003  for( size_t k=kbegin; k<kend; ++k ) {
7004  const SIMDType b1( set( B(k,j) ) );
7005  xmm1 = xmm1 + A.load(i ,k) * b1;
7006  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
7007  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
7008  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
7009  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
7010  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
7011  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
7012  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
7013  }
7014 
7015  (~C).store( i , j, xmm1 * factor );
7016  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7017  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7018  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
7019  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
7020  (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
7021  (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
7022  (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
7023  }
7024  }
7025 
7026  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7027  {
7028  size_t j( 0UL );
7029 
7030  for( ; (j+2UL) <= N; j+=2UL )
7031  {
7032  const size_t kbegin( ( IsLower<MT5>::value )
7033  ?( ( IsUpper<MT4>::value )
7034  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7035  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7036  :( IsUpper<MT4>::value ? i : 0UL ) );
7037  const size_t kend( ( IsUpper<MT5>::value )
7038  ?( ( IsLower<MT4>::value )
7039  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7040  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7041  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
7042 
7043  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7044 
7045  for( size_t k=kbegin; k<kend; ++k ) {
7046  const SIMDType a1( A.load(i ,k) );
7047  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7048  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7049  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7050  const SIMDType b1( set( B(k,j ) ) );
7051  const SIMDType b2( set( B(k,j+1UL) ) );
7052  xmm1 = xmm1 + a1 * b1;
7053  xmm2 = xmm2 + a2 * b1;
7054  xmm3 = xmm3 + a3 * b1;
7055  xmm4 = xmm4 + a4 * b1;
7056  xmm5 = xmm5 + a1 * b2;
7057  xmm6 = xmm6 + a2 * b2;
7058  xmm7 = xmm7 + a3 * b2;
7059  xmm8 = xmm8 + a4 * b2;
7060  }
7061 
7062  (~C).store( i , j , xmm1 * factor );
7063  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
7064  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
7065  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
7066  (~C).store( i , j+1UL, xmm5 * factor );
7067  (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
7068  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
7069  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
7070  }
7071 
7072  if( j < N )
7073  {
7074  const size_t kbegin( ( IsLower<MT5>::value )
7075  ?( ( IsUpper<MT4>::value )
7076  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7077  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7078  :( IsUpper<MT4>::value ? i : 0UL ) );
7079  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
7080 
7081  SIMDType xmm1, xmm2, xmm3, xmm4;
7082 
7083  for( size_t k=kbegin; k<kend; ++k ) {
7084  const SIMDType b1( set( B(k,j) ) );
7085  xmm1 = xmm1 + A.load(i ,k) * b1;
7086  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
7087  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
7088  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
7089  }
7090 
7091  (~C).store( i , j, xmm1 * factor );
7092  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7093  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7094  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
7095  }
7096  }
7097 
7098  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7099  {
7100  size_t j( 0UL );
7101 
7102  for( ; (j+2UL) <= N; j+=2UL )
7103  {
7104  const size_t kbegin( ( IsLower<MT5>::value )
7105  ?( ( IsUpper<MT4>::value )
7106  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7107  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7108  :( IsUpper<MT4>::value ? i : 0UL ) );
7109  const size_t kend( ( IsUpper<MT5>::value )
7110  ?( ( IsLower<MT4>::value )
7111  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7112  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7113  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
7114 
7115  SIMDType xmm1, xmm2, xmm3, xmm4;
7116 
7117  for( size_t k=kbegin; k<kend; ++k ) {
7118  const SIMDType a1( A.load(i ,k) );
7119  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7120  const SIMDType b1( set( B(k,j ) ) );
7121  const SIMDType b2( set( B(k,j+1UL) ) );
7122  xmm1 = xmm1 + a1 * b1;
7123  xmm2 = xmm2 + a2 * b1;
7124  xmm3 = xmm3 + a1 * b2;
7125  xmm4 = xmm4 + a2 * b2;
7126  }
7127 
7128  (~C).store( i , j , xmm1 * factor );
7129  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
7130  (~C).store( i , j+1UL, xmm3 * factor );
7131  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
7132  }
7133 
7134  if( j < N )
7135  {
7136  const size_t kbegin( ( IsLower<MT5>::value )
7137  ?( ( IsUpper<MT4>::value )
7138  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7139  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7140  :( IsUpper<MT4>::value ? i : 0UL ) );
7141  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
7142 
7143  SIMDType xmm1, xmm2;
7144 
7145  for( size_t k=kbegin; k<kend; ++k ) {
7146  const SIMDType b1( set( B(k,j) ) );
7147  xmm1 = xmm1 + A.load(i ,k) * b1;
7148  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
7149  }
7150 
7151  (~C).store( i , j, xmm1 * factor );
7152  (~C).store( i+SIMDSIZE, j, xmm2 * factor );
7153  }
7154  }
7155 
7156  for( ; i<ipos; i+=SIMDSIZE )
7157  {
7158  size_t j( 0UL );
7159 
7160  for( ; (j+2UL) <= N; j+=2UL )
7161  {
7162  const size_t kbegin( ( IsLower<MT5>::value )
7163  ?( ( IsUpper<MT4>::value )
7164  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7165  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7166  :( IsUpper<MT4>::value ? i : 0UL ) );
7167  const size_t kend( ( IsUpper<MT5>::value )
7168  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7169  :( K ) );
7170 
7171  SIMDType xmm1, xmm2;
7172 
7173  for( size_t k=kbegin; k<kend; ++k ) {
7174  const SIMDType a1( A.load(i,k) );
7175  xmm1 = xmm1 + a1 * set( B(k,j ) );
7176  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
7177  }
7178 
7179  (~C).store( i, j , xmm1 * factor );
7180  (~C).store( i, j+1UL, xmm2 * factor );
7181  }
7182 
7183  if( j < N )
7184  {
7185  const size_t kbegin( ( IsLower<MT5>::value )
7186  ?( ( IsUpper<MT4>::value )
7187  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7188  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7189  :( IsUpper<MT4>::value ? i : 0UL ) );
7190 
7191  SIMDType xmm1;
7192 
7193  for( size_t k=kbegin; k<K; ++k ) {
7194  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
7195  }
7196 
7197  (~C).store( i, j, xmm1 * factor );
7198  }
7199  }
7200 
7201  for( ; remainder && i<M; ++i )
7202  {
7203  size_t j( 0UL );
7204 
7205  for( ; (j+2UL) <= N; j+=2UL )
7206  {
7207  const size_t kbegin( ( IsLower<MT5>::value )
7208  ?( ( IsUpper<MT4>::value )
7209  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7210  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7211  :( IsUpper<MT4>::value ? i : 0UL ) );
7212  const size_t kend( ( IsUpper<MT5>::value )
7213  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7214  :( K ) );
7215 
7216  ElementType value1 = ElementType();
7217  ElementType value2 = ElementType();
7218 
7219  for( size_t k=kbegin; k<kend; ++k ) {
7220  value1 += A(i,k) * B(k,j );
7221  value2 += A(i,k) * B(k,j+1UL);
7222  }
7223 
7224  (~C)(i,j ) = value1 * scalar;
7225  (~C)(i,j+1UL) = value2 * scalar;
7226  }
7227 
7228  if( j < N )
7229  {
7230  const size_t kbegin( ( IsLower<MT5>::value )
7231  ?( ( IsUpper<MT4>::value )
7232  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7233  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7234  :( IsUpper<MT4>::value ? i : 0UL ) );
7235 
7236  ElementType value = ElementType();
7237 
7238  for( size_t k=kbegin; k<K; ++k ) {
7239  value += A(i,k) * B(k,j);
7240  }
7241 
7242  (~C)(i,j) = value * scalar;
7243  }
7244  }
7245  }
7246  //**********************************************************************************************
7247 
7248  //**Default assignment to dense matrices (large matrices)***************************************
/*!\brief Non-vectorized fallback of the large-matrix assignment kernel.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only when
// \c UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (\c DisableIf_). It does no
// work of its own and forwards all four arguments unchanged to \c selectDefaultAssignKernel().
*/
7262  template< typename MT3 // Type of the left-hand side target matrix
7263  , typename MT4 // Type of the left-hand side matrix operand
7264  , typename MT5 // Type of the right-hand side matrix operand
7265  , typename ST2 > // Type of the scalar value
7266  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7267  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7268  {
7269  selectDefaultAssignKernel( C, A, B, scalar );
7270  }
7271  //**********************************************************************************************
7272 
7273  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
/*!\brief Vectorized, cache-blocked assignment kernel for row-major targets.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// Enabled only when \c UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. The kernel walks C in
// tiles of DMATDMATMULT_DEFAULT_JBLOCK_SIZE columns by DMATDMATMULT_DEFAULT_IBLOCK_SIZE rows.
// Each tile is first reset and then receives, for every k-block of
// DMATDMATMULT_DEFAULT_KBLOCK_SIZE, the partial products of A and B multiplied by \a scalar.
// Within a tile the columns are processed with SIMD vectors of B (4, 2 or 1 vector wide),
// combined with broadcast elements of A; columns beyond \c jpos are handled by a scalar loop.
//
// NOTE(review): the xmm accumulators are default-constructed and never explicitly zeroed;
// this relies on SIMDType's default constructor producing an all-zero vector - confirm.
*/
7288  template< typename MT3 // Type of the left-hand side target matrix
7289  , typename MT4 // Type of the left-hand side matrix operand
7290  , typename MT5 // Type of the right-hand side matrix operand
7291  , typename ST2 > // Type of the scalar value
7292  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7293  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7294  {
7295  const size_t M( A.rows() );
7296  const size_t N( B.columns() );
7297  const size_t K( A.columns() );
7298 
      // A scalar clean-up loop over the trailing columns is required unless both C and B are padded.
7299  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
7300 
      // The scalar is broadcast once into a SIMD vector and applied when storing into C.
7301  const SIMDType factor( set( scalar ) );
7302 
7303  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
7304  {
7305  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
7306 
      // jpos is the end of the column range handled with SIMD loads/stores. With a remainder it is
      // jend rounded down to a multiple of SIMDSIZE (verified by the assertion below).
7307  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
7308  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
7309 
7310  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
7311  {
7312  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
7313 
      // Clear the current tile of C; all k-blocks below accumulate into it.
7314  for( size_t i=ii; i<iend; ++i ) {
7315  for( size_t j=jj; j<jend; ++j ) {
7316  reset( (~C)(i,j) );
7317  }
7318  }
7319 
7320  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
7321  {
7322  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
7323 
7324  size_t j( jj );
7325 
      // Four SIMD vectors per row: first pairs of rows, then a possible single trailing row.
7326  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7327  {
7328  const size_t j1( j+SIMDSIZE );
7329  const size_t j2( j+SIMDSIZE*2UL );
7330  const size_t j3( j+SIMDSIZE*3UL );
7331 
7332  size_t i( ii );
7333 
7334  for( ; (i+2UL) <= iend; i+=2UL )
7335  {
      // k range inside the current k-block, narrowed according to the IsUpper/IsLower flags of
      // the operands (presumably to skip elements known to be zero - confirm).
7336  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7337  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7338  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7339  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
7340 
7341  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7342 
7343  for( size_t k=kbegin; k<kend; ++k ) {
7344  const SIMDType a1( set( A(i ,k) ) );
7345  const SIMDType a2( set( A(i+1UL,k) ) );
7346  const SIMDType b1( B.load(k,j ) );
7347  const SIMDType b2( B.load(k,j1) );
7348  const SIMDType b3( B.load(k,j2) );
7349  const SIMDType b4( B.load(k,j3) );
7350  xmm1 = xmm1 + a1 * b1;
7351  xmm2 = xmm2 + a1 * b2;
7352  xmm3 = xmm3 + a1 * b3;
7353  xmm4 = xmm4 + a1 * b4;
7354  xmm5 = xmm5 + a2 * b1;
7355  xmm6 = xmm6 + a2 * b2;
7356  xmm7 = xmm7 + a2 * b3;
7357  xmm8 = xmm8 + a2 * b4;
7358  }
7359 
      // Add the scaled partial sums of this k-block to the values already stored in C.
7360  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7361  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7362  (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
7363  (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
7364  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
7365  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
7366  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
7367  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
7368  }
7369 
7370  if( i < iend )
7371  {
7372  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7373  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7374  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7375  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
7376 
7377  SIMDType xmm1, xmm2, xmm3, xmm4;
7378 
7379  for( size_t k=kbegin; k<kend; ++k ) {
7380  const SIMDType a1( set( A(i,k) ) );
7381  xmm1 = xmm1 + a1 * B.load(k,j );
7382  xmm2 = xmm2 + a1 * B.load(k,j1);
7383  xmm3 = xmm3 + a1 * B.load(k,j2);
7384  xmm4 = xmm4 + a1 * B.load(k,j3);
7385  }
7386 
7387  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7388  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
7389  (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
7390  (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
7391  }
7392  }
7393 
      // Two SIMD vectors per row: groups of four rows, then two rows, then a single trailing row.
7394  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
7395  {
7396  const size_t j1( j+SIMDSIZE );
7397 
7398  size_t i( ii );
7399 
7400  for( ; (i+4UL) <= iend; i+=4UL )
7401  {
7402  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7403  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7404  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7405  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7406 
7407  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7408 
7409  for( size_t k=kbegin; k<kend; ++k ) {
7410  const SIMDType a1( set( A(i ,k) ) );
7411  const SIMDType a2( set( A(i+1UL,k) ) );
7412  const SIMDType a3( set( A(i+2UL,k) ) );
7413  const SIMDType a4( set( A(i+3UL,k) ) );
7414  const SIMDType b1( B.load(k,j ) );
7415  const SIMDType b2( B.load(k,j1) );
7416  xmm1 = xmm1 + a1 * b1;
7417  xmm2 = xmm2 + a1 * b2;
7418  xmm3 = xmm3 + a2 * b1;
7419  xmm4 = xmm4 + a2 * b2;
7420  xmm5 = xmm5 + a3 * b1;
7421  xmm6 = xmm6 + a3 * b2;
7422  xmm7 = xmm7 + a4 * b1;
7423  xmm8 = xmm8 + a4 * b2;
7424  }
7425 
7426  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7427  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7428  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
7429  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
7430  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
7431  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
7432  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
7433  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
7434  }
7435 
7436  for( ; (i+2UL) <= iend; i+=2UL )
7437  {
7438  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7439  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7440  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7441  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7442 
7443  SIMDType xmm1, xmm2, xmm3, xmm4;
7444 
7445  for( size_t k=kbegin; k<kend; ++k ) {
7446  const SIMDType a1( set( A(i ,k) ) );
7447  const SIMDType a2( set( A(i+1UL,k) ) );
7448  const SIMDType b1( B.load(k,j ) );
7449  const SIMDType b2( B.load(k,j1) );
7450  xmm1 = xmm1 + a1 * b1;
7451  xmm2 = xmm2 + a1 * b2;
7452  xmm3 = xmm3 + a2 * b1;
7453  xmm4 = xmm4 + a2 * b2;
7454  }
7455 
7456  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7457  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7458  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
7459  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
7460  }
7461 
7462  if( i < iend )
7463  {
7464  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7465  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7466  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7467  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7468 
7469  SIMDType xmm1, xmm2;
7470 
7471  for( size_t k=kbegin; k<kend; ++k ) {
7472  const SIMDType a1( set( A(i,k) ) );
7473  xmm1 = xmm1 + a1 * B.load(k,j );
7474  xmm2 = xmm2 + a1 * B.load(k,j1);
7475  }
7476 
7477  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7478  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
7479  }
7480  }
7481 
      // A single SIMD vector per row for the last full vector below jpos.
7482  for( ; j<jpos; j+=SIMDSIZE )
7483  {
7484  for( size_t i=ii; i<iend; ++i )
7485  {
7486  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7487  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7488  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7489  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
7490 
7491  SIMDType xmm1;
7492 
7493  for( size_t k=kbegin; k<kend; ++k ) {
7494  const SIMDType a1( set( A(i,k) ) );
7495  xmm1 = xmm1 + a1 * B.load(k,j);
7496  }
7497 
7498  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
7499  }
7500  }
7501 
      // Scalar clean-up for the columns in [jpos, jend); only entered when 'remainder' is set.
7502  for( ; remainder && j<jend; ++j )
7503  {
7504  for( size_t i=ii; i<iend; ++i )
7505  {
7506  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7507  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7508  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7509  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
7510 
7511  ElementType value = ElementType();
7512 
7513  for( size_t k=kbegin; k<kend; ++k ) {
7514  value += A(i,k) * B(k,j);
7515  }
7516 
7517  (~C)(i,j) += value * scalar;
7518  }
7519  }
7520  }
7521  }
7522  }
7523  }
7524  //**********************************************************************************************
7525 
7526  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
/*!\brief Vectorized, cache-blocked assignment kernel for column-major targets.
//
// \param C The target left-hand side dense matrix (column-major).
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// Enabled only when \c UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. The kernel walks C in
// tiles of TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE rows by TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE columns.
// Each tile is first reset and then receives, for every k-block of
// TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, the partial products of A and B multiplied by \a scalar.
// Within a tile the rows are processed with SIMD vectors of A (4, 2 or 1 vector tall), combined
// with broadcast elements of B; rows beyond \c ipos are handled by a scalar loop.
//
// NOTE(review): the xmm accumulators are default-constructed and never explicitly zeroed;
// this relies on SIMDType's default constructor producing an all-zero vector - confirm.
*/
7541  template< typename MT3 // Type of the left-hand side target matrix
7542  , typename MT4 // Type of the left-hand side matrix operand
7543  , typename MT5 // Type of the right-hand side matrix operand
7544  , typename ST2 > // Type of the scalar value
7545  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7546  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7547  {
7548  const size_t M( A.rows() );
7549  const size_t N( B.columns() );
7550  const size_t K( A.columns() );
7551 
      // A scalar clean-up loop over the trailing rows is required unless both C and A are padded.
7552  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
7553 
      // The scalar is broadcast once into a SIMD vector and applied when storing into C.
7554  const SIMDType factor( set( scalar ) );
7555 
7556  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
7557  {
7558  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
7559 
      // ipos is the end of the row range handled with SIMD loads/stores. With a remainder it is
      // iend rounded down to a multiple of SIMDSIZE (verified by the assertion below).
7560  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
7561  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
7562 
7563  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
7564  {
7565  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
7566 
      // Clear the current tile of C; all k-blocks below accumulate into it.
7567  for( size_t j=jj; j<jend; ++j ) {
7568  for( size_t i=ii; i<iend; ++i ) {
7569  reset( (~C)(i,j) );
7570  }
7571  }
7572 
7573  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
7574  {
7575  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
7576 
7577  size_t i( ii );
7578 
      // Four SIMD vectors per column: first pairs of columns, then a possible single trailing column.
7579  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7580  {
7581  const size_t i1( i+SIMDSIZE );
7582  const size_t i2( i+SIMDSIZE*2UL );
7583  const size_t i3( i+SIMDSIZE*3UL );
7584 
7585  size_t j( jj );
7586 
7587  for( ; (j+2UL) <= jend; j+=2UL )
7588  {
      // k range inside the current k-block, narrowed according to the IsUpper/IsLower flags of
      // the operands (presumably to skip elements known to be zero - confirm).
7589  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7590  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7591  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
7592  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7593 
7594  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7595 
7596  for( size_t k=kbegin; k<kend; ++k ) {
7597  const SIMDType a1( A.load(i ,k) );
7598  const SIMDType a2( A.load(i1,k) );
7599  const SIMDType a3( A.load(i2,k) );
7600  const SIMDType a4( A.load(i3,k) );
7601  const SIMDType b1( set( B(k,j ) ) );
7602  const SIMDType b2( set( B(k,j+1UL) ) );
7603  xmm1 = xmm1 + a1 * b1;
7604  xmm2 = xmm2 + a2 * b1;
7605  xmm3 = xmm3 + a3 * b1;
7606  xmm4 = xmm4 + a4 * b1;
7607  xmm5 = xmm5 + a1 * b2;
7608  xmm6 = xmm6 + a2 * b2;
7609  xmm7 = xmm7 + a3 * b2;
7610  xmm8 = xmm8 + a4 * b2;
7611  }
7612 
      // Add the scaled partial sums of this k-block to the values already stored in C.
7613  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7614  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7615  (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
7616  (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
7617  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
7618  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
7619  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
7620  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
7621  }
7622 
7623  if( j < jend )
7624  {
7625  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7626  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7627  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
7628  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7629 
7630  SIMDType xmm1, xmm2, xmm3, xmm4;
7631 
7632  for( size_t k=kbegin; k<kend; ++k ) {
7633  const SIMDType b1( set( B(k,j) ) );
7634  xmm1 = xmm1 + A.load(i ,k) * b1;
7635  xmm2 = xmm2 + A.load(i1,k) * b1;
7636  xmm3 = xmm3 + A.load(i2,k) * b1;
7637  xmm4 = xmm4 + A.load(i3,k) * b1;
7638  }
7639 
7640  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
7641  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
7642  (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
7643  (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
7644  }
7645  }
7646 
      // Two SIMD vectors per column: groups of four columns, then two, then a single trailing column.
7647  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7648  {
7649  const size_t i1( i+SIMDSIZE );
7650 
7651  size_t j( jj );
7652 
7653  for( ; (j+4UL) <= jend; j+=4UL )
7654  {
7655  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7656  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7657  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7658  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7659 
7660  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7661 
7662  for( size_t k=kbegin; k<kend; ++k ) {
7663  const SIMDType a1( A.load(i ,k) );
7664  const SIMDType a2( A.load(i1,k) );
7665  const SIMDType b1( set( B(k,j ) ) );
7666  const SIMDType b2( set( B(k,j+1UL) ) );
7667  const SIMDType b3( set( B(k,j+2UL) ) );
7668  const SIMDType b4( set( B(k,j+3UL) ) );
7669  xmm1 = xmm1 + a1 * b1;
7670  xmm2 = xmm2 + a2 * b1;
7671  xmm3 = xmm3 + a1 * b2;
7672  xmm4 = xmm4 + a2 * b2;
7673  xmm5 = xmm5 + a1 * b3;
7674  xmm6 = xmm6 + a2 * b3;
7675  xmm7 = xmm7 + a1 * b4;
7676  xmm8 = xmm8 + a2 * b4;
7677  }
7678 
7679  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7680  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7681  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
7682  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
7683  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
7684  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
7685  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
7686  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
7687  }
7688 
7689  for( ; (j+2UL) <= jend; j+=2UL )
7690  {
7691  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7692  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7693  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7694  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7695 
7696  SIMDType xmm1, xmm2, xmm3, xmm4;
7697 
7698  for( size_t k=kbegin; k<kend; ++k ) {
7699  const SIMDType a1( A.load(i ,k) );
7700  const SIMDType a2( A.load(i1,k) );
7701  const SIMDType b1( set( B(k,j ) ) );
7702  const SIMDType b2( set( B(k,j+1UL) ) );
7703  xmm1 = xmm1 + a1 * b1;
7704  xmm2 = xmm2 + a2 * b1;
7705  xmm3 = xmm3 + a1 * b2;
7706  xmm4 = xmm4 + a2 * b2;
7707  }
7708 
7709  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7710  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7711  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
7712  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
7713  }
7714 
7715  if( j < jend )
7716  {
7717  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7718  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7719  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7720  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7721 
7722  SIMDType xmm1, xmm2;
7723 
7724  for( size_t k=kbegin; k<kend; ++k ) {
7725  const SIMDType b1( set( B(k,j) ) );
7726  xmm1 = xmm1 + A.load(i ,k) * b1;
7727  xmm2 = xmm2 + A.load(i1,k) * b1;
7728  }
7729 
7730  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
7731  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
7732  }
7733  }
7734 
      // A single SIMD vector per column for the last full vector below ipos.
7735  for( ; i<ipos; i+=SIMDSIZE )
7736  {
7737  for( size_t j=jj; j<jend; ++j )
7738  {
7739  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7740  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7741  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
7742  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7743 
7744  SIMDType xmm1;
7745 
7746  for( size_t k=kbegin; k<kend; ++k ) {
7747  const SIMDType b1( set( B(k,j) ) );
7748  xmm1 = xmm1 + A.load(i,k) * b1;
7749  }
7750 
7751  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
7752  }
7753  }
7754 
      // Scalar clean-up for the rows in [ipos, iend); only entered when 'remainder' is set.
7755  for( ; remainder && i<iend; ++i )
7756  {
7757  for( size_t j=jj; j<jend; ++j )
7758  {
7759  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7760  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7761  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
7762  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7763 
7764  ElementType value = ElementType();
7765 
7766  for( size_t k=kbegin; k<kend; ++k ) {
7767  value += A(i,k) * B(k,j);
7768  }
7769 
7770  (~C)(i,j) += value * scalar;
7771  }
7772  }
7773  }
7774  }
7775  }
7776  }
7777  //**********************************************************************************************
7778 
7779  //**BLAS-based assignment to dense matrices (default)*******************************************
/*!\brief Fallback of the BLAS-based assignment kernel.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only when \c UseBlasKernel<MT3,MT4,MT5,ST2>
// does not hold (\c DisableIf_). It forwards all four arguments unchanged to
// \c selectLargeAssignKernel().
*/
7793  template< typename MT3 // Type of the left-hand side target matrix
7794  , typename MT4 // Type of the left-hand side matrix operand
7795  , typename MT5 // Type of the right-hand side matrix operand
7796  , typename ST2 > // Type of the scalar value
7797  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7798  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7799  {
7800  selectLargeAssignKernel( C, A, B, scalar );
7801  }
7802  //**********************************************************************************************
7803 
7804  //**BLAS-based assignment to dense matrices*****************************************************
7805 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7806 
/*!\brief BLAS-based assignment kernel.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// Enabled only when \c UseBlasKernel<MT3,MT4,MT5,ST2> holds and compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set. The scalar is
// converted to the element type of the target before it is handed to the BLAS wrappers.
// Three cases are distinguished at compile time:
//  - A is triangular: B is assigned to C and trmm() is called with A on the left side;
//  - otherwise B is triangular: A is assigned to C and trmm() is called with B on the right side;
//  - otherwise: gemm() is called with the factors \a scalar and 0.
//
// NOTE(review): the trmm()/gemm() wrappers are declared outside this excerpt; presumably trmm()
// multiplies C in place by the triangular operand and gemm() overwrites C - confirm.
*/
7819  template< typename MT3 // Type of the left-hand side target matrix
7820  , typename MT4 // Type of the left-hand side matrix operand
7821  , typename MT5 // Type of the right-hand side matrix operand
7822  , typename ST2 > // Type of the scalar value
7823  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7824  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7825  {
7826  typedef ElementType_<MT3> ET;
7827 
7828  if( IsTriangular<MT4>::value ) {
7829  assign( C, B );
7830  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7831  }
7832  else if( IsTriangular<MT5>::value ) {
7833  assign( C, A );
7834  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7835  }
7836  else {
7837  gemm( C, A, B, ET(scalar), ET(0) );
7838  }
7839  }
7840 #endif
7841  //**********************************************************************************************
7842 
7843  //**Assignment to sparse matrices***************************************************************
/*!\brief Assignment of the scaled matrix multiplication expression to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// The expression is not assigned element by element. It is first evaluated serially into a
// temporary of type \c TmpType, which is \c ResultType when \a SO is \c true and
// \c OppositeType otherwise, and this temporary is then assigned to \a lhs. The dimensions of
// \a lhs and \a rhs are only checked by internal assertions.
*/
7855  template< typename MT // Type of the target sparse matrix
7856  , bool SO > // Storage order of the target sparse matrix
7857  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7858  {
7860 
7861  typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
7862 
7868  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
7869 
7870  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7871  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7872 
7873  const TmpType tmp( serial( rhs ) );
7874  assign( ~lhs, tmp );
7875  }
7876  //**********************************************************************************************
7877 
7878  //**Addition assignment to dense matrices*******************************************************
/*!\brief Addition assignment of the scaled matrix multiplication expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// The two operands of the wrapped multiplication are fetched from \c rhs.matrix_, evaluated
// serially into \c A and \c B, and passed together with \c rhs.scalar_ to
// selectAddAssignKernel(). The function returns without touching \a lhs when the target has no
// rows or no columns, or when the left operand has no columns.
//
// NOTE(review): the types \c MMM, \c LT and \c RT are declared outside this excerpt.
*/
7890  template< typename MT // Type of the target dense matrix
7891  , bool SO > // Storage order of the target dense matrix
7892  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7893  {
7895 
7896  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7897  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7898 
7899  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7900  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7901 
      // Empty target or empty inner dimension: there is nothing to add.
7902  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7903  return;
7904  }
7905 
7906  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7907  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7908 
7909  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7910  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7911  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7912  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7913  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7914  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7915 
7916  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
7917  }
7918  //**********************************************************************************************
7919 
7920  //**Addition assignment to dense matrices (kernel selection)************************************
/*!\brief Selection of the kernel for the addition assignment to a dense matrix.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// selectSmallAddAssignKernel() is called when both operands are diagonal or when the number of
// elements of \a C (rows times columns) is below TDMATDMATMULT_THRESHOLD. In all other cases
// selectBlasAddAssignKernel() is called.
*/
7931  template< typename MT3 // Type of the left-hand side target matrix
7932  , typename MT4 // Type of the left-hand side matrix operand
7933  , typename MT5 // Type of the right-hand side matrix operand
7934  , typename ST2 > // Type of the scalar value
7935  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7936  {
7937  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
7938  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7939  selectSmallAddAssignKernel( C, A, B, scalar );
7940  else
7941  selectBlasAddAssignKernel( C, A, B, scalar );
7942  }
7943  //**********************************************************************************************
7944 
7945  //**Default addition assignment to dense matrices (general/general)*****************************
/*!\brief Default addition assignment kernel for two non-diagonal operands.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// Enabled only when neither \a MT4 nor \a MT5 is diagonal. The scaled product
// \c A*B*scalar is evaluated serially into a temporary of type \c ResultType, which is then
// added to \a C via addAssign().
*/
7959  template< typename MT3 // Type of the left-hand side target matrix
7960  , typename MT4 // Type of the left-hand side matrix operand
7961  , typename MT5 // Type of the right-hand side matrix operand
7962  , typename ST2 > // Type of the scalar value
7963  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
7964  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7965  {
7966  const ResultType tmp( serial( A * B * scalar ) );
7967  addAssign( C, tmp );
7968  }
7969  //**********************************************************************************************
7970 
7971  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
/*!\brief Default addition assignment kernel for a row-major target, general A and diagonal B.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side matrix operand (not diagonal).
// \param B The right-hand side matrix operand (diagonal).
// \param scalar The scaling factor.
//
// Since only the diagonal of B is used, every update has the form
// \c C(i,j) += A(i,j) * B(j,j) * scalar. The index space is traversed in square tiles of
// BLOCK_SIZE rows and columns. For each row the column range of the tile is narrowed according
// to the IsUpper/IsStrictlyUpper and IsLower/IsStrictlyLower flags of \a MT4.
*/
7985  template< typename MT3 // Type of the left-hand side target matrix
7986  , typename MT4 // Type of the left-hand side matrix operand
7987  , typename MT5 // Type of the right-hand side matrix operand
7988  , typename ST2 > // Type of the scalar value
7989  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7990  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7991  {
7992  const size_t M( A.rows() );
7993  const size_t N( B.columns() );
7994 
7995  const size_t block( BLOCK_SIZE );
7996 
7997  for( size_t ii=0UL; ii<M; ii+=block ) {
7998  const size_t iend( min( M, ii+block ) );
7999  for( size_t jj=0UL; jj<N; jj+=block ) {
8000  const size_t jend( min( N, jj+block ) );
8001  for( size_t i=ii; i<iend; ++i )
8002  {
      // First column of row i inside this tile: for an upper A it starts at the diagonal
      // (or one past it for a strictly upper A), but never before the tile start jj.
8003  const size_t jbegin( ( IsUpper<MT4>::value )
8004  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
8005  :( jj ) );
      // One past the last column of row i inside this tile: for a lower A it ends after the
      // diagonal (or at it for a strictly lower A), but never beyond the tile end jend.
8006  const size_t jpos( ( IsLower<MT4>::value )
8007  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
8008  :( jend ) );
8009 
8010  for( size_t j=jbegin; j<jpos; ++j ) {
8011  (~C)(i,j) += A(i,j) * B(j,j) * scalar;
8012  }
8013  }
8014  }
8015  }
8016  }
8017  //**********************************************************************************************
8018 
8019  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
/*!\brief Default addition assignment kernel for a column-major target, general A and diagonal B.
//
// \param C The target left-hand side dense matrix (column-major).
// \param A The left-hand side matrix operand (not diagonal).
// \param B The right-hand side matrix operand (diagonal).
// \param scalar The scaling factor.
//
// Since only the diagonal of B is used, every update has the form
// \c C(i,j) += A(i,j) * B(j,j) * scalar. The kernel runs column by column; for each column the
// row range is narrowed according to the IsLower/IsStrictlyLower and IsUpper/IsStrictlyUpper
// flags of \a MT4, and the rows are processed two at a time with a single trailing row if the
// range has odd length.
*/
8033  template< typename MT3 // Type of the left-hand side target matrix
8034  , typename MT4 // Type of the left-hand side matrix operand
8035  , typename MT5 // Type of the right-hand side matrix operand
8036  , typename ST2 > // Type of the scalar value
8037  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
8038  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8039  {
8040  const size_t M( A.rows() );
8041  const size_t N( B.columns() );
8042 
8043  for( size_t j=0UL; j<N; ++j )
8044  {
8045  const size_t ibegin( ( IsLower<MT4>::value )
8046  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
8047  :( 0UL ) );
8048  const size_t iend( ( IsUpper<MT4>::value )
8049  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
8050  :( M ) );
8051  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
8052 
      // ipos is ibegin plus the row count rounded down to an even number (lowest bit cleared),
      // i.e. the end of the part that can be processed in pairs.
8053  const size_t inum( iend - ibegin );
8054  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
8055 
8056  for( size_t i=ibegin; i<ipos; i+=2UL ) {
8057  (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
8058  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
8059  }
      // Single leftover row when the row count is odd.
8060  if( ipos < iend ) {
8061  (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
8062  }
8063  }
8064  }
8065  //**********************************************************************************************
8066 
8067  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
8081  template< typename MT3 // Type of the left-hand side target matrix
8082  , typename MT4 // Type of the left-hand side matrix operand
8083  , typename MT5 // Type of the right-hand side matrix operand
8084  , typename ST2 > // Type of the scalar value
8085  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
8086  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8087  {
8088  const size_t M( A.rows() );
8089  const size_t N( B.columns() );
8090 
8091  for( size_t i=0UL; i<M; ++i )
8092  {
8093  const size_t jbegin( ( IsUpper<MT5>::value )
8094  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
8095  :( 0UL ) );
8096  const size_t jend( ( IsLower<MT5>::value )
8097  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
8098  :( N ) );
8099  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
8100 
8101  const size_t jnum( jend - jbegin );
8102  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
8103 
8104  for( size_t j=jbegin; j<jpos; j+=2UL ) {
8105  (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
8106  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
8107  }
8108  if( jpos < jend ) {
8109  (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
8110  }
8111  }
8112  }
8113  //**********************************************************************************************
8114 
8115  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
8129  template< typename MT3 // Type of the left-hand side target matrix
8130  , typename MT4 // Type of the left-hand side matrix operand
8131  , typename MT5 // Type of the right-hand side matrix operand
8132  , typename ST2 > // Type of the scalar value
8133  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
8134  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8135  {
8136  const size_t M( A.rows() );
8137  const size_t N( B.columns() );
8138 
8139  const size_t block( BLOCK_SIZE );
8140 
8141  for( size_t jj=0UL; jj<N; jj+=block ) {
8142  const size_t jend( min( N, jj+block ) );
8143  for( size_t ii=0UL; ii<M; ii+=block ) {
8144  const size_t iend( min( M, ii+block ) );
8145  for( size_t j=jj; j<jend; ++j )
8146  {
8147  const size_t ibegin( ( IsLower<MT5>::value )
8148  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
8149  :( ii ) );
8150  const size_t ipos( ( IsUpper<MT5>::value )
8151  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
8152  :( iend ) );
8153 
8154  for( size_t i=ibegin; i<ipos; ++i ) {
8155  (~C)(i,j) += A(i,i) * B(i,j) * scalar;
8156  }
8157  }
8158  }
8159  }
8160  }
8161  //**********************************************************************************************
8162 
8163  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
8177  template< typename MT3 // Type of the left-hand side target matrix
8178  , typename MT4 // Type of the left-hand side matrix operand
8179  , typename MT5 // Type of the right-hand side matrix operand
8180  , typename ST2 > // Type of the scalar value
8181  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
8182  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8183  {
8184  for( size_t i=0UL; i<A.rows(); ++i ) {
8185  C(i,i) += A(i,i) * B(i,i) * scalar;
8186  }
8187  }
8188  //**********************************************************************************************
8189 
8190  //**Default addition assignment to dense matrices (small matrices)******************************
8204  template< typename MT3 // Type of the left-hand side target matrix
8205  , typename MT4 // Type of the left-hand side matrix operand
8206  , typename MT5 // Type of the right-hand side matrix operand
8207  , typename ST2 > // Type of the scalar value
8208  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8209  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8210  {
8211  selectDefaultAddAssignKernel( C, A, B, scalar );
8212  }
8213  //**********************************************************************************************
8214 
8215  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
8230  template< typename MT3 // Type of the left-hand side target matrix
8231  , typename MT4 // Type of the left-hand side matrix operand
8232  , typename MT5 // Type of the right-hand side matrix operand
8233  , typename ST2 > // Type of the scalar value
8234  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8235  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8236  {
      // Vectorized scaled addition assignment for a row-major target: every element C(i,j) is
      // increased by scalar * sum_k A(i,k) * B(k,j). The columns of C are processed in panels of
      // 8, 4, 2 and 1 SIMD vectors, followed by an element-wise loop for the columns that are
      // not covered by the SIMD panels.
8237  const size_t M( A.rows() );
8238  const size_t N( B.columns() );
8239  const size_t K( A.columns() );
8240 
      // An element-wise clean-up loop is required unless both C and B are padded.
8241  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
8242 
      // End of the column range handled by the SIMD panels: N rounded down to a multiple of
      // SIMDSIZE if a clean-up loop is required (checked by the assertion), N itself otherwise.
8243  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
8244  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
8245 
      // SIMD representation of the scaling factor, applied once per store below.
8246  const SIMDType factor( set( scalar ) );
8247 
8248  size_t j( 0UL );
8249 
      // Panels of 8 SIMD vectors; one row of C per inner iteration.
8250  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
8251  for( size_t i=0UL; i<M; ++i )
8252  {
      // The k-range is narrowed at compile time via the IsUpper/IsLower/IsStrictly* traits of
      // A (row i) and B (columns of the current panel). The same scheme is used in all panels.
8253  const size_t kbegin( ( IsUpper<MT4>::value )
8254  ?( ( IsLower<MT5>::value )
8255  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8256  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8257  :( IsLower<MT5>::value ? j : 0UL ) );
8258  const size_t kend( ( IsLower<MT4>::value )
8259  ?( ( IsUpper<MT5>::value )
8260  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
8261  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
8262  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
8263 
      // NOTE(review): the accumulators are only default-constructed before being summed into;
      // this presumably relies on SIMDType's default constructor zero-initializing - confirm.
8264  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8265 
8266  for( size_t k=kbegin; k<kend; ++k ) {
8267  const SIMDType a1( set( A(i,k) ) );
8268  xmm1 = xmm1 + a1 * B.load(k,j );
8269  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
8270  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
8271  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
8272  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
8273  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
8274  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
8275  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
8276  }
8277 
      // Read-modify-write of C: current content plus the scaled accumulated product.
8278  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8279  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8280  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
8281  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
8282  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
8283  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
8284  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
8285  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
8286  }
8287  }
8288 
      // Panels of 4 SIMD vectors; two rows of C per inner iteration (xmm1-4: row i, xmm5-8: row i+1).
8289  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
8290  {
8291  size_t i( 0UL );
8292 
8293  for( ; (i+2UL) <= M; i+=2UL )
8294  {
8295  const size_t kbegin( ( IsUpper<MT4>::value )
8296  ?( ( IsLower<MT5>::value )
8297  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8298  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8299  :( IsLower<MT5>::value ? j : 0UL ) );
8300  const size_t kend( ( IsLower<MT4>::value )
8301  ?( ( IsUpper<MT5>::value )
8302  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
8303  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8304  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
8305 
8306  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8307 
8308  for( size_t k=kbegin; k<kend; ++k ) {
8309  const SIMDType a1( set( A(i ,k) ) );
8310  const SIMDType a2( set( A(i+1UL,k) ) );
8311  const SIMDType b1( B.load(k,j ) );
8312  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8313  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8314  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8315  xmm1 = xmm1 + a1 * b1;
8316  xmm2 = xmm2 + a1 * b2;
8317  xmm3 = xmm3 + a1 * b3;
8318  xmm4 = xmm4 + a1 * b4;
8319  xmm5 = xmm5 + a2 * b1;
8320  xmm6 = xmm6 + a2 * b2;
8321  xmm7 = xmm7 + a2 * b3;
8322  xmm8 = xmm8 + a2 * b4;
8323  }
8324 
8325  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8326  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
8327  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
8328  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
8329  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8330  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
8331  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
8332  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
8333  }
8334 
      // Last row of the panel when M is odd.
8335  if( i < M )
8336  {
8337  const size_t kbegin( ( IsUpper<MT4>::value )
8338  ?( ( IsLower<MT5>::value )
8339  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8340  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8341  :( IsLower<MT5>::value ? j : 0UL ) );
8342  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
8343 
8344  SIMDType xmm1, xmm2, xmm3, xmm4;
8345 
8346  for( size_t k=kbegin; k<kend; ++k ) {
8347  const SIMDType a1( set( A(i,k) ) );
8348  xmm1 = xmm1 + a1 * B.load(k,j );
8349  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
8350  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
8351  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
8352  }
8353 
8354  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8355  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8356  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
8357  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
8358  }
8359  }
8360 
      // Panels of 2 SIMD vectors; two rows of C per inner iteration.
8361  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8362  {
8363  size_t i( 0UL );
8364 
8365  for( ; (i+2UL) <= M; i+=2UL )
8366  {
8367  const size_t kbegin( ( IsUpper<MT4>::value )
8368  ?( ( IsLower<MT5>::value )
8369  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8370  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8371  :( IsLower<MT5>::value ? j : 0UL ) );
8372  const size_t kend( ( IsLower<MT4>::value )
8373  ?( ( IsUpper<MT5>::value )
8374  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8375  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8376  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
8377 
8378  SIMDType xmm1, xmm2, xmm3, xmm4;
8379 
8380  for( size_t k=kbegin; k<kend; ++k ) {
8381  const SIMDType a1( set( A(i ,k) ) );
8382  const SIMDType a2( set( A(i+1UL,k) ) );
8383  const SIMDType b1( B.load(k,j ) );
8384  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8385  xmm1 = xmm1 + a1 * b1;
8386  xmm2 = xmm2 + a1 * b2;
8387  xmm3 = xmm3 + a2 * b1;
8388  xmm4 = xmm4 + a2 * b2;
8389  }
8390 
8391  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8392  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
8393  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8394  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
8395  }
8396 
      // Last row of the panel when M is odd.
8397  if( i < M )
8398  {
8399  const size_t kbegin( ( IsUpper<MT4>::value )
8400  ?( ( IsLower<MT5>::value )
8401  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8402  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8403  :( IsLower<MT5>::value ? j : 0UL ) );
8404  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8405 
8406  SIMDType xmm1, xmm2;
8407 
8408  for( size_t k=kbegin; k<kend; ++k ) {
8409  const SIMDType a1( set( A(i,k) ) );
8410  xmm1 = xmm1 + a1 * B.load(k,j );
8411  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
8412  }
8413 
8414  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8415  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + xmm2 * factor );
8416  }
8417  }
8418 
      // Single SIMD vector per step; two rows of C per inner iteration.
8419  for( ; j<jpos; j+=SIMDSIZE )
8420  {
8421  size_t i( 0UL );
8422 
8423  for( ; (i+2UL) <= M; i+=2UL )
8424  {
8425  const size_t kbegin( ( IsUpper<MT4>::value )
8426  ?( ( IsLower<MT5>::value )
8427  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8428  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8429  :( IsLower<MT5>::value ? j : 0UL ) );
8430  const size_t kend( ( IsLower<MT4>::value )
8431  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8432  :( K ) );
8433 
8434  SIMDType xmm1, xmm2;
8435 
8436  for( size_t k=kbegin; k<kend; ++k ) {
8437  const SIMDType b1( B.load(k,j) );
8438  xmm1 = xmm1 + set( A(i ,k) ) * b1;
8439  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
8440  }
8441 
8442  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8443  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
8444  }
8445 
      // Last row when M is odd; the k-loop runs up to K here.
8446  if( i < M )
8447  {
8448  const size_t kbegin( ( IsUpper<MT4>::value )
8449  ?( ( IsLower<MT5>::value )
8450  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8451  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8452  :( IsLower<MT5>::value ? j : 0UL ) );
8453 
8454  SIMDType xmm1;
8455 
8456  for( size_t k=kbegin; k<K; ++k ) {
8457  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
8458  }
8459 
8460  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
8461  }
8462  }
8463 
      // Element-wise clean-up for the columns [jpos,N); only entered if 'remainder' is set.
8464  for( ; remainder && j<N; ++j )
8465  {
8466  size_t i( 0UL );
8467 
8468  for( ; (i+2UL) <= M; i+=2UL )
8469  {
8470  const size_t kbegin( ( IsUpper<MT4>::value )
8471  ?( ( IsLower<MT5>::value )
8472  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8473  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8474  :( IsLower<MT5>::value ? j : 0UL ) );
8475  const size_t kend( ( IsLower<MT4>::value )
8476  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8477  :( K ) );
8478 
      // Scalar accumulators, explicitly value-initialized.
8479  ElementType value1 = ElementType();
8480  ElementType value2 = ElementType();
8481 
8482  for( size_t k=kbegin; k<kend; ++k ) {
8483  value1 += A(i ,k) * B(k,j);
8484  value2 += A(i+1UL,k) * B(k,j);
8485  }
8486 
8487  (~C)(i ,j) += value1 * scalar;
8488  (~C)(i+1UL,j) += value2 * scalar;
8489  }
8490 
      // Last row when M is odd.
8491  if( i < M )
8492  {
8493  const size_t kbegin( ( IsUpper<MT4>::value )
8494  ?( ( IsLower<MT5>::value )
8495  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8496  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8497  :( IsLower<MT5>::value ? j : 0UL ) );
8498 
8499  ElementType value = ElementType();
8500 
8501  for( size_t k=kbegin; k<K; ++k ) {
8502  value += A(i,k) * B(k,j);
8503  }
8504 
8505  (~C)(i,j) += value * scalar;
8506  }
8507  }
8508  }
8509  //**********************************************************************************************
8510 
8511  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
8526  template< typename MT3 // Type of the left-hand side target matrix
8527  , typename MT4 // Type of the left-hand side matrix operand
8528  , typename MT5 // Type of the right-hand side matrix operand
8529  , typename ST2 > // Type of the scalar value
8530  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8531  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8532  {
      // Vectorized scaled addition assignment for a column-major target: every element C(i,j) is
      // increased by scalar * sum_k A(i,k) * B(k,j). The rows of C are processed in panels of
      // 8, 4, 2 and 1 SIMD vectors (loaded from A), followed by an element-wise loop for the rows
      // that are not covered by the SIMD panels.
8533  const size_t M( A.rows() );
8534  const size_t N( B.columns() );
8535  const size_t K( A.columns() );
8536 
      // An element-wise clean-up loop is required unless both C and A are padded.
8537  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
8538 
      // End of the row range handled by the SIMD panels: M rounded down to a multiple of
      // SIMDSIZE if a clean-up loop is required (checked by the assertion), M itself otherwise.
8539  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
8540  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
8541 
      // SIMD representation of the scaling factor, applied once per store below.
8542  const SIMDType factor( set( scalar ) );
8543 
8544  size_t i( 0UL );
8545 
      // Panels of 8 SIMD vectors; one column of C per inner iteration.
8546  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8547  for( size_t j=0UL; j<N; ++j )
8548  {
      // The k-range is narrowed at compile time via the IsUpper/IsLower/IsStrictly* traits of
      // B (column j) and A (rows of the current panel). The same scheme is used in all panels.
8549  const size_t kbegin( ( IsLower<MT5>::value )
8550  ?( ( IsUpper<MT4>::value )
8551  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8552  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8553  :( IsUpper<MT4>::value ? i : 0UL ) );
8554  const size_t kend( ( IsUpper<MT5>::value )
8555  ?( ( IsLower<MT4>::value )
8556  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
8557  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
8558  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
8559 
      // NOTE(review): the accumulators are only default-constructed before being summed into;
      // this presumably relies on SIMDType's default constructor zero-initializing - confirm.
8560  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8561 
8562  for( size_t k=kbegin; k<kend; ++k ) {
8563  const SIMDType b1( set( B(k,j) ) );
8564  xmm1 = xmm1 + A.load(i ,k) * b1;
8565  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
8566  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
8567  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
8568  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
8569  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
8570  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
8571  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
8572  }
8573 
      // Read-modify-write of C: current content plus the scaled accumulated product.
8574  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8575  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8576  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8577  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
8578  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
8579  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
8580  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
8581  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
8582  }
8583  }
8584 
      // Panels of 4 SIMD vectors; two columns of C per inner iteration (xmm1-4: column j, xmm5-8: column j+1).
8585  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
8586  {
8587  size_t j( 0UL );
8588 
8589  for( ; (j+2UL) <= N; j+=2UL )
8590  {
8591  const size_t kbegin( ( IsLower<MT5>::value )
8592  ?( ( IsUpper<MT4>::value )
8593  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8594  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8595  :( IsUpper<MT4>::value ? i : 0UL ) );
8596  const size_t kend( ( IsUpper<MT5>::value )
8597  ?( ( IsLower<MT4>::value )
8598  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8599  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8600  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
8601 
8602  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8603 
8604  for( size_t k=kbegin; k<kend; ++k ) {
8605  const SIMDType a1( A.load(i ,k) );
8606  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8607  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8608  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8609  const SIMDType b1( set( B(k,j ) ) );
8610  const SIMDType b2( set( B(k,j+1UL) ) );
8611  xmm1 = xmm1 + a1 * b1;
8612  xmm2 = xmm2 + a2 * b1;
8613  xmm3 = xmm3 + a3 * b1;
8614  xmm4 = xmm4 + a4 * b1;
8615  xmm5 = xmm5 + a1 * b2;
8616  xmm6 = xmm6 + a2 * b2;
8617  xmm7 = xmm7 + a3 * b2;
8618  xmm8 = xmm8 + a4 * b2;
8619  }
8620 
8621  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8622  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
8623  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
8624  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
8625  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
8626  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
8627  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
8628  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
8629  }
8630 
      // Last column of the panel when N is odd.
8631  if( j < N )
8632  {
8633  const size_t kbegin( ( IsLower<MT5>::value )
8634  ?( ( IsUpper<MT4>::value )
8635  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8636  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8637  :( IsUpper<MT4>::value ? i : 0UL ) );
8638  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
8639 
8640  SIMDType xmm1, xmm2, xmm3, xmm4;
8641 
8642  for( size_t k=kbegin; k<kend; ++k ) {
8643  const SIMDType b1( set( B(k,j) ) );
8644  xmm1 = xmm1 + A.load(i ,k) * b1;
8645  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
8646  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
8647  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
8648  }
8649 
8650  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8651  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8652  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8653  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
8654  }
8655  }
8656 
      // Panels of 2 SIMD vectors; two columns of C per inner iteration.
8657  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
8658  {
8659  size_t j( 0UL );
8660 
8661  for( ; (j+2UL) <= N; j+=2UL )
8662  {
8663  const size_t kbegin( ( IsLower<MT5>::value )
8664  ?( ( IsUpper<MT4>::value )
8665  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8666  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8667  :( IsUpper<MT4>::value ? i : 0UL ) );
8668  const size_t kend( ( IsUpper<MT5>::value )
8669  ?( ( IsLower<MT4>::value )
8670  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8671  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8672  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8673 
8674  SIMDType xmm1, xmm2, xmm3, xmm4;
8675 
8676  for( size_t k=kbegin; k<kend; ++k ) {
8677  const SIMDType a1( A.load(i ,k) );
8678  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8679  const SIMDType b1( set( B(k,j ) ) );
8680  const SIMDType b2( set( B(k,j+1UL) ) );
8681  xmm1 = xmm1 + a1 * b1;
8682  xmm2 = xmm2 + a2 * b1;
8683  xmm3 = xmm3 + a1 * b2;
8684  xmm4 = xmm4 + a2 * b2;
8685  }
8686 
8687  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8688  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
8689  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
8690  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
8691  }
8692 
      // Last column of the panel when N is odd.
8693  if( j < N )
8694  {
8695  const size_t kbegin( ( IsLower<MT5>::value )
8696  ?( ( IsUpper<MT4>::value )
8697  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8698  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8699  :( IsUpper<MT4>::value ? i : 0UL ) );
8700  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8701 
8702  SIMDType xmm1, xmm2;
8703 
8704  for( size_t k=kbegin; k<kend; ++k ) {
8705  const SIMDType b1( set( B(k,j) ) );
8706  xmm1 = xmm1 + A.load(i ,k) * b1;
8707  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
8708  }
8709 
8710  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8711  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + xmm2 * factor );
8712  }
8713  }
8714 
      // Single SIMD vector per step; two columns of C per inner iteration.
8715  for( ; i<ipos; i+=SIMDSIZE )
8716  {
8717  size_t j( 0UL );
8718 
8719  for( ; (j+2UL) <= N; j+=2UL )
8720  {
8721  const size_t kbegin( ( IsLower<MT5>::value )
8722  ?( ( IsUpper<MT4>::value )
8723  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8724  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8725  :( IsUpper<MT4>::value ? i : 0UL ) );
8726  const size_t kend( ( IsUpper<MT5>::value )
8727  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8728  :( K ) );
8729 
8730  SIMDType xmm1, xmm2;
8731 
8732  for( size_t k=kbegin; k<kend; ++k ) {
8733  const SIMDType a1( A.load(i,k) );
8734  xmm1 = xmm1 + a1 * set( B(k,j ) );
8735  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
8736  }
8737 
8738  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8739  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
8740  }
8741 
      // Last column when N is odd; the k-loop runs up to K here.
8742  if( j < N )
8743  {
8744  const size_t kbegin( ( IsLower<MT5>::value )
8745  ?( ( IsUpper<MT4>::value )
8746  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8747  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8748  :( IsUpper<MT4>::value ? i : 0UL ) );
8749 
8750  SIMDType xmm1;
8751 
8752  for( size_t k=kbegin; k<K; ++k ) {
8753  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
8754  }
8755 
8756  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
8757  }
8758  }
8759 
      // Element-wise clean-up for the rows [ipos,M); only entered if 'remainder' is set.
8760  for( ; remainder && i<M; ++i )
8761  {
8762  size_t j( 0UL );
8763 
8764  for( ; (j+2UL) <= N; j+=2UL )
8765  {
8766  const size_t kbegin( ( IsLower<MT5>::value )
8767  ?( ( IsUpper<MT4>::value )
8768  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8769  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8770  :( IsUpper<MT4>::value ? i : 0UL ) );
8771  const size_t kend( ( IsUpper<MT5>::value )
8772  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8773  :( K ) );
8774 
      // Scalar accumulators, explicitly value-initialized.
8775  ElementType value1 = ElementType();
8776  ElementType value2 = ElementType();
8777 
8778  for( size_t k=kbegin; k<kend; ++k ) {
8779  value1 += A(i,k) * B(k,j );
8780  value2 += A(i,k) * B(k,j+1UL);
8781  }
8782 
8783  (~C)(i,j ) += value1 * scalar;
8784  (~C)(i,j+1UL) += value2 * scalar;
8785  }
8786 
      // Last column when N is odd.
8787  if( j < N )
8788  {
8789  const size_t kbegin( ( IsLower<MT5>::value )
8790  ?( ( IsUpper<MT4>::value )
8791  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8792  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8793  :( IsUpper<MT4>::value ? i : 0UL ) );
8794 
8795  ElementType value = ElementType();
8796 
8797  for( size_t k=kbegin; k<K; ++k ) {
8798  value += A(i,k) * B(k,j);
8799  }
8800 
8801  (~C)(i,j) += value * scalar;
8802  }
8803  }
8804  }
8805  //**********************************************************************************************
8806 
8807  //**Default addition assignment to dense matrices (large matrices)******************************
8821  template< typename MT3 // Type of the left-hand side target matrix
8822  , typename MT4 // Type of the left-hand side matrix operand
8823  , typename MT5 // Type of the right-hand side matrix operand
8824  , typename ST2 > // Type of the scalar value
8825  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8826  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8827  {
8828  selectDefaultAddAssignKernel( C, A, B, scalar );
8829  }
8830  //**********************************************************************************************
8831 
8832  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
8847  template< typename MT3 // Type of the left-hand side target matrix
8848  , typename MT4 // Type of the left-hand side matrix operand
8849  , typename MT5 // Type of the right-hand side matrix operand
8850  , typename ST2 > // Type of the scalar value
8851  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8852  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8853  {
8854  const size_t M( A.rows() );
8855  const size_t N( B.columns() );
8856  const size_t K( A.columns() );
8857 
8858  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
8859 
8860  const SIMDType factor( set( scalar ) );
8861 
8862  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
8863  {
8864  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
8865 
8866  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
8867  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
8868 
8869  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
8870  {
8871  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
8872 
8873  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
8874  {
8875  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
8876 
8877  size_t j( jj );
8878 
8879  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
8880  {
8881  const size_t j1( j+SIMDSIZE );
8882  const size_t j2( j+SIMDSIZE*2UL );
8883  const size_t j3( j+SIMDSIZE*3UL );
8884 
8885  size_t i( ii );
8886 
8887  for( ; (i+2UL) <= iend; i+=2UL )
8888  {
8889  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8890  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8891  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8892  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
8893 
8894  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8895 
8896  for( size_t k=kbegin; k<kend; ++k ) {
8897  const SIMDType a1( set( A(i ,k) ) );
8898  const SIMDType a2( set( A(i+1UL,k) ) );
8899  const SIMDType b1( B.load(k,j ) );
8900  const SIMDType b2( B.load(k,j1) );
8901  const SIMDType b3( B.load(k,j2) );
8902  const SIMDType b4( B.load(k,j3) );
8903  xmm1 = xmm1 + a1 * b1;
8904  xmm2 = xmm2 + a1 * b2;
8905  xmm3 = xmm3 + a1 * b3;
8906  xmm4 = xmm4 + a1 * b4;
8907  xmm5 = xmm5 + a2 * b1;
8908  xmm6 = xmm6 + a2 * b2;
8909  xmm7 = xmm7 + a2 * b3;
8910  xmm8 = xmm8 + a2 * b4;
8911  }
8912 
8913  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8914  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8915  (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
8916  (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
8917  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8918  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
8919  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
8920  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
8921  }
8922 
8923  if( i < iend )
8924  {
8925  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8926  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8927  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
8928  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
8929 
8930  SIMDType xmm1, xmm2, xmm3, xmm4;
8931 
8932  for( size_t k=kbegin; k<kend; ++k ) {
8933  const SIMDType a1( set( A(i,k) ) );
8934  xmm1 = xmm1 + a1 * B.load(k,j );
8935  xmm2 = xmm2 + a1 * B.load(k,j1);
8936  xmm3 = xmm3 + a1 * B.load(k,j2);
8937  xmm4 = xmm4 + a1 * B.load(k,j3);
8938  }
8939 
8940  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8941  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
8942  (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
8943  (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
8944  }
8945  }
8946 
8947  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8948  {
8949  const size_t j1( j+SIMDSIZE );
8950 
8951  size_t i( ii );
8952 
8953  for( ; (i+4UL) <= iend; i+=4UL )
8954  {
8955  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8956  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8957  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
8958  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
8959 
8960  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8961 
8962  for( size_t k=kbegin; k<kend; ++k ) {
8963  const SIMDType a1( set( A(i ,k) ) );
8964  const SIMDType a2( set( A(i+1UL,k) ) );
8965  const SIMDType a3( set( A(i+2UL,k) ) );
8966  const SIMDType a4( set( A(i+3UL,k) ) );
8967  const SIMDType b1( B.load(k,j ) );
8968  const SIMDType b2( B.load(k,j1) );
8969  xmm1 = xmm1 + a1 * b1;
8970  xmm2 = xmm2 + a1 * b2;
8971  xmm3 = xmm3 + a2 * b1;
8972  xmm4 = xmm4 + a2 * b2;
8973  xmm5 = xmm5 + a3 * b1;
8974  xmm6 = xmm6 + a3 * b2;
8975  xmm7 = xmm7 + a4 * b1;
8976  xmm8 = xmm8 + a4 * b2;
8977  }
8978 
8979  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8980  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8981  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8982  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
8983  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
8984  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
8985  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
8986  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
8987  }
8988 
8989  for( ; (i+2UL) <= iend; i+=2UL )
8990  {
8991  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8992  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8993  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8994  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
8995 
8996  SIMDType xmm1, xmm2, xmm3, xmm4;
8997 
8998  for( size_t k=kbegin; k<kend; ++k ) {
8999  const SIMDType a1( set( A(i ,k) ) );
9000  const SIMDType a2( set( A(i+1UL,k) ) );
9001  const SIMDType b1( B.load(k,j ) );
9002  const SIMDType b2( B.load(k,j1) );
9003  xmm1 = xmm1 + a1 * b1;
9004  xmm2 = xmm2 + a1 * b2;
9005  xmm3 = xmm3 + a2 * b1;
9006  xmm4 = xmm4 + a2 * b2;
9007  }
9008 
9009  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9010  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
9011  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9012  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
9013  }
9014 
9015  if( i < iend )
9016  {
9017  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9018  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9019  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9020  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
9021 
9022  SIMDType xmm1, xmm2;
9023 
9024  for( size_t k=kbegin; k<kend; ++k ) {
9025  const SIMDType a1( set( A(i,k) ) );
9026  xmm1 = xmm1 + a1 * B.load(k,j );
9027  xmm2 = xmm2 + a1 * B.load(k,j1);
9028  }
9029 
9030  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9031  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
9032  }
9033  }
9034 
9035  for( ; j<jpos; j+=SIMDSIZE )
9036  {
9037  for( size_t i=ii; i<iend; ++i )
9038  {
9039  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9040  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9041  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9042  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
9043 
9044  SIMDType xmm1;
9045 
9046  for( size_t k=kbegin; k<kend; ++k ) {
9047  const SIMDType a1( set( A(i,k) ) );
9048  xmm1 = xmm1 + a1 * B.load(k,j);
9049  }
9050 
9051  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
9052  }
9053  }
9054 
9055  for( ; remainder && j<jend; ++j )
9056  {
9057  for( size_t i=ii; i<iend; ++i )
9058  {
9059  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9060  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9061  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9062  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
9063 
9064  ElementType value = ElementType();
9065 
9066  for( size_t k=kbegin; k<kend; ++k ) {
9067  value += A(i,k) * B(k,j);
9068  }
9069 
9070  (~C)(i,j) += value * scalar;
9071  }
9072  }
9073  }
9074  }
9075  }
9076  }
9077  //**********************************************************************************************
9078 
9079  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
/*!\brief Vectorized, cache-blocked addition assignment C += scalar * ( A * B ) for a
//        column-major target matrix (large matrices).
//
// \param C The target left-hand side dense matrix (column-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor applied to the product before it is added to C.
// \return void
//
// The overload participates only if UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. The
// iteration space is tiled by TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE (rows of C),
// TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE (columns of C) and TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE
// (inner dimension). Inside a tile the rows are processed in panels of 4, 2 and 1 SIMD
// vectors, followed by a scalar clean-up loop for rows that do not fill a whole vector.
// Each panel accumulates the partial products of the current K-block in SIMD variables and
// then adds the accumulators, multiplied by the scalar, to the corresponding part of C.
//
// NOTE(review): the accumulators are declared without an initializer and immediately used as
// "xmm = xmm + ..."; this presumably relies on SIMDType default construction yielding zero -
// confirm against the SIMD type definition.
*/
9094  template< typename MT3 // Type of the left-hand side target matrix
9095  , typename MT4 // Type of the left-hand side matrix operand
9096  , typename MT5 // Type of the right-hand side matrix operand
9097  , typename ST2 > // Type of the scalar value
9098  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
9099  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9100  {
9101  const size_t M( A.rows() );
9102  const size_t N( B.columns() );
9103  const size_t K( A.columns() );
9104 
      // A scalar clean-up pass over trailing rows is required unless both C and A are padded
      // (both are accessed via SIMD load/store along the row index i).
9105  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
9106 
      // Scaling factor in SIMD form; set() presumably broadcasts the value to all lanes - confirm.
9107  const SIMDType factor( set( scalar ) );
9108 
      // Tiling over the rows of C.
9109  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
9110  {
9111  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
9112 
      // End of the vectorized row range: iend rounded down to a multiple of SIMDSIZE when a
      // remainder pass is needed (see the assertion), otherwise iend itself.
9113  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
9114  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
9115 
      // Tiling over the columns of C.
9116  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
9117  {
9118  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
9119 
      // Tiling over the inner dimension; [kk,ktmp) is the current K-block.
9120  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
9121  {
9122  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
9123 
9124  size_t i( ii );
9125 
      // Row panels of four SIMD vectors (rows i .. i+4*SIMDSIZE-1).
9126  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
9127  {
9128  const size_t i1( i+SIMDSIZE );
9129  const size_t i2( i+SIMDSIZE*2UL );
9130  const size_t i3( i+SIMDSIZE*3UL );
9131 
9132  size_t j( jj );
9133 
      // Two columns of C at a time: 4x2 accumulators.
9134  for( ; (j+2UL) <= jend; j+=2UL )
9135  {
      // k-range of the current K-block, narrowed by the structure of the operands:
      // an upper A starts no earlier than i, a lower B no earlier than j; a lower A ends
      // at the end of the row panel, an upper B after the last column handled here.
9136  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9137  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9138  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
9139  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
9140 
9141  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9142 
      // xmm1..xmm4 collect column j, xmm5..xmm8 collect column j+1.
9143  for( size_t k=kbegin; k<kend; ++k ) {
9144  const SIMDType a1( A.load(i ,k) );
9145  const SIMDType a2( A.load(i1,k) );
9146  const SIMDType a3( A.load(i2,k) );
9147  const SIMDType a4( A.load(i3,k) );
9148  const SIMDType b1( set( B(k,j ) ) );
9149  const SIMDType b2( set( B(k,j+1UL) ) );
9150  xmm1 = xmm1 + a1 * b1;
9151  xmm2 = xmm2 + a2 * b1;
9152  xmm3 = xmm3 + a3 * b1;
9153  xmm4 = xmm4 + a4 * b1;
9154  xmm5 = xmm5 + a1 * b2;
9155  xmm6 = xmm6 + a2 * b2;
9156  xmm7 = xmm7 + a3 * b2;
9157  xmm8 = xmm8 + a4 * b2;
9158  }
9159 
      // C += accumulator * scalar for the 4x2 group.
9160  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9161  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9162  (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
9163  (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
9164  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
9165  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
9166  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
9167  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
9168  }
9169 
      // Single left-over column of the tile: 4x1 accumulators.
9170  if( j < jend )
9171  {
9172  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9173  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9174  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
9175  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9176 
9177  SIMDType xmm1, xmm2, xmm3, xmm4;
9178 
9179  for( size_t k=kbegin; k<kend; ++k ) {
9180  const SIMDType b1( set( B(k,j) ) );
9181  xmm1 = xmm1 + A.load(i ,k) * b1;
9182  xmm2 = xmm2 + A.load(i1,k) * b1;
9183  xmm3 = xmm3 + A.load(i2,k) * b1;
9184  xmm4 = xmm4 + A.load(i3,k) * b1;
9185  }
9186 
9187  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
9188  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
9189  (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
9190  (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
9191  }
9192  }
9193 
      // Row panels of two SIMD vectors (rows i .. i+2*SIMDSIZE-1).
9194  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
9195  {
9196  const size_t i1( i+SIMDSIZE );
9197 
9198  size_t j( jj );
9199 
      // Four columns of C at a time: 2x4 accumulators.
9200  for( ; (j+4UL) <= jend; j+=4UL )
9201  {
9202  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9203  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9204  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
9205  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
9206 
9207  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9208 
      // Accumulator pairs (xmm1,xmm2), (xmm3,xmm4), ... belong to columns j, j+1, j+2, j+3.
9209  for( size_t k=kbegin; k<kend; ++k ) {
9210  const SIMDType a1( A.load(i ,k) );
9211  const SIMDType a2( A.load(i1,k) );
9212  const SIMDType b1( set( B(k,j ) ) );
9213  const SIMDType b2( set( B(k,j+1UL) ) );
9214  const SIMDType b3( set( B(k,j+2UL) ) );
9215  const SIMDType b4( set( B(k,j+3UL) ) );
9216  xmm1 = xmm1 + a1 * b1;
9217  xmm2 = xmm2 + a2 * b1;
9218  xmm3 = xmm3 + a1 * b2;
9219  xmm4 = xmm4 + a2 * b2;
9220  xmm5 = xmm5 + a1 * b3;
9221  xmm6 = xmm6 + a2 * b3;
9222  xmm7 = xmm7 + a1 * b4;
9223  xmm8 = xmm8 + a2 * b4;
9224  }
9225 
9226  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9227  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9228  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9229  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
9230  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
9231  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
9232  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
9233  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
9234  }
9235 
      // Two columns of C at a time: 2x2 accumulators.
9236  for( ; (j+2UL) <= jend; j+=2UL )
9237  {
9238  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9239  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9240  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
9241  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
9242 
9243  SIMDType xmm1, xmm2, xmm3, xmm4;
9244 
9245  for( size_t k=kbegin; k<kend; ++k ) {
9246  const SIMDType a1( A.load(i ,k) );
9247  const SIMDType a2( A.load(i1,k) );
9248  const SIMDType b1( set( B(k,j ) ) );
9249  const SIMDType b2( set( B(k,j+1UL) ) );
9250  xmm1 = xmm1 + a1 * b1;
9251  xmm2 = xmm2 + a2 * b1;
9252  xmm3 = xmm3 + a1 * b2;
9253  xmm4 = xmm4 + a2 * b2;
9254  }
9255 
9256  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9257  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9258  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9259  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
9260  }
9261 
      // Single left-over column of the tile: 2x1 accumulators.
9262  if( j < jend )
9263  {
9264  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9265  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9266  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
9267  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9268 
9269  SIMDType xmm1, xmm2;
9270 
9271  for( size_t k=kbegin; k<kend; ++k ) {
9272  const SIMDType b1( set( B(k,j) ) );
9273  xmm1 = xmm1 + A.load(i ,k) * b1;
9274  xmm2 = xmm2 + A.load(i1,k) * b1;
9275  }
9276 
9277  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
9278  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
9279  }
9280  }
9281 
      // Row panels of a single SIMD vector, one column of C at a time.
9282  for( ; i<ipos; i+=SIMDSIZE )
9283  {
9284  for( size_t j=jj; j<jend; ++j )
9285  {
9286  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9287  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9288  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
9289  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9290 
9291  SIMDType xmm1;
9292 
9293  for( size_t k=kbegin; k<kend; ++k ) {
9294  const SIMDType b1( set( B(k,j) ) );
9295  xmm1 = xmm1 + A.load(i,k) * b1;
9296  }
9297 
9298  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
9299  }
9300  }
9301 
      // Scalar clean-up for the rows in [ipos,iend); skipped entirely when no remainder
      // handling is required.
9302  for( ; remainder && i<iend; ++i )
9303  {
9304  for( size_t j=jj; j<jend; ++j )
9305  {
9306  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9307  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9308  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
9309  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9310 
9311  ElementType value = ElementType();
9312 
9313  for( size_t k=kbegin; k<kend; ++k ) {
9314  value += A(i,k) * B(k,j);
9315  }
9316 
9317  (~C)(i,j) += value * scalar;
9318  }
9319  }
9320  }
9321  }
9322  }
9323  }
9325 
9326  //**BLAS-based addition assignment to dense matrices (default)**********************************
9340  template< typename MT3 // Type of the left-hand side target matrix
9341  , typename MT4 // Type of the left-hand side matrix operand
9342  , typename MT5 // Type of the right-hand side matrix operand
9343  , typename ST2 > // Type of the scalar value
9344  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
9345  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9346  {
9347  selectLargeAddAssignKernel( C, A, B, scalar );
9348  }
9349  //**********************************************************************************************
9350 
9351  //**BLAS-based addition assignment to dense matrices********************************************
9352 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
9353 
9366  template< typename MT3 // Type of the left-hand side target matrix
9367  , typename MT4 // Type of the left-hand side matrix operand
9368  , typename MT5 // Type of the right-hand side matrix operand
9369  , typename ST2 > // Type of the scalar value
9370  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
9371  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9372  {
9373  typedef ElementType_<MT3> ET;
9374 
9375  if( IsTriangular<MT4>::value ) {
9376  ResultType_<MT3> tmp( serial( B ) );
9377  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9378  addAssign( C, tmp );
9379  }
9380  else if( IsTriangular<MT5>::value ) {
9381  ResultType_<MT3> tmp( serial( A ) );
9382  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9383  addAssign( C, tmp );
9384  }
9385  else {
9386  gemm( C, A, B, ET(scalar), ET(1) );
9387  }
9388  }
9389 #endif
9390  //**********************************************************************************************
9391 
9392  //**Addition assignment to sparse matrices******************************************************
9393  // No special implementation for the addition assignment to sparse matrices.
9394  //**********************************************************************************************
9395 
9396  //**Subtraction assignment to dense matrices****************************************************
9408  template< typename MT // Type of the target dense matrix
9409  , bool SO > // Storage order of the target dense matrix
9410  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9411  {
9413 
9414  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9415  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9416 
9417  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
9418  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
9419 
9420  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
9421  return;
9422  }
9423 
9424  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
9425  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
9426 
9427  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9428  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9429  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9430  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9431  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
9432  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
9433 
9434  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
9435  }
9436  //**********************************************************************************************
9437 
9438  //**Subtraction assignment to dense matrices (kernel selection)*********************************
9449  template< typename MT3 // Type of the left-hand side target matrix
9450  , typename MT4 // Type of the left-hand side matrix operand
9451  , typename MT5 // Type of the right-hand side matrix operand
9452  , typename ST2 > // Type of the scalar value
9453  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9454  {
9455  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
9456  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9457  selectSmallSubAssignKernel( C, A, B, scalar );
9458  else
9459  selectBlasSubAssignKernel( C, A, B, scalar );
9460  }
9461  //**********************************************************************************************
9462 
9463  //**Default subtraction assignment to dense matrices********************************************
9477  template< typename MT3 // Type of the left-hand side target matrix
9478  , typename MT4 // Type of the left-hand side matrix operand
9479  , typename MT5 // Type of the right-hand side matrix operand
9480  , typename ST2 > // Type of the scalar value
9481  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
9482  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9483  {
9484  const ResultType tmp( serial( A * B * scalar ) );
9485  subAssign( C, tmp );
9486  }
9487  //**********************************************************************************************
9488 
9489  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
9503  template< typename MT3 // Type of the left-hand side target matrix
9504  , typename MT4 // Type of the left-hand side matrix operand
9505  , typename MT5 // Type of the right-hand side matrix operand
9506  , typename ST2 > // Type of the scalar value
9507  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9508  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9509  {
9510  const size_t M( A.rows() );
9511  const size_t N( B.columns() );
9512 
9513  const size_t block( BLOCK_SIZE );
9514 
9515  for( size_t ii=0UL; ii<M; ii+=block ) {
9516  const size_t iend( min( M, ii+block ) );
9517  for( size_t jj=0UL; jj<N; jj+=block ) {
9518  const size_t jend( min( N, jj+block ) );
9519  for( size_t i=ii; i<iend; ++i )
9520  {
9521  const size_t jbegin( ( IsUpper<MT4>::value )
9522  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
9523  :( jj ) );
9524  const size_t jpos( ( IsLower<MT4>::value )
9525  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
9526  :( jend ) );
9527 
9528  for( size_t j=jbegin; j<jpos; ++j ) {
9529  (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
9530  }
9531  }
9532  }
9533  }
9534  }
9535  //**********************************************************************************************
9536 
9537  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
9551  template< typename MT3 // Type of the left-hand side target matrix
9552  , typename MT4 // Type of the left-hand side matrix operand
9553  , typename MT5 // Type of the right-hand side matrix operand
9554  , typename ST2 > // Type of the scalar value
9555  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9556  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9557  {
9558  const size_t M( A.rows() );
9559  const size_t N( B.columns() );
9560 
9561  for( size_t j=0UL; j<N; ++j )
9562  {
9563  const size_t ibegin( ( IsLower<MT4>::value )
9564  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
9565  :( 0UL ) );
9566  const size_t iend( ( IsUpper<MT4>::value )
9567  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
9568  :( M ) );
9569  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
9570 
9571  const size_t inum( iend - ibegin );
9572  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
9573 
9574  for( size_t i=ibegin; i<ipos; i+=2UL ) {
9575  (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
9576  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
9577  }
9578  if( ipos < iend ) {
9579  (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
9580  }
9581  }
9582  }
9583  //**********************************************************************************************
9584 
9585  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
9599  template< typename MT3 // Type of the left-hand side target matrix
9600  , typename MT4 // Type of the left-hand side matrix operand
9601  , typename MT5 // Type of the right-hand side matrix operand
9602  , typename ST2 > // Type of the scalar value
9603  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9604  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9605  {
9606  const size_t M( A.rows() );
9607  const size_t N( B.columns() );
9608 
9609  for( size_t i=0UL; i<M; ++i )
9610  {
9611  const size_t jbegin( ( IsUpper<MT5>::value )
9612  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
9613  :( 0UL ) );
9614  const size_t jend( ( IsLower<MT5>::value )
9615  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
9616  :( N ) );
9617  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
9618 
9619  const size_t jnum( jend - jbegin );
9620  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
9621 
9622  for( size_t j=jbegin; j<jpos; j+=2UL ) {
9623  (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
9624  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
9625  }
9626  if( jpos < jend ) {
9627  (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
9628  }
9629  }
9630  }
9631  //**********************************************************************************************
9632 
9633  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
9647  template< typename MT3 // Type of the left-hand side target matrix
9648  , typename MT4 // Type of the left-hand side matrix operand
9649  , typename MT5 // Type of the right-hand side matrix operand
9650  , typename ST2 > // Type of the scalar value
9651  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9652  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9653  {
9654  const size_t M( A.rows() );
9655  const size_t N( B.columns() );
9656 
9657  const size_t block( BLOCK_SIZE );
9658 
9659  for( size_t jj=0UL; jj<N; jj+=block ) {
9660  const size_t jend( min( N, jj+block ) );
9661  for( size_t ii=0UL; ii<M; ii+=block ) {
9662  const size_t iend( min( M, ii+block ) );
9663  for( size_t j=jj; j<jend; ++j )
9664  {
9665  const size_t ibegin( ( IsLower<MT5>::value )
9666  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
9667  :( ii ) );
9668  const size_t ipos( ( IsUpper<MT5>::value )
9669  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
9670  :( iend ) );
9671 
9672  for( size_t i=ibegin; i<ipos; ++i ) {
9673  (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
9674  }
9675  }
9676  }
9677  }
9678  }
9679  //**********************************************************************************************
9680 
9681  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
9695  template< typename MT3 // Type of the left-hand side target matrix
9696  , typename MT4 // Type of the left-hand side matrix operand
9697  , typename MT5 // Type of the right-hand side matrix operand
9698  , typename ST2 > // Type of the scalar value
9699  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
9700  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9701  {
9702  for( size_t i=0UL; i<A.rows(); ++i ) {
9703  C(i,i) -= A(i,i) * B(i,i) * scalar;
9704  }
9705  }
9706  //**********************************************************************************************
9707 
9708  //**Default subtraction assignment to dense matrices (small matrices)***************************
9722  template< typename MT3 // Type of the left-hand side target matrix
9723  , typename MT4 // Type of the left-hand side matrix operand
9724  , typename MT5 // Type of the right-hand side matrix operand
9725  , typename ST2 > // Type of the scalar value
9726  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
9727  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9728  {
9729  selectDefaultSubAssignKernel( C, A, B, scalar );
9730  }
9731  //**********************************************************************************************
9732 
9733  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
9748  template< typename MT3 // Type of the left-hand side target matrix
9749  , typename MT4 // Type of the left-hand side matrix operand
9750  , typename MT5 // Type of the right-hand side matrix operand
9751  , typename ST2 > // Type of the scalar value
9752  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
9753  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9754  {
9755  const size_t M( A.rows() );
9756  const size_t N( B.columns() );
9757  const size_t K( A.columns() );
9758 
9759  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
9760 
9761  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
9762  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
9763 
9764  const SIMDType factor( set( scalar ) );
9765 
9766  size_t j( 0UL );
9767 
9768  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
9769  for( size_t i=0UL; i<M; ++i )
9770  {
9771  const size_t kbegin( ( IsUpper<MT4>::value )
9772  ?( ( IsLower<MT5>::value )
9773  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9774  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9775  :( IsLower<MT5>::value ? j : 0UL ) );
9776  const size_t kend( ( IsLower<MT4>::value )
9777  ?( ( IsUpper<MT5>::value )
9778  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
9779  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
9780  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
9781 
9782  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9783 
9784  for( size_t k=kbegin; k<kend; ++k ) {
9785  const SIMDType a1( set( A(i,k) ) );
9786  xmm1 = xmm1 + a1 * B.load(k,j );
9787  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
9788  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
9789  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
9790  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
9791  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
9792  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
9793  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
9794  }
9795 
9796  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9797  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9798  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9799  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
9800  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
9801  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
9802  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
9803  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
9804  }
9805  }
9806 
9807  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
9808  {
9809  size_t i( 0UL );
9810 
9811  for( ; (i+2UL) <= M; i+=2UL )
9812  {
9813  const size_t kbegin( ( IsUpper<MT4>::value )
9814  ?( ( IsLower<MT5>::value )
9815  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9816  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9817  :( IsLower<MT5>::value ? j : 0UL ) );
9818  const size_t kend( ( IsLower<MT4>::value )
9819  ?( ( IsUpper<MT5>::value )
9820  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
9821  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9822  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
9823 
9824  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9825 
9826  for( size_t k=kbegin; k<kend; ++k ) {
9827  const SIMDType a1( set( A(i ,k) ) );
9828  const SIMDType a2( set( A(i+1UL,k) ) );
9829  const SIMDType b1( B.load(k,j ) );
9830  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9831  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9832  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9833  xmm1 = xmm1 + a1 * b1;
9834  xmm2 = xmm2 + a1 * b2;
9835  xmm3 = xmm3 + a1 * b3;
9836  xmm4 = xmm4 + a1 * b4;
9837  xmm5 = xmm5 + a2 * b1;
9838  xmm6 = xmm6 + a2 * b2;
9839  xmm7 = xmm7 + a2 * b3;
9840  xmm8 = xmm8 + a2 * b4;
9841  }
9842 
9843  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9844  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
9845  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
9846  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
9847  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
9848  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
9849  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
9850  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
9851  }
9852 
9853  if( i < M )
9854  {
9855  const size_t kbegin( ( IsUpper<MT4>::value )
9856  ?( ( IsLower<MT5>::value )
9857  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9858  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9859  :( IsLower<MT5>::value ? j : 0UL ) );
9860  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
9861 
9862  SIMDType xmm1, xmm2, xmm3, xmm4;
9863 
9864  for( size_t k=kbegin; k<kend; ++k ) {
9865  const SIMDType a1( set( A(i,k) ) );
9866  xmm1 = xmm1 + a1 * B.load(k,j );
9867  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
9868  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
9869  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
9870  }
9871 
9872  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9873  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9874  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9875  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
9876  }
9877  }
9878 
9879  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
9880  {
9881  size_t i( 0UL );
9882 
9883  for( ; (i+2UL) <= M; i+=2UL )
9884  {
9885  const size_t kbegin( ( IsUpper<MT4>::value )
9886  ?( ( IsLower<MT5>::value )
9887  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9888  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9889  :( IsLower<MT5>::value ? j : 0UL ) );
9890  const size_t kend( ( IsLower<MT4>::value )
9891  ?( ( IsUpper<MT5>::value )
9892  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
9893  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9894  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
9895 
9896  SIMDType xmm1, xmm2, xmm3, xmm4;
9897 
9898  for( size_t k=kbegin; k<kend; ++k ) {
9899  const SIMDType a1( set( A(i ,k) ) );
9900  const SIMDType a2( set( A(i+1UL,k) ) );
9901  const SIMDType b1( B.load(k,j ) );
9902  const SIMDType b2( B.load(k,j+SIMDSIZE) );
9903  xmm1 = xmm1 + a1 * b1;
9904  xmm2 = xmm2 + a1 * b2;
9905  xmm3 = xmm3 + a2 * b1;
9906  xmm4 = xmm4 + a2 * b2;
9907  }
9908 
9909  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9910  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
9911  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
9912  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
9913  }
9914 
9915  if( i < M )
9916  {
9917  const size_t kbegin( ( IsUpper<MT4>::value )
9918  ?( ( IsLower<MT5>::value )
9919  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9920  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9921  :( IsLower<MT5>::value ? j : 0UL ) );
9922  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
9923 
9924  SIMDType xmm1, xmm2;
9925 
9926  for( size_t k=kbegin; k<kend; ++k ) {
9927  const SIMDType a1( set( A(i,k) ) );
9928  xmm1 = xmm1 + a1 * B.load(k,j );
9929  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
9930  }
9931 
9932  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9933  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - xmm2 * factor );
9934  }
9935  }
9936 
9937  for( ; j<jpos; j+=SIMDSIZE )
9938  {
9939  size_t i( 0UL );
9940 
9941  for( ; (i+2UL) <= M; i+=2UL )
9942  {
9943  const size_t kbegin( ( IsUpper<MT4>::value )
9944  ?( ( IsLower<MT5>::value )
9945  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9946  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9947  :( IsLower<MT5>::value ? j : 0UL ) );
9948  const size_t kend( ( IsLower<MT4>::value )
9949  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
9950  :( K ) );
9951 
9952  SIMDType xmm1, xmm2;
9953 
9954  for( size_t k=kbegin; k<kend; ++k ) {
9955  const SIMDType b1( B.load(k,j) );
9956  xmm1 = xmm1 + set( A(i ,k) ) * b1;
9957  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
9958  }
9959 
9960  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9961  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
9962  }
9963 
9964  if( i < M )
9965  {
9966  const size_t kbegin( ( IsUpper<MT4>::value )
9967  ?( ( IsLower<MT5>::value )
9968  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9969  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9970  :( IsLower<MT5>::value ? j : 0UL ) );
9971 
9972  SIMDType xmm1;
9973 
9974  for( size_t k=kbegin; k<K; ++k ) {
9975  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
9976  }
9977 
9978  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
9979  }
9980  }
9981 
9982  for( ; remainder && j<N; ++j )
9983  {
9984  size_t i( 0UL );
9985 
9986  for( ; (i+2UL) <= M; i+=2UL )
9987  {
9988  const size_t kbegin( ( IsUpper<MT4>::value )
9989  ?( ( IsLower<MT5>::value )
9990  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9991  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9992  :( IsLower<MT5>::value ? j : 0UL ) );
9993  const size_t kend( ( IsLower<MT4>::value )
9994  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
9995  :( K ) );
9996 
9997  ElementType value1 = ElementType();
9998  ElementType value2 = ElementType();
9999 
10000  for( size_t k=kbegin; k<kend; ++k ) {
10001  value1 += A(i ,k) * B(k,j);
10002  value2 += A(i+1UL,k) * B(k,j);
10003  }
10004 
10005  (~C)(i ,j) -= value1 * scalar;
10006  (~C)(i+1UL,j) -= value2 * scalar;
10007  }
10008 
10009  if( i < M )
10010  {
10011  const size_t kbegin( ( IsUpper<MT4>::value )
10012  ?( ( IsLower<MT5>::value )
10013  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10014  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10015  :( IsLower<MT5>::value ? j : 0UL ) );
10016 
10017  ElementType value = ElementType();
10018 
10019  for( size_t k=kbegin; k<K; ++k ) {
10020  value += A(i,k) * B(k,j);
10021  }
10022 
10023  (~C)(i,j) -= value * scalar;
10024  }
10025  }
10026  }
10027  //**********************************************************************************************
10028 
10029  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Vectorized default kernel for the subtraction assignment of a scaled matrix product to a
// column-major dense matrix (small matrices): C -= scalar * ( A * B ).
//
// Parameters:
//   C       Target matrix. Updated in place through load()/store() in the SIMD loops and through
//           operator() in the scalar clean-up loop.
//   A       Left-hand side operand (M x K). Read with A.load(i,k) in the SIMD loops.
//   B       Right-hand side operand (K x N). Read element-wise with B(k,j).
//   scalar  Scaling factor applied to each accumulated product sum before it is subtracted.
//
// The overload only participates in overload resolution if
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (see the EnableIf_ return type).
//
// Structure: the rows of C are processed in strips of 8, 4, 2 and 1 SIMD vectors; inside a strip
// the columns of C are visited one or two at a time. For every tile the products are summed over
// k into SIMD accumulators and the scaled sum is subtracted from the current content of C.
//
// The k range [kbegin,kend) of every tile is narrowed at compile time according to the
// IsLower/IsUpper/IsStrictlyLower/IsStrictlyUpper traits of MT4 and MT5. Presumably the skipped
// products only involve structurally zero elements -- TODO confirm against the trait definitions.
//
// NOTE(review): the xmm accumulators are default-constructed and immediately used as summands;
// this relies on SIMDType's default constructor producing a zero vector -- confirm.
10044  template< typename MT3 // Type of the left-hand side target matrix
10045  , typename MT4 // Type of the left-hand side matrix operand
10046  , typename MT5 // Type of the right-hand side matrix operand
10047  , typename ST2 > // Type of the scalar value
10048  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
10049  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
10050  {
       // Problem dimensions: C is M x N, A is M x K, B is K x N.
10051  const size_t M( A.rows() );
10052  const size_t N( B.columns() );
10053  const size_t K( A.columns() );
10054 
       // A scalar clean-up loop over the trailing rows is required unless both C and A are padded.
10055  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
10056 
       // End of the row range handled by the SIMD loops. With a remainder this is M rounded down to
       // a multiple of SIMDSIZE (checked by the assertion); otherwise all M rows are vectorized.
10057  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
10058  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
10059 
       // SIMD representation of the scaling factor (presumably a broadcast to all lanes).
10060  const SIMDType factor( set( scalar ) );
10061 
10062  size_t i( 0UL );
10063 
       // Strips of 8 SIMD vectors (rows i .. i+SIMDSIZE*8-1), one column of C per iteration.
10064  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
10065  for( size_t j=0UL; j<N; ++j )
10066  {
10067  const size_t kbegin( ( IsLower<MT5>::value )
10068  ?( ( IsUpper<MT4>::value )
10069  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10070  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10071  :( IsUpper<MT4>::value ? i : 0UL ) );
10072  const size_t kend( ( IsUpper<MT5>::value )
10073  ?( ( IsLower<MT4>::value )
10074  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
10075  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
10076  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
10077 
10078  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10079 
10080  for( size_t k=kbegin; k<kend; ++k ) {
10081  const SIMDType b1( set( B(k,j) ) );
10082  xmm1 = xmm1 + A.load(i ,k) * b1;
10083  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
10084  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
10085  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
10086  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
10087  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
10088  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
10089  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
10090  }
10091 
10092  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10093  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
10094  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
10095  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
10096  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
10097  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
10098  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
10099  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
10100  }
10101  }
10102 
       // Strips of 4 SIMD vectors, two columns of C per iteration (xmm1-4: column j,
       // xmm5-8: column j+1), followed by a single-column pass if N is odd.
10103  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
10104  {
10105  size_t j( 0UL );
10106 
10107  for( ; (j+2UL) <= N; j+=2UL )
10108  {
10109  const size_t kbegin( ( IsLower<MT5>::value )
10110  ?( ( IsUpper<MT4>::value )
10111  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10112  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10113  :( IsUpper<MT4>::value ? i : 0UL ) );
10114  const size_t kend( ( IsUpper<MT5>::value )
10115  ?( ( IsLower<MT4>::value )
10116  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10117  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10118  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
10119 
10120  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10121 
10122  for( size_t k=kbegin; k<kend; ++k ) {
10123  const SIMDType a1( A.load(i ,k) );
10124  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10125  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10126  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10127  const SIMDType b1( set( B(k,j ) ) );
10128  const SIMDType b2( set( B(k,j+1UL) ) );
10129  xmm1 = xmm1 + a1 * b1;
10130  xmm2 = xmm2 + a2 * b1;
10131  xmm3 = xmm3 + a3 * b1;
10132  xmm4 = xmm4 + a4 * b1;
10133  xmm5 = xmm5 + a1 * b2;
10134  xmm6 = xmm6 + a2 * b2;
10135  xmm7 = xmm7 + a3 * b2;
10136  xmm8 = xmm8 + a4 * b2;
10137  }
10138 
10139  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10140  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
10141  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
10142  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
10143  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10144  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
10145  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
10146  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
10147  }
10148 
       // Last column if N is odd.
10149  if( j < N )
10150  {
10151  const size_t kbegin( ( IsLower<MT5>::value )
10152  ?( ( IsUpper<MT4>::value )
10153  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10154  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10155  :( IsUpper<MT4>::value ? i : 0UL ) );
10156  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
10157 
10158  SIMDType xmm1, xmm2, xmm3, xmm4;
10159 
10160  for( size_t k=kbegin; k<kend; ++k ) {
10161  const SIMDType b1( set( B(k,j) ) );
10162  xmm1 = xmm1 + A.load(i ,k) * b1;
10163  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
10164  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
10165  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
10166  }
10167 
10168  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10169  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
10170  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
10171  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
10172  }
10173  }
10174 
       // Strips of 2 SIMD vectors, two columns of C per iteration, plus the odd last column.
10175  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
10176  {
10177  size_t j( 0UL );
10178 
10179  for( ; (j+2UL) <= N; j+=2UL )
10180  {
10181  const size_t kbegin( ( IsLower<MT5>::value )
10182  ?( ( IsUpper<MT4>::value )
10183  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10184  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10185  :( IsUpper<MT4>::value ? i : 0UL ) );
10186  const size_t kend( ( IsUpper<MT5>::value )
10187  ?( ( IsLower<MT4>::value )
10188  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10189  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10190  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
10191 
10192  SIMDType xmm1, xmm2, xmm3, xmm4;
10193 
10194  for( size_t k=kbegin; k<kend; ++k ) {
10195  const SIMDType a1( A.load(i ,k) );
10196  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10197  const SIMDType b1( set( B(k,j ) ) );
10198  const SIMDType b2( set( B(k,j+1UL) ) );
10199  xmm1 = xmm1 + a1 * b1;
10200  xmm2 = xmm2 + a2 * b1;
10201  xmm3 = xmm3 + a1 * b2;
10202  xmm4 = xmm4 + a2 * b2;
10203  }
10204 
10205  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10206  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
10207  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10208  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
10209  }
10210 
10211  if( j < N )
10212  {
10213  const size_t kbegin( ( IsLower<MT5>::value )
10214  ?( ( IsUpper<MT4>::value )
10215  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10216  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10217  :( IsUpper<MT4>::value ? i : 0UL ) );
10218  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
10219 
10220  SIMDType xmm1, xmm2;
10221 
10222  for( size_t k=kbegin; k<kend; ++k ) {
10223  const SIMDType b1( set( B(k,j) ) );
10224  xmm1 = xmm1 + A.load(i ,k) * b1;
10225  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
10226  }
10227 
10228  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10229  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - xmm2 * factor );
10230  }
10231  }
10232 
       // Remaining single SIMD vector strips, two columns of C per iteration, plus the odd last
       // column. Here the upper k bound is only narrowed by the traits of B (MT5).
10233  for( ; i<ipos; i+=SIMDSIZE )
10234  {
10235  size_t j( 0UL );
10236 
10237  for( ; (j+2UL) <= N; j+=2UL )
10238  {
10239  const size_t kbegin( ( IsLower<MT5>::value )
10240  ?( ( IsUpper<MT4>::value )
10241  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10242  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10243  :( IsUpper<MT4>::value ? i : 0UL ) );
10244  const size_t kend( ( IsUpper<MT5>::value )
10245  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10246  :( K ) );
10247 
10248  SIMDType xmm1, xmm2;
10249 
10250  for( size_t k=kbegin; k<kend; ++k ) {
10251  const SIMDType a1( A.load(i,k) );
10252  xmm1 = xmm1 + a1 * set( B(k,j ) );
10253  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
10254  }
10255 
10256  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10257  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
10258  }
10259 
10260  if( j < N )
10261  {
10262  const size_t kbegin( ( IsLower<MT5>::value )
10263  ?( ( IsUpper<MT4>::value )
10264  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10265  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10266  :( IsUpper<MT4>::value ? i : 0UL ) );
10267 
10268  SIMDType xmm1;
10269 
10270  for( size_t k=kbegin; k<K; ++k ) {
10271  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
10272  }
10273 
10274  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10275  }
10276  }
10277 
       // Scalar clean-up for the rows in [ipos,M); only executed if a remainder is required.
       // Uses element access and the unconverted scalar instead of the SIMD factor.
10278  for( ; remainder && i<M; ++i )
10279  {
10280  size_t j( 0UL );
10281 
10282  for( ; (j+2UL) <= N; j+=2UL )
10283  {
10284  const size_t kbegin( ( IsLower<MT5>::value )
10285  ?( ( IsUpper<MT4>::value )
10286  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10287  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10288  :( IsUpper<MT4>::value ? i : 0UL ) );
10289  const size_t kend( ( IsUpper<MT5>::value )
10290  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10291  :( K ) );
10292 
10293  ElementType value1 = ElementType();
10294  ElementType value2 = ElementType();
10295 
10296  for( size_t k=kbegin; k<kend; ++k ) {
10297  value1 += A(i,k) * B(k,j );
10298  value2 += A(i,k) * B(k,j+1UL);
10299  }
10300 
10301  (~C)(i,j ) -= value1 * scalar;
10302  (~C)(i,j+1UL) -= value2 * scalar;
10303  }
10304 
10305  if( j < N )
10306  {
10307  const size_t kbegin( ( IsLower<MT5>::value )
10308  ?( ( IsUpper<MT4>::value )
10309  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10310  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10311  :( IsUpper<MT4>::value ? i : 0UL ) );
10312 
10313  ElementType value = ElementType();
10314 
10315  for( size_t k=kbegin; k<K; ++k ) {
10316  value += A(i,k) * B(k,j);
10317  }
10318 
10319  (~C)(i,j) -= value * scalar;
10320  }
10321  }
10322  }
10323  //**********************************************************************************************
10324 
10325  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Fallback of the large-matrix subtraction assignment kernel for the case that
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (see the DisableIf_ return type).
//
// Parameters:
//   C       Target matrix of the subtraction assignment.
//   A       Left-hand side multiplication operand.
//   B       Right-hand side multiplication operand.
//   scalar  Scaling factor.
//
// No work is done here: all arguments are forwarded unchanged to selectDefaultSubAssignKernel().
10339  template< typename MT3 // Type of the left-hand side target matrix
10340  , typename MT4 // Type of the left-hand side matrix operand
10341  , typename MT5 // Type of the right-hand side matrix operand
10342  , typename ST2 > // Type of the scalar value
10343  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
10344  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10345  {
10346  selectDefaultSubAssignKernel( C, A, B, scalar );
10347  }
10348  //**********************************************************************************************
10349 
10350  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
// Vectorized default kernel for the subtraction assignment of a scaled matrix product to a
// row-major dense matrix (large matrices): C -= scalar * ( A * B ).
//
// Parameters:
//   C       Target matrix. Updated in place through load()/store() in the SIMD loops and through
//           operator() in the scalar clean-up loop.
//   A       Left-hand side operand (M x K). Read element-wise with A(i,k).
//   B       Right-hand side operand (K x N). Read with B.load(k,j) in the SIMD loops.
//   scalar  Scaling factor applied to each accumulated product sum before it is subtracted.
//
// The overload only participates in overload resolution if
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (see the EnableIf_ return type).
//
// Structure: the iteration space is blocked in j, i and k (in this nesting order) using the
// DMATDMATMULT_DEFAULT_{J,I,K}BLOCK_SIZE constants. Within a block the columns of C are processed
// in strips of 4, 2 and 1 SIMD vectors, followed by a scalar loop for the leftover columns.
// Because the k range is split into blocks, every tile of C is loaded, updated with the partial
// sum of the current k block, and stored again once per k block.
//
// The k range [kbegin,kend) of every tile is additionally narrowed at compile time according to
// the IsUpper/IsLower traits of MT4 and MT5. Presumably the skipped products only involve
// structurally zero elements -- TODO confirm against the trait definitions.
//
// NOTE(review): the xmm accumulators are default-constructed and immediately used as summands;
// this relies on SIMDType's default constructor producing a zero vector -- confirm.
10365  template< typename MT3 // Type of the left-hand side target matrix
10366  , typename MT4 // Type of the left-hand side matrix operand
10367  , typename MT5 // Type of the right-hand side matrix operand
10368  , typename ST2 > // Type of the scalar value
10369  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
10370  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
10371  {
       // Problem dimensions: C is M x N, A is M x K, B is K x N.
10372  const size_t M( A.rows() );
10373  const size_t N( B.columns() );
10374  const size_t K( A.columns() );
10375 
       // A scalar clean-up loop over the trailing columns is required unless both C and B are padded.
10376  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
10377 
       // SIMD representation of the scaling factor (presumably a broadcast to all lanes).
10378  const SIMDType factor( set( scalar ) );
10379 
       // Outer blocking over the columns of C.
10380  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
10381  {
10382  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
10383 
       // End of the column range handled by the SIMD loops of this block. With a remainder this is
       // jend rounded down to a multiple of SIMDSIZE (checked by the assertion).
10384  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
10385  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
10386 
       // Blocking over the rows of C.
10387  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
10388  {
10389  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
10390 
       // Blocking over the summation index; ktmp is the (clamped) end of the current k block.
10391  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
10392  {
10393  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
10394 
10395  size_t j( jj );
10396 
       // Strips of 4 SIMD vectors (columns j, j1, j2, j3), two rows of C per iteration
       // (xmm1-4: row i, xmm5-8: row i+1), followed by a single-row pass for an odd row count.
10397  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
10398  {
10399  const size_t j1( j+SIMDSIZE );
10400  const size_t j2( j+SIMDSIZE*2UL );
10401  const size_t j3( j+SIMDSIZE*3UL );
10402 
10403  size_t i( ii );
10404 
10405  for( ; (i+2UL) <= iend; i+=2UL )
10406  {
10407  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10408  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10409  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10410  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
10411 
10412  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10413 
10414  for( size_t k=kbegin; k<kend; ++k ) {
10415  const SIMDType a1( set( A(i ,k) ) );
10416  const SIMDType a2( set( A(i+1UL,k) ) );
10417  const SIMDType b1( B.load(k,j ) );
10418  const SIMDType b2( B.load(k,j1) );
10419  const SIMDType b3( B.load(k,j2) );
10420  const SIMDType b4( B.load(k,j3) );
10421  xmm1 = xmm1 + a1 * b1;
10422  xmm2 = xmm2 + a1 * b2;
10423  xmm3 = xmm3 + a1 * b3;
10424  xmm4 = xmm4 + a1 * b4;
10425  xmm5 = xmm5 + a2 * b1;
10426  xmm6 = xmm6 + a2 * b2;
10427  xmm7 = xmm7 + a2 * b3;
10428  xmm8 = xmm8 + a2 * b4;
10429  }
10430 
10431  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10432  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10433  (~C).store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
10434  (~C).store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
10435  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
10436  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
10437  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
10438  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
10439  }
10440 
10441  if( i < iend )
10442  {
10443  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10444  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10445  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10446  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
10447 
10448  SIMDType xmm1, xmm2, xmm3, xmm4;
10449 
10450  for( size_t k=kbegin; k<kend; ++k ) {
10451  const SIMDType a1( set( A(i,k) ) );
10452  xmm1 = xmm1 + a1 * B.load(k,j );
10453  xmm2 = xmm2 + a1 * B.load(k,j1);
10454  xmm3 = xmm3 + a1 * B.load(k,j2);
10455  xmm4 = xmm4 + a1 * B.load(k,j3);
10456  }
10457 
10458  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10459  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10460  (~C).store( i, j2, (~C).load(i,j2) - xmm3 * factor );
10461  (~C).store( i, j3, (~C).load(i,j3) - xmm4 * factor );
10462  }
10463  }
10464 
       // Strips of 2 SIMD vectors (columns j, j1); rows are processed four at a time, then two at
       // a time, then a final single row.
10465  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
10466  {
10467  const size_t j1( j+SIMDSIZE );
10468 
10469  size_t i( ii );
10470 
10471  for( ; (i+4UL) <= iend; i+=4UL )
10472  {
10473  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10474  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10475  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
10476  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
10477 
10478  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10479 
10480  for( size_t k=kbegin; k<kend; ++k ) {
10481  const SIMDType a1( set( A(i ,k) ) );
10482  const SIMDType a2( set( A(i+1UL,k) ) );
10483  const SIMDType a3( set( A(i+2UL,k) ) );
10484  const SIMDType a4( set( A(i+3UL,k) ) );
10485  const SIMDType b1( B.load(k,j ) );
10486  const SIMDType b2( B.load(k,j1) );
10487  xmm1 = xmm1 + a1 * b1;
10488  xmm2 = xmm2 + a1 * b2;
10489  xmm3 = xmm3 + a2 * b1;
10490  xmm4 = xmm4 + a2 * b2;
10491  xmm5 = xmm5 + a3 * b1;
10492  xmm6 = xmm6 + a3 * b2;
10493  xmm7 = xmm7 + a4 * b1;
10494  xmm8 = xmm8 + a4 * b2;
10495  }
10496 
10497  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10498  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10499  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10500  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10501  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
10502  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
10503  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
10504  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
10505  }
10506 
10507  for( ; (i+2UL) <= iend; i+=2UL )
10508  {
10509  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10510  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10511  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10512  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
10513 
10514  SIMDType xmm1, xmm2, xmm3, xmm4;
10515 
10516  for( size_t k=kbegin; k<kend; ++k ) {
10517  const SIMDType a1( set( A(i ,k) ) );
10518  const SIMDType a2( set( A(i+1UL,k) ) );
10519  const SIMDType b1( B.load(k,j ) );
10520  const SIMDType b2( B.load(k,j1) );
10521  xmm1 = xmm1 + a1 * b1;
10522  xmm2 = xmm2 + a1 * b2;
10523  xmm3 = xmm3 + a2 * b1;
10524  xmm4 = xmm4 + a2 * b2;
10525  }
10526 
10527  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10528  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10529  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10530  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10531  }
10532 
10533  if( i < iend )
10534  {
10535  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10536  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10537  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10538  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
10539 
10540  SIMDType xmm1, xmm2;
10541 
10542  for( size_t k=kbegin; k<kend; ++k ) {
10543  const SIMDType a1( set( A(i,k) ) );
10544  xmm1 = xmm1 + a1 * B.load(k,j );
10545  xmm2 = xmm2 + a1 * B.load(k,j1);
10546  }
10547 
10548  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10549  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10550  }
10551  }
10552 
       // Remaining single SIMD vector strips, one row of C per iteration.
10553  for( ; j<jpos; j+=SIMDSIZE )
10554  {
10555  for( size_t i=ii; i<iend; ++i )
10556  {
10557  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10558  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10559  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10560  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
10561 
10562  SIMDType xmm1;
10563 
10564  for( size_t k=kbegin; k<kend; ++k ) {
10565  const SIMDType a1( set( A(i,k) ) );
10566  xmm1 = xmm1 + a1 * B.load(k,j);
10567  }
10568 
10569  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10570  }
10571  }
10572 
       // Scalar clean-up for the columns in [jpos,jend); only executed if a remainder is required.
       // Uses element access and the unconverted scalar instead of the SIMD factor.
10573  for( ; remainder && j<jend; ++j )
10574  {
10575  for( size_t i=ii; i<iend; ++i )
10576  {
10577  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10578  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10579  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10580  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
10581 
10582  ElementType value = ElementType();
10583 
10584  for( size_t k=kbegin; k<kend; ++k ) {
10585  value += A(i,k) * B(k,j);
10586  }
10587 
10588  (~C)(i,j) -= value * scalar;
10589  }
10590  }
10591  }
10592  }
10593  }
10594  }
10595  //**********************************************************************************************
10596 
10597  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
// Vectorized default kernel for the subtraction assignment of a scaled matrix product to a
// column-major dense matrix (large matrices): C -= scalar * ( A * B ).
//
// Parameters:
//   C       Target matrix. Updated in place through load()/store() in the SIMD loops and through
//           operator() in the scalar clean-up loop.
//   A       Left-hand side operand (M x K). Read with A.load(i,k) in the SIMD loops.
//   B       Right-hand side operand (K x N). Read element-wise with B(k,j).
//   scalar  Scaling factor applied to each accumulated product sum before it is subtracted.
//
// The overload only participates in overload resolution if
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (see the EnableIf_ return type).
//
// Structure: the iteration space is blocked in i, j and k (in this nesting order) using the
// TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE constants. Within a block the rows of C are processed
// in strips of 4, 2 and 1 SIMD vectors, followed by a scalar loop for the leftover rows.
// Because the k range is split into blocks, every tile of C is loaded, updated with the partial
// sum of the current k block, and stored again once per k block.
//
// The k range [kbegin,kend) of every tile is additionally narrowed at compile time according to
// the IsUpper/IsLower traits of MT4 and MT5. Presumably the skipped products only involve
// structurally zero elements -- TODO confirm against the trait definitions.
//
// NOTE(review): the xmm accumulators are default-constructed and immediately used as summands;
// this relies on SIMDType's default constructor producing a zero vector -- confirm.
10612  template< typename MT3 // Type of the left-hand side target matrix
10613  , typename MT4 // Type of the left-hand side matrix operand
10614  , typename MT5 // Type of the right-hand side matrix operand
10615  , typename ST2 > // Type of the scalar value
10616  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
10617  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
10618  {
       // Problem dimensions: C is M x N, A is M x K, B is K x N.
10619  const size_t M( A.rows() );
10620  const size_t N( B.columns() );
10621  const size_t K( A.columns() );
10622 
       // A scalar clean-up loop over the trailing rows is required unless both C and A are padded.
10623  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
10624 
       // SIMD representation of the scaling factor (presumably a broadcast to all lanes).
10625  const SIMDType factor( set( scalar ) );
10626 
       // Outer blocking over the rows of C.
10627  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
10628  {
10629  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
10630 
       // End of the row range handled by the SIMD loops of this block. With a remainder this is
       // iend rounded down to a multiple of SIMDSIZE (checked by the assertion).
10631  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
10632  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
10633 
       // Blocking over the columns of C.
10634  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
10635  {
10636  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
10637 
       // Blocking over the summation index; ktmp is the (clamped) end of the current k block.
10638  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
10639  {
10640  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
10641 
10642  size_t i( ii );
10643 
       // Strips of 4 SIMD vectors (rows i, i1, i2, i3), two columns of C per iteration
       // (xmm1-4: column j, xmm5-8: column j+1), followed by a single-column pass for an odd
       // column count.
10644  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
10645  {
10646  const size_t i1( i+SIMDSIZE );
10647  const size_t i2( i+SIMDSIZE*2UL );
10648  const size_t i3( i+SIMDSIZE*3UL );
10649 
10650  size_t j( jj );
10651 
10652  for( ; (j+2UL) <= jend; j+=2UL )
10653  {
10654  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10655  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10656  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
10657  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10658 
10659  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10660 
10661  for( size_t k=kbegin; k<kend; ++k ) {
10662  const SIMDType a1( A.load(i ,k) );
10663  const SIMDType a2( A.load(i1,k) );
10664  const SIMDType a3( A.load(i2,k) );
10665  const SIMDType a4( A.load(i3,k) );
10666  const SIMDType b1( set( B(k,j ) ) );
10667  const SIMDType b2( set( B(k,j+1UL) ) );
10668  xmm1 = xmm1 + a1 * b1;
10669  xmm2 = xmm2 + a2 * b1;
10670  xmm3 = xmm3 + a3 * b1;
10671  xmm4 = xmm4 + a4 * b1;
10672  xmm5 = xmm5 + a1 * b2;
10673  xmm6 = xmm6 + a2 * b2;
10674  xmm7 = xmm7 + a3 * b2;
10675  xmm8 = xmm8 + a4 * b2;
10676  }
10677 
10678  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10679  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10680  (~C).store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
10681  (~C).store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
10682  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10683  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
10684  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
10685  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
10686  }
10687 
10688  if( j < jend )
10689  {
10690  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10691  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10692  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
10693  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10694 
10695  SIMDType xmm1, xmm2, xmm3, xmm4;
10696 
10697  for( size_t k=kbegin; k<kend; ++k ) {
10698  const SIMDType b1( set( B(k,j) ) );
10699  xmm1 = xmm1 + A.load(i ,k) * b1;
10700  xmm2 = xmm2 + A.load(i1,k) * b1;
10701  xmm3 = xmm3 + A.load(i2,k) * b1;
10702  xmm4 = xmm4 + A.load(i3,k) * b1;
10703  }
10704 
10705  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10706  (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
10707  (~C).store( i2, j, (~C).load(i2,j) - xmm3 * factor );
10708  (~C).store( i3, j, (~C).load(i3,j) - xmm4 * factor );
10709  }
10710  }
10711 
       // Strips of 2 SIMD vectors (rows i, i1); columns are processed four at a time, then two at
       // a time, then a final single column.
10712  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
10713  {
10714  const size_t i1( i+SIMDSIZE );
10715 
10716  size_t j( jj );
10717 
10718  for( ; (j+4UL) <= jend; j+=4UL )
10719  {
10720  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10721  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10722  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
10723  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
10724 
10725  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10726 
10727  for( size_t k=kbegin; k<kend; ++k ) {
10728  const SIMDType a1( A.load(i ,k) );
10729  const SIMDType a2( A.load(i1,k) );
10730  const SIMDType b1( set( B(k,j ) ) );
10731  const SIMDType b2( set( B(k,j+1UL) ) );
10732  const SIMDType b3( set( B(k,j+2UL) ) );
10733  const SIMDType b4( set( B(k,j+3UL) ) );
10734  xmm1 = xmm1 + a1 * b1;
10735  xmm2 = xmm2 + a2 * b1;
10736  xmm3 = xmm3 + a1 * b2;
10737  xmm4 = xmm4 + a2 * b2;
10738  xmm5 = xmm5 + a1 * b3;
10739  xmm6 = xmm6 + a2 * b3;
10740  xmm7 = xmm7 + a1 * b4;
10741  xmm8 = xmm8 + a2 * b4;
10742  }
10743 
10744  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10745  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10746  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10747  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
10748  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
10749  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
10750  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
10751  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
10752  }
10753 
10754  for( ; (j+2UL) <= jend; j+=2UL )
10755  {
10756  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10757  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10758  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
10759  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10760 
10761  SIMDType xmm1, xmm2, xmm3, xmm4;
10762 
10763  for( size_t k=kbegin; k<kend; ++k ) {
10764  const SIMDType a1( A.load(i ,k) );
10765  const SIMDType a2( A.load(i1,k) );
10766  const SIMDType b1( set( B(k,j ) ) );
10767  const SIMDType b2( set( B(k,j+1UL) ) );
10768  xmm1 = xmm1 + a1 * b1;
10769  xmm2 = xmm2 + a2 * b1;
10770  xmm3 = xmm3 + a1 * b2;
10771  xmm4 = xmm4 + a2 * b2;
10772  }
10773 
10774  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10775  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10776  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10777  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
10778  }
10779 
10780  if( j < jend )
10781  {
10782  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10783  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10784  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
10785  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10786 
10787  SIMDType xmm1, xmm2;
10788 
10789  for( size_t k=kbegin; k<kend; ++k ) {
10790  const SIMDType b1( set( B(k,j) ) );
10791  xmm1 = xmm1 + A.load(i ,k) * b1;
10792  xmm2 = xmm2 + A.load(i1,k) * b1;
10793  }
10794 
10795  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10796  (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
10797  }
10798  }
10799 
       // Remaining single SIMD vector strips, one column of C per iteration.
10800  for( ; i<ipos; i+=SIMDSIZE )
10801  {
10802  for( size_t j=jj; j<jend; ++j )
10803  {
10804  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10805  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10806  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
10807  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10808 
10809  SIMDType xmm1;
10810 
10811  for( size_t k=kbegin; k<kend; ++k ) {
10812  const SIMDType b1( set( B(k,j) ) );
10813  xmm1 = xmm1 + A.load(i,k) * b1;
10814  }
10815 
10816  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10817  }
10818  }
10819 
       // Scalar clean-up for the rows in [ipos,iend); only executed if a remainder is required.
       // Uses element access and the unconverted scalar instead of the SIMD factor.
10820  for( ; remainder && i<iend; ++i )
10821  {
10822  for( size_t j=jj; j<jend; ++j )
10823  {
10824  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10825  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10826  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
10827  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10828 
10829  ElementType value = ElementType();
10830 
10831  for( size_t k=kbegin; k<kend; ++k ) {
10832  value += A(i,k) * B(k,j);
10833  }
10834 
10835  (~C)(i,j) -= value * scalar;
10836  }
10837  }
10838  }
10839  }
10840  }
10841  }
10842  //**********************************************************************************************
10843 
10844  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
 /*!\brief Fallback for the BLAS-based subtraction assignment kernel selection.
 //
 // \param C The target left-hand side matrix.
 // \param A The left-hand side multiplication operand.
 // \param B The right-hand side multiplication operand.
 // \param scalar The scaling factor.
 // \return void
 //
 // This overload participates in overload resolution only when
 // UseBlasKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_). It simply forwards all
 // arguments unchanged to selectLargeSubAssignKernel().
 */
10858  template< typename MT3 // Type of the left-hand side target matrix
10859  , typename MT4 // Type of the left-hand side matrix operand
10860  , typename MT5 // Type of the right-hand side matrix operand
10861  , typename ST2 > // Type of the scalar value
10862  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
10863  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10864  {
10865  selectLargeSubAssignKernel( C, A, B, scalar );
10866  }
10867  //**********************************************************************************************
10868 
10869  //**BLAS-based subtraction assignment to dense matrices******************************************
10870 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
10871 
 /*!\brief BLAS-based subtraction assignment of a scaled matrix product to the target C.
 //
 // \param C The target left-hand side matrix.
 // \param A The left-hand side multiplication operand.
 // \param B The right-hand side multiplication operand.
 // \param scalar The scaling factor.
 // \return void
 //
 // This overload participates in overload resolution only when
 // UseBlasKernel<MT3,MT4,MT5,ST2> holds (EnableIf_), and it is compiled only if
 // BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled.
 // Three cases are distinguished at compile time:
 //  - A is triangular: B is serially evaluated into a temporary, trmm() is applied to it
 //    from the left, and the temporary is subtracted from C;
 //  - B is triangular: A is serially evaluated into a temporary, trmm() is applied to it
 //    from the right, and the temporary is subtracted from C;
 //  - otherwise: a single gemm() call with the negated scalar and a factor of 1.
 // NOTE(review): presumably trmm() overwrites \a tmp with the scaled triangular product and
 // gemm() computes C = alpha*A*B + beta*C -- confirm against the blas/trmm.h and blas/gemm.h
 // wrappers.
 */
10884  template< typename MT3 // Type of the left-hand side target matrix
10885  , typename MT4 // Type of the left-hand side matrix operand
10886  , typename MT5 // Type of the right-hand side matrix operand
10887  , typename ST2 > // Type of the scalar value
10888  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
10889  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10890  {
10891  typedef ElementType_<MT3> ET;
10892 
 // Left operand is triangular: work on a copy of B and subtract the result from C
10893  if( IsTriangular<MT4>::value ) {
10894  ResultType_<MT3> tmp( serial( B ) );
10895  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10896  subAssign( C, tmp );
10897  }
 // Right operand is triangular: work on a copy of A and subtract the result from C
10898  else if( IsTriangular<MT5>::value ) {
10899  ResultType_<MT3> tmp( serial( A ) );
10900  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10901  subAssign( C, tmp );
10902  }
 // General case: the subtraction is expressed via the negated scalar and a factor of 1 for C
10903  else {
10904  gemm( C, A, B, ET(-scalar), ET(1) );
10905  }
10906  }
10907 #endif
10908  //**********************************************************************************************
10909 
10910  //**Subtraction assignment to sparse matrices***************************************************
10911  // No special implementation for the subtraction assignment to sparse matrices.
10912  //**********************************************************************************************
10913 
10914  //**Multiplication assignment to dense matrices*************************************************
10915  // No special implementation for the multiplication assignment to dense matrices.
10916  //**********************************************************************************************
10917 
10918  //**Multiplication assignment to sparse matrices************************************************
10919  // No special implementation for the multiplication assignment to sparse matrices.
10920  //**********************************************************************************************
10921 
10922  //**SMP assignment to dense matrices************************************************************
 /*!\brief SMP assignment of a scaled matrix product to a dense target matrix.
 //
 // \param lhs The target left-hand side dense matrix.
 // \param rhs The right-hand side scaled multiplication expression to be assigned.
 // \return void
 //
 // This overload is enabled only if IsEvaluationRequired<MT,MT1,MT2> holds. Both operands
 // of the wrapped product are first evaluated into \a A and \a B (types LT and RT), and the
 // expression A * B * scalar is then handed to smpAssign() again for the evaluated operands.
 */
10937  template< typename MT // Type of the target dense matrix
10938  , bool SO > // Storage order of the target dense matrix
10939  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
10940  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
10941  {
10943 
10944  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
10945  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
10946 
10947  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
10948  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
10949 
 // Empty target: nothing to assign.
 // Zero inner dimension: no term contributes to the product, so the target is only reset.
10950  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
10951  return;
10952  }
10953  else if( left.columns() == 0UL ) {
10954  reset( ~lhs );
10955  return;
10956  }
10957 
10958  LT A( left ); // Evaluation of the left-hand side dense matrix operand
10959  RT B( right ); // Evaluation of the right-hand side dense matrix operand
10960 
10961  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
10962  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
10963  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
10964  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
10965  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
10966  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
10967 
 // Re-dispatch on the product of the evaluated operands
10968  smpAssign( ~lhs, A * B * rhs.scalar_ );
10969  }
10970  //**********************************************************************************************
10971 
10972  //**SMP assignment to sparse matrices***********************************************************
 /*!\brief SMP assignment of a scaled matrix product to a sparse target matrix.
 //
 // \param lhs The target left-hand side sparse matrix.
 // \param rhs The right-hand side scaled multiplication expression to be assigned.
 // \return void
 //
 // This overload is enabled only if IsEvaluationRequired<MT,MT1,MT2> holds. The expression
 // is first evaluated into a temporary whose type is selected by the storage order \a SO of
 // the target (ResultType if \a SO is \a true, OppositeType otherwise); the temporary is
 // then assigned to the target via smpAssign().
 */
10987  template< typename MT // Type of the target sparse matrix
10988  , bool SO > // Storage order of the target sparse matrix
10989  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
10990  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
10991  {
10993 
10994  typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
10995 
11001  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
11002 
11003  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11004  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11005 
 // Evaluate the complete expression once, then assign the evaluated result
11006  const TmpType tmp( rhs );
11007  smpAssign( ~lhs, tmp );
11008  }
11009  //**********************************************************************************************
11010 
11011  //**SMP addition assignment to dense matrices***************************************************
 /*!\brief SMP addition assignment of a scaled matrix product to a dense target matrix.
 //
 // \param lhs The target left-hand side dense matrix.
 // \param rhs The right-hand side scaled multiplication expression to be added.
 // \return void
 //
 // This overload is enabled only if IsEvaluationRequired<MT,MT1,MT2> holds. Both operands
 // of the wrapped product are first evaluated into \a A and \a B (types LT and RT), and the
 // expression A * B * scalar is then handed to smpAddAssign() for the evaluated operands.
 */
11026  template< typename MT // Type of the target dense matrix
11027  , bool SO > // Storage order of the target dense matrix
11028  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
11029  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11030  {
11032 
11033  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11034  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11035 
11036  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
11037  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
11038 
 // Empty target or zero inner dimension: the target is left untouched
11039  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11040  return;
11041  }
11042 
11043  LT A( left ); // Evaluation of the left-hand side dense matrix operand
11044  RT B( right ); // Evaluation of the right-hand side dense matrix operand
11045 
11046  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11047  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11048  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11049  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11050  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11051  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11052 
 // Re-dispatch on the product of the evaluated operands
11053  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
11054  }
11055  //**********************************************************************************************
11056 
11057  //**SMP addition assignment to sparse matrices**************************************************
11058  // No special implementation for the SMP addition assignment to sparse matrices.
11059  //**********************************************************************************************
11060 
11061  //**SMP subtraction assignment to dense matrices************************************************
 /*!\brief SMP subtraction assignment of a scaled matrix product to a dense target matrix.
 //
 // \param lhs The target left-hand side dense matrix.
 // \param rhs The right-hand side scaled multiplication expression to be subtracted.
 // \return void
 //
 // This overload is enabled only if IsEvaluationRequired<MT,MT1,MT2> holds. Both operands
 // of the wrapped product are first evaluated into \a A and \a B (types LT and RT), and the
 // expression A * B * scalar is then handed to smpSubAssign() for the evaluated operands.
 */
11076  template< typename MT // Type of the target dense matrix
11077  , bool SO > // Storage order of the target dense matrix
11078  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
11079  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11080  {
11082 
11083  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11084  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11085 
11086  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
11087  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
11088 
 // Empty target or zero inner dimension: the target is left untouched
11089  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11090  return;
11091  }
11092 
11093  LT A( left ); // Evaluation of the left-hand side dense matrix operand
11094  RT B( right ); // Evaluation of the right-hand side dense matrix operand
11095 
11096  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11097  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11098  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11099  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11100  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11101  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11102 
 // Re-dispatch on the product of the evaluated operands
11103  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
11104  }
11105  //**********************************************************************************************
11106 
11107  //**SMP subtraction assignment to sparse matrices***********************************************
11108  // No special implementation for the SMP subtraction assignment to sparse matrices.
11109  //**********************************************************************************************
11110 
11111  //**SMP multiplication assignment to dense matrices*********************************************
11112  // No special implementation for the SMP multiplication assignment to dense matrices.
11113  //**********************************************************************************************
11114 
11115  //**SMP multiplication assignment to sparse matrices********************************************
11116  // No special implementation for the SMP multiplication assignment to sparse matrices.
11117  //**********************************************************************************************
11118 
11119  //**Compile time checks*************************************************************************
11127  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
11128  //**********************************************************************************************
11129 };
11131 //*************************************************************************************************
11132 
11133 
11134 
11135 
11136 //=================================================================================================
11137 //
11138 // GLOBAL BINARY ARITHMETIC OPERATORS
11139 //
11140 //=================================================================================================
11141 
11142 //*************************************************************************************************
/*!\brief Multiplication operator for the multiplication of a column-major dense matrix and a
//        row-major dense matrix (\f$ A=B*C \f$).
//
// \param lhs The left-hand side matrix for the multiplication.
// \param rhs The right-hand side matrix for the multiplication.
// \return The expression object representing the product of the two matrices.
// \exception std::invalid_argument Matrix sizes do not match.
//
// The operator does not compute the product itself; it only checks that the number of columns
// of \a lhs equals the number of rows of \a rhs and returns a TDMatDMatMultExpr expression
// object that refers to both operands.
*/
11171 template< typename T1 // Type of the left-hand side dense matrix
11172  , typename T2 > // Type of the right-hand side dense matrix
11173 inline const TDMatDMatMultExpr<T1,T2>
11174  operator*( const DenseMatrix<T1,true>& lhs, const DenseMatrix<T2,false>& rhs )
11175 {
11177 
 // The inner dimensions of the two operands have to agree
11178  if( (~lhs).columns() != (~rhs).rows() ) {
11179  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
11180  }
11181 
11182  return TDMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
11183 }
11184 //*************************************************************************************************
11185 
11186 
11187 
11188 
11189 //=================================================================================================
11190 //
11191 // ROWS SPECIALIZATIONS
11192 //
11193 //=================================================================================================
11194 
11195 //*************************************************************************************************
// Rows specialization for TDMatDMatMultExpr: forwarded to the Rows trait of the
// left-hand side operand type MT1.
11197 template< typename MT1, typename MT2 >
11198 struct Rows< TDMatDMatMultExpr<MT1,MT2> > : public Rows<MT1>
11199 {};
11201 //*************************************************************************************************
11202 
11203 
11204 
11205 
11206 //=================================================================================================
11207 //
11208 // COLUMNS SPECIALIZATIONS
11209 //
11210 //=================================================================================================
11211 
11212 //*************************************************************************************************
// Columns specialization for TDMatDMatMultExpr: forwarded to the Columns trait of the
// right-hand side operand type MT2.
11214 template< typename MT1, typename MT2 >
11215 struct Columns< TDMatDMatMultExpr<MT1,MT2> > : public Columns<MT2>
11216 {};
11218 //*************************************************************************************************
11219 
11220 
11221 
11222 
11223 //=================================================================================================
11224 //
11225 // ISALIGNED SPECIALIZATIONS
11226 //
11227 //=================================================================================================
11228 
11229 //*************************************************************************************************
// IsAligned specialization for TDMatDMatMultExpr: the expression counts as aligned only if
// both operand types MT1 and MT2 are aligned.
11231 template< typename MT1, typename MT2 >
11232 struct IsAligned< TDMatDMatMultExpr<MT1,MT2> >
11233  : public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
11234 {};
11236 //*************************************************************************************************
11237 
11238 
11239 
11240 
11241 //=================================================================================================
11242 //
11243 // ISLOWER SPECIALIZATIONS
11244 //
11245 //=================================================================================================
11246 
11247 //*************************************************************************************************
// IsLower specialization for TDMatDMatMultExpr: the product is flagged as lower triangular
// only if both operand types are lower triangular.
11249 template< typename MT1, typename MT2 >
11250 struct IsLower< TDMatDMatMultExpr<MT1,MT2> >
11251  : public BoolConstant< And< IsLower<MT1>, IsLower<MT2> >::value >
11252 {};
11254 //*************************************************************************************************
11255 
11256 
11257 
11258 
11259 //=================================================================================================
11260 //
11261 // ISUNILOWER SPECIALIZATIONS
11262 //
11263 //=================================================================================================
11264 
11265 //*************************************************************************************************
// IsUniLower specialization for TDMatDMatMultExpr: the product is flagged as lower
// unitriangular only if both operand types are lower unitriangular.
11267 template< typename MT1, typename MT2 >
11268 struct IsUniLower< TDMatDMatMultExpr<MT1,MT2> >
11269  : public BoolConstant< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
11270 {};
11272 //*************************************************************************************************
11273 
11274 
11275 
11276 
11277 //=================================================================================================
11278 //
11279 // ISSTRICTLYLOWER SPECIALIZATIONS
11280 //
11281 //=================================================================================================
11282 
11283 //*************************************************************************************************
// IsStrictlyLower specialization for TDMatDMatMultExpr: the product is flagged as strictly
// lower triangular if one operand type is strictly lower triangular and the other one is
// (at least) lower triangular.
11285 template< typename MT1, typename MT2 >
11286 struct IsStrictlyLower< TDMatDMatMultExpr<MT1,MT2> >
11287  : public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
11288  , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
11289 {};
11291 //*************************************************************************************************
11292 
11293 
11294 
11295 
11296 //=================================================================================================
11297 //
11298 // ISUPPER SPECIALIZATIONS
11299 //
11300 //=================================================================================================
11301 
11302 //*************************************************************************************************
// IsUpper specialization for TDMatDMatMultExpr: the product is flagged as upper triangular
// only if both operand types are upper triangular.
11304 template< typename MT1, typename MT2 >
11305 struct IsUpper< TDMatDMatMultExpr<MT1,MT2> >
11306  : public BoolConstant< And< IsUpper<MT1>, IsUpper<MT2> >::value >
11307 {};
11309 //*************************************************************************************************
11310 
11311 
11312 
11313 
11314 //=================================================================================================
11315 //
11316 // ISUNIUPPER SPECIALIZATIONS
11317 //
11318 //=================================================================================================
11319 
11320 //*************************************************************************************************
// IsUniUpper specialization for TDMatDMatMultExpr: the product is flagged as upper
// unitriangular only if both operand types are upper unitriangular.
11322 template< typename MT1, typename MT2 >
11323 struct IsUniUpper< TDMatDMatMultExpr<MT1,MT2> >
11324  : public BoolConstant< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
11325 {};
11327 //*************************************************************************************************
11328 
11329 
11330 
11331 
11332 //=================================================================================================
11333 //
11334 // ISSTRICTLYUPPER SPECIALIZATIONS
11335 //
11336 //=================================================================================================
11337 
11338 //*************************************************************************************************
// IsStrictlyUpper specialization for TDMatDMatMultExpr: the product is flagged as strictly
// upper triangular if one operand type is strictly upper triangular and the other one is
// (at least) upper triangular.
11340 template< typename MT1, typename MT2 >
11341 struct IsStrictlyUpper< TDMatDMatMultExpr<MT1,MT2> >
11342  : public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
11343  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
11344 {};
11346 //*************************************************************************************************
11347 
11348 
11349 
11350 
11351 //=================================================================================================
11352 //
11353 // EXPRESSION TRAIT SPECIALIZATIONS
11354 //
11355 //=================================================================================================
11356 
11357 //*************************************************************************************************
// Expression trait for (A*B)*v with a dense column vector v: if MT1 is a column-major dense
// matrix, MT2 a row-major dense matrix and VT a dense column vector, the resulting type is
// that of the regrouped expression A*(B*v); otherwise it is INVALID_TYPE.
11359 template< typename MT1, typename MT2, typename VT >
11360 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
11361 {
11362  public:
11363  //**********************************************************************************************
11364  using Type = If_< And< IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
11365  , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2>
11366  , IsDenseVector<VT>, IsColumnVector<VT> >
11367  , TDMatDVecMultExprTrait_< MT1, DMatDVecMultExprTrait_<MT2,VT> >
11368  , INVALID_TYPE >;
11369  //**********************************************************************************************
11370 };
11372 //*************************************************************************************************
11373 
11374 
11375 //*************************************************************************************************
// Expression trait for (A*B)*v with a sparse column vector v: if MT1 is a column-major dense
// matrix, MT2 a row-major dense matrix and VT a sparse column vector, the resulting type is
// that of the regrouped expression A*(B*v); otherwise it is INVALID_TYPE. The outer product
// is looked up via TDMatDVecMultExprTrait_, i.e. the inner result B*v is treated as a dense
// vector operand.
11377 template< typename MT1, typename MT2, typename VT >
11378 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
11379 {
11380  public:
11381  //**********************************************************************************************
11382  using Type = If_< And< IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
11383  , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2>
11384  , IsSparseVector<VT>, IsColumnVector<VT> >
11385  , TDMatDVecMultExprTrait_< MT1, DMatSVecMultExprTrait_<MT2,VT> >
11386  , INVALID_TYPE >;
11387  //**********************************************************************************************
11388 };
11390 //*************************************************************************************************
11391 
11392 
11393 //*************************************************************************************************
// Expression trait for v*(A*B) with a dense row vector v: if VT is a dense row vector, MT1 a
// column-major dense matrix and MT2 a row-major dense matrix, the resulting type is that of
// the regrouped expression (v*A)*B; otherwise it is INVALID_TYPE.
11395 template< typename VT, typename MT1, typename MT2 >
11396 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
11397 {
11398  public:
11399  //**********************************************************************************************
11400  using Type = If_< And< IsDenseVector<VT>, IsRowVector<VT>
11401  , IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
11402  , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2> >
11403  , TDVecDMatMultExprTrait_< TDVecTDMatMultExprTrait_<VT,MT1>, MT2 >
11404  , INVALID_TYPE >;
11405  //**********************************************************************************************
11406 };
11408 //*************************************************************************************************
11409 
11410 
11411 //*************************************************************************************************
// Expression trait for v*(A*B) with a sparse row vector v: if VT is a sparse row vector, MT1
// a column-major dense matrix and MT2 a row-major dense matrix, the resulting type is that of
// the regrouped expression (v*A)*B; otherwise it is INVALID_TYPE. The outer product is looked
// up via TDVecDMatMultExprTrait_, i.e. the inner result v*A is treated as a dense vector
// operand.
11413 template< typename VT, typename MT1, typename MT2 >
11414 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
11415 {
11416  public:
11417  //**********************************************************************************************
11418  using Type = If_< And< IsSparseVector<VT>, IsRowVector<VT>
11419  , IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
11420  , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2> >
11421  , TDVecDMatMultExprTrait_< TSVecTDMatMultExprTrait_<VT,MT1>, MT2 >
11422  , INVALID_TYPE >;
11423  //**********************************************************************************************
11424 };
11426 //*************************************************************************************************
11427 
11428 
11429 //*************************************************************************************************
// Submatrix expression trait for TDMatDMatMultExpr: the resulting type is the product type of
// the submatrix types of both (const-qualified) operands, using the same alignment flag AF.
11431 template< typename MT1, typename MT2, bool AF >
11432 struct SubmatrixExprTrait< TDMatDMatMultExpr<MT1,MT2>, AF >
11433 {
11434  public:
11435  //**********************************************************************************************
11436  using Type = MultExprTrait_< SubmatrixExprTrait_<const MT1,AF>
11437  , SubmatrixExprTrait_<const MT2,AF> >;
11438  //**********************************************************************************************
11439 };
11441 //*************************************************************************************************
11442 
11443 
11444 //*************************************************************************************************
// Row expression trait for TDMatDMatMultExpr: the resulting type is the product type of the
// row type of the (const-qualified) left operand and the complete right operand type MT2.
11446 template< typename MT1, typename MT2 >
11447 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >
11448 {
11449  public:
11450  //**********************************************************************************************
11451  using Type = MultExprTrait_< RowExprTrait_<const MT1>, MT2 >;
11452  //**********************************************************************************************
11453 };
11455 //*************************************************************************************************
11456 
11457 
11458 //*************************************************************************************************
// Column expression trait for TDMatDMatMultExpr: the resulting type is the product type of
// the complete left operand type MT1 and the column type of the (const-qualified) right
// operand.
11460 template< typename MT1, typename MT2 >
11461 struct ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >
11462 {
11463  public:
11464  //**********************************************************************************************
11465  using Type = MultExprTrait_< MT1, ColumnExprTrait_<const MT2> >;
11466  //**********************************************************************************************
11467 };
11469 //*************************************************************************************************
11470 
11471 } // namespace blaze
11472 
11473 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Data type constraint.
If_< IsExpression< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:243
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices (A=B*C).
Definition: DMatDMatMultExpr.h:7800
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:438
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:437
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:249
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:157
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:234
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
Header file for the IsRowVector type trait.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:418
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:338
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:408
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
Expression object for transpose dense matrix-dense matrix multiplications.The TDMatDMatMultExpr class...
Definition: Forward.h:128
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:230
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Header file for the TSVecTDMatMultExprTrait class template.
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:233
Header file for the TDMatSVecMultExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:364
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:159
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:254
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:374
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:428
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:236
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:232
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:158
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
System settings for the BLAS mode.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Utility type for generic codes.
If_< IsExpression< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:240
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:160
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:275
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:354
Constraints on the storage order of matrix types.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:237
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:235
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions. The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:396
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
Header file for the AreSIMDCombinable type trait.
Header file for the IsRowMajorMatrix type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:384
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:246
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Header file for the TDVecDMatMultExprTrait class template.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:231
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm).
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:290
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.