TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
55 #include <blaze/math/Functions.h>
56 #include <blaze/math/Intrinsics.h>
57 #include <blaze/math/shims/Reset.h>
96 #include <blaze/system/BLAS.h>
97 #include <blaze/system/Blocking.h>
100 #include <blaze/util/Assert.h>
101 #include <blaze/util/Complex.h>
105 #include <blaze/util/DisableIf.h>
106 #include <blaze/util/EnableIf.h>
107 #include <blaze/util/Exception.h>
108 #include <blaze/util/InvalidType.h>
110 #include <blaze/util/mpl/And.h>
111 #include <blaze/util/mpl/Not.h>
112 #include <blaze/util/mpl/Or.h>
113 #include <blaze/util/SelectType.h>
114 #include <blaze/util/Types.h>
124 
125 
126 namespace blaze {
127 
128 //=================================================================================================
129 //
130 // CLASS TDMATDMATMULTEXPR
131 //
132 //=================================================================================================
133 
134 //*************************************************************************************************
141 template< typename MT1 // Type of the left-hand side dense matrix
142  , typename MT2 > // Type of the right-hand side dense matrix
143 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
144  , private MatMatMultExpr
145  , private Computation
146 {
147  private:
148  //**Type definitions****************************************************************************
// Private shorthands for the types associated with the two operands:
// MT1 is the left-hand side and MT2 the right-hand side dense matrix type.
149  typedef typename MT1::ResultType RT1;  //!< Result type of the left-hand side operand.
150  typedef typename MT2::ResultType RT2;  //!< Result type of the right-hand side operand.
151  typedef typename RT1::ElementType ET1;  //!< Element type of the left-hand side result type.
152  typedef typename RT2::ElementType ET2;  //!< Element type of the right-hand side result type.
153  typedef typename MT1::CompositeType CT1;  //!< Composite type of the left-hand side operand.
154  typedef typename MT2::CompositeType CT2;  //!< Composite type of the right-hand side operand.
155  //**********************************************************************************************
156 
157  //**********************************************************************************************
160  //**********************************************************************************************
161 
162  //**********************************************************************************************
// Compile-time flag: non-zero when the right-hand side operand type MT2 is itself a
// computation or reports (via RequiresEvaluation) that it needs an intermediate evaluation.
164  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
165  //**********************************************************************************************
166 
167  //**********************************************************************************************
169 
// Helper trait: 'value' is non-zero when at least one of the two operands has to be
// evaluated (evaluateLeft or evaluateRight is set). The template parameters T1, T2 and T3
// do not take part in the computation of 'value'.
// NOTE(review): evaluateLeft is not declared in the visible part of this listing; it is
// presumably the left-hand side counterpart of evaluateRight - confirm against the full header.
173  template< typename T1, typename T2, typename T3 >
174  struct IsEvaluationRequired {
175  enum { value = ( evaluateLeft || evaluateRight ) };
176  };
178  //**********************************************************************************************
179 
180  //**********************************************************************************************
182 
// Helper trait deciding whether the BLAS-based kernel may be used for the type triple
// (T1 = target, T2 = left operand, T3 = right operand). 'value' is non-zero only if ALL of
// the following hold:
//  - BLAZE_BLAS_MODE is enabled,
//  - T1 offers mutable and T2/T3 offer constant direct data access,
//  - neither T2 nor T3 is a diagonal matrix,
//  - all three types are vectorizable,
//  - all three element types are BLAS compatible,
//  - the element types of T2 and T3 are identical to the element type of T1.
185  template< typename T1, typename T2, typename T3 >
186  struct UseBlasKernel {
187  enum { value = BLAZE_BLAS_MODE &&
188  HasMutableDataAccess<T1>::value &&
189  HasConstDataAccess<T2>::value &&
190  HasConstDataAccess<T3>::value &&
191  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
192  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
193  IsBlasCompatible<typename T1::ElementType>::value &&
194  IsBlasCompatible<typename T2::ElementType>::value &&
195  IsBlasCompatible<typename T3::ElementType>::value &&
196  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
197  IsSame< typename T1::ElementType, typename T3::ElementType >::value };
198  };
200  //**********************************************************************************************
201 
202  //**********************************************************************************************
204 
// Helper trait deciding whether the vectorized default kernel may be used for the type
// triple (T1 = target, T2 = left operand, T3 = right operand). 'value' is non-zero only if
// ALL of the following hold:
//  - useOptimizedKernels is set,
//  - T2 and T3 are not both diagonal,
//  - the combination "T2 diagonal and T1 column-major" does not apply,
//  - the combination "T3 diagonal and T1 row-major" does not apply,
//  - all three types are vectorizable,
//  - the element types of T2 and T3 are identical to the element type of T1,
//  - IntrinsicTrait reports addition, subtraction and multiplication for that element type.
207  template< typename T1, typename T2, typename T3 >
208  struct UseVectorizedDefaultKernel {
209  enum { value = useOptimizedKernels &&
210  !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
211  !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
212  !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
213  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
214  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
215  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
216  IntrinsicTrait<typename T1::ElementType>::addition &&
217  IntrinsicTrait<typename T1::ElementType>::subtraction &&
218  IntrinsicTrait<typename T1::ElementType>::multiplication };
219  };
221  //**********************************************************************************************
222 
223  public:
224  //**Type definitions****************************************************************************
// Public type definitions of the expression.
// NOTE(review): ElementType and ResultType are used below but their declarations are not
// part of the visible listing (the preceding listing lines are missing).
231  typedef const ElementType ReturnType;  //!< Type returned by operator() and at(): a const element value.
232  typedef const ResultType CompositeType;  //!< Composite type of this expression: a const result matrix.
233 
     // Type used to store the left operand: 'const MT1' (a copy) if MT1 is an expression,
     // 'const MT1&' (a reference) otherwise.
235  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
236 
     // Type used to store the right operand: 'const MT2' (a copy) if MT2 is an expression,
     // 'const MT2&' (a reference) otherwise.
238  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
239 
242 
245  //**********************************************************************************************
246 
247  //**Compilation flags***************************************************************************
// Compilation flag for vectorized evaluation: requires that the operands are not both
// diagonal and that both operand types are vectorizable, plus further conditions.
// NOTE(review): the initializer is truncated in this listing - listing lines 251-253, which
// hold the remaining conditions and the closing '};', are missing. Restore them from the
// original header before compiling.
249  enum { vectorizable = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
250  MT1::vectorizable && MT2::vectorizable &&
254 
     // Compilation flag for SMP assignment: set only if neither operand needs an intermediate
     // evaluation and both operand types are themselves SMP-assignable.
256  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
257  !evaluateRight && MT2::smpAssignable };
258  //**********************************************************************************************
259 
260  //**Constructor*********************************************************************************
/*!\brief Constructor for the TDMatDMatMultExpr class.
//
// \param lhs The left-hand side operand of the multiplication expression.
// \param rhs The right-hand side operand of the multiplication expression.
//
// Stores both operands. The sizes are only checked by an internal assertion: the number of
// columns of \a lhs has to match the number of rows of \a rhs.
*/
266  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs )
267  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
268  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
269  {
270  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
271  }
272  //**********************************************************************************************
273 
274  //**Access operator*****************************************************************************
/*!\brief 2D-access to the elements of the matrix product.
//
// \param i Access index for the row. Has to be smaller than the number of rows of the left operand.
// \param j Access index for the column. Has to be smaller than the number of columns of the right operand.
// \return The value of element (i,j) of the product, i.e. the sum over k of lhs_(i,k) * rhs_(k,j).
//
// The indices are only checked by internal assertions; use at() for checked access. The
// summation range [kbegin,kend) is narrowed at compile time according to the triangular
// structure of the two operands.
*/
281  inline ReturnType operator()( size_t i, size_t j ) const {
282  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
283  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
284 
     // First summation index that can contribute: row i of an upper lhs_ starts at column i
     // (i+1 if strictly upper), column j of a lower rhs_ starts at row j (j+1 if strictly lower).
285  const size_t kbegin( ( IsUpper<MT1>::value )
286  ?( ( IsLower<MT2>::value )
287  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
288  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
289  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
290  :( ( IsLower<MT2>::value )
291  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
292  :( 0UL ) ) );
     // One past the last summation index that can contribute (mirror image of kbegin).
293  const size_t kend( ( IsLower<MT1>::value )
294  ?( ( IsUpper<MT2>::value )
295  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
296  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
297  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
298  :( ( IsUpper<MT2>::value )
299  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
300  :( lhs_.columns() ) ) );
301 
     // Empty inner dimension or empty summation range: the element is a default-constructed value.
302  if( lhs_.columns() == 0UL ||
303  ( ( IsTriangular<MT1>::value || IsTriangular<MT2>::value ) && kbegin >= kend ) )
304  return ElementType();
305 
     // Diagonal left operand: only k == i contributes. The guard (listing line 306) had been
     // lost, which made this return unconditional and all code below it unreachable.
306  if( IsDiagonal<MT1>::value )
307  return lhs_(i,i) * rhs_(i,j);
308 
     // Diagonal right operand: only k == j contributes. The guard (listing line 309) is restored as well.
309  if( IsDiagonal<MT2>::value )
310  return lhs_(i,j) * rhs_(j,j);
311 
     // General case: the first product initializes the sum, the remaining knum-1 products are
     // added two per iteration. kpos is the index after the last complete pair.
312  const size_t knum( kend - kbegin );
313  const size_t kpos( kbegin + ( ( knum - 1UL ) & size_t(-2) ) + 1UL );
314 
315  ElementType tmp( lhs_(i,kbegin) * rhs_(kbegin,j) );
316 
317  for( size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
318  tmp += lhs_(i,k ) * rhs_(k ,j);
319  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
320  }
     // A single product is left over when the number of remaining products is odd.
321  if( kpos < kend ) {
322  tmp += lhs_(i,kpos) * rhs_(kpos,j);
323  }
324 
325  return tmp;
326  }
327  //**********************************************************************************************
328 
329  //**At function*********************************************************************************
/*!\brief Checked access to the elements of the matrix product.
//
// \param i Access index for the row.
// \param j Access index for the column.
// \return The value of element (i,j), computed by operator().
//
// In contrast to operator() the indices are validated at runtime: BLAZE_THROW_OUT_OF_RANGE is
// invoked if \a i is not smaller than the number of rows of the left operand or \a j is not
// smaller than the number of columns of the right operand. The row index is checked first.
*/
337  inline ReturnType at( size_t i, size_t j ) const {
338  if( i >= lhs_.rows() ) {
339  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
340  }
341  if( j >= rhs_.columns() ) {
342  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
343  }
344  return (*this)(i,j);
345  }
346  //**********************************************************************************************
347 
348  //**Rows function*******************************************************************************
/*!\brief Returns the number of rows of the product.
//
// \return The number of rows of the left-hand side operand.
*/
353  inline size_t rows() const {
354  return lhs_.rows();
355  }
356  //**********************************************************************************************
357 
358  //**Columns function****************************************************************************
/*!\brief Returns the number of columns of the product.
//
// \return The number of columns of the right-hand side operand.
*/
363  inline size_t columns() const {
364  return rhs_.columns();
365  }
366  //**********************************************************************************************
367 
368  //**Left operand access*************************************************************************
/*!\brief Returns the left-hand side operand of the multiplication.
//
// \return The stored left-hand side operand (of type LeftOperand).
*/
373  inline LeftOperand leftOperand() const {
374  return lhs_;
375  }
376  //**********************************************************************************************
377 
378  //**Right operand access************************************************************************
/*!\brief Returns the right-hand side operand of the multiplication.
//
// \return The stored right-hand side operand (of type RightOperand).
*/
383  inline RightOperand rightOperand() const {
384  return rhs_;
385  }
386  //**********************************************************************************************
387 
388  //**********************************************************************************************
/*!\brief Returns whether the expression can alias with the given address \a alias.
//
// \param alias The address to be checked.
// \return \a true if either operand reports being aliased with \a alias, \a false otherwise.
//
// Note that the check is forwarded to isAliased() of both operands, not to their canAlias().
*/
394  template< typename T >
395  inline bool canAlias( const T* alias ) const {
396  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
397  }
398  //**********************************************************************************************
399 
400  //**********************************************************************************************
/*!\brief Returns whether the expression is aliased with the given address \a alias.
//
// \param alias The address to be checked.
// \return \a true if either operand reports being aliased with \a alias, \a false otherwise.
*/
406  template< typename T >
407  inline bool isAliased( const T* alias ) const {
408  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
409  }
410  //**********************************************************************************************
411 
412  //**********************************************************************************************
/*!\brief Returns whether the operands of the expression are aligned.
//
// \return \a true only if both the left-hand and the right-hand side operand report being aligned.
*/
417  inline bool isAligned() const {
418  return lhs_.isAligned() && rhs_.isAligned();
419  }
420  //**********************************************************************************************
421 
422  //**********************************************************************************************
/*!\brief Returns whether the expression can be used in SMP assignments.
//
// \return \a true if both of the following hold, \a false otherwise:
//         - BLAZE_BLAS_IS_PARALLEL is not set, or rows()*columns() is below TDMATDMATMULT_THRESHOLD;
//         - columns() exceeds SMP_TDMATDMATMULT_THRESHOLD.
*/
427  inline bool canSMPAssign() const {
428  return ( !BLAZE_BLAS_IS_PARALLEL ||
429  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
430  ( columns() > SMP_TDMATDMATMULT_THRESHOLD );
431  }
432  //**********************************************************************************************
433 
434  private:
435  //**Member variables****************************************************************************
436  LeftOperand lhs_;  //!< Left-hand side dense matrix of the multiplication expression.
437  RightOperand rhs_;  //!< Right-hand side dense matrix of the multiplication expression.
438  //**********************************************************************************************
439 
440  //**Assignment to dense matrices****************************************************************
/*!\brief Assignment of the matrix product to a dense matrix (C=A*B).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// The function returns immediately for an empty target, resets the target if the inner
// dimension of the product is zero, and otherwise evaluates both operands serially and
// forwards to selectAssignKernel(). Matching sizes are only checked by internal assertions.
// NOTE(review): the types LT and RT of the evaluated operands are not declared in the
// visible part of this listing.
*/
453  template< typename MT // Type of the target dense matrix
454  , bool SO > // Storage order of the target dense matrix
455  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
456  {
458 
459  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
460  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
461 
     // Nothing to do for an empty target; a product over an empty inner dimension is all default values.
462  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
463  return;
464  }
465  else if( rhs.lhs_.columns() == 0UL ) {
466  reset( ~lhs );
467  return;
468  }
469 
470  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
471  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
472 
473  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
474  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
475  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
476  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
477  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
478  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
479 
480  TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
481  }
483  //**********************************************************************************************
484 
485  //**Assignment to dense matrices (kernel selection)*********************************************
/*!\brief Selection of the kernel for the assignment C=A*B.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Dispatches to selectSmallAssignKernel() or to selectBlasAssignKernel(). One operand of the
// deciding condition is that the number of elements of C is below TDMATDMATMULT_THRESHOLD.
// NOTE(review): listing line 501, which has to contain the opening 'if(' and the first
// operand of the condition, is missing from this listing - as shown the statement is
// incomplete. Restore the line from the original header.
*/
496  template< typename MT3 // Type of the left-hand side target matrix
497  , typename MT4 // Type of the left-hand side matrix operand
498  , typename MT5 > // Type of the right-hand side matrix operand
499  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
500  {
502  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
503  selectSmallAssignKernel( C, A, B );
504  else
505  selectBlasAssignKernel( C, A, B );
506  }
508  //**********************************************************************************************
509 
510  //**Default assignment to row-major dense matrices (general/general)****************************
/*!\brief Default kernel for C=A*B with a row-major target; neither A nor B is diagonal.
//
// \param C The target left-hand side dense matrix (DenseMatrix with storage flag \a false).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// The product is computed row by row. For each row i the summation range [kbegin,kend) is
// restricted according to the triangular structure of A. The first summand (k == kbegin)
// initializes the row by assignment, all further summands are accumulated.
*/
524  template< typename MT3 // Type of the left-hand side target matrix
525  , typename MT4 // Type of the left-hand side matrix operand
526  , typename MT5 > // Type of the right-hand side matrix operand
527  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
528  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
529  {
530  const size_t M( A.rows() );
531  const size_t N( B.columns() );
532  const size_t K( A.columns() );
533 
534  for( size_t i=0UL; i<M; ++i )
535  {
     // Range of k for which A(i,k) is not structurally excluded by an upper/lower A.
536  const size_t kbegin( ( IsUpper<MT4>::value )
537  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
538  :( 0UL ) );
539  const size_t kend( ( IsLower<MT4>::value )
540  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
541  :( K ) );
542  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
543 
     // Strictly triangular A with an empty range for this row: the whole row of C is reset.
544  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
545  for( size_t j=0UL; j<N; ++j ) {
546  reset( (~C)(i,j) );
547  }
548  continue;
549  }
550 
     // First summand (k == kbegin): assigns the covered columns of row i and resets columns
     // outside [jbegin,jend) in the cases handled below.
551  {
552  const size_t jbegin( ( IsUpper<MT5>::value )
553  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
554  :( 0UL ) );
555  const size_t jend( ( IsLower<MT5>::value )
556  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
557  :( N ) );
558  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
559 
560  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
561  for( size_t j=0UL; j<jbegin; ++j ) {
562  reset( (~C)(i,j) );
563  }
564  }
565  else if( IsStrictlyUpper<MT5>::value ) {
566  reset( (~C)(i,0UL) );
567  }
568  for( size_t j=jbegin; j<jend; ++j ) {
569  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
570  }
571  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
572  for( size_t j=jend; j<N; ++j ) {
573  reset( (~C)(i,j) );
574  }
575  }
576  else if( IsStrictlyLower<MT5>::value ) {
577  reset( (~C)(i,N-1UL) );
578  }
579  }
580 
     // Remaining summands: accumulate into the columns [jbegin,jend) of row i.
581  for( size_t k=kbegin+1UL; k<kend; ++k )
582  {
583  const size_t jbegin( ( IsUpper<MT5>::value )
584  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
585  :( 0UL ) );
586  const size_t jend( ( IsLower<MT5>::value )
587  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
588  :( N ) );
589  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
590 
591  for( size_t j=jbegin; j<jend; ++j ) {
592  (~C)(i,j) += A(i,k) * B(k,j);
593  }
     // For a lower B the element in column jend is assigned (not accumulated) in this step.
594  if( IsLower<MT5>::value ) {
595  (~C)(i,jend) = A(i,k) * B(k,jend);
596  }
597  }
598  }
599  }
601  //**********************************************************************************************
602 
603  //**Default assignment to column-major dense matrices (general/general)*************************
/*!\brief Default kernel for C=A*B with a column-major target; neither A nor B is diagonal.
//
// \param C The target left-hand side dense matrix (DenseMatrix with storage flag \a true).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// The product is computed column by column. For each column j the summation range
// [kbegin,kend) is restricted according to the triangular structure of B. The first summand
// (k == kbegin) initializes the column by assignment, all further summands are accumulated.
*/
617  template< typename MT3 // Type of the left-hand side target matrix
618  , typename MT4 // Type of the left-hand side matrix operand
619  , typename MT5 > // Type of the right-hand side matrix operand
620  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
621  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
622  {
623  const size_t M( A.rows() );
624  const size_t N( B.columns() );
625  const size_t K( A.columns() );
626 
627  for( size_t j=0UL; j<N; ++j )
628  {
     // Range of k for which B(k,j) is not structurally excluded by a lower/upper B.
629  const size_t kbegin( ( IsLower<MT5>::value )
630  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
631  :( 0UL ) );
632  const size_t kend( ( IsUpper<MT5>::value )
633  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
634  :( K ) );
635  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
636 
     // Strictly triangular B with an empty range for this column: the whole column of C is reset.
637  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
638  for( size_t i=0UL; i<M; ++i ) {
639  reset( (~C)(i,j) );
640  }
641  continue;
642  }
643 
     // First summand (k == kbegin): assigns the covered rows of column j and resets rows
     // outside [ibegin,iend) in the cases handled below.
644  {
645  const size_t ibegin( ( IsLower<MT4>::value )
646  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
647  :( 0UL ) );
648  const size_t iend( ( IsUpper<MT4>::value )
649  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
650  :( M ) );
651  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
652 
653  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
654  for( size_t i=0UL; i<ibegin; ++i ) {
655  reset( (~C)(i,j) );
656  }
657  }
658  else if( IsStrictlyLower<MT4>::value ) {
659  reset( (~C)(0UL,j) );
660  }
661  for( size_t i=ibegin; i<iend; ++i ) {
662  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
663  }
664  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
665  for( size_t i=iend; i<M; ++i ) {
666  reset( (~C)(i,j) );
667  }
668  }
669  else if( IsStrictlyUpper<MT4>::value ) {
670  reset( (~C)(M-1UL,j) );
671  }
672  }
673 
     // Remaining summands: accumulate into the rows [ibegin,iend) of column j.
674  for( size_t k=kbegin+1UL; k<kend; ++k )
675  {
676  const size_t ibegin( ( IsLower<MT4>::value )
677  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
678  :( 0UL ) );
679  const size_t iend( ( IsUpper<MT4>::value )
680  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
681  :( M ) );
682  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
683 
684  for( size_t i=ibegin; i<iend; ++i ) {
685  (~C)(i,j) += A(i,k) * B(k,j);
686  }
     // For an upper A the element in row iend is assigned (not accumulated) in this step.
687  if( IsUpper<MT4>::value ) {
688  (~C)(iend,j) = A(iend,k) * B(k,j);
689  }
690  }
691  }
692  }
694  //**********************************************************************************************
695 
696  //**Default assignment to row-major dense matrices (general/diagonal)***************************
/*!\brief Default kernel for C=A*B with a row-major target, a non-diagonal A and a diagonal B.
//
// \param C The target left-hand side dense matrix (DenseMatrix with storage flag \a false).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
//
// With a diagonal B every element reduces to C(i,j) = A(i,j) * B(j,j). The matrix is
// traversed in square tiles of edge length BLOCK_SIZE.
*/
710  template< typename MT3 // Type of the left-hand side target matrix
711  , typename MT4 // Type of the left-hand side matrix operand
712  , typename MT5 > // Type of the right-hand side matrix operand
713  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
714  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
715  {
716  const size_t M( A.rows() );
717  const size_t N( B.columns() );
718 
719  const size_t block( BLOCK_SIZE );
720 
721  for( size_t ii=0UL; ii<M; ii+=block ) {
722  const size_t iend( min( M, ii+block ) );
723  for( size_t jj=0UL; jj<N; jj+=block ) {
724  const size_t jend( min( N, jj+block ) );
725  for( size_t i=ii; i<iend; ++i )
726  {
     // Columns of the current tile that an upper/lower A does not structurally exclude in row i.
727  const size_t jbegin( ( IsUpper<MT4>::value )
728  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
729  :( jj ) );
730  const size_t jpos( ( IsLower<MT4>::value )
731  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
732  :( jend ) );
733 
     // Tile columns before jbegin (upper A) and from jpos on (lower A) are reset.
734  if( IsUpper<MT4>::value ) {
735  for( size_t j=jj; j<jbegin; ++j ) {
736  reset( (~C)(i,j) );
737  }
738  }
739  for( size_t j=jbegin; j<jpos; ++j ) {
740  (~C)(i,j) = A(i,j) * B(j,j);
741  }
742  if( IsLower<MT4>::value ) {
743  for( size_t j=jpos; j<jend; ++j ) {
744  reset( (~C)(i,j) );
745  }
746  }
747  }
748  }
749  }
750  }
752  //**********************************************************************************************
753 
754  //**Default assignment to column-major dense matrices (general/diagonal)************************
/*!\brief Default kernel for C=A*B with a column-major target, a non-diagonal A and a diagonal B.
//
// \param C The target left-hand side dense matrix (DenseMatrix with storage flag \a true).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
//
// With a diagonal B every element reduces to C(i,j) = A(i,j) * B(j,j). The matrix is
// traversed column by column.
*/
768  template< typename MT3 // Type of the left-hand side target matrix
769  , typename MT4 // Type of the left-hand side matrix operand
770  , typename MT5 > // Type of the right-hand side matrix operand
771  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
772  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
773  {
774  const size_t M( A.rows() );
775  const size_t N( B.columns() );
776 
777  for( size_t j=0UL; j<N; ++j )
778  {
     // Rows of column j that a lower/upper A does not structurally exclude.
779  const size_t ibegin( ( IsLower<MT4>::value )
780  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
781  :( 0UL ) );
782  const size_t iend( ( IsUpper<MT4>::value )
783  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
784  :( M ) );
785  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
786 
     // Rows before ibegin (lower A) and from iend on (upper A) are reset.
787  if( IsLower<MT4>::value ) {
788  for( size_t i=0UL; i<ibegin; ++i ) {
789  reset( (~C)(i,j) );
790  }
791  }
792  for( size_t i=ibegin; i<iend; ++i ) {
793  (~C)(i,j) = A(i,j) * B(j,j);
794  }
795  if( IsUpper<MT4>::value ) {
796  for( size_t i=iend; i<M; ++i ) {
797  reset( (~C)(i,j) );
798  }
799  }
800  }
801  }
803  //**********************************************************************************************
804 
805  //**Default assignment to row-major dense matrices (diagonal/general)***************************
/*!\brief Default kernel for C=A*B with a row-major target, a diagonal A and a non-diagonal B.
//
// \param C The target left-hand side dense matrix (DenseMatrix with storage flag \a false).
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
//
// With a diagonal A every element reduces to C(i,j) = A(i,i) * B(i,j). The matrix is
// traversed row by row.
*/
819  template< typename MT3 // Type of the left-hand side target matrix
820  , typename MT4 // Type of the left-hand side matrix operand
821  , typename MT5 > // Type of the right-hand side matrix operand
822  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
823  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
824  {
825  const size_t M( A.rows() );
826  const size_t N( B.columns() );
827 
828  for( size_t i=0UL; i<M; ++i )
829  {
     // Columns of row i that an upper/lower B does not structurally exclude.
830  const size_t jbegin( ( IsUpper<MT5>::value )
831  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
832  :( 0UL ) );
833  const size_t jend( ( IsLower<MT5>::value )
834  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
835  :( N ) );
836  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
837 
     // Columns before jbegin (upper B) and from jend on (lower B) are reset.
838  if( IsUpper<MT5>::value ) {
839  for( size_t j=0UL; j<jbegin; ++j ) {
840  reset( (~C)(i,j) );
841  }
842  }
843  for( size_t j=jbegin; j<jend; ++j ) {
844  (~C)(i,j) = A(i,i) * B(i,j);
845  }
846  if( IsLower<MT5>::value ) {
847  for( size_t j=jend; j<N; ++j ) {
848  reset( (~C)(i,j) );
849  }
850  }
851  }
852  }
854  //**********************************************************************************************
855 
856  //**Default assignment to column-major dense matrices (diagonal/general)************************
/*!\brief Default kernel for C=A*B with a column-major target, a diagonal A and a non-diagonal B.
//
// \param C The target left-hand side dense matrix (DenseMatrix with storage flag \a true).
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
//
// With a diagonal A every element reduces to C(i,j) = A(i,i) * B(i,j). The matrix is
// traversed in square tiles of edge length BLOCK_SIZE.
*/
870  template< typename MT3 // Type of the left-hand side target matrix
871  , typename MT4 // Type of the left-hand side matrix operand
872  , typename MT5 > // Type of the right-hand side matrix operand
873  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
874  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
875  {
876  const size_t M( A.rows() );
877  const size_t N( B.columns() );
878 
879  const size_t block( BLOCK_SIZE );
880 
881  for( size_t jj=0UL; jj<N; jj+=block ) {
882  const size_t jend( min( N, jj+block ) );
883  for( size_t ii=0UL; ii<M; ii+=block ) {
884  const size_t iend( min( M, ii+block ) );
885  for( size_t j=jj; j<jend; ++j )
886  {
     // Rows of the current tile that a lower/upper B does not structurally exclude in column j.
887  const size_t ibegin( ( IsLower<MT5>::value )
888  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
889  :( ii ) );
890  const size_t ipos( ( IsUpper<MT5>::value )
891  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
892  :( iend ) );
893 
     // Tile rows before ibegin (lower B) and from ipos on (upper B) are reset.
894  if( IsLower<MT5>::value ) {
895  for( size_t i=ii; i<ibegin; ++i ) {
896  reset( (~C)(i,j) );
897  }
898  }
899  for( size_t i=ibegin; i<ipos; ++i ) {
900  (~C)(i,j) = A(i,i) * B(i,j);
901  }
902  if( IsUpper<MT5>::value ) {
903  for( size_t i=ipos; i<iend; ++i ) {
904  reset( (~C)(i,j) );
905  }
906  }
907  }
908  }
909  }
910  }
912  //**********************************************************************************************
913 
914  //**Default assignment to dense matrices (diagonal/diagonal)************************************
/*!\brief Default kernel for C=A*B where both A and B are diagonal.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
//
// The target is reset first, then only its diagonal is written: C(i,i) = A(i,i) * B(i,i)
// for all i smaller than the number of rows of A.
*/
928  template< typename MT3 // Type of the left-hand side target matrix
929  , typename MT4 // Type of the left-hand side matrix operand
930  , typename MT5 > // Type of the right-hand side matrix operand
931  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
932  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
933  {
934  reset( C );
935 
936  for( size_t i=0UL; i<A.rows(); ++i ) {
937  C(i,i) = A(i,i) * B(i,i);
938  }
939  }
941  //**********************************************************************************************
942 
943  //**Default assignment to dense matrices (small matrices)***************************************
/*!\brief Small-matrix kernel for C=A*B when the vectorized default kernel is not applicable.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// This overload is enabled only if UseVectorizedDefaultKernel<MT3,MT4,MT5> is not set. It
// simply forwards to selectDefaultAssignKernel().
*/
957  template< typename MT3 // Type of the left-hand side target matrix
958  , typename MT4 // Type of the left-hand side matrix operand
959  , typename MT5 > // Type of the right-hand side matrix operand
960  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
961  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
962  {
963  selectDefaultAssignKernel( ~C, A, B );
964  }
966  //**********************************************************************************************
967 
968  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
983  template< typename MT3 // Type of the left-hand side target matrix
984  , typename MT4 // Type of the left-hand side matrix operand
985  , typename MT5 > // Type of the right-hand side matrix operand
986  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
987  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
988  {
989  typedef IntrinsicTrait<ElementType> IT;
990 
991  const size_t M( A.rows() );
992  const size_t N( B.columns() );
993  const size_t K( A.columns() );
994 
995  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
996 
997  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
998  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
999 
1000  size_t j( 0UL );
1001 
1002  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
1003  for( size_t i=0UL; i<M; ++i )
1004  {
1005  const size_t kbegin( ( IsUpper<MT4>::value )
1006  ?( ( IsLower<MT5>::value )
1007  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1008  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1009  :( IsLower<MT5>::value ? j : 0UL ) );
1010  const size_t kend( ( IsLower<MT4>::value )
1011  ?( ( IsUpper<MT5>::value )
1012  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
1013  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1014  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
1015 
1016  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1017 
1018  for( size_t k=kbegin; k<kend; ++k ) {
1019  const IntrinsicType a1( set( A(i,k) ) );
1020  xmm1 = xmm1 + a1 * B.load(k,j );
1021  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1022  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1023  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1024  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1025  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1026  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1027  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1028  }
1029 
1030  (~C).store( i, j , xmm1 );
1031  (~C).store( i, j+IT::size , xmm2 );
1032  (~C).store( i, j+IT::size*2UL, xmm3 );
1033  (~C).store( i, j+IT::size*3UL, xmm4 );
1034  (~C).store( i, j+IT::size*4UL, xmm5 );
1035  (~C).store( i, j+IT::size*5UL, xmm6 );
1036  (~C).store( i, j+IT::size*6UL, xmm7 );
1037  (~C).store( i, j+IT::size*7UL, xmm8 );
1038  }
1039  }
1040 
1041  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
1042  {
1043  size_t i( 0UL );
1044 
1045  for( ; (i+2UL) <= M; i+=2UL )
1046  {
1047  const size_t kbegin( ( IsUpper<MT4>::value )
1048  ?( ( IsLower<MT5>::value )
1049  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1050  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1051  :( IsLower<MT5>::value ? j : 0UL ) );
1052  const size_t kend( ( IsLower<MT4>::value )
1053  ?( ( IsUpper<MT5>::value )
1054  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
1055  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1056  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
1057 
1058  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1059 
1060  for( size_t k=kbegin; k<kend; ++k ) {
1061  const IntrinsicType a1( set( A(i ,k) ) );
1062  const IntrinsicType a2( set( A(i+1UL,k) ) );
1063  const IntrinsicType b1( B.load(k,j ) );
1064  const IntrinsicType b2( B.load(k,j+IT::size ) );
1065  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
1066  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
1067  xmm1 = xmm1 + a1 * b1;
1068  xmm2 = xmm2 + a1 * b2;
1069  xmm3 = xmm3 + a1 * b3;
1070  xmm4 = xmm4 + a1 * b4;
1071  xmm5 = xmm5 + a2 * b1;
1072  xmm6 = xmm6 + a2 * b2;
1073  xmm7 = xmm7 + a2 * b3;
1074  xmm8 = xmm8 + a2 * b4;
1075  }
1076 
1077  (~C).store( i , j , xmm1 );
1078  (~C).store( i , j+IT::size , xmm2 );
1079  (~C).store( i , j+IT::size*2UL, xmm3 );
1080  (~C).store( i , j+IT::size*3UL, xmm4 );
1081  (~C).store( i+1UL, j , xmm5 );
1082  (~C).store( i+1UL, j+IT::size , xmm6 );
1083  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
1084  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
1085  }
1086 
1087  if( i < M )
1088  {
1089  const size_t kbegin( ( IsUpper<MT4>::value )
1090  ?( ( IsLower<MT5>::value )
1091  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1092  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1093  :( IsLower<MT5>::value ? j : 0UL ) );
1094  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
1095 
1096  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1097 
1098  for( size_t k=kbegin; k<kend; ++k ) {
1099  const IntrinsicType a1( set( A(i,k) ) );
1100  xmm1 = xmm1 + a1 * B.load(k,j );
1101  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1102  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1103  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1104  }
1105 
1106  (~C).store( i, j , xmm1 );
1107  (~C).store( i, j+IT::size , xmm2 );
1108  (~C).store( i, j+IT::size*2UL, xmm3 );
1109  (~C).store( i, j+IT::size*3UL, xmm4 );
1110  }
1111  }
1112 
1113  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
1114  {
1115  size_t i( 0UL );
1116 
1117  for( ; (i+2UL) <= M; i+=2UL )
1118  {
1119  const size_t kbegin( ( IsUpper<MT4>::value )
1120  ?( ( IsLower<MT5>::value )
1121  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1122  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1123  :( IsLower<MT5>::value ? j : 0UL ) );
1124  const size_t kend( ( IsLower<MT4>::value )
1125  ?( ( IsUpper<MT5>::value )
1126  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
1127  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1128  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
1129 
1130  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1131 
1132  for( size_t k=kbegin; k<kend; ++k ) {
1133  const IntrinsicType a1( set( A(i ,k) ) );
1134  const IntrinsicType a2( set( A(i+1UL,k) ) );
1135  const IntrinsicType b1( B.load(k,j ) );
1136  const IntrinsicType b2( B.load(k,j+IT::size) );
1137  xmm1 = xmm1 + a1 * b1;
1138  xmm2 = xmm2 + a1 * b2;
1139  xmm3 = xmm3 + a2 * b1;
1140  xmm4 = xmm4 + a2 * b2;
1141  }
1142 
1143  (~C).store( i , j , xmm1 );
1144  (~C).store( i , j+IT::size, xmm2 );
1145  (~C).store( i+1UL, j , xmm3 );
1146  (~C).store( i+1UL, j+IT::size, xmm4 );
1147  }
1148 
1149  if( i < M )
1150  {
1151  const size_t kbegin( ( IsUpper<MT4>::value )
1152  ?( ( IsLower<MT5>::value )
1153  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1154  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1155  :( IsLower<MT5>::value ? j : 0UL ) );
1156  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
1157 
1158  IntrinsicType xmm1, xmm2;
1159 
1160  for( size_t k=kbegin; k<kend; ++k ) {
1161  const IntrinsicType a1( set( A(i,k) ) );
1162  xmm1 = xmm1 + a1 * B.load(k,j );
1163  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
1164  }
1165 
1166  (~C).store( i, j , xmm1 );
1167  (~C).store( i, j+IT::size, xmm2 );
1168  }
1169  }
1170 
1171  for( ; j<jpos; j+=IT::size )
1172  {
1173  size_t i( 0UL );
1174 
1175  for( ; (i+2UL) <= M; i+=2UL )
1176  {
1177  const size_t kbegin( ( IsUpper<MT4>::value )
1178  ?( ( IsLower<MT5>::value )
1179  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1180  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1181  :( IsLower<MT5>::value ? j : 0UL ) );
1182  const size_t kend( ( IsLower<MT4>::value )
1183  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1184  :( K ) );
1185 
1186  IntrinsicType xmm1, xmm2;
1187 
1188  for( size_t k=kbegin; k<kend; ++k ) {
1189  const IntrinsicType b1( B.load(k,j) );
1190  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1191  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1192  }
1193 
1194  (~C).store( i , j, xmm1 );
1195  (~C).store( i+1UL, j, xmm2 );
1196  }
1197 
1198  if( i < M )
1199  {
1200  const size_t kbegin( ( IsUpper<MT4>::value )
1201  ?( ( IsLower<MT5>::value )
1202  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1203  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1204  :( IsLower<MT5>::value ? j : 0UL ) );
1205 
1206  IntrinsicType xmm1;
1207 
1208  for( size_t k=kbegin; k<K; ++k ) {
1209  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
1210  }
1211 
1212  (~C).store( i, j, xmm1 );
1213  }
1214  }
1215 
1216  for( ; remainder && j<N; ++j )
1217  {
1218  size_t i( 0UL );
1219 
1220  for( ; (i+2UL) <= M; i+=2UL )
1221  {
1222  const size_t kbegin( ( IsUpper<MT4>::value )
1223  ?( ( IsLower<MT5>::value )
1224  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1225  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1226  :( IsLower<MT5>::value ? j : 0UL ) );
1227  const size_t kend( ( IsLower<MT4>::value )
1228  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1229  :( K ) );
1230 
1231  ElementType value1 = ElementType();
1232  ElementType value2 = ElementType();
1233 
1234  for( size_t k=kbegin; k<kend; ++k ) {
1235  value1 += A(i ,k) * B(k,j);
1236  value2 += A(i+1UL,k) * B(k,j);
1237  }
1238 
1239  (~C)(i ,j) = value1;
1240  (~C)(i+1UL,j) = value2;
1241  }
1242 
1243  if( i < M )
1244  {
1245  const size_t kbegin( ( IsUpper<MT4>::value )
1246  ?( ( IsLower<MT5>::value )
1247  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1248  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1249  :( IsLower<MT5>::value ? j : 0UL ) );
1250 
1251  ElementType value = ElementType();
1252 
1253  for( size_t k=kbegin; k<K; ++k ) {
1254  value += A(i,k) * B(k,j);
1255  }
1256 
1257  (~C)(i,j) = value;
1258  }
1259  }
1260  }
1262  //**********************************************************************************************
1263 
1264  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized default assignment kernel for small matrices, column-major target (C = A * B).
//
// \param C The target left-hand side dense matrix (DenseMatrix<MT3,true>); every computed
//          element is overwritten, prior contents are not read.
// \param A The left-hand side multiplication operand (loaded as vectors along its rows index).
// \param B The right-hand side multiplication operand (read element-wise and broadcast via set()).
// \return void
//
// The overload is only enabled if UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Rows of C are
// processed in panels of 8, 4, 2 and 1 times IT::size rows; whatever is left over is handled by
// a scalar loop. For every tile the k range [kbegin,kend) is narrowed via the IsUpper/IsLower/
// IsStrictlyUpper/IsStrictlyLower traits of the operands.
// NOTE(review): the narrowing presumably skips structurally zero elements of triangular
// operands -- confirm against the trait definitions.
// NOTE(review): the xmm accumulators are used without explicit initialization, so the kernel
// relies on a default-constructed IntrinsicType being zero -- TODO confirm.
1279  template< typename MT3 // Type of the left-hand side target matrix
1280  , typename MT4 // Type of the left-hand side matrix operand
1281  , typename MT5 > // Type of the right-hand side matrix operand
1282  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1283  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1284  {
1285  typedef IntrinsicTrait<ElementType> IT;
1286 
1287  const size_t M( A.rows() );
1288  const size_t N( B.columns() );
1289  const size_t K( A.columns() );
1290 
// A scalar remainder loop is needed unless both the target and the left operand are padded.
1291  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1292 
// ipos: end of the vectorized row range. With a remainder it is M rounded down to a multiple
// of IT::size (see the assertion); otherwise it is M itself.
1293  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
1294  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
1295 
1296  size_t i( 0UL );
1297 
// Panels of 8*IT::size rows, one column of C at a time.
1298  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
1299  for( size_t j=0UL; j<N; ++j )
1300  {
1301  const size_t kbegin( ( IsLower<MT5>::value )
1302  ?( ( IsUpper<MT4>::value )
1303  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1304  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1305  :( IsUpper<MT4>::value ? i : 0UL ) );
1306  const size_t kend( ( IsUpper<MT5>::value )
1307  ?( ( IsLower<MT4>::value )
1308  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1309  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1310  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
1311 
1312  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1313 
1314  for( size_t k=kbegin; k<kend; ++k ) {
1315  const IntrinsicType b1( set( B(k,j) ) );
1316  xmm1 = xmm1 + A.load(i ,k) * b1;
1317  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1318  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1319  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1320  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
1321  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
1322  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
1323  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
1324  }
1325 
1326  (~C).store( i , j, xmm1 );
1327  (~C).store( i+IT::size , j, xmm2 );
1328  (~C).store( i+IT::size*2UL, j, xmm3 );
1329  (~C).store( i+IT::size*3UL, j, xmm4 );
1330  (~C).store( i+IT::size*4UL, j, xmm5 );
1331  (~C).store( i+IT::size*5UL, j, xmm6 );
1332  (~C).store( i+IT::size*6UL, j, xmm7 );
1333  (~C).store( i+IT::size*7UL, j, xmm8 );
1334  }
1335  }
1336 
// Panels of 4*IT::size rows, two columns of C at a time plus a single-column tail.
1337  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
1338  {
1339  size_t j( 0UL );
1340 
1341  for( ; (j+2UL) <= N; j+=2UL )
1342  {
1343  const size_t kbegin( ( IsLower<MT5>::value )
1344  ?( ( IsUpper<MT4>::value )
1345  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1346  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1347  :( IsUpper<MT4>::value ? i : 0UL ) );
1348  const size_t kend( ( IsUpper<MT5>::value )
1349  ?( ( IsLower<MT4>::value )
1350  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1351  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1352  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
1353 
1354  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1355 
1356  for( size_t k=kbegin; k<kend; ++k ) {
1357  const IntrinsicType a1( A.load(i ,k) );
1358  const IntrinsicType a2( A.load(i+IT::size ,k) );
1359  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
1360  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
1361  const IntrinsicType b1( set( B(k,j ) ) );
1362  const IntrinsicType b2( set( B(k,j+1UL) ) );
1363  xmm1 = xmm1 + a1 * b1;
1364  xmm2 = xmm2 + a2 * b1;
1365  xmm3 = xmm3 + a3 * b1;
1366  xmm4 = xmm4 + a4 * b1;
1367  xmm5 = xmm5 + a1 * b2;
1368  xmm6 = xmm6 + a2 * b2;
1369  xmm7 = xmm7 + a3 * b2;
1370  xmm8 = xmm8 + a4 * b2;
1371  }
1372 
1373  (~C).store( i , j , xmm1 );
1374  (~C).store( i+IT::size , j , xmm2 );
1375  (~C).store( i+IT::size*2UL, j , xmm3 );
1376  (~C).store( i+IT::size*3UL, j , xmm4 );
1377  (~C).store( i , j+1UL, xmm5 );
1378  (~C).store( i+IT::size , j+1UL, xmm6 );
1379  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
1380  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
1381  }
1382 
// Last column when N is odd; here kend is only narrowed by the IsLower<MT4> trait.
1383  if( j < N )
1384  {
1385  const size_t kbegin( ( IsLower<MT5>::value )
1386  ?( ( IsUpper<MT4>::value )
1387  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1388  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1389  :( IsUpper<MT4>::value ? i : 0UL ) );
1390  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
1391 
1392  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1393 
1394  for( size_t k=kbegin; k<kend; ++k ) {
1395  const IntrinsicType b1( set( B(k,j) ) );
1396  xmm1 = xmm1 + A.load(i ,k) * b1;
1397  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1398  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1399  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1400  }
1401 
1402  (~C).store( i , j, xmm1 );
1403  (~C).store( i+IT::size , j, xmm2 );
1404  (~C).store( i+IT::size*2UL, j, xmm3 );
1405  (~C).store( i+IT::size*3UL, j, xmm4 );
1406  }
1407  }
1408 
// Panels of 2*IT::size rows, two columns of C at a time plus a single-column tail.
1409  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
1410  {
1411  size_t j( 0UL );
1412 
1413  for( ; (j+2UL) <= N; j+=2UL )
1414  {
1415  const size_t kbegin( ( IsLower<MT5>::value )
1416  ?( ( IsUpper<MT4>::value )
1417  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1418  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1419  :( IsUpper<MT4>::value ? i : 0UL ) );
1420  const size_t kend( ( IsUpper<MT5>::value )
1421  ?( ( IsLower<MT4>::value )
1422  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1423  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1424  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
1425 
1426  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1427 
1428  for( size_t k=kbegin; k<kend; ++k ) {
1429  const IntrinsicType a1( A.load(i ,k) );
1430  const IntrinsicType a2( A.load(i+IT::size,k) );
1431  const IntrinsicType b1( set( B(k,j ) ) );
1432  const IntrinsicType b2( set( B(k,j+1UL) ) );
1433  xmm1 = xmm1 + a1 * b1;
1434  xmm2 = xmm2 + a2 * b1;
1435  xmm3 = xmm3 + a1 * b2;
1436  xmm4 = xmm4 + a2 * b2;
1437  }
1438 
1439  (~C).store( i , j , xmm1 );
1440  (~C).store( i+IT::size, j , xmm2 );
1441  (~C).store( i , j+1UL, xmm3 );
1442  (~C).store( i+IT::size, j+1UL, xmm4 );
1443  }
1444 
1445  if( j < N )
1446  {
1447  const size_t kbegin( ( IsLower<MT5>::value )
1448  ?( ( IsUpper<MT4>::value )
1449  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1450  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1451  :( IsUpper<MT4>::value ? i : 0UL ) );
1452  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
1453 
1454  IntrinsicType xmm1, xmm2;
1455 
1456  for( size_t k=kbegin; k<kend; ++k ) {
1457  const IntrinsicType b1( set( B(k,j) ) );
1458  xmm1 = xmm1 + A.load(i ,k) * b1;
1459  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
1460  }
1461 
1462  (~C).store( i , j, xmm1 );
1463  (~C).store( i+IT::size, j, xmm2 );
1464  }
1465  }
1466 
// Panels of IT::size rows, two columns of C at a time plus a single-column tail.
1467  for( ; i<ipos; i+=IT::size )
1468  {
1469  size_t j( 0UL );
1470 
1471  for( ; (j+2UL) <= N; j+=2UL )
1472  {
1473  const size_t kbegin( ( IsLower<MT5>::value )
1474  ?( ( IsUpper<MT4>::value )
1475  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1476  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1477  :( IsUpper<MT4>::value ? i : 0UL ) );
1478  const size_t kend( ( IsUpper<MT5>::value )
1479  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1480  :( K ) );
1481 
1482  IntrinsicType xmm1, xmm2;
1483 
1484  for( size_t k=kbegin; k<kend; ++k ) {
1485  const IntrinsicType a1( A.load(i,k) );
1486  xmm1 = xmm1 + a1 * set( B(k,j ) );
1487  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1488  }
1489 
1490  (~C).store( i, j , xmm1 );
1491  (~C).store( i, j+1UL, xmm2 );
1492  }
1493 
1494  if( j < N )
1495  {
1496  const size_t kbegin( ( IsLower<MT5>::value )
1497  ?( ( IsUpper<MT4>::value )
1498  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1499  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1500  :( IsUpper<MT4>::value ? i : 0UL ) );
1501 
1502  IntrinsicType xmm1;
1503 
1504  for( size_t k=kbegin; k<K; ++k ) {
1505  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
1506  }
1507 
1508  (~C).store( i, j, xmm1 );
1509  }
1510  }
1511 
// Scalar remainder: rows [ipos,M) are computed element-wise, only if 'remainder' is set.
1512  for( ; remainder && i<M; ++i )
1513  {
1514  size_t j( 0UL );
1515 
1516  for( ; (j+2UL) <= N; j+=2UL )
1517  {
1518  const size_t kbegin( ( IsLower<MT5>::value )
1519  ?( ( IsUpper<MT4>::value )
1520  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1521  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1522  :( IsUpper<MT4>::value ? i : 0UL ) );
1523  const size_t kend( ( IsUpper<MT5>::value )
1524  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1525  :( K ) );
1526 
1527  ElementType value1 = ElementType();
1528  ElementType value2 = ElementType();
1529 
1530  for( size_t k=kbegin; k<kend; ++k ) {
1531  value1 += A(i,k) * B(k,j );
1532  value2 += A(i,k) * B(k,j+1UL);
1533  }
1534 
1535  (~C)(i,j ) = value1;
1536  (~C)(i,j+1UL) = value2;
1537  }
1538 
1539  if( j < N )
1540  {
1541  const size_t kbegin( ( IsLower<MT5>::value )
1542  ?( ( IsUpper<MT4>::value )
1543  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1544  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1545  :( IsUpper<MT4>::value ? i : 0UL ) );
1546 
1547  ElementType value = ElementType();
1548 
1549  for( size_t k=kbegin; k<K; ++k ) {
1550  value += A(i,k) * B(k,j);
1551  }
1552 
1553  (~C)(i,j) = value;
1554  }
1555  }
1556  }
1558  //**********************************************************************************************
1559 
1560  //**Default assignment to dense matrices (large matrices)***************************************
// Default assignment kernel for large matrices (non-vectorized fallback, C = A * B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload is guarded by DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >, i.e. it is
// the counterpart of the EnableIf-guarded vectorized overloads of the same name. It performs no
// work of its own and forwards all three arguments unchanged to selectDefaultAssignKernel().
1574  template< typename MT3 // Type of the left-hand side target matrix
1575  , typename MT4 // Type of the left-hand side matrix operand
1576  , typename MT5 > // Type of the right-hand side matrix operand
1577  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1578  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1579  {
1580  selectDefaultAssignKernel( C, A, B );
1581  }
1583  //**********************************************************************************************
1584 
1585  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
// Vectorized default assignment kernel for large matrices, row-major target (C = A * B).
//
// \param C The target left-hand side dense matrix (DenseMatrix<MT3,false>); each block of C is
//          reset and then accumulated into, so prior contents do not contribute to the result.
// \param A The left-hand side multiplication operand (read element-wise and broadcast via set()).
// \param B The right-hand side multiplication operand (loaded as vectors along its column index).
// \return void
//
// The overload is only enabled if UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. The iteration
// space is blocked in j, i and k using the DMATDMATMULT_DEFAULT_{J,I,K}BLOCK_SIZE constants.
// Within a block, columns are processed in panels of 4, 2 and 1 times IT::size columns, followed
// by a scalar loop for left-over columns. Since the k dimension is blocked, every tile loads the
// current values of C, adds the contribution of the current k block and stores the result back.
// NOTE(review): the vector loops start at j=jj, so they presumably rely on the J block size
// being a multiple of IT::size -- confirm against the blocking configuration.
1600  template< typename MT3 // Type of the left-hand side target matrix
1601  , typename MT4 // Type of the left-hand side matrix operand
1602  , typename MT5 > // Type of the right-hand side matrix operand
1603  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1604  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1605  {
1606  typedef IntrinsicTrait<ElementType> IT;
1607 
1608  const size_t M( A.rows() );
1609  const size_t N( B.columns() );
1610  const size_t K( A.columns() );
1611 
// A scalar remainder loop is needed unless both the target and the right operand are padded.
1612  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1613 
1614  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
1615  {
1616  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
1617 
// jpos: end of the vectorized column range of this block. With a remainder it is jend rounded
// down to a multiple of IT::size (see the assertion); otherwise it is jend itself.
1618  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1619  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
1620 
1621  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
1622  {
1623  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
1624 
// Clear the current (ii..iend) x (jj..jend) block of C before accumulating over the k blocks.
1625  for( size_t i=ii; i<iend; ++i ) {
1626  for( size_t j=jj; j<jend; ++j ) {
1627  reset( (~C)(i,j) );
1628  }
1629  }
1630 
1631  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
1632  {
1633  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
1634 
1635  size_t j( jj );
1636 
// Panels of 4*IT::size columns: rows in pairs, then a single-row tail.
1637  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
1638  {
1639  const size_t j1( j+IT::size );
1640  const size_t j2( j+IT::size*2UL );
1641  const size_t j3( j+IT::size*3UL );
1642 
1643  size_t i( ii );
1644 
1645  for( ; (i+2UL) <= iend; i+=2UL )
1646  {
// [kbegin,kend) is the current k block [kk,ktmp), further narrowed by the IsUpper/IsLower
// traits of the operands; an empty range leaves the loaded C values unchanged.
1647  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1648  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1649  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1650  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
1651 
1652  IntrinsicType xmm1( (~C).load(i ,j ) );
1653  IntrinsicType xmm2( (~C).load(i ,j1) );
1654  IntrinsicType xmm3( (~C).load(i ,j2) );
1655  IntrinsicType xmm4( (~C).load(i ,j3) );
1656  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1657  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
1658  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
1659  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
1660 
1661  for( size_t k=kbegin; k<kend; ++k ) {
1662  const IntrinsicType a1( set( A(i ,k) ) );
1663  const IntrinsicType a2( set( A(i+1UL,k) ) );
1664  const IntrinsicType b1( B.load(k,j ) );
1665  const IntrinsicType b2( B.load(k,j1) );
1666  const IntrinsicType b3( B.load(k,j2) );
1667  const IntrinsicType b4( B.load(k,j3) );
1668  xmm1 = xmm1 + a1 * b1;
1669  xmm2 = xmm2 + a1 * b2;
1670  xmm3 = xmm3 + a1 * b3;
1671  xmm4 = xmm4 + a1 * b4;
1672  xmm5 = xmm5 + a2 * b1;
1673  xmm6 = xmm6 + a2 * b2;
1674  xmm7 = xmm7 + a2 * b3;
1675  xmm8 = xmm8 + a2 * b4;
1676  }
1677 
1678  (~C).store( i , j , xmm1 );
1679  (~C).store( i , j1, xmm2 );
1680  (~C).store( i , j2, xmm3 );
1681  (~C).store( i , j3, xmm4 );
1682  (~C).store( i+1UL, j , xmm5 );
1683  (~C).store( i+1UL, j1, xmm6 );
1684  (~C).store( i+1UL, j2, xmm7 );
1685  (~C).store( i+1UL, j3, xmm8 );
1686  }
1687 
1688  if( i < iend )
1689  {
1690  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1691  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1692  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1693  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
1694 
1695  IntrinsicType xmm1( (~C).load(i,j ) );
1696  IntrinsicType xmm2( (~C).load(i,j1) );
1697  IntrinsicType xmm3( (~C).load(i,j2) );
1698  IntrinsicType xmm4( (~C).load(i,j3) );
1699 
1700  for( size_t k=kbegin; k<kend; ++k ) {
1701  const IntrinsicType a1( set( A(i,k) ) );
1702  xmm1 = xmm1 + a1 * B.load(k,j );
1703  xmm2 = xmm2 + a1 * B.load(k,j1);
1704  xmm3 = xmm3 + a1 * B.load(k,j2);
1705  xmm4 = xmm4 + a1 * B.load(k,j3);
1706  }
1707 
1708  (~C).store( i, j , xmm1 );
1709  (~C).store( i, j1, xmm2 );
1710  (~C).store( i, j2, xmm3 );
1711  (~C).store( i, j3, xmm4 );
1712  }
1713  }
1714 
// Panels of 2*IT::size columns: rows in groups of four, then pairs, then a single-row tail.
1715  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
1716  {
1717  const size_t j1( j+IT::size );
1718 
1719  size_t i( ii );
1720 
1721  for( ; (i+4UL) <= iend; i+=4UL )
1722  {
1723  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1724  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1725  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1726  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
1727 
1728  IntrinsicType xmm1( (~C).load(i ,j ) );
1729  IntrinsicType xmm2( (~C).load(i ,j1) );
1730  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1731  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1732  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
1733  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
1734  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
1735  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
1736 
1737  for( size_t k=kbegin; k<kend; ++k ) {
1738  const IntrinsicType a1( set( A(i ,k) ) );
1739  const IntrinsicType a2( set( A(i+1UL,k) ) );
1740  const IntrinsicType a3( set( A(i+2UL,k) ) );
1741  const IntrinsicType a4( set( A(i+3UL,k) ) );
1742  const IntrinsicType b1( B.load(k,j ) );
1743  const IntrinsicType b2( B.load(k,j1) );
1744  xmm1 = xmm1 + a1 * b1;
1745  xmm2 = xmm2 + a1 * b2;
1746  xmm3 = xmm3 + a2 * b1;
1747  xmm4 = xmm4 + a2 * b2;
1748  xmm5 = xmm5 + a3 * b1;
1749  xmm6 = xmm6 + a3 * b2;
1750  xmm7 = xmm7 + a4 * b1;
1751  xmm8 = xmm8 + a4 * b2;
1752  }
1753 
1754  (~C).store( i , j , xmm1 );
1755  (~C).store( i , j1, xmm2 );
1756  (~C).store( i+1UL, j , xmm3 );
1757  (~C).store( i+1UL, j1, xmm4 );
1758  (~C).store( i+2UL, j , xmm5 );
1759  (~C).store( i+2UL, j1, xmm6 );
1760  (~C).store( i+3UL, j , xmm7 );
1761  (~C).store( i+3UL, j1, xmm8 );
1762  }
1763 
1764  for( ; (i+2UL) <= iend; i+=2UL )
1765  {
1766  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1767  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1768  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1769  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
1770 
1771  IntrinsicType xmm1( (~C).load(i ,j ) );
1772  IntrinsicType xmm2( (~C).load(i ,j1) );
1773  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1774  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1775 
1776  for( size_t k=kbegin; k<kend; ++k ) {
1777  const IntrinsicType a1( set( A(i ,k) ) );
1778  const IntrinsicType a2( set( A(i+1UL,k) ) );
1779  const IntrinsicType b1( B.load(k,j ) );
1780  const IntrinsicType b2( B.load(k,j1) );
1781  xmm1 = xmm1 + a1 * b1;
1782  xmm2 = xmm2 + a1 * b2;
1783  xmm3 = xmm3 + a2 * b1;
1784  xmm4 = xmm4 + a2 * b2;
1785  }
1786 
1787  (~C).store( i , j , xmm1 );
1788  (~C).store( i , j1, xmm2 );
1789  (~C).store( i+1UL, j , xmm3 );
1790  (~C).store( i+1UL, j1, xmm4 );
1791  }
1792 
1793  if( i < iend )
1794  {
1795  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1796  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1797  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1798  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
1799 
1800  IntrinsicType xmm1( (~C).load(i,j ) );
1801  IntrinsicType xmm2( (~C).load(i,j1) );
1802 
1803  for( size_t k=kbegin; k<kend; ++k ) {
1804  const IntrinsicType a1( set( A(i,k) ) );
1805  xmm1 = xmm1 + a1 * B.load(k,j );
1806  xmm2 = xmm2 + a1 * B.load(k,j1);
1807  }
1808 
1809  (~C).store( i, j , xmm1 );
1810  (~C).store( i, j1, xmm2 );
1811  }
1812  }
1813 
// Panels of IT::size columns: one row at a time.
1814  for( ; j<jpos; j+=IT::size )
1815  {
1816  for( size_t i=ii; i<iend; ++i )
1817  {
1818  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1819  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1820  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1821  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
1822 
1823  IntrinsicType xmm1( (~C).load(i,j) );
1824 
1825  for( size_t k=kbegin; k<kend; ++k ) {
1826  const IntrinsicType a1( set( A(i,k) ) );
1827  xmm1 = xmm1 + a1 * B.load(k,j);
1828  }
1829 
1830  (~C).store( i, j, xmm1 );
1831  }
1832  }
1833 
// Scalar remainder: columns [jpos,jend) are updated element-wise, only if 'remainder' is set.
1834  for( ; remainder && j<jend; ++j )
1835  {
1836  for( size_t i=ii; i<iend; ++i )
1837  {
1838  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1839  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1840  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1841  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
1842 
1843  ElementType value( (~C)(i,j) );
1844 
1845  for( size_t k=kbegin; k<kend; ++k ) {
1846  value += A(i,k) * B(k,j);
1847  }
1848 
1849  (~C)(i,j) = value;
1850  }
1851  }
1852  }
1853  }
1854  }
1855  }
1857  //**********************************************************************************************
1858 
1859  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
// Vectorized default assignment kernel for large matrices, column-major target (C = A * B).
//
// \param C The target left-hand side dense matrix (DenseMatrix<MT3,true>); each block of C is
//          reset and then accumulated into, so prior contents do not contribute to the result.
// \param A The left-hand side multiplication operand (loaded as vectors along its row index).
// \param B The right-hand side multiplication operand (read element-wise and broadcast via set()).
// \return void
//
// The overload is only enabled if UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. The iteration
// space is blocked in i, j and k using the TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE constants.
// Within a block, rows are processed in panels of 4, 2 and 1 times IT::size rows, followed by a
// scalar loop for left-over rows. Since the k dimension is blocked, every tile loads the current
// values of C, adds the contribution of the current k block and stores the result back.
// NOTE(review): the vector loops start at i=ii, so they presumably rely on the I block size
// being a multiple of IT::size -- confirm against the blocking configuration.
1874  template< typename MT3 // Type of the left-hand side target matrix
1875  , typename MT4 // Type of the left-hand side matrix operand
1876  , typename MT5 > // Type of the right-hand side matrix operand
1877  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1878  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1879  {
1880  typedef IntrinsicTrait<ElementType> IT;
1881 
1882  const size_t M( A.rows() );
1883  const size_t N( B.columns() );
1884  const size_t K( A.columns() );
1885 
// A scalar remainder loop is needed unless both the target and the left operand are padded.
1886  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1887 
1888  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
1889  {
1890  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
1891 
// ipos: end of the vectorized row range of this block. With a remainder it is iend rounded
// down to a multiple of IT::size (see the assertion); otherwise it is iend itself.
1892  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1893  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
1894 
1895  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
1896  {
1897  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
1898 
// Clear the current (ii..iend) x (jj..jend) block of C before accumulating over the k blocks.
1899  for( size_t j=jj; j<jend; ++j ) {
1900  for( size_t i=ii; i<iend; ++i ) {
1901  reset( (~C)(i,j) );
1902  }
1903  }
1904 
1905  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
1906  {
1907  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
1908 
1909  size_t i( ii );
1910 
// Panels of 4*IT::size rows: columns in pairs, then a single-column tail.
1911  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
1912  {
1913  const size_t i1( i+IT::size );
1914  const size_t i2( i+IT::size*2UL );
1915  const size_t i3( i+IT::size*3UL );
1916 
1917  size_t j( jj );
1918 
1919  for( ; (j+2UL) <= jend; j+=2UL )
1920  {
// [kbegin,kend) is the current k block [kk,ktmp), further narrowed by the IsUpper/IsLower
// traits of the operands; an empty range leaves the loaded C values unchanged.
1921  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1922  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1923  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
1924  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1925 
1926  IntrinsicType xmm1( (~C).load(i ,j ) );
1927  IntrinsicType xmm2( (~C).load(i1,j ) );
1928  IntrinsicType xmm3( (~C).load(i2,j ) );
1929  IntrinsicType xmm4( (~C).load(i3,j ) );
1930  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
1931  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
1932  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
1933  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
1934 
1935  for( size_t k=kbegin; k<kend; ++k ) {
1936  const IntrinsicType a1( A.load(i ,k) );
1937  const IntrinsicType a2( A.load(i1,k) );
1938  const IntrinsicType a3( A.load(i2,k) );
1939  const IntrinsicType a4( A.load(i3,k) );
1940  const IntrinsicType b1( set( B(k,j ) ) );
1941  const IntrinsicType b2( set( B(k,j+1UL) ) );
1942  xmm1 = xmm1 + a1 * b1;
1943  xmm2 = xmm2 + a2 * b1;
1944  xmm3 = xmm3 + a3 * b1;
1945  xmm4 = xmm4 + a4 * b1;
1946  xmm5 = xmm5 + a1 * b2;
1947  xmm6 = xmm6 + a2 * b2;
1948  xmm7 = xmm7 + a3 * b2;
1949  xmm8 = xmm8 + a4 * b2;
1950  }
1951 
1952  (~C).store( i , j , xmm1 );
1953  (~C).store( i1, j , xmm2 );
1954  (~C).store( i2, j , xmm3 );
1955  (~C).store( i3, j , xmm4 );
1956  (~C).store( i , j+1UL, xmm5 );
1957  (~C).store( i1, j+1UL, xmm6 );
1958  (~C).store( i2, j+1UL, xmm7 );
1959  (~C).store( i3, j+1UL, xmm8 );
1960  }
1961 
1962  if( j < jend )
1963  {
1964  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1965  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1966  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
1967  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1968 
1969  IntrinsicType xmm1( (~C).load(i ,j) );
1970  IntrinsicType xmm2( (~C).load(i1,j) );
1971  IntrinsicType xmm3( (~C).load(i2,j) );
1972  IntrinsicType xmm4( (~C).load(i3,j) );
1973 
1974  for( size_t k=kbegin; k<kend; ++k ) {
1975  const IntrinsicType b1( set( B(k,j) ) );
1976  xmm1 = xmm1 + A.load(i ,k) * b1;
1977  xmm2 = xmm2 + A.load(i1,k) * b1;
1978  xmm3 = xmm3 + A.load(i2,k) * b1;
1979  xmm4 = xmm4 + A.load(i3,k) * b1;
1980  }
1981 
1982  (~C).store( i , j, xmm1 );
1983  (~C).store( i1, j, xmm2 );
1984  (~C).store( i2, j, xmm3 );
1985  (~C).store( i3, j, xmm4 );
1986  }
1987  }
1988 
// Panels of 2*IT::size rows: columns in groups of four, then pairs, then a single-column tail.
1989  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
1990  {
1991  const size_t i1( i+IT::size );
1992 
1993  size_t j( jj );
1994 
1995  for( ; (j+4UL) <= jend; j+=4UL )
1996  {
1997  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1998  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1999  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
2000  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
2001 
2002  IntrinsicType xmm1( (~C).load(i ,j ) );
2003  IntrinsicType xmm2( (~C).load(i1,j ) );
2004  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2005  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
2006  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
2007  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
2008  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
2009  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
2010 
2011  for( size_t k=kbegin; k<kend; ++k ) {
2012  const IntrinsicType a1( A.load(i ,k) );
2013  const IntrinsicType a2( A.load(i1,k) );
2014  const IntrinsicType b1( set( B(k,j ) ) );
2015  const IntrinsicType b2( set( B(k,j+1UL) ) );
2016  const IntrinsicType b3( set( B(k,j+2UL) ) );
2017  const IntrinsicType b4( set( B(k,j+3UL) ) );
2018  xmm1 = xmm1 + a1 * b1;
2019  xmm2 = xmm2 + a2 * b1;
2020  xmm3 = xmm3 + a1 * b2;
2021  xmm4 = xmm4 + a2 * b2;
2022  xmm5 = xmm5 + a1 * b3;
2023  xmm6 = xmm6 + a2 * b3;
2024  xmm7 = xmm7 + a1 * b4;
2025  xmm8 = xmm8 + a2 * b4;
2026  }
2027 
2028  (~C).store( i , j , xmm1 );
2029  (~C).store( i1, j , xmm2 );
2030  (~C).store( i , j+1UL, xmm3 );
2031  (~C).store( i1, j+1UL, xmm4 );
2032  (~C).store( i , j+2UL, xmm5 );
2033  (~C).store( i1, j+2UL, xmm6 );
2034  (~C).store( i , j+3UL, xmm7 );
2035  (~C).store( i1, j+3UL, xmm8 );
2036  }
2037 
2038  for( ; (j+2UL) <= jend; j+=2UL )
2039  {
2040  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2041  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2042  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
2043  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2044 
2045  IntrinsicType xmm1( (~C).load(i ,j ) );
2046  IntrinsicType xmm2( (~C).load(i1,j ) );
2047  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2048  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
2049 
2050  for( size_t k=kbegin; k<kend; ++k ) {
2051  const IntrinsicType a1( A.load(i ,k) );
2052  const IntrinsicType a2( A.load(i1,k) );
2053  const IntrinsicType b1( set( B(k,j ) ) );
2054  const IntrinsicType b2( set( B(k,j+1UL) ) );
2055  xmm1 = xmm1 + a1 * b1;
2056  xmm2 = xmm2 + a2 * b1;
2057  xmm3 = xmm3 + a1 * b2;
2058  xmm4 = xmm4 + a2 * b2;
2059  }
2060 
2061  (~C).store( i , j , xmm1 );
2062  (~C).store( i1, j , xmm2 );
2063  (~C).store( i , j+1UL, xmm3 );
2064  (~C).store( i1, j+1UL, xmm4 );
2065  }
2066 
2067  if( j < jend )
2068  {
2069  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2070  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2071  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
2072  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2073 
2074  IntrinsicType xmm1( (~C).load(i ,j) );
2075  IntrinsicType xmm2( (~C).load(i1,j) );
2076 
2077  for( size_t k=kbegin; k<kend; ++k ) {
2078  const IntrinsicType b1( set( B(k,j) ) );
2079  xmm1 = xmm1 + A.load(i ,k) * b1;
2080  xmm2 = xmm2 + A.load(i1,k) * b1;
2081  }
2082 
2083  (~C).store( i , j, xmm1 );
2084  (~C).store( i1, j, xmm2 );
2085  }
2086  }
2087 
// Panels of IT::size rows: one column at a time.
2088  for( ; i<ipos; i+=IT::size )
2089  {
2090  for( size_t j=jj; j<jend; ++j )
2091  {
2092  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2093  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2094  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
2095  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2096 
2097  IntrinsicType xmm1( (~C).load(i,j) );
2098 
2099  for( size_t k=kbegin; k<kend; ++k ) {
2100  const IntrinsicType b1( set( B(k,j) ) );
2101  xmm1 = xmm1 + A.load(i,k) * b1;
2102  }
2103 
2104  (~C).store( i, j, xmm1 );
2105  }
2106  }
2107 
// Scalar remainder: rows [ipos,iend) are updated element-wise, only if 'remainder' is set.
2108  for( ; remainder && i<iend; ++i )
2109  {
2110  for( size_t j=jj; j<jend; ++j )
2111  {
2112  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2113  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2114  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
2115  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2116 
2117  ElementType value( (~C)(i,j) );
2118 
2119  for( size_t k=kbegin; k<kend; ++k ) {
2120  value += A(i,k) * B(k,j);
2121  }
2122 
2123  (~C)(i,j) = value;
2124  }
2125  }
2126  }
2127  }
2128  }
2129  }
2131  //**********************************************************************************************
2132 
2133  //**BLAS-based assignment to dense matrices (default)*******************************************
2147  template< typename MT3 // Type of the left-hand side target matrix
2148  , typename MT4 // Type of the left-hand side matrix operand
2149  , typename MT5 > // Type of the right-hand side matrix operand
2150  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2151  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2152  {
2153  selectLargeAssignKernel( C, A, B );
2154  }
2156  //**********************************************************************************************
2157 
2158  //**BLAS-based assignment to dense matrices*****************************************************
2159 #if BLAZE_BLAS_MODE
2160 
2173  template< typename MT3 // Type of the left-hand side target matrix
2174  , typename MT4 // Type of the left-hand side matrix operand
2175  , typename MT5 > // Type of the right-hand side matrix operand
2176  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2177  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2178  {
2179  typedef typename MT3::ElementType ET;
2180 
2181  if( IsTriangular<MT4>::value ) {
2182  assign( C, B );
2183  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2184  }
2185  else if( IsTriangular<MT5>::value ) {
2186  assign( C, A );
2187  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2188  }
2189  else {
2190  gemm( C, A, B, ET(1), ET(0) );
2191  }
2192  }
2194 #endif
2195  //**********************************************************************************************
2196 
2197  //**Assignment to sparse matrices***************************************************************
2210  template< typename MT // Type of the target sparse matrix
2211  , bool SO > // Storage order of the target sparse matrix
2212  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2213  {
2215 
2216  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
2217 
2224 
2225  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2226  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2227 
2228  const TmpType tmp( serial( rhs ) );
2229  assign( ~lhs, tmp );
2230  }
2232  //**********************************************************************************************
2233 
2234  //**Addition assignment to dense matrices*******************************************************
2247  template< typename MT // Type of the target dense matrix
2248  , bool SO > // Storage order of the target dense matrix
2249  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2250  {
2252 
2253  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2254  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2255 
2256  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2257  return;
2258  }
2259 
2260  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2261  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2262 
2263  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2264  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2265  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2266  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2267  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2268  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2269 
2270  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2271  }
2273  //**********************************************************************************************
2274 
2275  //**Addition assignment to dense matrices (kernel selection)************************************
2286  template< typename MT3 // Type of the left-hand side target matrix
2287  , typename MT4 // Type of the left-hand side matrix operand
2288  , typename MT5 > // Type of the right-hand side matrix operand
2289  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2290  {
2291  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
2292  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2293  selectSmallAddAssignKernel( C, A, B );
2294  else
2295  selectBlasAddAssignKernel( C, A, B );
2296  }
2298  //**********************************************************************************************
2299 
2300  //**Default addition assignment to row-major dense matrices (general/general)*******************
2314  template< typename MT3 // Type of the left-hand side target matrix
2315  , typename MT4 // Type of the left-hand side matrix operand
2316  , typename MT5 > // Type of the right-hand side matrix operand
2317  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2318  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2319  {
2320  const size_t M( A.rows() );
2321  const size_t N( B.columns() );
2322  const size_t K( A.columns() );
2323 
2324  for( size_t i=0UL; i<M; ++i )
2325  {
2326  const size_t kbegin( ( IsUpper<MT4>::value )
2327  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2328  :( 0UL ) );
2329  const size_t kend( ( IsLower<MT4>::value )
2330  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2331  :( K ) );
2332  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2333 
2334  for( size_t k=kbegin; k<kend; ++k )
2335  {
2336  const size_t jbegin( ( IsUpper<MT5>::value )
2337  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2338  :( 0UL ) );
2339  const size_t jend( ( IsLower<MT5>::value )
2340  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
2341  :( N ) );
2342  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2343 
2344  const size_t jnum( jend - jbegin );
2345  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2346 
2347  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2348  (~C)(i,j ) += A(i,k) * B(k,j );
2349  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2350  }
2351  if( jpos < jend ) {
2352  (~C)(i,jpos) += A(i,k) * B(k,jpos);
2353  }
2354  }
2355  }
2356  }
2358  //**********************************************************************************************
2359 
2360  //**Default addition assignment to column-major dense matrices (general/general)****************
2374  template< typename MT3 // Type of the left-hand side target matrix
2375  , typename MT4 // Type of the left-hand side matrix operand
2376  , typename MT5 > // Type of the right-hand side matrix operand
2377  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2378  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2379  {
2380  const size_t M( A.rows() );
2381  const size_t N( B.columns() );
2382  const size_t K( A.columns() );
2383 
2384  for( size_t j=0UL; j<N; ++j )
2385  {
2386  const size_t kbegin( ( IsLower<MT5>::value )
2387  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2388  :( 0UL ) );
2389  const size_t kend( ( IsUpper<MT5>::value )
2390  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2391  :( K ) );
2392  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2393 
2394  for( size_t k=kbegin; k<kend; ++k )
2395  {
2396  const size_t ibegin( ( IsLower<MT4>::value )
2397  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2398  :( 0UL ) );
2399  const size_t iend( ( IsUpper<MT4>::value )
2400  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
2401  :( M ) );
2402  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2403 
2404  const size_t inum( iend - ibegin );
2405  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2406 
2407  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2408  (~C)(i ,j) += A(i ,k) * B(k,j);
2409  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2410  }
2411  if( ipos < iend ) {
2412  (~C)(ipos,j) += A(ipos,k) * B(k,j);
2413  }
2414  }
2415  }
2416  }
2418  //**********************************************************************************************
2419 
2420  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
2434  template< typename MT3 // Type of the left-hand side target matrix
2435  , typename MT4 // Type of the left-hand side matrix operand
2436  , typename MT5 > // Type of the right-hand side matrix operand
2437  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2438  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2439  {
2440  const size_t M( A.rows() );
2441  const size_t N( B.columns() );
2442 
2443  const size_t block( BLOCK_SIZE );
2444 
2445  for( size_t ii=0UL; ii<M; ii+=block ) {
2446  const size_t iend( min( M, ii+block ) );
2447  for( size_t jj=0UL; jj<N; jj+=block ) {
2448  const size_t jend( min( N, jj+block ) );
2449  for( size_t i=ii; i<iend; ++i )
2450  {
2451  const size_t jbegin( ( IsUpper<MT4>::value )
2452  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
2453  :( jj ) );
2454  const size_t jpos( ( IsLower<MT4>::value )
2455  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
2456  :( jend ) );
2457 
2458  for( size_t j=jbegin; j<jpos; ++j ) {
2459  (~C)(i,j) += A(i,j) * B(j,j);
2460  }
2461  }
2462  }
2463  }
2464  }
2466  //**********************************************************************************************
2467 
2468  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
2482  template< typename MT3 // Type of the left-hand side target matrix
2483  , typename MT4 // Type of the left-hand side matrix operand
2484  , typename MT5 > // Type of the right-hand side matrix operand
2485  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2486  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2487  {
2488  const size_t M( A.rows() );
2489  const size_t N( B.columns() );
2490 
2491  for( size_t j=0UL; j<N; ++j )
2492  {
2493  const size_t ibegin( ( IsLower<MT4>::value )
2494  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2495  :( 0UL ) );
2496  const size_t iend( ( IsUpper<MT4>::value )
2497  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
2498  :( M ) );
2499  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2500 
2501  const size_t inum( iend - ibegin );
2502  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2503 
2504  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2505  (~C)(i ,j) += A(i ,j) * B(j,j);
2506  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2507  }
2508  if( ipos < iend ) {
2509  (~C)(ipos,j) += A(ipos,j) * B(j,j);
2510  }
2511  }
2512  }
2514  //**********************************************************************************************
2515 
2516  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
2530  template< typename MT3 // Type of the left-hand side target matrix
2531  , typename MT4 // Type of the left-hand side matrix operand
2532  , typename MT5 > // Type of the right-hand side matrix operand
2533  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2534  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2535  {
2536  const size_t M( A.rows() );
2537  const size_t N( B.columns() );
2538 
2539  for( size_t i=0UL; i<M; ++i )
2540  {
2541  const size_t jbegin( ( IsUpper<MT5>::value )
2542  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2543  :( 0UL ) );
2544  const size_t jend( ( IsLower<MT5>::value )
2545  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
2546  :( N ) );
2547  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2548 
2549  const size_t jnum( jend - jbegin );
2550  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2551 
2552  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2553  (~C)(i,j ) += A(i,i) * B(i,j );
2554  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2555  }
2556  if( jpos < jend ) {
2557  (~C)(i,jpos) += A(i,i) * B(i,jpos);
2558  }
2559  }
2560  }
2562  //**********************************************************************************************
2563 
2564  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
2578  template< typename MT3 // Type of the left-hand side target matrix
2579  , typename MT4 // Type of the left-hand side matrix operand
2580  , typename MT5 > // Type of the right-hand side matrix operand
2581  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2582  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2583  {
2584  const size_t M( A.rows() );
2585  const size_t N( B.columns() );
2586 
2587  const size_t block( BLOCK_SIZE );
2588 
2589  for( size_t jj=0UL; jj<N; jj+=block ) {
2590  const size_t jend( min( N, jj+block ) );
2591  for( size_t ii=0UL; ii<M; ii+=block ) {
2592  const size_t iend( min( M, ii+block ) );
2593  for( size_t j=jj; j<jend; ++j )
2594  {
2595  const size_t ibegin( ( IsLower<MT5>::value )
2596  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
2597  :( ii ) );
2598  const size_t ipos( ( IsUpper<MT5>::value )
2599  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
2600  :( iend ) );
2601 
2602  for( size_t i=ibegin; i<ipos; ++i ) {
2603  (~C)(i,j) += A(i,i) * B(i,j);
2604  }
2605  }
2606  }
2607  }
2608  }
2610  //**********************************************************************************************
2611 
2612  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2626  template< typename MT3 // Type of the left-hand side target matrix
2627  , typename MT4 // Type of the left-hand side matrix operand
2628  , typename MT5 > // Type of the right-hand side matrix operand
2629  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2630  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2631  {
2632  for( size_t i=0UL; i<A.rows(); ++i ) {
2633  C(i,i) += A(i,i) * B(i,i);
2634  }
2635  }
2637  //**********************************************************************************************
2638 
2639  //**Default addition assignment to dense matrices (small matrices)******************************
2653  template< typename MT3 // Type of the left-hand side target matrix
2654  , typename MT4 // Type of the left-hand side matrix operand
2655  , typename MT5 > // Type of the right-hand side matrix operand
2656  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2657  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2658  {
2659  selectDefaultAddAssignKernel( C, A, B );
2660  }
2662  //**********************************************************************************************
2663 
2664  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized small-matrix addition assignment C += A*B for a row-major target.
// Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> applies (EnableIf).
//
//   C  the target matrix on the left-hand side of the addition assignment
//   A  the left-hand side operand of the multiplication
//   B  the right-hand side operand of the multiplication
//
// Every accumulator is first loaded from C, receives the products of a broadcast
// element of A (set( A(i,k) )) with vectors loaded from row k of B, and is then
// stored back to the same position of C. The columns of C are consumed in panels
// of 8, 4, 2 and finally 1 vector(s) of IT::size elements. A scalar loop handles
// the columns beyond jpos when `remainder` is set.
//
// In every section the k-range [kbegin,kend) is narrowed at compile time from the
// IsUpper/IsLower and IsStrictlyUpper/IsStrictlyLower flags of MT4 and MT5. The
// kbegin expression is the same in all sections. kend depends on the panel width
// and on the number of rows handled at once.
2679  template< typename MT3 // Type of the left-hand side target matrix
2680  , typename MT4 // Type of the left-hand side matrix operand
2681  , typename MT5 > // Type of the right-hand side matrix operand
2682  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2683  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2684  {
2685  typedef IntrinsicTrait<ElementType> IT;
2686 
2687  const size_t M( A.rows() );
2688  const size_t N( B.columns() );
2689  const size_t K( A.columns() );
2690 
// A scalar remainder loop is needed unless both the target type MT3 and the
// right-hand side operand type MT5 are reported as padded by IsPadded.
// NOTE(review): presumably padding makes vector loads/stores past column N safe,
// confirm against the IsPadded documentation.
2691  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
2692 
// End of the vectorized column range: all of N without remainder handling,
// otherwise N with the low bits masked off. The assertion below expects this to
// equal N minus N % IT::size.
2693  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
2694  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
2695 
// Column cursor shared by all of the following sections
2696  size_t j( 0UL );
2697 
// Section 1: panels of 8 vectors (IT::size*8 columns), one row of C at a time
2698  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
2699  for( size_t i=0UL; i<M; ++i )
2700  {
2701  const size_t kbegin( ( IsUpper<MT4>::value )
2702  ?( ( IsLower<MT5>::value )
2703  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2704  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2705  :( IsLower<MT5>::value ? j : 0UL ) );
2706  const size_t kend( ( IsLower<MT4>::value )
2707  ?( ( IsUpper<MT5>::value )
2708  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
2709  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2710  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
2711 
// Start from the current content of C (addition assignment)
2712  IntrinsicType xmm1( (~C).load(i,j ) );
2713  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
2714  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
2715  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
2716  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
2717  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
2718  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
2719  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
2720 
2721  for( size_t k=kbegin; k<kend; ++k ) {
2722  const IntrinsicType a1( set( A(i,k) ) );
2723  xmm1 = xmm1 + a1 * B.load(k,j );
2724  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2725  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2726  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2727  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
2728  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
2729  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
2730  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
2731  }
2732 
2733  (~C).store( i, j , xmm1 );
2734  (~C).store( i, j+IT::size , xmm2 );
2735  (~C).store( i, j+IT::size*2UL, xmm3 );
2736  (~C).store( i, j+IT::size*3UL, xmm4 );
2737  (~C).store( i, j+IT::size*4UL, xmm5 );
2738  (~C).store( i, j+IT::size*5UL, xmm6 );
2739  (~C).store( i, j+IT::size*6UL, xmm7 );
2740  (~C).store( i, j+IT::size*7UL, xmm8 );
2741  }
2742  }
2743 
// Section 2: panels of 4 vectors, two rows of C at a time (xmm1-4: row i,
// xmm5-8: row i+1), followed by a single-row tail when M is odd
2744  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
2745  {
2746  size_t i( 0UL );
2747 
2748  for( ; (i+2UL) <= M; i+=2UL )
2749  {
2750  const size_t kbegin( ( IsUpper<MT4>::value )
2751  ?( ( IsLower<MT5>::value )
2752  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2753  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2754  :( IsLower<MT5>::value ? j : 0UL ) );
2755  const size_t kend( ( IsLower<MT4>::value )
2756  ?( ( IsUpper<MT5>::value )
2757  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
2758  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2759  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
2760 
2761  IntrinsicType xmm1( (~C).load(i ,j ) );
2762  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
2763  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
2764  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
2765  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
2766  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
2767  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
2768  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
2769 
// The four vectors of B are loaded once per k and reused for both rows
2770  for( size_t k=kbegin; k<kend; ++k ) {
2771  const IntrinsicType a1( set( A(i ,k) ) );
2772  const IntrinsicType a2( set( A(i+1UL,k) ) );
2773  const IntrinsicType b1( B.load(k,j ) );
2774  const IntrinsicType b2( B.load(k,j+IT::size ) );
2775  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
2776  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
2777  xmm1 = xmm1 + a1 * b1;
2778  xmm2 = xmm2 + a1 * b2;
2779  xmm3 = xmm3 + a1 * b3;
2780  xmm4 = xmm4 + a1 * b4;
2781  xmm5 = xmm5 + a2 * b1;
2782  xmm6 = xmm6 + a2 * b2;
2783  xmm7 = xmm7 + a2 * b3;
2784  xmm8 = xmm8 + a2 * b4;
2785  }
2786 
2787  (~C).store( i , j , xmm1 );
2788  (~C).store( i , j+IT::size , xmm2 );
2789  (~C).store( i , j+IT::size*2UL, xmm3 );
2790  (~C).store( i , j+IT::size*3UL, xmm4 );
2791  (~C).store( i+1UL, j , xmm5 );
2792  (~C).store( i+1UL, j+IT::size , xmm6 );
2793  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
2794  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
2795  }
2796 
// Single remaining row. Here kend is limited only by the IsUpper flag of MT5;
// the IsLower flag of MT4 is not used to shorten the range.
2797  if( i < M )
2798  {
2799  const size_t kbegin( ( IsUpper<MT4>::value )
2800  ?( ( IsLower<MT5>::value )
2801  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2802  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2803  :( IsLower<MT5>::value ? j : 0UL ) );
2804  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
2805 
2806  IntrinsicType xmm1( (~C).load(i,j ) );
2807  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
2808  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
2809  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
2810 
2811  for( size_t k=kbegin; k<kend; ++k ) {
2812  const IntrinsicType a1( set( A(i,k) ) );
2813  xmm1 = xmm1 + a1 * B.load(k,j );
2814  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2815  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2816  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2817  }
2818 
2819  (~C).store( i, j , xmm1 );
2820  (~C).store( i, j+IT::size , xmm2 );
2821  (~C).store( i, j+IT::size*2UL, xmm3 );
2822  (~C).store( i, j+IT::size*3UL, xmm4 );
2823  }
2824  }
2825 
// Section 3: panels of 2 vectors, two rows of C at a time (xmm1-2: row i,
// xmm3-4: row i+1), followed by a single-row tail when M is odd
2826  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
2827  {
2828  size_t i( 0UL );
2829 
2830  for( ; (i+2UL) <= M; i+=2UL )
2831  {
2832  const size_t kbegin( ( IsUpper<MT4>::value )
2833  ?( ( IsLower<MT5>::value )
2834  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2835  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2836  :( IsLower<MT5>::value ? j : 0UL ) );
2837  const size_t kend( ( IsLower<MT4>::value )
2838  ?( ( IsUpper<MT5>::value )
2839  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
2840  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2841  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
2842 
2843  IntrinsicType xmm1( (~C).load(i ,j ) );
2844  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
2845  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2846  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
2847 
2848  for( size_t k=kbegin; k<kend; ++k ) {
2849  const IntrinsicType a1( set( A(i ,k) ) );
2850  const IntrinsicType a2( set( A(i+1UL,k) ) );
2851  const IntrinsicType b1( B.load(k,j ) );
2852  const IntrinsicType b2( B.load(k,j+IT::size) );
2853  xmm1 = xmm1 + a1 * b1;
2854  xmm2 = xmm2 + a1 * b2;
2855  xmm3 = xmm3 + a2 * b1;
2856  xmm4 = xmm4 + a2 * b2;
2857  }
2858 
2859  (~C).store( i , j , xmm1 );
2860  (~C).store( i , j+IT::size, xmm2 );
2861  (~C).store( i+1UL, j , xmm3 );
2862  (~C).store( i+1UL, j+IT::size, xmm4 );
2863  }
2864 
// Single remaining row. kend is limited only by the IsUpper flag of MT5.
2865  if( i < M )
2866  {
2867  const size_t kbegin( ( IsUpper<MT4>::value )
2868  ?( ( IsLower<MT5>::value )
2869  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2870  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2871  :( IsLower<MT5>::value ? j : 0UL ) );
2872  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
2873 
2874  IntrinsicType xmm1( (~C).load(i,j ) );
2875  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
2876 
2877  for( size_t k=kbegin; k<kend; ++k ) {
2878  const IntrinsicType a1( set( A(i,k) ) );
2879  xmm1 = xmm1 + a1 * B.load(k,j );
2880  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
2881  }
2882 
2883  (~C).store( i, j , xmm1 );
2884  (~C).store( i, j+IT::size, xmm2 );
2885  }
2886  }
2887 
// Section 4: single vectors, two rows of C at a time, followed by a single-row
// tail when M is odd. In this section kend depends only on the IsLower flag of
// MT4 (two-row part) or is simply K (single-row tail).
2888  for( ; j<jpos; j+=IT::size )
2889  {
2890  size_t i( 0UL );
2891 
2892  for( ; (i+2UL) <= M; i+=2UL )
2893  {
2894  const size_t kbegin( ( IsUpper<MT4>::value )
2895  ?( ( IsLower<MT5>::value )
2896  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2897  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2898  :( IsLower<MT5>::value ? j : 0UL ) );
2899  const size_t kend( ( IsLower<MT4>::value )
2900  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2901  :( K ) );
2902 
2903  IntrinsicType xmm1( (~C).load(i ,j) );
2904  IntrinsicType xmm2( (~C).load(i+1UL,j) );
2905 
2906  for( size_t k=kbegin; k<kend; ++k ) {
2907  const IntrinsicType b1( B.load(k,j) );
2908  xmm1 = xmm1 + set( A(i ,k) ) * b1;
2909  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
2910  }
2911 
2912  (~C).store( i , j, xmm1 );
2913  (~C).store( i+1UL, j, xmm2 );
2914  }
2915 
2916  if( i < M )
2917  {
2918  const size_t kbegin( ( IsUpper<MT4>::value )
2919  ?( ( IsLower<MT5>::value )
2920  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2921  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2922  :( IsLower<MT5>::value ? j : 0UL ) );
2923 
2924  IntrinsicType xmm1( (~C).load(i,j) );
2925 
2926  for( size_t k=kbegin; k<K; ++k ) {
2927  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
2928  }
2929 
2930  (~C).store( i, j, xmm1 );
2931  }
2932  }
2933 
// Section 5: scalar handling of the columns in [jpos,N). Only entered when
// `remainder` is set. Two rows at a time, followed by a single-row tail.
2934  for( ; remainder && j<N; ++j )
2935  {
2936  size_t i( 0UL );
2937 
2938  for( ; (i+2UL) <= M; i+=2UL )
2939  {
2940  const size_t kbegin( ( IsUpper<MT4>::value )
2941  ?( ( IsLower<MT5>::value )
2942  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2943  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2944  :( IsLower<MT5>::value ? j : 0UL ) );
2945  const size_t kend( ( IsLower<MT4>::value )
2946  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2947  :( K ) );
2948 
// NOTE(review): the declaration of value2 ends in a stray second semicolon.
// It is an empty statement without effect and can be removed in a code cleanup.
2949  ElementType value1( (~C)(i ,j) );
2950  ElementType value2( (~C)(i+1UL,j) );;
2951 
2952  for( size_t k=kbegin; k<kend; ++k ) {
2953  value1 += A(i ,k) * B(k,j);
2954  value2 += A(i+1UL,k) * B(k,j);
2955  }
2956 
2957  (~C)(i ,j) = value1;
2958  (~C)(i+1UL,j) = value2;
2959  }
2960 
2961  if( i < M )
2962  {
2963  const size_t kbegin( ( IsUpper<MT4>::value )
2964  ?( ( IsLower<MT5>::value )
2965  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2966  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2967  :( IsLower<MT5>::value ? j : 0UL ) );
2968 
2969  ElementType value( (~C)(i,j) );
2970 
2971  for( size_t k=kbegin; k<K; ++k ) {
2972  value += A(i,k) * B(k,j);
2973  }
2974 
2975  (~C)(i,j) = value;
2976  }
2977  }
2978  }
2980  //**********************************************************************************************
2981 
2982  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2997  template< typename MT3 // Type of the left-hand side target matrix
2998  , typename MT4 // Type of the left-hand side matrix operand
2999  , typename MT5 > // Type of the right-hand side matrix operand
3000  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3001  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3002  {
3003  typedef IntrinsicTrait<ElementType> IT;
3004 
3005  const size_t M( A.rows() );
3006  const size_t N( B.columns() );
3007  const size_t K( A.columns() );
3008 
3009  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3010 
3011  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
3012  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
3013 
3014  size_t i( 0UL );
3015 
3016  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
3017  for( size_t j=0UL; j<N; ++j )
3018  {
3019  const size_t kbegin( ( IsLower<MT5>::value )
3020  ?( ( IsUpper<MT4>::value )
3021  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3022  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3023  :( IsUpper<MT4>::value ? i : 0UL ) );
3024  const size_t kend( ( IsUpper<MT5>::value )
3025  ?( ( IsLower<MT4>::value )
3026  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3027  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3028  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
3029 
3030  IntrinsicType xmm1( (~C).load(i ,j) );
3031  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
3032  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
3033  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
3034  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
3035  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
3036  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
3037  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
3038 
3039  for( size_t k=kbegin; k<kend; ++k ) {
3040  const IntrinsicType b1( set( B(k,j) ) );
3041  xmm1 = xmm1 + A.load(i ,k) * b1;
3042  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3043  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3044  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3045  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3046  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3047  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3048  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
3049  }
3050 
3051  (~C).store( i , j, xmm1 );
3052  (~C).store( i+IT::size , j, xmm2 );
3053  (~C).store( i+IT::size*2UL, j, xmm3 );
3054  (~C).store( i+IT::size*3UL, j, xmm4 );
3055  (~C).store( i+IT::size*4UL, j, xmm5 );
3056  (~C).store( i+IT::size*5UL, j, xmm6 );
3057  (~C).store( i+IT::size*6UL, j, xmm7 );
3058  (~C).store( i+IT::size*7UL, j, xmm8 );
3059  }
3060  }
3061 
3062  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
3063  {
3064  size_t j( 0UL );
3065 
3066  for( ; (j+2UL) <= N; j+=2UL )
3067  {
3068  const size_t kbegin( ( IsLower<MT5>::value )
3069  ?( ( IsUpper<MT4>::value )
3070  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3071  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3072  :( IsUpper<MT4>::value ? i : 0UL ) );
3073  const size_t kend( ( IsUpper<MT5>::value )
3074  ?( ( IsLower<MT4>::value )
3075  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3076  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3077  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
3078 
3079  IntrinsicType xmm1( (~C).load(i ,j ) );
3080  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
3081  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
3082  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
3083  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3084  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
3085  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
3086  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
3087 
3088  for( size_t k=kbegin; k<kend; ++k ) {
3089  const IntrinsicType a1( A.load(i ,k) );
3090  const IntrinsicType a2( A.load(i+IT::size ,k) );
3091  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
3092  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
3093  const IntrinsicType b1( set( B(k,j ) ) );
3094  const IntrinsicType b2( set( B(k,j+1UL) ) );
3095  xmm1 = xmm1 + a1 * b1;
3096  xmm2 = xmm2 + a2 * b1;
3097  xmm3 = xmm3 + a3 * b1;
3098  xmm4 = xmm4 + a4 * b1;
3099  xmm5 = xmm5 + a1 * b2;
3100  xmm6 = xmm6 + a2 * b2;
3101  xmm7 = xmm7 + a3 * b2;
3102  xmm8 = xmm8 + a4 * b2;
3103  }
3104 
3105  (~C).store( i , j , xmm1 );
3106  (~C).store( i+IT::size , j , xmm2 );
3107  (~C).store( i+IT::size*2UL, j , xmm3 );
3108  (~C).store( i+IT::size*3UL, j , xmm4 );
3109  (~C).store( i , j+1UL, xmm5 );
3110  (~C).store( i+IT::size , j+1UL, xmm6 );
3111  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
3112  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
3113  }
3114 
3115  if( j < N )
3116  {
3117  const size_t kbegin( ( IsLower<MT5>::value )
3118  ?( ( IsUpper<MT4>::value )
3119  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3120  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3121  :( IsUpper<MT4>::value ? i : 0UL ) );
3122  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
3123 
3124  IntrinsicType xmm1( (~C).load(i ,j) );
3125  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
3126  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
3127  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
3128 
3129  for( size_t k=kbegin; k<kend; ++k ) {
3130  const IntrinsicType b1( set( B(k,j) ) );
3131  xmm1 = xmm1 + A.load(i ,k) * b1;
3132  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3133  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3134  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3135  }
3136 
3137  (~C).store( i , j, xmm1 );
3138  (~C).store( i+IT::size , j, xmm2 );
3139  (~C).store( i+IT::size*2UL, j, xmm3 );
3140  (~C).store( i+IT::size*3UL, j, xmm4 );
3141  }
3142  }
3143 
3144  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
3145  {
3146  size_t j( 0UL );
3147 
3148  for( ; (j+2UL) <= N; j+=2UL )
3149  {
3150  const size_t kbegin( ( IsLower<MT5>::value )
3151  ?( ( IsUpper<MT4>::value )
3152  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3153  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3154  :( IsUpper<MT4>::value ? i : 0UL ) );
3155  const size_t kend( ( IsUpper<MT5>::value )
3156  ?( ( IsLower<MT4>::value )
3157  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3158  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3159  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
3160 
3161  IntrinsicType xmm1( (~C).load(i ,j ) );
3162  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
3163  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3164  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
3165 
3166  for( size_t k=kbegin; k<kend; ++k ) {
3167  const IntrinsicType a1( A.load(i ,k) );
3168  const IntrinsicType a2( A.load(i+IT::size,k) );
3169  const IntrinsicType b1( set( B(k,j ) ) );
3170  const IntrinsicType b2( set( B(k,j+1UL) ) );
3171  xmm1 = xmm1 + a1 * b1;
3172  xmm2 = xmm2 + a2 * b1;
3173  xmm3 = xmm3 + a1 * b2;
3174  xmm4 = xmm4 + a2 * b2;
3175  }
3176 
3177  (~C).store( i , j , xmm1 );
3178  (~C).store( i+IT::size, j , xmm2 );
3179  (~C).store( i , j+1UL, xmm3 );
3180  (~C).store( i+IT::size, j+1UL, xmm4 );
3181  }
3182 
3183  if( j < N )
3184  {
3185  const size_t kbegin( ( IsLower<MT5>::value )
3186  ?( ( IsUpper<MT4>::value )
3187  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3188  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3189  :( IsUpper<MT4>::value ? i : 0UL ) );
3190  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
3191 
3192  IntrinsicType xmm1( (~C).load(i ,j) );
3193  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
3194 
3195  for( size_t k=kbegin; k<kend; ++k ) {
3196  const IntrinsicType b1( set( B(k,j) ) );
3197  xmm1 = xmm1 + A.load(i ,k) * b1;
3198  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3199  }
3200 
3201  (~C).store( i , j, xmm1 );
3202  (~C).store( i+IT::size, j, xmm2 );
3203  }
3204  }
3205 
3206  for( ; i<ipos; i+=IT::size )
3207  {
3208  size_t j( 0UL );
3209 
3210  for( ; (j+2UL) <= N; j+=2UL )
3211  {
3212  const size_t kbegin( ( IsLower<MT5>::value )
3213  ?( ( IsUpper<MT4>::value )
3214  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3215  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3216  :( IsUpper<MT4>::value ? i : 0UL ) );
3217  const size_t kend( ( IsUpper<MT5>::value )
3218  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3219  :( K ) );
3220 
3221  IntrinsicType xmm1( (~C).load(i,j ) );
3222  IntrinsicType xmm2( (~C).load(i,j+1UL) );
3223 
3224  for( size_t k=kbegin; k<kend; ++k ) {
3225  const IntrinsicType a1( A.load(i,k) );
3226  xmm1 = xmm1 + a1 * set( B(k,j ) );
3227  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3228  }
3229 
3230  (~C).store( i, j , xmm1 );
3231  (~C).store( i, j+1UL, xmm2 );
3232  }
3233 
3234  if( j < N )
3235  {
3236  const size_t kbegin( ( IsLower<MT5>::value )
3237  ?( ( IsUpper<MT4>::value )
3238  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3239  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3240  :( IsUpper<MT4>::value ? i : 0UL ) );
3241 
3242  IntrinsicType xmm1( (~C).load(i,j) );
3243 
3244  for( size_t k=kbegin; k<K; ++k ) {
3245  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
3246  }
3247 
3248  (~C).store( i, j, xmm1 );
3249  }
3250  }
3251 
3252  for( ; remainder && i<M; ++i )
3253  {
3254  size_t j( 0UL );
3255 
3256  for( ; (j+2UL) <= N; j+=2UL )
3257  {
3258  const size_t kbegin( ( IsLower<MT5>::value )
3259  ?( ( IsUpper<MT4>::value )
3260  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3261  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3262  :( IsUpper<MT4>::value ? i : 0UL ) );
3263  const size_t kend( ( IsUpper<MT5>::value )
3264  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3265  :( K ) );
3266 
3267  ElementType value1( (~C)(i,j ) );
3268  ElementType value2( (~C)(i,j+1UL) );
3269 
3270  for( size_t k=kbegin; k<kend; ++k ) {
3271  value1 += A(i,k) * B(k,j );
3272  value2 += A(i,k) * B(k,j+1UL);
3273  }
3274 
3275  (~C)(i,j ) = value1;
3276  (~C)(i,j+1UL) = value2;
3277  }
3278 
3279  if( j < N )
3280  {
3281  const size_t kbegin( ( IsLower<MT5>::value )
3282  ?( ( IsUpper<MT4>::value )
3283  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3284  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3285  :( IsUpper<MT4>::value ? i : 0UL ) );
3286 
3287  ElementType value( (~C)(i,j) );
3288 
3289  for( size_t k=kbegin; k<K; ++k ) {
3290  value += A(i,k) * B(k,j);
3291  }
3292 
3293  (~C)(i,j) = value;
3294  }
3295  }
3296  }
3298  //**********************************************************************************************
3299 
3300  //**Default addition assignment to dense matrices (large matrices)******************************
3314  template< typename MT3 // Type of the left-hand side target matrix
3315  , typename MT4 // Type of the left-hand side matrix operand
3316  , typename MT5 > // Type of the right-hand side matrix operand
3317  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3318  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3319  {
3320  selectDefaultAddAssignKernel( C, A, B );
3321  }
3323  //**********************************************************************************************
3324 
3325  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
// Vectorized, blocked kernel for the addition assignment C += A * B where the target C is a
// row-major dense matrix (DenseMatrix<MT3,false>). The overload is guarded by
// EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >.
//
//  C  target matrix; every processed element is loaded, accumulated into and stored back
//  A  left-hand side operand, read element-wise via A(i,k)
//  B  right-hand side operand, read as packed vectors via B.load(k,j)
//
// The iteration space is tiled by the DMATDMATMULT_DEFAULT_{J,I,K}BLOCK_SIZE constants. Inside
// one tile the columns are processed in strips of 4, 2 and 1 packed vectors (IT::size elements
// each), followed by an element-wise loop for the columns that do not fill a whole vector.
// For every strip the k-range is clipped to the current k-block [kk,ktmp) and additionally
// narrowed according to the IsUpper/IsLower traits of A and B.
3340  template< typename MT3 // Type of the left-hand side target matrix
3341  , typename MT4 // Type of the left-hand side matrix operand
3342  , typename MT5 > // Type of the right-hand side matrix operand
3343  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3344  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3345  {
3346  typedef IntrinsicTrait<ElementType> IT;
3347
3348  const size_t M( A.rows() );
3349  const size_t N( B.columns() );
3350  const size_t K( A.columns() );
3351
// An element-wise tail loop is only needed if C or B is not padded.
3352  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
3353
// Outermost tiling: blocks of columns [jj,jend).
3354  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
3355  {
3356  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
3357
// jpos: end of the vectorized column range. With a remainder it is jend rounded down to a
// multiple of IT::size (checked by the assertion below); otherwise the whole block is vectorized.
3358  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3359  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
3360
// Blocks of rows [ii,iend).
3361  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
3362  {
3363  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
3364
// Blocks of the summation index [kk,ktmp).
3365  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
3366  {
3367  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
3368
3369  size_t j( jj );
3370
// Strip of four packed vectors per row (columns j, j1, j2, j3).
3371  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
3372  {
3373  const size_t j1( j+IT::size );
3374  const size_t j2( j+IT::size*2UL );
3375  const size_t j3( j+IT::size*3UL );
3376
3377  size_t i( ii );
3378
// Two rows at a time: 2 rows x 4 vectors = 8 accumulators.
3379  for( ; (i+2UL) <= iend; i+=2UL )
3380  {
3381  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3382  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3383  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3384  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
3385
// Start from the current content of C so that the product is added to it.
3386  IntrinsicType xmm1( (~C).load(i ,j ) );
3387  IntrinsicType xmm2( (~C).load(i ,j1) );
3388  IntrinsicType xmm3( (~C).load(i ,j2) );
3389  IntrinsicType xmm4( (~C).load(i ,j3) );
3390  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
3391  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
3392  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
3393  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
3394
// NOTE(review): set() presumably broadcasts the scalar A(i,k) to all vector lanes - confirm.
3395  for( size_t k=kbegin; k<kend; ++k ) {
3396  const IntrinsicType a1( set( A(i ,k) ) );
3397  const IntrinsicType a2( set( A(i+1UL,k) ) );
3398  const IntrinsicType b1( B.load(k,j ) );
3399  const IntrinsicType b2( B.load(k,j1) );
3400  const IntrinsicType b3( B.load(k,j2) );
3401  const IntrinsicType b4( B.load(k,j3) );
3402  xmm1 = xmm1 + a1 * b1;
3403  xmm2 = xmm2 + a1 * b2;
3404  xmm3 = xmm3 + a1 * b3;
3405  xmm4 = xmm4 + a1 * b4;
3406  xmm5 = xmm5 + a2 * b1;
3407  xmm6 = xmm6 + a2 * b2;
3408  xmm7 = xmm7 + a2 * b3;
3409  xmm8 = xmm8 + a2 * b4;
3410  }
3411
3412  (~C).store( i , j , xmm1 );
3413  (~C).store( i , j1, xmm2 );
3414  (~C).store( i , j2, xmm3 );
3415  (~C).store( i , j3, xmm4 );
3416  (~C).store( i+1UL, j , xmm5 );
3417  (~C).store( i+1UL, j1, xmm6 );
3418  (~C).store( i+1UL, j2, xmm7 );
3419  (~C).store( i+1UL, j3, xmm8 );
3420  }
3421
// Single leftover row of the row block.
3422  if( i < iend )
3423  {
3424  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3425  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3426  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3427  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
3428
3429  IntrinsicType xmm1( (~C).load(i,j ) );
3430  IntrinsicType xmm2( (~C).load(i,j1) );
3431  IntrinsicType xmm3( (~C).load(i,j2) );
3432  IntrinsicType xmm4( (~C).load(i,j3) );
3433
3434  for( size_t k=kbegin; k<kend; ++k ) {
3435  const IntrinsicType a1( set( A(i,k) ) );
3436  xmm1 = xmm1 + a1 * B.load(k,j );
3437  xmm2 = xmm2 + a1 * B.load(k,j1);
3438  xmm3 = xmm3 + a1 * B.load(k,j2);
3439  xmm4 = xmm4 + a1 * B.load(k,j3);
3440  }
3441
3442  (~C).store( i, j , xmm1 );
3443  (~C).store( i, j1, xmm2 );
3444  (~C).store( i, j2, xmm3 );
3445  (~C).store( i, j3, xmm4 );
3446  }
3447  }
3448
// Strip of two packed vectors per row (columns j, j1).
3449  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
3450  {
3451  const size_t j1( j+IT::size );
3452
3453  size_t i( ii );
3454
// Four rows at a time: 4 rows x 2 vectors = 8 accumulators.
3455  for( ; (i+4UL) <= iend; i+=4UL )
3456  {
3457  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3458  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3459  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3460  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
3461
3462  IntrinsicType xmm1( (~C).load(i ,j ) );
3463  IntrinsicType xmm2( (~C).load(i ,j1) );
3464  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3465  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3466  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
3467  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
3468  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
3469  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
3470
3471  for( size_t k=kbegin; k<kend; ++k ) {
3472  const IntrinsicType a1( set( A(i ,k) ) );
3473  const IntrinsicType a2( set( A(i+1UL,k) ) );
3474  const IntrinsicType a3( set( A(i+2UL,k) ) );
3475  const IntrinsicType a4( set( A(i+3UL,k) ) );
3476  const IntrinsicType b1( B.load(k,j ) );
3477  const IntrinsicType b2( B.load(k,j1) );
3478  xmm1 = xmm1 + a1 * b1;
3479  xmm2 = xmm2 + a1 * b2;
3480  xmm3 = xmm3 + a2 * b1;
3481  xmm4 = xmm4 + a2 * b2;
3482  xmm5 = xmm5 + a3 * b1;
3483  xmm6 = xmm6 + a3 * b2;
3484  xmm7 = xmm7 + a4 * b1;
3485  xmm8 = xmm8 + a4 * b2;
3486  }
3487
3488  (~C).store( i , j , xmm1 );
3489  (~C).store( i , j1, xmm2 );
3490  (~C).store( i+1UL, j , xmm3 );
3491  (~C).store( i+1UL, j1, xmm4 );
3492  (~C).store( i+2UL, j , xmm5 );
3493  (~C).store( i+2UL, j1, xmm6 );
3494  (~C).store( i+3UL, j , xmm7 );
3495  (~C).store( i+3UL, j1, xmm8 );
3496  }
3497
// Two rows at a time for what the four-row loop left over.
3498  for( ; (i+2UL) <= iend; i+=2UL )
3499  {
3500  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3501  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3502  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3503  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
3504
3505  IntrinsicType xmm1( (~C).load(i ,j ) );
3506  IntrinsicType xmm2( (~C).load(i ,j1) );
3507  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3508  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3509
3510  for( size_t k=kbegin; k<kend; ++k ) {
3511  const IntrinsicType a1( set( A(i ,k) ) );
3512  const IntrinsicType a2( set( A(i+1UL,k) ) );
3513  const IntrinsicType b1( B.load(k,j ) );
3514  const IntrinsicType b2( B.load(k,j1) );
3515  xmm1 = xmm1 + a1 * b1;
3516  xmm2 = xmm2 + a1 * b2;
3517  xmm3 = xmm3 + a2 * b1;
3518  xmm4 = xmm4 + a2 * b2;
3519  }
3520
3521  (~C).store( i , j , xmm1 );
3522  (~C).store( i , j1, xmm2 );
3523  (~C).store( i+1UL, j , xmm3 );
3524  (~C).store( i+1UL, j1, xmm4 );
3525  }
3526
// Single leftover row of the row block.
3527  if( i < iend )
3528  {
3529  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3530  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3531  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3532  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
3533
3534  IntrinsicType xmm1( (~C).load(i,j ) );
3535  IntrinsicType xmm2( (~C).load(i,j1) );
3536
3537  for( size_t k=kbegin; k<kend; ++k ) {
3538  const IntrinsicType a1( set( A(i,k) ) );
3539  xmm1 = xmm1 + a1 * B.load(k,j );
3540  xmm2 = xmm2 + a1 * B.load(k,j1);
3541  }
3542
3543  (~C).store( i, j , xmm1 );
3544  (~C).store( i, j1, xmm2 );
3545  }
3546  }
3547
// Strip of a single packed vector per row.
3548  for( ; j<jpos; j+=IT::size )
3549  {
3550  for( size_t i=ii; i<iend; ++i )
3551  {
3552  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3553  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3554  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3555  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
3556
3557  IntrinsicType xmm1( (~C).load(i,j) );
3558
3559  for( size_t k=kbegin; k<kend; ++k ) {
3560  const IntrinsicType a1( set( A(i,k) ) );
3561  xmm1 = xmm1 + a1 * B.load(k,j);
3562  }
3563
3564  (~C).store( i, j, xmm1 );
3565  }
3566  }
3567
// Element-wise tail for the columns [jpos,jend); only entered when 'remainder' is set.
3568  for( ; remainder && j<jend; ++j )
3569  {
3570  for( size_t i=ii; i<iend; ++i )
3571  {
3572  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3573  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3574  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3575  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
3576
3577  ElementType value( (~C)(i,j) );
3578
3579  for( size_t k=kbegin; k<kend; ++k ) {
3580  value += A(i,k) * B(k,j);
3581  }
3582
3583  (~C)(i,j) = value;
3584  }
3585  }
3586  }
3587  }
3588  }
3589  }
3591  //**********************************************************************************************
3592 
3593  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
// Vectorized, blocked kernel for the addition assignment C += A * B where the target C is a
// column-major dense matrix (DenseMatrix<MT3,true>). The overload is guarded by
// EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >.
//
//  C  target matrix; every processed element is loaded, accumulated into and stored back
//  A  left-hand side operand, read as packed vectors via A.load(i,k)
//  B  right-hand side operand, read element-wise via B(k,j)
//
// The iteration space is tiled by the TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE constants. Inside
// one tile the rows are processed in strips of 4, 2 and 1 packed vectors (IT::size elements
// each), followed by an element-wise loop for the rows that do not fill a whole vector.
// For every strip the k-range is clipped to the current k-block [kk,ktmp) and additionally
// narrowed according to the IsUpper/IsLower traits of A and B.
3608  template< typename MT3 // Type of the left-hand side target matrix
3609  , typename MT4 // Type of the left-hand side matrix operand
3610  , typename MT5 > // Type of the right-hand side matrix operand
3611  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3612  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3613  {
3614  typedef IntrinsicTrait<ElementType> IT;
3615
3616  const size_t M( A.rows() );
3617  const size_t N( B.columns() );
3618  const size_t K( A.columns() );
3619
// An element-wise tail loop is only needed if C or A is not padded.
3620  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3621
// Outermost tiling: blocks of rows [ii,iend).
3622  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
3623  {
3624  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
3625
// ipos: end of the vectorized row range. With a remainder it is iend rounded down to a
// multiple of IT::size (checked by the assertion below); otherwise the whole block is vectorized.
3626  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3627  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
3628
// Blocks of columns [jj,jend).
3629  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
3630  {
3631  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
3632
// Blocks of the summation index [kk,ktmp).
3633  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
3634  {
3635  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
3636
3637  size_t i( ii );
3638
// Strip of four packed vectors per column (rows i, i1, i2, i3).
3639  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
3640  {
3641  const size_t i1( i+IT::size );
3642  const size_t i2( i+IT::size*2UL );
3643  const size_t i3( i+IT::size*3UL );
3644
3645  size_t j( jj );
3646
// Two columns at a time: 4 vectors x 2 columns = 8 accumulators.
3647  for( ; (j+2UL) <= jend; j+=2UL )
3648  {
3649  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3650  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3651  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
3652  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3653
// Start from the current content of C so that the product is added to it.
3654  IntrinsicType xmm1( (~C).load(i ,j ) );
3655  IntrinsicType xmm2( (~C).load(i1,j ) );
3656  IntrinsicType xmm3( (~C).load(i2,j ) );
3657  IntrinsicType xmm4( (~C).load(i3,j ) );
3658  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3659  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
3660  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
3661  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
3662
// NOTE(review): set() presumably broadcasts the scalar B(k,j) to all vector lanes - confirm.
3663  for( size_t k=kbegin; k<kend; ++k ) {
3664  const IntrinsicType a1( A.load(i ,k) );
3665  const IntrinsicType a2( A.load(i1,k) );
3666  const IntrinsicType a3( A.load(i2,k) );
3667  const IntrinsicType a4( A.load(i3,k) );
3668  const IntrinsicType b1( set( B(k,j ) ) );
3669  const IntrinsicType b2( set( B(k,j+1UL) ) );
3670  xmm1 = xmm1 + a1 * b1;
3671  xmm2 = xmm2 + a2 * b1;
3672  xmm3 = xmm3 + a3 * b1;
3673  xmm4 = xmm4 + a4 * b1;
3674  xmm5 = xmm5 + a1 * b2;
3675  xmm6 = xmm6 + a2 * b2;
3676  xmm7 = xmm7 + a3 * b2;
3677  xmm8 = xmm8 + a4 * b2;
3678  }
3679
3680  (~C).store( i , j , xmm1 );
3681  (~C).store( i1, j , xmm2 );
3682  (~C).store( i2, j , xmm3 );
3683  (~C).store( i3, j , xmm4 );
3684  (~C).store( i , j+1UL, xmm5 );
3685  (~C).store( i1, j+1UL, xmm6 );
3686  (~C).store( i2, j+1UL, xmm7 );
3687  (~C).store( i3, j+1UL, xmm8 );
3688  }
3689
// Single leftover column of the column block.
3690  if( j < jend )
3691  {
3692  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3693  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3694  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
3695  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3696
3697  IntrinsicType xmm1( (~C).load(i ,j) );
3698  IntrinsicType xmm2( (~C).load(i1,j) );
3699  IntrinsicType xmm3( (~C).load(i2,j) );
3700  IntrinsicType xmm4( (~C).load(i3,j) );
3701
3702  for( size_t k=kbegin; k<kend; ++k ) {
3703  const IntrinsicType b1( set( B(k,j) ) );
3704  xmm1 = xmm1 + A.load(i ,k) * b1;
3705  xmm2 = xmm2 + A.load(i1,k) * b1;
3706  xmm3 = xmm3 + A.load(i2,k) * b1;
3707  xmm4 = xmm4 + A.load(i3,k) * b1;
3708  }
3709
3710  (~C).store( i , j, xmm1 );
3711  (~C).store( i1, j, xmm2 );
3712  (~C).store( i2, j, xmm3 );
3713  (~C).store( i3, j, xmm4 );
3714  }
3715  }
3716
// Strip of two packed vectors per column (rows i, i1).
3717  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
3718  {
3719  const size_t i1( i+IT::size );
3720
3721  size_t j( jj );
3722
// Four columns at a time: 2 vectors x 4 columns = 8 accumulators.
3723  for( ; (j+4UL) <= jend; j+=4UL )
3724  {
3725  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3726  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3727  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
3728  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3729
3730  IntrinsicType xmm1( (~C).load(i ,j ) );
3731  IntrinsicType xmm2( (~C).load(i1,j ) );
3732  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3733  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3734  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
3735  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
3736  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
3737  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
3738
3739  for( size_t k=kbegin; k<kend; ++k ) {
3740  const IntrinsicType a1( A.load(i ,k) );
3741  const IntrinsicType a2( A.load(i1,k) );
3742  const IntrinsicType b1( set( B(k,j ) ) );
3743  const IntrinsicType b2( set( B(k,j+1UL) ) );
3744  const IntrinsicType b3( set( B(k,j+2UL) ) );
3745  const IntrinsicType b4( set( B(k,j+3UL) ) );
3746  xmm1 = xmm1 + a1 * b1;
3747  xmm2 = xmm2 + a2 * b1;
3748  xmm3 = xmm3 + a1 * b2;
3749  xmm4 = xmm4 + a2 * b2;
3750  xmm5 = xmm5 + a1 * b3;
3751  xmm6 = xmm6 + a2 * b3;
3752  xmm7 = xmm7 + a1 * b4;
3753  xmm8 = xmm8 + a2 * b4;
3754  }
3755
3756  (~C).store( i , j , xmm1 );
3757  (~C).store( i1, j , xmm2 );
3758  (~C).store( i , j+1UL, xmm3 );
3759  (~C).store( i1, j+1UL, xmm4 );
3760  (~C).store( i , j+2UL, xmm5 );
3761  (~C).store( i1, j+2UL, xmm6 );
3762  (~C).store( i , j+3UL, xmm7 );
3763  (~C).store( i1, j+3UL, xmm8 );
3764  }
3765
// Two columns at a time for what the four-column loop left over.
3766  for( ; (j+2UL) <= jend; j+=2UL )
3767  {
3768  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3769  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3770  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
3771  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3772
3773  IntrinsicType xmm1( (~C).load(i ,j ) );
3774  IntrinsicType xmm2( (~C).load(i1,j ) );
3775  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3776  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3777
3778  for( size_t k=kbegin; k<kend; ++k ) {
3779  const IntrinsicType a1( A.load(i ,k) );
3780  const IntrinsicType a2( A.load(i1,k) );
3781  const IntrinsicType b1( set( B(k,j ) ) );
3782  const IntrinsicType b2( set( B(k,j+1UL) ) );
3783  xmm1 = xmm1 + a1 * b1;
3784  xmm2 = xmm2 + a2 * b1;
3785  xmm3 = xmm3 + a1 * b2;
3786  xmm4 = xmm4 + a2 * b2;
3787  }
3788
3789  (~C).store( i , j , xmm1 );
3790  (~C).store( i1, j , xmm2 );
3791  (~C).store( i , j+1UL, xmm3 );
3792  (~C).store( i1, j+1UL, xmm4 );
3793  }
3794
// Single leftover column of the column block.
3795  if( j < jend )
3796  {
3797  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3798  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3799  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
3800  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3801
3802  IntrinsicType xmm1( (~C).load(i ,j) );
3803  IntrinsicType xmm2( (~C).load(i1,j) );
3804
3805  for( size_t k=kbegin; k<kend; ++k ) {
3806  const IntrinsicType b1( set( B(k,j) ) );
3807  xmm1 = xmm1 + A.load(i ,k) * b1;
3808  xmm2 = xmm2 + A.load(i1,k) * b1;
3809  }
3810
3811  (~C).store( i , j, xmm1 );
3812  (~C).store( i1, j, xmm2 );
3813  }
3814  }
3815
// Strip of a single packed vector per column.
3816  for( ; i<ipos; i+=IT::size )
3817  {
3818  for( size_t j=jj; j<jend; ++j )
3819  {
3820  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3821  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3822  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
3823  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3824
3825  IntrinsicType xmm1( (~C).load(i,j) );
3826
3827  for( size_t k=kbegin; k<kend; ++k ) {
3828  const IntrinsicType b1( set( B(k,j) ) );
3829  xmm1 = xmm1 + A.load(i,k) * b1;
3830  }
3831
3832  (~C).store( i, j, xmm1 );
3833  }
3834  }
3835
// Element-wise tail for the rows [ipos,iend); only entered when 'remainder' is set.
3836  for( ; remainder && i<iend; ++i )
3837  {
3838  for( size_t j=jj; j<jend; ++j )
3839  {
3840  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3841  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3842  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
3843  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3844
3845  ElementType value( (~C)(i,j) );
3846
3847  for( size_t k=kbegin; k<kend; ++k ) {
3848  value += A(i,k) * B(k,j);
3849  }
3850
3851  (~C)(i,j) = value;
3852  }
3853  }
3854  }
3855  }
3856  }
3857  }
3859  //**********************************************************************************************
3860 
3861  //**BLAS-based addition assignment to dense matrices (default)**********************************
3875  template< typename MT3 // Type of the left-hand side target matrix
3876  , typename MT4 // Type of the left-hand side matrix operand
3877  , typename MT5 > // Type of the right-hand side matrix operand
3878  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3879  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3880  {
3881  selectLargeAddAssignKernel( C, A, B );
3882  }
3884  //**********************************************************************************************
3885 
3886  //**BLAS-based addition assignment to dense matrices********************************************
3887 #if BLAZE_BLAS_MODE
3888 
3901  template< typename MT3 // Type of the left-hand side target matrix
3902  , typename MT4 // Type of the left-hand side matrix operand
3903  , typename MT5 > // Type of the right-hand side matrix operand
3904  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3905  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3906  {
3907  typedef typename MT3::ElementType ET;
3908 
3909  if( IsTriangular<MT4>::value ) {
3910  typename MT3::ResultType tmp( serial( B ) );
3911  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3912  addAssign( C, tmp );
3913  }
3914  else if( IsTriangular<MT5>::value ) {
3915  typename MT3::ResultType tmp( serial( A ) );
3916  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3917  addAssign( C, tmp );
3918  }
3919  else {
3920  gemm( C, A, B, ET(1), ET(1) );
3921  }
3922  }
3924 #endif
3925  //**********************************************************************************************
3926 
3927  //**Addition assignment to sparse matrices******************************************************
3928  // No special implementation for the addition assignment to sparse matrices.
3929  //**********************************************************************************************
3930 
3931  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a transpose dense matrix / dense matrix product to a dense matrix.
//
//  lhs  target dense matrix (any storage order SO) that is updated
//  rhs  product expression whose operands are rhs.lhs_ and rhs.rhs_
//
// The function returns without touching lhs if lhs has no rows or no columns, or if the left
// operand of the product has no columns. Otherwise both operands are evaluated via serial()
// into the operand types LT and RT (declared elsewhere in this class) and the work is passed
// on to selectSubAssignKernel().
3944  template< typename MT // Type of the target dense matrix
3945  , bool SO > // Storage order of the target dense matrix
3946  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
3947  {
3949
// The target must already have the dimensions of the product.
3950  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3951  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3952
// Nothing to subtract for an empty target or an empty summation dimension.
3953  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3954  return;
3955  }
3956
3957  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3958  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3959
// The evaluated operands must keep the dimensions of the original operands and fit the target.
3960  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3961  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3962  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3963  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3964  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3965  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3966
// Qualified call: this is a friend function, so the static kernel is named through the class.
3967  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3968  }
3970  //**********************************************************************************************
3971 
3972  //**Subtraction assignment to dense matrices (kernel selection)*********************************
3983  template< typename MT3 // Type of the left-hand side target matrix
3984  , typename MT4 // Type of the left-hand side matrix operand
3985  , typename MT5 > // Type of the right-hand side matrix operand
3986  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3987  {
3988  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
3989  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
3990  selectSmallSubAssignKernel( C, A, B );
3991  else
3992  selectBlasSubAssignKernel( C, A, B );
3993  }
3995  //**********************************************************************************************
3996 
3997  //**Default subtraction assignment to row-major dense matrices (general/general)****************
4011  template< typename MT3 // Type of the left-hand side target matrix
4012  , typename MT4 // Type of the left-hand side matrix operand
4013  , typename MT5 > // Type of the right-hand side matrix operand
4014  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4015  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4016  {
4017  const size_t M( A.rows() );
4018  const size_t N( B.columns() );
4019  const size_t K( A.columns() );
4020 
4021  for( size_t i=0UL; i<M; ++i )
4022  {
4023  const size_t kbegin( ( IsUpper<MT4>::value )
4024  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4025  :( 0UL ) );
4026  const size_t kend( ( IsLower<MT4>::value )
4027  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4028  :( K ) );
4029  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4030 
4031  for( size_t k=kbegin; k<kend; ++k )
4032  {
4033  const size_t jbegin( ( IsUpper<MT5>::value )
4034  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4035  :( 0UL ) );
4036  const size_t jend( ( IsLower<MT5>::value )
4037  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
4038  :( N ) );
4039  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4040 
4041  const size_t jnum( jend - jbegin );
4042  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4043 
4044  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4045  (~C)(i,j ) -= A(i,k) * B(k,j );
4046  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
4047  }
4048  if( jpos < jend ) {
4049  (~C)(i,jpos) -= A(i,k) * B(k,jpos);
4050  }
4051  }
4052  }
4053  }
4055  //**********************************************************************************************
4056 
4057  //**Default subtraction assignment to column-major dense matrices (general/general)*************
      /*!\brief Default subtraction assignment kernel for column-major targets (general/general).
      //
      // \param C The column-major target matrix (DenseMatrix<MT3,true>), updated in place.
      // \param A The left-hand side matrix operand.
      // \param B The right-hand side matrix operand.
      // \return void
      //
      // Selected via EnableIf only when neither MT4 nor MT5 satisfies IsDiagonal. For every column
      // j of B and every admissible k, the product A(i,k) * B(k,j) is subtracted from C(i,j). The
      // k range is derived from the IsLower/IsUpper traits of MT5 and the i range from the
      // IsLower/IsUpper traits of MT4 (presumably to skip elements these traits mark as
      // structurally zero -- confirm against the trait definitions). The innermost loop over the
      // rows is unrolled by a factor of two.
      */
4071  template< typename MT3 // Type of the left-hand side target matrix
4072  , typename MT4 // Type of the left-hand side matrix operand
4073  , typename MT5 > // Type of the right-hand side matrix operand
4074  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4075  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4076  {
4077  const size_t M( A.rows() );
4078  const size_t N( B.columns() );
4079  const size_t K( A.columns() );
4080 
4081  for( size_t j=0UL; j<N; ++j )
4082  {
      // Range of k that contributes to column j, narrowed by the structure traits of B's type.
4083  const size_t kbegin( ( IsLower<MT5>::value )
4084  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4085  :( 0UL ) );
4086  const size_t kend( ( IsUpper<MT5>::value )
4087  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4088  :( K ) );
4089  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4090 
4091  for( size_t k=kbegin; k<kend; ++k )
4092  {
      // Range of rows i that is visited for column k of A, narrowed by the structure traits of
      // A's type.
4093  const size_t ibegin( ( IsLower<MT4>::value )
4094  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4095  :( 0UL ) );
4096  const size_t iend( ( IsUpper<MT4>::value )
4097  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
4098  :( M ) );
4099  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4100 
      // 'inum & size_t(-2)' clears the lowest bit of the range length, so [ibegin,ipos) is the
      // largest even-sized prefix of [ibegin,iend) and can be processed two rows at a time.
4101  const size_t inum( iend - ibegin );
4102  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4103 
4104  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4105  (~C)(i ,j) -= A(i ,k) * B(k,j);
4106  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
4107  }
      // Single leftover row in case the range length is odd.
4108  if( ipos < iend ) {
4109  (~C)(ipos,j) -= A(ipos,k) * B(k,j);
4110  }
4111  }
4112  }
4113  }
4115  //**********************************************************************************************
4116 
4117  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
      /*!\brief Default subtraction assignment kernel for row-major targets (general/diagonal).
      //
      // \param C The row-major target matrix (DenseMatrix<MT3,false>), updated in place.
      // \param A The left-hand side matrix operand.
      // \param B The right-hand side matrix operand; only its elements B(j,j) are read.
      // \return void
      //
      // Selected via EnableIf when MT5 satisfies IsDiagonal and MT4 does not. The kernel subtracts
      // A(i,j) * B(j,j) from C(i,j). The traversal is tiled: rows and columns are visited in
      // square tiles of BLOCK_SIZE x BLOCK_SIZE elements. Within a tile the column range of each
      // row is additionally narrowed by the IsLower/IsUpper traits of MT4.
      */
4131  template< typename MT3 // Type of the left-hand side target matrix
4132  , typename MT4 // Type of the left-hand side matrix operand
4133  , typename MT5 > // Type of the right-hand side matrix operand
4134  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4135  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4136  {
4137  const size_t M( A.rows() );
4138  const size_t N( B.columns() );
4139 
4140  const size_t block( BLOCK_SIZE );
4141 
      // Outer two loops select the tile [ii,iend) x [jj,jend); the last tile in each direction is
      // clipped to the matrix size via min().
4142  for( size_t ii=0UL; ii<M; ii+=block ) {
4143  const size_t iend( min( M, ii+block ) );
4144  for( size_t jj=0UL; jj<N; jj+=block ) {
4145  const size_t jend( min( N, jj+block ) );
4146  for( size_t i=ii; i<iend; ++i )
4147  {
      // Column range of row i inside the current tile. If the trait-based bounds leave no
      // column inside the tile, jbegin >= jpos and the loop below is skipped.
4148  const size_t jbegin( ( IsUpper<MT4>::value )
4149  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
4150  :( jj ) );
4151  const size_t jpos( ( IsLower<MT4>::value )
4152  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
4153  :( jend ) );
4154 
4155  for( size_t j=jbegin; j<jpos; ++j ) {
4156  (~C)(i,j) -= A(i,j) * B(j,j);
4157  }
4158  }
4159  }
4160  }
4161  }
4163  //**********************************************************************************************
4164 
4165  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
      /*!\brief Default subtraction assignment kernel for column-major targets (general/diagonal).
      //
      // \param C The column-major target matrix (DenseMatrix<MT3,true>), updated in place.
      // \param A The left-hand side matrix operand.
      // \param B The right-hand side matrix operand; only its elements B(j,j) are read.
      // \return void
      //
      // Selected via EnableIf when MT5 satisfies IsDiagonal and MT4 does not. For each column j
      // the kernel subtracts A(i,j) * B(j,j) from C(i,j) for all rows i in [ibegin,iend), where
      // the row range is narrowed by the IsLower/IsUpper traits of MT4. The row loop is unrolled
      // by a factor of two.
      */
4179  template< typename MT3 // Type of the left-hand side target matrix
4180  , typename MT4 // Type of the left-hand side matrix operand
4181  , typename MT5 > // Type of the right-hand side matrix operand
4182  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4183  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4184  {
4185  const size_t M( A.rows() );
4186  const size_t N( B.columns() );
4187 
4188  for( size_t j=0UL; j<N; ++j )
4189  {
      // Row range of column j, narrowed by the structure traits of A's type.
4190  const size_t ibegin( ( IsLower<MT4>::value )
4191  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4192  :( 0UL ) );
4193  const size_t iend( ( IsUpper<MT4>::value )
4194  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
4195  :( M ) );
4196  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4197 
      // 'inum & size_t(-2)' rounds the range length down to an even number, so the loop below
      // can safely touch rows i and i+1UL.
4198  const size_t inum( iend - ibegin );
4199  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4200 
4201  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4202  (~C)(i ,j) -= A(i ,j) * B(j,j);
4203  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
4204  }
      // Single leftover row in case the range length is odd.
4205  if( ipos < iend ) {
4206  (~C)(ipos,j) -= A(ipos,j) * B(j,j);
4207  }
4208  }
4209  }
4211  //**********************************************************************************************
4212 
4213  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
      /*!\brief Default subtraction assignment kernel for row-major targets (diagonal/general).
      //
      // \param C The row-major target matrix (DenseMatrix<MT3,false>), updated in place.
      // \param A The left-hand side matrix operand; only its elements A(i,i) are read.
      // \param B The right-hand side matrix operand.
      // \return void
      //
      // Selected via EnableIf when MT4 satisfies IsDiagonal and MT5 does not. For each row i the
      // kernel subtracts A(i,i) * B(i,j) from C(i,j) for all columns j in [jbegin,jend), where
      // the column range is narrowed by the IsLower/IsUpper traits of MT5. The column loop is
      // unrolled by a factor of two.
      */
4227  template< typename MT3 // Type of the left-hand side target matrix
4228  , typename MT4 // Type of the left-hand side matrix operand
4229  , typename MT5 > // Type of the right-hand side matrix operand
4230  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4231  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4232  {
4233  const size_t M( A.rows() );
4234  const size_t N( B.columns() );
4235 
4236  for( size_t i=0UL; i<M; ++i )
4237  {
      // Column range of row i, narrowed by the structure traits of B's type.
4238  const size_t jbegin( ( IsUpper<MT5>::value )
4239  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4240  :( 0UL ) );
4241  const size_t jend( ( IsLower<MT5>::value )
4242  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
4243  :( N ) );
4244  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4245 
      // 'jnum & size_t(-2)' rounds the range length down to an even number, so the loop below
      // can safely touch columns j and j+1UL.
4246  const size_t jnum( jend - jbegin );
4247  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4248 
4249  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4250  (~C)(i,j ) -= A(i,i) * B(i,j );
4251  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
4252  }
      // Single leftover column in case the range length is odd.
4253  if( jpos < jend ) {
4254  (~C)(i,jpos) -= A(i,i) * B(i,jpos);
4255  }
4256  }
4257  }
4259  //**********************************************************************************************
4260 
4261  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
      /*!\brief Default subtraction assignment kernel for column-major targets (diagonal/general).
      //
      // \param C The column-major target matrix (DenseMatrix<MT3,true>), updated in place.
      // \param A The left-hand side matrix operand; only its elements A(i,i) are read.
      // \param B The right-hand side matrix operand.
      // \return void
      //
      // Selected via EnableIf when MT4 satisfies IsDiagonal and MT5 does not. The kernel subtracts
      // A(i,i) * B(i,j) from C(i,j). The traversal is tiled: columns and rows are visited in
      // square tiles of BLOCK_SIZE x BLOCK_SIZE elements. Within a tile the row range of each
      // column is additionally narrowed by the IsLower/IsUpper traits of MT5.
      */
4275  template< typename MT3 // Type of the left-hand side target matrix
4276  , typename MT4 // Type of the left-hand side matrix operand
4277  , typename MT5 > // Type of the right-hand side matrix operand
4278  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4279  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4280  {
4281  const size_t M( A.rows() );
4282  const size_t N( B.columns() );
4283 
4284  const size_t block( BLOCK_SIZE );
4285 
      // Outer two loops select the tile [ii,iend) x [jj,jend); the last tile in each direction is
      // clipped to the matrix size via min().
4286  for( size_t jj=0UL; jj<N; jj+=block ) {
4287  const size_t jend( min( N, jj+block ) );
4288  for( size_t ii=0UL; ii<M; ii+=block ) {
4289  const size_t iend( min( M, ii+block ) );
4290  for( size_t j=jj; j<jend; ++j )
4291  {
      // Row range of column j inside the current tile. If the trait-based bounds leave no row
      // inside the tile, ibegin >= ipos and the loop below is skipped.
4292  const size_t ibegin( ( IsLower<MT5>::value )
4293  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
4294  :( ii ) );
4295  const size_t ipos( ( IsUpper<MT5>::value )
4296  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
4297  :( iend ) );
4298 
4299  for( size_t i=ibegin; i<ipos; ++i ) {
4300  (~C)(i,j) -= A(i,i) * B(i,j);
4301  }
4302  }
4303  }
4304  }
4305  }
4307  //**********************************************************************************************
4308 
4309  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
      /*!\brief Default subtraction assignment kernel for two diagonal operands.
      //
      // \param C The target matrix, updated in place; only the elements C(i,i) are modified.
      // \param A The left-hand side matrix operand; only its elements A(i,i) are read.
      // \param B The right-hand side matrix operand; only its elements B(i,i) are read.
      // \return void
      //
      // Selected via EnableIf when both MT4 and MT5 satisfy IsDiagonal. The target is taken as a
      // plain MT3 reference, so this single overload serves every target type that provides the
      // two-index function call operator. For every i below A.rows() the product A(i,i) * B(i,i)
      // is subtracted from C(i,i).
      */
4323  template< typename MT3 // Type of the left-hand side target matrix
4324  , typename MT4 // Type of the left-hand side matrix operand
4325  , typename MT5 > // Type of the right-hand side matrix operand
4326  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4327  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4328  {
4329  for( size_t i=0UL; i<A.rows(); ++i ) {
4330  C(i,i) -= A(i,i) * B(i,i);
4331  }
4332  }
4334  //**********************************************************************************************
4335 
4336  //**Default subtraction assignment to dense matrices (small matrices)***************************
      /*!\brief Non-vectorized subtraction assignment kernel for small matrices.
      //
      // \param C The target matrix, updated in place.
      // \param A The left-hand side matrix operand.
      // \param B The right-hand side matrix operand.
      // \return void
      //
      // This overload participates only when UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold
      // (DisableIf). It performs no work of its own and forwards all three arguments unchanged to
      // selectDefaultSubAssignKernel(), whose overload set picks the matching scalar kernel.
      */
4350  template< typename MT3 // Type of the left-hand side target matrix
4351  , typename MT4 // Type of the left-hand side matrix operand
4352  , typename MT5 > // Type of the right-hand side matrix operand
4353  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4354  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4355  {
4356  selectDefaultSubAssignKernel( C, A, B );
4357  }
4359  //**********************************************************************************************
4360 
4361  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
      /*!\brief Vectorized subtraction assignment kernel for row-major targets (small matrices).
      //
      // \param C The row-major target matrix (DenseMatrix<MT3,false>), updated in place.
      // \param A The left-hand side matrix operand.
      // \param B The right-hand side matrix operand.
      // \return void
      //
      // Selected via EnableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. The kernel walks
      // the columns of C in steps that are multiples of IT::size and works through five stages:
      //
      //  - 8*IT::size columns of one row per iteration,
      //  - 4*IT::size columns of two rows per iteration (plus a single-row tail),
      //  - 2*IT::size columns of two rows per iteration (plus a single-row tail),
      //  - IT::size columns of two rows per iteration (plus a single-row tail),
      //  - element-wise processing of the columns in [jpos,N) if C or B is not padded.
      //
      // In every vectorized stage the affected part of C is loaded into IntrinsicType
      // accumulators, for each k in [kbegin,kend) the product of set( A(i,k) ) and the loaded
      // part of row k of B is subtracted, and the accumulators are stored back to C. The k ranges
      // are narrowed through the IsLower/IsStrictlyLower/IsUpper/IsStrictlyUpper traits of MT4
      // and MT5 (presumably to skip elements these traits mark as structurally zero -- confirm
      // against the trait definitions).
      */
4376  template< typename MT3 // Type of the left-hand side target matrix
4377  , typename MT4 // Type of the left-hand side matrix operand
4378  , typename MT5 > // Type of the right-hand side matrix operand
4379  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4380  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4381  {
4382  typedef IntrinsicTrait<ElementType> IT;
4383 
4384  const size_t M( A.rows() );
4385  const size_t N( B.columns() );
4386  const size_t K( A.columns() );
4387 
      // A scalar clean-up stage is required unless both C and B are padded.
4388  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
4389 
      // With a remainder, vectorized access stops at N rounded down to a multiple of IT::size.
      // The bit mask form is cross-checked against the modulo form by the assertion below.
4390  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
4391  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
4392 
4393  size_t j( 0UL );
4394 
      // Stage 1: 8*IT::size columns of a single row per iteration.
4395  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
4396  for( size_t i=0UL; i<M; ++i )
4397  {
      // k range for row i and the column strip [j,j+IT::size*8UL).
4398  const size_t kbegin( ( IsUpper<MT4>::value )
4399  ?( ( IsLower<MT5>::value )
4400  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4401  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4402  :( IsLower<MT5>::value ? j : 0UL ) );
4403  const size_t kend( ( IsLower<MT4>::value )
4404  ?( ( IsUpper<MT5>::value )
4405  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
4406  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4407  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
4408 
      // Load the current values of C; the products are subtracted from these accumulators.
4409  IntrinsicType xmm1( (~C).load(i,j ) );
4410  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
4411  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
4412  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
4413  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
4414  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
4415  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
4416  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
4417 
4418  for( size_t k=kbegin; k<kend; ++k ) {
4419  const IntrinsicType a1( set( A(i,k) ) );
4420  xmm1 = xmm1 - a1 * B.load(k,j );
4421  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
4422  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
4423  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
4424  xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
4425  xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
4426  xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
4427  xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
4428  }
4429 
4430  (~C).store( i, j , xmm1 );
4431  (~C).store( i, j+IT::size , xmm2 );
4432  (~C).store( i, j+IT::size*2UL, xmm3 );
4433  (~C).store( i, j+IT::size*3UL, xmm4 );
4434  (~C).store( i, j+IT::size*4UL, xmm5 );
4435  (~C).store( i, j+IT::size*5UL, xmm6 );
4436  (~C).store( i, j+IT::size*6UL, xmm7 );
4437  (~C).store( i, j+IT::size*7UL, xmm8 );
4438  }
4439  }
4440 
      // Stage 2: 4*IT::size columns of two rows per iteration.
4441  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
4442  {
4443  size_t i( 0UL );
4444 
4445  for( ; (i+2UL) <= M; i+=2UL )
4446  {
      // k range shared by rows i and i+1UL; the IsLower<MT4> branch of kend uses i+1UL/i+2UL so
      // that the bound also covers row i+1UL.
4447  const size_t kbegin( ( IsUpper<MT4>::value )
4448  ?( ( IsLower<MT5>::value )
4449  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4450  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4451  :( IsLower<MT5>::value ? j : 0UL ) );
4452  const size_t kend( ( IsLower<MT4>::value )
4453  ?( ( IsUpper<MT5>::value )
4454  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
4455  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4456  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
4457 
      // xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1UL.
4458  IntrinsicType xmm1( (~C).load(i ,j ) );
4459  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
4460  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
4461  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
4462  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
4463  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
4464  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
4465  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
4466 
4467  for( size_t k=kbegin; k<kend; ++k ) {
      // Each part of row k of B is loaded once and used for both rows of A.
4468  const IntrinsicType a1( set( A(i ,k) ) );
4469  const IntrinsicType a2( set( A(i+1UL,k) ) );
4470  const IntrinsicType b1( B.load(k,j ) );
4471  const IntrinsicType b2( B.load(k,j+IT::size ) );
4472  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
4473  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
4474  xmm1 = xmm1 - a1 * b1;
4475  xmm2 = xmm2 - a1 * b2;
4476  xmm3 = xmm3 - a1 * b3;
4477  xmm4 = xmm4 - a1 * b4;
4478  xmm5 = xmm5 - a2 * b1;
4479  xmm6 = xmm6 - a2 * b2;
4480  xmm7 = xmm7 - a2 * b3;
4481  xmm8 = xmm8 - a2 * b4;
4482  }
4483 
4484  (~C).store( i , j , xmm1 );
4485  (~C).store( i , j+IT::size , xmm2 );
4486  (~C).store( i , j+IT::size*2UL, xmm3 );
4487  (~C).store( i , j+IT::size*3UL, xmm4 );
4488  (~C).store( i+1UL, j , xmm5 );
4489  (~C).store( i+1UL, j+IT::size , xmm6 );
4490  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
4491  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
4492  }
4493 
      // Tail of stage 2: the one row that is left over when M is odd. Note that kend is not
      // narrowed by the IsLower<MT4> trait here.
4494  if( i < M )
4495  {
4496  const size_t kbegin( ( IsUpper<MT4>::value )
4497  ?( ( IsLower<MT5>::value )
4498  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4499  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4500  :( IsLower<MT5>::value ? j : 0UL ) );
4501  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
4502 
4503  IntrinsicType xmm1( (~C).load(i,j ) );
4504  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
4505  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
4506  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
4507 
4508  for( size_t k=kbegin; k<kend; ++k ) {
4509  const IntrinsicType a1( set( A(i,k) ) );
4510  xmm1 = xmm1 - a1 * B.load(k,j );
4511  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
4512  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
4513  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
4514  }
4515 
4516  (~C).store( i, j , xmm1 );
4517  (~C).store( i, j+IT::size , xmm2 );
4518  (~C).store( i, j+IT::size*2UL, xmm3 );
4519  (~C).store( i, j+IT::size*3UL, xmm4 );
4520  }
4521  }
4522 
      // Stage 3: 2*IT::size columns of two rows per iteration.
4523  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
4524  {
4525  size_t i( 0UL );
4526 
4527  for( ; (i+2UL) <= M; i+=2UL )
4528  {
4529  const size_t kbegin( ( IsUpper<MT4>::value )
4530  ?( ( IsLower<MT5>::value )
4531  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4532  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4533  :( IsLower<MT5>::value ? j : 0UL ) );
4534  const size_t kend( ( IsLower<MT4>::value )
4535  ?( ( IsUpper<MT5>::value )
4536  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
4537  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4538  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
4539 
      // xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1UL.
4540  IntrinsicType xmm1( (~C).load(i ,j ) );
4541  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
4542  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
4543  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
4544 
4545  for( size_t k=kbegin; k<kend; ++k ) {
4546  const IntrinsicType a1( set( A(i ,k) ) );
4547  const IntrinsicType a2( set( A(i+1UL,k) ) );
4548  const IntrinsicType b1( B.load(k,j ) );
4549  const IntrinsicType b2( B.load(k,j+IT::size) );
4550  xmm1 = xmm1 - a1 * b1;
4551  xmm2 = xmm2 - a1 * b2;
4552  xmm3 = xmm3 - a2 * b1;
4553  xmm4 = xmm4 - a2 * b2;
4554  }
4555 
4556  (~C).store( i , j , xmm1 );
4557  (~C).store( i , j+IT::size, xmm2 );
4558  (~C).store( i+1UL, j , xmm3 );
4559  (~C).store( i+1UL, j+IT::size, xmm4 );
4560  }
4561 
      // Tail of stage 3: the one row that is left over when M is odd.
4562  if( i < M )
4563  {
4564  const size_t kbegin( ( IsUpper<MT4>::value )
4565  ?( ( IsLower<MT5>::value )
4566  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4567  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4568  :( IsLower<MT5>::value ? j : 0UL ) );
4569  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
4570 
4571  IntrinsicType xmm1( (~C).load(i,j ) );
4572  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
4573 
4574  for( size_t k=kbegin; k<kend; ++k ) {
4575  const IntrinsicType a1( set( A(i,k) ) );
4576  xmm1 = xmm1 - a1 * B.load(k,j );
4577  xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
4578  }
4579 
4580  (~C).store( i, j , xmm1 );
4581  (~C).store( i, j+IT::size, xmm2 );
4582  }
4583  }
4584 
      // Stage 4: IT::size columns of two rows per iteration. In this stage kend depends only on
      // the IsLower<MT4> trait.
4585  for( ; j<jpos; j+=IT::size )
4586  {
4587  size_t i( 0UL );
4588 
4589  for( ; (i+2UL) <= M; i+=2UL )
4590  {
4591  const size_t kbegin( ( IsUpper<MT4>::value )
4592  ?( ( IsLower<MT5>::value )
4593  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4594  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4595  :( IsLower<MT5>::value ? j : 0UL ) );
4596  const size_t kend( ( IsLower<MT4>::value )
4597  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4598  :( K ) );
4599 
4600  IntrinsicType xmm1( (~C).load(i ,j) );
4601  IntrinsicType xmm2( (~C).load(i+1UL,j) );
4602 
4603  for( size_t k=kbegin; k<kend; ++k ) {
4604  const IntrinsicType b1( B.load(k,j) );
4605  xmm1 = xmm1 - set( A(i ,k) ) * b1;
4606  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
4607  }
4608 
4609  (~C).store( i , j, xmm1 );
4610  (~C).store( i+1UL, j, xmm2 );
4611  }
4612 
      // Tail of stage 4: the one row that is left over when M is odd; k runs up to K.
4613  if( i < M )
4614  {
4615  const size_t kbegin( ( IsUpper<MT4>::value )
4616  ?( ( IsLower<MT5>::value )
4617  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4618  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4619  :( IsLower<MT5>::value ? j : 0UL ) );
4620 
4621  IntrinsicType xmm1( (~C).load(i,j) );
4622 
4623  for( size_t k=kbegin; k<K; ++k ) {
4624  xmm1 = xmm1 - set( A(i,k) ) * B.load(k,j);
4625  }
4626 
4627  (~C).store( i, j, xmm1 );
4628  }
4629  }
4630 
      // Stage 5: element-wise processing of the columns in [jpos,N). Only executed if C or B is
      // not padded; uses plain ElementType accumulators instead of load()/store().
4631  for( ; remainder && j<N; ++j )
4632  {
4633  size_t i( 0UL );
4634 
4635  for( ; (i+2UL) <= M; i+=2UL )
4636  {
4637  const size_t kbegin( ( IsUpper<MT4>::value )
4638  ?( ( IsLower<MT5>::value )
4639  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4640  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4641  :( IsLower<MT5>::value ? j : 0UL ) );
4642  const size_t kend( ( IsLower<MT4>::value )
4643  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4644  :( K ) );
4645 
4646  ElementType value1( (~C)(i ,j) );
4647  ElementType value2( (~C)(i+1UL,j) );
4648 
4649  for( size_t k=kbegin; k<kend; ++k ) {
4650  value1 -= A(i ,k) * B(k,j);
4651  value2 -= A(i+1UL,k) * B(k,j);
4652  }
4653 
4654  (~C)(i ,j) = value1;
4655  (~C)(i+1UL,j) = value2;
4656  }
4657 
      // Tail of stage 5: the one row that is left over when M is odd.
4658  if( i < M )
4659  {
4660  const size_t kbegin( ( IsUpper<MT4>::value )
4661  ?( ( IsLower<MT5>::value )
4662  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4663  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4664  :( IsLower<MT5>::value ? j : 0UL ) );
4665 
4666  ElementType value( (~C)(i,j) );
4667 
4668  for( size_t k=kbegin; k<K; ++k ) {
4669  value -= A(i,k) * B(k,j);
4670  }
4671 
4672  (~C)(i,j) = value;
4673  }
4674  }
4675  }
4677  //**********************************************************************************************
4678 
4679  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
      /*!\brief Vectorized subtraction assignment kernel for column-major targets (small matrices).
      //
      // \param C The column-major target matrix (DenseMatrix<MT3,true>), updated in place.
      // \param A The left-hand side matrix operand.
      // \param B The right-hand side matrix operand.
      // \return void
      //
      // Selected via EnableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. The kernel walks
      // the rows of C in steps that are multiples of IT::size and works through five stages:
      //
      //  - 8*IT::size rows of one column per iteration,
      //  - 4*IT::size rows of two columns per iteration (plus a single-column tail),
      //  - 2*IT::size rows of two columns per iteration (plus a single-column tail),
      //  - IT::size rows of two columns per iteration (plus a single-column tail),
      //  - element-wise processing of the rows in [ipos,M) if C or A is not padded.
      //
      // In every vectorized stage the affected part of C is loaded into IntrinsicType
      // accumulators, for each k in [kbegin,kend) the product of the loaded part of column k of A
      // and set( B(k,j) ) is subtracted, and the accumulators are stored back to C. The k ranges
      // are narrowed through the IsLower/IsStrictlyLower/IsUpper/IsStrictlyUpper traits of MT4
      // and MT5 (presumably to skip elements these traits mark as structurally zero -- confirm
      // against the trait definitions).
      */
4694  template< typename MT3 // Type of the left-hand side target matrix
4695  , typename MT4 // Type of the left-hand side matrix operand
4696  , typename MT5 > // Type of the right-hand side matrix operand
4697  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4698  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4699  {
4700  typedef IntrinsicTrait<ElementType> IT;
4701 
4702  const size_t M( A.rows() );
4703  const size_t N( B.columns() );
4704  const size_t K( A.columns() );
4705 
      // A scalar clean-up stage is required unless both C and A are padded.
4706  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
4707 
      // With a remainder, vectorized access stops at M rounded down to a multiple of IT::size.
      // The bit mask form is cross-checked against the modulo form by the assertion below.
4708  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
4709  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
4710 
4711  size_t i( 0UL );
4712 
      // Stage 1: 8*IT::size rows of a single column per iteration.
4713  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
4714  for( size_t j=0UL; j<N; ++j )
4715  {
      // k range for column j and the row strip [i,i+IT::size*8UL).
4716  const size_t kbegin( ( IsLower<MT5>::value )
4717  ?( ( IsUpper<MT4>::value )
4718  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4719  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4720  :( IsUpper<MT4>::value ? i : 0UL ) );
4721  const size_t kend( ( IsUpper<MT5>::value )
4722  ?( ( IsLower<MT4>::value )
4723  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4724  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4725  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
4726 
      // Load the current values of C; the products are subtracted from these accumulators.
4727  IntrinsicType xmm1( (~C).load(i ,j) );
4728  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
4729  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
4730  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
4731  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
4732  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
4733  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
4734  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
4735 
4736  for( size_t k=kbegin; k<kend; ++k ) {
4737  const IntrinsicType b1( set( B(k,j) ) );
4738  xmm1 = xmm1 - A.load(i ,k) * b1;
4739  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
4740  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
4741  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
4742  xmm5 = xmm5 - A.load(i+IT::size*4UL,k) * b1;
4743  xmm6 = xmm6 - A.load(i+IT::size*5UL,k) * b1;
4744  xmm7 = xmm7 - A.load(i+IT::size*6UL,k) * b1;
4745  xmm8 = xmm8 - A.load(i+IT::size*7UL,k) * b1;
4746  }
4747 
4748  (~C).store( i , j, xmm1 );
4749  (~C).store( i+IT::size , j, xmm2 );
4750  (~C).store( i+IT::size*2UL, j, xmm3 );
4751  (~C).store( i+IT::size*3UL, j, xmm4 );
4752  (~C).store( i+IT::size*4UL, j, xmm5 );
4753  (~C).store( i+IT::size*5UL, j, xmm6 );
4754  (~C).store( i+IT::size*6UL, j, xmm7 );
4755  (~C).store( i+IT::size*7UL, j, xmm8 );
4756  }
4757  }
4758 
      // Stage 2: 4*IT::size rows of two columns per iteration.
4759  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
4760  {
4761  size_t j( 0UL );
4762 
4763  for( ; (j+2UL) <= N; j+=2UL )
4764  {
      // k range shared by columns j and j+1UL; the IsUpper<MT5> branch of kend uses j+1UL/j+2UL
      // so that the bound also covers column j+1UL.
4765  const size_t kbegin( ( IsLower<MT5>::value )
4766  ?( ( IsUpper<MT4>::value )
4767  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4768  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4769  :( IsUpper<MT4>::value ? i : 0UL ) );
4770  const size_t kend( ( IsUpper<MT5>::value )
4771  ?( ( IsLower<MT4>::value )
4772  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4773  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4774  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
4775 
      // xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1UL.
4776  IntrinsicType xmm1( (~C).load(i ,j ) );
4777  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
4778  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
4779  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
4780  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
4781  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
4782  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
4783  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
4784 
4785  for( size_t k=kbegin; k<kend; ++k ) {
      // Each part of column k of A is loaded once and used for both columns of B.
4786  const IntrinsicType a1( A.load(i ,k) );
4787  const IntrinsicType a2( A.load(i+IT::size ,k) );
4788  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
4789  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
4790  const IntrinsicType b1( set( B(k,j ) ) );
4791  const IntrinsicType b2( set( B(k,j+1UL) ) );
4792  xmm1 = xmm1 - a1 * b1;
4793  xmm2 = xmm2 - a2 * b1;
4794  xmm3 = xmm3 - a3 * b1;
4795  xmm4 = xmm4 - a4 * b1;
4796  xmm5 = xmm5 - a1 * b2;
4797  xmm6 = xmm6 - a2 * b2;
4798  xmm7 = xmm7 - a3 * b2;
4799  xmm8 = xmm8 - a4 * b2;
4800  }
4801 
4802  (~C).store( i , j , xmm1 );
4803  (~C).store( i+IT::size , j , xmm2 );
4804  (~C).store( i+IT::size*2UL, j , xmm3 );
4805  (~C).store( i+IT::size*3UL, j , xmm4 );
4806  (~C).store( i , j+1UL, xmm5 );
4807  (~C).store( i+IT::size , j+1UL, xmm6 );
4808  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
4809  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
4810  }
4811 
      // Tail of stage 2: the one column that is left over when N is odd. Note that kend is not
      // narrowed by the IsUpper<MT5> trait here.
4812  if( j < N )
4813  {
4814  const size_t kbegin( ( IsLower<MT5>::value )
4815  ?( ( IsUpper<MT4>::value )
4816  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4817  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4818  :( IsUpper<MT4>::value ? i : 0UL ) );
4819  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
4820 
4821  IntrinsicType xmm1( (~C).load(i ,j) );
4822  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
4823  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
4824  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
4825 
4826  for( size_t k=kbegin; k<kend; ++k ) {
4827  const IntrinsicType b1( set( B(k,j) ) );
4828  xmm1 = xmm1 - A.load(i ,k) * b1;
4829  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
4830  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
4831  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
4832  }
4833 
4834  (~C).store( i , j, xmm1 );
4835  (~C).store( i+IT::size , j, xmm2 );
4836  (~C).store( i+IT::size*2UL, j, xmm3 );
4837  (~C).store( i+IT::size*3UL, j, xmm4 );
4838  }
4839  }
4840 
      // Stage 3: 2*IT::size rows of two columns per iteration.
4841  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
4842  {
4843  size_t j( 0UL );
4844 
4845  for( ; (j+2UL) <= N; j+=2UL )
4846  {
4847  const size_t kbegin( ( IsLower<MT5>::value )
4848  ?( ( IsUpper<MT4>::value )
4849  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4850  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4851  :( IsUpper<MT4>::value ? i : 0UL ) );
4852  const size_t kend( ( IsUpper<MT5>::value )
4853  ?( ( IsLower<MT4>::value )
4854  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4855  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4856  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
4857 
      // xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1UL.
4858  IntrinsicType xmm1( (~C).load(i ,j ) );
4859  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
4860  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
4861  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
4862 
4863  for( size_t k=kbegin; k<kend; ++k ) {
4864  const IntrinsicType a1( A.load(i ,k) );
4865  const IntrinsicType a2( A.load(i+IT::size,k) );
4866  const IntrinsicType b1( set( B(k,j ) ) );
4867  const IntrinsicType b2( set( B(k,j+1UL) ) );
4868  xmm1 = xmm1 - a1 * b1;
4869  xmm2 = xmm2 - a2 * b1;
4870  xmm3 = xmm3 - a1 * b2;
4871  xmm4 = xmm4 - a2 * b2;
4872  }
4873 
4874  (~C).store( i , j , xmm1 );
4875  (~C).store( i+IT::size, j , xmm2 );
4876  (~C).store( i , j+1UL, xmm3 );
4877  (~C).store( i+IT::size, j+1UL, xmm4 );
4878  }
4879 
      // Tail of stage 3: the one column that is left over when N is odd.
4880  if( j < N )
4881  {
4882  const size_t kbegin( ( IsLower<MT5>::value )
4883  ?( ( IsUpper<MT4>::value )
4884  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4885  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4886  :( IsUpper<MT4>::value ? i : 0UL ) );
4887  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
4888 
4889  IntrinsicType xmm1( (~C).load(i ,j) );
4890  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
4891 
4892  for( size_t k=kbegin; k<kend; ++k ) {
4893  const IntrinsicType b1( set( B(k,j) ) );
4894  xmm1 = xmm1 - A.load(i ,k) * b1;
4895  xmm2 = xmm2 - A.load(i+IT::size,k) * b1;
4896  }
4897 
4898  (~C).store( i , j, xmm1 );
4899  (~C).store( i+IT::size, j, xmm2 );
4900  }
4901  }
4902 
      // Stage 4: IT::size rows of two columns per iteration. In this stage kend depends only on
      // the IsUpper<MT5> trait.
4903  for( ; i<ipos; i+=IT::size )
4904  {
4905  size_t j( 0UL );
4906 
4907  for( ; (j+2UL) <= N; j+=2UL )
4908  {
4909  const size_t kbegin( ( IsLower<MT5>::value )
4910  ?( ( IsUpper<MT4>::value )
4911  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4912  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4913  :( IsUpper<MT4>::value ? i : 0UL ) );
4914  const size_t kend( ( IsUpper<MT5>::value )
4915  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4916  :( K ) );
4917 
4918  IntrinsicType xmm1( (~C).load(i,j ) );
4919  IntrinsicType xmm2( (~C).load(i,j+1UL) );
4920 
4921  for( size_t k=kbegin; k<kend; ++k ) {
4922  const IntrinsicType a1( A.load(i,k) );
4923  xmm1 = xmm1 - a1 * set( B(k,j ) );
4924  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
4925  }
4926 
4927  (~C).store( i, j , xmm1 );
4928  (~C).store( i, j+1UL, xmm2 );
4929  }
4930 
      // Tail of stage 4: the one column that is left over when N is odd; k runs up to K.
4931  if( j < N )
4932  {
4933  const size_t kbegin( ( IsLower<MT5>::value )
4934  ?( ( IsUpper<MT4>::value )
4935  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4936  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4937  :( IsUpper<MT4>::value ? i : 0UL ) );
4938 
4939  IntrinsicType xmm1( (~C).load(i,j) );
4940 
4941  for( size_t k=kbegin; k<K; ++k ) {
4942  xmm1 = xmm1 - A.load(i,k) * set( B(k,j) );
4943  }
4944 
4945  (~C).store( i, j, xmm1 );
4946  }
4947  }
4948 
      // Stage 5: element-wise processing of the rows in [ipos,M). Only executed if C or A is not
      // padded; uses plain ElementType accumulators instead of load()/store().
4949  for( ; remainder && i<M; ++i )
4950  {
4951  size_t j( 0UL );
4952 
4953  for( ; (j+2UL) <= N; j+=2UL )
4954  {
4955  const size_t kbegin( ( IsLower<MT5>::value )
4956  ?( ( IsUpper<MT4>::value )
4957  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4958  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4959  :( IsUpper<MT4>::value ? i : 0UL ) );
4960  const size_t kend( ( IsUpper<MT5>::value )
4961  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4962  :( K ) );
4963 
4964  ElementType value1( (~C)(i,j ) );
4965  ElementType value2( (~C)(i,j+1UL) );
4966 
4967  for( size_t k=kbegin; k<kend; ++k ) {
4968  value1 -= A(i,k) * B(k,j );
4969  value2 -= A(i,k) * B(k,j+1UL);
4970  }
4971 
4972  (~C)(i,j ) = value1;
4973  (~C)(i,j+1UL) = value2;
4974  }
4975 
      // Tail of stage 5: the one column that is left over when N is odd.
4976  if( j < N )
4977  {
4978  const size_t kbegin( ( IsLower<MT5>::value )
4979  ?( ( IsUpper<MT4>::value )
4980  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4981  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4982  :( IsUpper<MT4>::value ? i : 0UL ) );
4983 
4984  ElementType value( (~C)(i,j) );
4985 
4986  for( size_t k=kbegin; k<K; ++k ) {
4987  value -= A(i,k) * B(k,j);
4988  }
4989 
4990  (~C)(i,j) = value;
4991  }
4992  }
4993  }
4995  //**********************************************************************************************
4996 
4997  //**Default subtraction assignment to dense matrices (large matrices)***************************
      /*!\brief Non-vectorized subtraction assignment kernel for large matrices.
      //
      // \param C The target matrix, updated in place.
      // \param A The left-hand side matrix operand.
      // \param B The right-hand side matrix operand.
      // \return void
      //
      // This overload participates only when UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold
      // (DisableIf). It performs no work of its own and forwards all three arguments unchanged to
      // selectDefaultSubAssignKernel(), whose overload set picks the matching scalar kernel.
      */
5011  template< typename MT3 // Type of the left-hand side target matrix
5012  , typename MT4 // Type of the left-hand side matrix operand
5013  , typename MT5 > // Type of the right-hand side matrix operand
5014  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
5015  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5016  {
5017  selectDefaultSubAssignKernel( C, A, B );
5018  }
5020  //**********************************************************************************************
5021 
5022  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
// Vectorized default kernel for the subtraction assignment C -= A * B to a row-major
// target matrix C (large matrices variant). The overload is enabled via EnableIf only
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// The iteration space is tiled into blocks of DMATDMATMULT_DEFAULT_JBLOCK_SIZE columns,
// DMATDMATMULT_DEFAULT_IBLOCK_SIZE rows and DMATDMATMULT_DEFAULT_KBLOCK_SIZE inner indices.
// Within a block the columns are processed in groups of 4, 2 and 1 intrinsic vectors
// (IT::size elements each); leftover columns are handled by a scalar loop if 'remainder'
// is set.
//
// C : target matrix; loaded, updated and stored in place
// A : left-hand side operand; read element-wise and converted to an IntrinsicType via set()
// B : right-hand side operand; read via load()
5037  template< typename MT3 // Type of the left-hand side target matrix
5038  , typename MT4 // Type of the left-hand side matrix operand
5039  , typename MT5 > // Type of the right-hand side matrix operand
5040  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
5041  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
5042  {
5043  typedef IntrinsicTrait<ElementType> IT;
5044 
5045  const size_t M( A.rows() );
5046  const size_t N( B.columns() );
5047  const size_t K( A.columns() );
5048 
  // A scalar clean-up loop is needed as soon as either C or B is not padded.
5049  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5050 
5051  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
5052  {
5053  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
5054 
  // End of the vectorized column range: with 'remainder' set, jend is rounded down to a
  // multiple of IT::size (the assertion below checks exactly this); otherwise it is jend.
5055  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
5056  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
5057 
5058  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
5059  {
5060  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
5061 
5062  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
5063  {
  // Upper bound of the current k-block; every kend below is capped by it.
5064  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
5065 
5066  size_t j( jj );
5067 
  // Column groups of four intrinsic vectors (j, j1, j2, j3).
5068  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
5069  {
5070  const size_t j1( j+IT::size );
5071  const size_t j2( j+IT::size*2UL );
5072  const size_t j3( j+IT::size*3UL );
5073 
5074  size_t i( ii );
5075 
  // Tile of 2 rows x 4 vectors.
5076  for( ; (i+2UL) <= iend; i+=2UL )
5077  {
  // The k range is the current k-block [kk,ktmp), narrowed further depending on the
  // IsUpper/IsLower traits of A (MT4) and B (MT5).
5078  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5079  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5080  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5081  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
5082 
5083  IntrinsicType xmm1( (~C).load(i ,j ) );
5084  IntrinsicType xmm2( (~C).load(i ,j1) );
5085  IntrinsicType xmm3( (~C).load(i ,j2) );
5086  IntrinsicType xmm4( (~C).load(i ,j3) );
5087  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
5088  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
5089  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
5090  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
5091 
5092  for( size_t k=kbegin; k<kend; ++k ) {
5093  const IntrinsicType a1( set( A(i ,k) ) );
5094  const IntrinsicType a2( set( A(i+1UL,k) ) );
5095  const IntrinsicType b1( B.load(k,j ) );
5096  const IntrinsicType b2( B.load(k,j1) );
5097  const IntrinsicType b3( B.load(k,j2) );
5098  const IntrinsicType b4( B.load(k,j3) );
5099  xmm1 = xmm1 - a1 * b1;
5100  xmm2 = xmm2 - a1 * b2;
5101  xmm3 = xmm3 - a1 * b3;
5102  xmm4 = xmm4 - a1 * b4;
5103  xmm5 = xmm5 - a2 * b1;
5104  xmm6 = xmm6 - a2 * b2;
5105  xmm7 = xmm7 - a2 * b3;
5106  xmm8 = xmm8 - a2 * b4;
5107  }
5108 
5109  (~C).store( i , j , xmm1 );
5110  (~C).store( i , j1, xmm2 );
5111  (~C).store( i , j2, xmm3 );
5112  (~C).store( i , j3, xmm4 );
5113  (~C).store( i+1UL, j , xmm5 );
5114  (~C).store( i+1UL, j1, xmm6 );
5115  (~C).store( i+1UL, j2, xmm7 );
5116  (~C).store( i+1UL, j3, xmm8 );
5117  }
5118 
  // Single leftover row of the i-block (1 row x 4 vectors).
5119  if( i < iend )
5120  {
5121  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5122  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5123  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5124  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
5125 
5126  IntrinsicType xmm1( (~C).load(i,j ) );
5127  IntrinsicType xmm2( (~C).load(i,j1) );
5128  IntrinsicType xmm3( (~C).load(i,j2) );
5129  IntrinsicType xmm4( (~C).load(i,j3) );
5130 
5131  for( size_t k=kbegin; k<kend; ++k ) {
5132  const IntrinsicType a1( set( A(i,k) ) );
5133  xmm1 = xmm1 - a1 * B.load(k,j );
5134  xmm2 = xmm2 - a1 * B.load(k,j1);
5135  xmm3 = xmm3 - a1 * B.load(k,j2);
5136  xmm4 = xmm4 - a1 * B.load(k,j3);
5137  }
5138 
5139  (~C).store( i, j , xmm1 );
5140  (~C).store( i, j1, xmm2 );
5141  (~C).store( i, j2, xmm3 );
5142  (~C).store( i, j3, xmm4 );
5143  }
5144  }
5145 
  // Column groups of two intrinsic vectors (j, j1).
5146  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
5147  {
5148  const size_t j1( j+IT::size );
5149 
5150  size_t i( ii );
5151 
  // Tile of 4 rows x 2 vectors.
5152  for( ; (i+4UL) <= iend; i+=4UL )
5153  {
5154  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5155  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5156  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5157  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
5158 
5159  IntrinsicType xmm1( (~C).load(i ,j ) );
5160  IntrinsicType xmm2( (~C).load(i ,j1) );
5161  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
5162  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
5163  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
5164  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
5165  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
5166  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
5167 
5168  for( size_t k=kbegin; k<kend; ++k ) {
5169  const IntrinsicType a1( set( A(i ,k) ) );
5170  const IntrinsicType a2( set( A(i+1UL,k) ) );
5171  const IntrinsicType a3( set( A(i+2UL,k) ) );
5172  const IntrinsicType a4( set( A(i+3UL,k) ) );
5173  const IntrinsicType b1( B.load(k,j ) );
5174  const IntrinsicType b2( B.load(k,j1) );
5175  xmm1 = xmm1 - a1 * b1;
5176  xmm2 = xmm2 - a1 * b2;
5177  xmm3 = xmm3 - a2 * b1;
5178  xmm4 = xmm4 - a2 * b2;
5179  xmm5 = xmm5 - a3 * b1;
5180  xmm6 = xmm6 - a3 * b2;
5181  xmm7 = xmm7 - a4 * b1;
5182  xmm8 = xmm8 - a4 * b2;
5183  }
5184 
5185  (~C).store( i , j , xmm1 );
5186  (~C).store( i , j1, xmm2 );
5187  (~C).store( i+1UL, j , xmm3 );
5188  (~C).store( i+1UL, j1, xmm4 );
5189  (~C).store( i+2UL, j , xmm5 );
5190  (~C).store( i+2UL, j1, xmm6 );
5191  (~C).store( i+3UL, j , xmm7 );
5192  (~C).store( i+3UL, j1, xmm8 );
5193  }
5194 
  // Tile of 2 rows x 2 vectors.
5195  for( ; (i+2UL) <= iend; i+=2UL )
5196  {
5197  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5198  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5199  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5200  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
5201 
5202  IntrinsicType xmm1( (~C).load(i ,j ) );
5203  IntrinsicType xmm2( (~C).load(i ,j1) );
5204  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
5205  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
5206 
5207  for( size_t k=kbegin; k<kend; ++k ) {
5208  const IntrinsicType a1( set( A(i ,k) ) );
5209  const IntrinsicType a2( set( A(i+1UL,k) ) );
5210  const IntrinsicType b1( B.load(k,j ) );
5211  const IntrinsicType b2( B.load(k,j1) );
5212  xmm1 = xmm1 - a1 * b1;
5213  xmm2 = xmm2 - a1 * b2;
5214  xmm3 = xmm3 - a2 * b1;
5215  xmm4 = xmm4 - a2 * b2;
5216  }
5217 
5218  (~C).store( i , j , xmm1 );
5219  (~C).store( i , j1, xmm2 );
5220  (~C).store( i+1UL, j , xmm3 );
5221  (~C).store( i+1UL, j1, xmm4 );
5222  }
5223 
  // Single leftover row of the i-block (1 row x 2 vectors).
5224  if( i < iend )
5225  {
5226  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5227  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5228  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5229  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
5230 
5231  IntrinsicType xmm1( (~C).load(i,j ) );
5232  IntrinsicType xmm2( (~C).load(i,j1) );
5233 
5234  for( size_t k=kbegin; k<kend; ++k ) {
5235  const IntrinsicType a1( set( A(i,k) ) );
5236  xmm1 = xmm1 - a1 * B.load(k,j );
5237  xmm2 = xmm2 - a1 * B.load(k,j1);
5238  }
5239 
5240  (~C).store( i, j , xmm1 );
5241  (~C).store( i, j1, xmm2 );
5242  }
5243  }
5244 
  // Remaining single intrinsic vectors, one row at a time.
5245  for( ; j<jpos; j+=IT::size )
5246  {
5247  for( size_t i=ii; i<iend; ++i )
5248  {
5249  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5250  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5251  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5252  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
5253 
5254  IntrinsicType xmm1( (~C).load(i,j) );
5255 
5256  for( size_t k=kbegin; k<kend; ++k ) {
5257  const IntrinsicType a1( set( A(i,k) ) );
5258  xmm1 = xmm1 - a1 * B.load(k,j);
5259  }
5260 
5261  (~C).store( i, j, xmm1 );
5262  }
5263  }
5264 
  // Scalar clean-up for the columns in [jpos,jend); only executed if 'remainder' is set.
5265  for( ; remainder && j<jend; ++j )
5266  {
5267  for( size_t i=ii; i<iend; ++i )
5268  {
5269  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5270  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5271  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5272  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
5273 
5274  ElementType value( (~C)(i,j) );
5275 
5276  for( size_t k=kbegin; k<kend; ++k ) {
5277  value -= A(i,k) * B(k,j);
5278  }
5279 
5280  (~C)(i,j) = value;
5281  }
5282  }
5283  }
5284  }
5285  }
5286  }
5288  //**********************************************************************************************
5289 
5290  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
// Vectorized default kernel for the subtraction assignment C -= A * B to a column-major
// target matrix C (large matrices variant). The overload is enabled via EnableIf only
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// The iteration space is tiled into blocks of TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE rows,
// TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE columns and TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE inner
// indices. Within a block the rows are processed in groups of 4, 2 and 1 intrinsic vectors
// (IT::size elements each); leftover rows are handled by a scalar loop if 'remainder' is set.
//
// C : target matrix; loaded, updated and stored in place
// A : left-hand side operand; read via load()
// B : right-hand side operand; read element-wise and converted to an IntrinsicType via set()
5305  template< typename MT3 // Type of the left-hand side target matrix
5306  , typename MT4 // Type of the left-hand side matrix operand
5307  , typename MT5 > // Type of the right-hand side matrix operand
5308  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
5309  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
5310  {
5311  typedef IntrinsicTrait<ElementType> IT;
5312 
5313  const size_t M( A.rows() );
5314  const size_t N( B.columns() );
5315  const size_t K( A.columns() );
5316 
  // NOTE(review): the flag tests the padding of C (MT3) and B (MT5), exactly as in the
  // row-major kernel, although this kernel issues its vector loads on C and A - confirm
  // that MT5 (rather than MT4) is intended here.
5317  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5318 
5319  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
5320  {
5321  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
5322 
  // End of the vectorized row range: with 'remainder' set, iend is rounded down to a
  // multiple of IT::size (the assertion below checks exactly this); otherwise it is iend.
5323  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
5324  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
5325 
5326  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
5327  {
5328  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
5329 
5330  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
5331  {
  // Upper bound of the current k-block; every kend below is capped by it.
5332  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
5333 
5334  size_t i( ii );
5335 
  // Row groups of four intrinsic vectors (i, i1, i2, i3).
5336  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
5337  {
5338  const size_t i1( i+IT::size );
5339  const size_t i2( i+IT::size*2UL );
5340  const size_t i3( i+IT::size*3UL );
5341 
5342  size_t j( jj );
5343 
  // Tile of 4 vectors x 2 columns.
5344  for( ; (j+2UL) <= jend; j+=2UL )
5345  {
  // The k range is the current k-block [kk,ktmp), narrowed further depending on the
  // IsUpper/IsLower traits of A (MT4) and B (MT5).
5346  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5347  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5348  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
5349  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5350 
5351  IntrinsicType xmm1( (~C).load(i ,j ) );
5352  IntrinsicType xmm2( (~C).load(i1,j ) );
5353  IntrinsicType xmm3( (~C).load(i2,j ) );
5354  IntrinsicType xmm4( (~C).load(i3,j ) );
5355  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
5356  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
5357  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
5358  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
5359 
5360  for( size_t k=kbegin; k<kend; ++k ) {
5361  const IntrinsicType a1( A.load(i ,k) );
5362  const IntrinsicType a2( A.load(i1,k) );
5363  const IntrinsicType a3( A.load(i2,k) );
5364  const IntrinsicType a4( A.load(i3,k) );
5365  const IntrinsicType b1( set( B(k,j ) ) );
5366  const IntrinsicType b2( set( B(k,j+1UL) ) );
5367  xmm1 = xmm1 - a1 * b1;
5368  xmm2 = xmm2 - a2 * b1;
5369  xmm3 = xmm3 - a3 * b1;
5370  xmm4 = xmm4 - a4 * b1;
5371  xmm5 = xmm5 - a1 * b2;
5372  xmm6 = xmm6 - a2 * b2;
5373  xmm7 = xmm7 - a3 * b2;
5374  xmm8 = xmm8 - a4 * b2;
5375  }
5376 
5377  (~C).store( i , j , xmm1 );
5378  (~C).store( i1, j , xmm2 );
5379  (~C).store( i2, j , xmm3 );
5380  (~C).store( i3, j , xmm4 );
5381  (~C).store( i , j+1UL, xmm5 );
5382  (~C).store( i1, j+1UL, xmm6 );
5383  (~C).store( i2, j+1UL, xmm7 );
5384  (~C).store( i3, j+1UL, xmm8 );
5385  }
5386 
  // Single leftover column of the j-block (4 vectors x 1 column).
5387  if( j < jend )
5388  {
5389  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5390  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5391  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
5392  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5393 
5394  IntrinsicType xmm1( (~C).load(i ,j) );
5395  IntrinsicType xmm2( (~C).load(i1,j) );
5396  IntrinsicType xmm3( (~C).load(i2,j) );
5397  IntrinsicType xmm4( (~C).load(i3,j) );
5398 
5399  for( size_t k=kbegin; k<kend; ++k ) {
5400  const IntrinsicType b1( set( B(k,j) ) );
5401  xmm1 = xmm1 - A.load(i ,k) * b1;
5402  xmm2 = xmm2 - A.load(i1,k) * b1;
5403  xmm3 = xmm3 - A.load(i2,k) * b1;
5404  xmm4 = xmm4 - A.load(i3,k) * b1;
5405  }
5406 
5407  (~C).store( i , j, xmm1 );
5408  (~C).store( i1, j, xmm2 );
5409  (~C).store( i2, j, xmm3 );
5410  (~C).store( i3, j, xmm4 );
5411  }
5412  }
5413 
  // Row groups of two intrinsic vectors (i, i1).
5414  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
5415  {
5416  const size_t i1( i+IT::size );
5417 
5418  size_t j( jj );
5419 
  // Tile of 2 vectors x 4 columns.
5420  for( ; (j+4UL) <= jend; j+=4UL )
5421  {
5422  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5423  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5424  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
5425  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5426 
5427  IntrinsicType xmm1( (~C).load(i ,j ) );
5428  IntrinsicType xmm2( (~C).load(i1,j ) );
5429  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
5430  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
5431  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
5432  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
5433  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
5434  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
5435 
5436  for( size_t k=kbegin; k<kend; ++k ) {
5437  const IntrinsicType a1( A.load(i ,k) );
5438  const IntrinsicType a2( A.load(i1,k) );
5439  const IntrinsicType b1( set( B(k,j ) ) );
5440  const IntrinsicType b2( set( B(k,j+1UL) ) );
5441  const IntrinsicType b3( set( B(k,j+2UL) ) );
5442  const IntrinsicType b4( set( B(k,j+3UL) ) );
5443  xmm1 = xmm1 - a1 * b1;
5444  xmm2 = xmm2 - a2 * b1;
5445  xmm3 = xmm3 - a1 * b2;
5446  xmm4 = xmm4 - a2 * b2;
5447  xmm5 = xmm5 - a1 * b3;
5448  xmm6 = xmm6 - a2 * b3;
5449  xmm7 = xmm7 - a1 * b4;
5450  xmm8 = xmm8 - a2 * b4;
5451  }
5452 
5453  (~C).store( i , j , xmm1 );
5454  (~C).store( i1, j , xmm2 );
5455  (~C).store( i , j+1UL, xmm3 );
5456  (~C).store( i1, j+1UL, xmm4 );
5457  (~C).store( i , j+2UL, xmm5 );
5458  (~C).store( i1, j+2UL, xmm6 );
5459  (~C).store( i , j+3UL, xmm7 );
5460  (~C).store( i1, j+3UL, xmm8 );
5461  }
5462 
  // Tile of 2 vectors x 2 columns.
5463  for( ; (j+2UL) <= jend; j+=2UL )
5464  {
5465  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5466  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5467  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
5468  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5469 
5470  IntrinsicType xmm1( (~C).load(i ,j ) );
5471  IntrinsicType xmm2( (~C).load(i1,j ) );
5472  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
5473  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
5474 
5475  for( size_t k=kbegin; k<kend; ++k ) {
5476  const IntrinsicType a1( A.load(i ,k) );
5477  const IntrinsicType a2( A.load(i1,k) );
5478  const IntrinsicType b1( set( B(k,j ) ) );
5479  const IntrinsicType b2( set( B(k,j+1UL) ) );
5480  xmm1 = xmm1 - a1 * b1;
5481  xmm2 = xmm2 - a2 * b1;
5482  xmm3 = xmm3 - a1 * b2;
5483  xmm4 = xmm4 - a2 * b2;
5484  }
5485 
5486  (~C).store( i , j , xmm1 );
5487  (~C).store( i1, j , xmm2 );
5488  (~C).store( i , j+1UL, xmm3 );
5489  (~C).store( i1, j+1UL, xmm4 );
5490  }
5491 
  // Single leftover column of the j-block (2 vectors x 1 column).
5492  if( j < jend )
5493  {
5494  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5495  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5496  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
5497  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5498 
5499  IntrinsicType xmm1( (~C).load(i ,j) );
5500  IntrinsicType xmm2( (~C).load(i1,j) );
5501 
5502  for( size_t k=kbegin; k<kend; ++k ) {
5503  const IntrinsicType b1( set( B(k,j) ) );
5504  xmm1 = xmm1 - A.load(i ,k) * b1;
5505  xmm2 = xmm2 - A.load(i1,k) * b1;
5506  }
5507 
5508  (~C).store( i , j, xmm1 );
5509  (~C).store( i1, j, xmm2 );
5510  }
5511  }
5512 
  // Remaining single intrinsic vectors, one column at a time.
5513  for( ; i<ipos; i+=IT::size )
5514  {
5515  for( size_t j=jj; j<jend; ++j )
5516  {
5517  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5518  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5519  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
5520  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5521 
5522  IntrinsicType xmm1( (~C).load(i,j) );
5523 
5524  for( size_t k=kbegin; k<kend; ++k ) {
5525  const IntrinsicType b1( set( B(k,j) ) );
5526  xmm1 = xmm1 - A.load(i,k) * b1;
5527  }
5528 
5529  (~C).store( i, j, xmm1 );
5530  }
5531  }
5532 
  // Scalar clean-up for the rows in [ipos,iend); only executed if 'remainder' is set.
5533  for( ; remainder && i<iend; ++i )
5534  {
5535  for( size_t j=jj; j<jend; ++j )
5536  {
5537  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5538  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5539  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
5540  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5541 
5542  ElementType value( (~C)(i,j) );
5543 
5544  for( size_t k=kbegin; k<kend; ++k ) {
5545  value -= A(i,k) * B(k,j);
5546  }
5547 
5548  (~C)(i,j) = value;
5549  }
5550  }
5551  }
5552  }
5553  }
5554  }
5556  //**********************************************************************************************
5557 
5558  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
5572  template< typename MT3 // Type of the left-hand side target matrix
5573  , typename MT4 // Type of the left-hand side matrix operand
5574  , typename MT5 > // Type of the right-hand side matrix operand
5575  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
5576  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5577  {
5578  selectLargeSubAssignKernel( C, A, B );
5579  }
5581  //**********************************************************************************************
5582 
5583  //**BLAS-based subtraction assignment to dense matrices*****************************************
5584 #if BLAZE_BLAS_MODE
5585 
5598  template< typename MT3 // Type of the left-hand side target matrix
5599  , typename MT4 // Type of the left-hand side matrix operand
5600  , typename MT5 > // Type of the right-hand side matrix operand
5601  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
5602  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5603  {
5604  typedef typename MT3::ElementType ET;
5605 
5606  if( IsTriangular<MT4>::value ) {
5607  typename MT3::ResultType tmp( serial( B ) );
5608  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5609  subAssign( C, tmp );
5610  }
5611  else if( IsTriangular<MT5>::value ) {
5612  typename MT3::ResultType tmp( serial( A ) );
5613  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5614  subAssign( C, tmp );
5615  }
5616  else {
5617  gemm( C, A, B, ET(-1), ET(1) );
5618  }
5619  }
5621 #endif
5622  //**********************************************************************************************
5623 
5624  //**Subtraction assignment to sparse matrices***************************************************
5625  // No special implementation for the subtraction assignment to sparse matrices.
5626  //**********************************************************************************************
5627 
5628  //**Multiplication assignment to dense matrices*************************************************
5629  // No special implementation for the multiplication assignment to dense matrices.
5630  //**********************************************************************************************
5631 
5632  //**Multiplication assignment to sparse matrices************************************************
5633  // No special implementation for the multiplication assignment to sparse matrices.
5634  //**********************************************************************************************
5635 
5636  //**SMP assignment to dense matrices************************************************************
5652  template< typename MT // Type of the target dense matrix
5653  , bool SO > // Storage order of the target dense matrix
5654  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5655  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5656  {
5658 
5659  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5660  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5661 
5662  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
5663  return;
5664  }
5665  else if( rhs.lhs_.columns() == 0UL ) {
5666  reset( ~lhs );
5667  return;
5668  }
5669 
5670  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5671  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5672 
5673  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5674  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5675  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5676  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5677  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5678  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5679 
5680  smpAssign( ~lhs, A * B );
5681  }
5683  //**********************************************************************************************
5684 
5685  //**SMP assignment to sparse matrices***********************************************************
5701  template< typename MT // Type of the target sparse matrix
5702  , bool SO > // Storage order of the target sparse matrix
5703  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5704  smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5705  {
5707 
5708  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
5709 
5716 
5717  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5718  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5719 
5720  const TmpType tmp( rhs );
5721  smpAssign( ~lhs, tmp );
5722  }
5724  //**********************************************************************************************
5725 
5726  //**SMP addition assignment to dense matrices***************************************************
5742  template< typename MT // Type of the target dense matrix
5743  , bool SO > // Storage order of the target dense matrix
5744  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5745  smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5746  {
5748 
5749  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5750  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5751 
5752  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5753  return;
5754  }
5755 
5756  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5757  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5758 
5759  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5760  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5761  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5762  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5763  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5764  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5765 
5766  smpAddAssign( ~lhs, A * B );
5767  }
5769  //**********************************************************************************************
5770 
5771  //**SMP addition assignment to sparse matrices**************************************************
5772  // No special implementation for the SMP addition assignment to sparse matrices.
5773  //**********************************************************************************************
5774 
5775  //**SMP subtraction assignment to dense matrices************************************************
5791  template< typename MT // Type of the target dense matrix
5792  , bool SO > // Storage order of the target dense matrix
5793  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5794  smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5795  {
5797 
5798  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5799  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5800 
5801  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5802  return;
5803  }
5804 
5805  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5806  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5807 
5808  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5809  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5810  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5811  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5812  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5813  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5814 
5815  smpSubAssign( ~lhs, A * B );
5816  }
5818  //**********************************************************************************************
5819 
5820  //**SMP subtraction assignment to sparse matrices***********************************************
5821  // No special implementation for the SMP subtraction assignment to sparse matrices.
5822  //**********************************************************************************************
5823 
5824  //**SMP multiplication assignment to dense matrices*********************************************
5825  // No special implementation for the SMP multiplication assignment to dense matrices.
5826  //**********************************************************************************************
5827 
5828  //**SMP multiplication assignment to sparse matrices********************************************
5829  // No special implementation for the SMP multiplication assignment to sparse matrices.
5830  //**********************************************************************************************
5831 
5832  //**Compile time checks*************************************************************************
5840  //**********************************************************************************************
5841 };
5842 //*************************************************************************************************
5843 
5844 
5845 
5846 
5847 //=================================================================================================
5848 //
5849 // DMATSCALARMULTEXPR SPECIALIZATION
5850 //
5851 //=================================================================================================
5852 
5853 //*************************************************************************************************
5861 template< typename MT1 // Type of the left-hand side dense matrix
5862  , typename MT2 // Type of the right-hand side dense matrix
5863  , typename ST > // Type of the right-hand side scalar value
5864 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >
5865  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
5866  , private MatScalarMultExpr
5867  , private Computation
5868 {
5869  private:
5870  //**Type definitions****************************************************************************
// Private shorthand types of the scaled multiplication expression.
// MMM : the wrapped transpose dense matrix-dense matrix multiplication expression
5871  typedef TDMatDMatMultExpr<MT1,MT2> MMM;
// RES : result type of the wrapped multiplication expression
5872  typedef typename MMM::ResultType RES;
// RT1 / RT2 : result types of the left-hand / right-hand side matrix operand
5873  typedef typename MT1::ResultType RT1;
5874  typedef typename MT2::ResultType RT2;
// ET1 / ET2 : element types of RT1 / RT2
5875  typedef typename RT1::ElementType ET1;
5876  typedef typename RT2::ElementType ET2;
// CT1 / CT2 : composite types of the left-hand / right-hand side matrix operand
5877  typedef typename MT1::CompositeType CT1;
5878  typedef typename MT2::CompositeType CT2;
5879  //**********************************************************************************************
5880 
5881  //**********************************************************************************************
// Compile time flag: set if the left-hand side operand type MT1 is a computation
// (IsComputation) or requires an evaluation (RequiresEvaluation).
5883  enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
5884  //**********************************************************************************************
5885 
5886  //**********************************************************************************************
// Compile time flag: set if the right-hand side operand type MT2 is a computation
// (IsComputation) or requires an evaluation (RequiresEvaluation).
5888  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
5889  //**********************************************************************************************
5890 
5891  //**********************************************************************************************
5893 
5896  template< typename T1, typename T2, typename T3 >
5897  struct IsEvaluationRequired {
5898  enum { value = ( evaluateLeft || evaluateRight ) };
5899  };
5900  //**********************************************************************************************
5901 
5902  //**********************************************************************************************
5904 
5906  template< typename T1, typename T2, typename T3, typename T4 >
5907  struct UseBlasKernel {
5908  enum { value = BLAZE_BLAS_MODE &&
5909  HasMutableDataAccess<T1>::value &&
5910  HasConstDataAccess<T2>::value &&
5911  HasConstDataAccess<T3>::value &&
5912  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5913  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5914  IsBlasCompatible<typename T1::ElementType>::value &&
5915  IsBlasCompatible<typename T2::ElementType>::value &&
5916  IsBlasCompatible<typename T3::ElementType>::value &&
5917  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
5918  IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
5919  !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
5920  };
5921  //**********************************************************************************************
5922 
5923  //**********************************************************************************************
5925 
5927  template< typename T1, typename T2, typename T3, typename T4 >
5928  struct UseVectorizedDefaultKernel {
5929  enum { value = useOptimizedKernels &&
5930  !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
5931  !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
5932  !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
5933  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5934  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
5935  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
5936  IsSame<typename T1::ElementType,T4>::value &&
5937  IntrinsicTrait<typename T1::ElementType>::addition &&
5938  IntrinsicTrait<typename T1::ElementType>::subtraction &&
5939  IntrinsicTrait<typename T1::ElementType>::multiplication };
5940  };
5941  //**********************************************************************************************
5942 
5943  public:
5944  //**Type definitions****************************************************************************
// Public types of the scaled multiplication expression.
// This : type of this DMatScalarMultExpr specialization
5945  typedef DMatScalarMultExpr<MMM,ST,true> This;
// ResultType : MultTrait result of the multiplication result RES and the scalar type ST;
// the opposite, transpose and element types are taken from it
5946  typedef typename MultTrait<RES,ST>::Type ResultType;
5947  typedef typename ResultType::OppositeType OppositeType;
5948  typedef typename ResultType::TransposeType TransposeType;
5949  typedef typename ResultType::ElementType ElementType;
// IntrinsicType : intrinsic type associated with ElementType via IntrinsicTrait
5950  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType;
// ReturnType : a single (const) element; CompositeType : the (const) result type
5951  typedef const ElementType ReturnType;
5952  typedef const ResultType CompositeType;
5953 
// LeftOperand : the wrapped matrix multiplication expression
5955  typedef const TDMatDMatMultExpr<MT1,MT2> LeftOperand;
5956 
// RightOperand : the scalar
5958  typedef ST RightOperand;
5959 
// LT : const RT1 if the left operand has to be evaluated (evaluateLeft), CT1 otherwise
5961  typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type LT;
5962 
// RT : const RT2 if the right operand has to be evaluated (evaluateRight), CT2 otherwise
5964  typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type RT;
5965  //**********************************************************************************************
5966 
5967  //**Compilation flags***************************************************************************
// Compilation flag 'vectorizable': set only if the two operands are not both diagonal,
// both operand types are vectorizable, both element types and the scalar type ST are
// identical, and IntrinsicTrait reports addition and multiplication for that element type.
5969  enum { vectorizable = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
5970  MT1::vectorizable && MT2::vectorizable &&
5971  IsSame<ET1,ET2>::value &&
5972  IsSame<ET1,ST>::value &&
5973  IntrinsicTrait<ET1>::addition &&
5974  IntrinsicTrait<ET1>::multiplication };
5975 
// Compilation flag 'smpAssignable': set only if neither operand has to be evaluated
// (evaluateLeft / evaluateRight) and both operand types are themselves SMP assignable.
5977  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
5978  !evaluateRight && MT2::smpAssignable };
5979  //**********************************************************************************************
5980 
5981  //**Constructor*********************************************************************************
5987  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
5988  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
5989  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
5990  {}
5991  //**********************************************************************************************
5992 
5993  //**Access operator*****************************************************************************
6000  inline ResultType operator()( size_t i, size_t j ) const {
6001  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
6002  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
6003  return matrix_(i,j) * scalar_;
6004  }
6005  //**********************************************************************************************
6006 
6007  //**At function*********************************************************************************
6015  inline ReturnType at( size_t i, size_t j ) const {
6016  if( i >= matrix_.rows() ) {
6017  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
6018  }
6019  if( j >= matrix_.columns() ) {
6020  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
6021  }
6022  return (*this)(i,j);
6023  }
6024  //**********************************************************************************************
6025 
6026  //**Rows function*******************************************************************************
6031  inline size_t rows() const {
6032  return matrix_.rows();
6033  }
6034  //**********************************************************************************************
6035 
6036  //**Columns function****************************************************************************
6041  inline size_t columns() const {
6042  return matrix_.columns();
6043  }
6044  //**********************************************************************************************
6045 
6046  //**Left operand access*************************************************************************
6051  inline LeftOperand leftOperand() const {
6052  return matrix_;
6053  }
6054  //**********************************************************************************************
6055 
6056  //**Right operand access************************************************************************
6061  inline RightOperand rightOperand() const {
6062  return scalar_;
6063  }
6064  //**********************************************************************************************
6065 
6066  //**********************************************************************************************
6072  template< typename T >
6073  inline bool canAlias( const T* alias ) const {
6074  return matrix_.canAlias( alias );
6075  }
6076  //**********************************************************************************************
6077 
6078  //**********************************************************************************************
6084  template< typename T >
6085  inline bool isAliased( const T* alias ) const {
6086  return matrix_.isAliased( alias );
6087  }
6088  //**********************************************************************************************
6089 
6090  //**********************************************************************************************
6095  inline bool isAligned() const {
6096  return matrix_.isAligned();
6097  }
6098  //**********************************************************************************************
6099 
6100  //**********************************************************************************************
6105  inline bool canSMPAssign() const {
6106  typename MMM::RightOperand B( matrix_.rightOperand() );
6107  return ( !BLAZE_BLAS_IS_PARALLEL ||
6108  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
6109  ( B.columns() > SMP_TDMATDMATMULT_THRESHOLD );
6110  }
6111  //**********************************************************************************************
6112 
6113  private:
6114  //**Member variables****************************************************************************
// Left-hand side operand: the matrix multiplication expression that is scaled.
6115  LeftOperand matrix_;
// Right-hand side operand: the scalar the product is multiplied with.
6116  RightOperand scalar_;
6117  //**********************************************************************************************
6118 
6119  //**Assignment to dense matrices****************************************************************
6131  template< typename MT // Type of the target dense matrix
6132  , bool SO > // Storage order of the target dense matrix
6133  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6134  {
6136 
6137  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6138  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6139 
6140  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6141  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
6142 
6143  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
6144  return;
6145  }
6146  else if( left.columns() == 0UL ) {
6147  reset( ~lhs );
6148  return;
6149  }
6150 
6151  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6152  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6153 
6154  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6155  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6156  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6157  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6158  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6159  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6160 
6161  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
6162  }
6163  //**********************************************************************************************
6164 
6165  //**Assignment to dense matrices (kernel selection)*********************************************
6176  template< typename MT3 // Type of the left-hand side target matrix
6177  , typename MT4 // Type of the left-hand side matrix operand
6178  , typename MT5 // Type of the right-hand side matrix operand
6179  , typename ST2 > // Type of the scalar value
6180  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6181  {
6182  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
6183  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
6184  selectSmallAssignKernel( C, A, B, scalar );
6185  else
6186  selectBlasAssignKernel( C, A, B, scalar );
6187  }
6188  //**********************************************************************************************
6189 
6190  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (non-vectorized) kernel computing C = ( A * B ) * scalar for a row-major target
// matrix C. This overload is enabled only if neither A nor B is diagonal (see EnableIf).
//
// Parameters:
//   C       target dense matrix (row-major)
//   A       left-hand side multiplication operand
//   B       right-hand side multiplication operand
//   scalar  scaling factor applied to the product
//
// The result is built row by row. For each row i the first contributing k initializes the
// row by assignment, the remaining k are accumulated, and the row is scaled at the end. The
// compile-time traits IsUpper/IsLower/IsStrictly* of A (MT4) and B (MT5) narrow the loop
// ranges and decide which elements are reset instead of computed.
6204  template< typename MT3 // Type of the left-hand side target matrix
6205  , typename MT4 // Type of the left-hand side matrix operand
6206  , typename MT5 // Type of the right-hand side matrix operand
6207  , typename ST2 > // Type of the scalar value
6208  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6209  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6210  {
6211  const size_t M( A.rows() );
6212  const size_t N( B.columns() );
6213  const size_t K( A.columns() );
6214 
6215  for( size_t i=0UL; i<M; ++i )
6216  {
// [kbegin,kend) is the range of k used for row i; it is narrowed when A is upper and/or lower.
6217  const size_t kbegin( ( IsUpper<MT4>::value )
6218  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6219  :( 0UL ) );
6220  const size_t kend( ( IsLower<MT4>::value )
6221  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
6222  :( K ) );
6223  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
6224 
// Strictly triangular A with an empty k range: the whole row i of the result is reset.
6225  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
6226  for( size_t j=0UL; j<N; ++j ) {
6227  reset( (~C)(i,j) );
6228  }
6229  continue;
6230  }
6231 
// Initialization pass for k == kbegin: elements are assigned (not accumulated), and elements
// outside the column range [jbegin,jend) derived from B's structure are reset where required.
6232  {
6233  const size_t jbegin( ( IsUpper<MT5>::value )
6234  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
6235  :( 0UL ) );
6236  const size_t jend( ( IsLower<MT5>::value )
6237  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
6238  :( N ) );
6239  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6240 
6241  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6242  for( size_t j=0UL; j<jbegin; ++j ) {
6243  reset( (~C)(i,j) );
6244  }
6245  }
// B strictly upper (and not both operands upper): only the first column is reset here.
6246  else if( IsStrictlyUpper<MT5>::value ) {
6247  reset( (~C)(i,0UL) );
6248  }
6249  for( size_t j=jbegin; j<jend; ++j ) {
6250  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6251  }
6252  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6253  for( size_t j=jend; j<N; ++j ) {
6254  reset( (~C)(i,j) );
6255  }
6256  }
// B strictly lower (and not both operands lower): only the last column is reset here.
6257  else if( IsStrictlyLower<MT5>::value ) {
6258  reset( (~C)(i,N-1UL) );
6259  }
6260  }
6261 
// Accumulation pass for the remaining k of row i.
6262  for( size_t k=kbegin+1UL; k<kend; ++k )
6263  {
6264  const size_t jbegin( ( IsUpper<MT5>::value )
6265  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
6266  :( 0UL ) );
6267  const size_t jend( ( IsLower<MT5>::value )
6268  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
6269  :( N ) );
6270  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6271 
6272  for( size_t j=jbegin; j<jend; ++j ) {
6273  (~C)(i,j) += A(i,k) * B(k,j);
6274  }
// For lower B the element in column jend is assigned rather than accumulated
// (presumably because it receives its first contribution at this k - confirm).
6275  if( IsLower<MT5>::value ) {
6276  (~C)(i,jend) = A(i,k) * B(k,jend);
6277  }
6278  }
6279 
// Scaling pass: multiply the elements of row i by the scalar. The column range is narrowed
// only if both operands are upper (lower bound) or both are lower (upper bound).
6280  {
6281  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6282  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
6283  :( 0UL ) );
6284  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
6285  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
6286  :( N ) );
6287  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6288 
6289  for( size_t j=jbegin; j<jend; ++j ) {
6290  (~C)(i,j) *= scalar;
6291  }
6292  }
6293  }
6294  }
6295  //**********************************************************************************************
6296 
6297  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (non-vectorized) kernel computing C = ( A * B ) * scalar for a column-major target
// matrix C. This overload is enabled only if neither A nor B is diagonal (see EnableIf).
//
// Parameters:
//   C       target dense matrix (column-major)
//   A       left-hand side multiplication operand
//   B       right-hand side multiplication operand
//   scalar  scaling factor applied to the product
//
// The result is built column by column. For each column j the first contributing k
// initializes the column by assignment, the remaining k are accumulated, and the column is
// scaled at the end. The compile-time traits IsUpper/IsLower/IsStrictly* of A (MT4) and
// B (MT5) narrow the loop ranges and decide which elements are reset instead of computed.
6311  template< typename MT3 // Type of the left-hand side target matrix
6312  , typename MT4 // Type of the left-hand side matrix operand
6313  , typename MT5 // Type of the right-hand side matrix operand
6314  , typename ST2 > // Type of the scalar value
6315  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6316  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6317  {
6318  const size_t M( A.rows() );
6319  const size_t N( B.columns() );
6320  const size_t K( A.columns() );
6321 
6322  for( size_t j=0UL; j<N; ++j )
6323  {
// [kbegin,kend) is the range of k used for column j; it is narrowed when B is lower and/or upper.
6324  const size_t kbegin( ( IsLower<MT5>::value )
6325  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6326  :( 0UL ) );
6327  const size_t kend( ( IsUpper<MT5>::value )
6328  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
6329  :( K ) );
6330  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
6331 
// Strictly triangular B with an empty k range: the whole column j of the result is reset.
6332  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
6333  for( size_t i=0UL; i<M; ++i ) {
6334  reset( (~C)(i,j) );
6335  }
6336  continue;
6337  }
6338 
// Initialization pass for k == kbegin: elements are assigned (not accumulated), and elements
// outside the row range [ibegin,iend) derived from A's structure are reset where required.
6339  {
6340  const size_t ibegin( ( IsLower<MT4>::value )
6341  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
6342  :( 0UL ) );
6343  const size_t iend( ( IsUpper<MT4>::value )
6344  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
6345  :( M ) );
6346  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6347 
6348  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6349  for( size_t i=0UL; i<ibegin; ++i ) {
6350  reset( (~C)(i,j) );
6351  }
6352  }
// A strictly lower (and not both operands lower): only the first row is reset here.
6353  else if( IsStrictlyLower<MT4>::value ) {
6354  reset( (~C)(0UL,j) );
6355  }
6356  for( size_t i=ibegin; i<iend; ++i ) {
6357  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6358  }
6359  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6360  for( size_t i=iend; i<M; ++i ) {
6361  reset( (~C)(i,j) );
6362  }
6363  }
// A strictly upper (and not both operands upper): only the last row is reset here.
6364  else if( IsStrictlyUpper<MT4>::value ) {
6365  reset( (~C)(M-1UL,j) );
6366  }
6367  }
6368 
// Accumulation pass for the remaining k of column j.
6369  for( size_t k=kbegin+1UL; k<kend; ++k )
6370  {
6371  const size_t ibegin( ( IsLower<MT4>::value )
6372  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
6373  :( 0UL ) );
6374  const size_t iend( ( IsUpper<MT4>::value )
6375  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
6376  :( M ) );
6377  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6378 
6379  for( size_t i=ibegin; i<iend; ++i ) {
6380  (~C)(i,j) += A(i,k) * B(k,j);
6381  }
// For upper A the element in row iend is assigned rather than accumulated
// (presumably because it receives its first contribution at this k - confirm).
6382  if( IsUpper<MT4>::value ) {
6383  (~C)(iend,j) = A(iend,k) * B(k,j);
6384  }
6385  }
6386 
// Scaling pass: multiply the elements of column j by the scalar. The row range is narrowed
// only if both operands are lower (lower bound) or both are upper (upper bound).
6387  {
6388  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
6389  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
6390  :( 0UL ) );
6391  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6392  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
6393  :( M ) );
6394  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6395 
6396  for( size_t i=ibegin; i<iend; ++i ) {
6397  (~C)(i,j) *= scalar;
6398  }
6399  }
6400  }
6401  }
6402  //**********************************************************************************************
6403 
6404  //**Default assignment to row-major dense matrices (general/diagonal)***************************
6418  template< typename MT3 // Type of the left-hand side target matrix
6419  , typename MT4 // Type of the left-hand side matrix operand
6420  , typename MT5 // Type of the right-hand side matrix operand
6421  , typename ST2 > // Type of the scalar value
6422  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6423  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6424  {
6425  const size_t M( A.rows() );
6426  const size_t N( B.columns() );
6427 
6428  const size_t block( BLOCK_SIZE );
6429 
6430  for( size_t ii=0UL; ii<M; ii+=block ) {
6431  const size_t iend( min( M, ii+block ) );
6432  for( size_t jj=0UL; jj<N; jj+=block ) {
6433  const size_t jend( min( N, jj+block ) );
6434  for( size_t i=ii; i<iend; ++i )
6435  {
6436  const size_t jbegin( ( IsUpper<MT4>::value )
6437  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
6438  :( jj ) );
6439  const size_t jpos( ( IsLower<MT4>::value )
6440  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
6441  :( jend ) );
6442 
6443  if( IsUpper<MT4>::value ) {
6444  for( size_t j=jj; j<jbegin; ++j ) {
6445  reset( (~C)(i,j) );
6446  }
6447  }
6448  for( size_t j=jbegin; j<jpos; ++j ) {
6449  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6450  }
6451  if( IsLower<MT4>::value ) {
6452  for( size_t j=jpos; j<jend; ++j ) {
6453  reset( (~C)(i,j) );
6454  }
6455  }
6456  }
6457  }
6458  }
6459  }
6460  //**********************************************************************************************
6461 
6462  //**Default assignment to column-major dense matrices (general/diagonal)************************
6476  template< typename MT3 // Type of the left-hand side target matrix
6477  , typename MT4 // Type of the left-hand side matrix operand
6478  , typename MT5 // Type of the right-hand side matrix operand
6479  , typename ST2 > // Type of the scalar value
6480  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6481  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6482  {
6483  const size_t M( A.rows() );
6484  const size_t N( B.columns() );
6485 
6486  for( size_t j=0UL; j<N; ++j )
6487  {
6488  const size_t ibegin( ( IsLower<MT4>::value )
6489  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6490  :( 0UL ) );
6491  const size_t iend( ( IsUpper<MT4>::value )
6492  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
6493  :( M ) );
6494  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6495 
6496  if( IsLower<MT4>::value ) {
6497  for( size_t i=0UL; i<ibegin; ++i ) {
6498  reset( (~C)(i,j) );
6499  }
6500  }
6501  for( size_t i=ibegin; i<iend; ++i ) {
6502  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6503  }
6504  if( IsUpper<MT4>::value ) {
6505  for( size_t i=iend; i<M; ++i ) {
6506  reset( (~C)(i,j) );
6507  }
6508  }
6509  }
6510  }
6511  //**********************************************************************************************
6512 
6513  //**Default assignment to row-major dense matrices (diagonal/general)***************************
6527  template< typename MT3 // Type of the left-hand side target matrix
6528  , typename MT4 // Type of the left-hand side matrix operand
6529  , typename MT5 // Type of the right-hand side matrix operand
6530  , typename ST2 > // Type of the scalar value
6531  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6532  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6533  {
6534  const size_t M( A.rows() );
6535  const size_t N( B.columns() );
6536 
6537  for( size_t i=0UL; i<M; ++i )
6538  {
6539  const size_t jbegin( ( IsUpper<MT5>::value )
6540  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6541  :( 0UL ) );
6542  const size_t jend( ( IsLower<MT5>::value )
6543  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
6544  :( N ) );
6545  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6546 
6547  if( IsUpper<MT5>::value ) {
6548  for( size_t j=0UL; j<jbegin; ++j ) {
6549  reset( (~C)(i,j) );
6550  }
6551  }
6552  for( size_t j=jbegin; j<jend; ++j ) {
6553  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6554  }
6555  if( IsLower<MT5>::value ) {
6556  for( size_t j=jend; j<N; ++j ) {
6557  reset( (~C)(i,j) );
6558  }
6559  }
6560  }
6561  }
6562  //**********************************************************************************************
6563 
6564  //**Default assignment to column-major dense matrices (diagonal/general)************************
6578  template< typename MT3 // Type of the left-hand side target matrix
6579  , typename MT4 // Type of the left-hand side matrix operand
6580  , typename MT5 // Type of the right-hand side matrix operand
6581  , typename ST2 > // Type of the scalar value
6582  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6583  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6584  {
6585  const size_t M( A.rows() );
6586  const size_t N( B.columns() );
6587 
6588  const size_t block( BLOCK_SIZE );
6589 
6590  for( size_t jj=0UL; jj<N; jj+=block ) {
6591  const size_t jend( min( N, jj+block ) );
6592  for( size_t ii=0UL; ii<M; ii+=block ) {
6593  const size_t iend( min( M, ii+block ) );
6594  for( size_t j=jj; j<jend; ++j )
6595  {
6596  const size_t ibegin( ( IsLower<MT5>::value )
6597  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
6598  :( ii ) );
6599  const size_t ipos( ( IsUpper<MT5>::value )
6600  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
6601  :( iend ) );
6602 
6603  if( IsLower<MT5>::value ) {
6604  for( size_t i=ii; i<ibegin; ++i ) {
6605  reset( (~C)(i,j) );
6606  }
6607  }
6608  for( size_t i=ibegin; i<ipos; ++i ) {
6609  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6610  }
6611  if( IsUpper<MT5>::value ) {
6612  for( size_t i=ipos; i<iend; ++i ) {
6613  reset( (~C)(i,j) );
6614  }
6615  }
6616  }
6617  }
6618  }
6619  }
6620  //**********************************************************************************************
6621 
6622  //**Default assignment to dense matrices (diagonal/diagonal)************************************
6636  template< typename MT3 // Type of the left-hand side target matrix
6637  , typename MT4 // Type of the left-hand side matrix operand
6638  , typename MT5 // Type of the right-hand side matrix operand
6639  , typename ST2 > // Type of the scalar value
6640  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6641  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6642  {
6643  reset( C );
6644 
6645  for( size_t i=0UL; i<A.rows(); ++i ) {
6646  C(i,i) = A(i,i) * B(i,i) * scalar;
6647  }
6648  }
6649  //**********************************************************************************************
6650 
6651  //**Default assignment to dense matrices (small matrices)***************************************
6665  template< typename MT3 // Type of the left-hand side target matrix
6666  , typename MT4 // Type of the left-hand side matrix operand
6667  , typename MT5 // Type of the right-hand side matrix operand
6668  , typename ST2 > // Type of the scalar value
6669  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6670  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6671  {
6672  selectDefaultAssignKernel( C, A, B, scalar );
6673  }
6674  //**********************************************************************************************
6675 
6676  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Vectorized small-matrix kernel computing C = ( A * B ) * scalar for a row-major target C.
// This overload is enabled only if UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
//
// Parameters:
//   C       target dense matrix (row-major)
//   A       left-hand side multiplication operand
//   B       right-hand side multiplication operand
//   scalar  scaling factor applied to the product
//
// The columns of C are processed in panels of 8, 4, 2 and 1 intrinsic vectors (IT::size
// elements each), followed by an element-wise loop over the leftover columns. Every panel
// accumulates A(i,k) * B(k,j..) over k in intrinsic accumulators, multiplies the result by
// the scalar and stores it into C. The k ranges are narrowed with the compile-time
// IsUpper/IsLower/IsStrictly* traits of A (MT4) and B (MT5).
//
// NOTE(review): the xmm accumulators are declared without an explicit initial value; this
// relies on a default-constructed IntrinsicType being zero - confirm.
6691  template< typename MT3 // Type of the left-hand side target matrix
6692  , typename MT4 // Type of the left-hand side matrix operand
6693  , typename MT5 // Type of the right-hand side matrix operand
6694  , typename ST2 > // Type of the scalar value
6695  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6696  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6697  {
6698  typedef IntrinsicTrait<ElementType> IT;
6699 
6700  const size_t M( A.rows() );
6701  const size_t N( B.columns() );
6702  const size_t K( A.columns() );
6703 
// Leftover columns need element-wise handling unless both C and B are padded.
6704  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
6705 
// End of the vectorized column range: N rounded down to a multiple of IT::size if a
// remainder loop is required (see the assertion), N otherwise.
6706  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
6707  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
6708 
// Scaling factor in intrinsic form (presumably the scalar broadcast to all lanes - confirm).
6709  const IntrinsicType factor( set( scalar ) );
6710 
6711  size_t j( 0UL );
6712 
// Panel of 8 intrinsic vectors, one row of C at a time.
6713  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
6714  for( size_t i=0UL; i<M; ++i )
6715  {
// k range for row i and this column panel, narrowed by the structure of A and B.
6716  const size_t kbegin( ( IsUpper<MT4>::value )
6717  ?( ( IsLower<MT5>::value )
6718  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6719  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6720  :( IsLower<MT5>::value ? j : 0UL ) );
6721  const size_t kend( ( IsLower<MT4>::value )
6722  ?( ( IsUpper<MT5>::value )
6723  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
6724  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6725  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
6726 
6727  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6728 
6729  for( size_t k=kbegin; k<kend; ++k ) {
6730  const IntrinsicType a1( set( A(i,k) ) );
6731  xmm1 = xmm1 + a1 * B.load(k,j );
6732  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
6733  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
6734  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
6735  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
6736  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
6737  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
6738  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
6739  }
6740 
6741  (~C).store( i, j , xmm1 * factor );
6742  (~C).store( i, j+IT::size , xmm2 * factor );
6743  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
6744  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
6745  (~C).store( i, j+IT::size*4UL, xmm5 * factor );
6746  (~C).store( i, j+IT::size*5UL, xmm6 * factor );
6747  (~C).store( i, j+IT::size*6UL, xmm7 * factor );
6748  (~C).store( i, j+IT::size*7UL, xmm8 * factor );
6749  }
6750  }
6751 
// Panel of 4 intrinsic vectors, two rows of C at a time (plus a single leftover row).
6752  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
6753  {
6754  size_t i( 0UL );
6755 
6756  for( ; (i+2UL) <= M; i+=2UL )
6757  {
// The k range covers both rows i and i+1, hence the i+1UL/i+2UL upper bounds for lower A.
6758  const size_t kbegin( ( IsUpper<MT4>::value )
6759  ?( ( IsLower<MT5>::value )
6760  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6761  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6762  :( IsLower<MT5>::value ? j : 0UL ) );
6763  const size_t kend( ( IsLower<MT4>::value )
6764  ?( ( IsUpper<MT5>::value )
6765  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
6766  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6767  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
6768 
// xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1.
6769  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6770 
6771  for( size_t k=kbegin; k<kend; ++k ) {
6772  const IntrinsicType a1( set( A(i ,k) ) );
6773  const IntrinsicType a2( set( A(i+1UL,k) ) );
6774  const IntrinsicType b1( B.load(k,j ) );
6775  const IntrinsicType b2( B.load(k,j+IT::size ) );
6776  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
6777  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
6778  xmm1 = xmm1 + a1 * b1;
6779  xmm2 = xmm2 + a1 * b2;
6780  xmm3 = xmm3 + a1 * b3;
6781  xmm4 = xmm4 + a1 * b4;
6782  xmm5 = xmm5 + a2 * b1;
6783  xmm6 = xmm6 + a2 * b2;
6784  xmm7 = xmm7 + a2 * b3;
6785  xmm8 = xmm8 + a2 * b4;
6786  }
6787 
6788  (~C).store( i , j , xmm1 * factor );
6789  (~C).store( i , j+IT::size , xmm2 * factor );
6790  (~C).store( i , j+IT::size*2UL, xmm3 * factor );
6791  (~C).store( i , j+IT::size*3UL, xmm4 * factor );
6792  (~C).store( i+1UL, j , xmm5 * factor );
6793  (~C).store( i+1UL, j+IT::size , xmm6 * factor );
6794  (~C).store( i+1UL, j+IT::size*2UL, xmm7 * factor );
6795  (~C).store( i+1UL, j+IT::size*3UL, xmm8 * factor );
6796  }
6797 
// Single leftover row (odd M). The upper k bound is narrowed only by B's structure here.
6798  if( i < M )
6799  {
6800  const size_t kbegin( ( IsUpper<MT4>::value )
6801  ?( ( IsLower<MT5>::value )
6802  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6803  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6804  :( IsLower<MT5>::value ? j : 0UL ) );
6805  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
6806 
6807  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6808 
6809  for( size_t k=kbegin; k<kend; ++k ) {
6810  const IntrinsicType a1( set( A(i,k) ) );
6811  xmm1 = xmm1 + a1 * B.load(k,j );
6812  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
6813  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
6814  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
6815  }
6816 
6817  (~C).store( i, j , xmm1 * factor );
6818  (~C).store( i, j+IT::size , xmm2 * factor );
6819  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
6820  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
6821  }
6822  }
6823 
// Panel of 2 intrinsic vectors, two rows of C at a time (plus a single leftover row).
6824  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
6825  {
6826  size_t i( 0UL );
6827 
6828  for( ; (i+2UL) <= M; i+=2UL )
6829  {
6830  const size_t kbegin( ( IsUpper<MT4>::value )
6831  ?( ( IsLower<MT5>::value )
6832  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6833  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6834  :( IsLower<MT5>::value ? j : 0UL ) );
6835  const size_t kend( ( IsLower<MT4>::value )
6836  ?( ( IsUpper<MT5>::value )
6837  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
6838  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6839  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
6840 
// xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1.
6841  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6842 
6843  for( size_t k=kbegin; k<kend; ++k ) {
6844  const IntrinsicType a1( set( A(i ,k) ) );
6845  const IntrinsicType a2( set( A(i+1UL,k) ) );
6846  const IntrinsicType b1( B.load(k,j ) );
6847  const IntrinsicType b2( B.load(k,j+IT::size) );
6848  xmm1 = xmm1 + a1 * b1;
6849  xmm2 = xmm2 + a1 * b2;
6850  xmm3 = xmm3 + a2 * b1;
6851  xmm4 = xmm4 + a2 * b2;
6852  }
6853 
6854  (~C).store( i , j , xmm1 * factor );
6855  (~C).store( i , j+IT::size, xmm2 * factor );
6856  (~C).store( i+1UL, j , xmm3 * factor );
6857  (~C).store( i+1UL, j+IT::size, xmm4 * factor );
6858  }
6859 
// Single leftover row (odd M).
6860  if( i < M )
6861  {
6862  const size_t kbegin( ( IsUpper<MT4>::value )
6863  ?( ( IsLower<MT5>::value )
6864  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6865  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6866  :( IsLower<MT5>::value ? j : 0UL ) );
6867  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
6868 
6869  IntrinsicType xmm1, xmm2;
6870 
6871  for( size_t k=kbegin; k<kend; ++k ) {
6872  const IntrinsicType a1( set( A(i,k) ) );
6873  xmm1 = xmm1 + a1 * B.load(k,j );
6874  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
6875  }
6876 
6877  (~C).store( i, j , xmm1 * factor );
6878  (~C).store( i, j+IT::size, xmm2 * factor );
6879  }
6880  }
6881 
// Panel of a single intrinsic vector, two rows of C at a time (plus a single leftover row).
6882  for( ; j<jpos; j+=IT::size )
6883  {
6884  size_t i( 0UL );
6885 
6886  for( ; (i+2UL) <= M; i+=2UL )
6887  {
6888  const size_t kbegin( ( IsUpper<MT4>::value )
6889  ?( ( IsLower<MT5>::value )
6890  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6891  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6892  :( IsLower<MT5>::value ? j : 0UL ) );
// In this panel the upper k bound depends on A's structure only.
6893  const size_t kend( ( IsLower<MT4>::value )
6894  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6895  :( K ) );
6896 
6897  IntrinsicType xmm1, xmm2;
6898 
6899  for( size_t k=kbegin; k<kend; ++k ) {
6900  const IntrinsicType b1( B.load(k,j) );
6901  xmm1 = xmm1 + set( A(i ,k) ) * b1;
6902  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
6903  }
6904 
6905  (~C).store( i , j, xmm1 * factor );
6906  (~C).store( i+1UL, j, xmm2 * factor );
6907  }
6908 
// Single leftover row (odd M); k runs up to K.
6909  if( i < M )
6910  {
6911  const size_t kbegin( ( IsUpper<MT4>::value )
6912  ?( ( IsLower<MT5>::value )
6913  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6914  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6915  :( IsLower<MT5>::value ? j : 0UL ) );
6916 
6917  IntrinsicType xmm1;
6918 
6919  for( size_t k=kbegin; k<K; ++k ) {
6920  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
6921  }
6922 
6923  (~C).store( i, j, xmm1 * factor );
6924  }
6925  }
6926 
// Element-wise loop over the leftover columns [jpos,N); only entered if remainder is set.
6927  for( ; remainder && j<N; ++j )
6928  {
6929  size_t i( 0UL );
6930 
6931  for( ; (i+2UL) <= M; i+=2UL )
6932  {
6933  const size_t kbegin( ( IsUpper<MT4>::value )
6934  ?( ( IsLower<MT5>::value )
6935  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6936  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6937  :( IsLower<MT5>::value ? j : 0UL ) );
6938  const size_t kend( ( IsLower<MT4>::value )
6939  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6940  :( K ) );
6941 
// Value-initialized accumulators for the elements (i,j) and (i+1,j).
6942  ElementType value1 = ElementType();
6943  ElementType value2 = ElementType();
6944 
6945  for( size_t k=kbegin; k<kend; ++k ) {
6946  value1 += A(i ,k) * B(k,j);
6947  value2 += A(i+1UL,k) * B(k,j);
6948  }
6949 
6950  (~C)(i ,j) = value1 * scalar;
6951  (~C)(i+1UL,j) = value2 * scalar;
6952  }
6953 
// Single leftover row (odd M); k runs up to K.
6954  if( i < M )
6955  {
6956  const size_t kbegin( ( IsUpper<MT4>::value )
6957  ?( ( IsLower<MT5>::value )
6958  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6959  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6960  :( IsLower<MT5>::value ? j : 0UL ) );
6961 
6962  ElementType value = ElementType();
6963 
6964  for( size_t k=kbegin; k<K; ++k ) {
6965  value += A(i,k) * B(k,j);
6966  }
6967 
6968  (~C)(i,j) = value * scalar;
6969  }
6970  }
6971  }
6972  //**********************************************************************************************
6973 
6974  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized small-matrix kernel for the scaled product: writes scalar * ( A * B ) into the
// column-major target C. It only takes part in overload resolution when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> enables it.
//
// \param C       Target matrix; results are written with store() / element assignment and the
//                previous contents of C are never read.
// \param A       Left operand, read via A.load(i,k) in the SIMD loops and A(i,k) in the scalar tail.
// \param B       Right operand, read element-wise via B(k,j) and broadcast with set().
// \param scalar  Factor applied to every accumulated dot product.
//
// Rows are consumed in panels of 8, 4, 2 and finally 1 SIMD vector(s); the row index advances by
// IT::size per vector. Rows in [ipos,M) are finished by a scalar loop when 'remainder' is set.
// NOTE(review): the xmm accumulators are default-constructed and immediately used in
// 'xmm = xmm + ...'; this relies on IntrinsicType's default constructor yielding zeros -- confirm.
6989  template< typename MT3 // Type of the left-hand side target matrix
6990  , typename MT4 // Type of the left-hand side matrix operand
6991  , typename MT5 // Type of the right-hand side matrix operand
6992  , typename ST2 > // Type of the scalar value
6993  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6994  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6995  {
6996  typedef IntrinsicTrait<ElementType> IT;
6997 
6998  const size_t M( A.rows() );
6999  const size_t N( B.columns() );
7000  const size_t K( A.columns() );
7001 
      // A scalar tail is required unless both the target type and the left operand type are padded.
7002  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
7003 
      // With a tail, ipos is M rounded down to a multiple of IT::size (checked by the assertion
      // below); otherwise the SIMD loops run over all M rows.
7004  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
7005  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
7006 
      // The scalar broadcast into a SIMD vector, applied once per store.
7007  const IntrinsicType factor( set( scalar ) );
7008 
7009  size_t i( 0UL );
7010 
      // Panel of 8 SIMD vectors of rows, one column of C per iteration.
7011  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
7012  for( size_t j=0UL; j<N; ++j )
7013  {
      // [kbegin,kend) is narrowed using the IsUpper/IsLower/IsStrictly* traits of A (MT4) and
      // B (MT5); without such traits the full range [0,K) is used.
7014  const size_t kbegin( ( IsLower<MT5>::value )
7015  ?( ( IsUpper<MT4>::value )
7016  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7017  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7018  :( IsUpper<MT4>::value ? i : 0UL ) );
7019  const size_t kend( ( IsUpper<MT5>::value )
7020  ?( ( IsLower<MT4>::value )
7021  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
7022  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
7023  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
7024 
7025  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7026 
7027  for( size_t k=kbegin; k<kend; ++k ) {
7028  const IntrinsicType b1( set( B(k,j) ) );
7029  xmm1 = xmm1 + A.load(i ,k) * b1;
7030  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
7031  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
7032  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
7033  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
7034  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
7035  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
7036  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
7037  }
7038 
7039  (~C).store( i , j, xmm1 * factor );
7040  (~C).store( i+IT::size , j, xmm2 * factor );
7041  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
7042  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
7043  (~C).store( i+IT::size*4UL, j, xmm5 * factor );
7044  (~C).store( i+IT::size*5UL, j, xmm6 * factor );
7045  (~C).store( i+IT::size*6UL, j, xmm7 * factor );
7046  (~C).store( i+IT::size*7UL, j, xmm8 * factor );
7047  }
7048  }
7049 
      // Panel of 4 SIMD vectors of rows; columns are taken in pairs, then a single leftover column.
7050  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
7051  {
7052  size_t j( 0UL );
7053 
7054  for( ; (j+2UL) <= N; j+=2UL )
7055  {
7056  const size_t kbegin( ( IsLower<MT5>::value )
7057  ?( ( IsUpper<MT4>::value )
7058  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7059  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7060  :( IsUpper<MT4>::value ? i : 0UL ) );
7061  const size_t kend( ( IsUpper<MT5>::value )
7062  ?( ( IsLower<MT4>::value )
7063  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7064  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7065  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
7066 
      // xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
7067  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7068 
7069  for( size_t k=kbegin; k<kend; ++k ) {
7070  const IntrinsicType a1( A.load(i ,k) );
7071  const IntrinsicType a2( A.load(i+IT::size ,k) );
7072  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
7073  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
7074  const IntrinsicType b1( set( B(k,j ) ) );
7075  const IntrinsicType b2( set( B(k,j+1UL) ) );
7076  xmm1 = xmm1 + a1 * b1;
7077  xmm2 = xmm2 + a2 * b1;
7078  xmm3 = xmm3 + a3 * b1;
7079  xmm4 = xmm4 + a4 * b1;
7080  xmm5 = xmm5 + a1 * b2;
7081  xmm6 = xmm6 + a2 * b2;
7082  xmm7 = xmm7 + a3 * b2;
7083  xmm8 = xmm8 + a4 * b2;
7084  }
7085 
7086  (~C).store( i , j , xmm1 * factor );
7087  (~C).store( i+IT::size , j , xmm2 * factor );
7088  (~C).store( i+IT::size*2UL, j , xmm3 * factor );
7089  (~C).store( i+IT::size*3UL, j , xmm4 * factor );
7090  (~C).store( i , j+1UL, xmm5 * factor );
7091  (~C).store( i+IT::size , j+1UL, xmm6 * factor );
7092  (~C).store( i+IT::size*2UL, j+1UL, xmm7 * factor );
7093  (~C).store( i+IT::size*3UL, j+1UL, xmm8 * factor );
7094  }
7095 
      // Leftover single column (odd N). Here j is the last column, so no upper bound from B is applied.
7096  if( j < N )
7097  {
7098  const size_t kbegin( ( IsLower<MT5>::value )
7099  ?( ( IsUpper<MT4>::value )
7100  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7101  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7102  :( IsUpper<MT4>::value ? i : 0UL ) );
7103  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
7104 
7105  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7106 
7107  for( size_t k=kbegin; k<kend; ++k ) {
7108  const IntrinsicType b1( set( B(k,j) ) );
7109  xmm1 = xmm1 + A.load(i ,k) * b1;
7110  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
7111  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
7112  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
7113  }
7114 
7115  (~C).store( i , j, xmm1 * factor );
7116  (~C).store( i+IT::size , j, xmm2 * factor );
7117  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
7118  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
7119  }
7120  }
7121 
      // Panel of 2 SIMD vectors of rows; same column pairing as above.
7122  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
7123  {
7124  size_t j( 0UL );
7125 
7126  for( ; (j+2UL) <= N; j+=2UL )
7127  {
7128  const size_t kbegin( ( IsLower<MT5>::value )
7129  ?( ( IsUpper<MT4>::value )
7130  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7131  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7132  :( IsUpper<MT4>::value ? i : 0UL ) );
7133  const size_t kend( ( IsUpper<MT5>::value )
7134  ?( ( IsLower<MT4>::value )
7135  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7136  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7137  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
7138 
7139  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7140 
7141  for( size_t k=kbegin; k<kend; ++k ) {
7142  const IntrinsicType a1( A.load(i ,k) );
7143  const IntrinsicType a2( A.load(i+IT::size,k) );
7144  const IntrinsicType b1( set( B(k,j ) ) );
7145  const IntrinsicType b2( set( B(k,j+1UL) ) );
7146  xmm1 = xmm1 + a1 * b1;
7147  xmm2 = xmm2 + a2 * b1;
7148  xmm3 = xmm3 + a1 * b2;
7149  xmm4 = xmm4 + a2 * b2;
7150  }
7151 
7152  (~C).store( i , j , xmm1 * factor );
7153  (~C).store( i+IT::size, j , xmm2 * factor );
7154  (~C).store( i , j+1UL, xmm3 * factor );
7155  (~C).store( i+IT::size, j+1UL, xmm4 * factor );
7156  }
7157 
7158  if( j < N )
7159  {
7160  const size_t kbegin( ( IsLower<MT5>::value )
7161  ?( ( IsUpper<MT4>::value )
7162  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7163  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7164  :( IsUpper<MT4>::value ? i : 0UL ) );
7165  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
7166 
7167  IntrinsicType xmm1, xmm2;
7168 
7169  for( size_t k=kbegin; k<kend; ++k ) {
7170  const IntrinsicType b1( set( B(k,j) ) );
7171  xmm1 = xmm1 + A.load(i ,k) * b1;
7172  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
7173  }
7174 
7175  (~C).store( i , j, xmm1 * factor );
7176  (~C).store( i+IT::size, j, xmm2 * factor );
7177  }
7178  }
7179 
      // Single SIMD vector of rows. Note that kend here only considers B's upper structure; no
      // bound derived from IsLower<MT4> is applied in this loop.
7180  for( ; i<ipos; i+=IT::size )
7181  {
7182  size_t j( 0UL );
7183 
7184  for( ; (j+2UL) <= N; j+=2UL )
7185  {
7186  const size_t kbegin( ( IsLower<MT5>::value )
7187  ?( ( IsUpper<MT4>::value )
7188  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7189  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7190  :( IsUpper<MT4>::value ? i : 0UL ) );
7191  const size_t kend( ( IsUpper<MT5>::value )
7192  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7193  :( K ) );
7194 
7195  IntrinsicType xmm1, xmm2;
7196 
7197  for( size_t k=kbegin; k<kend; ++k ) {
7198  const IntrinsicType a1( A.load(i,k) );
7199  xmm1 = xmm1 + a1 * set( B(k,j ) );
7200  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
7201  }
7202 
7203  (~C).store( i, j , xmm1 * factor );
7204  (~C).store( i, j+1UL, xmm2 * factor );
7205  }
7206 
7207  if( j < N )
7208  {
7209  const size_t kbegin( ( IsLower<MT5>::value )
7210  ?( ( IsUpper<MT4>::value )
7211  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7212  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7213  :( IsUpper<MT4>::value ? i : 0UL ) );
7214 
7215  IntrinsicType xmm1;
7216 
7217  for( size_t k=kbegin; k<K; ++k ) {
7218  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
7219  }
7220 
7221  (~C).store( i, j, xmm1 * factor );
7222  }
7223  }
7224 
      // Scalar tail: rows [ipos,M) that do not fill a complete SIMD vector, computed element-wise.
7225  for( ; remainder && i<M; ++i )
7226  {
7227  size_t j( 0UL );
7228 
7229  for( ; (j+2UL) <= N; j+=2UL )
7230  {
7231  const size_t kbegin( ( IsLower<MT5>::value )
7232  ?( ( IsUpper<MT4>::value )
7233  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7234  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7235  :( IsUpper<MT4>::value ? i : 0UL ) );
7236  const size_t kend( ( IsUpper<MT5>::value )
7237  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7238  :( K ) );
7239 
7240  ElementType value1 = ElementType();
7241  ElementType value2 = ElementType();
7242 
7243  for( size_t k=kbegin; k<kend; ++k ) {
7244  value1 += A(i,k) * B(k,j );
7245  value2 += A(i,k) * B(k,j+1UL);
7246  }
7247 
7248  (~C)(i,j ) = value1 * scalar;
7249  (~C)(i,j+1UL) = value2 * scalar;
7250  }
7251 
7252  if( j < N )
7253  {
7254  const size_t kbegin( ( IsLower<MT5>::value )
7255  ?( ( IsUpper<MT4>::value )
7256  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7257  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7258  :( IsUpper<MT4>::value ? i : 0UL ) );
7259 
7260  ElementType value = ElementType();
7261 
7262  for( size_t k=kbegin; k<K; ++k ) {
7263  value += A(i,k) * B(k,j);
7264  }
7265 
7266  (~C)(i,j) = value * scalar;
7267  }
7268  }
7269  }
7270  //**********************************************************************************************
7271 
7272  //**Default assignment to dense matrices (large matrices)***************************************
7286  template< typename MT3 // Type of the left-hand side target matrix
7287  , typename MT4 // Type of the left-hand side matrix operand
7288  , typename MT5 // Type of the right-hand side matrix operand
7289  , typename ST2 > // Type of the scalar value
7290  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7291  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7292  {
7293  selectDefaultAssignKernel( C, A, B, scalar );
7294  }
7295  //**********************************************************************************************
7296 
7297  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
// Vectorized, blocked (tiled) large-matrix kernel for the scaled product: computes
// C = scalar * ( A * B ) for a row-major target C. It only takes part in overload resolution
// when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> enables it.
//
// \param C       Target matrix; each tile is reset first and then accumulated into.
// \param A       Left operand, read element-wise via A(i,k) and broadcast with set().
// \param B       Right operand, read via B.load(k,j) in the SIMD loops and B(k,j) in the scalar tail.
// \param scalar  Factor applied to every partial product before it is added to C.
//
// The loops are tiled by DMATDMATMULT_DEFAULT_JBLOCK_SIZE (columns), DMATDMATMULT_DEFAULT_IBLOCK_SIZE
// (rows) and DMATDMATMULT_DEFAULT_KBLOCK_SIZE (inner dimension). Within a tile, columns are consumed
// in panels of 4, 2 and 1 SIMD vector(s); the column index advances by IT::size per vector.
// NOTE(review): the xmm accumulators are default-constructed and immediately used in
// 'xmm = xmm + ...'; this relies on IntrinsicType's default constructor yielding zeros -- confirm.
7312  template< typename MT3 // Type of the left-hand side target matrix
7313  , typename MT4 // Type of the left-hand side matrix operand
7314  , typename MT5 // Type of the right-hand side matrix operand
7315  , typename ST2 > // Type of the scalar value
7316  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7317  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7318  {
7319  typedef IntrinsicTrait<ElementType> IT;
7320 
7321  const size_t M( A.rows() );
7322  const size_t N( B.columns() );
7323  const size_t K( A.columns() );
7324 
      // A scalar tail is required unless both the target type and the right operand type are padded.
7325  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
7326 
7327  const IntrinsicType factor( set( scalar ) );
7328 
      // Outer tiling over the columns of C.
7329  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
7330  {
7331  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
7332 
      // With a tail, jpos is jend rounded down to a multiple of IT::size (checked by the assertion).
7333  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
7334  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
7335 
7336  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
7337  {
7338  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
7339 
      // Clear the current tile of C once; every kk block below adds its contribution on top.
7340  for( size_t i=ii; i<iend; ++i ) {
7341  for( size_t j=jj; j<jend; ++j ) {
7342  reset( (~C)(i,j) );
7343  }
7344  }
7345 
7346  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
7347  {
7348  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
7349 
7350  size_t j( jj );
7351 
      // Panel of 4 SIMD vectors of columns; rows are taken in pairs, then a single leftover row.
7352  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
7353  {
7354  const size_t j1( j+IT::size );
7355  const size_t j2( j+IT::size*2UL );
7356  const size_t j3( j+IT::size*3UL );
7357 
7358  size_t i( ii );
7359 
7360  for( ; (i+2UL) <= iend; i+=2UL )
7361  {
      // [kbegin,kend) is the current k block [kk,ktmp) narrowed by the IsUpper/IsLower traits of
      // A (MT4) and B (MT5); an empty range (kbegin >= kend) simply skips the inner loop.
7362  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7363  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7364  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7365  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
7366 
      // xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
7367  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7368 
7369  for( size_t k=kbegin; k<kend; ++k ) {
7370  const IntrinsicType a1( set( A(i ,k) ) );
7371  const IntrinsicType a2( set( A(i+1UL,k) ) );
7372  const IntrinsicType b1( B.load(k,j ) );
7373  const IntrinsicType b2( B.load(k,j1) );
7374  const IntrinsicType b3( B.load(k,j2) );
7375  const IntrinsicType b4( B.load(k,j3) );
7376  xmm1 = xmm1 + a1 * b1;
7377  xmm2 = xmm2 + a1 * b2;
7378  xmm3 = xmm3 + a1 * b3;
7379  xmm4 = xmm4 + a1 * b4;
7380  xmm5 = xmm5 + a2 * b1;
7381  xmm6 = xmm6 + a2 * b2;
7382  xmm7 = xmm7 + a2 * b3;
7383  xmm8 = xmm8 + a2 * b4;
7384  }
7385 
      // Read-modify-write: add this k block's scaled partial sums to what C already holds.
7386  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7387  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7388  (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
7389  (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
7390  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
7391  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
7392  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
7393  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
7394  }
7395 
7396  if( i < iend )
7397  {
7398  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7399  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7400  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7401  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
7402 
7403  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7404 
7405  for( size_t k=kbegin; k<kend; ++k ) {
7406  const IntrinsicType a1( set( A(i,k) ) );
7407  xmm1 = xmm1 + a1 * B.load(k,j );
7408  xmm2 = xmm2 + a1 * B.load(k,j1);
7409  xmm3 = xmm3 + a1 * B.load(k,j2);
7410  xmm4 = xmm4 + a1 * B.load(k,j3);
7411  }
7412 
7413  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7414  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
7415  (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
7416  (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
7417  }
7418  }
7419 
      // Panel of 2 SIMD vectors of columns; rows are taken in groups of 4, then 2, then 1.
7420  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
7421  {
7422  const size_t j1( j+IT::size );
7423 
7424  size_t i( ii );
7425 
7426  for( ; (i+4UL) <= iend; i+=4UL )
7427  {
7428  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7429  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7430  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7431  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
7432 
      // Accumulator pairs (xmm1,xmm2), (xmm3,xmm4), (xmm5,xmm6), (xmm7,xmm8) belong to rows i..i+3.
7433  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7434 
7435  for( size_t k=kbegin; k<kend; ++k ) {
7436  const IntrinsicType a1( set( A(i ,k) ) );
7437  const IntrinsicType a2( set( A(i+1UL,k) ) );
7438  const IntrinsicType a3( set( A(i+2UL,k) ) );
7439  const IntrinsicType a4( set( A(i+3UL,k) ) );
7440  const IntrinsicType b1( B.load(k,j ) );
7441  const IntrinsicType b2( B.load(k,j1) );
7442  xmm1 = xmm1 + a1 * b1;
7443  xmm2 = xmm2 + a1 * b2;
7444  xmm3 = xmm3 + a2 * b1;
7445  xmm4 = xmm4 + a2 * b2;
7446  xmm5 = xmm5 + a3 * b1;
7447  xmm6 = xmm6 + a3 * b2;
7448  xmm7 = xmm7 + a4 * b1;
7449  xmm8 = xmm8 + a4 * b2;
7450  }
7451 
7452  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7453  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7454  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
7455  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
7456  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
7457  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
7458  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
7459  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
7460  }
7461 
7462  for( ; (i+2UL) <= iend; i+=2UL )
7463  {
7464  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7465  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7466  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7467  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
7468 
7469  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7470 
7471  for( size_t k=kbegin; k<kend; ++k ) {
7472  const IntrinsicType a1( set( A(i ,k) ) );
7473  const IntrinsicType a2( set( A(i+1UL,k) ) );
7474  const IntrinsicType b1( B.load(k,j ) );
7475  const IntrinsicType b2( B.load(k,j1) );
7476  xmm1 = xmm1 + a1 * b1;
7477  xmm2 = xmm2 + a1 * b2;
7478  xmm3 = xmm3 + a2 * b1;
7479  xmm4 = xmm4 + a2 * b2;
7480  }
7481 
7482  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7483  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7484  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
7485  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
7486  }
7487 
7488  if( i < iend )
7489  {
7490  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7491  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7492  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7493  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
7494 
7495  IntrinsicType xmm1, xmm2;
7496 
7497  for( size_t k=kbegin; k<kend; ++k ) {
7498  const IntrinsicType a1( set( A(i,k) ) );
7499  xmm1 = xmm1 + a1 * B.load(k,j );
7500  xmm2 = xmm2 + a1 * B.load(k,j1);
7501  }
7502 
7503  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7504  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
7505  }
7506  }
7507 
      // Single SIMD vector of columns, one row at a time.
7508  for( ; j<jpos; j+=IT::size )
7509  {
7510  for( size_t i=ii; i<iend; ++i )
7511  {
7512  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7513  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7514  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7515  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
7516 
7517  IntrinsicType xmm1;
7518 
7519  for( size_t k=kbegin; k<kend; ++k ) {
7520  const IntrinsicType a1( set( A(i,k) ) );
7521  xmm1 = xmm1 + a1 * B.load(k,j);
7522  }
7523 
7524  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
7525  }
7526  }
7527 
      // Scalar tail: columns [jpos,jend) that do not fill a complete SIMD vector.
7528  for( ; remainder && j<jend; ++j )
7529  {
7530  for( size_t i=ii; i<iend; ++i )
7531  {
7532  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7533  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7534  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7535  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
7536 
7537  ElementType value = ElementType();
7538 
7539  for( size_t k=kbegin; k<kend; ++k ) {
7540  value += A(i,k) * B(k,j);
7541  }
7542 
7543  (~C)(i,j) += value * scalar;
7544  }
7545  }
7546  }
7547  }
7548  }
7549  }
7550  //**********************************************************************************************
7551 
7552  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
// Vectorized, blocked (tiled) large-matrix kernel for the scaled product: computes
// C = scalar * ( A * B ) for a column-major target C. It only takes part in overload resolution
// when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> enables it.
//
// \param C       Target matrix; each tile is reset first and then accumulated into.
// \param A       Left operand, read via A.load(i,k) in the SIMD loops and A(i,k) in the scalar tail.
// \param B       Right operand, read element-wise via B(k,j) and broadcast with set().
// \param scalar  Factor applied to every partial product before it is added to C.
//
// The loops are tiled by TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE (rows), TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE
// (columns) and TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE (inner dimension). Within a tile, rows are consumed
// in panels of 4, 2 and 1 SIMD vector(s); the row index advances by IT::size per vector.
// NOTE(review): the xmm accumulators are default-constructed and immediately used in
// 'xmm = xmm + ...'; this relies on IntrinsicType's default constructor yielding zeros -- confirm.
7567  template< typename MT3 // Type of the left-hand side target matrix
7568  , typename MT4 // Type of the left-hand side matrix operand
7569  , typename MT5 // Type of the right-hand side matrix operand
7570  , typename ST2 > // Type of the scalar value
7571  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7572  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7573  {
7574  typedef IntrinsicTrait<ElementType> IT;
7575 
7576  const size_t M( A.rows() );
7577  const size_t N( B.columns() );
7578  const size_t K( A.columns() );
7579 
      // A scalar tail is required unless both the target type and the left operand type are padded.
7580  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
7581 
7582  const IntrinsicType factor( set( scalar ) );
7583 
      // Outer tiling over the rows of C.
7584  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
7585  {
7586  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
7587 
      // With a tail, ipos is iend rounded down to a multiple of IT::size (checked by the assertion).
7588  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
7589  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
7590 
7591  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
7592  {
7593  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
7594 
      // Clear the current tile of C once; every kk block below adds its contribution on top.
7595  for( size_t j=jj; j<jend; ++j ) {
7596  for( size_t i=ii; i<iend; ++i ) {
7597  reset( (~C)(i,j) );
7598  }
7599  }
7600 
7601  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
7602  {
7603  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
7604 
7605  size_t i( ii );
7606 
      // Panel of 4 SIMD vectors of rows; columns are taken in pairs, then a single leftover column.
7607  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
7608  {
7609  const size_t i1( i+IT::size );
7610  const size_t i2( i+IT::size*2UL );
7611  const size_t i3( i+IT::size*3UL );
7612 
7613  size_t j( jj );
7614 
7615  for( ; (j+2UL) <= jend; j+=2UL )
7616  {
      // [kbegin,kend) is the current k block [kk,ktmp) narrowed by the IsUpper/IsLower traits of
      // A (MT4) and B (MT5); an empty range (kbegin >= kend) simply skips the inner loop.
7617  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7618  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7619  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
7620  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7621 
      // xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
7622  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7623 
7624  for( size_t k=kbegin; k<kend; ++k ) {
7625  const IntrinsicType a1( A.load(i ,k) );
7626  const IntrinsicType a2( A.load(i1,k) );
7627  const IntrinsicType a3( A.load(i2,k) );
7628  const IntrinsicType a4( A.load(i3,k) );
7629  const IntrinsicType b1( set( B(k,j ) ) );
7630  const IntrinsicType b2( set( B(k,j+1UL) ) );
7631  xmm1 = xmm1 + a1 * b1;
7632  xmm2 = xmm2 + a2 * b1;
7633  xmm3 = xmm3 + a3 * b1;
7634  xmm4 = xmm4 + a4 * b1;
7635  xmm5 = xmm5 + a1 * b2;
7636  xmm6 = xmm6 + a2 * b2;
7637  xmm7 = xmm7 + a3 * b2;
7638  xmm8 = xmm8 + a4 * b2;
7639  }
7640 
      // Read-modify-write: add this k block's scaled partial sums to what C already holds.
7641  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7642  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7643  (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
7644  (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
7645  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
7646  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
7647  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
7648  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
7649  }
7650 
7651  if( j < jend )
7652  {
7653  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7654  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7655  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
7656  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7657 
7658  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7659 
7660  for( size_t k=kbegin; k<kend; ++k ) {
7661  const IntrinsicType b1( set( B(k,j) ) );
7662  xmm1 = xmm1 + A.load(i ,k) * b1;
7663  xmm2 = xmm2 + A.load(i1,k) * b1;
7664  xmm3 = xmm3 + A.load(i2,k) * b1;
7665  xmm4 = xmm4 + A.load(i3,k) * b1;
7666  }
7667 
7668  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
7669  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
7670  (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
7671  (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
7672  }
7673  }
7674 
      // Panel of 2 SIMD vectors of rows; columns are taken in groups of 4, then 2, then 1.
7675  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
7676  {
7677  const size_t i1( i+IT::size );
7678 
7679  size_t j( jj );
7680 
7681  for( ; (j+4UL) <= jend; j+=4UL )
7682  {
7683  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7684  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7685  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
7686  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7687 
      // Accumulator pairs (xmm1,xmm2), (xmm3,xmm4), (xmm5,xmm6), (xmm7,xmm8) belong to columns j..j+3.
7688  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7689 
7690  for( size_t k=kbegin; k<kend; ++k ) {
7691  const IntrinsicType a1( A.load(i ,k) );
7692  const IntrinsicType a2( A.load(i1,k) );
7693  const IntrinsicType b1( set( B(k,j ) ) );
7694  const IntrinsicType b2( set( B(k,j+1UL) ) );
7695  const IntrinsicType b3( set( B(k,j+2UL) ) );
7696  const IntrinsicType b4( set( B(k,j+3UL) ) );
7697  xmm1 = xmm1 + a1 * b1;
7698  xmm2 = xmm2 + a2 * b1;
7699  xmm3 = xmm3 + a1 * b2;
7700  xmm4 = xmm4 + a2 * b2;
7701  xmm5 = xmm5 + a1 * b3;
7702  xmm6 = xmm6 + a2 * b3;
7703  xmm7 = xmm7 + a1 * b4;
7704  xmm8 = xmm8 + a2 * b4;
7705  }
7706 
7707  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7708  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7709  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
7710  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
7711  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
7712  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
7713  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
7714  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
7715  }
7716 
7717  for( ; (j+2UL) <= jend; j+=2UL )
7718  {
7719  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7720  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7721  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
7722  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7723 
7724  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7725 
7726  for( size_t k=kbegin; k<kend; ++k ) {
7727  const IntrinsicType a1( A.load(i ,k) );
7728  const IntrinsicType a2( A.load(i1,k) );
7729  const IntrinsicType b1( set( B(k,j ) ) );
7730  const IntrinsicType b2( set( B(k,j+1UL) ) );
7731  xmm1 = xmm1 + a1 * b1;
7732  xmm2 = xmm2 + a2 * b1;
7733  xmm3 = xmm3 + a1 * b2;
7734  xmm4 = xmm4 + a2 * b2;
7735  }
7736 
7737  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7738  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7739  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
7740  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
7741  }
7742 
7743  if( j < jend )
7744  {
7745  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7746  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7747  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
7748  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7749 
7750  IntrinsicType xmm1, xmm2;
7751 
7752  for( size_t k=kbegin; k<kend; ++k ) {
7753  const IntrinsicType b1( set( B(k,j) ) );
7754  xmm1 = xmm1 + A.load(i ,k) * b1;
7755  xmm2 = xmm2 + A.load(i1,k) * b1;
7756  }
7757 
7758  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
7759  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
7760  }
7761  }
7762 
      // Single SIMD vector of rows, one column at a time.
7763  for( ; i<ipos; i+=IT::size )
7764  {
7765  for( size_t j=jj; j<jend; ++j )
7766  {
7767  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7768  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7769  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
7770  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7771 
7772  IntrinsicType xmm1;
7773 
7774  for( size_t k=kbegin; k<kend; ++k ) {
7775  const IntrinsicType b1( set( B(k,j) ) );
7776  xmm1 = xmm1 + A.load(i,k) * b1;
7777  }
7778 
7779  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
7780  }
7781  }
7782 
      // Scalar tail: rows [ipos,iend) that do not fill a complete SIMD vector.
7783  for( ; remainder && i<iend; ++i )
7784  {
7785  for( size_t j=jj; j<jend; ++j )
7786  {
7787  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7788  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7789  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
7790  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7791 
7792  ElementType value = ElementType();
7793 
7794  for( size_t k=kbegin; k<kend; ++k ) {
7795  value += A(i,k) * B(k,j);
7796  }
7797 
7798  (~C)(i,j) += value * scalar;
7799  }
7800  }
7801  }
7802  }
7803  }
7804  }
7805  //**********************************************************************************************
7806 
7807  //**BLAS-based assignment to dense matrices (default)*******************************************
7821  template< typename MT3 // Type of the left-hand side target matrix
7822  , typename MT4 // Type of the left-hand side matrix operand
7823  , typename MT5 // Type of the right-hand side matrix operand
7824  , typename ST2 > // Type of the scalar value
7825  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7826  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7827  {
7828  selectLargeAssignKernel( C, A, B, scalar );
7829  }
7830  //**********************************************************************************************
7831 
7832  //**BLAS-based assignment to dense matrices*****************************************************
7833 #if BLAZE_BLAS_MODE
7834 
7847  template< typename MT3 // Type of the left-hand side target matrix
7848  , typename MT4 // Type of the left-hand side matrix operand
7849  , typename MT5 // Type of the right-hand side matrix operand
7850  , typename ST2 > // Type of the scalar value
7851  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7852  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7853  {
7854  typedef typename MT3::ElementType ET;
7855 
7856  if( IsTriangular<MT4>::value ) {
7857  assign( C, B );
7858  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7859  }
7860  else if( IsTriangular<MT5>::value ) {
7861  assign( C, A );
7862  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7863  }
7864  else {
7865  gemm( C, A, B, ET(scalar), ET(0) );
7866  }
7867  }
7868 #endif
7869  //**********************************************************************************************
7870 
7871  //**Assignment to sparse matrices***************************************************************
7883  template< typename MT // Type of the target sparse matrix
7884  , bool SO > // Storage order of the target sparse matrix
7885  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7886  {
7888 
7889  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
7890 
7897 
7898  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7899  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7900 
7901  const TmpType tmp( serial( rhs ) );
7902  assign( ~lhs, tmp );
7903  }
7904  //**********************************************************************************************
7905 
7906  //**Addition assignment to dense matrices*******************************************************
7918  template< typename MT // Type of the target dense matrix
7919  , bool SO > // Storage order of the target dense matrix
7920  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7921  {
7923 
7924  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7925  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7926 
7927  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7928  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7929 
7930  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7931  return;
7932  }
7933 
7934  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7935  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7936 
7937  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7938  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7939  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7940  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7941  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7942  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7943 
7944  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
7945  }
7946  //**********************************************************************************************
7947 
7948  //**Addition assignment to dense matrices (kernel selection)************************************
7959  template< typename MT3 // Type of the left-hand side target matrix
7960  , typename MT4 // Type of the left-hand side matrix operand
7961  , typename MT5 // Type of the right-hand side matrix operand
7962  , typename ST2 > // Type of the scalar value
7963  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7964  {
7965  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
7966  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7967  selectSmallAddAssignKernel( C, A, B, scalar );
7968  else
7969  selectBlasAddAssignKernel( C, A, B, scalar );
7970  }
7971  //**********************************************************************************************
7972 
7973  //**Default addition assignment to dense matrices (general/general)*****************************
7987  template< typename MT3 // Type of the left-hand side target matrix
7988  , typename MT4 // Type of the left-hand side matrix operand
7989  , typename MT5 // Type of the right-hand side matrix operand
7990  , typename ST2 > // Type of the scalar value
7991  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
7992  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7993  {
7994  const ResultType tmp( serial( A * B * scalar ) );
7995  addAssign( C, tmp );
7996  }
7997  //**********************************************************************************************
7998 
7999  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
8013  template< typename MT3 // Type of the left-hand side target matrix
8014  , typename MT4 // Type of the left-hand side matrix operand
8015  , typename MT5 // Type of the right-hand side matrix operand
8016  , typename ST2 > // Type of the scalar value
8017  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
8018  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8019  {
8020  const size_t M( A.rows() );
8021  const size_t N( B.columns() );
8022 
8023  const size_t block( BLOCK_SIZE );
8024 
8025  for( size_t ii=0UL; ii<M; ii+=block ) {
8026  const size_t iend( min( M, ii+block ) );
8027  for( size_t jj=0UL; jj<N; jj+=block ) {
8028  const size_t jend( min( N, jj+block ) );
8029  for( size_t i=ii; i<iend; ++i )
8030  {
8031  const size_t jbegin( ( IsUpper<MT4>::value )
8032  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
8033  :( jj ) );
8034  const size_t jpos( ( IsLower<MT4>::value )
8035  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
8036  :( jend ) );
8037 
8038  for( size_t j=jbegin; j<jpos; ++j ) {
8039  (~C)(i,j) += A(i,j) * B(j,j) * scalar;
8040  }
8041  }
8042  }
8043  }
8044  }
8045  //**********************************************************************************************
8046 
8047  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
8061  template< typename MT3 // Type of the left-hand side target matrix
8062  , typename MT4 // Type of the left-hand side matrix operand
8063  , typename MT5 // Type of the right-hand side matrix operand
8064  , typename ST2 > // Type of the scalar value
8065  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
8066  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8067  {
8068  const size_t M( A.rows() );
8069  const size_t N( B.columns() );
8070 
8071  for( size_t j=0UL; j<N; ++j )
8072  {
8073  const size_t ibegin( ( IsLower<MT4>::value )
8074  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
8075  :( 0UL ) );
8076  const size_t iend( ( IsUpper<MT4>::value )
8077  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
8078  :( M ) );
8079  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
8080 
8081  const size_t inum( iend - ibegin );
8082  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
8083 
8084  for( size_t i=ibegin; i<ipos; i+=2UL ) {
8085  (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
8086  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
8087  }
8088  if( ipos < iend ) {
8089  (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
8090  }
8091  }
8092  }
8093  //**********************************************************************************************
8094 
8095  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
8109  template< typename MT3 // Type of the left-hand side target matrix
8110  , typename MT4 // Type of the left-hand side matrix operand
8111  , typename MT5 // Type of the right-hand side matrix operand
8112  , typename ST2 > // Type of the scalar value
8113  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
8114  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8115  {
8116  const size_t M( A.rows() );
8117  const size_t N( B.columns() );
8118 
8119  for( size_t i=0UL; i<M; ++i )
8120  {
8121  const size_t jbegin( ( IsUpper<MT5>::value )
8122  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
8123  :( 0UL ) );
8124  const size_t jend( ( IsLower<MT5>::value )
8125  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
8126  :( N ) );
8127  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
8128 
8129  const size_t jnum( jend - jbegin );
8130  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
8131 
8132  for( size_t j=jbegin; j<jpos; j+=2UL ) {
8133  (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
8134  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
8135  }
8136  if( jpos < jend ) {
8137  (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
8138  }
8139  }
8140  }
8141  //**********************************************************************************************
8142 
8143  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
8157  template< typename MT3 // Type of the left-hand side target matrix
8158  , typename MT4 // Type of the left-hand side matrix operand
8159  , typename MT5 // Type of the right-hand side matrix operand
8160  , typename ST2 > // Type of the scalar value
8161  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
8162  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8163  {
8164  const size_t M( A.rows() );
8165  const size_t N( B.columns() );
8166 
8167  const size_t block( BLOCK_SIZE );
8168 
8169  for( size_t jj=0UL; jj<N; jj+=block ) {
8170  const size_t jend( min( N, jj+block ) );
8171  for( size_t ii=0UL; ii<M; ii+=block ) {
8172  const size_t iend( min( M, ii+block ) );
8173  for( size_t j=jj; j<jend; ++j )
8174  {
8175  const size_t ibegin( ( IsLower<MT5>::value )
8176  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
8177  :( ii ) );
8178  const size_t ipos( ( IsUpper<MT5>::value )
8179  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
8180  :( iend ) );
8181 
8182  for( size_t i=ibegin; i<ipos; ++i ) {
8183  (~C)(i,j) += A(i,i) * B(i,j) * scalar;
8184  }
8185  }
8186  }
8187  }
8188  }
8189  //**********************************************************************************************
8190 
8191  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
8205  template< typename MT3 // Type of the left-hand side target matrix
8206  , typename MT4 // Type of the left-hand side matrix operand
8207  , typename MT5 // Type of the right-hand side matrix operand
8208  , typename ST2 > // Type of the scalar value
8209  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
8210  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8211  {
8212  for( size_t i=0UL; i<A.rows(); ++i ) {
8213  C(i,i) += A(i,i) * B(i,i) * scalar;
8214  }
8215  }
8216  //**********************************************************************************************
8217 
8218  //**Default addition assignment to dense matrices (small matrices)******************************
8232  template< typename MT3 // Type of the left-hand side target matrix
8233  , typename MT4 // Type of the left-hand side matrix operand
8234  , typename MT5 // Type of the right-hand side matrix operand
8235  , typename ST2 > // Type of the scalar value
8236  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8237  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8238  {
8239  selectDefaultAddAssignKernel( C, A, B, scalar );
8240  }
8241  //**********************************************************************************************
8242 
8243  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized small-matrix kernel for the addition assignment C += A * B * scalar,
// selected for a row-major target (DenseMatrix<MT3,false>) when
// UseVectorizedDefaultKernel applies.
//
// The columns of C are processed in panels of 8, 4, 2 and 1 intrinsic vectors
// (IT::size elements each), followed by an element-wise loop for leftover columns.
// For each panel the products are accumulated over k in intrinsic registers and
// only then multiplied by the scalar factor and added to C. The k-range
// [kbegin,kend) is narrowed at compile time according to the upper/lower structure
// of A (MT4) and B (MT5).
//
// C      : target matrix, updated in place
// A, B   : left-hand side and right-hand side operand of the product
// scalar : scaling factor applied to the accumulated product
8258  template< typename MT3 // Type of the left-hand side target matrix
8259  , typename MT4 // Type of the left-hand side matrix operand
8260  , typename MT5 // Type of the right-hand side matrix operand
8261  , typename ST2 > // Type of the scalar value
8262  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8263  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8264  {
8265  typedef IntrinsicTrait<ElementType> IT;
8266 
8267  const size_t M( A.rows() );
8268  const size_t N( B.columns() );
8269  const size_t K( A.columns() );
8270 
// A separate element-wise remainder loop is required unless both C and B are padded.
8271  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
8272 
// End of the column range handled with intrinsics: N rounded down to a multiple of
// IT::size if a remainder loop is needed (see the assertion below), N otherwise.
8273  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
8274  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
8275 
// NOTE(review): set() presumably broadcasts the scalar to all lanes - confirm.
8276  const IntrinsicType factor( set( scalar ) );
8277 
8278  size_t j( 0UL );
8279 
// Panels of 8 intrinsic vectors (IT::size*8 columns), one row of C at a time.
8280  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
8281  for( size_t i=0UL; i<M; ++i )
8282  {
// k-range for row i and the current column panel, narrowed by the structure of A and B.
8283  const size_t kbegin( ( IsUpper<MT4>::value )
8284  ?( ( IsLower<MT5>::value )
8285  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8286  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8287  :( IsLower<MT5>::value ? j : 0UL ) );
8288  const size_t kend( ( IsLower<MT4>::value )
8289  ?( ( IsUpper<MT5>::value )
8290  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
8291  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
8292  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
8293 
// Accumulators; their initial value comes from IntrinsicType's default construction
// (presumably zero - TODO confirm).
8294  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8295 
8296  for( size_t k=kbegin; k<kend; ++k ) {
8297  const IntrinsicType a1( set( A(i,k) ) );
8298  xmm1 = xmm1 + a1 * B.load(k,j );
8299  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
8300  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
8301  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
8302  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
8303  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
8304  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
8305  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
8306  }
8307 
// Scale the accumulated products and add them to the current content of C.
8308  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8309  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
8310  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
8311  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
8312  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
8313  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
8314  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
8315  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
8316  }
8317  }
8318 
// Panels of 4 intrinsic vectors, two rows of C at a time.
8319  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
8320  {
8321  size_t i( 0UL );
8322 
8323  for( ; (i+2UL) <= M; i+=2UL )
8324  {
// The k-range covers both rows i and i+1 (hence i+1UL/i+2UL in the upper bound).
8325  const size_t kbegin( ( IsUpper<MT4>::value )
8326  ?( ( IsLower<MT5>::value )
8327  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8328  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8329  :( IsLower<MT5>::value ? j : 0UL ) );
8330  const size_t kend( ( IsLower<MT4>::value )
8331  ?( ( IsUpper<MT5>::value )
8332  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
8333  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8334  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
8335 
// xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1.
8336  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8337 
8338  for( size_t k=kbegin; k<kend; ++k ) {
8339  const IntrinsicType a1( set( A(i ,k) ) );
8340  const IntrinsicType a2( set( A(i+1UL,k) ) );
8341  const IntrinsicType b1( B.load(k,j ) );
8342  const IntrinsicType b2( B.load(k,j+IT::size ) );
8343  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
8344  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
8345  xmm1 = xmm1 + a1 * b1;
8346  xmm2 = xmm2 + a1 * b2;
8347  xmm3 = xmm3 + a1 * b3;
8348  xmm4 = xmm4 + a1 * b4;
8349  xmm5 = xmm5 + a2 * b1;
8350  xmm6 = xmm6 + a2 * b2;
8351  xmm7 = xmm7 + a2 * b3;
8352  xmm8 = xmm8 + a2 * b4;
8353  }
8354 
8355  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8356  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
8357  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
8358  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
8359  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8360  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
8361  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
8362  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
8363  }
8364 
// Last row in case M is odd.
8365  if( i < M )
8366  {
8367  const size_t kbegin( ( IsUpper<MT4>::value )
8368  ?( ( IsLower<MT5>::value )
8369  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8370  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8371  :( IsLower<MT5>::value ? j : 0UL ) );
8372  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
8373 
8374  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8375 
8376  for( size_t k=kbegin; k<kend; ++k ) {
8377  const IntrinsicType a1( set( A(i,k) ) );
8378  xmm1 = xmm1 + a1 * B.load(k,j );
8379  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
8380  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
8381  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
8382  }
8383 
8384  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8385  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
8386  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
8387  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
8388  }
8389  }
8390 
// Panels of 2 intrinsic vectors, two rows of C at a time.
8391  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
8392  {
8393  size_t i( 0UL );
8394 
8395  for( ; (i+2UL) <= M; i+=2UL )
8396  {
8397  const size_t kbegin( ( IsUpper<MT4>::value )
8398  ?( ( IsLower<MT5>::value )
8399  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8400  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8401  :( IsLower<MT5>::value ? j : 0UL ) );
8402  const size_t kend( ( IsLower<MT4>::value )
8403  ?( ( IsUpper<MT5>::value )
8404  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
8405  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8406  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
8407 
// xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1.
8408  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8409 
8410  for( size_t k=kbegin; k<kend; ++k ) {
8411  const IntrinsicType a1( set( A(i ,k) ) );
8412  const IntrinsicType a2( set( A(i+1UL,k) ) );
8413  const IntrinsicType b1( B.load(k,j ) );
8414  const IntrinsicType b2( B.load(k,j+IT::size) );
8415  xmm1 = xmm1 + a1 * b1;
8416  xmm2 = xmm2 + a1 * b2;
8417  xmm3 = xmm3 + a2 * b1;
8418  xmm4 = xmm4 + a2 * b2;
8419  }
8420 
8421  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8422  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
8423  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8424  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
8425  }
8426 
// Last row in case M is odd.
8427  if( i < M )
8428  {
8429  const size_t kbegin( ( IsUpper<MT4>::value )
8430  ?( ( IsLower<MT5>::value )
8431  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8432  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8433  :( IsLower<MT5>::value ? j : 0UL ) );
8434  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
8435 
8436  IntrinsicType xmm1, xmm2;
8437 
8438  for( size_t k=kbegin; k<kend; ++k ) {
8439  const IntrinsicType a1( set( A(i,k) ) );
8440  xmm1 = xmm1 + a1 * B.load(k,j );
8441  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
8442  }
8443 
8444  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8445  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
8446  }
8447  }
8448 
// Single intrinsic vectors, two rows of C at a time.
8449  for( ; j<jpos; j+=IT::size )
8450  {
8451  size_t i( 0UL );
8452 
8453  for( ; (i+2UL) <= M; i+=2UL )
8454  {
8455  const size_t kbegin( ( IsUpper<MT4>::value )
8456  ?( ( IsLower<MT5>::value )
8457  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8458  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8459  :( IsLower<MT5>::value ? j : 0UL ) );
8460  const size_t kend( ( IsLower<MT4>::value )
8461  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8462  :( K ) );
8463 
8464  IntrinsicType xmm1, xmm2;
8465 
8466  for( size_t k=kbegin; k<kend; ++k ) {
8467  const IntrinsicType b1( B.load(k,j) );
8468  xmm1 = xmm1 + set( A(i ,k) ) * b1;
8469  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
8470  }
8471 
8472  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8473  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
8474  }
8475 
// Last row in case M is odd; the k-loop runs up to K here.
8476  if( i < M )
8477  {
8478  const size_t kbegin( ( IsUpper<MT4>::value )
8479  ?( ( IsLower<MT5>::value )
8480  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8481  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8482  :( IsLower<MT5>::value ? j : 0UL ) );
8483 
8484  IntrinsicType xmm1;
8485 
8486  for( size_t k=kbegin; k<K; ++k ) {
8487  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
8488  }
8489 
8490  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
8491  }
8492  }
8493 
// Element-wise handling of the columns [jpos,N) that do not fill a complete intrinsic
// vector; only executed if a remainder loop is required.
8494  for( ; remainder && j<N; ++j )
8495  {
8496  size_t i( 0UL );
8497 
8498  for( ; (i+2UL) <= M; i+=2UL )
8499  {
8500  const size_t kbegin( ( IsUpper<MT4>::value )
8501  ?( ( IsLower<MT5>::value )
8502  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8503  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8504  :( IsLower<MT5>::value ? j : 0UL ) );
8505  const size_t kend( ( IsLower<MT4>::value )
8506  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8507  :( K ) );
8508 
// Scalar accumulators for rows i and i+1, value-initialized.
8509  ElementType value1 = ElementType();
8510  ElementType value2 = ElementType();
8511 
8512  for( size_t k=kbegin; k<kend; ++k ) {
8513  value1 += A(i ,k) * B(k,j);
8514  value2 += A(i+1UL,k) * B(k,j);
8515  }
8516 
8517  (~C)(i ,j) += value1 * scalar;
8518  (~C)(i+1UL,j) += value2 * scalar;
8519  }
8520 
// Last row in case M is odd.
8521  if( i < M )
8522  {
8523  const size_t kbegin( ( IsUpper<MT4>::value )
8524  ?( ( IsLower<MT5>::value )
8525  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8526  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8527  :( IsLower<MT5>::value ? j : 0UL ) );
8528 
8529  ElementType value = ElementType();
8530 
8531  for( size_t k=kbegin; k<K; ++k ) {
8532  value += A(i,k) * B(k,j);
8533  }
8534 
8535  (~C)(i,j) += value * scalar;
8536  }
8537  }
8538  }
8539  //**********************************************************************************************
8540 
8541  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized small-matrix kernel for the addition assignment C += A * B * scalar,
// selected for a column-major target (DenseMatrix<MT3,true>) when
// UseVectorizedDefaultKernel applies.
//
// The rows of C are processed in panels of 8, 4, 2 and 1 intrinsic vectors
// (IT::size elements each), followed by an element-wise loop for leftover rows.
// For each panel the products are accumulated over k in intrinsic registers and
// only then multiplied by the scalar factor and added to C. The k-range
// [kbegin,kend) is narrowed at compile time according to the upper/lower structure
// of A (MT4) and B (MT5).
//
// C      : target matrix, updated in place
// A, B   : left-hand side and right-hand side operand of the product
// scalar : scaling factor applied to the accumulated product
8556  template< typename MT3 // Type of the left-hand side target matrix
8557  , typename MT4 // Type of the left-hand side matrix operand
8558  , typename MT5 // Type of the right-hand side matrix operand
8559  , typename ST2 > // Type of the scalar value
8560  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8561  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8562  {
8563  typedef IntrinsicTrait<ElementType> IT;
8564 
8565  const size_t M( A.rows() );
8566  const size_t N( B.columns() );
8567  const size_t K( A.columns() );
8568 
// A separate element-wise remainder loop is required unless both C and A are padded.
8569  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
8570 
// End of the row range handled with intrinsics: M rounded down to a multiple of
// IT::size if a remainder loop is needed (see the assertion below), M otherwise.
8571  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
8572  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
8573 
// NOTE(review): set() presumably broadcasts the scalar to all lanes - confirm.
8574  const IntrinsicType factor( set( scalar ) );
8575 
8576  size_t i( 0UL );
8577 
// Panels of 8 intrinsic vectors (IT::size*8 rows), one column of C at a time.
8578  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
8579  for( size_t j=0UL; j<N; ++j )
8580  {
// k-range for column j and the current row panel, narrowed by the structure of A and B.
8581  const size_t kbegin( ( IsLower<MT5>::value )
8582  ?( ( IsUpper<MT4>::value )
8583  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8584  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8585  :( IsUpper<MT4>::value ? i : 0UL ) );
8586  const size_t kend( ( IsUpper<MT5>::value )
8587  ?( ( IsLower<MT4>::value )
8588  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
8589  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
8590  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
8591 
// Accumulators; their initial value comes from IntrinsicType's default construction
// (presumably zero - TODO confirm).
8592  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8593 
8594  for( size_t k=kbegin; k<kend; ++k ) {
8595  const IntrinsicType b1( set( B(k,j) ) );
8596  xmm1 = xmm1 + A.load(i ,k) * b1;
8597  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
8598  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
8599  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
8600  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
8601  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
8602  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
8603  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
8604  }
8605 
// Scale the accumulated products and add them to the current content of C.
8606  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8607  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
8608  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
8609  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
8610  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) + xmm5 * factor );
8611  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) + xmm6 * factor );
8612  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) + xmm7 * factor );
8613  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) + xmm8 * factor );
8614  }
8615  }
8616 
// Panels of 4 intrinsic vectors, two columns of C at a time.
8617  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
8618  {
8619  size_t j( 0UL );
8620 
8621  for( ; (j+2UL) <= N; j+=2UL )
8622  {
// The k-range covers both columns j and j+1 (hence j+1UL/j+2UL in the upper bound).
8623  const size_t kbegin( ( IsLower<MT5>::value )
8624  ?( ( IsUpper<MT4>::value )
8625  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8626  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8627  :( IsUpper<MT4>::value ? i : 0UL ) );
8628  const size_t kend( ( IsUpper<MT5>::value )
8629  ?( ( IsLower<MT4>::value )
8630  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8631  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8632  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
8633 
// xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1.
8634  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8635 
8636  for( size_t k=kbegin; k<kend; ++k ) {
8637  const IntrinsicType a1( A.load(i ,k) );
8638  const IntrinsicType a2( A.load(i+IT::size ,k) );
8639  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
8640  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
8641  const IntrinsicType b1( set( B(k,j ) ) );
8642  const IntrinsicType b2( set( B(k,j+1UL) ) );
8643  xmm1 = xmm1 + a1 * b1;
8644  xmm2 = xmm2 + a2 * b1;
8645  xmm3 = xmm3 + a3 * b1;
8646  xmm4 = xmm4 + a4 * b1;
8647  xmm5 = xmm5 + a1 * b2;
8648  xmm6 = xmm6 + a2 * b2;
8649  xmm7 = xmm7 + a3 * b2;
8650  xmm8 = xmm8 + a4 * b2;
8651  }
8652 
8653  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8654  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) + xmm2 * factor );
8655  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) + xmm3 * factor );
8656  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) + xmm4 * factor );
8657  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
8658  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) + xmm6 * factor );
8659  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) + xmm7 * factor );
8660  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) + xmm8 * factor );
8661  }
8662 
// Last column in case N is odd.
8663  if( j < N )
8664  {
8665  const size_t kbegin( ( IsLower<MT5>::value )
8666  ?( ( IsUpper<MT4>::value )
8667  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8668  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8669  :( IsUpper<MT4>::value ? i : 0UL ) );
8670  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
8671 
8672  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8673 
8674  for( size_t k=kbegin; k<kend; ++k ) {
8675  const IntrinsicType b1( set( B(k,j) ) );
8676  xmm1 = xmm1 + A.load(i ,k) * b1;
8677  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
8678  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
8679  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
8680  }
8681 
8682  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8683  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
8684  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
8685  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
8686  }
8687  }
8688 
// Panels of 2 intrinsic vectors, two columns of C at a time.
8689  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
8690  {
8691  size_t j( 0UL );
8692 
8693  for( ; (j+2UL) <= N; j+=2UL )
8694  {
8695  const size_t kbegin( ( IsLower<MT5>::value )
8696  ?( ( IsUpper<MT4>::value )
8697  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8698  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8699  :( IsUpper<MT4>::value ? i : 0UL ) );
8700  const size_t kend( ( IsUpper<MT5>::value )
8701  ?( ( IsLower<MT4>::value )
8702  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8703  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8704  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
8705 
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
8706  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8707 
8708  for( size_t k=kbegin; k<kend; ++k ) {
8709  const IntrinsicType a1( A.load(i ,k) );
8710  const IntrinsicType a2( A.load(i+IT::size,k) );
8711  const IntrinsicType b1( set( B(k,j ) ) );
8712  const IntrinsicType b2( set( B(k,j+1UL) ) );
8713  xmm1 = xmm1 + a1 * b1;
8714  xmm2 = xmm2 + a2 * b1;
8715  xmm3 = xmm3 + a1 * b2;
8716  xmm4 = xmm4 + a2 * b2;
8717  }
8718 
8719  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8720  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) + xmm2 * factor );
8721  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
8722  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) + xmm4 * factor );
8723  }
8724 
// Last column in case N is odd.
8725  if( j < N )
8726  {
8727  const size_t kbegin( ( IsLower<MT5>::value )
8728  ?( ( IsUpper<MT4>::value )
8729  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8730  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8731  :( IsUpper<MT4>::value ? i : 0UL ) );
8732  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
8733 
8734  IntrinsicType xmm1, xmm2;
8735 
8736  for( size_t k=kbegin; k<kend; ++k ) {
8737  const IntrinsicType b1( set( B(k,j) ) );
8738  xmm1 = xmm1 + A.load(i ,k) * b1;
8739  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
8740  }
8741 
8742  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8743  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) + xmm2 * factor );
8744  }
8745  }
8746 
// Single intrinsic vectors, two columns of C at a time.
8747  for( ; i<ipos; i+=IT::size )
8748  {
8749  size_t j( 0UL );
8750 
8751  for( ; (j+2UL) <= N; j+=2UL )
8752  {
8753  const size_t kbegin( ( IsLower<MT5>::value )
8754  ?( ( IsUpper<MT4>::value )
8755  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8756  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8757  :( IsUpper<MT4>::value ? i : 0UL ) );
8758  const size_t kend( ( IsUpper<MT5>::value )
8759  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8760  :( K ) );
8761 
8762  IntrinsicType xmm1, xmm2;
8763 
8764  for( size_t k=kbegin; k<kend; ++k ) {
8765  const IntrinsicType a1( A.load(i,k) );
8766  xmm1 = xmm1 + a1 * set( B(k,j ) );
8767  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
8768  }
8769 
8770  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8771  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
8772  }
8773 
// Last column in case N is odd; the k-loop runs up to K here.
8774  if( j < N )
8775  {
8776  const size_t kbegin( ( IsLower<MT5>::value )
8777  ?( ( IsUpper<MT4>::value )
8778  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8779  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8780  :( IsUpper<MT4>::value ? i : 0UL ) );
8781 
8782  IntrinsicType xmm1;
8783 
8784  for( size_t k=kbegin; k<K; ++k ) {
8785  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
8786  }
8787 
8788  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
8789  }
8790  }
8791 
// Element-wise handling of the rows [ipos,M) that do not fill a complete intrinsic
// vector; only executed if a remainder loop is required.
8792  for( ; remainder && i<M; ++i )
8793  {
8794  size_t j( 0UL );
8795 
8796  for( ; (j+2UL) <= N; j+=2UL )
8797  {
8798  const size_t kbegin( ( IsLower<MT5>::value )
8799  ?( ( IsUpper<MT4>::value )
8800  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8801  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8802  :( IsUpper<MT4>::value ? i : 0UL ) );
8803  const size_t kend( ( IsUpper<MT5>::value )
8804  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8805  :( K ) );
8806 
// Scalar accumulators for columns j and j+1, value-initialized.
8807  ElementType value1 = ElementType();
8808  ElementType value2 = ElementType();
8809 
8810  for( size_t k=kbegin; k<kend; ++k ) {
8811  value1 += A(i,k) * B(k,j );
8812  value2 += A(i,k) * B(k,j+1UL);
8813  }
8814 
8815  (~C)(i,j ) += value1 * scalar;
8816  (~C)(i,j+1UL) += value2 * scalar;
8817  }
8818 
// Last column in case N is odd.
8819  if( j < N )
8820  {
8821  const size_t kbegin( ( IsLower<MT5>::value )
8822  ?( ( IsUpper<MT4>::value )
8823  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8824  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8825  :( IsUpper<MT4>::value ? i : 0UL ) );
8826 
8827  ElementType value = ElementType();
8828 
8829  for( size_t k=kbegin; k<K; ++k ) {
8830  value += A(i,k) * B(k,j);
8831  }
8832 
8833  (~C)(i,j) += value * scalar;
8834  }
8835  }
8836  }
8837  //**********************************************************************************************
8838 
8839  //**Default addition assignment to dense matrices (large matrices)******************************
8853  template< typename MT3 // Type of the left-hand side target matrix
8854  , typename MT4 // Type of the left-hand side matrix operand
8855  , typename MT5 // Type of the right-hand side matrix operand
8856  , typename ST2 > // Type of the scalar value
8857  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8858  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8859  {
8860  selectDefaultAddAssignKernel( C, A, B, scalar );
8861  }
8862  //**********************************************************************************************
8863 
8864  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
/*!\brief Vectorized default addition assignment kernel for large matrices and a row-major
//        target matrix (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix (DenseMatrix<MT3,false>).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only if UseVectorizedDefaultKernel is
// fulfilled for the given types (EnableIf). The iteration space is tiled: the columns of C in
// steps of DMATDMATMULT_DEFAULT_JBLOCK_SIZE, the rows of C in steps of
// DMATDMATMULT_DEFAULT_IBLOCK_SIZE and the inner dimension in steps of
// DMATDMATMULT_DEFAULT_KBLOCK_SIZE. Inside a tile the columns are handled in strips of four,
// two and one intrinsic vector(s) of IT::size elements, followed by an element-wise loop for
// the columns in [jpos,jend) if \a remainder is set. For every strip the products of the
// current k-block are accumulated in IntrinsicType variables, multiplied with the broadcast
// \a scalar and added to the values loaded from C.
//
// NOTE(review): the accumulators xmm1..xmm8 are declared without an initializer; the kernel
// relies on IntrinsicType's default constructor producing a zero vector -- confirm against the
// definition of IntrinsicType.
*/
8879  template< typename MT3 // Type of the left-hand side target matrix
8880  , typename MT4 // Type of the left-hand side matrix operand
8881  , typename MT5 // Type of the right-hand side matrix operand
8882  , typename ST2 > // Type of the scalar value
8883  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8884  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8885  {
8886  typedef IntrinsicTrait<ElementType> IT;
8887 
8888  const size_t M( A.rows() );
8889  const size_t N( B.columns() );
8890  const size_t K( A.columns() );
8891 
      // An element-wise tail loop is required unless both C and B are padded
8892  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
8893 
      // Scaling factor broadcast into an intrinsic vector
8894  const IntrinsicType factor( set( scalar ) );
8895 
      // Tiling over the columns of C
8896  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
8897  {
8898  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
8899 
      // End of the vectorized column range: jend rounded down to a multiple of IT::size
      // in case a remainder loop is used, jend itself otherwise
8900  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
8901  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
8902 
      // Tiling over the rows of C
8903  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
8904  {
8905  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
8906 
      // Tiling over the inner dimension; each k-block adds its partial sums to C
8907  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
8908  {
8909  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
8910 
8911  size_t j( jj );
8912 
      // Column strips of four intrinsic vectors (j, j1, j2, j3)
8913  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
8914  {
8915  const size_t j1( j+IT::size );
8916  const size_t j2( j+IT::size*2UL );
8917  const size_t j3( j+IT::size*3UL );
8918 
8919  size_t i( ii );
8920 
      // Two rows of C at a time
8921  for( ; (i+2UL) <= iend; i+=2UL )
8922  {
      // k-range of the current block: the lower bound is raised to i for an upper A and
      // to j for a lower B, the upper bound is cut at the end of the row group for a
      // lower A and at the end of the column strip for an upper B
8923  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8924  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8925  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8926  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
8927 
      // xmm1-xmm4: row i, xmm5-xmm8: row i+1 (columns j, j1, j2, j3)
8928  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8929 
8930  for( size_t k=kbegin; k<kend; ++k ) {
8931  const IntrinsicType a1( set( A(i ,k) ) );
8932  const IntrinsicType a2( set( A(i+1UL,k) ) );
8933  const IntrinsicType b1( B.load(k,j ) );
8934  const IntrinsicType b2( B.load(k,j1) );
8935  const IntrinsicType b3( B.load(k,j2) );
8936  const IntrinsicType b4( B.load(k,j3) );
8937  xmm1 = xmm1 + a1 * b1;
8938  xmm2 = xmm2 + a1 * b2;
8939  xmm3 = xmm3 + a1 * b3;
8940  xmm4 = xmm4 + a1 * b4;
8941  xmm5 = xmm5 + a2 * b1;
8942  xmm6 = xmm6 + a2 * b2;
8943  xmm7 = xmm7 + a2 * b3;
8944  xmm8 = xmm8 + a2 * b4;
8945  }
8946 
      // C += partial sum * scalar
8947  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8948  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8949  (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
8950  (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
8951  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8952  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
8953  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
8954  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
8955  }
8956 
      // Single leftover row of the tile
8957  if( i < iend )
8958  {
8959  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8960  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8961  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
8962  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
8963 
8964  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8965 
8966  for( size_t k=kbegin; k<kend; ++k ) {
8967  const IntrinsicType a1( set( A(i,k) ) );
8968  xmm1 = xmm1 + a1 * B.load(k,j );
8969  xmm2 = xmm2 + a1 * B.load(k,j1);
8970  xmm3 = xmm3 + a1 * B.load(k,j2);
8971  xmm4 = xmm4 + a1 * B.load(k,j3);
8972  }
8973 
8974  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8975  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
8976  (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
8977  (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
8978  }
8979  }
8980 
      // Column strips of two intrinsic vectors (j, j1)
8981  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
8982  {
8983  const size_t j1( j+IT::size );
8984 
8985  size_t i( ii );
8986 
      // Four rows of C at a time
8987  for( ; (i+4UL) <= iend; i+=4UL )
8988  {
8989  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8990  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8991  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
8992  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
8993 
      // Accumulator pairs (column j, column j1) for the rows i, i+1, i+2 and i+3
8994  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8995 
8996  for( size_t k=kbegin; k<kend; ++k ) {
8997  const IntrinsicType a1( set( A(i ,k) ) );
8998  const IntrinsicType a2( set( A(i+1UL,k) ) );
8999  const IntrinsicType a3( set( A(i+2UL,k) ) );
9000  const IntrinsicType a4( set( A(i+3UL,k) ) );
9001  const IntrinsicType b1( B.load(k,j ) );
9002  const IntrinsicType b2( B.load(k,j1) );
9003  xmm1 = xmm1 + a1 * b1;
9004  xmm2 = xmm2 + a1 * b2;
9005  xmm3 = xmm3 + a2 * b1;
9006  xmm4 = xmm4 + a2 * b2;
9007  xmm5 = xmm5 + a3 * b1;
9008  xmm6 = xmm6 + a3 * b2;
9009  xmm7 = xmm7 + a4 * b1;
9010  xmm8 = xmm8 + a4 * b2;
9011  }
9012 
9013  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9014  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
9015  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9016  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
9017  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
9018  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
9019  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
9020  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
9021  }
9022 
      // Two rows of C at a time
9023  for( ; (i+2UL) <= iend; i+=2UL )
9024  {
9025  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9026  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9027  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
9028  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
9029 
9030  IntrinsicType xmm1, xmm2, xmm3, xmm4;
9031 
9032  for( size_t k=kbegin; k<kend; ++k ) {
9033  const IntrinsicType a1( set( A(i ,k) ) );
9034  const IntrinsicType a2( set( A(i+1UL,k) ) );
9035  const IntrinsicType b1( B.load(k,j ) );
9036  const IntrinsicType b2( B.load(k,j1) );
9037  xmm1 = xmm1 + a1 * b1;
9038  xmm2 = xmm2 + a1 * b2;
9039  xmm3 = xmm3 + a2 * b1;
9040  xmm4 = xmm4 + a2 * b2;
9041  }
9042 
9043  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9044  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
9045  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9046  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
9047  }
9048 
      // Single leftover row of the tile
9049  if( i < iend )
9050  {
9051  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9052  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9053  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9054  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
9055 
9056  IntrinsicType xmm1, xmm2;
9057 
9058  for( size_t k=kbegin; k<kend; ++k ) {
9059  const IntrinsicType a1( set( A(i,k) ) );
9060  xmm1 = xmm1 + a1 * B.load(k,j );
9061  xmm2 = xmm2 + a1 * B.load(k,j1);
9062  }
9063 
9064  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9065  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
9066  }
9067  }
9068 
      // Column strips of a single intrinsic vector, one row at a time
9069  for( ; j<jpos; j+=IT::size )
9070  {
9071  for( size_t i=ii; i<iend; ++i )
9072  {
9073  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9074  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9075  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9076  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
9077 
9078  IntrinsicType xmm1;
9079 
9080  for( size_t k=kbegin; k<kend; ++k ) {
9081  const IntrinsicType a1( set( A(i,k) ) );
9082  xmm1 = xmm1 + a1 * B.load(k,j);
9083  }
9084 
9085  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
9086  }
9087  }
9088 
      // Element-wise handling of the columns in [jpos,jend); only entered if remainder is set
9089  for( ; remainder && j<jend; ++j )
9090  {
9091  for( size_t i=ii; i<iend; ++i )
9092  {
9093  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9094  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9095  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9096  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
9097 
9098  ElementType value = ElementType();
9099 
9100  for( size_t k=kbegin; k<kend; ++k ) {
9101  value += A(i,k) * B(k,j);
9102  }
9103 
9104  (~C)(i,j) += value * scalar;
9105  }
9106  }
9107  }
9108  }
9109  }
9110  }
9111  //**********************************************************************************************
9112 
9113  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
/*!\brief Vectorized default addition assignment kernel for large matrices and a column-major
//        target matrix (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix (DenseMatrix<MT3,true>).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only if UseVectorizedDefaultKernel is
// fulfilled for the given types (EnableIf). The iteration space is tiled: the rows of C in
// steps of TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, the columns of C in steps of
// TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE and the inner dimension in steps of
// TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE. Inside a tile the rows are handled in strips of four,
// two and one intrinsic vector(s) of IT::size elements, followed by an element-wise loop for
// the rows in [ipos,iend) if \a remainder is set. Here the vectors are loaded from A and the
// elements of B are broadcast; for every strip the products of the current k-block are
// accumulated in IntrinsicType variables, multiplied with the broadcast \a scalar and added
// to the values loaded from C.
//
// NOTE(review): the accumulators xmm1..xmm8 are declared without an initializer; the kernel
// relies on IntrinsicType's default constructor producing a zero vector -- confirm against the
// definition of IntrinsicType.
*/
9128  template< typename MT3 // Type of the left-hand side target matrix
9129  , typename MT4 // Type of the left-hand side matrix operand
9130  , typename MT5 // Type of the right-hand side matrix operand
9131  , typename ST2 > // Type of the scalar value
9132  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9133  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9134  {
9135  typedef IntrinsicTrait<ElementType> IT;
9136 
9137  const size_t M( A.rows() );
9138  const size_t N( B.columns() );
9139  const size_t K( A.columns() );
9140 
      // An element-wise tail loop is required unless both C and A are padded
9141  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
9142 
      // Scaling factor broadcast into an intrinsic vector
9143  const IntrinsicType factor( set( scalar ) );
9144 
      // Tiling over the rows of C
9145  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
9146  {
9147  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
9148 
      // End of the vectorized row range: iend rounded down to a multiple of IT::size
      // in case a remainder loop is used, iend itself otherwise
9149  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
9150  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
9151 
      // Tiling over the columns of C
9152  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
9153  {
9154  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
9155 
      // Tiling over the inner dimension; each k-block adds its partial sums to C
9156  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
9157  {
9158  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
9159 
9160  size_t i( ii );
9161 
      // Row strips of four intrinsic vectors (i, i1, i2, i3)
9162  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
9163  {
9164  const size_t i1( i+IT::size );
9165  const size_t i2( i+IT::size*2UL );
9166  const size_t i3( i+IT::size*3UL );
9167 
9168  size_t j( jj );
9169 
      // Two columns of C at a time
9170  for( ; (j+2UL) <= jend; j+=2UL )
9171  {
      // k-range of the current block: the lower bound is raised to i for an upper A and
      // to j for a lower B, the upper bound is cut at the end of the row strip for a
      // lower A and at the end of the column group for an upper B
9172  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9173  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9174  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
9175  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
9176 
      // xmm1-xmm4: column j, xmm5-xmm8: column j+1 (rows i, i1, i2, i3)
9177  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9178 
9179  for( size_t k=kbegin; k<kend; ++k ) {
9180  const IntrinsicType a1( A.load(i ,k) );
9181  const IntrinsicType a2( A.load(i1,k) );
9182  const IntrinsicType a3( A.load(i2,k) );
9183  const IntrinsicType a4( A.load(i3,k) );
9184  const IntrinsicType b1( set( B(k,j ) ) );
9185  const IntrinsicType b2( set( B(k,j+1UL) ) );
9186  xmm1 = xmm1 + a1 * b1;
9187  xmm2 = xmm2 + a2 * b1;
9188  xmm3 = xmm3 + a3 * b1;
9189  xmm4 = xmm4 + a4 * b1;
9190  xmm5 = xmm5 + a1 * b2;
9191  xmm6 = xmm6 + a2 * b2;
9192  xmm7 = xmm7 + a3 * b2;
9193  xmm8 = xmm8 + a4 * b2;
9194  }
9195 
      // C += partial sum * scalar
9196  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9197  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9198  (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
9199  (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
9200  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
9201  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
9202  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
9203  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
9204  }
9205 
      // Single leftover column of the tile
9206  if( j < jend )
9207  {
9208  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9209  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9210  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
9211  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9212 
9213  IntrinsicType xmm1, xmm2, xmm3, xmm4;
9214 
9215  for( size_t k=kbegin; k<kend; ++k ) {
9216  const IntrinsicType b1( set( B(k,j) ) );
9217  xmm1 = xmm1 + A.load(i ,k) * b1;
9218  xmm2 = xmm2 + A.load(i1,k) * b1;
9219  xmm3 = xmm3 + A.load(i2,k) * b1;
9220  xmm4 = xmm4 + A.load(i3,k) * b1;
9221  }
9222 
9223  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
9224  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
9225  (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
9226  (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
9227  }
9228  }
9229 
      // Row strips of two intrinsic vectors (i, i1)
9230  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
9231  {
9232  const size_t i1( i+IT::size );
9233 
9234  size_t j( jj );
9235 
      // Four columns of C at a time
9236  for( ; (j+4UL) <= jend; j+=4UL )
9237  {
9238  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9239  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9240  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
9241  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
9242 
      // Accumulator pairs (row i, row i1) for the columns j, j+1, j+2 and j+3
9243  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9244 
9245  for( size_t k=kbegin; k<kend; ++k ) {
9246  const IntrinsicType a1( A.load(i ,k) );
9247  const IntrinsicType a2( A.load(i1,k) );
9248  const IntrinsicType b1( set( B(k,j ) ) );
9249  const IntrinsicType b2( set( B(k,j+1UL) ) );
9250  const IntrinsicType b3( set( B(k,j+2UL) ) );
9251  const IntrinsicType b4( set( B(k,j+3UL) ) );
9252  xmm1 = xmm1 + a1 * b1;
9253  xmm2 = xmm2 + a2 * b1;
9254  xmm3 = xmm3 + a1 * b2;
9255  xmm4 = xmm4 + a2 * b2;
9256  xmm5 = xmm5 + a1 * b3;
9257  xmm6 = xmm6 + a2 * b3;
9258  xmm7 = xmm7 + a1 * b4;
9259  xmm8 = xmm8 + a2 * b4;
9260  }
9261 
9262  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9263  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9264  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9265  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
9266  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
9267  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
9268  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
9269  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
9270  }
9271 
      // Two columns of C at a time
9272  for( ; (j+2UL) <= jend; j+=2UL )
9273  {
9274  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9275  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9276  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
9277  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
9278 
9279  IntrinsicType xmm1, xmm2, xmm3, xmm4;
9280 
9281  for( size_t k=kbegin; k<kend; ++k ) {
9282  const IntrinsicType a1( A.load(i ,k) );
9283  const IntrinsicType a2( A.load(i1,k) );
9284  const IntrinsicType b1( set( B(k,j ) ) );
9285  const IntrinsicType b2( set( B(k,j+1UL) ) );
9286  xmm1 = xmm1 + a1 * b1;
9287  xmm2 = xmm2 + a2 * b1;
9288  xmm3 = xmm3 + a1 * b2;
9289  xmm4 = xmm4 + a2 * b2;
9290  }
9291 
9292  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9293  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9294  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9295  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
9296  }
9297 
      // Single leftover column of the tile
9298  if( j < jend )
9299  {
9300  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9301  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9302  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
9303  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9304 
9305  IntrinsicType xmm1, xmm2;
9306 
9307  for( size_t k=kbegin; k<kend; ++k ) {
9308  const IntrinsicType b1( set( B(k,j) ) );
9309  xmm1 = xmm1 + A.load(i ,k) * b1;
9310  xmm2 = xmm2 + A.load(i1,k) * b1;
9311  }
9312 
9313  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
9314  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
9315  }
9316  }
9317 
      // Row strips of a single intrinsic vector, one column at a time
9318  for( ; i<ipos; i+=IT::size )
9319  {
9320  for( size_t j=jj; j<jend; ++j )
9321  {
9322  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9323  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9324  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
9325  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9326 
9327  IntrinsicType xmm1;
9328 
9329  for( size_t k=kbegin; k<kend; ++k ) {
9330  const IntrinsicType b1( set( B(k,j) ) );
9331  xmm1 = xmm1 + A.load(i,k) * b1;
9332  }
9333 
9334  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
9335  }
9336  }
9337 
      // Element-wise handling of the rows in [ipos,iend); only entered if remainder is set
9338  for( ; remainder && i<iend; ++i )
9339  {
9340  for( size_t j=jj; j<jend; ++j )
9341  {
9342  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9343  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9344  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
9345  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9346 
9347  ElementType value = ElementType();
9348 
9349  for( size_t k=kbegin; k<kend; ++k ) {
9350  value += A(i,k) * B(k,j);
9351  }
9352 
9353  (~C)(i,j) += value * scalar;
9354  }
9355  }
9356  }
9357  }
9358  }
9359  }
9360  //**********************************************************************************************
9361 
9362  //**BLAS-based addition assignment to dense matrices (default)**********************************
/*!\brief Fallback for the BLAS-based addition assignment kernel (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only if UseBlasKernel is not fulfilled for
// the given types (DisableIf). It performs no BLAS call and relays the call unchanged to
// selectLargeAddAssignKernel().
*/
9376  template< typename MT3 // Type of the left-hand side target matrix
9377  , typename MT4 // Type of the left-hand side matrix operand
9378  , typename MT5 // Type of the right-hand side matrix operand
9379  , typename ST2 > // Type of the scalar value
9380  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
9381  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9382  {
9383  selectLargeAddAssignKernel( C, A, B, scalar );
9384  }
9385  //**********************************************************************************************
9386 
9387  //**BLAS-based addition assignment to dense matrices********************************************
9388 #if BLAZE_BLAS_MODE
9389 
/*!\brief BLAS-based addition assignment kernel (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only if UseBlasKernel is fulfilled for the
// given types (EnableIf) and is only compiled if BLAZE_BLAS_MODE is active. Three cases are
// distinguished:
//  - A is triangular: B is copied into a temporary of type MT3::ResultType, trmm() is called
//    on the temporary with A on the left-hand side (CblasLeft) and the temporary is added to C.
//  - B is triangular: A is copied into a temporary, trmm() is called with B on the right-hand
//    side (CblasRight) and the temporary is added to C.
//  - otherwise: gemm() is called directly on C.
//
// NOTE(review): trmm() is presumably an in-place scaled triangular multiplication of the
// temporary, and the last two gemm() arguments presumably are alpha (=scalar) and beta (=1),
// i.e. C = scalar*A*B + C -- confirm against blaze/math/blas/trmm.h and gemm.h.
*/
9402  template< typename MT3 // Type of the left-hand side target matrix
9403  , typename MT4 // Type of the left-hand side matrix operand
9404  , typename MT5 // Type of the right-hand side matrix operand
9405  , typename ST2 > // Type of the scalar value
9406  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
9407  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9408  {
9409  typedef typename MT3::ElementType ET;
9410 
      // Triangular left operand: the product is built in a copy of B and then added to C
9411  if( IsTriangular<MT4>::value ) {
9412  typename MT3::ResultType tmp( serial( B ) );
9413  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9414  addAssign( C, tmp );
9415  }
      // Triangular right operand: the product is built in a copy of A and then added to C
9416  else if( IsTriangular<MT5>::value ) {
9417  typename MT3::ResultType tmp( serial( A ) );
9418  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9419  addAssign( C, tmp );
9420  }
      // General operands: no temporary, gemm() works directly on C
9421  else {
9422  gemm( C, A, B, ET(scalar), ET(1) );
9423  }
9424  }
9425 #endif
9426  //**********************************************************************************************
9427 
9428  //**Addition assignment to sparse matrices******************************************************
9429  // No special implementation for the addition assignment to sparse matrices.
9430  //**********************************************************************************************
9431 
9432  //**Subtraction assignment to dense matrices****************************************************
/*!\brief Subtraction assignment of a scaled matrix multiplication to a dense matrix
//        (\f$ C-=s*A*B \f$).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
//
// The function extracts the two operands of the multiplication stored in \a rhs.matrix_,
// returns early if the target has no rows or no columns or if the inner dimension
// (left.columns()) is zero, evaluates both operands via serial() into objects of type LT and
// RT, and finally passes them together with \a rhs.scalar_ to selectSubAssignKernel().
*/
9444  template< typename MT // Type of the target dense matrix
9445  , bool SO > // Storage order of the target dense matrix
9446  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9447  {
9449 
9450  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9451  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9452 
9453  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
9454  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
9455 
      // Nothing to subtract for an empty target or an empty inner dimension
9456  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
9457  return;
9458  }
9459 
9460  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
9461  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
9462 
9463  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9464  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9465  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9466  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9467  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
9468  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
9469 
9470  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
9471  }
9472  //**********************************************************************************************
9473 
9474  //**Subtraction assignment to dense matrices (kernel selection)*********************************
/*!\brief Selection of the kernel for the subtraction assignment (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// The small-matrix kernel is used if both operands are diagonal matrices or if the number of
// elements of the target (rows times columns) is below TDMATDMATMULT_THRESHOLD. In all other
// cases the call is passed on to selectBlasSubAssignKernel().
*/
9485  template< typename MT3 // Type of the left-hand side target matrix
9486  , typename MT4 // Type of the left-hand side matrix operand
9487  , typename MT5 // Type of the right-hand side matrix operand
9488  , typename ST2 > // Type of the scalar value
9489  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9490  {
9491  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
9492  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9493  selectSmallSubAssignKernel( C, A, B, scalar );
9494  else
9495  selectBlasSubAssignKernel( C, A, B, scalar );
9496  }
9497  //**********************************************************************************************
9498 
9499  //**Default subtraction assignment to dense matrices********************************************
/*!\brief Default subtraction assignment kernel for two non-diagonal operands
//        (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only if neither MT4 nor MT5 is a diagonal
// matrix. The scaled product is first evaluated via serial() into a temporary of type
// ResultType, which is then subtracted from \a C by means of subAssign().
*/
9513  template< typename MT3 // Type of the left-hand side target matrix
9514  , typename MT4 // Type of the left-hand side matrix operand
9515  , typename MT5 // Type of the right-hand side matrix operand
9516  , typename ST2 > // Type of the scalar value
9517  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
9518  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9519  {
9520  const ResultType tmp( serial( A * B * scalar ) );
9521  subAssign( C, tmp );
9522  }
9523  //**********************************************************************************************
9524 
9525  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
/*!\brief Default subtraction assignment kernel for a general left and a diagonal right
//        operand and a row-major target matrix (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix (DenseMatrix<MT3,false>).
// \param A The left-hand side multiplication operand (not diagonal).
// \param B The right-hand side multiplication operand (diagonal).
// \param scalar The scaling factor.
//
// Since only the diagonal of \a B is used, every element of the target is updated by the single
// product A(i,j)*B(j,j)*scalar. The target is traversed in square blocks of BLOCK_SIZE rows
// and columns. For an upper or lower \a A the column range of each row is restricted
// accordingly; for the strict variants the diagonal element is excluded as well.
*/
9539  template< typename MT3 // Type of the left-hand side target matrix
9540  , typename MT4 // Type of the left-hand side matrix operand
9541  , typename MT5 // Type of the right-hand side matrix operand
9542  , typename ST2 > // Type of the scalar value
9543  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
9544  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9545  {
9546  const size_t M( A.rows() );
9547  const size_t N( B.columns() );
9548 
9549  const size_t block( BLOCK_SIZE );
9550 
9551  for( size_t ii=0UL; ii<M; ii+=block ) {
9552  const size_t iend( min( M, ii+block ) );
9553  for( size_t jj=0UL; jj<N; jj+=block ) {
9554  const size_t jend( min( N, jj+block ) );
9555  for( size_t i=ii; i<iend; ++i )
9556  {
      // First column of row i inside the current block (upper A: not left of the diagonal)
9557  const size_t jbegin( ( IsUpper<MT4>::value )
9558  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
9559  :( jj ) );
      // One past the last column of row i inside the block (lower A: not right of the diagonal)
9560  const size_t jpos( ( IsLower<MT4>::value )
9561  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
9562  :( jend ) );
9563 
9564  for( size_t j=jbegin; j<jpos; ++j ) {
9565  (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
9566  }
9567  }
9568  }
9569  }
9570  }
9571  //**********************************************************************************************
9572 
9573  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
/*!\brief Default subtraction assignment kernel for a general left and a diagonal right
//        operand and a column-major target matrix (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix (DenseMatrix<MT3,true>).
// \param A The left-hand side multiplication operand (not diagonal).
// \param B The right-hand side multiplication operand (diagonal).
// \param scalar The scaling factor.
//
// Since only the diagonal of \a B is used, every element of the target is updated by the single
// product A(i,j)*B(j,j)*scalar. The target is traversed column by column; within a column
// the rows are processed two at a time, followed by at most one leftover row. For a lower or
// upper \a A the row range of each column is restricted accordingly; for the strict variants
// the diagonal element is excluded as well.
*/
9587  template< typename MT3 // Type of the left-hand side target matrix
9588  , typename MT4 // Type of the left-hand side matrix operand
9589  , typename MT5 // Type of the right-hand side matrix operand
9590  , typename ST2 > // Type of the scalar value
9591  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
9592  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9593  {
9594  const size_t M( A.rows() );
9595  const size_t N( B.columns() );
9596 
9597  for( size_t j=0UL; j<N; ++j )
9598  {
9599  const size_t ibegin( ( IsLower<MT4>::value )
9600  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
9601  :( 0UL ) );
9602  const size_t iend( ( IsUpper<MT4>::value )
9603  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
9604  :( M ) );
9605  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
9606 
      // size_t(-2) clears the lowest bit: ipos marks the end of the largest even-sized
      // part of [ibegin,iend), which is handled by the loop unrolled by two
9607  const size_t inum( iend - ibegin );
9608  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
9609 
9610  for( size_t i=ibegin; i<ipos; i+=2UL ) {
9611  (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
9612  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
9613  }
      // Leftover row in case of an odd number of rows
9614  if( ipos < iend ) {
9615  (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
9616  }
9617  }
9618  }
9619  //**********************************************************************************************
9620 
9621  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
/*!\brief Default subtraction assignment kernel for a diagonal left and a general right
//        operand and a row-major target matrix (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix (DenseMatrix<MT3,false>).
// \param A The left-hand side multiplication operand (diagonal).
// \param B The right-hand side multiplication operand (not diagonal).
// \param scalar The scaling factor.
//
// Since only the diagonal of \a A is used, every element of the target is updated by the single
// product A(i,i)*B(i,j)*scalar. The target is traversed row by row; within a row the columns
// are processed two at a time, followed by at most one leftover column. For an upper or
// lower \a B the column range of each row is restricted accordingly; for the strict variants
// the diagonal element is excluded as well.
*/
9635  template< typename MT3 // Type of the left-hand side target matrix
9636  , typename MT4 // Type of the left-hand side matrix operand
9637  , typename MT5 // Type of the right-hand side matrix operand
9638  , typename ST2 > // Type of the scalar value
9639  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
9640  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9641  {
9642  const size_t M( A.rows() );
9643  const size_t N( B.columns() );
9644 
9645  for( size_t i=0UL; i<M; ++i )
9646  {
9647  const size_t jbegin( ( IsUpper<MT5>::value )
9648  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
9649  :( 0UL ) );
9650  const size_t jend( ( IsLower<MT5>::value )
9651  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
9652  :( N ) );
9653  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
9654 
      // size_t(-2) clears the lowest bit: jpos marks the end of the largest even-sized
      // part of [jbegin,jend), which is handled by the loop unrolled by two
9655  const size_t jnum( jend - jbegin );
9656  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
9657 
9658  for( size_t j=jbegin; j<jpos; j+=2UL ) {
9659  (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
9660  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
9661  }
      // Leftover column in case of an odd number of columns
9662  if( jpos < jend ) {
9663  (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
9664  }
9665  }
9666  }
9667  //**********************************************************************************************
9668 
9669  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
/*!\brief Default subtraction assignment kernel for a diagonal left and a general right
//        operand and a column-major target matrix (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix (DenseMatrix<MT3,true>).
// \param A The left-hand side multiplication operand (diagonal).
// \param B The right-hand side multiplication operand (not diagonal).
// \param scalar The scaling factor.
//
// Since only the diagonal of \a A is used, every element of the target is updated by the single
// product A(i,i)*B(i,j)*scalar. The target is traversed in square blocks of BLOCK_SIZE
// columns and rows. For a lower or upper \a B the row range of each column is restricted
// accordingly; for the strict variants the diagonal element is excluded as well.
*/
9683  template< typename MT3 // Type of the left-hand side target matrix
9684  , typename MT4 // Type of the left-hand side matrix operand
9685  , typename MT5 // Type of the right-hand side matrix operand
9686  , typename ST2 > // Type of the scalar value
9687  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
9688  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9689  {
9690  const size_t M( A.rows() );
9691  const size_t N( B.columns() );
9692 
9693  const size_t block( BLOCK_SIZE );
9694 
9695  for( size_t jj=0UL; jj<N; jj+=block ) {
9696  const size_t jend( min( N, jj+block ) );
9697  for( size_t ii=0UL; ii<M; ii+=block ) {
9698  const size_t iend( min( M, ii+block ) );
9699  for( size_t j=jj; j<jend; ++j )
9700  {
      // First row of column j inside the current block (lower B: not above the diagonal)
9701  const size_t ibegin( ( IsLower<MT5>::value )
9702  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
9703  :( ii ) );
      // One past the last row of column j inside the block (upper B: not below the diagonal)
9704  const size_t ipos( ( IsUpper<MT5>::value )
9705  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
9706  :( iend ) );
9707 
9708  for( size_t i=ibegin; i<ipos; ++i ) {
9709  (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
9710  }
9711  }
9712  }
9713  }
9714  }
9715  //**********************************************************************************************
9716 
9717  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
/*!\brief Default subtraction assignment kernel for two diagonal operands (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand (diagonal).
// \param B The right-hand side multiplication operand (diagonal).
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only if both MT4 and MT5 are diagonal
// matrices. Only the diagonal elements of the target are modified: for every row index i of
// \a A the product A(i,i)*B(i,i)*scalar is subtracted from C(i,i).
*/
9731  template< typename MT3 // Type of the left-hand side target matrix
9732  , typename MT4 // Type of the left-hand side matrix operand
9733  , typename MT5 // Type of the right-hand side matrix operand
9734  , typename ST2 > // Type of the scalar value
9735  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
9736  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9737  {
9738  for( size_t i=0UL; i<A.rows(); ++i ) {
9739  C(i,i) -= A(i,i) * B(i,i) * scalar;
9740  }
9741  }
9742  //**********************************************************************************************
9743 
9744  //**Default subtraction assignment to dense matrices (small matrices)***************************
/*!\brief Default subtraction assignment kernel for small matrices (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only if UseVectorizedDefaultKernel is not
// fulfilled for the given types (DisableIf). It contains no small-matrix specific code and
// relays the call unchanged to selectDefaultSubAssignKernel().
*/
9758  template< typename MT3 // Type of the left-hand side target matrix
9759  , typename MT4 // Type of the left-hand side matrix operand
9760  , typename MT5 // Type of the right-hand side matrix operand
9761  , typename ST2 > // Type of the scalar value
9762  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9763  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9764  {
9765  selectDefaultSubAssignKernel( C, A, B, scalar );
9766  }
9767  //**********************************************************************************************
9768 
9769  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Vectorized small-matrix kernel for a row-major target: performs C -= scalar * ( A * B ).
//   C      : row-major target matrix, updated in place through load()/store() and operator()
//   A      : left-hand side operand, read element-wise as A(i,k)
//   B      : right-hand side operand, read with B.load(k,j) in the vectorized loops
//   scalar : factor applied to every accumulated product before it is subtracted from C
// The columns of C are processed in panels of 8, 4, 2 and 1 intrinsic vectors (IT::size
// elements each); columns from jpos up to N are handled element-wise by the last loop.
// kbegin/kend restrict the k range according to the IsUpper/IsLower/IsStrictlyUpper/
// IsStrictlyLower traits of MT4 and MT5 (presumably to skip structurally zero elements of
// triangular operands -- confirm against the trait definitions).
9784  template< typename MT3 // Type of the left-hand side target matrix
9785  , typename MT4 // Type of the left-hand side matrix operand
9786  , typename MT5 // Type of the right-hand side matrix operand
9787  , typename ST2 > // Type of the scalar value
9788  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9789  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9790  {
9791  typedef IntrinsicTrait<ElementType> IT;
9792 
9793  const size_t M( A.rows() );
9794  const size_t N( B.columns() );
9795  const size_t K( A.columns() );
9796 
// The element-wise remainder loop at the end only runs if C or B is not padded.
9797  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
9798 
// End of the vectorized column range: N rounded down to a multiple of IT::size when a
// remainder loop is required, N itself otherwise (see the assertion below).
9799  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
9800  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
9801 
// The scalar packed into an IntrinsicType for use in the vectorized updates.
9802  const IntrinsicType factor( set( scalar ) );
9803 
9804  size_t j( 0UL );
9805 
// Panels of 8 intrinsic vectors per row of C, one row at a time.
9806  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
9807  for( size_t i=0UL; i<M; ++i )
9808  {
9809  const size_t kbegin( ( IsUpper<MT4>::value )
9810  ?( ( IsLower<MT5>::value )
9811  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9812  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9813  :( IsLower<MT5>::value ? j : 0UL ) );
9814  const size_t kend( ( IsLower<MT4>::value )
9815  ?( ( IsUpper<MT5>::value )
9816  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
9817  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
9818  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
9819 
// NOTE(review): the accumulators are used without explicit initialization; this relies on
// IntrinsicType's default constructor producing a zero vector -- confirm.
9820  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9821 
9822  for( size_t k=kbegin; k<kend; ++k ) {
9823  const IntrinsicType a1( set( A(i,k) ) );
9824  xmm1 = xmm1 + a1 * B.load(k,j );
9825  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
9826  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
9827  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
9828  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
9829  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
9830  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
9831  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
9832  }
9833 
9834  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9835  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
9836  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
9837  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
9838  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
9839  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
9840  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
9841  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
9842  }
9843  }
9844 
// Panels of 4 intrinsic vectors: two rows of C per iteration, then a single leftover row.
9845  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
9846  {
9847  size_t i( 0UL );
9848 
9849  for( ; (i+2UL) <= M; i+=2UL )
9850  {
9851  const size_t kbegin( ( IsUpper<MT4>::value )
9852  ?( ( IsLower<MT5>::value )
9853  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9854  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9855  :( IsLower<MT5>::value ? j : 0UL ) );
9856  const size_t kend( ( IsLower<MT4>::value )
9857  ?( ( IsUpper<MT5>::value )
9858  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
9859  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9860  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
9861 
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
9862  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9863 
9864  for( size_t k=kbegin; k<kend; ++k ) {
9865  const IntrinsicType a1( set( A(i ,k) ) );
9866  const IntrinsicType a2( set( A(i+1UL,k) ) );
9867  const IntrinsicType b1( B.load(k,j ) );
9868  const IntrinsicType b2( B.load(k,j+IT::size ) );
9869  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
9870  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
9871  xmm1 = xmm1 + a1 * b1;
9872  xmm2 = xmm2 + a1 * b2;
9873  xmm3 = xmm3 + a1 * b3;
9874  xmm4 = xmm4 + a1 * b4;
9875  xmm5 = xmm5 + a2 * b1;
9876  xmm6 = xmm6 + a2 * b2;
9877  xmm7 = xmm7 + a2 * b3;
9878  xmm8 = xmm8 + a2 * b4;
9879  }
9880 
9881  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9882  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
9883  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
9884  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
9885  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
9886  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
9887  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
9888  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
9889  }
9890 
// Leftover row when M is odd.
9891  if( i < M )
9892  {
9893  const size_t kbegin( ( IsUpper<MT4>::value )
9894  ?( ( IsLower<MT5>::value )
9895  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9896  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9897  :( IsLower<MT5>::value ? j : 0UL ) );
9898  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
9899 
9900  IntrinsicType xmm1, xmm2, xmm3, xmm4;
9901 
9902  for( size_t k=kbegin; k<kend; ++k ) {
9903  const IntrinsicType a1( set( A(i,k) ) );
9904  xmm1 = xmm1 + a1 * B.load(k,j );
9905  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
9906  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
9907  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
9908  }
9909 
9910  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9911  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
9912  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
9913  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
9914  }
9915  }
9916 
// Panels of 2 intrinsic vectors: two rows of C per iteration, then a single leftover row.
9917  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
9918  {
9919  size_t i( 0UL );
9920 
9921  for( ; (i+2UL) <= M; i+=2UL )
9922  {
9923  const size_t kbegin( ( IsUpper<MT4>::value )
9924  ?( ( IsLower<MT5>::value )
9925  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9926  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9927  :( IsLower<MT5>::value ? j : 0UL ) );
9928  const size_t kend( ( IsLower<MT4>::value )
9929  ?( ( IsUpper<MT5>::value )
9930  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
9931  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9932  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
9933 
9934  IntrinsicType xmm1, xmm2, xmm3, xmm4;
9935 
9936  for( size_t k=kbegin; k<kend; ++k ) {
9937  const IntrinsicType a1( set( A(i ,k) ) );
9938  const IntrinsicType a2( set( A(i+1UL,k) ) );
9939  const IntrinsicType b1( B.load(k,j ) );
9940  const IntrinsicType b2( B.load(k,j+IT::size) );
9941  xmm1 = xmm1 + a1 * b1;
9942  xmm2 = xmm2 + a1 * b2;
9943  xmm3 = xmm3 + a2 * b1;
9944  xmm4 = xmm4 + a2 * b2;
9945  }
9946 
9947  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9948  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
9949  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
9950  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
9951  }
9952 
9953  if( i < M )
9954  {
9955  const size_t kbegin( ( IsUpper<MT4>::value )
9956  ?( ( IsLower<MT5>::value )
9957  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9958  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9959  :( IsLower<MT5>::value ? j : 0UL ) );
9960  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
9961 
9962  IntrinsicType xmm1, xmm2;
9963 
9964  for( size_t k=kbegin; k<kend; ++k ) {
9965  const IntrinsicType a1( set( A(i,k) ) );
9966  xmm1 = xmm1 + a1 * B.load(k,j );
9967  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
9968  }
9969 
9970  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9971  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
9972  }
9973  }
9974 
// Single intrinsic vector per step: two rows of C per iteration, then a single leftover row.
9975  for( ; j<jpos; j+=IT::size )
9976  {
9977  size_t i( 0UL );
9978 
9979  for( ; (i+2UL) <= M; i+=2UL )
9980  {
9981  const size_t kbegin( ( IsUpper<MT4>::value )
9982  ?( ( IsLower<MT5>::value )
9983  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9984  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9985  :( IsLower<MT5>::value ? j : 0UL ) );
9986  const size_t kend( ( IsLower<MT4>::value )
9987  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
9988  :( K ) );
9989 
9990  IntrinsicType xmm1, xmm2;
9991 
9992  for( size_t k=kbegin; k<kend; ++k ) {
9993  const IntrinsicType b1( B.load(k,j) );
9994  xmm1 = xmm1 + set( A(i ,k) ) * b1;
9995  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
9996  }
9997 
9998  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9999  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
10000  }
10001 
10002  if( i < M )
10003  {
10004  const size_t kbegin( ( IsUpper<MT4>::value )
10005  ?( ( IsLower<MT5>::value )
10006  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10007  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10008  :( IsLower<MT5>::value ? j : 0UL ) );
10009 
10010  IntrinsicType xmm1;
10011 
10012  for( size_t k=kbegin; k<K; ++k ) {
10013  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
10014  }
10015 
10016  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10017  }
10018  }
10019 
// Element-wise handling of the columns from jpos up to N; skipped entirely when no remainder
// is required.
10020  for( ; remainder && j<N; ++j )
10021  {
10022  size_t i( 0UL );
10023 
10024  for( ; (i+2UL) <= M; i+=2UL )
10025  {
10026  const size_t kbegin( ( IsUpper<MT4>::value )
10027  ?( ( IsLower<MT5>::value )
10028  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10029  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10030  :( IsLower<MT5>::value ? j : 0UL ) );
10031  const size_t kend( ( IsLower<MT4>::value )
10032  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
10033  :( K ) );
10034 
10035  ElementType value1 = ElementType();
10036  ElementType value2 = ElementType();
10037 
10038  for( size_t k=kbegin; k<kend; ++k ) {
10039  value1 += A(i ,k) * B(k,j);
10040  value2 += A(i+1UL,k) * B(k,j);
10041  }
10042 
10043  (~C)(i ,j) -= value1 * scalar;
10044  (~C)(i+1UL,j) -= value2 * scalar;
10045  }
10046 
10047  if( i < M )
10048  {
10049  const size_t kbegin( ( IsUpper<MT4>::value )
10050  ?( ( IsLower<MT5>::value )
10051  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10052  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10053  :( IsLower<MT5>::value ? j : 0UL ) );
10054 
10055  ElementType value = ElementType();
10056 
10057  for( size_t k=kbegin; k<K; ++k ) {
10058  value += A(i,k) * B(k,j);
10059  }
10060 
10061  (~C)(i,j) -= value * scalar;
10062  }
10063  }
10064  }
10065  //**********************************************************************************************
10066 
10067  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Vectorized small-matrix kernel for a column-major target: performs C -= scalar * ( A * B ).
//   C      : column-major target matrix, updated in place through load()/store() and operator()
//   A      : left-hand side operand, read with A.load(i,k) in the vectorized loops
//   B      : right-hand side operand, read element-wise as B(k,j)
//   scalar : factor applied to every accumulated product before it is subtracted from C
// The rows of C are processed in panels of 8, 4, 2 and 1 intrinsic vectors (IT::size elements
// each); rows from ipos up to M are handled element-wise by the last loop.
// kbegin/kend restrict the k range according to the IsUpper/IsLower/IsStrictlyUpper/
// IsStrictlyLower traits of MT4 and MT5 (presumably to skip structurally zero elements of
// triangular operands -- confirm against the trait definitions).
10082  template< typename MT3 // Type of the left-hand side target matrix
10083  , typename MT4 // Type of the left-hand side matrix operand
10084  , typename MT5 // Type of the right-hand side matrix operand
10085  , typename ST2 > // Type of the scalar value
10086  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10087  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
10088  {
10089  typedef IntrinsicTrait<ElementType> IT;
10090 
10091  const size_t M( A.rows() );
10092  const size_t N( B.columns() );
10093  const size_t K( A.columns() );
10094 
// The element-wise remainder loop at the end only runs if C or A is not padded.
10095  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
10096 
// End of the vectorized row range: M rounded down to a multiple of IT::size when a remainder
// loop is required, M itself otherwise (see the assertion below).
10097  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
10098  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
10099 
// The scalar packed into an IntrinsicType for use in the vectorized updates.
10100  const IntrinsicType factor( set( scalar ) );
10101 
10102  size_t i( 0UL );
10103 
// Panels of 8 intrinsic vectors per column of C, one column at a time.
10104  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
10105  for( size_t j=0UL; j<N; ++j )
10106  {
10107  const size_t kbegin( ( IsLower<MT5>::value )
10108  ?( ( IsUpper<MT4>::value )
10109  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10110  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10111  :( IsUpper<MT4>::value ? i : 0UL ) );
10112  const size_t kend( ( IsUpper<MT5>::value )
10113  ?( ( IsLower<MT4>::value )
10114  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
10115  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
10116  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
10117 
// NOTE(review): the accumulators are used without explicit initialization; this relies on
// IntrinsicType's default constructor producing a zero vector -- confirm.
10118  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10119 
10120  for( size_t k=kbegin; k<kend; ++k ) {
10121  const IntrinsicType b1( set( B(k,j) ) );
10122  xmm1 = xmm1 + A.load(i ,k) * b1;
10123  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
10124  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
10125  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
10126  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
10127  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
10128  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
10129  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
10130  }
10131 
10132  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10133  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
10134  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
10135  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
10136  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) - xmm5 * factor );
10137  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) - xmm6 * factor );
10138  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) - xmm7 * factor );
10139  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) - xmm8 * factor );
10140  }
10141  }
10142 
// Panels of 4 intrinsic vectors: two columns of C per iteration, then a single leftover column.
10143  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
10144  {
10145  size_t j( 0UL );
10146 
10147  for( ; (j+2UL) <= N; j+=2UL )
10148  {
10149  const size_t kbegin( ( IsLower<MT5>::value )
10150  ?( ( IsUpper<MT4>::value )
10151  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10152  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10153  :( IsUpper<MT4>::value ? i : 0UL ) );
10154  const size_t kend( ( IsUpper<MT5>::value )
10155  ?( ( IsLower<MT4>::value )
10156  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10157  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10158  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
10159 
// xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
10160  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10161 
10162  for( size_t k=kbegin; k<kend; ++k ) {
10163  const IntrinsicType a1( A.load(i ,k) );
10164  const IntrinsicType a2( A.load(i+IT::size ,k) );
10165  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
10166  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
10167  const IntrinsicType b1( set( B(k,j ) ) );
10168  const IntrinsicType b2( set( B(k,j+1UL) ) );
10169  xmm1 = xmm1 + a1 * b1;
10170  xmm2 = xmm2 + a2 * b1;
10171  xmm3 = xmm3 + a3 * b1;
10172  xmm4 = xmm4 + a4 * b1;
10173  xmm5 = xmm5 + a1 * b2;
10174  xmm6 = xmm6 + a2 * b2;
10175  xmm7 = xmm7 + a3 * b2;
10176  xmm8 = xmm8 + a4 * b2;
10177  }
10178 
10179  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10180  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) - xmm2 * factor );
10181  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) - xmm3 * factor );
10182  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) - xmm4 * factor );
10183  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10184  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) - xmm6 * factor );
10185  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) - xmm7 * factor );
10186  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) - xmm8 * factor );
10187  }
10188 
// Leftover column when N is odd.
10189  if( j < N )
10190  {
10191  const size_t kbegin( ( IsLower<MT5>::value )
10192  ?( ( IsUpper<MT4>::value )
10193  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10194  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10195  :( IsUpper<MT4>::value ? i : 0UL ) );
10196  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
10197 
10198  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10199 
10200  for( size_t k=kbegin; k<kend; ++k ) {
10201  const IntrinsicType b1( set( B(k,j) ) );
10202  xmm1 = xmm1 + A.load(i ,k) * b1;
10203  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
10204  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
10205  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
10206  }
10207 
10208  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10209  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
10210  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
10211  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
10212  }
10213  }
10214 
// Panels of 2 intrinsic vectors: two columns of C per iteration, then a single leftover column.
10215  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
10216  {
10217  size_t j( 0UL );
10218 
10219  for( ; (j+2UL) <= N; j+=2UL )
10220  {
10221  const size_t kbegin( ( IsLower<MT5>::value )
10222  ?( ( IsUpper<MT4>::value )
10223  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10224  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10225  :( IsUpper<MT4>::value ? i : 0UL ) );
10226  const size_t kend( ( IsUpper<MT5>::value )
10227  ?( ( IsLower<MT4>::value )
10228  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10229  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10230  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
10231 
10232  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10233 
10234  for( size_t k=kbegin; k<kend; ++k ) {
10235  const IntrinsicType a1( A.load(i ,k) );
10236  const IntrinsicType a2( A.load(i+IT::size,k) );
10237  const IntrinsicType b1( set( B(k,j ) ) );
10238  const IntrinsicType b2( set( B(k,j+1UL) ) );
10239  xmm1 = xmm1 + a1 * b1;
10240  xmm2 = xmm2 + a2 * b1;
10241  xmm3 = xmm3 + a1 * b2;
10242  xmm4 = xmm4 + a2 * b2;
10243  }
10244 
10245  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10246  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) - xmm2 * factor );
10247  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10248  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) - xmm4 * factor );
10249  }
10250 
10251  if( j < N )
10252  {
10253  const size_t kbegin( ( IsLower<MT5>::value )
10254  ?( ( IsUpper<MT4>::value )
10255  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10256  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10257  :( IsUpper<MT4>::value ? i : 0UL ) );
10258  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
10259 
10260  IntrinsicType xmm1, xmm2;
10261 
10262  for( size_t k=kbegin; k<kend; ++k ) {
10263  const IntrinsicType b1( set( B(k,j) ) );
10264  xmm1 = xmm1 + A.load(i ,k) * b1;
10265  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
10266  }
10267 
10268  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10269  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) - xmm2 * factor );
10270  }
10271  }
10272 
// Single intrinsic vector per step: two columns of C per iteration, then a single leftover
// column.
10273  for( ; i<ipos; i+=IT::size )
10274  {
10275  size_t j( 0UL );
10276 
10277  for( ; (j+2UL) <= N; j+=2UL )
10278  {
10279  const size_t kbegin( ( IsLower<MT5>::value )
10280  ?( ( IsUpper<MT4>::value )
10281  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10282  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10283  :( IsUpper<MT4>::value ? i : 0UL ) );
10284  const size_t kend( ( IsUpper<MT5>::value )
10285  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10286  :( K ) );
10287 
10288  IntrinsicType xmm1, xmm2;
10289 
10290  for( size_t k=kbegin; k<kend; ++k ) {
10291  const IntrinsicType a1( A.load(i,k) );
10292  xmm1 = xmm1 + a1 * set( B(k,j ) );
10293  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
10294  }
10295 
10296  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10297  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
10298  }
10299 
10300  if( j < N )
10301  {
10302  const size_t kbegin( ( IsLower<MT5>::value )
10303  ?( ( IsUpper<MT4>::value )
10304  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10305  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10306  :( IsUpper<MT4>::value ? i : 0UL ) );
10307 
10308  IntrinsicType xmm1;
10309 
10310  for( size_t k=kbegin; k<K; ++k ) {
10311  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
10312  }
10313 
10314  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10315  }
10316  }
10317 
// Element-wise handling of the rows from ipos up to M; skipped entirely when no remainder is
// required.
10318  for( ; remainder && i<M; ++i )
10319  {
10320  size_t j( 0UL );
10321 
10322  for( ; (j+2UL) <= N; j+=2UL )
10323  {
10324  const size_t kbegin( ( IsLower<MT5>::value )
10325  ?( ( IsUpper<MT4>::value )
10326  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10327  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10328  :( IsUpper<MT4>::value ? i : 0UL ) );
10329  const size_t kend( ( IsUpper<MT5>::value )
10330  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10331  :( K ) );
10332 
10333  ElementType value1 = ElementType();
10334  ElementType value2 = ElementType();
10335 
10336  for( size_t k=kbegin; k<kend; ++k ) {
10337  value1 += A(i,k) * B(k,j );
10338  value2 += A(i,k) * B(k,j+1UL);
10339  }
10340 
10341  (~C)(i,j ) -= value1 * scalar;
10342  (~C)(i,j+1UL) -= value2 * scalar;
10343  }
10344 
10345  if( j < N )
10346  {
10347  const size_t kbegin( ( IsLower<MT5>::value )
10348  ?( ( IsUpper<MT4>::value )
10349  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10350  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10351  :( IsUpper<MT4>::value ? i : 0UL ) );
10352 
10353  ElementType value = ElementType();
10354 
10355  for( size_t k=kbegin; k<K; ++k ) {
10356  value += A(i,k) * B(k,j);
10357  }
10358 
10359  (~C)(i,j) -= value * scalar;
10360  }
10361  }
10362  }
10363  //**********************************************************************************************
10364 
10365  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Non-vectorized overload of the large-matrix subtraction assignment kernel. It takes part in
// overload resolution through DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> > and
// forwards C, A, B and scalar unchanged to selectDefaultSubAssignKernel().
10379  template< typename MT3 // Type of the left-hand side target matrix
10380  , typename MT4 // Type of the left-hand side matrix operand
10381  , typename MT5 // Type of the right-hand side matrix operand
10382  , typename ST2 > // Type of the scalar value
10383  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10384  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10385  {
10386  selectDefaultSubAssignKernel( C, A, B, scalar );
10387  }
10388  //**********************************************************************************************
10389 
10390  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
// Vectorized, blocked large-matrix kernel for a row-major target: performs
// C -= scalar * ( A * B ).
//   C      : row-major target matrix, updated in place through load()/store() and operator()
//   A      : left-hand side operand, read element-wise as A(i,k)
//   B      : right-hand side operand, read with B.load(k,j) in the vectorized loops
//   scalar : factor applied to every accumulated product before it is subtracted from C
// The iteration space is tiled by three nested block loops over jj (columns), ii (rows) and
// kk (inner dimension), using the DMATDMATMULT_DEFAULT_{J,I,K}BLOCK_SIZE constants. For each
// tile, the partial products over k in [kk,ktmp) are accumulated and subtracted from C, so
// every element of C is updated once per kk block.
// kbegin/kend additionally clamp the k range according to the IsUpper/IsLower traits of MT4
// and MT5 (presumably to skip structurally zero elements of triangular operands -- confirm
// against the trait definitions).
10405  template< typename MT3 // Type of the left-hand side target matrix
10406  , typename MT4 // Type of the left-hand side matrix operand
10407  , typename MT5 // Type of the right-hand side matrix operand
10408  , typename ST2 > // Type of the scalar value
10409  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10410  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
10411  {
10412  typedef IntrinsicTrait<ElementType> IT;
10413 
10414  const size_t M( A.rows() );
10415  const size_t N( B.columns() );
10416  const size_t K( A.columns() );
10417 
// The element-wise remainder loop of each tile only runs if C or B is not padded.
10418  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
10419 
// The scalar packed into an IntrinsicType for use in the vectorized updates.
10420  const IntrinsicType factor( set( scalar ) );
10421 
// Block loop over the columns of C.
10422  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
10423  {
10424  const size_t jend( min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
10425 
// End of the vectorized column range of this block: jend rounded down to a multiple of
// IT::size when a remainder loop is required, jend itself otherwise.
10426  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
10427  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
10428 
// Block loop over the rows of C.
10429  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
10430  {
10431  const size_t iend( min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
10432 
// Block loop over the inner dimension; ktmp is the end of the current k block.
10433  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
10434  {
10435  const size_t ktmp( min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
10436 
10437  size_t j( jj );
10438 
// Panels of 4 intrinsic vectors: two rows of C per iteration, then a single leftover row.
10439  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
10440  {
10441  const size_t j1( j+IT::size );
10442  const size_t j2( j+IT::size*2UL );
10443  const size_t j3( j+IT::size*3UL );
10444 
10445  size_t i( ii );
10446 
10447  for( ; (i+2UL) <= iend; i+=2UL )
10448  {
10449  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10450  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10451  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10452  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
10453 
// NOTE(review): the accumulators are used without explicit initialization; this relies on
// IntrinsicType's default constructor producing a zero vector -- confirm.
10454  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10455 
10456  for( size_t k=kbegin; k<kend; ++k ) {
10457  const IntrinsicType a1( set( A(i ,k) ) );
10458  const IntrinsicType a2( set( A(i+1UL,k) ) );
10459  const IntrinsicType b1( B.load(k,j ) );
10460  const IntrinsicType b2( B.load(k,j1) );
10461  const IntrinsicType b3( B.load(k,j2) );
10462  const IntrinsicType b4( B.load(k,j3) );
10463  xmm1 = xmm1 + a1 * b1;
10464  xmm2 = xmm2 + a1 * b2;
10465  xmm3 = xmm3 + a1 * b3;
10466  xmm4 = xmm4 + a1 * b4;
10467  xmm5 = xmm5 + a2 * b1;
10468  xmm6 = xmm6 + a2 * b2;
10469  xmm7 = xmm7 + a2 * b3;
10470  xmm8 = xmm8 + a2 * b4;
10471  }
10472 
10473  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10474  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10475  (~C).store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
10476  (~C).store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
10477  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
10478  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
10479  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
10480  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
10481  }
10482 
10483  if( i < iend )
10484  {
10485  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10486  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10487  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10488  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
10489 
10490  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10491 
10492  for( size_t k=kbegin; k<kend; ++k ) {
10493  const IntrinsicType a1( set( A(i,k) ) );
10494  xmm1 = xmm1 + a1 * B.load(k,j );
10495  xmm2 = xmm2 + a1 * B.load(k,j1);
10496  xmm3 = xmm3 + a1 * B.load(k,j2);
10497  xmm4 = xmm4 + a1 * B.load(k,j3);
10498  }
10499 
10500  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10501  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10502  (~C).store( i, j2, (~C).load(i,j2) - xmm3 * factor );
10503  (~C).store( i, j3, (~C).load(i,j3) - xmm4 * factor );
10504  }
10505  }
10506 
// Panels of 2 intrinsic vectors: rows of C are handled four at a time, then two at a time,
// then a single leftover row.
10507  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
10508  {
10509  const size_t j1( j+IT::size );
10510 
10511  size_t i( ii );
10512 
10513  for( ; (i+4UL) <= iend; i+=4UL )
10514  {
10515  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10516  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10517  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
10518  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
10519 
10520  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10521 
10522  for( size_t k=kbegin; k<kend; ++k ) {
10523  const IntrinsicType a1( set( A(i ,k) ) );
10524  const IntrinsicType a2( set( A(i+1UL,k) ) );
10525  const IntrinsicType a3( set( A(i+2UL,k) ) );
10526  const IntrinsicType a4( set( A(i+3UL,k) ) );
10527  const IntrinsicType b1( B.load(k,j ) );
10528  const IntrinsicType b2( B.load(k,j1) );
10529  xmm1 = xmm1 + a1 * b1;
10530  xmm2 = xmm2 + a1 * b2;
10531  xmm3 = xmm3 + a2 * b1;
10532  xmm4 = xmm4 + a2 * b2;
10533  xmm5 = xmm5 + a3 * b1;
10534  xmm6 = xmm6 + a3 * b2;
10535  xmm7 = xmm7 + a4 * b1;
10536  xmm8 = xmm8 + a4 * b2;
10537  }
10538 
10539  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10540  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10541  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10542  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10543  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
10544  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
10545  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
10546  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
10547  }
10548 
10549  for( ; (i+2UL) <= iend; i+=2UL )
10550  {
10551  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10552  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10553  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10554  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
10555 
10556  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10557 
10558  for( size_t k=kbegin; k<kend; ++k ) {
10559  const IntrinsicType a1( set( A(i ,k) ) );
10560  const IntrinsicType a2( set( A(i+1UL,k) ) );
10561  const IntrinsicType b1( B.load(k,j ) );
10562  const IntrinsicType b2( B.load(k,j1) );
10563  xmm1 = xmm1 + a1 * b1;
10564  xmm2 = xmm2 + a1 * b2;
10565  xmm3 = xmm3 + a2 * b1;
10566  xmm4 = xmm4 + a2 * b2;
10567  }
10568 
10569  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10570  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10571  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10572  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10573  }
10574 
10575  if( i < iend )
10576  {
10577  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10578  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10579  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10580  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
10581 
10582  IntrinsicType xmm1, xmm2;
10583 
10584  for( size_t k=kbegin; k<kend; ++k ) {
10585  const IntrinsicType a1( set( A(i,k) ) );
10586  xmm1 = xmm1 + a1 * B.load(k,j );
10587  xmm2 = xmm2 + a1 * B.load(k,j1);
10588  }
10589 
10590  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10591  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10592  }
10593  }
10594 
// Single intrinsic vector per step, one row of C at a time.
10595  for( ; j<jpos; j+=IT::size )
10596  {
10597  for( size_t i=ii; i<iend; ++i )
10598  {
10599  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10600  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10601  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10602  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
10603 
10604  IntrinsicType xmm1;
10605 
10606  for( size_t k=kbegin; k<kend; ++k ) {
10607  const IntrinsicType a1( set( A(i,k) ) );
10608  xmm1 = xmm1 + a1 * B.load(k,j);
10609  }
10610 
10611  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10612  }
10613  }
10614 
// Element-wise handling of the columns from jpos up to jend; skipped entirely when no
// remainder is required.
10615  for( ; remainder && j<jend; ++j )
10616  {
10617  for( size_t i=ii; i<iend; ++i )
10618  {
10619  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10620  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10621  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10622  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
10623 
10624  ElementType value = ElementType();
10625 
10626  for( size_t k=kbegin; k<kend; ++k ) {
10627  value += A(i,k) * B(k,j);
10628  }
10629 
10630  (~C)(i,j) -= value * scalar;
10631  }
10632  }
10633  }
10634  }
10635  }
10636  }
10637  //**********************************************************************************************
10638 
10639  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
// Vectorized default kernel for the scaled subtraction assignment C -= ( A * B ) * scalar to a
// column-major dense target (large matrices). Selected via EnableIf when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// The iteration space is tiled over rows (ii), columns (jj) and the inner dimension (kk) with the
// TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE constants. Within a tile, rows are processed in groups
// of 4, 2 and 1 SIMD vectors (IT::size rows each); any rows left over are handled element-wise.
10654  template< typename MT3 // Type of the left-hand side target matrix
10655  , typename MT4 // Type of the left-hand side matrix operand
10656  , typename MT5 // Type of the right-hand side matrix operand
10657  , typename ST2 > // Type of the scalar value
10658  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10659  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
10660  {
10661  typedef IntrinsicTrait<ElementType> IT;
10662 
10663  const size_t M( A.rows() );
10664  const size_t N( B.columns() );
10665  const size_t K( A.columns() );
10666 
// A scalar tail loop over rows is only needed if C or A is not padded.
10667  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
10668 
// The scalar broadcast into a SIMD vector; applied once per accumulator at write-back.
10669  const IntrinsicType factor( set( scalar ) );
10670 
10671  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
10672  {
10673  const size_t iend( min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
10674 
// ipos is the end of the vectorized row range: iend rounded down to a multiple of IT::size when
// a remainder loop is required (the assertion below checks exactly this), otherwise iend itself.
10675  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
10676  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
10677 
10678  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
10679  {
10680  const size_t jend( min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
10681 
10682  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
10683  {
10684  const size_t ktmp( min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
10685 
10686  size_t i( ii );
10687 
// Row groups of four SIMD vectors, two columns of C per step.
10688  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
10689  {
10690  const size_t i1( i+IT::size );
10691  const size_t i2( i+IT::size*2UL );
10692  const size_t i3( i+IT::size*3UL );
10693 
10694  size_t j( jj );
10695 
10696  for( ; (j+2UL) <= jend; j+=2UL )
10697  {
// [kbegin,kend) is the current k-block [kk,ktmp), further narrowed according to the
// IsUpper/IsLower traits of A and B for the rows/columns covered by this step. The same pattern
// (with the step's own row/column extent) is used in every loop below.
10698  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10699  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10700  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
10701  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10702 
// NOTE(review): the accumulators are only default-constructed before being summed into; this
// presumably relies on IntrinsicType's default constructor producing zero - confirm.
10703  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10704 
10705  for( size_t k=kbegin; k<kend; ++k ) {
10706  const IntrinsicType a1( A.load(i ,k) );
10707  const IntrinsicType a2( A.load(i1,k) );
10708  const IntrinsicType a3( A.load(i2,k) );
10709  const IntrinsicType a4( A.load(i3,k) );
10710  const IntrinsicType b1( set( B(k,j ) ) );
10711  const IntrinsicType b2( set( B(k,j+1UL) ) );
10712  xmm1 = xmm1 + a1 * b1;
10713  xmm2 = xmm2 + a2 * b1;
10714  xmm3 = xmm3 + a3 * b1;
10715  xmm4 = xmm4 + a4 * b1;
10716  xmm5 = xmm5 + a1 * b2;
10717  xmm6 = xmm6 + a2 * b2;
10718  xmm7 = xmm7 + a3 * b2;
10719  xmm8 = xmm8 + a4 * b2;
10720  }
10721 
// Write-back: subtract the scaled partial products of this k-block from C.
10722  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10723  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10724  (~C).store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
10725  (~C).store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
10726  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10727  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
10728  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
10729  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
10730  }
10731 
// At most one column of the tile is left after the two-column loop.
10732  if( j < jend )
10733  {
10734  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10735  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10736  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
10737  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10738 
10739  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10740 
10741  for( size_t k=kbegin; k<kend; ++k ) {
10742  const IntrinsicType b1( set( B(k,j) ) );
10743  xmm1 = xmm1 + A.load(i ,k) * b1;
10744  xmm2 = xmm2 + A.load(i1,k) * b1;
10745  xmm3 = xmm3 + A.load(i2,k) * b1;
10746  xmm4 = xmm4 + A.load(i3,k) * b1;
10747  }
10748 
10749  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10750  (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
10751  (~C).store( i2, j, (~C).load(i2,j) - xmm3 * factor );
10752  (~C).store( i3, j, (~C).load(i3,j) - xmm4 * factor );
10753  }
10754  }
10755 
// Row groups of two SIMD vectors; columns are processed four, then two, then one at a time.
10756  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
10757  {
10758  const size_t i1( i+IT::size );
10759 
10760  size_t j( jj );
10761 
10762  for( ; (j+4UL) <= jend; j+=4UL )
10763  {
10764  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10765  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10766  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
10767  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
10768 
10769  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10770 
10771  for( size_t k=kbegin; k<kend; ++k ) {
10772  const IntrinsicType a1( A.load(i ,k) );
10773  const IntrinsicType a2( A.load(i1,k) );
10774  const IntrinsicType b1( set( B(k,j ) ) );
10775  const IntrinsicType b2( set( B(k,j+1UL) ) );
10776  const IntrinsicType b3( set( B(k,j+2UL) ) );
10777  const IntrinsicType b4( set( B(k,j+3UL) ) );
10778  xmm1 = xmm1 + a1 * b1;
10779  xmm2 = xmm2 + a2 * b1;
10780  xmm3 = xmm3 + a1 * b2;
10781  xmm4 = xmm4 + a2 * b2;
10782  xmm5 = xmm5 + a1 * b3;
10783  xmm6 = xmm6 + a2 * b3;
10784  xmm7 = xmm7 + a1 * b4;
10785  xmm8 = xmm8 + a2 * b4;
10786  }
10787 
10788  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10789  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10790  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10791  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
10792  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
10793  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
10794  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
10795  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
10796  }
10797 
10798  for( ; (j+2UL) <= jend; j+=2UL )
10799  {
10800  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10801  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10802  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
10803  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10804 
10805  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10806 
10807  for( size_t k=kbegin; k<kend; ++k ) {
10808  const IntrinsicType a1( A.load(i ,k) );
10809  const IntrinsicType a2( A.load(i1,k) );
10810  const IntrinsicType b1( set( B(k,j ) ) );
10811  const IntrinsicType b2( set( B(k,j+1UL) ) );
10812  xmm1 = xmm1 + a1 * b1;
10813  xmm2 = xmm2 + a2 * b1;
10814  xmm3 = xmm3 + a1 * b2;
10815  xmm4 = xmm4 + a2 * b2;
10816  }
10817 
10818  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10819  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10820  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10821  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
10822  }
10823 
10824  if( j < jend )
10825  {
10826  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10827  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10828  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
10829  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10830 
10831  IntrinsicType xmm1, xmm2;
10832 
10833  for( size_t k=kbegin; k<kend; ++k ) {
10834  const IntrinsicType b1( set( B(k,j) ) );
10835  xmm1 = xmm1 + A.load(i ,k) * b1;
10836  xmm2 = xmm2 + A.load(i1,k) * b1;
10837  }
10838 
10839  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10840  (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
10841  }
10842  }
10843 
// Remaining full SIMD vectors of rows, one vector and one column per step.
10844  for( ; i<ipos; i+=IT::size )
10845  {
10846  for( size_t j=jj; j<jend; ++j )
10847  {
10848  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10849  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10850  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
10851  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10852 
10853  IntrinsicType xmm1;
10854 
10855  for( size_t k=kbegin; k<kend; ++k ) {
10856  const IntrinsicType b1( set( B(k,j) ) );
10857  xmm1 = xmm1 + A.load(i,k) * b1;
10858  }
10859 
10860  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10861  }
10862  }
10863 
// Element-wise tail for the rows in [ipos,iend); only entered when 'remainder' is set.
10864  for( ; remainder && i<iend; ++i )
10865  {
10866  for( size_t j=jj; j<jend; ++j )
10867  {
10868  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10869  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10870  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
10871  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10872 
10873  ElementType value = ElementType();
10874 
10875  for( size_t k=kbegin; k<kend; ++k ) {
10876  value += A(i,k) * B(k,j);
10877  }
10878 
10879  (~C)(i,j) -= value * scalar;
10880  }
10881  }
10882  }
10883  }
10884  }
10885  }
10886  //**********************************************************************************************
10887 
10888  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback overload of the BLAS kernel selection for the scaled subtraction assignment.
// It is chosen (via DisableIf) whenever UseBlasKernel<MT3,MT4,MT5,ST2> does not hold and simply
// forwards all arguments to selectLargeSubAssignKernel().
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
10902  template< typename MT3 // Type of the left-hand side target matrix
10903  , typename MT4 // Type of the left-hand side matrix operand
10904  , typename MT5 // Type of the right-hand side matrix operand
10905  , typename ST2 > // Type of the scalar value
10906  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
10907  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10908  {
10909  selectLargeSubAssignKernel( C, A, B, scalar );
10910  }
10911  //**********************************************************************************************
10912 
10913  //**BLAS-based subtraction assignment to dense matrices*****************************************
10914 #if BLAZE_BLAS_MODE
10915 
// BLAS-based kernel for the scaled subtraction assignment; enabled (via EnableIf) when
// UseBlasKernel<MT3,MT4,MT5,ST2> holds.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// Three cases are distinguished at compile time:
//  - A is triangular: B is copied into a temporary, trmm() is applied to it with A on the left
//    side, and the temporary is subtracted from C.
//  - B is triangular: A is copied into a temporary, trmm() is applied to it with B on the right
//    side, and the temporary is subtracted from C.
//  - otherwise: a single gemm() call with the factors -scalar and 1 updates C directly
//    (presumably C = -scalar*A*B + 1*C, following the usual BLAS gemm convention - confirm).
10928  template< typename MT3 // Type of the left-hand side target matrix
10929  , typename MT4 // Type of the left-hand side matrix operand
10930  , typename MT5 // Type of the right-hand side matrix operand
10931  , typename ST2 > // Type of the scalar value
10932  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
10933  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10934  {
10935  typedef typename MT3::ElementType ET;
10936 
10937  if( IsTriangular<MT4>::value ) {
// The temporary is evaluated serially and has the result type of the target matrix.
10938  typename MT3::ResultType tmp( serial( B ) );
10939  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10940  subAssign( C, tmp );
10941  }
10942  else if( IsTriangular<MT5>::value ) {
10943  typename MT3::ResultType tmp( serial( A ) );
10944  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10945  subAssign( C, tmp );
10946  }
10947  else {
// The sign of the scalar is flipped so that the product is subtracted rather than added.
10948  gemm( C, A, B, ET(-scalar), ET(1) );
10949  }
10950  }
10951 #endif
10952  //**********************************************************************************************
10953 
10954  //**Subtraction assignment to sparse matrices***************************************************
10955  // No special implementation for the subtraction assignment to sparse matrices.
10956  //**********************************************************************************************
10957 
10958  //**Multiplication assignment to dense matrices*************************************************
10959  // No special implementation for the multiplication assignment to dense matrices.
10960  //**********************************************************************************************
10961 
10962  //**Multiplication assignment to sparse matrices************************************************
10963  // No special implementation for the multiplication assignment to sparse matrices.
10964  //**********************************************************************************************
10965 
10966  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a scaled matrix product to a dense matrix; enabled (via EnableIf) when
// IsEvaluationRequired<MT,MT1,MT2> holds.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// The two operands of the wrapped product (rhs.matrix_) are evaluated into LT/RT objects and the
// assignment is re-dispatched with the expression A * B * rhs.scalar_.
10981  template< typename MT // Type of the target dense matrix
10982  , bool SO > // Storage order of the target dense matrix
10983  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
10984  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
10985  {
10987 
10988  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
10989  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
10990 
10991  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
10992  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
10993 
// Nothing to do for an empty target; with an empty inner dimension the target is only reset.
10994  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
10995  return;
10996  }
10997  else if( left.columns() == 0UL ) {
10998  reset( ~lhs );
10999  return;
11000  }
11001 
11002  LT A( left ); // Evaluation of the left-hand side dense matrix operand
11003  RT B( right ); // Evaluation of the right-hand side dense matrix operand
11004 
11005  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11006  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11007  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11008  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11009  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11010  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11011 
11012  smpAssign( ~lhs, A * B * rhs.scalar_ );
11013  }
11014  //**********************************************************************************************
11015 
11016  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a scaled matrix product to a sparse matrix; enabled (via EnableIf) when
// IsEvaluationRequired<MT,MT1,MT2> holds.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// The expression is first evaluated into a temporary whose type (ResultType or OppositeType) is
// selected by the storage order SO of the target; the temporary is then SMP-assigned to lhs.
11031  template< typename MT // Type of the target sparse matrix
11032  , bool SO > // Storage order of the target sparse matrix
11033  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11034  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11035  {
11037 
11038  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
11039 
11046 
11047  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11048  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11049 
11050  const TmpType tmp( rhs );
11051  smpAssign( ~lhs, tmp );
11052  }
11053  //**********************************************************************************************
11054 
11055  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a scaled matrix product to a dense matrix; enabled (via EnableIf)
// when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// The two operands of the wrapped product (rhs.matrix_) are evaluated into LT/RT objects and the
// addition assignment is re-dispatched with the expression A * B * rhs.scalar_.
11070  template< typename MT // Type of the target dense matrix
11071  , bool SO > // Storage order of the target dense matrix
11072  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11073  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11074  {
11076 
11077  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11078  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11079 
11080  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11081  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11082 
// An empty target or an empty inner dimension leaves lhs untouched.
11083  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11084  return;
11085  }
11086 
11087  LT A( left ); // Evaluation of the left-hand side dense matrix operand
11088  RT B( right ); // Evaluation of the right-hand side dense matrix operand
11089 
11090  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11091  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11092  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11093  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11094  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11095  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11096 
11097  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
11098  }
11099  //**********************************************************************************************
11100 
11101  //**SMP addition assignment to sparse matrices**************************************************
11102  // No special implementation for the SMP addition assignment to sparse matrices.
11103  //**********************************************************************************************
11104 
11105  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a scaled matrix product to a dense matrix; enabled (via EnableIf)
// when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
//
// The two operands of the wrapped product (rhs.matrix_) are evaluated into LT/RT objects and the
// subtraction assignment is re-dispatched with the expression A * B * rhs.scalar_.
11120  template< typename MT // Type of the target dense matrix
11121  , bool SO > // Storage order of the target dense matrix
11122  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11123  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11124  {
11126 
11127  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11128  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11129 
11130  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11131  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11132 
// An empty target or an empty inner dimension leaves lhs untouched.
11133  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11134  return;
11135  }
11136 
11137  LT A( left ); // Evaluation of the left-hand side dense matrix operand
11138  RT B( right ); // Evaluation of the right-hand side dense matrix operand
11139 
11140  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11141  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11142  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11143  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11144  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11145  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11146 
11147  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
11148  }
11149  //**********************************************************************************************
11150 
11151  //**SMP subtraction assignment to sparse matrices***********************************************
11152  // No special implementation for the SMP subtraction assignment to sparse matrices.
11153  //**********************************************************************************************
11154 
11155  //**SMP multiplication assignment to dense matrices*********************************************
11156  // No special implementation for the SMP multiplication assignment to dense matrices.
11157  //**********************************************************************************************
11158 
11159  //**SMP multiplication assignment to sparse matrices********************************************
11160  // No special implementation for the SMP multiplication assignment to sparse matrices.
11161  //**********************************************************************************************
11162 
11163  //**Compile time checks*************************************************************************
11171  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
11172  //**********************************************************************************************
11173 };
11175 //*************************************************************************************************
11176 
11177 
11178 
11179 
11180 //=================================================================================================
11181 //
11182 // GLOBAL BINARY ARITHMETIC OPERATORS
11183 //
11184 //=================================================================================================
11185 
11186 //*************************************************************************************************
// Multiplication operator for the product of a column-major dense matrix (lhs) and a row-major
// dense matrix (rhs).
//
// \param lhs The left-hand side matrix of the multiplication.
// \param rhs The right-hand side matrix of the multiplication.
// \return The TDMatDMatMultExpr expression object representing the product.
// \exception std::invalid_argument Matrix sizes do not match.
//
// No arithmetic is performed here: the operator only checks that the number of columns of lhs
// equals the number of rows of rhs and wraps both operands into the expression object.
// The function declarator (listing line 11218) was missing from this listing and is restored
// below; without it 'lhs' and 'rhs' are undeclared and the definition is ill-formed.
11215 template< typename T1 // Type of the left-hand side dense matrix
11216  , typename T2 > // Type of the right-hand side dense matrix
11217 inline const TDMatDMatMultExpr<T1,T2>
11218  operator*( const DenseMatrix<T1,true>& lhs, const DenseMatrix<T2,false>& rhs )
11219 {
11221 
11222  if( (~lhs).columns() != (~rhs).rows() ) {
11223  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
11224  }
11225 
11226  return TDMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
11227 }
11228 //*************************************************************************************************
11229 
11230 
11231 
11232 
11233 //=================================================================================================
11234 //
11235 // ROWS SPECIALIZATIONS
11236 //
11237 //=================================================================================================
11238 
11239 //*************************************************************************************************
// The compile-time number of rows of the product expression is taken from the left-hand side
// operand type MT1.
11241 template< typename MT1, typename MT2 >
11242 struct Rows< TDMatDMatMultExpr<MT1,MT2> > : public Rows<MT1>
11243 {};
11245 //*************************************************************************************************
11246 
11247 
11248 
11249 
11250 //=================================================================================================
11251 //
11252 // COLUMNS SPECIALIZATIONS
11253 //
11254 //=================================================================================================
11255 
11256 //*************************************************************************************************
// The compile-time number of columns of the product expression is taken from the right-hand side
// operand type MT2.
11258 template< typename MT1, typename MT2 >
11259 struct Columns< TDMatDMatMultExpr<MT1,MT2> > : public Columns<MT2>
11260 {};
11262 //*************************************************************************************************
11263 
11264 
11265 
11266 
11267 //=================================================================================================
11268 //
11269 // ISALIGNED SPECIALIZATIONS
11270 //
11271 //=================================================================================================
11272 
11273 //*************************************************************************************************
// The product expression is considered aligned only if both operand types are aligned.
11275 template< typename MT1, typename MT2 >
11276 struct IsAligned< TDMatDMatMultExpr<MT1,MT2> >
11277  : public IsTrue< And< IsAligned<MT1>, IsAligned<MT2> >::value >
11278 {};
11280 //*************************************************************************************************
11281 
11282 
11283 
11284 
11285 //=================================================================================================
11286 //
11287 // ISLOWER SPECIALIZATIONS
11288 //
11289 //=================================================================================================
11290 
11291 //*************************************************************************************************
// The product expression is classified as lower only if both operand types are lower.
11293 template< typename MT1, typename MT2 >
11294 struct IsLower< TDMatDMatMultExpr<MT1,MT2> >
11295  : public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
11296 {};
11298 //*************************************************************************************************
11299 
11300 
11301 
11302 
11303 //=================================================================================================
11304 //
11305 // ISUNILOWER SPECIALIZATIONS
11306 //
11307 //=================================================================================================
11308 
11309 //*************************************************************************************************
// The product expression is classified as unilower only if both operand types are unilower.
11311 template< typename MT1, typename MT2 >
11312 struct IsUniLower< TDMatDMatMultExpr<MT1,MT2> >
11313  : public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
11314 {};
11316 //*************************************************************************************************
11317 
11318 
11319 
11320 
11321 //=================================================================================================
11322 //
11323 // ISSTRICTLYLOWER SPECIALIZATIONS
11324 //
11325 //=================================================================================================
11326 
11327 //*************************************************************************************************
// The product expression is classified as strictly lower if one operand type is strictly lower
// and the other operand type is lower (in either order).
11329 template< typename MT1, typename MT2 >
11330 struct IsStrictlyLower< TDMatDMatMultExpr<MT1,MT2> >
11331  : public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
11332  , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
11333 {};
11335 //*************************************************************************************************
11336 
11337 
11338 
11339 
11340 //=================================================================================================
11341 //
11342 // ISUPPER SPECIALIZATIONS
11343 //
11344 //=================================================================================================
11345 
11346 //*************************************************************************************************
// The product expression is classified as upper only if both operand types are upper.
11348 template< typename MT1, typename MT2 >
11349 struct IsUpper< TDMatDMatMultExpr<MT1,MT2> >
11350  : public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
11351 {};
11353 //*************************************************************************************************
11354 
11355 
11356 
11357 
11358 //=================================================================================================
11359 //
11360 // ISUNIUPPER SPECIALIZATIONS
11361 //
11362 //=================================================================================================
11363 
11364 //*************************************************************************************************
// The product expression is classified as uniupper only if both operand types are uniupper.
11366 template< typename MT1, typename MT2 >
11367 struct IsUniUpper< TDMatDMatMultExpr<MT1,MT2> >
11368  : public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
11369 {};
11371 //*************************************************************************************************
11372 
11373 
11374 
11375 
11376 //=================================================================================================
11377 //
11378 // ISSTRICTLYUPPER SPECIALIZATIONS
11379 //
11380 //=================================================================================================
11381 
11382 //*************************************************************************************************
// The product expression is classified as strictly upper if one operand type is strictly upper
// and the other operand type is upper (in either order).
11384 template< typename MT1, typename MT2 >
11385 struct IsStrictlyUpper< TDMatDMatMultExpr<MT1,MT2> >
11386  : public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
11387  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
11388 {};
11390 //*************************************************************************************************
11391 
11392 
11393 
11394 
11395 //=================================================================================================
11396 //
11397 // EXPRESSION TRAIT SPECIALIZATIONS
11398 //
11399 //=================================================================================================
11400 
11401 //*************************************************************************************************
// Result type of multiplying the matrix product expression with a dense column vector.
// If MT1 is a column-major dense matrix, MT2 a row-major dense matrix and VT a dense column
// vector, Type is the result type of MT1 * ( MT2 * VT ), i.e. the right-hand matrix/vector
// product is formed first; otherwise Type is INVALID_TYPE.
11403 template< typename MT1, typename MT2, typename VT >
11404 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
11405 {
11406  public:
11407  //**********************************************************************************************
11408  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11409  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
11410  IsDenseVector<VT>::value && IsColumnVector<VT>::value
11411  , typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
11412  , INVALID_TYPE >::Type Type;
11413  //**********************************************************************************************
11414 };
11416 //*************************************************************************************************
11417 
11418 
11419 //*************************************************************************************************
// Result type of multiplying the matrix product expression with a sparse column vector.
// If MT1 is a column-major dense matrix, MT2 a row-major dense matrix and VT a sparse column
// vector, Type is the result type of MT1 * ( MT2 * VT ), where the inner type is obtained from
// DMatSVecMultExprTrait and the outer one from TDMatDVecMultExprTrait; otherwise Type is
// INVALID_TYPE.
11421 template< typename MT1, typename MT2, typename VT >
11422 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
11423 {
11424  public:
11425  //**********************************************************************************************
11426  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11427  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
11428  IsSparseVector<VT>::value && IsColumnVector<VT>::value
11429  , typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
11430  , INVALID_TYPE >::Type Type;
11431  //**********************************************************************************************
11432 };
11434 //*************************************************************************************************
11435 
11436 
11437 //*************************************************************************************************
// Result type of multiplying a dense row vector with the matrix product expression.
// If VT is a dense row vector, MT1 a column-major dense matrix and MT2 a row-major dense matrix,
// Type is the result type of ( VT * MT1 ) * MT2, i.e. the left-hand vector/matrix product is
// formed first; otherwise Type is INVALID_TYPE.
11439 template< typename VT, typename MT1, typename MT2 >
11440 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
11441 {
11442  public:
11443  //**********************************************************************************************
11444  typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
11445  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11446  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
11447  , typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
11448  , INVALID_TYPE >::Type Type;
11449  //**********************************************************************************************
11450 };
11452 //*************************************************************************************************
11453 
11454 
11455 //*************************************************************************************************
// Result type of multiplying a sparse row vector with the matrix product expression.
// If VT is a sparse row vector, MT1 a column-major dense matrix and MT2 a row-major dense matrix,
// Type is the result type of ( VT * MT1 ) * MT2, where the inner type is obtained from
// TSVecTDMatMultExprTrait and the outer one from TDVecDMatMultExprTrait; otherwise Type is
// INVALID_TYPE.
11457 template< typename VT, typename MT1, typename MT2 >
11458 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
11459 {
11460  public:
11461  //**********************************************************************************************
11462  typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
11463  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11464  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
11465  , typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
11466  , INVALID_TYPE >::Type Type;
11467  //**********************************************************************************************
11468 };
11470 //*************************************************************************************************
11471 
11472 
11473 //*************************************************************************************************
// Result type of taking a submatrix of the matrix product expression: the product (MultExprTrait)
// of the submatrix types of const MT1 and const MT2, both obtained with the same AF parameter.
11475 template< typename MT1, typename MT2, bool AF >
11476 struct SubmatrixExprTrait< TDMatDMatMultExpr<MT1,MT2>, AF >
11477 {
11478  public:
11479  //**********************************************************************************************
11480  typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
11481  , typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
11482  //**********************************************************************************************
11483 };
11485 //*************************************************************************************************
11486 
11487 
11488 //*************************************************************************************************
// Result type of taking a row of the matrix product expression: the product (MultExprTrait) of
// the row type of const MT1 with the complete right-hand side operand type MT2.
11490 template< typename MT1, typename MT2 >
11491 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >
11492 {
11493  public:
11494  //**********************************************************************************************
11495  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
11496  //**********************************************************************************************
11497 };
11499 //*************************************************************************************************
11500 
11501 
11502 //*************************************************************************************************
// Result type of taking a column of the matrix product expression: the product (MultExprTrait) of
// the complete left-hand side operand type MT1 with the column type of const MT2.
11504 template< typename MT1, typename MT2 >
11505 struct ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >
11506 {
11507  public:
11508  //**********************************************************************************************
11509  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
11510  //**********************************************************************************************
11511 };
11513 //*************************************************************************************************
11514 
11515 } // namespace blaze
11516 
11517 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1729
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, simd_int16_t >::Type set(T value)
Sets all values in the vector to the given 2-byte integral value.
Definition: Set.h:73
Data type constraint.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:244
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:7820
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
Header file for basic type definitions.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:437
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:226
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:436
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:383
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
Header file for the And class template.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:257
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:337
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the IsUniLower type trait.
Header file for the IsFloat type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:407
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:353
Constraint on the data type.
Header file for the IsComplexDouble type trait.
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:138
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:225
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:363
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1682
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:427
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:266
Header file for the Not class template.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:238
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:230
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:231
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:227
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:138
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:395
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:149
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:373
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:241
Constraints on the storage order of matrix types.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:232
Header file for the HasMutableDataAccess type trait.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:122
Header file for the IsDenseVector type trait.
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:235
Header file for all intrinsic functionality.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:417
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:150
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
Header file for the TDVecDMatMultExprTrait class template.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:151
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:281
Header file for the complex data type.
Header file for the IsUpper type trait.
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:229
Header file for exception macros.
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:228
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.