DMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
55 #include <blaze/math/Functions.h>
56 #include <blaze/math/Intrinsics.h>
57 #include <blaze/math/shims/Reset.h>
95 #include <blaze/system/BLAS.h>
96 #include <blaze/system/Blocking.h>
99 #include <blaze/util/Assert.h>
100 #include <blaze/util/Complex.h>
104 #include <blaze/util/DisableIf.h>
105 #include <blaze/util/EnableIf.h>
106 #include <blaze/util/Exception.h>
107 #include <blaze/util/InvalidType.h>
109 #include <blaze/util/mpl/And.h>
110 #include <blaze/util/mpl/Not.h>
111 #include <blaze/util/mpl/Or.h>
112 #include <blaze/util/SelectType.h>
113 #include <blaze/util/Types.h>
123 
124 
125 namespace blaze {
126 
127 //=================================================================================================
128 //
129 // CLASS DMATTDMATMULTEXPR
130 //
131 //=================================================================================================
132 
133 //*************************************************************************************************
140 template< typename MT1 // Type of the left-hand side dense matrix
141  , typename MT2 > // Type of the right-hand side dense matrix
142 class DMatTDMatMultExpr : public DenseMatrix< DMatTDMatMultExpr<MT1,MT2>, false >
143  , private MatMatMultExpr
144  , private Computation
145 {
146  private:
147  //**Type definitions****************************************************************************
 // Private shorthand aliases derived from the two operand types MT1 and MT2.
148  typedef typename MT1::ResultType RT1; //!< ResultType member of the left-hand side operand type.
149  typedef typename MT2::ResultType RT2; //!< ResultType member of the right-hand side operand type.
150  typedef typename RT1::ElementType ET1; //!< Element type of the left-hand side result type.
151  typedef typename RT2::ElementType ET2; //!< Element type of the right-hand side result type.
152  typedef typename MT1::CompositeType CT1; //!< CompositeType member of the left-hand side operand type.
153  typedef typename MT2::CompositeType CT2; //!< CompositeType member of the right-hand side operand type.
154  //**********************************************************************************************
155 
156  //**********************************************************************************************
159  //**********************************************************************************************
160 
161  //**********************************************************************************************
163  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
164  //**********************************************************************************************
165 
166  //**********************************************************************************************
168 
 //! Helper structure reporting whether at least one of the two operands has to be evaluated.
 /*! The three template parameters are not used by this definition; the value depends only on the
     class-level switches evaluateLeft and evaluateRight. */
172  template< typename T1, typename T2, typename T3 >
173  struct IsEvaluationRequired {
174  enum { value = ( evaluateLeft || evaluateRight ) };
175  };
177  //**********************************************************************************************
178 
179  //**********************************************************************************************
181 
 //! Helper structure for the compile-time decision whether a BLAS kernel may be selected.
 /*! T1 is tested with HasMutableDataAccess, T2 and T3 with HasConstDataAccess. The value is
     non-zero only if BLAZE_BLAS_MODE is set, neither T2 nor T3 is diagonal, all three types are
     vectorizable, and all three element types are BLAS compatible and identical to the element
     type of T1. */
184  template< typename T1, typename T2, typename T3 >
185  struct UseBlasKernel {
186  enum { value = BLAZE_BLAS_MODE &&
187  HasMutableDataAccess<T1>::value &&
188  HasConstDataAccess<T2>::value &&
189  HasConstDataAccess<T3>::value &&
190  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
191  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
192  IsBlasCompatible<typename T1::ElementType>::value &&
193  IsBlasCompatible<typename T2::ElementType>::value &&
194  IsBlasCompatible<typename T3::ElementType>::value &&
195  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
196  IsSame< typename T1::ElementType, typename T3::ElementType >::value };
197  };
199  //**********************************************************************************************
200 
201  //**********************************************************************************************
203 
 //! Helper structure for the compile-time decision whether the vectorized default kernel is used.
 /*! The value is non-zero only if useOptimizedKernels is set, neither T2 nor T3 is diagonal, all
     three types are vectorizable, all three element types are identical, and IntrinsicTrait
     reports both addition and multiplication for that element type. It guards the two
     selectSmallAssignKernel() overloads via EnableIf/DisableIf. */
206  template< typename T1, typename T2, typename T3 >
207  struct UseVectorizedDefaultKernel {
208  enum { value = useOptimizedKernels &&
209  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
210  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
211  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
212  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
213  IntrinsicTrait<typename T1::ElementType>::addition &&
214  IntrinsicTrait<typename T1::ElementType>::multiplication };
215  };
217  //**********************************************************************************************
218 
219  public:
220  //**Type definitions****************************************************************************
 // NOTE(review): ElementType and ResultType are used below but are not declared in the visible
 // part of this listing; presumably they are declared just above these lines - confirm.
227  typedef const ElementType ReturnType; //!< Type returned by operator() and at().
228  typedef const ResultType CompositeType; //!< Composite type of this expression (a const ResultType).
229 
 //! Operand type stored for the left-hand side: a const copy if MT1 is an expression, a const reference otherwise.
231  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
232 
 //! Operand type stored for the right-hand side: a const copy if MT2 is an expression, a const reference otherwise.
234  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
235 
238 
241  //**********************************************************************************************
242 
243  //**Compilation flags***************************************************************************
245  enum { vectorizable = !IsDiagonal<MT1>::value && !IsDiagonal<MT2>::value &&
246  MT1::vectorizable && MT2::vectorizable &&
250 
 //! Compilation switch for the expression template assignment strategy.
 /*! Non-zero only if neither operand needs an intermediate evaluation and both operand types
     report smpAssignable themselves. */
252  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
253  !evaluateRight && MT2::smpAssignable };
254  //**********************************************************************************************
255 
256  //**Constructor*********************************************************************************
 /*!\brief Constructor for the DMatTDMatMultExpr class.
 //
 // \param lhs The left-hand side operand of the multiplication expression.
 // \param rhs The right-hand side operand of the multiplication expression.
 //
 // Stores both operands and asserts (internal assertion only) that the number of columns of
 // \a lhs matches the number of rows of \a rhs.
 */
262  explicit inline DMatTDMatMultExpr( const MT1& lhs, const MT2& rhs )
263  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
264  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
265  {
266  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
267  }
268  //**********************************************************************************************
269 
270  //**Access operator*****************************************************************************
277  inline ReturnType operator()( size_t i, size_t j ) const {
278  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
279  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
280 
281  const size_t kbegin( ( IsUpper<MT1>::value )
282  ?( ( IsLower<MT2>::value )
283  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
284  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
285  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
286  :( ( IsLower<MT2>::value )
287  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
288  :( 0UL ) ) );
289  const size_t kend( ( IsLower<MT1>::value )
290  ?( ( IsUpper<MT2>::value )
291  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
292  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
293  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
294  :( ( IsUpper<MT2>::value )
295  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
296  :( lhs_.columns() ) ) );
297 
298  if( lhs_.columns() == 0UL ||
299  ( ( IsTriangular<MT1>::value || IsTriangular<MT2>::value ) && kbegin >= kend ) )
300  return ElementType();
301 
303  return lhs_(i,i) * rhs_(i,j);
304 
306  return lhs_(i,j) * rhs_(j,j);
307 
308  const size_t knum( kend - kbegin );
309  const size_t kpos( kbegin + ( ( knum - 1UL ) & size_t(-2) ) + 1UL );
310 
311  ElementType tmp( lhs_(i,kbegin) * rhs_(kbegin,j) );
312 
313  for( size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
314  tmp += lhs_(i,k ) * rhs_(k ,j);
315  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
316  }
317  if( kpos < kend ) {
318  tmp += lhs_(i,kpos) * rhs_(kpos,j);
319  }
320 
321  return tmp;
322  }
323  //**********************************************************************************************
324 
325  //**At function*********************************************************************************
333  inline ReturnType at( size_t i, size_t j ) const {
334  if( i >= lhs_.rows() ) {
335  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
336  }
337  if( j >= rhs_.columns() ) {
338  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
339  }
340  return (*this)(i,j);
341  }
342  //**********************************************************************************************
343 
344  //**Rows function*******************************************************************************
 /*!\brief Returns the current number of rows of the matrix.
 //
 // \return The number of rows, taken from the left-hand side operand.
 */
349  inline size_t rows() const {
350  return lhs_.rows();
351  }
352  //**********************************************************************************************
353 
354  //**Columns function****************************************************************************
 /*!\brief Returns the current number of columns of the matrix.
 //
 // \return The number of columns, taken from the right-hand side operand.
 */
359  inline size_t columns() const {
360  return rhs_.columns();
361  }
362  //**********************************************************************************************
363 
364  //**Left operand access*************************************************************************
 /*!\brief Returns the left-hand side dense matrix operand.
 //
 // \return The stored left-hand side operand.
 */
369  inline LeftOperand leftOperand() const {
370  return lhs_;
371  }
372  //**********************************************************************************************
373 
374  //**Right operand access************************************************************************
 /*!\brief Returns the right-hand side dense matrix operand.
 //
 // \return The stored right-hand side operand.
 */
379  inline RightOperand rightOperand() const {
380  return rhs_;
381  }
382  //**********************************************************************************************
383 
384  //**********************************************************************************************
390  template< typename T >
391  inline bool canAlias( const T* alias ) const {
392  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
393  }
394  //**********************************************************************************************
395 
396  //**********************************************************************************************
402  template< typename T >
403  inline bool isAliased( const T* alias ) const {
404  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
405  }
406  //**********************************************************************************************
407 
408  //**********************************************************************************************
413  inline bool isAligned() const {
414  return lhs_.isAligned() && rhs_.isAligned();
415  }
416  //**********************************************************************************************
417 
418  //**********************************************************************************************
423  inline bool canSMPAssign() const {
424  return ( !BLAZE_BLAS_IS_PARALLEL ||
425  ( rows() * columns() < DMATTDMATMULT_THRESHOLD ) ) &&
426  ( rows() > SMP_DMATTDMATMULT_THRESHOLD );
427  }
428  //**********************************************************************************************
429 
430  private:
431  //**Member variables****************************************************************************
432  LeftOperand lhs_; //!< Left-hand side dense matrix of the multiplication expression.
433  RightOperand rhs_; //!< Right-hand side dense matrix of the multiplication expression.
434  //**********************************************************************************************
435 
436  //**Assignment to dense matrices****************************************************************
449  template< typename MT // Type of the target dense matrix
450  , bool SO > // Storage order of the target dense matrix
451  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
452  {
454 
455  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
456  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
457 
458  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
459  return;
460  }
461  else if( rhs.lhs_.columns() == 0UL ) {
462  reset( ~lhs );
463  return;
464  }
465 
466  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
467  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
468 
469  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
470  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
471  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
472  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
473  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
474  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
475 
476  DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
477  }
479  //**********************************************************************************************
480 
481  //**Assignment to dense matrices (kernel selection)*********************************************
492  template< typename MT3 // Type of the left-hand side target matrix
493  , typename MT4 // Type of the left-hand side matrix operand
494  , typename MT5 > // Type of the right-hand side matrix operand
495  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
496  {
498  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
499  selectSmallAssignKernel( C, A, B );
500  else
501  selectBlasAssignKernel( C, A, B );
502  }
504  //**********************************************************************************************
505 
506  //**Default assignment to row-major dense matrices (general/general)****************************
 /*!\brief Default assignment kernel for a row-major target (neither operand diagonal).
 //
 // \param C The target left-hand side dense matrix.
 // \param A The left-hand side multiplication operand.
 // \param B The right-hand side multiplication operand.
 // \return void
 //
 // Traverses the target row by row. Rows and columns outside the ranges computed from the
 // triangular properties of the operand types are reset; every other element is computed as
 // the sum of A(i,k)*B(k,j) over the range [kbegin,kend).
 */
520  template< typename MT3 // Type of the left-hand side target matrix
521  , typename MT4 // Type of the left-hand side matrix operand
522  , typename MT5 > // Type of the right-hand side matrix operand
523  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
524  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
525  {
526  const size_t M( A.rows() );
527  const size_t N( B.columns() );
528  const size_t K( A.columns() );
529 
 // Rows [0,ibegin) and [iend,M) are reset as a whole; only rows [ibegin,iend) are computed
530  const size_t ibegin( ( IsStrictlyLower<MT4>::value )
531  ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
532  :( 0UL ) );
533  const size_t iend( ( IsStrictlyUpper<MT4>::value )
534  ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
535  :( M ) );
536  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
537 
538  for( size_t i=0UL; i<ibegin; ++i ) {
539  for( size_t j=0UL; j<N; ++j ) {
540  reset( (~C)(i,j) );
541  }
542  }
543  for( size_t i=ibegin; i<iend; ++i )
544  {
 // Within row i, columns [0,jbegin) and [jend,N) are reset; columns [jbegin,jend) are computed
545  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
546  ?( ( IsStrictlyUpper<MT4>::value )
547  ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
548  :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
549  :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
550  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
551  ?( ( IsStrictlyLower<MT4>::value )
552  ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
553  :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
554  :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
555  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
556 
557  for( size_t j=0UL; j<jbegin; ++j ) {
558  reset( (~C)(i,j) );
559  }
560  for( size_t j=jbegin; j<jend; ++j )
561  {
 // Summation range for element (i,j), narrowed for triangular operand types
562  const size_t kbegin( ( IsUpper<MT4>::value )
563  ?( ( IsLower<MT5>::value )
564  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
565  , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
566  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
567  :( ( IsLower<MT5>::value )
568  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
569  :( 0UL ) ) );
570  const size_t kend( ( IsLower<MT4>::value )
571  ?( ( IsUpper<MT5>::value )
572  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
573  , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
574  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
575  :( ( IsUpper<MT5>::value )
576  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
577  :( K ) ) );
578  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
579 
 // The first product is assigned (no prior reset of the element), the others are accumulated
580  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
581  for( size_t k=kbegin+1UL; k<kend; ++k ) {
582  (~C)(i,j) += A(i,k) * B(k,j);
583  }
584  }
585  for( size_t j=jend; j<N; ++j ) {
586  reset( (~C)(i,j) );
587  }
588  }
589  for( size_t i=iend; i<M; ++i ) {
590  for( size_t j=0UL; j<N; ++j ) {
591  reset( (~C)(i,j) );
592  }
593  }
594  }
596  //**********************************************************************************************
597 
598  //**Default assignment to column-major dense matrices (general/general)*************************
 /*!\brief Default assignment kernel for a column-major target (neither operand diagonal).
 //
 // \param C The target left-hand side dense matrix.
 // \param A The left-hand side multiplication operand.
 // \param B The right-hand side multiplication operand.
 // \return void
 //
 // Traverses the target column by column. Columns and rows outside the ranges computed from the
 // triangular properties of the operand types are reset; every other element is computed as
 // the sum of A(i,k)*B(k,j) over the range [kbegin,kend).
 */
612  template< typename MT3 // Type of the left-hand side target matrix
613  , typename MT4 // Type of the left-hand side matrix operand
614  , typename MT5 > // Type of the right-hand side matrix operand
615  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
616  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
617  {
618  const size_t M( A.rows() );
619  const size_t N( B.columns() );
620  const size_t K( A.columns() );
621 
 // Columns [0,jbegin) and [jend,N) are reset as a whole; only columns [jbegin,jend) are computed
622  const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
623  ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
624  :( 0UL ) );
625  const size_t jend( ( IsStrictlyLower<MT5>::value )
626  ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
627  :( N ) );
628  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
629 
630  for( size_t j=0UL; j<jbegin; ++j ) {
631  for( size_t i=0UL; i<M; ++i ) {
632  reset( (~C)(i,j) );
633  }
634  }
635  for( size_t j=jbegin; j<jend; ++j )
636  {
 // Within column j, rows [0,ibegin) and [iend,M) are reset; rows [ibegin,iend) are computed
637  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
638  ?( ( IsStrictlyLower<MT4>::value )
639  ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
640  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
641  :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
642  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
643  ?( ( IsStrictlyUpper<MT4>::value )
644  ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
645  :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
646  :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
647  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
648 
649  for( size_t i=0UL; i<ibegin; ++i ) {
650  reset( (~C)(i,j) );
651  }
652  for( size_t i=ibegin; i<iend; ++i )
653  {
 // Summation range for element (i,j), narrowed for triangular operand types
654  const size_t kbegin( ( IsUpper<MT4>::value )
655  ?( ( IsLower<MT5>::value )
656  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
657  , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
658  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
659  :( ( IsLower<MT5>::value )
660  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
661  :( 0UL ) ) );
662  const size_t kend( ( IsLower<MT4>::value )
663  ?( ( IsUpper<MT5>::value )
664  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
665  , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
666  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
667  :( ( IsUpper<MT5>::value )
668  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
669  :( K ) ) );
670  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
671 
 // The first product is assigned (no prior reset of the element), the others are accumulated
672  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
673  for( size_t k=kbegin+1UL; k<kend; ++k ) {
674  (~C)(i,j) += A(i,k) * B(k,j);
675  }
676  }
677  for( size_t i=iend; i<M; ++i ) {
678  reset( (~C)(i,j) );
679  }
680  }
681  for( size_t j=jend; j<N; ++j ) {
682  for( size_t i=0UL; i<M; ++i ) {
683  reset( (~C)(i,j) );
684  }
685  }
686  }
688  //**********************************************************************************************
689 
690  //**Default assignment to row-major dense matrices (general/diagonal)***************************
704  template< typename MT3 // Type of the left-hand side target matrix
705  , typename MT4 // Type of the left-hand side matrix operand
706  , typename MT5 > // Type of the right-hand side matrix operand
707  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
708  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
709  {
710  const size_t M( A.rows() );
711  const size_t N( B.columns() );
712 
713  for( size_t i=0UL; i<M; ++i )
714  {
715  const size_t jbegin( ( IsUpper<MT4>::value )
716  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
717  :( 0UL ) );
718  const size_t jend( ( IsLower<MT4>::value )
719  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
720  :( N ) );
721  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
722 
723  if( IsUpper<MT4>::value ) {
724  for( size_t j=0UL; j<jbegin; ++j ) {
725  reset( (~C)(i,j) );
726  }
727  }
728  for( size_t j=jbegin; j<jend; ++j ) {
729  (~C)(i,j) = A(i,j) * B(j,j);
730  }
731  if( IsLower<MT4>::value ) {
732  for( size_t j=jend; j<N; ++j ) {
733  reset( (~C)(i,j) );
734  }
735  }
736  }
737  }
739  //**********************************************************************************************
740 
741  //**Default assignment to column-major dense matrices (general/diagonal)************************
755  template< typename MT3 // Type of the left-hand side target matrix
756  , typename MT4 // Type of the left-hand side matrix operand
757  , typename MT5 > // Type of the right-hand side matrix operand
758  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
759  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
760  {
761  const size_t M( A.rows() );
762  const size_t N( B.columns() );
763 
764  const size_t block( BLOCK_SIZE );
765 
766  for( size_t jj=0UL; jj<N; jj+=block ) {
767  const size_t jend( min( N, jj+block ) );
768  for( size_t ii=0UL; ii<M; ii+=block ) {
769  const size_t iend( min( M, ii+block ) );
770  for( size_t j=jj; j<jend; ++j )
771  {
772  const size_t ibegin( ( IsLower<MT4>::value )
773  ?( max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
774  :( ii ) );
775  const size_t ipos( ( IsUpper<MT4>::value )
776  ?( min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
777  :( iend ) );
778 
779  if( IsLower<MT4>::value ) {
780  for( size_t i=ii; i<ibegin; ++i ) {
781  reset( (~C)(i,j) );
782  }
783  }
784  for( size_t i=ibegin; i<ipos; ++i ) {
785  (~C)(i,j) = A(i,j) * B(j,j);
786  }
787  if( IsUpper<MT4>::value ) {
788  for( size_t i=ipos; i<iend; ++i ) {
789  reset( (~C)(i,j) );
790  }
791  }
792  }
793  }
794  }
795  }
797  //**********************************************************************************************
798 
799  //**Default assignment to row-major dense matrices (diagonal/general)***************************
813  template< typename MT3 // Type of the left-hand side target matrix
814  , typename MT4 // Type of the left-hand side matrix operand
815  , typename MT5 > // Type of the right-hand side matrix operand
816  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
817  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
818  {
819  const size_t M( A.rows() );
820  const size_t N( B.columns() );
821 
822  const size_t block( BLOCK_SIZE );
823 
824  for( size_t ii=0UL; ii<M; ii+=block ) {
825  const size_t iend( min( M, ii+block ) );
826  for( size_t jj=0UL; jj<N; jj+=block ) {
827  const size_t jend( min( N, jj+block ) );
828  for( size_t i=ii; i<iend; ++i )
829  {
830  const size_t jbegin( ( IsUpper<MT5>::value )
831  ?( max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
832  :( jj ) );
833  const size_t jpos( ( IsLower<MT5>::value )
834  ?( min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
835  :( jend ) );
836 
837  if( IsUpper<MT5>::value ) {
838  for( size_t j=jj; j<jbegin; ++j ) {
839  reset( (~C)(i,j) );
840  }
841  }
842  for( size_t j=jbegin; j<jpos; ++j ) {
843  (~C)(i,j) = A(i,i) * B(i,j);
844  }
845  if( IsLower<MT5>::value ) {
846  for( size_t j=jpos; j<jend; ++j ) {
847  reset( (~C)(i,j) );
848  }
849  }
850  }
851  }
852  }
853  }
855  //**********************************************************************************************
856 
857  //**Default assignment to column-major dense matrices (diagonal/general)************************
871  template< typename MT3 // Type of the left-hand side target matrix
872  , typename MT4 // Type of the left-hand side matrix operand
873  , typename MT5 > // Type of the right-hand side matrix operand
874  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
875  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
876  {
877  const size_t M( A.rows() );
878  const size_t N( B.columns() );
879 
880  for( size_t j=0UL; j<N; ++j )
881  {
882  const size_t ibegin( ( IsLower<MT5>::value )
883  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
884  :( 0UL ) );
885  const size_t iend( ( IsUpper<MT5>::value )
886  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
887  :( M ) );
888  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
889 
890  if( IsLower<MT5>::value ) {
891  for( size_t i=0UL; i<ibegin; ++i ) {
892  reset( (~C)(i,j) );
893  }
894  }
895  for( size_t i=ibegin; i<iend; ++i ) {
896  (~C)(i,j) = A(i,i) * B(i,j);
897  }
898  if( IsUpper<MT5>::value ) {
899  for( size_t i=iend; i<M; ++i ) {
900  reset( (~C)(i,j) );
901  }
902  }
903  }
904  }
906  //**********************************************************************************************
907 
908  //**Default assignment to dense matrices (diagonal/diagonal)************************************
922  template< typename MT3 // Type of the left-hand side target matrix
923  , typename MT4 // Type of the left-hand side matrix operand
924  , typename MT5 > // Type of the right-hand side matrix operand
925  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
926  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
927  {
928  reset( C );
929 
930  for( size_t i=0UL; i<A.rows(); ++i ) {
931  C(i,i) = A(i,i) * B(i,i);
932  }
933  }
935  //**********************************************************************************************
936 
937  //**Default assignment to dense matrices (small matrices)***************************************
 /*!\brief Default assignment for small matrices when the vectorized kernel is not applicable.
 //
 // \param C The target left-hand side dense matrix.
 // \param A The left-hand side multiplication operand.
 // \param B The right-hand side multiplication operand.
 // \return void
 //
 // This overload is enabled whenever UseVectorizedDefaultKernel<MT3,MT4,MT5> is not satisfied
 // and simply forwards to selectDefaultAssignKernel().
 */
951  template< typename MT3 // Type of the left-hand side target matrix
952  , typename MT4 // Type of the left-hand side matrix operand
953  , typename MT5 > // Type of the right-hand side matrix operand
954  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
955  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
956  {
957  selectDefaultAssignKernel( C, A, B );
958  }
960  //**********************************************************************************************
961 
962  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
977  template< typename MT3 // Type of the left-hand side target matrix
978  , typename MT4 // Type of the left-hand side matrix operand
979  , typename MT5 > // Type of the right-hand side matrix operand
980  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
981  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
982  {
983  typedef IntrinsicTrait<ElementType> IT;
984 
985  const size_t M( A.rows() );
986  const size_t N( B.columns() );
987  const size_t K( A.columns() );
988 
989  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
990 
991  size_t i( 0UL );
992 
993  for( ; (i+2UL) <= M; i+=2UL )
994  {
995  size_t j( 0UL );
996 
997  for( ; (j+4UL) <= N; j+=4UL )
998  {
999  const size_t kbegin( ( IsUpper<MT4>::value )
1000  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1001  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1002  const size_t kend( ( IsLower<MT4>::value )
1003  ?( IsUpper<MT5>::value ? min( i+2UL, j+4UL ) : ( i+2UL ) )
1004  :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
1005 
1006  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1007  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1008 
1009  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1010  size_t k( kbegin );
1011 
1012  for( ; k<kpos; k+=IT::size ) {
1013  const IntrinsicType a1( A.load(i ,k) );
1014  const IntrinsicType a2( A.load(i+1UL,k) );
1015  const IntrinsicType b1( B.load(k,j ) );
1016  const IntrinsicType b2( B.load(k,j+1UL) );
1017  const IntrinsicType b3( B.load(k,j+2UL) );
1018  const IntrinsicType b4( B.load(k,j+3UL) );
1019  xmm1 = xmm1 + a1 * b1;
1020  xmm2 = xmm2 + a1 * b2;
1021  xmm3 = xmm3 + a1 * b3;
1022  xmm4 = xmm4 + a1 * b4;
1023  xmm5 = xmm5 + a2 * b1;
1024  xmm6 = xmm6 + a2 * b2;
1025  xmm7 = xmm7 + a2 * b3;
1026  xmm8 = xmm8 + a2 * b4;
1027  }
1028 
1029  (~C)(i ,j ) = sum( xmm1 );
1030  (~C)(i ,j+1UL) = sum( xmm2 );
1031  (~C)(i ,j+2UL) = sum( xmm3 );
1032  (~C)(i ,j+3UL) = sum( xmm4 );
1033  (~C)(i+1UL,j ) = sum( xmm5 );
1034  (~C)(i+1UL,j+1UL) = sum( xmm6 );
1035  (~C)(i+1UL,j+2UL) = sum( xmm7 );
1036  (~C)(i+1UL,j+3UL) = sum( xmm8 );
1037 
1038  for( ; remainder && k<kend; ++k ) {
1039  (~C)(i ,j ) += A(i ,k) * B(k,j );
1040  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1041  (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1042  (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1043  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1044  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1045  (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1046  (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
1047  }
1048  }
1049 
1050  for( ; (j+2UL) <= N; j+=2UL )
1051  {
1052  const size_t kbegin( ( IsUpper<MT4>::value )
1053  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1054  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1055  const size_t kend( ( IsLower<MT4>::value )
1056  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
1057  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
1058 
1059  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1060  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1061 
1062  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1063  size_t k( kbegin );
1064 
1065  for( ; k<kpos; k+=IT::size ) {
1066  const IntrinsicType a1( A.load(i ,k) );
1067  const IntrinsicType a2( A.load(i+1UL,k) );
1068  const IntrinsicType b1( B.load(k,j ) );
1069  const IntrinsicType b2( B.load(k,j+1UL) );
1070  xmm1 = xmm1 + a1 * b1;
1071  xmm2 = xmm2 + a1 * b2;
1072  xmm3 = xmm3 + a2 * b1;
1073  xmm4 = xmm4 + a2 * b2;
1074  }
1075 
1076  (~C)(i ,j ) = sum( xmm1 );
1077  (~C)(i ,j+1UL) = sum( xmm2 );
1078  (~C)(i+1UL,j ) = sum( xmm3 );
1079  (~C)(i+1UL,j+1UL) = sum( xmm4 );
1080 
1081  for( ; remainder && k<kend; ++k ) {
1082  (~C)(i ,j ) += A(i ,k) * B(k,j );
1083  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1084  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1085  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1086  }
1087  }
1088 
1089  if( j < N )
1090  {
1091  const size_t kbegin( ( IsUpper<MT4>::value )
1092  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1093  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1094  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
1095 
1096  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1097  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1098 
1099  IntrinsicType xmm1, xmm2;
1100  size_t k( kbegin );
1101 
1102  for( ; k<kpos; k+=IT::size ) {
1103  const IntrinsicType b1( B.load(k,j) );
1104  xmm1 = xmm1 + A.load(i ,k) * b1;
1105  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1106  }
1107 
1108  (~C)(i ,j) = sum( xmm1 );
1109  (~C)(i+1UL,j) = sum( xmm2 );
1110 
1111  for( ; remainder && k<kend; ++k ) {
1112  (~C)(i ,j) += A(i ,k) * B(k,j);
1113  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
1114  }
1115  }
1116  }
1117 
1118  if( i < M )
1119  {
1120  size_t j( 0UL );
1121 
1122  for( ; (j+4UL) <= N; j+=4UL )
1123  {
1124  const size_t kbegin( ( IsUpper<MT4>::value )
1125  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1126  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1127  const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
1128 
1129  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1130  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1131 
1132  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1133  size_t k( kbegin );
1134 
1135  for( ; k<kpos; k+=IT::size ) {
1136  const IntrinsicType a1( A.load(i,k) );
1137  xmm1 = xmm1 + a1 * B.load(k,j );
1138  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1139  xmm3 = xmm3 + a1 * B.load(k,j+2UL);
1140  xmm4 = xmm4 + a1 * B.load(k,j+3UL);
1141  }
1142 
1143  (~C)(i,j ) = sum( xmm1 );
1144  (~C)(i,j+1UL) = sum( xmm2 );
1145  (~C)(i,j+2UL) = sum( xmm3 );
1146  (~C)(i,j+3UL) = sum( xmm4 );
1147 
1148  for( ; remainder && k<kend; ++k ) {
1149  (~C)(i,j ) += A(i,k) * B(k,j );
1150  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
1151  (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
1152  (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
1153  }
1154  }
1155 
1156  for( ; (j+2UL) <= N; j+=2UL )
1157  {
1158  const size_t kbegin( ( IsUpper<MT4>::value )
1159  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1160  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1161  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
1162 
1163  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1164  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1165 
1166  IntrinsicType xmm1, xmm2;
1167  size_t k( kbegin );
1168 
1169  for( ; k<kpos; k+=IT::size ) {
1170  const IntrinsicType a1( A.load(i,k) );
1171  xmm1 = xmm1 + a1 * B.load(k,j );
1172  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1173  }
1174 
1175  (~C)(i,j ) = sum( xmm1 );
1176  (~C)(i,j+1UL) = sum( xmm2 );
1177 
1178  for( ; remainder && k<kend; ++k ) {
1179  (~C)(i,j ) += A(i,k) * B(k,j );
1180  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
1181  }
1182  }
1183 
1184  if( j < N )
1185  {
1186  const size_t kbegin( ( IsUpper<MT4>::value )
1187  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1188  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1189 
1190  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
1191  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
1192 
1193  IntrinsicType xmm1;
1194  size_t k( kbegin );
1195 
1196  for( ; k<kpos; k+=IT::size ) {
1197  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1198  }
1199 
1200  (~C)(i,j) = sum( xmm1 );
1201 
1202  for( ; remainder && k<K; ++k ) {
1203  (~C)(i,j) += A(i,k) * B(k,j);
1204  }
1205  }
1206  }
1207  }
1209  //**********************************************************************************************
1210 
1211  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized small-matrix kernel for the plain assignment C = A * B into a column-major
// target (C is taken as DenseMatrix<MT3,true>).
//
// \param C The target matrix. Each element (i,j) with i < A.rows() and j < B.columns() is
//          assigned (not accumulated into) by this kernel.
// \param A The left-hand side operand, accessed via A.load(i,k) and A(i,k).
// \param B The right-hand side operand, accessed via B.load(k,j) and B(k,j).
// \return EnableIf<...>::Type; the overload only participates when
//         UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// Rows are processed in groups of four, then two, then one trailing row. Within each row
// group, columns are processed two at a time, followed by one trailing column. For every
// tile the k range is split into a vectorized part [kbegin,kpos) that advances by IT::size
// and a scalar tail [kpos,kend) that only runs when the remainder flag is set.
// NOTE(review): the load(i,k)/load(k,j) pattern presumably relies on A being row-major and
// B being column-major so that consecutive k are contiguous -- confirm against the callers.
1226  template< typename MT3 // Type of the left-hand side target matrix
1227  , typename MT4 // Type of the left-hand side matrix operand
1228  , typename MT5 > // Type of the right-hand side matrix operand
1229  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1230  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1231  {
1232  typedef IntrinsicTrait<ElementType> IT;
1233 
1234  const size_t M( A.rows() );
1235  const size_t N( B.columns() );
1236  const size_t K( A.columns() );
1237 
// The scalar tail loops are required unless both operand types are padded.
1238  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
1239 
1240  size_t i( 0UL );
1241 
// Row groups of four.
1242  for( ; (i+4UL) <= M; i+=4UL )
1243  {
1244  size_t j( 0UL );
1245 
// 4x2 tiles.
1246  for( ; (j+2UL) <= N; j+=2UL )
1247  {
// First k index: starts at i if MT4 is flagged upper and/or at j if MT5 is flagged lower,
// masked with size_t(-IT::size) (rounds down to a multiple of IT::size, assuming IT::size
// is a power of two -- TODO confirm).
1248  const size_t kbegin( ( IsUpper<MT4>::value )
1249  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1250  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
// One past the last k index: limited to the extent of the tile if MT4 is flagged lower
// and/or MT5 is flagged upper, K otherwise.
1251  const size_t kend( ( IsLower<MT4>::value )
1252  ?( IsUpper<MT5>::value ? min( i+4UL, j+2UL ) : ( i+4UL ) )
1253  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
1254 
// End of the vectorized part: kend rounded down to a multiple of IT::size when a scalar
// tail is needed (checked by the assertion below), kend itself otherwise.
1255  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1256  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1257 
// One accumulator per element of the tile. They are default-constructed here; presumably
// that yields zero-filled registers -- confirm against IntrinsicType.
1258  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1259  size_t k( kbegin );
1260 
1261  for( ; k<kpos; k+=IT::size ) {
1262  const IntrinsicType a1( A.load(i ,k) );
1263  const IntrinsicType a2( A.load(i+1UL,k) );
1264  const IntrinsicType a3( A.load(i+2UL,k) );
1265  const IntrinsicType a4( A.load(i+3UL,k) );
1266  const IntrinsicType b1( B.load(k,j ) );
1267  const IntrinsicType b2( B.load(k,j+1UL) );
1268  xmm1 = xmm1 + a1 * b1;
1269  xmm2 = xmm2 + a1 * b2;
1270  xmm3 = xmm3 + a2 * b1;
1271  xmm4 = xmm4 + a2 * b2;
1272  xmm5 = xmm5 + a3 * b1;
1273  xmm6 = xmm6 + a3 * b2;
1274  xmm7 = xmm7 + a4 * b1;
1275  xmm8 = xmm8 + a4 * b2;
1276  }
1277 
// The reduced accumulators overwrite the target elements; the scalar tail below then adds
// the contributions of the remaining k in [kpos,kend).
1278  (~C)(i ,j ) = sum( xmm1 );
1279  (~C)(i ,j+1UL) = sum( xmm2 );
1280  (~C)(i+1UL,j ) = sum( xmm3 );
1281  (~C)(i+1UL,j+1UL) = sum( xmm4 );
1282  (~C)(i+2UL,j ) = sum( xmm5 );
1283  (~C)(i+2UL,j+1UL) = sum( xmm6 );
1284  (~C)(i+3UL,j ) = sum( xmm7 );
1285  (~C)(i+3UL,j+1UL) = sum( xmm8 );
1286 
1287  for( ; remainder && k<kend; ++k ) {
1288  (~C)(i ,j ) += A(i ,k) * B(k,j );
1289  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1290  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1291  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1292  (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1293  (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1294  (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1295  (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
1296  }
1297  }
1298 
// 4x1 tile for a trailing single column. The upper bound does not consult MT5 here.
1299  if( j < N )
1300  {
1301  const size_t kbegin( ( IsUpper<MT4>::value )
1302  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1303  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1304  const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
1305 
1306  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1307  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1308 
1309  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1310  size_t k( kbegin );
1311 
1312  for( ; k<kpos; k+=IT::size ) {
1313  const IntrinsicType b1( B.load(k,j) );
1314  xmm1 = xmm1 + A.load(i ,k) * b1;
1315  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1316  xmm3 = xmm3 + A.load(i+2UL,k) * b1;
1317  xmm4 = xmm4 + A.load(i+3UL,k) * b1;
1318  }
1319 
1320  (~C)(i ,j) = sum( xmm1 );
1321  (~C)(i+1UL,j) = sum( xmm2 );
1322  (~C)(i+2UL,j) = sum( xmm3 );
1323  (~C)(i+3UL,j) = sum( xmm4 );
1324 
1325  for( ; remainder && k<kend; ++k ) {
1326  (~C)(i ,j) += A(i ,k) * B(k,j);
1327  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
1328  (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
1329  (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
1330  }
1331  }
1332  }
1333 
// Row groups of two for the rows left over by the four-row loop.
1334  for( ; (i+2UL) <= M; i+=2UL )
1335  {
1336  size_t j( 0UL );
1337 
// 2x2 tiles.
1338  for( ; (j+2UL) <= N; j+=2UL )
1339  {
1340  const size_t kbegin( ( IsUpper<MT4>::value )
1341  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1342  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1343  const size_t kend( ( IsLower<MT4>::value )
1344  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
1345  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
1346 
1347  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1348  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1349 
1350  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1351  size_t k( kbegin );
1352 
1353  for( ; k<kpos; k+=IT::size ) {
1354  const IntrinsicType a1( A.load(i ,k) );
1355  const IntrinsicType a2( A.load(i+1UL,k) );
1356  const IntrinsicType b1( B.load(k,j ) );
1357  const IntrinsicType b2( B.load(k,j+1UL) );
1358  xmm1 = xmm1 + a1 * b1;
1359  xmm2 = xmm2 + a1 * b2;
1360  xmm3 = xmm3 + a2 * b1;
1361  xmm4 = xmm4 + a2 * b2;
1362  }
1363 
1364  (~C)(i ,j ) = sum( xmm1 );
1365  (~C)(i ,j+1UL) = sum( xmm2 );
1366  (~C)(i+1UL,j ) = sum( xmm3 );
1367  (~C)(i+1UL,j+1UL) = sum( xmm4 );
1368 
1369  for( ; remainder && k<kend; ++k ) {
1370  (~C)(i ,j ) += A(i ,k) * B(k,j );
1371  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1372  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1373  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1374  }
1375  }
1376 
// 2x1 tile for a trailing single column.
1377  if( j < N )
1378  {
1379  const size_t kbegin( ( IsUpper<MT4>::value )
1380  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1381  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1382  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
1383 
1384  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1385  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1386 
1387  IntrinsicType xmm1, xmm2;
1388  size_t k( kbegin );
1389 
1390  for( ; k<kpos; k+=IT::size ) {
1391  const IntrinsicType b1( B.load(k,j) );
1392  xmm1 = xmm1 + A.load(i ,k) * b1;
1393  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1394  }
1395 
1396  (~C)(i ,j) = sum( xmm1 );
1397  (~C)(i+1UL,j) = sum( xmm2 );
1398 
1399  for( ; remainder && k<kend; ++k ) {
1400  (~C)(i ,j) += A(i ,k) * B(k,j);
1401  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
1402  }
1403  }
1404  }
1405 
// A single trailing row, if any.
1406  if( i < M )
1407  {
1408  size_t j( 0UL );
1409 
// 1x2 tiles. The upper bound does not consult MT4 here.
1410  for( ; (j+2UL) <= N; j+=2UL )
1411  {
1412  const size_t kbegin( ( IsUpper<MT4>::value )
1413  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1414  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1415  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
1416 
1417  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
1418  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
1419 
1420  IntrinsicType xmm1, xmm2;
1421  size_t k( kbegin );
1422 
1423  for( ; k<kpos; k+=IT::size ) {
1424  const IntrinsicType a1( A.load(i,k) );
1425  xmm1 = xmm1 + a1 * B.load(k,j );
1426  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1427  }
1428 
1429  (~C)(i,j ) = sum( xmm1 );
1430  (~C)(i,j+1UL) = sum( xmm2 );
1431 
1432  for( ; remainder && k<kend; ++k ) {
1433  (~C)(i,j ) += A(i,k) * B(k,j );
1434  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
1435  }
1436  }
1437 
// Final 1x1 element; the k range always extends to K here.
1438  if( j < N )
1439  {
1440  const size_t kbegin( ( IsUpper<MT4>::value )
1441  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
1442  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
1443 
1444  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
1445  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
1446 
1447  IntrinsicType xmm1;
1448  size_t k( kbegin );
1449 
1450  for( ; k<kpos; k+=IT::size ) {
1451  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1452  }
1453 
1454  (~C)(i,j) = sum( xmm1 );
1455 
1456  for( ; remainder && k<K; ++k ) {
1457  (~C)(i,j) += A(i,k) * B(k,j);
1458  }
1459  }
1460  }
1461  }
1463  //**********************************************************************************************
1464 
1465  //**Default assignment to dense matrices (large matrices)***************************************
1479  template< typename MT3 // Type of the left-hand side target matrix
1480  , typename MT4 // Type of the left-hand side matrix operand
1481  , typename MT5 > // Type of the right-hand side matrix operand
1482  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1483  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1484  {
1485  selectDefaultAssignKernel( C, A, B );
1486  }
1488  //**********************************************************************************************
1489 
1490  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
1505  template< typename MT3 // Type of the left-hand side target matrix
1506  , typename MT4 // Type of the left-hand side matrix operand
1507  , typename MT5 > // Type of the right-hand side matrix operand
1508  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1509  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1510  {
1511  // TODO
1512  selectSmallAssignKernel( ~C, A, B );
1513  }
1515  //**********************************************************************************************
1516 
1517  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
1532  template< typename MT3 // Type of the left-hand side target matrix
1533  , typename MT4 // Type of the left-hand side matrix operand
1534  , typename MT5 > // Type of the right-hand side matrix operand
1535  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1536  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1537  {
1538  // TODO
1539  selectSmallAssignKernel( ~C, A, B );
1540  }
1542  //**********************************************************************************************
1543 
1544  //**Default assignment to dense matrices********************************************************
1558  template< typename MT3 // Type of the left-hand side target matrix
1559  , typename MT4 // Type of the left-hand side matrix operand
1560  , typename MT5 > // Type of the right-hand side matrix operand
1561  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1562  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1563  {
1564  selectLargeAssignKernel( C, A, B );
1565  }
1567  //**********************************************************************************************
1568 
1569  //**BLAS-based assignment to dense matrices*****************************************************
1570 #if BLAZE_BLAS_MODE
1571 
1584  template< typename MT3 // Type of the left-hand side target matrix
1585  , typename MT4 // Type of the left-hand side matrix operand
1586  , typename MT5 > // Type of the right-hand side matrix operand
1587  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1588  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1589  {
1590  typedef typename MT3::ElementType ET;
1591 
1592  if( IsTriangular<MT4>::value ) {
1593  assign( C, B );
1594  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1595  }
1596  else if( IsTriangular<MT5>::value ) {
1597  assign( C, A );
1598  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1599  }
1600  else {
1601  gemm( C, A, B, ET(1), ET(0) );
1602  }
1603  }
1605 #endif
1606  //**********************************************************************************************
1607 
1608  //**Assignment to sparse matrices***************************************************************
1621  template< typename MT // Type of the target sparse matrix
1622  , bool SO > // Storage order of the target sparse matrix
1623  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
1624  {
1626 
1627  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
1628 
1635 
1636  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1637  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1638 
1639  const TmpType tmp( serial( rhs ) );
1640  assign( ~lhs, tmp );
1641  }
1643  //**********************************************************************************************
1644 
1645  //**Addition assignment to dense matrices*******************************************************
1658  template< typename MT // Type of the target dense matrix
1659  , bool SO > // Storage order of the target dense matrix
1660  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
1661  {
1663 
1664  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1665  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1666 
1667  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1668  return;
1669  }
1670 
1671  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1672  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1673 
1674  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1675  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1676  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1677  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1678  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1679  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1680 
1681  DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1682  }
1684  //**********************************************************************************************
1685 
1686  //**Addition assignment to dense matrices (kernel selection)************************************
1697  template< typename MT3 // Type of the left-hand side target matrix
1698  , typename MT4 // Type of the left-hand side matrix operand
1699  , typename MT5 > // Type of the right-hand side matrix operand
1700  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1701  {
1702  if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
1703  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
1704  selectSmallAddAssignKernel( C, A, B );
1705  else
1706  selectBlasAddAssignKernel( C, A, B );
1707  }
1709  //**********************************************************************************************
1710 
1711  //**Default addition assignment to row-major dense matrices (general/general)*******************
1725  template< typename MT3 // Type of the left-hand side target matrix
1726  , typename MT4 // Type of the left-hand side matrix operand
1727  , typename MT5 > // Type of the right-hand side matrix operand
1728  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1729  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1730  {
1731  const size_t M( A.rows() );
1732  const size_t N( B.columns() );
1733  const size_t K( A.columns() );
1734 
1735  const size_t ibegin( ( IsStrictlyLower<MT4>::value )
1736  ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
1737  :( 0UL ) );
1738  const size_t iend( ( IsStrictlyUpper<MT4>::value )
1739  ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
1740  :( M ) );
1741  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1742 
1743  for( size_t i=ibegin; i<iend; ++i )
1744  {
1745  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
1746  ?( ( IsStrictlyUpper<MT4>::value )
1747  ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
1748  :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
1749  :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
1750  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
1751  ?( ( IsStrictlyLower<MT4>::value )
1752  ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
1753  :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
1754  :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
1755  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1756 
1757  for( size_t j=jbegin; j<jend; ++j )
1758  {
1759  const size_t kbegin( ( IsUpper<MT4>::value )
1760  ?( ( IsLower<MT5>::value )
1761  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1762  , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1763  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1764  :( ( IsLower<MT5>::value )
1765  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1766  :( 0UL ) ) );
1767  const size_t kend( ( IsLower<MT4>::value )
1768  ?( ( IsUpper<MT5>::value )
1769  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
1770  , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1771  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1772  :( ( IsUpper<MT5>::value )
1773  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1774  :( K ) ) );
1775  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
1776 
1777  const size_t knum( kend - kbegin );
1778  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
1779 
1780  for( size_t k=kbegin; k<kpos; k+=2UL ) {
1781  (~C)(i,j) += A(i,k ) * B(k ,j);
1782  (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
1783  }
1784  if( kpos < kend ) {
1785  (~C)(i,j) += A(i,kpos) * B(kpos,j);
1786  }
1787  }
1788  }
1789  }
1791  //**********************************************************************************************
1792 
1793  //**Default addition assignment to column-major dense matrices (general/general)****************
1807  template< typename MT3 // Type of the left-hand side target matrix
1808  , typename MT4 // Type of the left-hand side matrix operand
1809  , typename MT5 > // Type of the right-hand side matrix operand
1810  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1811  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1812  {
1813  const size_t M( A.rows() );
1814  const size_t N( B.columns() );
1815  const size_t K( A.columns() );
1816 
1817  const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
1818  ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
1819  :( 0UL ) );
1820  const size_t jend( ( IsStrictlyLower<MT5>::value )
1821  ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
1822  :( N ) );
1823  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1824 
1825  for( size_t j=jbegin; j<jend; ++j )
1826  {
1827  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
1828  ?( ( IsStrictlyLower<MT4>::value )
1829  ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
1830  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1831  :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
1832  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
1833  ?( ( IsStrictlyUpper<MT4>::value )
1834  ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
1835  :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
1836  :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
1837  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1838 
1839  for( size_t i=ibegin; i<iend; ++i )
1840  {
1841  const size_t kbegin( ( IsUpper<MT4>::value )
1842  ?( ( IsLower<MT5>::value )
1843  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1844  , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1845  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1846  :( ( IsLower<MT5>::value )
1847  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1848  :( 0UL ) ) );
1849  const size_t kend( ( IsLower<MT4>::value )
1850  ?( ( IsUpper<MT5>::value )
1851  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
1852  , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1853  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1854  :( ( IsUpper<MT5>::value )
1855  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1856  :( K ) ) );
1857  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
1858 
1859  const size_t knum( kend - kbegin );
1860  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
1861 
1862  for( size_t k=kbegin; k<kpos; k+=2UL ) {
1863  (~C)(i,j) += A(i,k ) * B(k ,j);
1864  (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
1865  }
1866  if( kpos < kend ) {
1867  (~C)(i,j) += A(i,kpos) * B(kpos,j);
1868  }
1869  }
1870  }
1871  }
1873  //**********************************************************************************************
1874 
1875  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
1889  template< typename MT3 // Type of the left-hand side target matrix
1890  , typename MT4 // Type of the left-hand side matrix operand
1891  , typename MT5 > // Type of the right-hand side matrix operand
1892  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1893  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1894  {
1895  const size_t M( A.rows() );
1896  const size_t N( B.columns() );
1897 
1898  for( size_t i=0UL; i<M; ++i )
1899  {
1900  const size_t jbegin( ( IsUpper<MT4>::value )
1901  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1902  :( 0UL ) );
1903  const size_t jend( ( IsLower<MT4>::value )
1904  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
1905  :( N ) );
1906  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1907 
1908  const size_t jnum( jend - jbegin );
1909  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1910 
1911  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1912  (~C)(i,j ) += A(i,j ) * B(j ,j );
1913  (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
1914  }
1915  if( jpos < jend ) {
1916  (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos);
1917  }
1918  }
1919  }
1921  //**********************************************************************************************
1922 
1923  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
1937  template< typename MT3 // Type of the left-hand side target matrix
1938  , typename MT4 // Type of the left-hand side matrix operand
1939  , typename MT5 > // Type of the right-hand side matrix operand
1940  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1941  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1942  {
1943  const size_t M( A.rows() );
1944  const size_t N( B.columns() );
1945 
1946  const size_t block( BLOCK_SIZE );
1947 
1948  for( size_t jj=0UL; jj<N; jj+=block ) {
1949  const size_t jend( min( N, jj+block ) );
1950  for( size_t ii=0UL; ii<M; ii+=block ) {
1951  const size_t iend( min( M, ii+block ) );
1952  for( size_t j=jj; j<jend; ++j )
1953  {
1954  const size_t ibegin( ( IsLower<MT4>::value )
1955  ?( max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
1956  :( ii ) );
1957  const size_t ipos( ( IsUpper<MT4>::value )
1958  ?( min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
1959  :( iend ) );
1960 
1961  for( size_t i=ibegin; i<ipos; ++i ) {
1962  (~C)(i,j) += A(i,j) * B(j,j);
1963  }
1964  }
1965  }
1966  }
1967  }
1969  //**********************************************************************************************
1970 
1971  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
1985  template< typename MT3 // Type of the left-hand side target matrix
1986  , typename MT4 // Type of the left-hand side matrix operand
1987  , typename MT5 > // Type of the right-hand side matrix operand
1988  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
1989  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1990  {
1991  const size_t M( A.rows() );
1992  const size_t N( B.columns() );
1993 
1994  const size_t block( BLOCK_SIZE );
1995 
1996  for( size_t ii=0UL; ii<M; ii+=block ) {
1997  const size_t iend( min( M, ii+block ) );
1998  for( size_t jj=0UL; jj<N; jj+=block ) {
1999  const size_t jend( min( N, jj+block ) );
2000  for( size_t i=ii; i<iend; ++i )
2001  {
2002  const size_t jbegin( ( IsUpper<MT5>::value )
2003  ?( max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
2004  :( jj ) );
2005  const size_t jpos( ( IsLower<MT5>::value )
2006  ?( min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
2007  :( jend ) );
2008 
2009  for( size_t j=jbegin; j<jpos; ++j ) {
2010  (~C)(i,j) += A(i,i) * B(i,j);
2011  }
2012  }
2013  }
2014  }
2015  }
2017  //**********************************************************************************************
2018 
2019  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
2033  template< typename MT3 // Type of the left-hand side target matrix
2034  , typename MT4 // Type of the left-hand side matrix operand
2035  , typename MT5 > // Type of the right-hand side matrix operand
2036  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2037  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2038  {
2039  const size_t M( A.rows() );
2040  const size_t N( B.columns() );
2041 
2042  for( size_t j=0UL; j<N; ++j )
2043  {
2044  const size_t ibegin( ( IsLower<MT5>::value )
2045  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2046  :( 0UL ) );
2047  const size_t iend( ( IsUpper<MT5>::value )
2048  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2049  :( M ) );
2050  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2051 
2052  const size_t inum( iend - ibegin );
2053  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2054 
2055  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2056  (~C)(i ,j) += A(i ,i ) * B(i ,j);
2057  (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
2058  }
2059  if( ipos < iend ) {
2060  (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j);
2061  }
2062  }
2063  }
2065  //**********************************************************************************************
2066 
2067  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2081  template< typename MT3 // Type of the left-hand side target matrix
2082  , typename MT4 // Type of the left-hand side matrix operand
2083  , typename MT5 > // Type of the right-hand side matrix operand
2084  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2085  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2086  {
2087  for( size_t i=0UL; i<A.rows(); ++i ) {
2088  C(i,i) += A(i,i) * B(i,i);
2089  }
2090  }
2092  //**********************************************************************************************
2093 
2094  //**Default addition assignment to dense matrices (small matrices)******************************
2108  template< typename MT3 // Type of the left-hand side target matrix
2109  , typename MT4 // Type of the left-hand side matrix operand
2110  , typename MT5 > // Type of the right-hand side matrix operand
2111  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2112  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2113  {
2114  selectDefaultAddAssignKernel( C, A, B );
2115  }
2117  //**********************************************************************************************
2118 
2119  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
/*!\brief Vectorized default addition assignment of a small dense matrix-transpose dense matrix
//        multiplication to a row-major dense matrix (\f$ C+=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// The target is traversed in tiles of 2x4, 2x2 and 2x1 elements, followed by tiles of 1x4, 1x2
// and 1x1 elements for a trailing single row. For every tile the products along the k index are
// accumulated in IntrinsicType variables that are filled via A.load() and B.load(), reduced via
// sum() and added to the corresponding elements of C. If at least one operand is not padded,
// the k range is split into a vectorized part [kbegin,kpos) and an element-wise remainder
// [kpos,kend).
//
// NOTE(review): the accumulators (xmm1, xmm2, ...) are default constructed and never explicitly
// reset; this presumably relies on the default constructor of IntrinsicType producing zero.
// Please confirm against the definition of the intrinsic types.
*/
2134  template< typename MT3 // Type of the left-hand side target matrix
2135  , typename MT4 // Type of the left-hand side matrix operand
2136  , typename MT5 > // Type of the right-hand side matrix operand
2137  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2138  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2139  {
2140  typedef IntrinsicTrait<ElementType> IT;
2141
2142  const size_t M( A.rows() );
2143  const size_t N( B.columns() );
2144  const size_t K( A.columns() );
2145
// An element-wise clean-up of the k range is only performed if at least one operand is not padded
2146  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
2147
2148  size_t i( 0UL );
2149
// Processing of two rows of C at a time
2150  for( ; (i+2UL) <= M; i+=2UL )
2151  {
2152  size_t j( 0UL );
2153
// 2x4 tiles
2154  for( ; (j+4UL) <= N; j+=4UL )
2155  {
// Start of the k range: i for an upper A, j for a lower B (the maximum of both if both apply),
// otherwise 0. The mask size_t(-IT::size) rounds the start index down (to a multiple of
// IT::size, assuming IT::size is a power of two).
2156  const size_t kbegin( ( IsUpper<MT4>::value )
2157  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2158  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
// End of the k range: one past the last row of the tile for a lower A, one past the last column
// of the tile for an upper B (the minimum of both if both apply), otherwise K
2159  const size_t kend( ( IsLower<MT4>::value )
2160  ?( IsUpper<MT5>::value ? min( i+2UL, j+4UL ) : ( i+2UL ) )
2161  :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
2162
// End of the vectorized part: kend rounded down to a multiple of IT::size if a remainder loop
// is required, kend itself otherwise
2163  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2164  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2165
// One accumulator per element of the 2x4 tile
2166  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2167  size_t k( kbegin );
2168
2169  for( ; k<kpos; k+=IT::size ) {
2170  const IntrinsicType a1( A.load(i ,k) );
2171  const IntrinsicType a2( A.load(i+1UL,k) );
2172  const IntrinsicType b1( B.load(k,j ) );
2173  const IntrinsicType b2( B.load(k,j+1UL) );
2174  const IntrinsicType b3( B.load(k,j+2UL) );
2175  const IntrinsicType b4( B.load(k,j+3UL) );
2176  xmm1 = xmm1 + a1 * b1;
2177  xmm2 = xmm2 + a1 * b2;
2178  xmm3 = xmm3 + a1 * b3;
2179  xmm4 = xmm4 + a1 * b4;
2180  xmm5 = xmm5 + a2 * b1;
2181  xmm6 = xmm6 + a2 * b2;
2182  xmm7 = xmm7 + a2 * b3;
2183  xmm8 = xmm8 + a2 * b4;
2184  }
2185
// Reduction of the accumulators and update of the target elements
2186  (~C)(i ,j ) += sum( xmm1 );
2187  (~C)(i ,j+1UL) += sum( xmm2 );
2188  (~C)(i ,j+2UL) += sum( xmm3 );
2189  (~C)(i ,j+3UL) += sum( xmm4 );
2190  (~C)(i+1UL,j ) += sum( xmm5 );
2191  (~C)(i+1UL,j+1UL) += sum( xmm6 );
2192  (~C)(i+1UL,j+2UL) += sum( xmm7 );
2193  (~C)(i+1UL,j+3UL) += sum( xmm8 );
2194
// Element-wise treatment of the k indices not covered by the vectorized loop
2195  for( ; remainder && k<kend; ++k ) {
2196  (~C)(i ,j ) += A(i ,k) * B(k,j );
2197  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2198  (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2199  (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
2200  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2201  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2202  (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2203  (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
2204  }
2205  }
2206
// 2x2 tiles (same scheme as above with two columns per tile)
2207  for( ; (j+2UL) <= N; j+=2UL )
2208  {
2209  const size_t kbegin( ( IsUpper<MT4>::value )
2210  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2211  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
2212  const size_t kend( ( IsLower<MT4>::value )
2213  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
2214  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2215
2216  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2217  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2218
2219  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2220  size_t k( kbegin );
2221
2222  for( ; k<kpos; k+=IT::size ) {
2223  const IntrinsicType a1( A.load(i ,k) );
2224  const IntrinsicType a2( A.load(i+1UL,k) );
2225  const IntrinsicType b1( B.load(k,j ) );
2226  const IntrinsicType b2( B.load(k,j+1UL) );
2227  xmm1 = xmm1 + a1 * b1;
2228  xmm2 = xmm2 + a1 * b2;
2229  xmm3 = xmm3 + a2 * b1;
2230  xmm4 = xmm4 + a2 * b2;
2231  }
2232
2233  (~C)(i ,j ) += sum( xmm1 );
2234  (~C)(i ,j+1UL) += sum( xmm2 );
2235  (~C)(i+1UL,j ) += sum( xmm3 );
2236  (~C)(i+1UL,j+1UL) += sum( xmm4 );
2237
2238  for( ; remainder && k<kend; ++k ) {
2239  (~C)(i ,j ) += A(i ,k) * B(k,j );
2240  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2241  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2242  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2243  }
2244  }
2245
// 2x1 tile for the single column left over by the loops above
2246  if( j < N )
2247  {
2248  const size_t kbegin( ( IsUpper<MT4>::value )
2249  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2250  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
// Only the structure of A is used to restrict the end of the k range for this tile
2251  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
2252
2253  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2254  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2255
2256  IntrinsicType xmm1, xmm2;
2257  size_t k( kbegin );
2258
2259  for( ; k<kpos; k+=IT::size ) {
2260  const IntrinsicType b1( B.load(k,j) );
2261  xmm1 = xmm1 + A.load(i ,k) * b1;
2262  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2263  }
2264
2265  (~C)(i ,j) += sum( xmm1 );
2266  (~C)(i+1UL,j) += sum( xmm2 );
2267
2268  for( ; remainder && k<kend; ++k ) {
2269  (~C)(i ,j) += A(i ,k) * B(k,j);
2270  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2271  }
2272  }
2273  }
// Single row left over by the two-row loop above
2274  if( i < M )
2275  {
2276  size_t j( 0UL );
2277
// 1x4 tiles
2278  for( ; (j+4UL) <= N; j+=4UL )
2279  {
2280  const size_t kbegin( ( IsUpper<MT4>::value )
2281  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2282  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
// Only the structure of B is used to restrict the end of the k range for the single-row tiles
2283  const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
2284
2285  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2286  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2287
2288  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2289  size_t k( kbegin );
2290
2291  for( ; k<kpos; k+=IT::size ) {
2292  const IntrinsicType a1( A.load(i,k) );
2293  xmm1 = xmm1 + a1 * B.load(k,j );
2294  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2295  xmm3 = xmm3 + a1 * B.load(k,j+2UL);
2296  xmm4 = xmm4 + a1 * B.load(k,j+3UL);
2297  }
2298
2299  (~C)(i,j ) += sum( xmm1 );
2300  (~C)(i,j+1UL) += sum( xmm2 );
2301  (~C)(i,j+2UL) += sum( xmm3 );
2302  (~C)(i,j+3UL) += sum( xmm4 );
2303
2304  for( ; remainder && k<kend; ++k ) {
2305  (~C)(i,j ) += A(i,k) * B(k,j );
2306  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2307  (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
2308  (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
2309  }
2310  }
2311
// 1x2 tiles
2312  for( ; (j+2UL) <= N; j+=2UL )
2313  {
2314  const size_t kbegin( ( IsUpper<MT4>::value )
2315  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2316  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
2317  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
2318
2319  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2320  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2321
2322  IntrinsicType xmm1, xmm2;
2323  size_t k( kbegin );
2324
2325  for( ; k<kpos; k+=IT::size ) {
2326  const IntrinsicType a1( A.load(i,k) );
2327  xmm1 = xmm1 + a1 * B.load(k,j );
2328  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2329  }
2330
2331  (~C)(i,j ) += sum( xmm1 );
2332  (~C)(i,j+1UL) += sum( xmm2 );
2333
2334  for( ; remainder && k<kend; ++k ) {
2335  (~C)(i,j ) += A(i,k) * B(k,j );
2336  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2337  }
2338  }
2339
// 1x1 tile for the last element of the last row; the k range always ends at K here
2340  if( j < N )
2341  {
2342  const size_t kbegin( ( IsUpper<MT4>::value )
2343  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2344  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
2345
2346  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
2347  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
2348
2349  IntrinsicType xmm1;
2350  size_t k( kbegin );
2351
2352  for( ; k<kpos; k+=IT::size ) {
2353  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2354  }
2355
2356  (~C)(i,j) += sum( xmm1 );
2357
2358  for( ; remainder && k<K; ++k ) {
2359  (~C)(i,j) += A(i,k) * B(k,j);
2360  }
2361  }
2362  }
2363  }
2365  //**********************************************************************************************
2366 
2367  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
/*!\brief Vectorized default addition assignment of a small dense matrix-transpose dense matrix
//        multiplication to a column-major dense matrix (\f$ C+=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// The target is traversed in tiles of 4x2 and 4x1 elements, followed by tiles of 2x2 and 2x1
// elements and finally by tiles of 1x2 and 1x1 elements for a trailing single row. For every
// tile the products along the k index are accumulated in IntrinsicType variables that are
// filled via A.load() and B.load(), reduced via sum() and added to the corresponding elements
// of C. If at least one operand is not padded, the k range is split into a vectorized part
// [kbegin,kpos) and an element-wise remainder [kpos,kend).
//
// NOTE(review): the accumulators (xmm1, xmm2, ...) are default constructed and never explicitly
// reset; this presumably relies on the default constructor of IntrinsicType producing zero.
// Please confirm against the definition of the intrinsic types.
*/
2382  template< typename MT3 // Type of the left-hand side target matrix
2383  , typename MT4 // Type of the left-hand side matrix operand
2384  , typename MT5 > // Type of the right-hand side matrix operand
2385  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2386  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2387  {
2388  typedef IntrinsicTrait<ElementType> IT;
2389
2390  const size_t M( A.rows() );
2391  const size_t N( B.columns() );
2392  const size_t K( A.columns() );
2393
// An element-wise clean-up of the k range is only performed if at least one operand is not padded
2394  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
2395
2396  size_t i( 0UL );
2397
// Processing of four rows of C at a time
2398  for( ; (i+4UL) <= M; i+=4UL )
2399  {
2400  size_t j( 0UL );
2401
// 4x2 tiles
2402  for( ; (j+2UL) <= N; j+=2UL )
2403  {
// Start of the k range: i for an upper A, j for a lower B (the maximum of both if both apply),
// otherwise 0. The mask size_t(-IT::size) rounds the start index down (to a multiple of
// IT::size, assuming IT::size is a power of two).
2404  const size_t kbegin( ( IsUpper<MT4>::value )
2405  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2406  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
// End of the k range: one past the last row of the tile for a lower A, one past the last column
// of the tile for an upper B (the minimum of both if both apply), otherwise K
2407  const size_t kend( ( IsLower<MT4>::value )
2408  ?( IsUpper<MT5>::value ? min( i+4UL, j+2UL ) : ( i+4UL ) )
2409  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2410
// End of the vectorized part: kend rounded down to a multiple of IT::size if a remainder loop
// is required, kend itself otherwise
2411  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2412  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2413
// One accumulator per element of the 4x2 tile
2414  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2415  size_t k( kbegin );
2416
2417  for( ; k<kpos; k+=IT::size ) {
2418  const IntrinsicType a1( A.load(i ,k) );
2419  const IntrinsicType a2( A.load(i+1UL,k) );
2420  const IntrinsicType a3( A.load(i+2UL,k) );
2421  const IntrinsicType a4( A.load(i+3UL,k) );
2422  const IntrinsicType b1( B.load(k,j ) );
2423  const IntrinsicType b2( B.load(k,j+1UL) );
2424  xmm1 = xmm1 + a1 * b1;
2425  xmm2 = xmm2 + a1 * b2;
2426  xmm3 = xmm3 + a2 * b1;
2427  xmm4 = xmm4 + a2 * b2;
2428  xmm5 = xmm5 + a3 * b1;
2429  xmm6 = xmm6 + a3 * b2;
2430  xmm7 = xmm7 + a4 * b1;
2431  xmm8 = xmm8 + a4 * b2;
2432  }
2433
// Reduction of the accumulators and update of the target elements
2434  (~C)(i ,j ) += sum( xmm1 );
2435  (~C)(i ,j+1UL) += sum( xmm2 );
2436  (~C)(i+1UL,j ) += sum( xmm3 );
2437  (~C)(i+1UL,j+1UL) += sum( xmm4 );
2438  (~C)(i+2UL,j ) += sum( xmm5 );
2439  (~C)(i+2UL,j+1UL) += sum( xmm6 );
2440  (~C)(i+3UL,j ) += sum( xmm7 );
2441  (~C)(i+3UL,j+1UL) += sum( xmm8 );
2442
// Element-wise treatment of the k indices not covered by the vectorized loop
2443  for( ; remainder && k<kend; ++k ) {
2444  (~C)(i ,j ) += A(i ,k) * B(k,j );
2445  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2446  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2447  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2448  (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2449  (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2450  (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
2451  (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
2452  }
2453  }
2454
// 4x1 tile for the single column left over by the loop above
2455  if( j < N )
2456  {
2457  const size_t kbegin( ( IsUpper<MT4>::value )
2458  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2459  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
// Only the structure of A is used to restrict the end of the k range for this tile
2460  const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
2461
2462  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2463  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2464
2465  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2466  size_t k( kbegin );
2467
2468  for( ; k<kpos; k+=IT::size ) {
2469  const IntrinsicType b1( B.load(k,j) );
2470  xmm1 = xmm1 + A.load(i ,k) * b1;
2471  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2472  xmm3 = xmm3 + A.load(i+2UL,k) * b1;
2473  xmm4 = xmm4 + A.load(i+3UL,k) * b1;
2474  }
2475
2476  (~C)(i ,j) += sum( xmm1 );
2477  (~C)(i+1UL,j) += sum( xmm2 );
2478  (~C)(i+2UL,j) += sum( xmm3 );
2479  (~C)(i+3UL,j) += sum( xmm4 );
2480
2481  for( ; remainder && k<kend; ++k ) {
2482  (~C)(i ,j) += A(i ,k) * B(k,j);
2483  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2484  (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
2485  (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
2486  }
2487  }
2488  }
2489
// Processing of two rows of C at a time for the rows left over by the four-row loop
2490  for( ; (i+2UL) <= M; i+=2UL )
2491  {
2492  size_t j( 0UL );
2493
// 2x2 tiles
2494  for( ; (j+2UL) <= N; j+=2UL )
2495  {
2496  const size_t kbegin( ( IsUpper<MT4>::value )
2497  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2498  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
2499  const size_t kend( ( IsLower<MT4>::value )
2500  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
2501  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2502
2503  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2504  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2505
2506  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2507  size_t k( kbegin );
2508
2509  for( ; k<kpos; k+=IT::size ) {
2510  const IntrinsicType a1( A.load(i ,k) );
2511  const IntrinsicType a2( A.load(i+1UL,k) );
2512  const IntrinsicType b1( B.load(k,j ) );
2513  const IntrinsicType b2( B.load(k,j+1UL) );
2514  xmm1 = xmm1 + a1 * b1;
2515  xmm2 = xmm2 + a1 * b2;
2516  xmm3 = xmm3 + a2 * b1;
2517  xmm4 = xmm4 + a2 * b2;
2518  }
2519
2520  (~C)(i ,j ) += sum( xmm1 );
2521  (~C)(i ,j+1UL) += sum( xmm2 );
2522  (~C)(i+1UL,j ) += sum( xmm3 );
2523  (~C)(i+1UL,j+1UL) += sum( xmm4 );
2524
2525  for( ; remainder && k<kend; ++k ) {
2526  (~C)(i ,j ) += A(i ,k) * B(k,j );
2527  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2528  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2529  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2530  }
2531  }
2532
// 2x1 tile for the single column left over by the loop above
2533  if( j < N )
2534  {
2535  const size_t kbegin( ( IsUpper<MT4>::value )
2536  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2537  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
2538  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
2539
2540  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2541  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2542
2543  IntrinsicType xmm1, xmm2;
2544  size_t k( kbegin );
2545
2546  for( ; k<kpos; k+=IT::size ) {
2547  const IntrinsicType b1( B.load(k,j) );
2548  xmm1 = xmm1 + A.load(i ,k) * b1;
2549  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2550  }
2551
2552  (~C)(i ,j) += sum( xmm1 );
2553  (~C)(i+1UL,j) += sum( xmm2 );
2554
2555  for( ; remainder && k<kend; ++k ) {
2556  (~C)(i ,j) += A(i ,k) * B(k,j);
2557  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2558  }
2559  }
2560  }
2561
// Single row left over by the loops above
2562  if( i < M )
2563  {
2564  size_t j( 0UL );
2565
// 1x2 tiles
2566  for( ; (j+2UL) <= N; j+=2UL )
2567  {
2568  const size_t kbegin( ( IsUpper<MT4>::value )
2569  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2570  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
// Only the structure of B is used to restrict the end of the k range for the single-row tiles
2571  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
2572
2573  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
2574  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
2575
2576  IntrinsicType xmm1, xmm2;
2577  size_t k( kbegin );
2578
2579  for( ; k<kpos; k+=IT::size ) {
2580  const IntrinsicType a1( A.load(i,k) );
2581  xmm1 = xmm1 + a1 * B.load(k,j );
2582  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2583  }
2584
2585  (~C)(i,j ) += sum( xmm1 );
2586  (~C)(i,j+1UL) += sum( xmm2 );
2587
2588  for( ; remainder && k<kend; ++k ) {
2589  (~C)(i,j ) += A(i,k) * B(k,j );
2590  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2591  }
2592  }
2593
// 1x1 tile for the last element of the last row; the k range always ends at K here
2594  if( j < N )
2595  {
2596  const size_t kbegin( ( IsUpper<MT4>::value )
2597  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
2598  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
2599
2600  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
2601  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
2602
2603  IntrinsicType xmm1;
2604  size_t k( kbegin );
2605
2606  for( ; k<kpos; k+=IT::size ) {
2607  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2608  }
2609
2610  (~C)(i,j) += sum( xmm1 );
2611
2612  for( ; remainder && k<K; ++k ) {
2613  (~C)(i,j) += A(i,k) * B(k,j);
2614  }
2615  }
2616  }
2617  }
2619  //**********************************************************************************************
2620 
2621  //**Default addition assignment to dense matrices (large matrices)******************************
/*!\brief Default addition assignment of a large dense matrix-transpose dense matrix
//        multiplication (\f$ C+=A*B \f$).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload is active whenever UseVectorizedDefaultKernel<MT3,MT4,MT5> does not apply
// (DisableIf). It does not perform any computation itself but forwards all three arguments
// unchanged to selectDefaultAddAssignKernel().
*/
2635  template< typename MT3 // Type of the left-hand side target matrix
2636  , typename MT4 // Type of the left-hand side matrix operand
2637  , typename MT5 > // Type of the right-hand side matrix operand
2638  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2639  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2640  {
2641  selectDefaultAddAssignKernel( C, A, B );
2642  }
2644  //**********************************************************************************************
2645 
2646  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
/*!\brief Vectorized default addition assignment of a large dense matrix-transpose dense matrix
//        multiplication to a row-major dense matrix (\f$ C+=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload is active whenever UseVectorizedDefaultKernel<MT3,MT4,MT5> applies. It currently
// contains no kernel of its own and forwards to selectSmallAddAssignKernel().
*/
2661  template< typename MT3 // Type of the left-hand side target matrix
2662  , typename MT4 // Type of the left-hand side matrix operand
2663  , typename MT5 > // Type of the right-hand side matrix operand
2664  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2665  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2666  {
2667  // TODO: no dedicated kernel for large matrices is implemented here; the small matrix kernel is used instead
2668  selectSmallAddAssignKernel( ~C, A, B );
2669  }
2671  //**********************************************************************************************
2672 
2673  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
/*!\brief Vectorized default addition assignment of a large dense matrix-transpose dense matrix
//        multiplication to a column-major dense matrix (\f$ C+=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload is active whenever UseVectorizedDefaultKernel<MT3,MT4,MT5> applies. It currently
// contains no kernel of its own and forwards to selectSmallAddAssignKernel().
*/
2688  template< typename MT3 // Type of the left-hand side target matrix
2689  , typename MT4 // Type of the left-hand side matrix operand
2690  , typename MT5 > // Type of the right-hand side matrix operand
2691  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2692  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2693  {
2694  // TODO: no dedicated kernel for large matrices is implemented here; the small matrix kernel is used instead
2695  selectSmallAddAssignKernel( ~C, A, B );
2696  }
2698  //**********************************************************************************************
2699 
2700  //**Default addition assignment to dense matrices***********************************************
/*!\brief Default addition assignment of a dense matrix-transpose dense matrix multiplication
//        (\f$ C+=A*B \f$).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload is active whenever UseBlasKernel<MT3,MT4,MT5> does not apply (DisableIf). It
// does not perform any computation itself but forwards all three arguments unchanged to
// selectLargeAddAssignKernel().
*/
2714  template< typename MT3 // Type of the left-hand side target matrix
2715  , typename MT4 // Type of the left-hand side matrix operand
2716  , typename MT5 > // Type of the right-hand side matrix operand
2717  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2718  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2719  {
2720  selectLargeAddAssignKernel( C, A, B );
2721  }
2723  //**********************************************************************************************
2724 
2725  //**BLAS-based addition assignment to dense matrices********************************************
2726 #if BLAZE_BLAS_MODE
2727 
2740  template< typename MT3 // Type of the left-hand side target matrix
2741  , typename MT4 // Type of the left-hand side matrix operand
2742  , typename MT5 > // Type of the right-hand side matrix operand
2743  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2744  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2745  {
2746  typedef typename MT3::ElementType ET;
2747 
2748  if( IsTriangular<MT4>::value ) {
2749  typename MT3::ResultType tmp( serial( B ) );
2750  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2751  addAssign( C, tmp );
2752  }
2753  else if( IsTriangular<MT5>::value ) {
2754  typename MT3::ResultType tmp( serial( A ) );
2755  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2756  addAssign( C, tmp );
2757  }
2758  else {
2759  gemm( C, A, B, ET(1), ET(1) );
2760  }
2761  }
2763 #endif
2764  //**********************************************************************************************
2765 
2766  //**Addition assignment to sparse matrices******************************************************
2767  // No special implementation for the addition assignment to sparse matrices.
2768  //**********************************************************************************************
2769 
2770  //**Subtraction assignment to dense matrices****************************************************
2783  template< typename MT // Type of the target dense matrix
2784  , bool SO > // Storage order of the target dense matrix
2785  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
2786  {
2788 
2789  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2790  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2791 
2792  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2793  return;
2794  }
2795 
2796  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2797  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2798 
2799  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2800  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2801  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2802  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2803  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2804  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2805 
2806  DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
2807  }
2809  //**********************************************************************************************
2810 
2811  //**Subtraction assignment to dense matrices (kernel selection)*********************************
2822  template< typename MT3 // Type of the left-hand side target matrix
2823  , typename MT4 // Type of the left-hand side matrix operand
2824  , typename MT5 > // Type of the right-hand side matrix operand
2825  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2826  {
2827  if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
2828  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
2829  selectSmallSubAssignKernel( C, A, B );
2830  else
2831  selectBlasSubAssignKernel( C, A, B );
2832  }
2834  //**********************************************************************************************
2835 
2836  //**Default subtraction assignment to row-major dense matrices (general/general)****************
/*!\brief Default subtraction assignment of a general dense matrix-transpose general dense matrix
//        multiplication to a row-major dense matrix (\f$ C-=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload is selected if neither operand type is diagonal. The target is traversed row by
// row; for every element C(i,j) the products A(i,k)*B(k,j) are subtracted one after another,
// with the k loop unrolled by a factor of two. The ranges of i, j and k are narrowed at compile
// time based on the (strictly) lower/upper properties of the operand types.
*/
2850  template< typename MT3 // Type of the left-hand side target matrix
2851  , typename MT4 // Type of the left-hand side matrix operand
2852  , typename MT5 > // Type of the right-hand side matrix operand
2853  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2854  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2855  {
2856  const size_t M( A.rows() );
2857  const size_t N( B.columns() );
2858  const size_t K( A.columns() );
2859
// Row range: a strictly lower A skips the first row (the first two rows if B is strictly lower
// as well and M > 1), a strictly upper A skips the last row (the last two rows if B is strictly
// upper as well and M > 1)
2860  const size_t ibegin( ( IsStrictlyLower<MT4>::value )
2861  ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
2862  :( 0UL ) );
2863  const size_t iend( ( IsStrictlyUpper<MT4>::value )
2864  ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
2865  :( M ) );
2866  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2867
2868  for( size_t i=ibegin; i<iend; ++i )
2869  {
// Column range of row i: it starts relative to i if both operands are upper and ends relative
// to i if both operands are lower; otherwise only a strictly upper/lower B removes the
// first/last column
2870  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
2871  ?( ( IsStrictlyUpper<MT4>::value )
2872  ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
2873  :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
2874  :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
2875  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
2876  ?( ( IsStrictlyLower<MT4>::value )
2877  ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
2878  :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
2879  :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
2880  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2881
2882  for( size_t j=jbegin; j<jend; ++j )
2883  {
// Summation range for C(i,j): the start is bounded by row i of an upper A and by column j of a
// lower B, the end by row i of a lower A and by column j of an upper B (shifted by one for the
// strictly triangular variants); without any such property the range is [0,K)
2884  const size_t kbegin( ( IsUpper<MT4>::value )
2885  ?( ( IsLower<MT5>::value )
2886  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2887  , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2888  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2889  :( ( IsLower<MT5>::value )
2890  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2891  :( 0UL ) ) );
2892  const size_t kend( ( IsLower<MT4>::value )
2893  ?( ( IsUpper<MT5>::value )
2894  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
2895  , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2896  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2897  :( ( IsUpper<MT5>::value )
2898  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2899  :( K ) ) );
2900  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
2901
// kpos marks the end of the part of [kbegin,kend) that is processed two indices at a time
// (the mask size_t(-2) clears the lowest bit of the number of indices)
2902  const size_t knum( kend - kbegin );
2903  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
2904
2905  for( size_t k=kbegin; k<kpos; k+=2UL ) {
2906  (~C)(i,j) -= A(i,k ) * B(k ,j);
2907  (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
2908  }
// Single remaining index in case of an odd number of indices
2909  if( kpos < kend ) {
2910  (~C)(i,j) -= A(i,kpos) * B(kpos,j);
2911  }
2912  }
2913  }
2914  }
2916  //**********************************************************************************************
2917 
2918  //**Default subtraction assignment to column-major dense matrices (general/general)*************
/*!\brief Default subtraction assignment of a general dense matrix-transpose general dense matrix
//        multiplication to a column-major dense matrix (\f$ C-=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload is selected if neither operand type is diagonal. The target is traversed column
// by column; for every element C(i,j) the products A(i,k)*B(k,j) are subtracted one after
// another, with the k loop unrolled by a factor of two. The ranges of j, i and k are narrowed at
// compile time based on the (strictly) lower/upper properties of the operand types.
*/
2932  template< typename MT3 // Type of the left-hand side target matrix
2933  , typename MT4 // Type of the left-hand side matrix operand
2934  , typename MT5 > // Type of the right-hand side matrix operand
2935  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2936  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2937  {
2938  const size_t M( A.rows() );
2939  const size_t N( B.columns() );
2940  const size_t K( A.columns() );
2941
// Column range: a strictly upper B skips the first column (the first two columns if A is
// strictly upper as well and N > 1), a strictly lower B skips the last column (the last two
// columns if A is strictly lower as well and N > 1)
2942  const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
2943  ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
2944  :( 0UL ) );
2945  const size_t jend( ( IsStrictlyLower<MT5>::value )
2946  ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
2947  :( N ) );
2948  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2949
2950  for( size_t j=jbegin; j<jend; ++j )
2951  {
// Row range of column j: it starts relative to j if both operands are lower and ends relative
// to j if both operands are upper; otherwise only a strictly lower/upper A removes the
// first/last row
2952  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
2953  ?( ( IsStrictlyLower<MT4>::value )
2954  ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
2955  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2956  :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
2957  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
2958  ?( ( IsStrictlyUpper<MT4>::value )
2959  ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
2960  :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
2961  :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
2962  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2963
2964  for( size_t i=ibegin; i<iend; ++i )
2965  {
// Summation range for C(i,j): the start is bounded by row i of an upper A and by column j of a
// lower B, the end by row i of a lower A and by column j of an upper B (shifted by one for the
// strictly triangular variants); without any such property the range is [0,K)
2966  const size_t kbegin( ( IsUpper<MT4>::value )
2967  ?( ( IsLower<MT5>::value )
2968  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2969  , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2970  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2971  :( ( IsLower<MT5>::value )
2972  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2973  :( 0UL ) ) );
2974  const size_t kend( ( IsLower<MT4>::value )
2975  ?( ( IsUpper<MT5>::value )
2976  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
2977  , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2978  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2979  :( ( IsUpper<MT5>::value )
2980  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2981  :( K ) ) );
2982  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
2983
// kpos marks the end of the part of [kbegin,kend) that is processed two indices at a time
// (the mask size_t(-2) clears the lowest bit of the number of indices)
2984  const size_t knum( kend - kbegin );
2985  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
2986
2987  for( size_t k=kbegin; k<kpos; k+=2UL ) {
2988  (~C)(i,j) -= A(i,k ) * B(k ,j);
2989  (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
2990  }
// Single remaining index in case of an odd number of indices
2991  if( kpos < kend ) {
2992  (~C)(i,j) -= A(i,kpos) * B(kpos,j);
2993  }
2994  }
2995  }
2996  }
2998  //**********************************************************************************************
2999 
3000  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
3014  template< typename MT3 // Type of the left-hand side target matrix
3015  , typename MT4 // Type of the left-hand side matrix operand
3016  , typename MT5 > // Type of the right-hand side matrix operand
3017  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
3018  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3019  {
3020  const size_t M( A.rows() );
3021  const size_t N( B.columns() );
3022 
3023  for( size_t i=0UL; i<M; ++i )
3024  {
3025  const size_t jbegin( ( IsUpper<MT4>::value )
3026  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
3027  :( 0UL ) );
3028  const size_t jend( ( IsLower<MT4>::value )
3029  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
3030  :( N ) );
3031  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3032 
3033  const size_t jnum( jend - jbegin );
3034  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3035 
3036  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3037  (~C)(i,j ) -= A(i,j ) * B(j ,j );
3038  (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
3039  }
3040  if( jpos < jend ) {
3041  (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos);
3042  }
3043  }
3044  }
3046  //**********************************************************************************************
3047 
3048  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
3062  template< typename MT3 // Type of the left-hand side target matrix
3063  , typename MT4 // Type of the left-hand side matrix operand
3064  , typename MT5 > // Type of the right-hand side matrix operand
3065  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
3066  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3067  {
3068  const size_t M( A.rows() );
3069  const size_t N( B.columns() );
3070 
3071  const size_t block( BLOCK_SIZE );
3072 
3073  for( size_t jj=0UL; jj<N; jj+=block ) {
3074  const size_t jend( min( N, jj+block ) );
3075  for( size_t ii=0UL; ii<M; ii+=block ) {
3076  const size_t iend( min( M, ii+block ) );
3077  for( size_t j=jj; j<jend; ++j )
3078  {
3079  const size_t ibegin( ( IsLower<MT4>::value )
3080  ?( max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
3081  :( ii ) );
3082  const size_t ipos( ( IsUpper<MT4>::value )
3083  ?( min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
3084  :( iend ) );
3085 
3086  for( size_t i=ibegin; i<ipos; ++i ) {
3087  (~C)(i,j) -= A(i,j) * B(j,j);
3088  }
3089  }
3090  }
3091  }
3092  }
3094  //**********************************************************************************************
3095 
3096  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
3110  template< typename MT3 // Type of the left-hand side target matrix
3111  , typename MT4 // Type of the left-hand side matrix operand
3112  , typename MT5 > // Type of the right-hand side matrix operand
3113  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
3114  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3115  {
3116  const size_t M( A.rows() );
3117  const size_t N( B.columns() );
3118 
3119  const size_t block( BLOCK_SIZE );
3120 
3121  for( size_t ii=0UL; ii<M; ii+=block ) {
3122  const size_t iend( min( M, ii+block ) );
3123  for( size_t jj=0UL; jj<N; jj+=block ) {
3124  const size_t jend( min( N, jj+block ) );
3125  for( size_t i=ii; i<iend; ++i )
3126  {
3127  const size_t jbegin( ( IsUpper<MT5>::value )
3128  ?( max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
3129  :( jj ) );
3130  const size_t jpos( ( IsLower<MT5>::value )
3131  ?( min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
3132  :( jend ) );
3133 
3134  for( size_t j=jbegin; j<jpos; ++j ) {
3135  (~C)(i,j) -= A(i,i) * B(i,j);
3136  }
3137  }
3138  }
3139  }
3140  }
3142  //**********************************************************************************************
3143 
3144  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
3158  template< typename MT3 // Type of the left-hand side target matrix
3159  , typename MT4 // Type of the left-hand side matrix operand
3160  , typename MT5 > // Type of the right-hand side matrix operand
3161  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
3162  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3163  {
3164  const size_t M( A.rows() );
3165  const size_t N( B.columns() );
3166 
3167  for( size_t j=0UL; j<N; ++j )
3168  {
3169  const size_t ibegin( ( IsLower<MT5>::value )
3170  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
3171  :( 0UL ) );
3172  const size_t iend( ( IsUpper<MT5>::value )
3173  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
3174  :( M ) );
3175  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3176 
3177  const size_t inum( iend - ibegin );
3178  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3179 
3180  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3181  (~C)(i ,j) -= A(i ,i ) * B(i ,j);
3182  (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
3183  }
3184  if( ipos < iend ) {
3185  (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j);
3186  }
3187  }
3188  }
3190  //**********************************************************************************************
3191 
3192  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
3206  template< typename MT3 // Type of the left-hand side target matrix
3207  , typename MT4 // Type of the left-hand side matrix operand
3208  , typename MT5 > // Type of the right-hand side matrix operand
3209  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
3210  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3211  {
3212  for( size_t i=0UL; i<A.rows(); ++i ) {
3213  C(i,i) -= A(i,i) * B(i,i);
3214  }
3215  }
3217  //**********************************************************************************************
3218 
3219  //**Default subtraction assignment to dense matrices (small matrices)***************************
3233  template< typename MT3 // Type of the left-hand side target matrix
3234  , typename MT4 // Type of the left-hand side matrix operand
3235  , typename MT5 > // Type of the right-hand side matrix operand
3236  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3237  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3238  {
3239  selectDefaultSubAssignKernel( ~C, A, B );
3240  }
3242  //**********************************************************************************************
3243 
3244  //**Default subtraction assignment to row-major dense matrices (small matrices)*****************
// Vectorized small-matrix kernel for the subtraction assignment C -= A * B
// with a row-major target (DenseMatrix<MT3,false>). It is enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// With M = A.rows(), N = B.columns() and K = A.columns(), C is updated in
// small tiles: two rows at a time against 4, then 2, then 1 column, followed
// by a single leftover row against 4, 2 and 1 column. For every tile the k
// range [kbegin,kend) is narrowed using the IsUpper/IsLower traits of MT4 and
// MT5. Products are accumulated with A.load()/B.load() in steps of IT::size,
// the result of sum() on each accumulator is subtracted from the matching
// element of C, and a scalar loop finishes the k values the vector loop did
// not reach.
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
3259  template< typename MT3 // Type of the left-hand side target matrix
3260  , typename MT4 // Type of the left-hand side matrix operand
3261  , typename MT5 > // Type of the right-hand side matrix operand
3262  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3263  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3264  {
3265  typedef IntrinsicTrait<ElementType> IT;
3266 
3267  const size_t M( A.rows() );
3268  const size_t N( B.columns() );
3269  const size_t K( A.columns() );
3270 
      // A scalar clean-up loop is required unless both operand types are padded.
3271  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
3272 
3273  size_t i( 0UL );
3274 
      // Main part: two rows of C per iteration.
3275  for( ; (i+2UL) <= M; i+=2UL )
3276  {
3277  size_t j( 0UL );
3278 
      // 2 x 4 tile.
3279  for( ; (j+4UL) <= N; j+=4UL )
3280  {
      // kbegin is masked with size_t(-IT::size), i.e. rounded down to a multiple
      // of IT::size (NOTE(review): this assumes IT::size is a power of two).
3281  const size_t kbegin( ( IsUpper<MT4>::value )
3282  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3283  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
      // kend is limited by the last row of the tile for a lower A and by the
      // last column of the tile for an upper B; otherwise it is K.
3284  const size_t kend( ( IsLower<MT4>::value )
3285  ?( IsUpper<MT5>::value ? min( i+2UL, j+4UL ) : ( i+2UL ) )
3286  :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
3287 
      // kpos is the end of the vectorized part: kend rounded down to a multiple
      // of IT::size when a remainder loop exists, kend itself otherwise.
3288  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3289  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3290 
      // NOTE(review): the accumulators are not explicitly initialised here;
      // presumably IntrinsicType default-constructs to zero -- confirm.
3291  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3292  size_t k( kbegin );
3293 
3294  for( ; k<kpos; k+=IT::size ) {
3295  const IntrinsicType a1( A.load(i ,k) );
3296  const IntrinsicType a2( A.load(i+1UL,k) );
3297  const IntrinsicType b1( B.load(k,j ) );
3298  const IntrinsicType b2( B.load(k,j+1UL) );
3299  const IntrinsicType b3( B.load(k,j+2UL) );
3300  const IntrinsicType b4( B.load(k,j+3UL) );
3301  xmm1 = xmm1 + a1 * b1;
3302  xmm2 = xmm2 + a1 * b2;
3303  xmm3 = xmm3 + a1 * b3;
3304  xmm4 = xmm4 + a1 * b4;
3305  xmm5 = xmm5 + a2 * b1;
3306  xmm6 = xmm6 + a2 * b2;
3307  xmm7 = xmm7 + a2 * b3;
3308  xmm8 = xmm8 + a2 * b4;
3309  }
3310 
      // xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
3311  (~C)(i ,j ) -= sum( xmm1 );
3312  (~C)(i ,j+1UL) -= sum( xmm2 );
3313  (~C)(i ,j+2UL) -= sum( xmm3 );
3314  (~C)(i ,j+3UL) -= sum( xmm4 );
3315  (~C)(i+1UL,j ) -= sum( xmm5 );
3316  (~C)(i+1UL,j+1UL) -= sum( xmm6 );
3317  (~C)(i+1UL,j+2UL) -= sum( xmm7 );
3318  (~C)(i+1UL,j+3UL) -= sum( xmm8 );
3319 
      // Scalar tail: continues from where the vector loop stopped up to kend.
3320  for( ; remainder && k<kend; ++k ) {
3321  (~C)(i ,j ) -= A(i ,k) * B(k,j );
3322  (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3323  (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
3324  (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
3325  (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3326  (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3327  (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
3328  (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
3329  }
3330  }
3331 
      // 2 x 2 tile for the columns the 4-wide loop left over.
3332  for( ; (j+2UL) <= N; j+=2UL )
3333  {
3334  const size_t kbegin( ( IsUpper<MT4>::value )
3335  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3336  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3337  const size_t kend( ( IsLower<MT4>::value )
3338  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
3339  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3340 
3341  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3342  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3343 
3344  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3345  size_t k( kbegin );
3346 
3347  for( ; k<kpos; k+=IT::size ) {
3348  const IntrinsicType a1( A.load(i ,k) );
3349  const IntrinsicType a2( A.load(i+1UL,k) );
3350  const IntrinsicType b1( B.load(k,j ) );
3351  const IntrinsicType b2( B.load(k,j+1UL) );
3352  xmm1 = xmm1 + a1 * b1;
3353  xmm2 = xmm2 + a1 * b2;
3354  xmm3 = xmm3 + a2 * b1;
3355  xmm4 = xmm4 + a2 * b2;
3356  }
3357 
3358  (~C)(i ,j ) -= sum( xmm1 );
3359  (~C)(i ,j+1UL) -= sum( xmm2 );
3360  (~C)(i+1UL,j ) -= sum( xmm3 );
3361  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
3362 
3363  for( ; remainder && k<kend; ++k ) {
3364  (~C)(i ,j ) -= A(i ,k) * B(k,j );
3365  (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3366  (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3367  (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3368  }
3369  }
3370 
      // 2 x 1 tile for a single remaining column; kend here depends on A only.
3371  if( j < N )
3372  {
3373  const size_t kbegin( ( IsUpper<MT4>::value )
3374  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3375  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3376  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
3377 
3378  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3379  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3380 
3381  IntrinsicType xmm1, xmm2;
3382  size_t k( kbegin );
3383 
3384  for( ; k<kpos; k+=IT::size ) {
3385  const IntrinsicType b1( B.load(k,j) );
3386  xmm1 = xmm1 + A.load(i ,k) * b1;
3387  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3388  }
3389 
3390  (~C)(i ,j) -= sum( xmm1 );
3391  (~C)(i+1UL,j) -= sum( xmm2 );
3392 
3393  for( ; remainder && k<kend; ++k ) {
3394  (~C)(i ,j) -= A(i ,k) * B(k,j);
3395  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3396  }
3397  }
3398  }
3399 
      // Single leftover row (M odd): same column tiling, one row per tile.
3400  if( i < M )
3401  {
3402  size_t j( 0UL );
3403 
      // 1 x 4 tile; kend here depends on B only.
3404  for( ; (j+4UL) <= N; j+=4UL )
3405  {
3406  const size_t kbegin( ( IsUpper<MT4>::value )
3407  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3408  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3409  const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
3410 
3411  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3412  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3413 
3414  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3415  size_t k( kbegin );
3416 
3417  for( ; k<kpos; k+=IT::size ) {
3418  const IntrinsicType a1( A.load(i,k) );
3419  xmm1 = xmm1 + a1 * B.load(k,j );
3420  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3421  xmm3 = xmm3 + a1 * B.load(k,j+2UL);
3422  xmm4 = xmm4 + a1 * B.load(k,j+3UL);
3423  }
3424 
3425  (~C)(i,j ) -= sum( xmm1 );
3426  (~C)(i,j+1UL) -= sum( xmm2 );
3427  (~C)(i,j+2UL) -= sum( xmm3 );
3428  (~C)(i,j+3UL) -= sum( xmm4 );
3429 
3430  for( ; remainder && k<kend; ++k ) {
3431  (~C)(i,j ) -= A(i,k) * B(k,j );
3432  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3433  (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL);
3434  (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL);
3435  }
3436  }
3437 
      // 1 x 2 tile.
3438  for( ; (j+2UL) <= N; j+=2UL )
3439  {
3440  const size_t kbegin( ( IsUpper<MT4>::value )
3441  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3442  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3443  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
3444 
3445  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3446  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3447 
3448  IntrinsicType xmm1, xmm2;
3449  size_t k( kbegin );
3450 
3451  for( ; k<kpos; k+=IT::size ) {
3452  const IntrinsicType a1( A.load(i,k) );
3453  xmm1 = xmm1 + a1 * B.load(k,j );
3454  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3455  }
3456 
3457  (~C)(i,j ) -= sum( xmm1 );
3458  (~C)(i,j+1UL) -= sum( xmm2 );
3459 
3460  for( ; remainder && k<kend; ++k ) {
3461  (~C)(i,j ) -= A(i,k) * B(k,j );
3462  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3463  }
3464  }
3465 
      // 1 x 1 tile (last row, last column): no kend is computed, the range
      // always runs to K.
3466  if( j < N )
3467  {
3468  const size_t kbegin( ( IsUpper<MT4>::value )
3469  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3470  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3471 
3472  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
3473  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
3474 
3475  IntrinsicType xmm1;
3476  size_t k( kbegin );
3477 
3478  for( ; k<kpos; k+=IT::size ) {
3479  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3480  }
3481 
3482  (~C)(i,j) -= sum( xmm1 );
3483 
3484  for( ; remainder && k<K; ++k ) {
3485  (~C)(i,j) -= A(i,k) * B(k,j);
3486  }
3487  }
3488  }
3489  }
3491  //**********************************************************************************************
3492 
3493  //**Default subtraction assignment to column-major dense matrices (small matrices)**************
// Vectorized small-matrix kernel for the subtraction assignment C -= A * B
// with a column-major target (DenseMatrix<MT3,true>). It is enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// With M = A.rows(), N = B.columns() and K = A.columns(), C is updated in
// small tiles: four rows at a time against 2 and then 1 column, then two rows
// at a time against 2 and 1 column, and finally a single leftover row against
// 2 and 1 column. For every tile the k range [kbegin,kend) is narrowed using
// the IsUpper/IsLower traits of MT4 and MT5. Products are accumulated with
// A.load()/B.load() in steps of IT::size, the result of sum() on each
// accumulator is subtracted from the matching element of C, and a scalar loop
// finishes the k values the vector loop did not reach.
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
3508  template< typename MT3 // Type of the left-hand side target matrix
3509  , typename MT4 // Type of the left-hand side matrix operand
3510  , typename MT5 > // Type of the right-hand side matrix operand
3511  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3512  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3513  {
3514  typedef IntrinsicTrait<ElementType> IT;
3515 
3516  const size_t M( A.rows() );
3517  const size_t N( B.columns() );
3518  const size_t K( A.columns() );
3519 
      // A scalar clean-up loop is required unless both operand types are padded.
3520  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
3521 
3522  size_t i( 0UL );
3523 
      // Main part: four rows of C per iteration.
3524  for( ; (i+4UL) <= M; i+=4UL )
3525  {
3526  size_t j( 0UL );
3527 
      // 4 x 2 tile.
3528  for( ; (j+2UL) <= N; j+=2UL )
3529  {
      // kbegin is masked with size_t(-IT::size), i.e. rounded down to a multiple
      // of IT::size (NOTE(review): this assumes IT::size is a power of two).
3530  const size_t kbegin( ( IsUpper<MT4>::value )
3531  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3532  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
      // kend is limited by the last row of the tile for a lower A and by the
      // last column of the tile for an upper B; otherwise it is K.
3533  const size_t kend( ( IsLower<MT4>::value )
3534  ?( IsUpper<MT5>::value ? min( i+4UL, j+2UL ) : ( i+4UL ) )
3535  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3536 
      // kpos is the end of the vectorized part: kend rounded down to a multiple
      // of IT::size when a remainder loop exists, kend itself otherwise.
3537  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3538  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3539 
      // NOTE(review): the accumulators are not explicitly initialised here;
      // presumably IntrinsicType default-constructs to zero -- confirm.
3540  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3541  size_t k( kbegin );
3542 
3543  for( ; k<kpos; k+=IT::size ) {
3544  const IntrinsicType a1( A.load(i ,k) );
3545  const IntrinsicType a2( A.load(i+1UL,k) );
3546  const IntrinsicType a3( A.load(i+2UL,k) );
3547  const IntrinsicType a4( A.load(i+3UL,k) );
3548  const IntrinsicType b1( B.load(k,j ) );
3549  const IntrinsicType b2( B.load(k,j+1UL) );
3550  xmm1 = xmm1 + a1 * b1;
3551  xmm2 = xmm2 + a1 * b2;
3552  xmm3 = xmm3 + a2 * b1;
3553  xmm4 = xmm4 + a2 * b2;
3554  xmm5 = xmm5 + a3 * b1;
3555  xmm6 = xmm6 + a3 * b2;
3556  xmm7 = xmm7 + a4 * b1;
3557  xmm8 = xmm8 + a4 * b2;
3558  }
3559 
      // Accumulators come in pairs per row: (xmm1,xmm2) for row i up to
      // (xmm7,xmm8) for row i+3.
3560  (~C)(i ,j ) -= sum( xmm1 );
3561  (~C)(i ,j+1UL) -= sum( xmm2 );
3562  (~C)(i+1UL,j ) -= sum( xmm3 );
3563  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
3564  (~C)(i+2UL,j ) -= sum( xmm5 );
3565  (~C)(i+2UL,j+1UL) -= sum( xmm6 );
3566  (~C)(i+3UL,j ) -= sum( xmm7 );
3567  (~C)(i+3UL,j+1UL) -= sum( xmm8 );
3568 
      // Scalar tail: continues from where the vector loop stopped up to kend.
3569  for( ; remainder && k<kend; ++k ) {
3570  (~C)(i ,j ) -= A(i ,k) * B(k,j );
3571  (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3572  (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3573  (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3574  (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3575  (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
3576  (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3577  (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
3578  }
3579  }
3580 
      // 4 x 1 tile for a single remaining column; kend here depends on A only.
3581  if( j < N )
3582  {
3583  const size_t kbegin( ( IsUpper<MT4>::value )
3584  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3585  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3586  const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
3587 
3588  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3589  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3590 
3591  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3592  size_t k( kbegin );
3593 
3594  for( ; k<kpos; k+=IT::size ) {
3595  const IntrinsicType b1( B.load(k,j) );
3596  xmm1 = xmm1 + A.load(i ,k) * b1;
3597  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3598  xmm3 = xmm3 + A.load(i+2UL,k) * b1;
3599  xmm4 = xmm4 + A.load(i+3UL,k) * b1;
3600  }
3601 
3602  (~C)(i ,j) -= sum( xmm1 );
3603  (~C)(i+1UL,j) -= sum( xmm2 );
3604  (~C)(i+2UL,j) -= sum( xmm3 );
3605  (~C)(i+3UL,j) -= sum( xmm4 );
3606 
3607  for( ; remainder && k<kend; ++k ) {
3608  (~C)(i ,j ) -= A(i ,k) * B(k,j );
3609  (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3610  (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3611  (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3612  }
3613  }
3614  }
3615 
      // Rows left over by the 4-row loop, taken two at a time.
3616  for( ; (i+2UL) <= M; i+=2UL )
3617  {
3618  size_t j( 0UL );
3619 
      // 2 x 2 tile.
3620  for( ; (j+2UL) <= N; j+=2UL )
3621  {
3622  const size_t kbegin( ( IsUpper<MT4>::value )
3623  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3624  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3625  const size_t kend( ( IsLower<MT4>::value )
3626  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
3627  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3628 
3629  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3630  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3631 
3632  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3633  size_t k( kbegin );
3634 
3635  for( ; k<kpos; k+=IT::size ) {
3636  const IntrinsicType a1( A.load(i ,k) );
3637  const IntrinsicType a2( A.load(i+1UL,k) );
3638  const IntrinsicType b1( B.load(k,j ) );
3639  const IntrinsicType b2( B.load(k,j+1UL) );
3640  xmm1 = xmm1 + a1 * b1;
3641  xmm2 = xmm2 + a1 * b2;
3642  xmm3 = xmm3 + a2 * b1;
3643  xmm4 = xmm4 + a2 * b2;
3644  }
3645 
3646  (~C)(i ,j ) -= sum( xmm1 );
3647  (~C)(i ,j+1UL) -= sum( xmm2 );
3648  (~C)(i+1UL,j ) -= sum( xmm3 );
3649  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
3650 
3651  for( ; remainder && k<kend; ++k ) {
3652  (~C)(i ,j ) -= A(i ,k) * B(k,j );
3653  (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3654  (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3655  (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3656  }
3657  }
3658 
      // 2 x 1 tile for a single remaining column.
3659  if( j < N )
3660  {
3661  const size_t kbegin( ( IsUpper<MT4>::value )
3662  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3663  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3664  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
3665 
3666  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3667  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3668 
3669  IntrinsicType xmm1, xmm2;
3670  size_t k( kbegin );
3671 
3672  for( ; k<kpos; k+=IT::size ) {
3673  const IntrinsicType b1( B.load(k,j) );
3674  xmm1 = xmm1 + A.load(i ,k) * b1;
3675  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3676  }
3677 
3678  (~C)(i ,j) -= sum( xmm1 );
3679  (~C)(i+1UL,j) -= sum( xmm2 );
3680 
3681  for( ; remainder && k<kend; ++k ) {
3682  (~C)(i ,j) -= A(i ,k) * B(k,j);
3683  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3684  }
3685  }
3686  }
      // Single leftover row (M odd).
3687  if( i < M )
3688  {
3689  size_t j( 0UL );
3690 
      // 1 x 2 tile; kend here depends on B only.
3691  for( ; (j+2UL) <= N; j+=2UL )
3692  {
3693  const size_t kbegin( ( IsUpper<MT4>::value )
3694  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3695  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3696  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
3697 
3698  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
3699  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
3700 
3701  IntrinsicType xmm1, xmm2;
3702  size_t k( kbegin );
3703 
3704  for( ; k<kpos; k+=IT::size ) {
3705  const IntrinsicType a1( A.load(i,k) );
3706  xmm1 = xmm1 + a1 * B.load(k,j );
3707  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3708  }
3709 
3710  (~C)(i,j ) -= sum( xmm1 );
3711  (~C)(i,j+1UL) -= sum( xmm2 );
3712 
3713  for( ; remainder && k<kend; ++k ) {
3714  (~C)(i,j ) -= A(i,k) * B(k,j );
3715  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3716  }
3717  }
3718 
      // 1 x 1 tile (last row, last column): no kend is computed, the range
      // always runs to K.
3719  if( j < N )
3720  {
3721  const size_t kbegin( ( IsUpper<MT4>::value )
3722  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
3723  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
3724 
3725  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
3726  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
3727 
3728  IntrinsicType xmm1;
3729  size_t k( kbegin );
3730 
3731  for( ; k<kpos; k+=IT::size ) {
3732  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3733  }
3734 
3735  (~C)(i,j) -= sum( xmm1 );
3736 
3737  for( ; remainder && k<K; ++k ) {
3738  (~C)(i,j) -= A(i,k) * B(k,j);
3739  }
3740  }
3741  }
3742  }
3744  //**********************************************************************************************
3745 
3746  //**Default subtraction assignment to dense matrices (large matrices)***************************
3760  template< typename MT3 // Type of the left-hand side target matrix
3761  , typename MT4 // Type of the left-hand side matrix operand
3762  , typename MT5 > // Type of the right-hand side matrix operand
3763  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3764  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3765  {
3766  selectDefaultSubAssignKernel( ~C, A, B );
3767  }
3769  //**********************************************************************************************
3770 
3771  //**Default subtraction assignment to row-major dense matrices (large matrices)*****************
3786  template< typename MT3 // Type of the left-hand side target matrix
3787  , typename MT4 // Type of the left-hand side matrix operand
3788  , typename MT5 > // Type of the right-hand side matrix operand
3789  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3790  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3791  {
3792  // TODO
3793  selectSmallSubAssignKernel( ~C, A, B );
3794  }
3796  //**********************************************************************************************
3797 
3798  //**Default subtraction assignment to column-major dense matrices (large matrices)**************
3813  template< typename MT3 // Type of the left-hand side target matrix
3814  , typename MT4 // Type of the left-hand side matrix operand
3815  , typename MT5 > // Type of the right-hand side matrix operand
3816  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3817  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3818  {
3819  // TODO
3820  selectSmallSubAssignKernel( ~C, A, B );
3821  }
3823  //**********************************************************************************************
3824 
3825  //**Default subtraction assignment to dense matrices********************************************
3839  template< typename MT3 // Type of the left-hand side target matrix
3840  , typename MT4 // Type of the left-hand side matrix operand
3841  , typename MT5 > // Type of the right-hand side matrix operand
3842  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3843  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3844  {
3845  selectLargeSubAssignKernel( C, A, B );
3846  }
3848  //**********************************************************************************************
3849 
3850  //**BLAS-based subtraction assignment to dense matrices*****************************************
3851 #if BLAZE_BLAS_MODE
3852 
3865  template< typename MT3 // Type of the left-hand side target matrix
3866  , typename MT4 // Type of the left-hand side matrix operand
3867  , typename MT5 > // Type of the right-hand side matrix operand
3868  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3869  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3870  {
3871  typedef typename MT3::ElementType ET;
3872 
3873  if( IsTriangular<MT4>::value ) {
3874  typename MT3::ResultType tmp( serial( B ) );
3875  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3876  subAssign( C, tmp );
3877  }
3878  else if( IsTriangular<MT5>::value ) {
3879  typename MT3::ResultType tmp( serial( A ) );
3880  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3881  subAssign( C, tmp );
3882  }
3883  else {
3884  gemm( C, A, B, ET(-1), ET(1) );
3885  }
3886  }
3888 #endif
3889  //**********************************************************************************************
3890 
3891  //**Subtraction assignment to sparse matrices***************************************************
3892  // No special implementation for the subtraction assignment to sparse matrices.
3893  //**********************************************************************************************
3894 
3895  //**Multiplication assignment to dense matrices*************************************************
3896  // No special implementation for the multiplication assignment to dense matrices.
3897  //**********************************************************************************************
3898 
3899  //**Multiplication assignment to sparse matrices************************************************
3900  // No special implementation for the multiplication assignment to sparse matrices.
3901  //**********************************************************************************************
3902 
3903  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a DMatTDMatMultExpr to a dense matrix. The overload is
// enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// lhs: target dense matrix, rhs: multiplication expression to be assigned.
// An empty target returns immediately; a zero inner dimension
// (rhs.lhs_.columns() == 0) resets the target. Otherwise both operands are
// evaluated into LT/RT objects and the product of the evaluated operands is
// passed on to smpAssign().
// NOTE(review): this listing skips numbered source line 3923 inside the body;
// its content is not visible here.
3918  template< typename MT // Type of the target dense matrix
3919  , bool SO > // Storage order of the target dense matrix
3920  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3921  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
3922  {
3924 
3925  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3926  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3927 
      // Nothing to do for an empty target; with a zero inner dimension the
      // target is reset instead.
3928  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
3929  return;
3930  }
3931  else if( rhs.lhs_.columns() == 0UL ) {
3932  reset( ~lhs );
3933  return;
3934  }
3935 
3936  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3937  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3938 
3939  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3940  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3941  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3942  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3943  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3944  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3945 
      // Re-dispatch on the product of the evaluated operands.
3946  smpAssign( ~lhs, A * B );
3947  }
3949  //**********************************************************************************************
3950 
3951  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a DMatTDMatMultExpr to a sparse matrix. The overload is
// enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// lhs: target sparse matrix, rhs: multiplication expression to be assigned.
// The expression is first evaluated into a temporary whose type is selected by
// the storage order flag SO of the target (OppositeType if SO is true,
// ResultType otherwise); that temporary is then passed on to smpAssign().
// NOTE(review): this listing skips numbered source lines 3971 and 3975-3980
// inside the body; their content is not visible here.
3966  template< typename MT // Type of the target sparse matrix
3967  , bool SO > // Storage order of the target sparse matrix
3968  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3969  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
3970  {
3972 
3973  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
3974 
3981 
3982  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3983  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3984 
      // Evaluate the product into the temporary, then assign the temporary.
3985  const TmpType tmp( rhs );
3986  smpAssign( ~lhs, tmp );
3987  }
3989  //**********************************************************************************************
3990 
3991  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a DMatTDMatMultExpr to a dense matrix. The
// overload is enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// lhs: target dense matrix, rhs: multiplication expression to be added.
// The function returns without touching the target if the target is empty or
// the inner dimension (rhs.lhs_.columns()) is zero. Otherwise both operands
// are evaluated into LT/RT objects and the product of the evaluated operands
// is passed on to smpAddAssign().
// NOTE(review): this listing skips numbered source line 4012 inside the body;
// its content is not visible here.
4007  template< typename MT // Type of the target dense matrix
4008  , bool SO > // Storage order of the target dense matrix
4009  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4010  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4011  {
4013 
4014  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4015  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4016 
      // Early exit: empty target or zero inner dimension.
4017  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4018  return;
4019  }
4020 
4021  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4022  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4023 
4024  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4025  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4026  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4027  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4028  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4029  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4030 
      // Re-dispatch on the product of the evaluated operands.
4031  smpAddAssign( ~lhs, A * B );
4032  }
4034  //**********************************************************************************************
4035 
4036  //**SMP addition assignment to sparse matrices**************************************************
4037  // No special implementation for the SMP addition assignment to sparse matrices.
4038  //**********************************************************************************************
4039 
4040  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a DMatTDMatMultExpr to a dense matrix. The
// overload is enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// lhs: target dense matrix, rhs: multiplication expression to be subtracted.
// The function returns without touching the target if the target is empty or
// the inner dimension (rhs.lhs_.columns()) is zero. Otherwise both operands
// are evaluated into LT/RT objects and the product of the evaluated operands
// is passed on to smpSubAssign().
// NOTE(review): this listing skips numbered source line 4061 inside the body;
// its content is not visible here.
4056  template< typename MT // Type of the target dense matrix
4057  , bool SO > // Storage order of the target dense matrix
4058  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4059  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4060  {
4062 
4063  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4064  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4065 
      // Early exit: empty target or zero inner dimension.
4066  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4067  return;
4068  }
4069 
4070  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4071  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4072 
4073  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4074  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4075  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4076  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4077  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4078  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4079 
      // Re-dispatch on the product of the evaluated operands.
4080  smpSubAssign( ~lhs, A * B );
4081  }
4083  //**********************************************************************************************
4084 
4085  //**SMP subtraction assignment to sparse matrices***********************************************
4086  // No special implementation for the SMP subtraction assignment to sparse matrices.
4087  //**********************************************************************************************
4088 
4089  //**SMP multiplication assignment to dense matrices*********************************************
4090  // No special implementation for the SMP multiplication assignment to dense matrices.
4091  //**********************************************************************************************
4092 
4093  //**SMP multiplication assignment to sparse matrices********************************************
4094  // No special implementation for the SMP multiplication assignment to sparse matrices.
4095  //**********************************************************************************************
4096 
4097  //**Compile time checks*************************************************************************
4105  //**********************************************************************************************
4106 };
4107 //*************************************************************************************************
4108 
4109 
4110 
4111 
4112 //=================================================================================================
4113 //
4114 // DMATSCALARMULTEXPR SPECIALIZATION
4115 //
4116 //=================================================================================================
4117 
4118 //*************************************************************************************************
4126 template< typename MT1 // Type of the left-hand side dense matrix
4127  , typename MT2 // Type of the right-hand side dense matrix
4128  , typename ST > // Type of the right-hand side scalar value
4129 class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >
4130  : public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
4131  , private MatScalarMultExpr
4132  , private Computation
4133 {
4134  private:
4135  //**Type definitions****************************************************************************
// Internal shorthand names for the wrapped multiplication expression and its operand types.
4136  typedef DMatTDMatMultExpr<MT1,MT2> MMM; //!< Type of the wrapped dense matrix multiplication expression.
4137  typedef typename MMM::ResultType RES; //!< Result type of the wrapped multiplication expression.
4138  typedef typename MT1::ResultType RT1; //!< Result type of the left-hand side matrix operand.
4139  typedef typename MT2::ResultType RT2; //!< Result type of the right-hand side matrix operand.
4140  typedef typename RT1::ElementType ET1; //!< Element type of RT1.
4141  typedef typename RT2::ElementType ET2; //!< Element type of RT2.
4142  typedef typename MT1::CompositeType CT1; //!< Composite type of the left-hand side matrix operand.
4143  typedef typename MT2::CompositeType CT2; //!< Composite type of the right-hand side matrix operand.
4144  //**********************************************************************************************
4145 
4146  //**********************************************************************************************
// Compile-time flag for the left-hand side operand: set when MT1 is a computation expression or
// requires an intermediate evaluation. It selects the type LT (const RT1 instead of CT1) and
// disables smpAssignable.
4148  enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
4149  //**********************************************************************************************
4150 
4151  //**********************************************************************************************
// Compile-time flag for the right-hand side operand: set when MT2 is a computation expression or
// requires an intermediate evaluation. It selects the type RT (const RT2 instead of CT2) and
// disables smpAssignable.
4153  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
4154  //**********************************************************************************************
4155 
4156  //**********************************************************************************************
4158 
// Compile-time helper: 'value' is non-zero when at least one of the two matrix operands has to be
// evaluated (evaluateLeft or evaluateRight is set). The template parameters T1, T2 and T3 do not
// enter the computation of 'value' in this definition.
4161  template< typename T1, typename T2, typename T3 >
4162  struct IsEvaluationRequired {
4163  enum { value = ( evaluateLeft || evaluateRight ) };
4164  };
4165  //**********************************************************************************************
4166 
4167  //**********************************************************************************************
4169 
// Compile-time helper deciding whether a BLAS kernel may be used for the given type combination.
// NOTE(review): presumably T1 is the target matrix, T2/T3 are the two matrix operands and T4 is
// the scalar type, mirroring the use of UseVectorizedDefaultKernel - confirm at the use site.
// 'value' is set only if all of the following hold:
//  - BLAZE_BLAS_MODE is enabled,
//  - T1 offers mutable data access and T2/T3 offer constant data access,
//  - neither T2 nor T3 is a diagonal matrix type,
//  - T1, T2 and T3 are all vectorizable,
//  - the element types of T1, T2 and T3 are BLAS compatible and identical,
//  - it is not the case that T1 has a built-in element type while T4 is a complex type.
4171  template< typename T1, typename T2, typename T3, typename T4 >
4172  struct UseBlasKernel {
4173  enum { value = BLAZE_BLAS_MODE &&
4174  HasMutableDataAccess<T1>::value &&
4175  HasConstDataAccess<T2>::value &&
4176  HasConstDataAccess<T3>::value &&
4177  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4178  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4179  IsBlasCompatible<typename T1::ElementType>::value &&
4180  IsBlasCompatible<typename T2::ElementType>::value &&
4181  IsBlasCompatible<typename T3::ElementType>::value &&
4182  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
4183  IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
4184  !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
4185  };
4186  //**********************************************************************************************
4187 
4188  //**********************************************************************************************
4190 
// Compile-time helper deciding whether the vectorized default kernel may be used. It is
// instantiated by the selectSmallAssignKernel() overloads with T1 = target matrix type,
// T2/T3 = left/right matrix operand types and T4 = scalar type.
// 'value' is set only if all of the following hold:
//  - useOptimizedKernels is enabled,
//  - neither T2 nor T3 is a diagonal matrix type,
//  - T1, T2 and T3 are all vectorizable,
//  - the element types of T1, T2, T3 and the scalar type T4 are all the same type,
//  - IntrinsicTrait reports addition and multiplication support for that element type.
4192  template< typename T1, typename T2, typename T3, typename T4 >
4193  struct UseVectorizedDefaultKernel {
4194  enum { value = useOptimizedKernels &&
4195  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4196  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4197  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
4198  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
4199  IsSame<typename T1::ElementType,T4>::value &&
4200  IntrinsicTrait<typename T1::ElementType>::addition &&
4201  IntrinsicTrait<typename T1::ElementType>::multiplication };
4202  };
4203  //**********************************************************************************************
4204 
4205  public:
4206  //**Type definitions****************************************************************************
// Public type interface of the scaled matrix multiplication expression.
4207  typedef DMatScalarMultExpr<MMM,ST,false> This; //!< Type of this expression specialization.
4208  typedef typename MultTrait<RES,ST>::Type ResultType; //!< Result type: MultTrait of the product result type and the scalar type.
4209  typedef typename ResultType::OppositeType OppositeType; //!< OppositeType as defined by ResultType.
4210  typedef typename ResultType::TransposeType TransposeType; //!< TransposeType as defined by ResultType.
4211  typedef typename ResultType::ElementType ElementType; //!< Element type of ResultType.
4212  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType; //!< Intrinsic (SIMD) type associated with ElementType.
4213  typedef const ElementType ReturnType; //!< Return type of operator() and at(): elements are returned by value.
4214  typedef const ResultType CompositeType; //!< Type used when this expression is an operand of another expression.
4215 
 //! Type of the left-hand side operand: the wrapped dense matrix multiplication expression.
4217  typedef const DMatTDMatMultExpr<MT1,MT2> LeftOperand;
4218 
 //! Type of the right-hand side operand: the scalar value.
4220  typedef ST RightOperand;
4221 
 //! Type used for the left matrix operand inside the kernels: const RT1 if evaluateLeft is set, CT1 otherwise.
4223  typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type LT;
4224 
 //! Type used for the right matrix operand inside the kernels: const RT2 if evaluateRight is set, CT2 otherwise.
4226  typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type RT;
4227  //**********************************************************************************************
4228 
4229  //**Compilation flags***************************************************************************
// Compilation flag for vectorization: set only if neither matrix operand type is diagonal, both
// operand types are vectorizable, both element types and the scalar type are the same type, and
// IntrinsicTrait reports addition and multiplication support for that element type.
4231  enum { vectorizable = !IsDiagonal<MT1>::value && !IsDiagonal<MT2>::value &&
4232  MT1::vectorizable && MT2::vectorizable &&
4233  IsSame<ET1,ET2>::value &&
4234  IsSame<ET1,ST>::value &&
4235  IntrinsicTrait<ET1>::addition &&
4236  IntrinsicTrait<ET1>::multiplication };
4237 
 // Compilation flag for SMP assignment: set only if neither operand has to be evaluated
 // (evaluateLeft/evaluateRight are both unset) and both operand types are SMP-assignable.
4239  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4240  !evaluateRight && MT2::smpAssignable };
4241  //**********************************************************************************************
4242 
4243  //**Constructor*********************************************************************************
// Constructor: stores the given matrix multiplication expression and the scalar factor.
//  matrix  The dense matrix multiplication expression (left-hand side operand).
//  scalar  The scalar value the product is multiplied with (right-hand side operand).
4249  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4250  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4251  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4252  {}
4253  //**********************************************************************************************
4254 
4255  //**Access operator*****************************************************************************
// Element access without run-time bounds checking: returns element (i,j) of the wrapped matrix
// product multiplied by the stored scalar. The indices are only verified via internal assertions.
//  i  Row access index; asserted to be smaller than the number of rows.
//  j  Column access index; asserted to be smaller than the number of columns.
4262  inline ReturnType operator()( size_t i, size_t j ) const {
4263  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4264  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4265  return matrix_(i,j) * scalar_;
4266  }
4267  //**********************************************************************************************
4268 
4269  //**At function*********************************************************************************
4277  inline ReturnType at( size_t i, size_t j ) const {
4278  if( i >= matrix_.rows() ) {
4279  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4280  }
4281  if( j >= matrix_.columns() ) {
4282  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4283  }
4284  return (*this)(i,j);
4285  }
4286  //**********************************************************************************************
4287 
4288  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, i.e. the number of rows of the wrapped product.
4293  inline size_t rows() const {
4294  return matrix_.rows();
4295  }
4296  //**********************************************************************************************
4297 
4298  //**Columns function****************************************************************************
// Returns the number of columns of the expression, i.e. the number of columns of the wrapped product.
4303  inline size_t columns() const {
4304  return matrix_.columns();
4305  }
4306  //**********************************************************************************************
4307 
4308  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the stored dense matrix multiplication expression.
4313  inline LeftOperand leftOperand() const {
4314  return matrix_;
4315  }
4316  //**********************************************************************************************
4317 
4318  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the stored scalar value.
4323  inline RightOperand rightOperand() const {
4324  return scalar_;
4325  }
4326  //**********************************************************************************************
4327 
4328  //**********************************************************************************************
// Returns whether the expression can alias with the given address. The query is forwarded
// unchanged to the wrapped matrix multiplication expression.
//  alias  The address to be checked.
4334  template< typename T >
4335  inline bool canAlias( const T* alias ) const {
4336  return matrix_.canAlias( alias );
4337  }
4338  //**********************************************************************************************
4339 
4340  //**********************************************************************************************
// Returns whether the expression is aliased with the given address. The query is forwarded
// unchanged to the wrapped matrix multiplication expression.
//  alias  The address to be checked.
4346  template< typename T >
4347  inline bool isAliased( const T* alias ) const {
4348  return matrix_.isAliased( alias );
4349  }
4350  //**********************************************************************************************
4351 
4352  //**********************************************************************************************
// Returns the alignment state reported by the wrapped matrix multiplication expression.
4357  inline bool isAligned() const {
4358  return matrix_.isAligned();
4359  }
4360  //**********************************************************************************************
4361 
4362  //**********************************************************************************************
4367  inline bool canSMPAssign() const {
4368  typename MMM::LeftOperand A( matrix_.leftOperand() );
4369  return ( !BLAZE_BLAS_IS_PARALLEL ||
4370  ( rows() * columns() < DMATTDMATMULT_THRESHOLD ) ) &&
4371  ( A.rows() > SMP_DMATTDMATMULT_THRESHOLD );
4372  }
4373  //**********************************************************************************************
4374 
4375  private:
4376  //**Member variables****************************************************************************
4377  LeftOperand matrix_; //!< Left-hand side operand: the dense matrix multiplication expression.
4378  RightOperand scalar_; //!< Right-hand side operand: the scalar of the multiplication expression.
4379  //**********************************************************************************************
4380 
4381  //**Assignment to dense matrices****************************************************************
4393  template< typename MT // Type of the target dense matrix
4394  , bool SO > // Storage order of the target dense matrix
4395  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4396  {
4398 
4399  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4400  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4401 
4402  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4403  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4404 
4405  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4406  return;
4407  }
4408  else if( left.columns() == 0UL ) {
4409  reset( ~lhs );
4410  return;
4411  }
4412 
4413  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4414  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4415 
4416  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4417  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4418  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4419  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4420  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4421  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4422 
4423  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4424  }
4425  //**********************************************************************************************
4426 
4427  //**Assignment to dense matrices (kernel selection)*********************************************
4438  template< typename MT3 // Type of the left-hand side target matrix
4439  , typename MT4 // Type of the left-hand side matrix operand
4440  , typename MT5 // Type of the right-hand side matrix operand
4441  , typename ST2 > // Type of the scalar value
4442  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4443  {
4444  if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
4445  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4446  selectSmallAssignKernel( C, A, B, scalar );
4447  else
4448  selectBlasAssignKernel( C, A, B, scalar );
4449  }
4450  //**********************************************************************************************
4451 
4452  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (non-vectorized) assignment kernel computing C = ( A * B ) * scalar for a row-major
// target matrix C. This overload is enabled only if neither MT4 nor MT5 is a diagonal matrix type.
//  C       The target left-hand side dense matrix (row-major).
//  A       The left-hand side multiplication operand.
//  B       The right-hand side multiplication operand.
//  scalar  The scaling factor applied to every computed element.
// All index ranges are derived at compile time from the IsLower/IsUpper/IsStrictlyLower/
// IsStrictlyUpper traits of the operand types; every element of C outside the computed ranges is
// reset, every element inside is computed as a scaled dot product.
4466  template< typename MT3 // Type of the left-hand side target matrix
4467  , typename MT4 // Type of the left-hand side matrix operand
4468  , typename MT5 // Type of the right-hand side matrix operand
4469  , typename ST2 > // Type of the scalar value
4470  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4471  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4472  {
4473  const size_t M( A.rows() );
4474  const size_t N( B.columns() );
4475  const size_t K( A.columns() );
4476 
 // Row range [ibegin,iend) that is actually computed. Leading rows are skipped for a strictly
 // lower A (two rows if B is strictly lower as well and M > 1), trailing rows for a strictly
 // upper A (two rows if B is strictly upper as well and M > 1).
4477  const size_t ibegin( ( IsStrictlyLower<MT4>::value )
4478  ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
4479  :( 0UL ) );
4480  const size_t iend( ( IsStrictlyUpper<MT4>::value )
4481  ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
4482  :( M ) );
4483  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4484 
 // Rows in front of the computed row range are reset completely
4485  for( size_t i=0UL; i<ibegin; ++i ) {
4486  for( size_t j=0UL; j<N; ++j ) {
4487  reset( (~C)(i,j) );
4488  }
4489  }
4490  for( size_t i=ibegin; i<iend; ++i )
4491  {
 // Column range [jbegin,jend) computed in row i; it depends on i only if both operands
 // are upper (jbegin) respectively both are lower (jend)
4492  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4493  ?( ( IsStrictlyUpper<MT4>::value )
4494  ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
4495  :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
4496  :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
4497  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
4498  ?( ( IsStrictlyLower<MT4>::value )
4499  ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
4500  :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
4501  :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
4502  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4503 
 // Elements of row i in front of the computed column range are reset
4504  for( size_t j=0UL; j<jbegin; ++j ) {
4505  reset( (~C)(i,j) );
4506  }
4507  for( size_t j=jbegin; j<jend; ++j )
4508  {
 // Summation range [kbegin,kend) of the inner product for element (i,j)
4509  const size_t kbegin( ( IsUpper<MT4>::value )
4510  ?( ( IsLower<MT5>::value )
4511  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4512  , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4513  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4514  :( ( IsLower<MT5>::value )
4515  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4516  :( 0UL ) ) );
4517  const size_t kend( ( IsLower<MT4>::value )
4518  ?( ( IsUpper<MT5>::value )
4519  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
4520  , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4521  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4522  :( ( IsUpper<MT5>::value )
4523  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4524  :( K ) ) );
 // The range must not be empty: the first product below is read unconditionally at kbegin
4525  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
4526 
 // Initialize with the first product, accumulate the remaining ones, scale once at the end
4527  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4528  for( size_t k=kbegin+1UL; k<kend; ++k ) {
4529  (~C)(i,j) += A(i,k) * B(k,j);
4530  }
4531  (~C)(i,j) *= scalar;
4532  }
 // Elements of row i behind the computed column range are reset
4533  for( size_t j=jend; j<N; ++j ) {
4534  reset( (~C)(i,j) );
4535  }
4536  }
 // Rows behind the computed row range are reset completely
4537  for( size_t i=iend; i<M; ++i ) {
4538  for( size_t j=0UL; j<N; ++j ) {
4539  reset( (~C)(i,j) );
4540  }
4541  }
4542  }
4543  //**********************************************************************************************
4544 
4545  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (non-vectorized) assignment kernel computing C = ( A * B ) * scalar for a column-major
// target matrix C. This overload is enabled only if neither MT4 nor MT5 is a diagonal matrix type.
//  C       The target left-hand side dense matrix (column-major).
//  A       The left-hand side multiplication operand.
//  B       The right-hand side multiplication operand.
//  scalar  The scaling factor applied to every computed element.
// The target is traversed column by column. All index ranges are derived at compile time from
// the IsLower/IsUpper/IsStrictlyLower/IsStrictlyUpper traits of the operand types; every element
// of C outside the computed ranges is reset, every element inside is computed as a scaled dot
// product.
4559  template< typename MT3 // Type of the left-hand side target matrix
4560  , typename MT4 // Type of the left-hand side matrix operand
4561  , typename MT5 // Type of the right-hand side matrix operand
4562  , typename ST2 > // Type of the scalar value
4563  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4564  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
4565  {
4566  const size_t M( A.rows() );
4567  const size_t N( B.columns() );
4568  const size_t K( A.columns() );
4569 
 // Column range [jbegin,jend) that is actually computed. Leading columns are skipped for a
 // strictly upper B (two columns if A is strictly upper as well and N > 1), trailing columns
 // for a strictly lower B (two columns if A is strictly lower as well and N > 1).
4570  const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
4571  ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
4572  :( 0UL ) );
4573  const size_t jend( ( IsStrictlyLower<MT5>::value )
4574  ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
4575  :( N ) );
4576  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4577 
 // Columns in front of the computed column range are reset completely
4578  for( size_t j=0UL; j<jbegin; ++j ) {
4579  for( size_t i=0UL; i<M; ++i ) {
4580  reset( (~C)(i,j) );
4581  }
4582  }
4583  for( size_t j=jbegin; j<jend; ++j )
4584  {
 // Row range [ibegin,iend) computed in column j; it depends on j only if both operands
 // are lower (ibegin) respectively both are upper (iend)
4585  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
4586  ?( ( IsStrictlyLower<MT4>::value )
4587  ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
4588  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4589  :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
4590  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4591  ?( ( IsStrictlyUpper<MT4>::value )
4592  ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
4593  :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
4594  :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
4595  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4596 
 // Elements of column j in front of the computed row range are reset
4597  for( size_t i=0UL; i<ibegin; ++i ) {
4598  reset( (~C)(i,j) );
4599  }
4600  for( size_t i=ibegin; i<iend; ++i )
4601  {
 // Summation range [kbegin,kend) of the inner product for element (i,j)
4602  const size_t kbegin( ( IsUpper<MT4>::value )
4603  ?( ( IsLower<MT5>::value )
4604  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4605  , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4606  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4607  :( ( IsLower<MT5>::value )
4608  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4609  :( 0UL ) ) );
4610  const size_t kend( ( IsLower<MT4>::value )
4611  ?( ( IsUpper<MT5>::value )
4612  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
4613  , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4614  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4615  :( ( IsUpper<MT5>::value )
4616  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4617  :( K ) ) );
 // The range must not be empty: the first product below is read unconditionally at kbegin
4618  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
4619 
 // Initialize with the first product, accumulate the remaining ones, scale once at the end
4620  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4621  for( size_t k=kbegin+1UL; k<kend; ++k ) {
4622  (~C)(i,j) += A(i,k) * B(k,j);
4623  }
4624  (~C)(i,j) *= scalar;
4625  }
 // Elements of column j behind the computed row range are reset
4626  for( size_t i=iend; i<M; ++i ) {
4627  reset( (~C)(i,j) );
4628  }
4629  }
 // Columns behind the computed column range are reset completely
4630  for( size_t j=jend; j<N; ++j ) {
4631  for( size_t i=0UL; i<M; ++i ) {
4632  reset( (~C)(i,j) );
4633  }
4634  }
4635  }
4636  //**********************************************************************************************
4637 
4638  //**Default assignment to row-major dense matrices (general/diagonal)***************************
4652  template< typename MT3 // Type of the left-hand side target matrix
4653  , typename MT4 // Type of the left-hand side matrix operand
4654  , typename MT5 // Type of the right-hand side matrix operand
4655  , typename ST2 > // Type of the scalar value
4656  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4657  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4658  {
4659  const size_t M( A.rows() );
4660  const size_t N( B.columns() );
4661 
4662  for( size_t i=0UL; i<M; ++i )
4663  {
4664  const size_t jbegin( ( IsUpper<MT4>::value )
4665  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4666  :( 0UL ) );
4667  const size_t jend( ( IsLower<MT4>::value )
4668  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4669  :( N ) );
4670  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4671 
4672  if( IsUpper<MT4>::value ) {
4673  for( size_t j=0UL; j<jbegin; ++j ) {
4674  reset( (~C)(i,j) );
4675  }
4676  }
4677  for( size_t j=jbegin; j<jend; ++j ) {
4678  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
4679  }
4680  if( IsLower<MT4>::value ) {
4681  for( size_t j=jend; j<N; ++j ) {
4682  reset( (~C)(i,j) );
4683  }
4684  }
4685  }
4686  }
4687  //**********************************************************************************************
4688 
4689  //**Default assignment to column-major dense matrices (general/diagonal)************************
// Default assignment kernel computing C = ( A * B ) * scalar for a column-major target matrix C
// where the right-hand side operand type MT5 is diagonal and MT4 is not.
//  C       The target left-hand side dense matrix (column-major).
//  A       The left-hand side multiplication operand.
//  B       The right-hand side (diagonal) multiplication operand.
//  scalar  The scaling factor.
// Every computed element is the single product A(i,j) * B(j,j) * scalar. The target is traversed
// in tiles of at most BLOCK_SIZE x BLOCK_SIZE elements.
4703  template< typename MT3 // Type of the left-hand side target matrix
4704  , typename MT4 // Type of the left-hand side matrix operand
4705  , typename MT5 // Type of the right-hand side matrix operand
4706  , typename ST2 > // Type of the scalar value
4707  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4708  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
4709  {
4710  const size_t M( A.rows() );
4711  const size_t N( B.columns() );
4712 
4713  const size_t block( BLOCK_SIZE );
4714 
 // Outer loops: tiles of columns [jj,jend) and rows [ii,iend), clipped to the matrix size
4715  for( size_t jj=0UL; jj<N; jj+=block ) {
4716  const size_t jend( min( N, jj+block ) );
4717  for( size_t ii=0UL; ii<M; ii+=block ) {
4718  const size_t iend( min( M, ii+block ) );
4719  for( size_t j=jj; j<jend; ++j )
4720  {
 // Row range [ibegin,ipos) of column j computed within the current tile: for a lower A
 // it starts at the (strict) diagonal, for an upper A it ends at the (strict) diagonal
4721  const size_t ibegin( ( IsLower<MT4>::value )
4722  ?( max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
4723  :( ii ) );
4724  const size_t ipos( ( IsUpper<MT4>::value )
4725  ?( min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
4726  :( iend ) );
4727 
 // Tile elements above the computed range are reset (the range is only restricted for a lower A)
4728  if( IsLower<MT4>::value ) {
4729  for( size_t i=ii; i<ibegin; ++i ) {
4730  reset( (~C)(i,j) );
4731  }
4732  }
4733  for( size_t i=ibegin; i<ipos; ++i ) {
4734  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
4735  }
 // Tile elements below the computed range are reset (the range is only restricted for an upper A)
4736  if( IsUpper<MT4>::value ) {
4737  for( size_t i=ipos; i<iend; ++i ) {
4738  reset( (~C)(i,j) );
4739  }
4740  }
4741  }
4742  }
4743  }
4744  }
4745  //**********************************************************************************************
4746 
4747  //**Default assignment to row-major dense matrices (diagonal/general)***************************
// Default assignment kernel computing C = ( A * B ) * scalar for a row-major target matrix C
// where the left-hand side operand type MT4 is diagonal and MT5 is not.
//  C       The target left-hand side dense matrix (row-major).
//  A       The left-hand side (diagonal) multiplication operand.
//  B       The right-hand side multiplication operand.
//  scalar  The scaling factor.
// Every computed element is the single product A(i,i) * B(i,j) * scalar. The target is traversed
// in tiles of at most BLOCK_SIZE x BLOCK_SIZE elements.
4761  template< typename MT3 // Type of the left-hand side target matrix
4762  , typename MT4 // Type of the left-hand side matrix operand
4763  , typename MT5 // Type of the right-hand side matrix operand
4764  , typename ST2 > // Type of the scalar value
4765  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4766  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4767  {
4768  const size_t M( A.rows() );
4769  const size_t N( B.columns() );
4770 
4771  const size_t block( BLOCK_SIZE );
4772 
 // Outer loops: tiles of rows [ii,iend) and columns [jj,jend), clipped to the matrix size
4773  for( size_t ii=0UL; ii<M; ii+=block ) {
4774  const size_t iend( min( M, ii+block ) );
4775  for( size_t jj=0UL; jj<N; jj+=block ) {
4776  const size_t jend( min( N, jj+block ) );
4777  for( size_t i=ii; i<iend; ++i )
4778  {
 // Column range [jbegin,jpos) of row i computed within the current tile: for an upper B
 // it starts at the (strict) diagonal, for a lower B it ends at the (strict) diagonal
4779  const size_t jbegin( ( IsUpper<MT5>::value )
4780  ?( max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
4781  :( jj ) );
4782  const size_t jpos( ( IsLower<MT5>::value )
4783  ?( min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
4784  :( jend ) );
4785 
 // Tile elements left of the computed range are reset (the range is only restricted for an upper B)
4786  if( IsUpper<MT5>::value ) {
4787  for( size_t j=jj; j<jbegin; ++j ) {
4788  reset( (~C)(i,j) );
4789  }
4790  }
4791  for( size_t j=jbegin; j<jpos; ++j ) {
4792  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
4793  }
 // Tile elements right of the computed range are reset (the range is only restricted for a lower B)
4794  if( IsLower<MT5>::value ) {
4795  for( size_t j=jpos; j<jend; ++j ) {
4796  reset( (~C)(i,j) );
4797  }
4798  }
4799  }
4800  }
4801  }
4802  }
4803  //**********************************************************************************************
4804 
4805  //**Default assignment to column-major dense matrices (diagonal/general)************************
4819  template< typename MT3 // Type of the left-hand side target matrix
4820  , typename MT4 // Type of the left-hand side matrix operand
4821  , typename MT5 // Type of the right-hand side matrix operand
4822  , typename ST2 > // Type of the scalar value
4823  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4824  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
4825  {
4826  const size_t M( A.rows() );
4827  const size_t N( B.columns() );
4828 
4829  for( size_t j=0UL; j<N; ++j )
4830  {
4831  const size_t ibegin( ( IsLower<MT5>::value )
4832  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4833  :( 0UL ) );
4834  const size_t iend( ( IsUpper<MT5>::value )
4835  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4836  :( M ) );
4837  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4838 
4839  if( IsLower<MT5>::value ) {
4840  for( size_t i=0UL; i<ibegin; ++i ) {
4841  reset( (~C)(i,j) );
4842  }
4843  }
4844  for( size_t i=ibegin; i<iend; ++i ) {
4845  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
4846  }
4847  if( IsUpper<MT5>::value ) {
4848  for( size_t i=iend; i<M; ++i ) {
4849  reset( (~C)(i,j) );
4850  }
4851  }
4852  }
4853  }
4854  //**********************************************************************************************
4855 
4856  //**Default assignment to dense matrices (diagonal/diagonal)************************************
4870  template< typename MT3 // Type of the left-hand side target matrix
4871  , typename MT4 // Type of the left-hand side matrix operand
4872  , typename MT5 // Type of the right-hand side matrix operand
4873  , typename ST2 > // Type of the scalar value
4874  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4875  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4876  {
4877  reset( C );
4878 
4879  for( size_t i=0UL; i<A.rows(); ++i ) {
4880  C(i,i) = A(i,i) * B(i,i) * scalar;
4881  }
4882  }
4883  //**********************************************************************************************
4884 
4885  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel for type combinations that cannot use the vectorized default
// kernel (overload disabled if UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is set). It simply
// forwards to the matching selectDefaultAssignKernel() overload.
//  C       The target left-hand side dense matrix.
//  A       The left-hand side multiplication operand.
//  B       The right-hand side multiplication operand.
//  scalar  The scaling factor.
4899  template< typename MT3 // Type of the left-hand side target matrix
4900  , typename MT4 // Type of the left-hand side matrix operand
4901  , typename MT5 // Type of the right-hand side matrix operand
4902  , typename ST2 > // Type of the scalar value
4903  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4904  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4905  {
4906  selectDefaultAssignKernel( C, A, B, scalar );
4907  }
4908  //**********************************************************************************************
4909 
4910  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
4925  template< typename MT3 // Type of the left-hand side target matrix
4926  , typename MT4 // Type of the left-hand side matrix operand
4927  , typename MT5 // Type of the right-hand side matrix operand
4928  , typename ST2 > // Type of the scalar value
4929  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4930  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4931  {
4932  typedef IntrinsicTrait<ElementType> IT;
4933 
4934  const size_t M( A.rows() );
4935  const size_t N( B.columns() );
4936  const size_t K( A.columns() );
4937 
4938  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
4939 
4940  size_t i( 0UL );
4941 
4942  for( ; (i+2UL) <= M; i+=2UL )
4943  {
4944  size_t j( 0UL );
4945 
4946  for( ; (j+4UL) <= N; j+=4UL )
4947  {
4948  const size_t kbegin( ( IsUpper<MT4>::value )
4949  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
4950  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
4951  const size_t kend( ( IsLower<MT4>::value )
4952  ?( IsUpper<MT5>::value ? min( i+2UL, j+4UL ) : ( i+2UL ) )
4953  :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
4954 
4955  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
4956  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
4957 
4958  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4959  size_t k( kbegin );
4960 
4961  for( ; k<kpos; k+=IT::size ) {
4962  const IntrinsicType a1( A.load(i ,k) );
4963  const IntrinsicType a2( A.load(i+1UL,k) );
4964  const IntrinsicType b1( B.load(k,j ) );
4965  const IntrinsicType b2( B.load(k,j+1UL) );
4966  const IntrinsicType b3( B.load(k,j+2UL) );
4967  const IntrinsicType b4( B.load(k,j+3UL) );
4968  xmm1 = xmm1 + a1 * b1;
4969  xmm2 = xmm2 + a1 * b2;
4970  xmm3 = xmm3 + a1 * b3;
4971  xmm4 = xmm4 + a1 * b4;
4972  xmm5 = xmm5 + a2 * b1;
4973  xmm6 = xmm6 + a2 * b2;
4974  xmm7 = xmm7 + a2 * b3;
4975  xmm8 = xmm8 + a2 * b4;
4976  }
4977 
4978  (~C)(i ,j ) = sum( xmm1 ) * scalar;
4979  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
4980  (~C)(i ,j+2UL) = sum( xmm3 ) * scalar;
4981  (~C)(i ,j+3UL) = sum( xmm4 ) * scalar;
4982  (~C)(i+1UL,j ) = sum( xmm5 ) * scalar;
4983  (~C)(i+1UL,j+1UL) = sum( xmm6 ) * scalar;
4984  (~C)(i+1UL,j+2UL) = sum( xmm7 ) * scalar;
4985  (~C)(i+1UL,j+3UL) = sum( xmm8 ) * scalar;
4986 
4987  for( ; remainder && k<kend; ++k ) {
4988  (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
4989  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
4990  (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
4991  (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
4992  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
4993  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
4994  (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
4995  (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
4996  }
4997  }
4998 
4999  for( ; (j+2UL) <= N; j+=2UL )
5000  {
5001  const size_t kbegin( ( IsUpper<MT4>::value )
5002  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5003  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5004  const size_t kend( ( IsLower<MT4>::value )
5005  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
5006  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5007 
5008  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5009  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5010 
5011  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5012  size_t k( kbegin );
5013 
5014  for( ; k<kpos; k+=IT::size ) {
5015  const IntrinsicType a1( A.load(i ,k) );
5016  const IntrinsicType a2( A.load(i+1UL,k) );
5017  const IntrinsicType b1( B.load(k,j ) );
5018  const IntrinsicType b2( B.load(k,j+1UL) );
5019  xmm1 = xmm1 + a1 * b1;
5020  xmm2 = xmm2 + a1 * b2;
5021  xmm3 = xmm3 + a2 * b1;
5022  xmm4 = xmm4 + a2 * b2;
5023  }
5024 
5025  (~C)(i ,j ) = sum( xmm1 ) * scalar;
5026  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
5027  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
5028  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
5029 
5030  for( ; remainder && k<kend; ++k ) {
5031  (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5032  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5033  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5034  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5035  }
5036  }
5037 
5038  if( j < N )
5039  {
5040  const size_t kbegin( ( IsUpper<MT4>::value )
5041  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5042  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5043  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
5044 
5045  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5046  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5047 
5048  IntrinsicType xmm1, xmm2;
5049  size_t k( kbegin );
5050 
5051  for( ; k<kpos; k+=IT::size ) {
5052  const IntrinsicType b1( B.load(k,j) );
5053  xmm1 = xmm1 + A.load(i ,k) * b1;
5054  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5055  }
5056 
5057  (~C)(i ,j) = sum( xmm1 ) * scalar;
5058  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
5059 
5060  for( ; remainder && k<kend; ++k ) {
5061  (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5062  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5063  }
5064  }
5065  }
5066 
5067  if( i < M )
5068  {
5069  size_t j( 0UL );
5070 
5071  for( ; (j+4UL) <= N; j+=4UL )
5072  {
5073  const size_t kbegin( ( IsUpper<MT4>::value )
5074  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5075  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5076  const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
5077 
5078  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5079  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5080 
5081  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5082  size_t k( kbegin );
5083 
5084  for( ; k<kpos; k+=IT::size ) {
5085  const IntrinsicType a1( A.load(i,k) );
5086  xmm1 = xmm1 + a1 * B.load(k,j );
5087  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5088  xmm3 = xmm3 + a1 * B.load(k,j+2UL);
5089  xmm4 = xmm4 + a1 * B.load(k,j+3UL);
5090  }
5091 
5092  (~C)(i,j ) = sum( xmm1 ) * scalar;
5093  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
5094  (~C)(i,j+2UL) = sum( xmm3 ) * scalar;
5095  (~C)(i,j+3UL) = sum( xmm4 ) * scalar;
5096 
5097  for( ; remainder && k<kend; ++k ) {
5098  (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5099  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5100  (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
5101  (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
5102  }
5103  }
5104 
5105  for( ; (j+2UL) <= N; j+=2UL )
5106  {
5107  const size_t kbegin( ( IsUpper<MT4>::value )
5108  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5109  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5110  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
5111 
5112  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5113  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5114 
5115  IntrinsicType xmm1, xmm2;
5116  size_t k( kbegin );
5117 
5118  for( ; k<kpos; k+=IT::size ) {
5119  const IntrinsicType a1( A.load(i,k) );
5120  xmm1 = xmm1 + a1 * B.load(k,j );
5121  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5122  }
5123 
5124  (~C)(i,j ) = sum( xmm1 ) * scalar;
5125  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
5126 
5127  for( ; remainder && k<kend; ++k ) {
5128  (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5129  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5130  }
5131  }
5132 
5133  if( j < N )
5134  {
5135  const size_t kbegin( ( IsUpper<MT4>::value )
5136  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5137  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5138 
5139  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
5140  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
5141 
5142  IntrinsicType xmm1;
5143  size_t k( kbegin );
5144 
5145  for( ; k<kpos; k+=IT::size ) {
5146  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
5147  }
5148 
5149  (~C)(i,j) = sum( xmm1 ) * scalar;
5150 
5151  for( ; remainder && k<K; ++k ) {
5152  (~C)(i,j) += A(i,k) * B(k,j) * scalar;
5153  }
5154  }
5155  }
5156  }
5157  //**********************************************************************************************
5158 
5159  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
/*!\brief Vectorized small-matrix assignment kernel for a column-major target: C = ( A * B ) * scalar.
//
// \param C The column-major target dense matrix; every element (i,j) is overwritten.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor applied to every computed dot product.
// \return void
//
// C is processed in tiles of 4x2, 4x1, 2x2, 2x1, 1x2 and finally 1x1 elements. For each tile the
// dot products over k are accumulated in IntrinsicType variables, advancing k by IT::size per
// step, then reduced with sum(), scaled and stored. A scalar loop afterwards adds the k-values
// the vector loop did not reach whenever at least one operand is not padded. The IsUpper and
// IsLower traits of the operands are used to shrink the k-range of each tile.
//
// NOTE(review): the accumulators are declared without an initializer, so this kernel relies on
// a default-constructed IntrinsicType starting at zero - confirm against the intrinsics headers.
*/
5174  template< typename MT3 // Type of the left-hand side target matrix
5175  , typename MT4 // Type of the left-hand side matrix operand
5176  , typename MT5 // Type of the right-hand side matrix operand
5177  , typename ST2 > // Type of the scalar value
5178  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5179  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5180  {
5181  typedef IntrinsicTrait<ElementType> IT;
5182 
5183  const size_t M( A.rows() );
5184  const size_t N( B.columns() );
5185  const size_t K( A.columns() );
5186 
      // A scalar clean-up loop is needed unless both operands are padded.
5187  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
5188 
5189  size_t i( 0UL );
5190 
      // Blocks of four rows of C.
5191  for( ; (i+4UL) <= M; i+=4UL )
5192  {
5193  size_t j( 0UL );
5194 
      // 4x2 tile: rows i..i+3, columns j and j+1.
5195  for( ; (j+2UL) <= N; j+=2UL )
5196  {
      // Start of the k-range: i (or max(i,j) if B is lower) when A is upper, j when only B is
      // lower, otherwise 0. The structure-dependent starts are masked with size_t(-IT::size).
5197  const size_t kbegin( ( IsUpper<MT4>::value )
5198  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5199  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
      // End of the k-range: one past the last row of the tile when A is lower, one past the last
      // column of the tile when B is upper, the smaller of both if both hold, otherwise K.
5200  const size_t kend( ( IsLower<MT4>::value )
5201  ?( IsUpper<MT5>::value ? min( i+4UL, j+2UL ) : ( i+4UL ) )
5202  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5203 
      // With a remainder loop the vector loop stops at kend rounded down to a multiple of
      // IT::size (checked by the assertion); without one it runs up to kend itself.
5204  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5205  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5206 
      // One accumulator per element of the tile.
5207  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5208  size_t k( kbegin );
5209 
5210  for( ; k<kpos; k+=IT::size ) {
5211  const IntrinsicType a1( A.load(i ,k) );
5212  const IntrinsicType a2( A.load(i+1UL,k) );
5213  const IntrinsicType a3( A.load(i+2UL,k) );
5214  const IntrinsicType a4( A.load(i+3UL,k) );
5215  const IntrinsicType b1( B.load(k,j ) );
5216  const IntrinsicType b2( B.load(k,j+1UL) );
5217  xmm1 = xmm1 + a1 * b1;
5218  xmm2 = xmm2 + a1 * b2;
5219  xmm3 = xmm3 + a2 * b1;
5220  xmm4 = xmm4 + a2 * b2;
5221  xmm5 = xmm5 + a3 * b1;
5222  xmm6 = xmm6 + a3 * b2;
5223  xmm7 = xmm7 + a4 * b1;
5224  xmm8 = xmm8 + a4 * b2;
5225  }
5226 
      // Reduce each accumulator, scale it and overwrite the target element.
5227  (~C)(i ,j ) = sum( xmm1 ) * scalar;
5228  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
5229  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
5230  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
5231  (~C)(i+2UL,j ) = sum( xmm5 ) * scalar;
5232  (~C)(i+2UL,j+1UL) = sum( xmm6 ) * scalar;
5233  (~C)(i+3UL,j ) = sum( xmm7 ) * scalar;
5234  (~C)(i+3UL,j+1UL) = sum( xmm8 ) * scalar;
5235 
      // Scalar clean-up: add the products for the k-values in [kpos,kend) to the stored results.
5236  for( ; remainder && k<kend; ++k ) {
5237  (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5238  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5239  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5240  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5241  (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
5242  (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
5243  (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
5244  (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
5245  }
5246  }
5247 
      // 4x1 tile: the single column left over when N is odd. Only the lower structure of A
      // limits kend here.
5248  if( j < N )
5249  {
5250  const size_t kbegin( ( IsUpper<MT4>::value )
5251  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5252  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5253  const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
5254 
5255  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5256  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5257 
5258  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5259  size_t k( kbegin );
5260 
5261  for( ; k<kpos; k+=IT::size ) {
5262  const IntrinsicType b1( B.load(k,j) );
5263  xmm1 = xmm1 + A.load(i ,k) * b1;
5264  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5265  xmm3 = xmm3 + A.load(i+2UL,k) * b1;
5266  xmm4 = xmm4 + A.load(i+3UL,k) * b1;
5267  }
5268 
5269  (~C)(i ,j) = sum( xmm1 ) * scalar;
5270  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
5271  (~C)(i+2UL,j) = sum( xmm3 ) * scalar;
5272  (~C)(i+3UL,j) = sum( xmm4 ) * scalar;
5273 
5274  for( ; remainder && k<kend; ++k ) {
5275  (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5276  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5277  (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
5278  (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
5279  }
5280  }
5281  }
5282 
      // Blocks of two rows of C (rows not covered by the four-row loop above).
5283  for( ; (i+2UL) <= M; i+=2UL )
5284  {
5285  size_t j( 0UL );
5286 
      // 2x2 tile: rows i and i+1, columns j and j+1.
5287  for( ; (j+2UL) <= N; j+=2UL )
5288  {
5289  const size_t kbegin( ( IsUpper<MT4>::value )
5290  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5291  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5292  const size_t kend( ( IsLower<MT4>::value )
5293  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
5294  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5295 
5296  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5297  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5298 
5299  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5300  size_t k( kbegin );
5301 
5302  for( ; k<kpos; k+=IT::size ) {
5303  const IntrinsicType a1( A.load(i ,k) );
5304  const IntrinsicType a2( A.load(i+1UL,k) );
5305  const IntrinsicType b1( B.load(k,j ) );
5306  const IntrinsicType b2( B.load(k,j+1UL) );
5307  xmm1 = xmm1 + a1 * b1;
5308  xmm2 = xmm2 + a1 * b2;
5309  xmm3 = xmm3 + a2 * b1;
5310  xmm4 = xmm4 + a2 * b2;
5311  }
5312 
5313  (~C)(i ,j ) = sum( xmm1 ) * scalar;
5314  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
5315  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
5316  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
5317 
5318  for( ; remainder && k<kend; ++k ) {
5319  (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5320  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5321  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5322  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5323  }
5324  }
5325 
      // 2x1 tile: the single column left over when N is odd.
5326  if( j < N )
5327  {
5328  const size_t kbegin( ( IsUpper<MT4>::value )
5329  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5330  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5331  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
5332 
5333  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5334  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5335 
5336  IntrinsicType xmm1, xmm2;
5337  size_t k( kbegin );
5338 
5339  for( ; k<kpos; k+=IT::size ) {
5340  const IntrinsicType b1( B.load(k,j) );
5341  xmm1 = xmm1 + A.load(i ,k) * b1;
5342  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5343  }
5344 
5345  (~C)(i ,j) = sum( xmm1 ) * scalar;
5346  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
5347 
5348  for( ; remainder && k<kend; ++k ) {
5349  (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5350  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5351  }
5352  }
5353  }
5354 
      // Final single row of C when M is odd.
5355  if( i < M )
5356  {
5357  size_t j( 0UL );
5358 
      // 1x2 tile. Only the upper structure of B limits kend here.
5359  for( ; (j+2UL) <= N; j+=2UL )
5360  {
5361  const size_t kbegin( ( IsUpper<MT4>::value )
5362  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5363  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5364  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
5365 
5366  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5367  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5368 
5369  IntrinsicType xmm1, xmm2;
5370  size_t k( kbegin );
5371 
5372  for( ; k<kpos; k+=IT::size ) {
5373  const IntrinsicType a1( A.load(i,k) );
5374  xmm1 = xmm1 + a1 * B.load(k,j );
5375  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5376  }
5377 
5378  (~C)(i,j ) = sum( xmm1 ) * scalar;
5379  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
5380 
5381  for( ; remainder && k<kend; ++k ) {
5382  (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5383  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5384  }
5385  }
5386 
      // 1x1 tile: the last element. The k-range always ends at K here; no structure-based end
      // is computed.
5387  if( j < N )
5388  {
5389  const size_t kbegin( ( IsUpper<MT4>::value )
5390  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5391  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5392 
5393  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
5394  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
5395 
5396  IntrinsicType xmm1;
5397  size_t k( kbegin );
5398 
5399  for( ; k<kpos; k+=IT::size ) {
5400  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
5401  }
5402 
5403  (~C)(i,j) = sum( xmm1 ) * scalar;
5404 
5405  for( ; remainder && k<K; ++k ) {
5406  (~C)(i,j) += A(i,k) * B(k,j) * scalar;
5407  }
5408  }
5409  }
5410  }
5411  //**********************************************************************************************
5412 
5413  //**Default assignment to dense matrices (large matrices)***************************************
/*!\brief Large-matrix assignment kernel used when the vectorized default kernel is disabled.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold and
// simply forwards all arguments to selectDefaultAssignKernel().
*/
5427  template< typename MT3 // Type of the left-hand side target matrix
5428  , typename MT4 // Type of the left-hand side matrix operand
5429  , typename MT5 // Type of the right-hand side matrix operand
5430  , typename ST2 > // Type of the scalar value
5431  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5432  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5433  {
5434  selectDefaultAssignKernel( C, A, B, scalar );
5435  }
5436  //**********************************************************************************************
5437 
5438  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
/*!\brief Vectorized large-matrix assignment kernel for a row-major target.
//
// \param C The row-major target dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// No dedicated large-matrix implementation exists yet (see the TODO below); the call is
// forwarded to selectSmallAssignKernel() with the unwrapped target ~C.
*/
5453  template< typename MT3 // Type of the left-hand side target matrix
5454  , typename MT4 // Type of the left-hand side matrix operand
5455  , typename MT5 // Type of the right-hand side matrix operand
5456  , typename ST2 > // Type of the scalar value
5457  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5458  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5459  {
5460  // TODO
5461  selectSmallAssignKernel( ~C, A, B, scalar );
5462  }
5463  //**********************************************************************************************
5464 
5465  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
/*!\brief Vectorized large-matrix assignment kernel for a column-major target.
//
// \param C The column-major target dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// No dedicated large-matrix implementation exists yet (see the TODO below); the call is
// forwarded to selectSmallAssignKernel() with the unwrapped target ~C.
*/
5480  template< typename MT3 // Type of the left-hand side target matrix
5481  , typename MT4 // Type of the left-hand side matrix operand
5482  , typename MT5 // Type of the right-hand side matrix operand
5483  , typename ST2 > // Type of the scalar value
5484  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5485  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5486  {
5487  // TODO
5488  selectSmallAssignKernel( ~C, A, B, scalar );
5489  }
5490  //**********************************************************************************************
5491 
5492  //**BLAS-based assignment to dense matrices (default)*******************************************
/*!\brief Fallback of the BLAS-based assignment kernel for operands that cannot use BLAS.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected when UseBlasKernel<MT3,MT4,MT5,ST2> does not hold and forwards all
// arguments to selectLargeAssignKernel().
*/
5506  template< typename MT3 // Type of the left-hand side target matrix
5507  , typename MT4 // Type of the left-hand side matrix operand
5508  , typename MT5 // Type of the right-hand side matrix operand
5509  , typename ST2 > // Type of the scalar value
5510  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5511  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5512  {
5513  selectLargeAssignKernel( C, A, B, scalar );
5514  }
5515  //**********************************************************************************************
5516 
5517  //**BLAS-based assignment to dense matrices*****************************************************
5518 #if BLAZE_BLAS_MODE
5519 
5532  template< typename MT3 // Type of the left-hand side target matrix
5533  , typename MT4 // Type of the left-hand side matrix operand
5534  , typename MT5 // Type of the right-hand side matrix operand
5535  , typename ST2 > // Type of the scalar value
5536  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5537  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5538  {
5539  typedef typename MT3::ElementType ET;
5540 
5541  if( IsTriangular<MT4>::value ) {
5542  assign( C, B );
5543  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5544  }
5545  else if( IsTriangular<MT5>::value ) {
5546  assign( C, A );
5547  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5548  }
5549  else {
5550  gemm( C, A, B, ET(scalar), ET(0) );
5551  }
5552  }
5553 #endif
5554  //**********************************************************************************************
5555 
5556  //**Assignment to sparse matrices***************************************************************
/*!\brief Assignment of the scaled multiplication expression to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// The expression is evaluated serially into a temporary of type TmpType, which is then assigned
// to the unwrapped target. TmpType is chosen by SelectType from OppositeType and ResultType
// based on the storage order SO of the target.
//
// NOTE(review): the listing numbers jump from 5571 to 5573 and from 5575 to 5582 inside this
// body, so statements may be missing from this view - confirm against the original header.
*/
5568  template< typename MT // Type of the target sparse matrix
5569  , bool SO > // Storage order of the target sparse matrix
5570  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5571  {
5573 
      // presumably OppositeType for SO == true and ResultType otherwise - verify against SelectType
5574  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
5575 
5582 
5583  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5584  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5585 
      // Evaluate the whole expression into the temporary first, then copy it into the target.
5586  const TmpType tmp( serial( rhs ) );
5587  assign( ~lhs, tmp );
5588  }
5589  //**********************************************************************************************
5590 
5591  //**Addition assignment to dense matrices*******************************************************
/*!\brief Addition assignment of the scaled multiplication expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
// \return void
//
// The two operands of the wrapped multiplication are fetched from rhs.matrix_, evaluated
// serially into objects of type LT and RT, and passed together with rhs.scalar_ to
// selectAddAssignKernel(). The function returns without touching the target if the target has
// no rows or no columns, or if the left operand has no columns.
//
// NOTE(review): the listing numbers jump from 5606 to 5608 at the top of this body, so a
// statement may be missing from this view - confirm against the original header.
*/
5603  template< typename MT // Type of the target dense matrix
5604  , bool SO > // Storage order of the target dense matrix
5605  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5606  {
5608 
5609  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5610  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5611 
5612  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5613  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
5614 
      // Empty target or empty inner dimension: there is nothing to add.
5615  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
5616  return;
5617  }
5618 
5619  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5620  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5621 
      // The evaluated operands must keep the operand dimensions and match the target.
5622  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5623  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5624  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5625  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5626  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5627  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
5628 
5629  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
5630  }
5631  //**********************************************************************************************
5632 
5633  //**Addition assignment to dense matrices (kernel selection)************************************
5644  template< typename MT3 // Type of the left-hand side target matrix
5645  , typename MT4 // Type of the left-hand side matrix operand
5646  , typename MT5 // Type of the right-hand side matrix operand
5647  , typename ST2 > // Type of the scalar value
5648  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5649  {
5650  if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
5651  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
5652  selectSmallAddAssignKernel( C, A, B, scalar );
5653  else
5654  selectBlasAddAssignKernel( C, A, B, scalar );
5655  }
5656  //**********************************************************************************************
5657 
5658  //**Default addition assignment to dense matrices (general/general)*****************************
/*!\brief Default addition assignment kernel for two non-diagonal operands.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled when neither MT4 nor MT5 is diagonal. The scaled product
// A * B * scalar is evaluated serially into a temporary of type ResultType, which is then
// added to C via addAssign().
*/
5672  template< typename MT3 // Type of the left-hand side target matrix
5673  , typename MT4 // Type of the left-hand side matrix operand
5674  , typename MT5 // Type of the right-hand side matrix operand
5675  , typename ST2 > // Type of the scalar value
5676  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
5677  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5678  {
5679  const ResultType tmp( serial( A * B * scalar ) );
5680  addAssign( C, tmp );
5681  }
5682  //**********************************************************************************************
5683 
5684  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
5698  template< typename MT3 // Type of the left-hand side target matrix
5699  , typename MT4 // Type of the left-hand side matrix operand
5700  , typename MT5 // Type of the right-hand side matrix operand
5701  , typename ST2 > // Type of the scalar value
5702  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5703  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5704  {
5705  const size_t M( A.rows() );
5706  const size_t N( B.columns() );
5707 
5708  for( size_t i=0UL; i<M; ++i )
5709  {
5710  const size_t jbegin( ( IsUpper<MT4>::value )
5711  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5712  :( 0UL ) );
5713  const size_t jend( ( IsLower<MT4>::value )
5714  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
5715  :( N ) );
5716  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5717 
5718  const size_t jnum( jend - jbegin );
5719  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
5720 
5721  for( size_t j=jbegin; j<jpos; j+=2UL ) {
5722  (~C)(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5723  (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
5724  }
5725  if( jpos < jend ) {
5726  (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
5727  }
5728  }
5729  }
5730  //**********************************************************************************************
5731 
5732  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
5746  template< typename MT3 // Type of the left-hand side target matrix
5747  , typename MT4 // Type of the left-hand side matrix operand
5748  , typename MT5 // Type of the right-hand side matrix operand
5749  , typename ST2 > // Type of the scalar value
5750  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5751  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5752  {
5753  const size_t M( A.rows() );
5754  const size_t N( B.columns() );
5755 
5756  const size_t block( BLOCK_SIZE );
5757 
5758  for( size_t jj=0UL; jj<N; jj+=block ) {
5759  const size_t jend( min( N, jj+block ) );
5760  for( size_t ii=0UL; ii<M; ii+=block ) {
5761  const size_t iend( min( M, ii+block ) );
5762  for( size_t j=jj; j<jend; ++j )
5763  {
5764  const size_t ibegin( ( IsLower<MT4>::value )
5765  ?( max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
5766  :( ii ) );
5767  const size_t ipos( ( IsUpper<MT4>::value )
5768  ?( min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
5769  :( iend ) );
5770 
5771  for( size_t i=ibegin; i<ipos; ++i ) {
5772  (~C)(i,j) += A(i,j) * B(j,j) * scalar;
5773  }
5774  }
5775  }
5776  }
5777  }
5778  //**********************************************************************************************
5779 
5780  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
5794  template< typename MT3 // Type of the left-hand side target matrix
5795  , typename MT4 // Type of the left-hand side matrix operand
5796  , typename MT5 // Type of the right-hand side matrix operand
5797  , typename ST2 > // Type of the scalar value
5798  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5799  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5800  {
5801  const size_t M( A.rows() );
5802  const size_t N( B.columns() );
5803 
5804  const size_t block( BLOCK_SIZE );
5805 
5806  for( size_t ii=0UL; ii<M; ii+=block ) {
5807  const size_t iend( min( M, ii+block ) );
5808  for( size_t jj=0UL; jj<N; jj+=block ) {
5809  const size_t jend( min( N, jj+block ) );
5810  for( size_t i=ii; i<iend; ++i )
5811  {
5812  const size_t jbegin( ( IsUpper<MT5>::value )
5813  ?( max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
5814  :( jj ) );
5815  const size_t jpos( ( IsLower<MT5>::value )
5816  ?( min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
5817  :( jend ) );
5818 
5819  for( size_t j=jbegin; j<jpos; ++j ) {
5820  (~C)(i,j) += A(i,i) * B(i,j) * scalar;
5821  }
5822  }
5823  }
5824  }
5825  }
5826  //**********************************************************************************************
5827 
5828  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
5842  template< typename MT3 // Type of the left-hand side target matrix
5843  , typename MT4 // Type of the left-hand side matrix operand
5844  , typename MT5 // Type of the right-hand side matrix operand
5845  , typename ST2 > // Type of the scalar value
5846  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5847  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5848  {
5849  const size_t M( A.rows() );
5850  const size_t N( B.columns() );
5851 
5852  for( size_t j=0UL; j<N; ++j )
5853  {
5854  const size_t ibegin( ( IsLower<MT5>::value )
5855  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5856  :( 0UL ) );
5857  const size_t iend( ( IsUpper<MT5>::value )
5858  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
5859  :( M ) );
5860  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5861 
5862  const size_t inum( iend - ibegin );
5863  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
5864 
5865  for( size_t i=ibegin; i<ipos; i+=2UL ) {
5866  (~C)(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5867  (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
5868  }
5869  if( ipos < iend ) {
5870  (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
5871  }
5872  }
5873  }
5874  //**********************************************************************************************
5875 
5876  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
5890  template< typename MT3 // Type of the left-hand side target matrix
5891  , typename MT4 // Type of the left-hand side matrix operand
5892  , typename MT5 // Type of the right-hand side matrix operand
5893  , typename ST2 > // Type of the scalar value
5894  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
5895  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5896  {
5897  for( size_t i=0UL; i<A.rows(); ++i ) {
5898  C(i,i) += A(i,i) * B(i,i) * scalar;
5899  }
5900  }
5901  //**********************************************************************************************
5902 
5903  //**Default addition assignment to dense matrices (small matrices)******************************
/*!\brief Small-matrix addition assignment kernel used when the vectorized default kernel is
//        disabled.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold and
// simply forwards all arguments to selectDefaultAddAssignKernel().
*/
5917  template< typename MT3 // Type of the left-hand side target matrix
5918  , typename MT4 // Type of the left-hand side matrix operand
5919  , typename MT5 // Type of the right-hand side matrix operand
5920  , typename ST2 > // Type of the scalar value
5921  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5922  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5923  {
5924  selectDefaultAddAssignKernel( C, A, B, scalar );
5925  }
5926  //**********************************************************************************************
5927 
5928  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
5943  template< typename MT3 // Type of the left-hand side target matrix
5944  , typename MT4 // Type of the left-hand side matrix operand
5945  , typename MT5 // Type of the right-hand side matrix operand
5946  , typename ST2 > // Type of the scalar value
5947  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5948  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5949  {
5950  typedef IntrinsicTrait<ElementType> IT;
5951 
5952  const size_t M( A.rows() );
5953  const size_t N( B.columns() );
5954  const size_t K( A.columns() );
5955 
5956  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
5957 
5958  size_t i( 0UL );
5959 
5960  for( ; (i+2UL) <= M; i+=2UL )
5961  {
5962  size_t j( 0UL );
5963 
5964  for( ; (j+4UL) <= N; j+=4UL )
5965  {
5966  const size_t kbegin( ( IsUpper<MT4>::value )
5967  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
5968  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
5969  const size_t kend( ( IsLower<MT4>::value )
5970  ?( IsUpper<MT5>::value ? min( i+2UL, j+4UL ) : ( i+2UL ) )
5971  :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
5972 
5973  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
5974  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
5975 
5976  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5977  size_t k( kbegin );
5978 
5979  for( ; k<kpos; k+=IT::size ) {
5980  const IntrinsicType a1( A.load(i ,k) );
5981  const IntrinsicType a2( A.load(i+1UL,k) );
5982  const IntrinsicType b1( B.load(k,j ) );
5983  const IntrinsicType b2( B.load(k,j+1UL) );
5984  const IntrinsicType b3( B.load(k,j+2UL) );
5985  const IntrinsicType b4( B.load(k,j+3UL) );
5986  xmm1 = xmm1 + a1 * b1;
5987  xmm2 = xmm2 + a1 * b2;
5988  xmm3 = xmm3 + a1 * b3;
5989  xmm4 = xmm4 + a1 * b4;
5990  xmm5 = xmm5 + a2 * b1;
5991  xmm6 = xmm6 + a2 * b2;
5992  xmm7 = xmm7 + a2 * b3;
5993  xmm8 = xmm8 + a2 * b4;
5994  }
5995 
5996  (~C)(i ,j ) += sum( xmm1 ) * scalar;
5997  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
5998  (~C)(i ,j+2UL) += sum( xmm3 ) * scalar;
5999  (~C)(i ,j+3UL) += sum( xmm4 ) * scalar;
6000  (~C)(i+1UL,j ) += sum( xmm5 ) * scalar;
6001  (~C)(i+1UL,j+1UL) += sum( xmm6 ) * scalar;
6002  (~C)(i+1UL,j+2UL) += sum( xmm7 ) * scalar;
6003  (~C)(i+1UL,j+3UL) += sum( xmm8 ) * scalar;
6004 
6005  for( ; remainder && k<kend; ++k ) {
6006  (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6007  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6008  (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
6009  (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
6010  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6011  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6012  (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
6013  (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
6014  }
6015  }
6016 
6017  for( ; (j+2UL) <= N; j+=2UL )
6018  {
6019  const size_t kbegin( ( IsUpper<MT4>::value )
6020  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6021  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6022  const size_t kend( ( IsLower<MT4>::value )
6023  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
6024  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
6025 
6026  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6027  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6028 
6029  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6030  size_t k( kbegin );
6031 
6032  for( ; k<kpos; k+=IT::size ) {
6033  const IntrinsicType a1( A.load(i ,k) );
6034  const IntrinsicType a2( A.load(i+1UL,k) );
6035  const IntrinsicType b1( B.load(k,j ) );
6036  const IntrinsicType b2( B.load(k,j+1UL) );
6037  xmm1 = xmm1 + a1 * b1;
6038  xmm2 = xmm2 + a1 * b2;
6039  xmm3 = xmm3 + a2 * b1;
6040  xmm4 = xmm4 + a2 * b2;
6041  }
6042 
6043  (~C)(i ,j ) += sum( xmm1 ) * scalar;
6044  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
6045  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
6046  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
6047 
6048  for( ; remainder && k<kend; ++k ) {
6049  (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6050  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6051  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6052  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6053  }
6054  }
6055 
6056  if( j < N )
6057  {
6058  const size_t kbegin( ( IsUpper<MT4>::value )
6059  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6060  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6061  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
6062 
6063  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6064  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6065 
6066  IntrinsicType xmm1, xmm2;
6067  size_t k( kbegin );
6068 
6069  for( ; k<kpos; k+=IT::size ) {
6070  const IntrinsicType b1( B.load(k,j) );
6071  xmm1 = xmm1 + A.load(i ,k) * b1;
6072  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6073  }
6074 
6075  (~C)(i ,j) += sum( xmm1 ) * scalar;
6076  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
6077 
6078  for( ; remainder && k<kend; ++k ) {
6079  (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6080  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6081  }
6082  }
6083  }
6084 
6085  if( i < M )
6086  {
6087  size_t j( 0UL );
6088 
6089  for( ; (j+4UL) <= N; j+=4UL )
6090  {
6091  const size_t kbegin( ( IsUpper<MT4>::value )
6092  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6093  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6094  const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
6095 
6096  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6097  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6098 
6099  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6100  size_t k( kbegin );
6101 
6102  for( ; k<kpos; k+=IT::size ) {
6103  const IntrinsicType a1( A.load(i,k) );
6104  xmm1 = xmm1 + a1 * B.load(k,j );
6105  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6106  xmm3 = xmm3 + a1 * B.load(k,j+2UL);
6107  xmm4 = xmm4 + a1 * B.load(k,j+3UL);
6108  }
6109 
6110  (~C)(i,j ) += sum( xmm1 ) * scalar;
6111  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
6112  (~C)(i,j+2UL) += sum( xmm3 ) * scalar;
6113  (~C)(i,j+3UL) += sum( xmm4 ) * scalar;
6114 
6115  for( ; remainder && k<kend; ++k ) {
6116  (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6117  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6118  (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
6119  (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
6120  }
6121  }
6122 
6123  for( ; (j+2UL) <= N; j+=2UL )
6124  {
6125  const size_t kbegin( ( IsUpper<MT4>::value )
6126  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6127  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6128  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
6129 
6130  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6131  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6132 
6133  IntrinsicType xmm1, xmm2;
6134  size_t k( kbegin );
6135 
6136  for( ; k<kpos; k+=IT::size ) {
6137  const IntrinsicType a1( A.load(i,k) );
6138  xmm1 = xmm1 + a1 * B.load(k,j );
6139  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6140  }
6141 
6142  (~C)(i,j ) += sum( xmm1 ) * scalar;
6143  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
6144 
6145  for( ; remainder && k<kend; ++k ) {
6146  (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6147  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6148  }
6149  }
6150 
6151  if( j < N )
6152  {
6153  const size_t kbegin( ( IsUpper<MT4>::value )
6154  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6155  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6156 
6157  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
6158  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
6159 
6160  IntrinsicType xmm1;
6161  size_t k( kbegin );
6162 
6163  for( ; k<kpos; k+=IT::size ) {
6164  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
6165  }
6166 
6167  (~C)(i,j) += sum( xmm1 ) * scalar;
6168 
6169  for( ; remainder && k<K; ++k ) {
6170  (~C)(i,j) += A(i,k) * B(k,j) * scalar;
6171  }
6172  }
6173  }
6174  }
6175  //**********************************************************************************************
6176 
6177  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
      /*!\brief Vectorized default addition assignment of a small, scaled matrix product to a
      //        column-major dense matrix.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // Each element C(i,j) is incremented by \a scalar times the sum over k of A(i,k)*B(k,j).
      // The result is computed in tiles of 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 elements of C. Inside a
      // tile the k range is traversed in steps of IT::size via \c load(); the lanes of every
      // accumulator are then reduced with \c sum(), scaled and added to C. If at least one operand
      // is not padded, the trailing k indices are handled by a scalar loop. The overload is only
      // available if UseVectorizedDefaultKernel holds for the given types.
      */
6192  template< typename MT3 // Type of the left-hand side target matrix
6193  , typename MT4 // Type of the left-hand side matrix operand
6194  , typename MT5 // Type of the right-hand side matrix operand
6195  , typename ST2 > // Type of the scalar value
6196  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6197  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6198  {
6199  typedef IntrinsicTrait<ElementType> IT;
6200 
6201  const size_t M( A.rows() );
6202  const size_t N( B.columns() );
6203  const size_t K( A.columns() );
6204 
      // A scalar clean-up loop over k is needed as soon as one of the two operands is not padded.
6205  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
6206 
6207  size_t i( 0UL );
6208 
      // Blocks of four rows of A.
6209  for( ; (i+4UL) <= M; i+=4UL )
6210  {
6211  size_t j( 0UL );
6212 
      // 4x2 tile of C: four rows of A combined with two columns of B.
6213  for( ; (j+2UL) <= N; j+=2UL )
6214  {
      // First k index: 0 for general operands; for an upper A and/or a lower B it is i, j or
      // max(i,j), rounded down to a multiple of IT::size.
6215  const size_t kbegin( ( IsUpper<MT4>::value )
6216  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6217  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
      // One-past-the-last k index: limited to i+4 for a lower A and to j+2 for an upper B,
      // otherwise K.
6218  const size_t kend( ( IsLower<MT4>::value )
6219  ?( IsUpper<MT5>::value ? min( i+4UL, j+2UL ) : ( i+4UL ) )
6220  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
6221 
      // End of the vectorized range: kend rounded down to a multiple of IT::size if a remainder
      // loop follows, else kend itself.
6222  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6223  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6224 
      // NOTE(review): the accumulators are used without explicit initialization; this presumably
      // relies on IntrinsicType default-constructing to zero - confirm.
6225  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6226  size_t k( kbegin );
6227 
6228  for( ; k<kpos; k+=IT::size ) {
6229  const IntrinsicType a1( A.load(i ,k) );
6230  const IntrinsicType a2( A.load(i+1UL,k) );
6231  const IntrinsicType a3( A.load(i+2UL,k) );
6232  const IntrinsicType a4( A.load(i+3UL,k) );
6233  const IntrinsicType b1( B.load(k,j ) );
6234  const IntrinsicType b2( B.load(k,j+1UL) );
6235  xmm1 = xmm1 + a1 * b1;
6236  xmm2 = xmm2 + a1 * b2;
6237  xmm3 = xmm3 + a2 * b1;
6238  xmm4 = xmm4 + a2 * b2;
6239  xmm5 = xmm5 + a3 * b1;
6240  xmm6 = xmm6 + a3 * b2;
6241  xmm7 = xmm7 + a4 * b1;
6242  xmm8 = xmm8 + a4 * b2;
6243  }
6244 
      // Reduce each accumulator, scale it and add it to its element of C.
6245  (~C)(i ,j ) += sum( xmm1 ) * scalar;
6246  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
6247  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
6248  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
6249  (~C)(i+2UL,j ) += sum( xmm5 ) * scalar;
6250  (~C)(i+2UL,j+1UL) += sum( xmm6 ) * scalar;
6251  (~C)(i+3UL,j ) += sum( xmm7 ) * scalar;
6252  (~C)(i+3UL,j+1UL) += sum( xmm8 ) * scalar;
6253 
      // Scalar treatment of the k indices in [kpos,kend); skipped if both operands are padded.
6254  for( ; remainder && k<kend; ++k ) {
6255  (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6256  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6257  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6258  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6259  (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
6260  (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
6261  (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
6262  (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
6263  }
6264  }
6265 
      // 4x1 tile of C: the single remaining column of B (odd N).
6266  if( j < N )
6267  {
6268  const size_t kbegin( ( IsUpper<MT4>::value )
6269  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6270  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6271  const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
6272 
6273  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6274  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6275 
6276  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6277  size_t k( kbegin );
6278 
6279  for( ; k<kpos; k+=IT::size ) {
6280  const IntrinsicType b1( B.load(k,j) );
6281  xmm1 = xmm1 + A.load(i ,k) * b1;
6282  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6283  xmm3 = xmm3 + A.load(i+2UL,k) * b1;
6284  xmm4 = xmm4 + A.load(i+3UL,k) * b1;
6285  }
6286 
6287  (~C)(i ,j) += sum( xmm1 ) * scalar;
6288  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
6289  (~C)(i+2UL,j) += sum( xmm3 ) * scalar;
6290  (~C)(i+3UL,j) += sum( xmm4 ) * scalar;
6291 
6292  for( ; remainder && k<kend; ++k ) {
6293  (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6294  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6295  (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
6296  (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
6297  }
6298  }
6299  }
6300 
      // Blocks of two rows of A (at most one such block is left after the four-row loop).
6301  for( ; (i+2UL) <= M; i+=2UL )
6302  {
6303  size_t j( 0UL );
6304 
      // 2x2 tile of C.
6305  for( ; (j+2UL) <= N; j+=2UL )
6306  {
6307  const size_t kbegin( ( IsUpper<MT4>::value )
6308  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6309  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6310  const size_t kend( ( IsLower<MT4>::value )
6311  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
6312  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
6313 
6314  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6315  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6316 
6317  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6318  size_t k( kbegin );
6319 
6320  for( ; k<kpos; k+=IT::size ) {
6321  const IntrinsicType a1( A.load(i ,k) );
6322  const IntrinsicType a2( A.load(i+1UL,k) );
6323  const IntrinsicType b1( B.load(k,j ) );
6324  const IntrinsicType b2( B.load(k,j+1UL) );
6325  xmm1 = xmm1 + a1 * b1;
6326  xmm2 = xmm2 + a1 * b2;
6327  xmm3 = xmm3 + a2 * b1;
6328  xmm4 = xmm4 + a2 * b2;
6329  }
6330 
6331  (~C)(i ,j ) += sum( xmm1 ) * scalar;
6332  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
6333  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
6334  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
6335 
6336  for( ; remainder && k<kend; ++k ) {
6337  (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6338  (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6339  (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6340  (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6341  }
6342  }
6343 
      // 2x1 tile of C.
6344  if( j < N )
6345  {
6346  const size_t kbegin( ( IsUpper<MT4>::value )
6347  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6348  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6349  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
6350 
6351  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6352  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6353 
6354  IntrinsicType xmm1, xmm2;
6355  size_t k( kbegin );
6356 
6357  for( ; k<kpos; k+=IT::size ) {
6358  const IntrinsicType b1( B.load(k,j) );
6359  xmm1 = xmm1 + A.load(i ,k) * b1;
6360  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6361  }
6362 
6363  (~C)(i ,j) += sum( xmm1 ) * scalar;
6364  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
6365 
6366  for( ; remainder && k<kend; ++k ) {
6367  (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6368  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6369  }
6370  }
6371  }
6372 
      // Single remaining row of A (odd M).
6373  if( i < M )
6374  {
6375  size_t j( 0UL );
6376 
      // 1x2 tile of C; here the upper k bound depends on B only.
6377  for( ; (j+2UL) <= N; j+=2UL )
6378  {
6379  const size_t kbegin( ( IsUpper<MT4>::value )
6380  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6381  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6382  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
6383 
6384  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6385  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6386 
6387  IntrinsicType xmm1, xmm2;
6388  size_t k( kbegin );
6389 
6390  for( ; k<kpos; k+=IT::size ) {
6391  const IntrinsicType a1( A.load(i,k) );
6392  xmm1 = xmm1 + a1 * B.load(k,j );
6393  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6394  }
6395 
6396  (~C)(i,j ) += sum( xmm1 ) * scalar;
6397  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
6398 
6399  for( ; remainder && k<kend; ++k ) {
6400  (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6401  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6402  }
6403  }
6404 
      // 1x1 tile of C: the k range always extends to K.
6405  if( j < N )
6406  {
6407  const size_t kbegin( ( IsUpper<MT4>::value )
6408  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6409  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6410 
6411  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
6412  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
6413 
6414  IntrinsicType xmm1;
6415  size_t k( kbegin );
6416 
6417  for( ; k<kpos; k+=IT::size ) {
6418  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
6419  }
6420 
6421  (~C)(i,j) += sum( xmm1 ) * scalar;
6422 
6423  for( ; remainder && k<K; ++k ) {
6424  (~C)(i,j) += A(i,k) * B(k,j) * scalar;
6425  }
6426  }
6427  }
6428  }
6429  //**********************************************************************************************
6430 
6431  //**Default addition assignment to dense matrices (large matrices)******************************
6445  template< typename MT3 // Type of the left-hand side target matrix
6446  , typename MT4 // Type of the left-hand side matrix operand
6447  , typename MT5 // Type of the right-hand side matrix operand
6448  , typename ST2 > // Type of the scalar value
6449  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6450  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6451  {
6452  selectDefaultAddAssignKernel( C, A, B, scalar );
6453  }
6454  //**********************************************************************************************
6455 
6456  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
6471  template< typename MT3 // Type of the left-hand side target matrix
6472  , typename MT4 // Type of the left-hand side matrix operand
6473  , typename MT5 // Type of the right-hand side matrix operand
6474  , typename ST2 > // Type of the scalar value
6475  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6476  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6477  {
6478  // TODO
6479  selectSmallAddAssignKernel( ~C, A, B, scalar );
6480  }
6481  //**********************************************************************************************
6482 
6483  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
6498  template< typename MT3 // Type of the left-hand side target matrix
6499  , typename MT4 // Type of the left-hand side matrix operand
6500  , typename MT5 // Type of the right-hand side matrix operand
6501  , typename ST2 > // Type of the scalar value
6502  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6503  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6504  {
6505  // TODO
6506  selectSmallAddAssignKernel( ~C, A, B, scalar );
6507  }
6508  //**********************************************************************************************
6509 
6510  //**BLAS-based addition assignment to dense matrices (default)**********************************
6524  template< typename MT3 // Type of the left-hand side target matrix
6525  , typename MT4 // Type of the left-hand side matrix operand
6526  , typename MT5 // Type of the right-hand side matrix operand
6527  , typename ST2 > // Type of the scalar value
6528  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6529  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6530  {
6531  selectLargeAddAssignKernel( C, A, B, scalar );
6532  }
6533  //**********************************************************************************************
6534 
6535  //**BLAS-based addition assignment to dense matrices********************************************
6536 #if BLAZE_BLAS_MODE
6537 
6550  template< typename MT3 // Type of the left-hand side target matrix
6551  , typename MT4 // Type of the left-hand side matrix operand
6552  , typename MT5 // Type of the right-hand side matrix operand
6553  , typename ST2 > // Type of the scalar value
6554  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6555  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6556  {
6557  typedef typename MT3::ElementType ET;
6558 
6559  if( IsTriangular<MT4>::value ) {
6560  typename MT3::ResultType tmp( serial( B ) );
6561  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6562  addAssign( C, tmp );
6563  }
6564  else if( IsTriangular<MT5>::value ) {
6565  typename MT3::ResultType tmp( serial( A ) );
6566  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6567  addAssign( C, tmp );
6568  }
6569  else {
6570  gemm( C, A, B, ET(scalar), ET(1) );
6571  }
6572  }
6573 #endif
6574  //**********************************************************************************************
6575 
6576  //**Addition assignment to sparse matrices******************************************************
6577  // No special implementation for the addition assignment to sparse matrices.
6578  //**********************************************************************************************
6579 
6580  //**Subtraction assignment to dense matrices****************************************************
6592  template< typename MT // Type of the target dense matrix
6593  , bool SO > // Storage order of the target dense matrix
6594  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6595  {
6597 
6598  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6599  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6600 
6601  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6602  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
6603 
6604  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6605  return;
6606  }
6607 
6608  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6609  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6610 
6611  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6612  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6613  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6614  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6615  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6616  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6617 
6618  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
6619  }
6620  //**********************************************************************************************
6621 
6622  //**Subtraction assignment to dense matrices (kernel selection)*********************************
6633  template< typename MT3 // Type of the left-hand side target matrix
6634  , typename MT4 // Type of the left-hand side matrix operand
6635  , typename MT5 // Type of the right-hand side matrix operand
6636  , typename ST2 > // Type of the scalar value
6637  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6638  {
6639  if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
6640  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
6641  selectSmallSubAssignKernel( C, A, B, scalar );
6642  else
6643  selectBlasSubAssignKernel( C, A, B, scalar );
6644  }
6645  //**********************************************************************************************
6646 
6647  //**Default subtraction assignment to dense matrices (general/general)**************************
6661  template< typename MT3 // Type of the left-hand side target matrix
6662  , typename MT4 // Type of the left-hand side matrix operand
6663  , typename MT5 // Type of the right-hand side matrix operand
6664  , typename ST2 > // Type of the scalar value
6665  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6666  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6667  {
6668  const ResultType tmp( serial( A * B * scalar ) );
6669  subAssign( C, tmp );
6670  }
6671  //**********************************************************************************************
6672 
6673  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
6687  template< typename MT3 // Type of the left-hand side target matrix
6688  , typename MT4 // Type of the left-hand side matrix operand
6689  , typename MT5 // Type of the right-hand side matrix operand
6690  , typename ST2 > // Type of the scalar value
6691  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6692  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6693  {
6694  const size_t M( A.rows() );
6695  const size_t N( B.columns() );
6696 
6697  for( size_t i=0UL; i<M; ++i )
6698  {
6699  const size_t jbegin( ( IsUpper<MT4>::value )
6700  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6701  :( 0UL ) );
6702  const size_t jend( ( IsLower<MT4>::value )
6703  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
6704  :( N ) );
6705  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6706 
6707  const size_t jnum( jend - jbegin );
6708  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6709 
6710  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6711  (~C)(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6712  (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6713  }
6714  if( jpos < jend ) {
6715  (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
6716  }
6717  }
6718  }
6719  //**********************************************************************************************
6720 
6721  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
6735  template< typename MT3 // Type of the left-hand side target matrix
6736  , typename MT4 // Type of the left-hand side matrix operand
6737  , typename MT5 // Type of the right-hand side matrix operand
6738  , typename ST2 > // Type of the scalar value
6739  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6740  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6741  {
6742  const size_t M( A.rows() );
6743  const size_t N( B.columns() );
6744 
6745  const size_t block( BLOCK_SIZE );
6746 
6747  for( size_t jj=0UL; jj<N; jj+=block ) {
6748  const size_t jend( min( N, jj+block ) );
6749  for( size_t ii=0UL; ii<M; ii+=block ) {
6750  const size_t iend( min( M, ii+block ) );
6751  for( size_t j=jj; j<jend; ++j )
6752  {
6753  const size_t ibegin( ( IsLower<MT4>::value )
6754  ?( max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
6755  :( ii ) );
6756  const size_t ipos( ( IsUpper<MT4>::value )
6757  ?( min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
6758  :( iend ) );
6759 
6760  for( size_t i=ibegin; i<ipos; ++i ) {
6761  (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
6762  }
6763  }
6764  }
6765  }
6766  }
6767  //**********************************************************************************************
6768 
6769  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
6784  template< typename MT3 // Type of the left-hand side target matrix
6785  , typename MT4 // Type of the left-hand side matrix operand
6786  , typename MT5 // Type of the right-hand side matrix operand
6787  , typename ST2 > // Type of the scalar value
6788  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6789  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6790  {
6791  const size_t M( A.rows() );
6792  const size_t N( B.columns() );
6793 
6794  const size_t block( BLOCK_SIZE );
6795 
6796  for( size_t ii=0UL; ii<M; ii+=block ) {
6797  const size_t iend( min( M, ii+block ) );
6798  for( size_t jj=0UL; jj<N; jj+=block ) {
6799  const size_t jend( min( N, jj+block ) );
6800  for( size_t i=ii; i<iend; ++i )
6801  {
6802  const size_t jbegin( ( IsUpper<MT5>::value )
6803  ?( max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
6804  :( jj ) );
6805  const size_t jpos( ( IsLower<MT5>::value )
6806  ?( min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
6807  :( jend ) );
6808 
6809  for( size_t j=jbegin; j<jpos; ++j ) {
6810  (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
6811  }
6812  }
6813  }
6814  }
6815  }
6816  //**********************************************************************************************
6817 
6818  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
6833  template< typename MT3 // Type of the left-hand side target matrix
6834  , typename MT4 // Type of the left-hand side matrix operand
6835  , typename MT5 // Type of the right-hand side matrix operand
6836  , typename ST2 > // Type of the scalar value
6837  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6838  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6839  {
6840  const size_t M( A.rows() );
6841  const size_t N( B.columns() );
6842 
6843  for( size_t j=0UL; j<N; ++j )
6844  {
6845  const size_t ibegin( ( IsLower<MT5>::value )
6846  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6847  :( 0UL ) );
6848  const size_t iend( ( IsUpper<MT5>::value )
6849  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
6850  :( M ) );
6851  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6852 
6853  const size_t inum( iend - ibegin );
6854  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6855 
6856  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6857  (~C)(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
6858  (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6859  }
6860  if( ipos < iend ) {
6861  (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
6862  }
6863  }
6864  }
6865  //**********************************************************************************************
6866 
6867  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
6881  template< typename MT3 // Type of the left-hand side target matrix
6882  , typename MT4 // Type of the left-hand side matrix operand
6883  , typename MT5 // Type of the right-hand side matrix operand
6884  , typename ST2 > // Type of the scalar value
6885  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6886  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6887  {
6888  for( size_t i=0UL; i<A.rows(); ++i ) {
6889  C(i,i) -= A(i,i) * B(i,i) * scalar;
6890  }
6891  }
6892  //**********************************************************************************************
6893 
6894  //**Default subtraction assignment to dense matrices (small matrices)***************************
6908  template< typename MT3 // Type of the left-hand side target matrix
6909  , typename MT4 // Type of the left-hand side matrix operand
6910  , typename MT5 // Type of the right-hand side matrix operand
6911  , typename ST2 > // Type of the scalar value
6912  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6913  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6914  {
6915  selectDefaultSubAssignKernel( C, A, B, scalar );
6916  }
6917  //**********************************************************************************************
6918 
6919  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
6934  template< typename MT3 // Type of the left-hand side target matrix
6935  , typename MT4 // Type of the left-hand side matrix operand
6936  , typename MT5 // Type of the right-hand side matrix operand
6937  , typename ST2 > // Type of the scalar value
6938  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6939  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6940  {
6941  typedef IntrinsicTrait<ElementType> IT;
6942 
6943  const size_t M( A.rows() );
6944  const size_t N( B.columns() );
6945  const size_t K( A.columns() );
6946 
6947  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
6948 
6949  size_t i( 0UL );
6950 
6951  for( ; (i+2UL) <= M; i+=2UL )
6952  {
6953  size_t j( 0UL );
6954 
6955  for( ; (j+4UL) <= N; j+=4UL )
6956  {
6957  const size_t kbegin( ( IsUpper<MT4>::value )
6958  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
6959  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
6960  const size_t kend( ( IsLower<MT4>::value )
6961  ?( IsUpper<MT5>::value ? min( i+2UL, j+4UL ) : ( i+2UL ) )
6962  :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
6963 
6964  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
6965  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
6966 
6967  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6968  size_t k( kbegin );
6969 
6970  for( ; k<kpos; k+=IT::size ) {
6971  const IntrinsicType a1( A.load(i ,k) );
6972  const IntrinsicType a2( A.load(i+1UL,k) );
6973  const IntrinsicType b1( B.load(k,j ) );
6974  const IntrinsicType b2( B.load(k,j+1UL) );
6975  const IntrinsicType b3( B.load(k,j+2UL) );
6976  const IntrinsicType b4( B.load(k,j+3UL) );
6977  xmm1 = xmm1 + a1 * b1;
6978  xmm2 = xmm2 + a1 * b2;
6979  xmm3 = xmm3 + a1 * b3;
6980  xmm4 = xmm4 + a1 * b4;
6981  xmm5 = xmm5 + a2 * b1;
6982  xmm6 = xmm6 + a2 * b2;
6983  xmm7 = xmm7 + a2 * b3;
6984  xmm8 = xmm8 + a2 * b4;
6985  }
6986 
6987  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
6988  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
6989  (~C)(i ,j+2UL) -= sum( xmm3 ) * scalar;
6990  (~C)(i ,j+3UL) -= sum( xmm4 ) * scalar;
6991  (~C)(i+1UL,j ) -= sum( xmm5 ) * scalar;
6992  (~C)(i+1UL,j+1UL) -= sum( xmm6 ) * scalar;
6993  (~C)(i+1UL,j+2UL) -= sum( xmm7 ) * scalar;
6994  (~C)(i+1UL,j+3UL) -= sum( xmm8 ) * scalar;
6995 
6996  for( ; remainder && k<kend; ++k ) {
6997  (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
6998  (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
6999  (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
7000  (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
7001  (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7002  (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7003  (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
7004  (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
7005  }
7006  }
7007 
7008  for( ; (j+2UL) <= N; j+=2UL )
7009  {
7010  const size_t kbegin( ( IsUpper<MT4>::value )
7011  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7012  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7013  const size_t kend( ( IsLower<MT4>::value )
7014  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
7015  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7016 
7017  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
7018  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
7019 
7020  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7021  size_t k( kbegin );
7022 
7023  for( ; k<kpos; k+=IT::size ) {
7024  const IntrinsicType a1( A.load(i ,k) );
7025  const IntrinsicType a2( A.load(i+1UL,k) );
7026  const IntrinsicType b1( B.load(k,j ) );
7027  const IntrinsicType b2( B.load(k,j+1UL) );
7028  xmm1 = xmm1 + a1 * b1;
7029  xmm2 = xmm2 + a1 * b2;
7030  xmm3 = xmm3 + a2 * b1;
7031  xmm4 = xmm4 + a2 * b2;
7032  }
7033 
7034  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
7035  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
7036  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
7037  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
7038 
7039  for( ; remainder && k<kend; ++k ) {
7040  (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7041  (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7042  (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7043  (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7044  }
7045  }
7046 
7047  if( j < N )
7048  {
7049  const size_t kbegin( ( IsUpper<MT4>::value )
7050  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7051  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7052  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
7053 
7054  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
7055  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
7056 
7057  IntrinsicType xmm1, xmm2;
7058  size_t k( kbegin );
7059 
7060  for( ; k<kpos; k+=IT::size ) {
7061  const IntrinsicType b1( B.load(k,j) );
7062  xmm1 = xmm1 + A.load(i ,k) * b1;
7063  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7064  }
7065 
7066  (~C)(i ,j) -= sum( xmm1 ) * scalar;
7067  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
7068 
7069  for( ; remainder && k<kend; ++k ) {
7070  (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7071  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7072  }
7073  }
7074  }
7075 
7076  if( i < M )
7077  {
7078  size_t j( 0UL );
7079 
7080  for( ; (j+4UL) <= N; j+=4UL )
7081  {
7082  const size_t kbegin( ( IsUpper<MT4>::value )
7083  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7084  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7085  const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
7086 
7087  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
7088  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
7089 
7090  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7091  size_t k( kbegin );
7092 
7093  for( ; k<kpos; k+=IT::size ) {
7094  const IntrinsicType a1( A.load(i,k) );
7095  xmm1 = xmm1 + a1 * B.load(k,j );
7096  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7097  xmm3 = xmm3 + a1 * B.load(k,j+2UL);
7098  xmm4 = xmm4 + a1 * B.load(k,j+3UL);
7099  }
7100 
7101  (~C)(i,j ) -= sum( xmm1 ) * scalar;
7102  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
7103  (~C)(i,j+2UL) -= sum( xmm3 ) * scalar;
7104  (~C)(i,j+3UL) -= sum( xmm4 ) * scalar;
7105 
7106  for( ; remainder && k<kend; ++k ) {
7107  (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7108  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7109  (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
7110  (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
7111  }
7112  }
7113 
7114  for( ; (j+2UL) <= N; j+=2UL )
7115  {
7116  const size_t kbegin( ( IsUpper<MT4>::value )
7117  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7118  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7119  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
7120 
7121  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
7122  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
7123 
7124  IntrinsicType xmm1, xmm2;
7125  size_t k( kbegin );
7126 
7127  for( ; k<kpos; k+=IT::size ) {
7128  const IntrinsicType a1( A.load(i,k) );
7129  xmm1 = xmm1 + a1 * B.load(k,j );
7130  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7131  }
7132 
7133  (~C)(i,j ) -= sum( xmm1 ) * scalar;
7134  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
7135 
7136  for( ; remainder && k<kend; ++k ) {
7137  (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7138  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7139  }
7140  }
7141 
7142  if( j < N )
7143  {
7144  const size_t kbegin( ( IsUpper<MT4>::value )
7145  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7146  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7147 
7148  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
7149  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
7150 
7151  IntrinsicType xmm1;
7152  size_t k( kbegin );
7153 
7154  for( ; k<kpos; k+=IT::size ) {
7155  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
7156  }
7157 
7158  (~C)(i,j) -= sum( xmm1 ) * scalar;
7159 
7160  for( ; remainder && k<K; ++k ) {
7161  (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
7162  }
7163  }
7164  }
7165  }
7166  //**********************************************************************************************
7167 
7168  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Small-matrix vectorized kernel for the scaled subtraction assignment C -= ( A * B ) * scalar
// with a column-major target matrix.
//
// Parameters:
//   C      - column-major dense target; updated in place through (~C)(i,j) -= ...
//   A      - left-hand side operand; read as A.load(i,k) in the vector loops and A(i,k) in the
//            scalar tail loops
//   B      - right-hand side operand; read as B.load(k,j) / B(k,j)
//   scalar - factor applied to every accumulated product before it is subtracted from C
//
// Only participates in overload resolution when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// The (i,j) index space is walked in tiles of 4x2, 2x2 and 1x2 elements, each followed by a
// single-column leftover when N is odd. For every tile the k range [kbegin,kend) is narrowed via
// IsLower/IsUpper of the operand types, processed IT::size elements per step up to kpos, and, if
// 'remainder' is set, completed by an element-wise loop from kpos to kend.
//
// NOTE(review): the xmm accumulators are declared without an initializer and immediately used in
// 'xmmN = xmmN + ...'; this presumably relies on IntrinsicType default-constructing to zero -
// confirm against the IntrinsicType definition.
7183  template< typename MT3 // Type of the left-hand side target matrix
7184  , typename MT4 // Type of the left-hand side matrix operand
7185  , typename MT5 // Type of the right-hand side matrix operand
7186  , typename ST2 > // Type of the scalar value
7187  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7188  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7189  {
7190  typedef IntrinsicTrait<ElementType> IT;
7191 
7192  const size_t M( A.rows() );
7193  const size_t N( B.columns() );
7194  const size_t K( A.columns() );
7195 
      // An element-wise tail loop is required unless both operand types are padded.
7196  const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
7197 
7198  size_t i( 0UL );
7199 
      // Row tiles of height 4.
7200  for( ; (i+4UL) <= M; i+=4UL )
7201  {
7202  size_t j( 0UL );
7203 
      // 4x2 tile: rows i..i+3, columns j..j+1.
7204  for( ; (j+2UL) <= N; j+=2UL )
7205  {
      // First k index to visit; the mask size_t(-IT::size) is applied whenever the start depends
      // on i or j.
7206  const size_t kbegin( ( IsUpper<MT4>::value )
7207  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7208  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
      // One past the last k index to visit, bounded by the tile extent for lower A / upper B.
7209  const size_t kend( ( IsLower<MT4>::value )
7210  ?( IsUpper<MT5>::value ? min( i+4UL, j+2UL ) : ( i+4UL ) )
7211  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7212 
      // End of the vectorized part: kend rounded down to a multiple of IT::size if a tail loop
      // is needed (see the assertion below), otherwise kend itself.
7213  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
7214  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
7215 
7216  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7217  size_t k( kbegin );
7218 
      // Accumulate the eight products of the tile, IT::size k-elements per iteration.
7219  for( ; k<kpos; k+=IT::size )
7220  {
7221  const IntrinsicType a1( A.load(i ,k) );
7222  const IntrinsicType a2( A.load(i+1UL,k) );
7223  const IntrinsicType a3( A.load(i+2UL,k) );
7224  const IntrinsicType a4( A.load(i+3UL,k) );
7225  const IntrinsicType b1( B.load(k,j ) );
7226  const IntrinsicType b2( B.load(k,j+1UL) );
7227  xmm1 = xmm1 + a1 * b1;
7228  xmm2 = xmm2 + a1 * b2;
7229  xmm3 = xmm3 + a2 * b1;
7230  xmm4 = xmm4 + a2 * b2;
7231  xmm5 = xmm5 + a3 * b1;
7232  xmm6 = xmm6 + a3 * b2;
7233  xmm7 = xmm7 + a4 * b1;
7234  xmm8 = xmm8 + a4 * b2;
7235  }
7236 
      // Reduce each accumulator with sum(), scale it, and subtract it from the target element.
7237  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
7238  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
7239  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
7240  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
7241  (~C)(i+2UL,j ) -= sum( xmm5 ) * scalar;
7242  (~C)(i+2UL,j+1UL) -= sum( xmm6 ) * scalar;
7243  (~C)(i+3UL,j ) -= sum( xmm7 ) * scalar;
7244  (~C)(i+3UL,j+1UL) -= sum( xmm8 ) * scalar;
7245 
      // Element-wise tail for the k indices in [kpos,kend).
7246  for( ; remainder && k<kend; ++k ) {
7247  (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7248  (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7249  (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7250  (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7251  (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
7252  (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
7253  (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
7254  (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
7255  }
7256  }
7257 
      // 4x1 leftover: the last column when N is odd.
7258  if( j < N )
7259  {
7260  const size_t kbegin( ( IsUpper<MT4>::value )
7261  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7262  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7263  const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
7264 
7265  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
7266  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
7267 
7268  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7269  size_t k( kbegin );
7270 
7271  for( ; k<kpos; k+=IT::size ) {
7272  const IntrinsicType b1( B.load(k,j) );
7273  xmm1 = xmm1 + A.load(i ,k) * b1;
7274  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7275  xmm3 = xmm3 + A.load(i+2UL,k) * b1;
7276  xmm4 = xmm4 + A.load(i+3UL,k) * b1;
7277  }
7278 
7279  (~C)(i ,j) -= sum( xmm1 ) * scalar;
7280  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
7281  (~C)(i+2UL,j) -= sum( xmm3 ) * scalar;
7282  (~C)(i+3UL,j) -= sum( xmm4 ) * scalar;
7283 
7284  for( ; remainder && k<kend; ++k ) {
7285  (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7286  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7287  (~C)(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
7288  (~C)(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
7289  }
7290  }
7291  }
7292 
      // Row tiles of height 2 for the rows left over by the 4-row loop.
7293  for( ; (i+2UL) <= M; i+=2UL )
7294  {
7295  size_t j( 0UL );
7296 
      // 2x2 tile.
7297  for( ; (j+2UL) <= N; j+=2UL )
7298  {
7299  const size_t kbegin( ( IsUpper<MT4>::value )
7300  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7301  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7302  const size_t kend( ( IsLower<MT4>::value )
7303  ?( IsUpper<MT5>::value ? min( i+2UL, j+2UL ) : ( i+2UL ) )
7304  :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7305 
7306  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
7307  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
7308 
7309  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7310  size_t k( kbegin );
7311 
7312  for( ; k<kpos; k+=IT::size ) {
7313  const IntrinsicType a1( A.load(i ,k) );
7314  const IntrinsicType a2( A.load(i+1UL,k) );
7315  const IntrinsicType b1( B.load(k,j ) );
7316  const IntrinsicType b2( B.load(k,j+1UL) );
7317  xmm1 = xmm1 + a1 * b1;
7318  xmm2 = xmm2 + a1 * b2;
7319  xmm3 = xmm3 + a2 * b1;
7320  xmm4 = xmm4 + a2 * b2;
7321  }
7322 
7323  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
7324  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
7325  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
7326  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
7327 
7328  for( ; remainder && k<kend; ++k ) {
7329  (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7330  (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7331  (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7332  (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7333  }
7334  }
7335 
      // 2x1 leftover column.
7336  if( j < N )
7337  {
7338  const size_t kbegin( ( IsUpper<MT4>::value )
7339  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7340  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7341  const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
7342 
7343  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
7344  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
7345 
7346  IntrinsicType xmm1, xmm2;
7347  size_t k( kbegin );
7348 
7349  for( ; k<kpos; k+=IT::size ) {
7350  const IntrinsicType b1( B.load(k,j) );
7351  xmm1 = xmm1 + A.load(i ,k) * b1;
7352  xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7353  }
7354 
7355  (~C)(i ,j) -= sum( xmm1 ) * scalar;
7356  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
7357 
7358  for( ; remainder && k<kend; ++k ) {
7359  (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7360  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7361  }
7362  }
7363  }
7364 
      // Final single row when M is odd.
7365  if( i < M )
7366  {
7367  size_t j( 0UL );
7368 
      // 1x2 tile.
7369  for( ; (j+2UL) <= N; j+=2UL )
7370  {
7371  const size_t kbegin( ( IsUpper<MT4>::value )
7372  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7373  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7374  const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
7375 
7376  const size_t kpos( remainder ? ( kend & size_t(-IT::size) ) : kend );
7377  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (IT::size) ) ) == kpos, "Invalid end calculation" );
7378 
7379  IntrinsicType xmm1, xmm2;
7380  size_t k( kbegin );
7381 
7382  for( ; k<kpos; k+=IT::size ) {
7383  const IntrinsicType a1( A.load(i,k) );
7384  xmm1 = xmm1 + a1 * B.load(k,j );
7385  xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7386  }
7387 
7388  (~C)(i,j ) -= sum( xmm1 ) * scalar;
7389  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
7390 
7391  for( ; remainder && k<kend; ++k ) {
7392  (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7393  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7394  }
7395  }
7396 
      // Single remaining element (i,j); here the k range always runs up to K.
7397  if( j < N )
7398  {
7399  const size_t kbegin( ( IsUpper<MT4>::value )
7400  ?( ( IsLower<MT5>::value ? max( i, j ) : i ) & size_t(-IT::size) )
7401  :( IsLower<MT5>::value ? ( j & size_t(-IT::size) ) : 0UL ) );
7402 
7403  const size_t kpos( remainder ? ( K & size_t(-IT::size) ) : K );
7404  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (IT::size) ) ) == kpos, "Invalid end calculation" );
7405 
7406  IntrinsicType xmm1;
7407  size_t k( kbegin );
7408 
7409  for( ; k<kpos; k+=IT::size ) {
7410  xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
7411  }
7412 
7413  (~C)(i,j) -= sum( xmm1 ) * scalar;
7414 
7415  for( ; remainder && k<K; ++k ) {
7416  (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
7417  }
7418  }
7419  }
7420  }
7421  //**********************************************************************************************
7422 
7423  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Fallback large-matrix subtraction assignment kernel. This overload is only available when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf) and forwards all four
// arguments unchanged to selectDefaultSubAssignKernel.
7437  template< typename MT3 // Type of the left-hand side target matrix
7438  , typename MT4 // Type of the left-hand side matrix operand
7439  , typename MT5 // Type of the right-hand side matrix operand
7440  , typename ST2 > // Type of the scalar value
7441  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7442  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7443  {
7444  selectDefaultSubAssignKernel( C, A, B, scalar );
7445  }
7446  //**********************************************************************************************
7447 
7448  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
// Large-matrix vectorized subtraction assignment kernel for row-major targets
// (DenseMatrix<MT3,false>). Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// No dedicated large-matrix implementation exists yet (see the TODO); the call is delegated to
// selectSmallSubAssignKernel with ~C as target.
7463  template< typename MT3 // Type of the left-hand side target matrix
7464  , typename MT4 // Type of the left-hand side matrix operand
7465  , typename MT5 // Type of the right-hand side matrix operand
7466  , typename ST2 > // Type of the scalar value
7467  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7468  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7469  {
7470  // TODO
7471  selectSmallSubAssignKernel( ~C, A, B, scalar );
7472  }
7473  //**********************************************************************************************
7474 
7475  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
// Large-matrix vectorized subtraction assignment kernel for column-major targets
// (DenseMatrix<MT3,true>). Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// No dedicated large-matrix implementation exists yet (see the TODO); the call is delegated to
// selectSmallSubAssignKernel with ~C as target.
7490  template< typename MT3 // Type of the left-hand side target matrix
7491  , typename MT4 // Type of the left-hand side matrix operand
7492  , typename MT5 // Type of the right-hand side matrix operand
7493  , typename ST2 > // Type of the scalar value
7494  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7495  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7496  {
7497  // TODO
7498  selectSmallSubAssignKernel( ~C, A, B, scalar );
7499  }
7500  //**********************************************************************************************
7501 
7502  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback for the BLAS-based subtraction assignment. This overload is only available when
// UseBlasKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf) and forwards all four arguments
// unchanged to selectLargeSubAssignKernel.
7516  template< typename MT3 // Type of the left-hand side target matrix
7517  , typename MT4 // Type of the left-hand side matrix operand
7518  , typename MT5 // Type of the right-hand side matrix operand
7519  , typename ST2 > // Type of the scalar value
7520  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7521  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7522  {
7523  selectLargeSubAssignKernel( C, A, B, scalar );
7524  }
7525  //**********************************************************************************************
7526 
7527  //**BLAS-based subtraction assignment to dense matrices*****************************************
7528 #if BLAZE_BLAS_MODE
7529 
// BLAS-based subtraction assignment kernel, enabled when UseBlasKernel<MT3,MT4,MT5,ST2> holds.
//
// Three cases are distinguished at compile time:
//   - A is triangular: B is copied into a temporary of the target's ResultType, trmm() is applied
//     to it with A on the left (CblasLeft), and the temporary is subtracted from C.
//   - B is triangular: A is copied into the temporary, trmm() is applied with B on the right
//     (CblasRight), and the temporary is subtracted from C.
//   - otherwise: a single gemm() call with the factors ET(-scalar) and ET(1).
//
// NOTE(review): presumably trmm() overwrites tmp with scalar times the triangular product and
// gemm() computes C = (-scalar)*A*B + 1*C, following the usual BLAS conventions - confirm against
// blaze/math/blas/trmm.h and blaze/math/blas/gemm.h.
7542  template< typename MT3 // Type of the left-hand side target matrix
7543  , typename MT4 // Type of the left-hand side matrix operand
7544  , typename MT5 // Type of the right-hand side matrix operand
7545  , typename ST2 > // Type of the scalar value
7546  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7547  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7548  {
7549  typedef typename MT3::ElementType ET;
7550 
7551  if( IsTriangular<MT4>::value ) {
7552  typename MT3::ResultType tmp( serial( B ) );
7553  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7554  subAssign( C, tmp );
7555  }
7556  else if( IsTriangular<MT5>::value ) {
7557  typename MT3::ResultType tmp( serial( A ) );
7558  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7559  subAssign( C, tmp );
7560  }
7561  else {
      // General case: the sign of the scalar is flipped so that the product is subtracted.
7562  gemm( C, A, B, ET(-scalar), ET(1) );
7563  }
7564  }
7565 #endif
7566  //**********************************************************************************************
7567 
7568  //**Subtraction assignment to sparse matrices***************************************************
7569  // No special implementation for the subtraction assignment to sparse matrices.
7570  //**********************************************************************************************
7571 
7572  //**Multiplication assignment to dense matrices*************************************************
7573  // No special implementation for the multiplication assignment to dense matrices.
7574  //**********************************************************************************************
7575 
7576  //**Multiplication assignment to sparse matrices************************************************
7577  // No special implementation for the multiplication assignment to sparse matrices.
7578  //**********************************************************************************************
7579 
7580  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a scaled dense matrix product (rhs) to a dense matrix (lhs).
//
// Enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
//   lhs - target dense matrix of storage order SO
//   rhs - scaled product expression; its operands are taken from rhs.matrix_ and its factor from
//         rhs.scalar_
//
// Behavior: returns immediately if lhs has no rows or no columns; resets lhs and returns if the
// left operand has no columns; otherwise evaluates both operands into LT/RT objects and forwards
// the expression A * B * rhs.scalar_ to smpAssign.
7595  template< typename MT // Type of the target dense matrix
7596  , bool SO > // Storage order of the target dense matrix
7597  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7598  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7599  {
7601 
7602  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7603  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7604 
7605  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7606  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7607 
      // Empty target: nothing to do. Empty inner dimension: the result is reset instead.
7608  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
7609  return;
7610  }
7611  else if( left.columns() == 0UL ) {
7612  reset( ~lhs );
7613  return;
7614  }
7615 
7616  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7617  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7618 
7619  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7620  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7621  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7622  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7623  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7624  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7625 
7626  smpAssign( ~lhs, A * B * rhs.scalar_ );
7627  }
7628  //**********************************************************************************************
7629 
7630  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a scaled dense matrix product (rhs) to a sparse matrix (lhs).
//
// Enabled when IsEvaluationRequired<MT,MT1,MT2> holds. The expression is first evaluated into a
// temporary of type TmpType, which is then passed to smpAssign together with ~lhs.
//
// NOTE(review): TmpType is SelectType<SO,OppositeType,ResultType>::Type; presumably this picks
// OppositeType when SO is true and ResultType otherwise - confirm against blaze/util/SelectType.h.
7645  template< typename MT // Type of the target sparse matrix
7646  , bool SO > // Storage order of the target sparse matrix
7647  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7648  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7649  {
7651 
7652  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
7653 
7660 
7661  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7662  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7663 
7664  const TmpType tmp( rhs );
7665  smpAssign( ~lhs, tmp );
7666  }
7667  //**********************************************************************************************
7668 
7669  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a scaled dense matrix product (rhs) to a dense matrix (lhs).
//
// Enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
//   lhs - target dense matrix of storage order SO
//   rhs - scaled product expression; its operands are taken from rhs.matrix_ and its factor from
//         rhs.scalar_
//
// Behavior: returns without touching lhs if lhs has no rows or columns or if the left operand has
// no columns; otherwise evaluates both operands into LT/RT objects and forwards the expression
// A * B * rhs.scalar_ to smpAddAssign.
7684  template< typename MT // Type of the target dense matrix
7685  , bool SO > // Storage order of the target dense matrix
7686  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7687  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7688  {
7690 
7691  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7692  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7693 
7694  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7695  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7696 
      // Any empty dimension means there is nothing to add.
7697  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7698  return;
7699  }
7700 
7701  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7702  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7703 
7704  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7705  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7706  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7707  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7708  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7709  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7710 
7711  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
7712  }
7713  //**********************************************************************************************
7714 
7715  //**SMP addition assignment to sparse matrices**************************************************
7716  // No special implementation for the SMP addition assignment to sparse matrices.
7717  //**********************************************************************************************
7718 
7719  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a scaled dense matrix product (rhs) to a dense matrix (lhs).
//
// Enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
//   lhs - target dense matrix of storage order SO
//   rhs - scaled product expression; its operands are taken from rhs.matrix_ and its factor from
//         rhs.scalar_
//
// Behavior: returns without touching lhs if lhs has no rows or columns or if the left operand has
// no columns; otherwise evaluates both operands into LT/RT objects and forwards the expression
// A * B * rhs.scalar_ to smpSubAssign.
7734  template< typename MT // Type of the target dense matrix
7735  , bool SO > // Storage order of the target dense matrix
7736  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7737  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7738  {
7740 
7741  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7742  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7743 
7744  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7745  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7746 
      // Any empty dimension means there is nothing to subtract.
7747  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7748  return;
7749  }
7750 
7751  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7752  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7753 
7754  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7755  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7756  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7757  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7758  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7759  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7760 
7761  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
7762  }
7763  //**********************************************************************************************
7764 
7765  //**SMP subtraction assignment to sparse matrices***********************************************
7766  // No special implementation for the SMP subtraction assignment to sparse matrices.
7767  //**********************************************************************************************
7768 
7769  //**SMP multiplication assignment to dense matrices*********************************************
7770  // No special implementation for the SMP multiplication assignment to dense matrices.
7771  //**********************************************************************************************
7772 
7773  //**SMP multiplication assignment to sparse matrices********************************************
7774  // No special implementation for the SMP multiplication assignment to sparse matrices.
7775  //**********************************************************************************************
7776 
7777  //**Compile time checks*************************************************************************
// Compile-time constraint on the pair (ST, RightOperand); going by the macro name, compilation
// fails unless both name the same type.
7785  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
7786  //**********************************************************************************************
7787 };
7789 //*************************************************************************************************
7790 
7791 
7792 
7793 
7794 //=================================================================================================
7795 //
7796 // GLOBAL BINARY ARITHMETIC OPERATORS
7797 //
7798 //=================================================================================================
7799 
7800 //*************************************************************************************************
// Multiplication operator for a row-major dense matrix with a column-major dense matrix.
//
//   lhs - left-hand side dense matrix of the product
//   rhs - right-hand side dense matrix of the product
//
// Returns a DMatTDMatMultExpr<T1,T2> expression object built from ~lhs and ~rhs; no product is
// computed here. If the number of columns of lhs differs from the number of rows of rhs,
// BLAZE_THROW_INVALID_ARGUMENT is invoked with the message "Matrix sizes do not match".
//
// NOTE(review): the declarator (listing line 7832) was absent from this listing, which left the
// return type directly followed by the function body. It has been restored with the storage
// orders this expression type is named for (row-major lhs, column-major rhs) - confirm against
// the upstream header.
7829 template< typename T1 // Type of the left-hand side dense matrix
7830  , typename T2 > // Type of the right-hand side dense matrix
7831 inline const DMatTDMatMultExpr<T1,T2>
7832  operator*( const DenseMatrix<T1,false>& lhs, const DenseMatrix<T2,true>& rhs )
7833 {
7835 
      // The inner dimensions must agree for the product to be defined.
7836  if( (~lhs).columns() != (~rhs).rows() ) {
7837  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
7838  }
7839 
7840  return DMatTDMatMultExpr<T1,T2>( ~lhs, ~rhs );
7841 }
7842 //*************************************************************************************************
7843 
7844 
7845 
7846 
7847 //=================================================================================================
7848 //
7849 // ROWS SPECIALIZATIONS
7850 //
7851 //=================================================================================================
7852 
7853 //*************************************************************************************************
// Rows trait for the product expression: inherited unchanged from Rows<MT1>, i.e. from the
// left-hand side operand type.
7855 template< typename MT1, typename MT2 >
7856 struct Rows< DMatTDMatMultExpr<MT1,MT2> > : public Rows<MT1>
7857 {};
7859 //*************************************************************************************************
7860 
7861 
7862 
7863 
7864 //=================================================================================================
7865 //
7866 // COLUMNS SPECIALIZATIONS
7867 //
7868 //=================================================================================================
7869 
7870 //*************************************************************************************************
// Columns trait for the product expression: inherited unchanged from Columns<MT2>, i.e. from the
// right-hand side operand type.
7872 template< typename MT1, typename MT2 >
7873 struct Columns< DMatTDMatMultExpr<MT1,MT2> > : public Columns<MT2>
7874 {};
7876 //*************************************************************************************************
7877 
7878 
7879 
7880 
7881 //=================================================================================================
7882 //
7883 // ISALIGNED SPECIALIZATIONS
7884 //
7885 //=================================================================================================
7886 
7887 //*************************************************************************************************
// IsAligned trait for the product expression: true only if IsAligned holds for both operand
// types MT1 and MT2.
7889 template< typename MT1, typename MT2 >
7890 struct IsAligned< DMatTDMatMultExpr<MT1,MT2> >
7891  : public IsTrue< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7892 {};
7894 //*************************************************************************************************
7895 
7896 
7897 
7898 
7899 //=================================================================================================
7900 //
7901 // ISLOWER SPECIALIZATIONS
7902 //
7903 //=================================================================================================
7904 
7905 //*************************************************************************************************
// IsLower trait for the product expression: true only if both operand types are lower
// (the product of two lower triangular matrices is again lower triangular).
7907 template< typename MT1, typename MT2 >
7908 struct IsLower< DMatTDMatMultExpr<MT1,MT2> >
7909  : public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
7910 {};
7912 //*************************************************************************************************
7913 
7914 
7915 
7916 
7917 //=================================================================================================
7918 //
7919 // ISUNILOWER SPECIALIZATIONS
7920 //
7921 //=================================================================================================
7922 
7923 //*************************************************************************************************
// IsUniLower trait for the product expression: true only if IsUniLower holds for both operand
// types MT1 and MT2.
7925 template< typename MT1, typename MT2 >
7926 struct IsUniLower< DMatTDMatMultExpr<MT1,MT2> >
7927  : public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
7928 {};
7930 //*************************************************************************************************
7931 
7932 
7933 
7934 
7935 //=================================================================================================
7936 //
7937 // ISSTRICTLYLOWER SPECIALIZATIONS
7938 //
7939 //=================================================================================================
7940 
7941 //*************************************************************************************************
// IsStrictlyLower trait for the product expression: true if one operand type is strictly lower
// and the other one is lower (either MT1 strictly lower with MT2 lower, or MT2 strictly lower
// with MT1 lower).
7943 template< typename MT1, typename MT2 >
7944 struct IsStrictlyLower< DMatTDMatMultExpr<MT1,MT2> >
7945  : public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7946  , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
7947 {};
7949 //*************************************************************************************************
7950 
7951 
7952 
7953 
7954 //=================================================================================================
7955 //
7956 // ISUPPER SPECIALIZATIONS
7957 //
7958 //=================================================================================================
7959 
7960 //*************************************************************************************************
// IsUpper specialization for the dense matrix/transpose dense matrix multiplication expression.
// MT1 is the type of the left-hand side operand, MT2 the type of the right-hand side operand.
// The trait derives from IsTrue<...> whose argument is true only if both IsUpper<MT1> and
// IsUpper<MT2> hold (the product of two upper triangular matrices is upper triangular).
7962 template< typename MT1, typename MT2 >
7963 struct IsUpper< DMatTDMatMultExpr<MT1,MT2> >
7964  : public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
7965 {};
7967 //*************************************************************************************************
7968 
7969 
7970 
7971 
7972 //=================================================================================================
7973 //
7974 // ISUNIUPPER SPECIALIZATIONS
7975 //
7976 //=================================================================================================
7977 
7978 //*************************************************************************************************
// IsUniUpper specialization for the dense matrix/transpose dense matrix multiplication expression.
// MT1 is the type of the left-hand side operand, MT2 the type of the right-hand side operand.
// The trait is true only if both IsUniUpper<MT1> and IsUniUpper<MT2> hold (the product of two
// upper unitriangular matrices is again upper unitriangular).
7980 template< typename MT1, typename MT2 >
7981 struct IsUniUpper< DMatTDMatMultExpr<MT1,MT2> >
7982  : public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
7983 {};
7985 //*************************************************************************************************
7986 
7987 
7988 
7989 
7990 //=================================================================================================
7991 //
7992 // ISSTRICTLYUPPER SPECIALIZATIONS
7993 //
7994 //=================================================================================================
7995 
7996 //*************************************************************************************************
// IsStrictlyUpper specialization for the dense matrix/transpose dense matrix multiplication
// expression. MT1 is the left-hand side operand type, MT2 the right-hand side operand type.
// The trait is true if one operand is strictly upper triangular and the other one is (at least)
// upper triangular, in either order:
//    ( IsStrictlyUpper<MT1> && IsUpper<MT2> ) || ( IsStrictlyUpper<MT2> && IsUpper<MT1> )
7998 template< typename MT1, typename MT2 >
7999 struct IsStrictlyUpper< DMatTDMatMultExpr<MT1,MT2> >
8000  : public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8001  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
8002 {};
8004 //*************************************************************************************************
8005 
8006 
8007 
8008 
8009 //=================================================================================================
8010 //
8011 // EXPRESSION TRAIT SPECIALIZATIONS
8012 //
8013 //=================================================================================================
8014 
8015 //*************************************************************************************************
// DMatDVecMultExprTrait specialization for the case that the matrix operand is itself a
// DMatTDMatMultExpr, i.e. for expressions of the form (A*B)*v.
//   MT1: type of the left-hand side operand A of the matrix product
//   MT2: type of the right-hand side operand B of the matrix product
//   VT : type of the vector operand v
// If MT1 is a dense row-major matrix, MT2 is a dense column-major matrix and VT is a dense column
// vector, the nested Type is the result type of the restructured expression A*(B*v): the inner
// TDMatDVecMultExprTrait<MT2,VT> yields the type of B*v, which is then combined with MT1 via
// DMatDVecMultExprTrait. In all other cases Type is INVALID_TYPE.
// Note: both alternatives are spelled out as arguments of SelectType, so the nested traits are
// named regardless of the outcome of the condition.
8017 template< typename MT1, typename MT2, typename VT >
8018 struct DMatDVecMultExprTrait< DMatTDMatMultExpr<MT1,MT2>, VT >
8019 {
8020  public:
8021  //**********************************************************************************************
8022  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8023  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8024  IsDenseVector<VT>::value && IsColumnVector<VT>::value
8025  , typename DMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
8026  , INVALID_TYPE >::Type Type;
8027  //**********************************************************************************************
8028 };
8030 //*************************************************************************************************
8031 
8032 
8033 //*************************************************************************************************
// DMatSVecMultExprTrait specialization for the case that the matrix operand is itself a
// DMatTDMatMultExpr, i.e. for expressions of the form (A*B)*v with a sparse vector v.
//   MT1: type of the left-hand side operand A of the matrix product
//   MT2: type of the right-hand side operand B of the matrix product
//   VT : type of the vector operand v
// If MT1 is a dense row-major matrix, MT2 is a dense column-major matrix and VT is a sparse column
// vector, the nested Type is the result type of the restructured expression A*(B*v): the inner
// TDMatSVecMultExprTrait<MT2,VT> yields the type of B*v, which is then combined with MT1 via
// DMatDVecMultExprTrait (the outer trait is the dense-vector one, so the intermediate B*v is
// presumably a dense vector). In all other cases Type is INVALID_TYPE.
8035 template< typename MT1, typename MT2, typename VT >
8036 struct DMatSVecMultExprTrait< DMatTDMatMultExpr<MT1,MT2>, VT >
8037 {
8038  public:
8039  //**********************************************************************************************
8040  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8041  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8042  IsSparseVector<VT>::value && IsColumnVector<VT>::value
8043  , typename DMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
8044  , INVALID_TYPE >::Type Type;
8045  //**********************************************************************************************
8046 };
8048 //*************************************************************************************************
8049 
8050 
8051 //*************************************************************************************************
// TDVecDMatMultExprTrait specialization for the case that the matrix operand is itself a
// DMatTDMatMultExpr, i.e. for expressions of the form v*(A*B) with a dense row vector v.
//   VT : type of the vector operand v
//   MT1: type of the left-hand side operand A of the matrix product
//   MT2: type of the right-hand side operand B of the matrix product
// If VT is a dense row vector, MT1 is a dense row-major matrix and MT2 is a dense column-major
// matrix, the nested Type is the result type of the restructured expression (v*A)*B: the inner
// TDVecDMatMultExprTrait<VT,MT1> yields the type of v*A, which is then combined with MT2 via
// TDVecTDMatMultExprTrait. In all other cases Type is INVALID_TYPE.
8053 template< typename VT, typename MT1, typename MT2 >
8054 struct TDVecDMatMultExprTrait< VT, DMatTDMatMultExpr<MT1,MT2> >
8055 {
8056  public:
8057  //**********************************************************************************************
8058  typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
8059  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8060  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8061  , typename TDVecTDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8062  , INVALID_TYPE >::Type Type;
8063  //**********************************************************************************************
8064 };
8066 //*************************************************************************************************
8067 
8068 
8069 //*************************************************************************************************
// TSVecDMatMultExprTrait specialization for the case that the matrix operand is itself a
// DMatTDMatMultExpr, i.e. for expressions of the form v*(A*B) with a sparse row vector v.
//   VT : type of the vector operand v
//   MT1: type of the left-hand side operand A of the matrix product
//   MT2: type of the right-hand side operand B of the matrix product
// If VT is a sparse row vector, MT1 is a dense row-major matrix and MT2 is a dense column-major
// matrix, the nested Type is the result type of the restructured expression (v*A)*B: the inner
// TSVecDMatMultExprTrait<VT,MT1> yields the type of v*A, which is then combined with MT2 via
// TDVecTDMatMultExprTrait (the outer trait is the dense-vector one, so the intermediate v*A is
// presumably a dense vector). In all other cases Type is INVALID_TYPE.
8071 template< typename VT, typename MT1, typename MT2 >
8072 struct TSVecDMatMultExprTrait< VT, DMatTDMatMultExpr<MT1,MT2> >
8073 {
8074  public:
8075  //**********************************************************************************************
8076  typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
8077  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8078  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8079  , typename TDVecTDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8080  , INVALID_TYPE >::Type Type;
8081  //**********************************************************************************************
8082 };
8084 //*************************************************************************************************
8085 
8086 
8087 //*************************************************************************************************
// SubmatrixExprTrait specialization for DMatTDMatMultExpr.
//   MT1: type of the left-hand side operand of the matrix product
//   MT2: type of the right-hand side operand of the matrix product
//   AF : compile time flag that is forwarded unchanged to the operand traits (presumably the
//        alignment flag of the submatrix; confirm against the SubmatrixExprTrait base template)
// The nested Type is the multiplication result type (MultExprTrait) of the submatrix expression
// types of the two const-qualified operands, i.e. a submatrix of a product is expressed as the
// product of submatrices of its operands.
8089 template< typename MT1, typename MT2, bool AF >
8090 struct SubmatrixExprTrait< DMatTDMatMultExpr<MT1,MT2>, AF >
8091 {
8092  public:
8093  //**********************************************************************************************
8094  typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
8095  , typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
8096  //**********************************************************************************************
8097 };
8099 //*************************************************************************************************
8100 
8101 
8102 //*************************************************************************************************
// RowExprTrait specialization for DMatTDMatMultExpr.
//   MT1: type of the left-hand side operand of the matrix product
//   MT2: type of the right-hand side operand of the matrix product
// The nested Type is the multiplication result type (MultExprTrait) of the row expression type of
// the const-qualified left-hand side operand and the complete right-hand side operand type, i.e.
// a row of the product A*B is expressed as (row of A) * B.
8104 template< typename MT1, typename MT2 >
8105 struct RowExprTrait< DMatTDMatMultExpr<MT1,MT2> >
8106 {
8107  public:
8108  //**********************************************************************************************
8109  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
8110  //**********************************************************************************************
8111 };
8113 //*************************************************************************************************
8114 
8115 
8116 //*************************************************************************************************
// ColumnExprTrait specialization for DMatTDMatMultExpr.
//   MT1: type of the left-hand side operand of the matrix product
//   MT2: type of the right-hand side operand of the matrix product
// The nested Type is the multiplication result type (MultExprTrait) of the complete left-hand
// side operand type and the column expression type of the const-qualified right-hand side
// operand, i.e. a column of the product A*B is expressed as A * (column of B).
8118 template< typename MT1, typename MT2 >
8119 struct ColumnExprTrait< DMatTDMatMultExpr<MT1,MT2> >
8120 {
8121  public:
8122  //**********************************************************************************************
8123  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
8124  //**********************************************************************************************
8125 };
8127 //*************************************************************************************************
8128 
8129 } // namespace blaze
8130 
8131 #endif
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:423
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:223
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1729
Data type constraint.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:231
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A=B*C$).
Definition: DMatDMatMultExpr.h:7820
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
Header file for basic type definitions.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:225
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:277
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:413
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
DMatTDMatMultExpr< MT1, MT2 > This
Type of this DMatTDMatMultExpr instance.
Definition: DMatTDMatMultExpr.h:221
Header file for the And class template.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:333
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:257
Header file for the TDVecSMatMultExprTrait class template.
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:150
Header file for the IsUniLower type trait.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:152
Header file for the IsComplexDouble type trait.
Constraint on the data type.
Header file for the MultExprTrait class template.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:262
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
size_t columns() const
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:359
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:228
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:403
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1682
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:149
Header file for the Not class template.
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatTDMatMultExpr.h:226
size_t rows() const
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:349
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:138
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:391
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:151
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:122
BLAZE_ALWAYS_INLINE int16_t sum(const simd_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:63
Header file for the IsDenseVector type trait.
Header file for all intrinsic functionality.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:369
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:227
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:432
Header file for the complex data type.
Expression object for dense matrix-transpose dense matrix multiplications. The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:142
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:148
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:433
Header file for the IsUpper type trait.
Header file for exception macros.
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:379
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:224
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:237
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:240
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:234
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.