TDMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
56 #include <blaze/math/Functions.h>
57 #include <blaze/math/Intrinsics.h>
58 #include <blaze/math/shims/Reset.h>
#include <blaze/math/traits/MultTrait.h>
95 #include <blaze/system/BLAS.h>
96 #include <blaze/system/Blocking.h>
99 #include <blaze/util/Assert.h>
100 #include <blaze/util/Complex.h>
104 #include <blaze/util/DisableIf.h>
105 #include <blaze/util/EnableIf.h>
106 #include <blaze/util/Exception.h>
107 #include <blaze/util/InvalidType.h>
109 #include <blaze/util/mpl/And.h>
110 #include <blaze/util/mpl/Not.h>
111 #include <blaze/util/mpl/Or.h>
112 #include <blaze/util/SelectType.h>
113 #include <blaze/util/Types.h>
123 
124 
125 namespace blaze {
126 
127 //=================================================================================================
128 //
129 // CLASS TDMATTDMATMULTEXPR
130 //
131 //=================================================================================================
132 
133 //*************************************************************************************************
140 template< typename MT1 // Type of the left-hand side dense matrix
141  , typename MT2 > // Type of the right-hand side dense matrix
142 class TDMatTDMatMultExpr : public DenseMatrix< TDMatTDMatMultExpr<MT1,MT2>, true >
143  , private MatMatMultExpr
144  , private Computation
145 {
146  private:
147  //**Type definitions****************************************************************************
148  typedef typename MT1::ResultType RT1; //!< Result type of the left-hand side dense matrix expression.
149  typedef typename MT2::ResultType RT2; //!< Result type of the right-hand side dense matrix expression.
150  typedef typename RT1::ElementType ET1; //!< Element type of the left-hand side result type.
151  typedef typename RT2::ElementType ET2; //!< Element type of the right-hand side result type.
152  typedef typename MT1::CompositeType CT1; //!< Composite type of the left-hand side dense matrix expression.
153  typedef typename MT2::CompositeType CT2; //!< Composite type of the right-hand side dense matrix expression.
154  //**********************************************************************************************
155 
156  //**********************************************************************************************
159  //**********************************************************************************************
160 
161  //**********************************************************************************************
163  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
164  //**********************************************************************************************
165 
166  //**********************************************************************************************
168 
174  template< typename T1, typename T2, typename T3 >
175  struct CanExploitSymmetry {
176  enum { value = IsRowMajorMatrix<T1>::value &&
177  ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
178  };
180  //**********************************************************************************************
181 
182  //**********************************************************************************************
184 
188  template< typename T1, typename T2, typename T3 >
189  struct IsEvaluationRequired {
190  enum { value = ( evaluateLeft || evaluateRight ) &&
191  CanExploitSymmetry<T1,T2,T3>::value };
192  };
194  //**********************************************************************************************
195 
196  //**********************************************************************************************
198 
201  template< typename T1, typename T2, typename T3 >
202  struct UseBlasKernel {
203  enum { value = BLAZE_BLAS_MODE &&
204  HasMutableDataAccess<T1>::value &&
205  HasConstDataAccess<T2>::value &&
206  HasConstDataAccess<T3>::value &&
207  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
208  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
209  IsBlasCompatible<typename T1::ElementType>::value &&
210  IsBlasCompatible<typename T2::ElementType>::value &&
211  IsBlasCompatible<typename T3::ElementType>::value &&
212  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
213  IsSame< typename T1::ElementType, typename T3::ElementType >::value };
214  };
216  //**********************************************************************************************
217 
218  //**********************************************************************************************
220 
223  template< typename T1, typename T2, typename T3 >
224  struct UseVectorizedDefaultKernel {
225  enum { value = useOptimizedKernels &&
226  !IsDiagonal<T2>::value &&
227  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
228  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
229  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
230  IntrinsicTrait<typename T1::ElementType>::addition &&
231  IntrinsicTrait<typename T1::ElementType>::subtraction &&
232  IntrinsicTrait<typename T1::ElementType>::multiplication };
233  };
235  //**********************************************************************************************
236 
237  public:
238  //**Type definitions****************************************************************************
245  typedef const ElementType ReturnType;
246  typedef const ResultType CompositeType;
247 
249  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
250 
252  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
253 
256 
259  //**********************************************************************************************
260 
261  //**Compilation flags***************************************************************************
263  enum { vectorizable = !IsDiagonal<MT1>::value &&
264  MT1::vectorizable && MT2::vectorizable &&
268 
270  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
271  !evaluateRight && MT2::smpAssignable };
272  //**********************************************************************************************
273 
274  //**Constructor*********************************************************************************
280  explicit inline TDMatTDMatMultExpr( const MT1& lhs, const MT2& rhs )
281  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
282  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
283  {
284  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
285  }
286  //**********************************************************************************************
287 
288  //**Access operator*****************************************************************************
295  inline ReturnType operator()( size_t i, size_t j ) const {
296  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
297  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
298 
299  const size_t kbegin( ( IsUpper<MT1>::value )
300  ?( ( IsLower<MT2>::value )
301  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
302  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
303  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
304  :( ( IsLower<MT2>::value )
305  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
306  :( 0UL ) ) );
307  const size_t kend( ( IsLower<MT1>::value )
308  ?( ( IsUpper<MT2>::value )
309  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
310  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
311  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
312  :( ( IsUpper<MT2>::value )
313  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
314  :( lhs_.columns() ) ) );
315 
316  if( lhs_.columns() == 0UL ||
317  ( ( IsTriangular<MT1>::value || IsTriangular<MT2>::value ) && kbegin >= kend ) )
318  return ElementType();
319 
321  return lhs_(i,i) * rhs_(i,j);
322 
324  return lhs_(i,j) * rhs_(j,j);
325 
326  const size_t knum( kend - kbegin );
327  const size_t kpos( kbegin + ( ( knum - 1UL ) & size_t(-2) ) + 1UL );
328 
329  ElementType tmp( lhs_(i,kbegin) * rhs_(kbegin,j) );
330 
331  for( size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
332  tmp += lhs_(i,k ) * rhs_(k ,j);
333  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
334  }
335  if( kpos < kend ) {
336  tmp += lhs_(i,kpos) * rhs_(kpos,j);
337  }
338 
339  return tmp;
340  }
341  //**********************************************************************************************
342 
343  //**At function*********************************************************************************
351  inline ReturnType at( size_t i, size_t j ) const {
352  if( i >= lhs_.rows() ) {
353  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
354  }
355  if( j >= rhs_.columns() ) {
356  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
357  }
358  return (*this)(i,j);
359  }
360  //**********************************************************************************************
361 
362  //**Rows function*******************************************************************************
367  inline size_t rows() const {
368  return lhs_.rows();
369  }
370  //**********************************************************************************************
371 
372  //**Columns function****************************************************************************
377  inline size_t columns() const {
378  return rhs_.columns();
379  }
380  //**********************************************************************************************
381 
382  //**Left operand access*************************************************************************
387  inline LeftOperand leftOperand() const {
388  return lhs_;
389  }
390  //**********************************************************************************************
391 
392  //**Right operand access************************************************************************
397  inline RightOperand rightOperand() const {
398  return rhs_;
399  }
400  //**********************************************************************************************
401 
402  //**********************************************************************************************
408  template< typename T >
409  inline bool canAlias( const T* alias ) const {
410  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
411  }
412  //**********************************************************************************************
413 
414  //**********************************************************************************************
420  template< typename T >
421  inline bool isAliased( const T* alias ) const {
422  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
423  }
424  //**********************************************************************************************
425 
426  //**********************************************************************************************
431  inline bool isAligned() const {
432  return lhs_.isAligned() && rhs_.isAligned();
433  }
434  //**********************************************************************************************
435 
436  //**********************************************************************************************
441  inline bool canSMPAssign() const {
442  return ( !BLAZE_BLAS_IS_PARALLEL ||
443  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
444  ( columns() > SMP_TDMATTDMATMULT_THRESHOLD );
445  }
446  //**********************************************************************************************
447 
448  private:
449  //**Member variables****************************************************************************
450  LeftOperand lhs_; //!< Left-hand side dense matrix of the multiplication expression.
451  RightOperand rhs_; //!< Right-hand side dense matrix of the multiplication expression.
452  //**********************************************************************************************
453 
454  //**Assignment to dense matrices****************************************************************
467  template< typename MT // Type of the target dense matrix
468  , bool SO > // Storage order of the target dense matrix
469  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
470  assign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
471  {
473 
474  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
475  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
476 
477  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
478  return;
479  }
480  else if( rhs.lhs_.columns() == 0UL ) {
481  reset( ~lhs );
482  return;
483  }
484 
485  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
486  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
487 
488  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
489  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
490  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
491  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
492  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
493  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
494 
495  TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
496  }
498  //**********************************************************************************************
499 
500  //**Assignment to dense matrices (kernel selection)*********************************************
511  template< typename MT3 // Type of the left-hand side target matrix
512  , typename MT4 // Type of the left-hand side matrix operand
513  , typename MT5 > // Type of the right-hand side matrix operand
514  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
515  {
516  if( ( IsDiagonal<MT4>::value ) ||
517  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
518  selectSmallAssignKernel( C, A, B );
519  else
520  selectBlasAssignKernel( C, A, B );
521  }
523  //**********************************************************************************************
524 
525  //**Default assignment to dense matrices (general/general)**************************************
/*!\brief Default assignment of a transpose dense matrix-transpose dense matrix multiplication
//        (\f$ C=A*B \f$) for two non-diagonal operands.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Scalar kernel that processes the target column by column. For every column j the first
// contributing k-index initializes the column (including the explicit reset of rows that
// receive no contribution), all further k-indices accumulate into it. The k- and i-ranges
// are narrowed at compile time according to the triangular structure of B and A.
*/
539  template< typename MT3 // Type of the left-hand side target matrix
540  , typename MT4 // Type of the left-hand side matrix operand
541  , typename MT5 > // Type of the right-hand side matrix operand
542  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
543  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
544  {
545  const size_t M( A.rows() );
546  const size_t N( B.columns() );
547  const size_t K( A.columns() );
548 
549  for( size_t j=0UL; j<N; ++j )
550  {
// Range of k for which B(k,j) is not structurally zero
551  const size_t kbegin( ( IsLower<MT5>::value )
552  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
553  :( 0UL ) );
554  const size_t kend( ( IsUpper<MT5>::value )
555  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
556  :( K ) );
557  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
558 
// Column j of a strictly triangular B may have no contributing k at all: the whole
// target column is reset
559  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
560  for( size_t i=0UL; i<M; ++i ) {
561  reset( (~C)(i,j) );
562  }
563  continue;
564  }
565 
// First k-iteration (k == kbegin): plain assignment initializes the column; rows
// outside [ibegin,iend) that later iterations do not write are reset here
566  {
567  const size_t ibegin( ( IsLower<MT4>::value )
568  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
569  :( 0UL ) );
570  const size_t iend( ( IsUpper<MT4>::value )
571  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
572  :( M ) );
573  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
574 
575  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
576  for( size_t i=0UL; i<ibegin; ++i ) {
577  reset( C(i,j) );
578  }
579  }
580  else if( IsStrictlyLower<MT4>::value ) {
581  reset( C(0UL,j) );
582  }
583  for( size_t i=ibegin; i<iend; ++i ) {
584  C(i,j) = A(i,kbegin) * B(kbegin,j);
585  }
586  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
587  for( size_t i=iend; i<M; ++i ) {
588  reset( C(i,j) );
589  }
590  }
591  else if( IsStrictlyUpper<MT4>::value ) {
592  reset( C(M-1UL,j) );
593  }
594  }
595 
// Remaining k-iterations accumulate into the initialized column
596  for( size_t k=kbegin+1UL; k<kend; ++k )
597  {
598  const size_t ibegin( ( IsLower<MT4>::value )
599  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
600  :( 0UL ) );
601  const size_t iend( ( IsUpper<MT4>::value )
602  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
603  :( M ) );
604  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
605 
606  for( size_t i=ibegin; i<iend; ++i ) {
607  C(i,j) += A(i,k) * B(k,j);
608  }
// For an upper A, row iend receives its first contribution in this iteration and is
// therefore assigned instead of accumulated
609  if( IsUpper<MT4>::value ) {
610  C(iend,j) = A(iend,k) * B(k,j);
611  }
612  }
613  }
614  }
616  //**********************************************************************************************
617 
618  //**Default assignment to dense matrices (general/diagonal)*************************************
632  template< typename MT3 // Type of the left-hand side target matrix
633  , typename MT4 // Type of the left-hand side matrix operand
634  , typename MT5 > // Type of the right-hand side matrix operand
635  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
636  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
637  {
639 
640  const size_t M( A.rows() );
641  const size_t N( B.columns() );
642 
643  for( size_t j=0UL; j<N; ++j )
644  {
645  const size_t ibegin( ( IsLower<MT4>::value )
646  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
647  :( 0UL ) );
648  const size_t iend( ( IsUpper<MT4>::value )
649  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
650  :( M ) );
651  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
652 
653  if( IsLower<MT4>::value ) {
654  for( size_t i=0UL; i<ibegin; ++i ) {
655  reset( C(i,j) );
656  }
657  }
658  for( size_t i=ibegin; i<iend; ++i ) {
659  C(i,j) = A(i,j) * B(j,j);
660  }
661  if( IsUpper<MT4>::value ) {
662  for( size_t i=iend; i<M; ++i ) {
663  reset( C(i,j) );
664  }
665  }
666  }
667  }
669  //**********************************************************************************************
670 
671  //**Default assignment to dense matrices (diagonal/general)*************************************
685  template< typename MT3 // Type of the left-hand side target matrix
686  , typename MT4 // Type of the left-hand side matrix operand
687  , typename MT5 > // Type of the right-hand side matrix operand
688  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
689  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
690  {
692 
693  const size_t M( A.rows() );
694  const size_t N( B.columns() );
695 
696  for( size_t j=0UL; j<N; ++j )
697  {
698  const size_t ibegin( ( IsLower<MT5>::value )
699  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
700  :( 0UL ) );
701  const size_t iend( ( IsUpper<MT5>::value )
702  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
703  :( M ) );
704  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
705 
706  if( IsLower<MT4>::value ) {
707  for( size_t i=0UL; i<ibegin; ++i ) {
708  reset( C(i,j) );
709  }
710  }
711  for( size_t i=ibegin; i<iend; ++i ) {
712  C(i,j) = A(i,i) * B(i,j);
713  }
714  if( IsUpper<MT4>::value ) {
715  for( size_t i=iend; i<M; ++i ) {
716  reset( C(i,j) );
717  }
718  }
719  }
720  }
722  //**********************************************************************************************
723 
724  //**Default assignment to dense matrices (diagonal/diagonal)************************************
738  template< typename MT3 // Type of the left-hand side target matrix
739  , typename MT4 // Type of the left-hand side matrix operand
740  , typename MT5 > // Type of the right-hand side matrix operand
741  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
742  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
743  {
745 
746  reset( C );
747 
748  for( size_t i=0UL; i<A.rows(); ++i ) {
749  C(i,i) = A(i,i) * B(i,i);
750  }
751  }
753  //**********************************************************************************************
754 
755  //**Default assignment to dense matrices (small matrices)***************************************
769  template< typename MT3 // Type of the left-hand side target matrix
770  , typename MT4 // Type of the left-hand side matrix operand
771  , typename MT5 > // Type of the right-hand side matrix operand
772  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
773  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
774  {
775  selectDefaultAssignKernel( C, A, B );
776  }
778  //**********************************************************************************************
779 
780  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
795  template< typename MT3 // Type of the left-hand side target matrix
796  , typename MT4 // Type of the left-hand side matrix operand
797  , typename MT5 > // Type of the right-hand side matrix operand
798  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
799  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
800  {
805 
806  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
807  const typename MT5::OppositeType tmp( serial( B ) );
808  assign( ~C, A * tmp );
809  }
810  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
811  const typename MT4::OppositeType tmp( serial( A ) );
812  assign( ~C, tmp * B );
813  }
814  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
815  const typename MT5::OppositeType tmp( serial( B ) );
816  assign( ~C, A * tmp );
817  }
818  else {
819  const typename MT4::OppositeType tmp( serial( A ) );
820  assign( ~C, tmp * B );
821  }
822  }
824  //**********************************************************************************************
825 
826  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
/*!\brief Vectorized default assignment of a small transpose dense matrix-transpose dense
//        matrix multiplication to a column-major dense matrix (\f$ C=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// The rows of the target are processed in blocks of 8, 4, 2 and 1 times IT::size, followed by
// a scalar loop for the rows at and beyond ipos. Within a block the columns are processed
// singly (8x block) or pairwise with a single-column tail. For every (row block, column)
// combination the range [kbegin,kend) of the inner index is narrowed according to the
// triangular structure of A and B.
// NOTE(review): the accumulators (xmm1, ...) are declared without an initializer and are
// immediately used in "xmm = xmm + ..."; this presumably relies on IntrinsicType being
// default-constructed to zero - confirm.
*/
841  template< typename MT3 // Type of the left-hand side target matrix
842  , typename MT4 // Type of the left-hand side matrix operand
843  , typename MT5 > // Type of the right-hand side matrix operand
844  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
845  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
846  {
847  typedef IntrinsicTrait<ElementType> IT;
848 
849  const size_t M( A.rows() );
850  const size_t N( B.columns() );
851  const size_t K( A.columns() );
852 
// A scalar remainder loop is required unless both the target and the left operand are padded
853  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
854 
// ipos: end of the vectorized row range (M rounded down to a multiple of IT::size if a
// remainder loop is used, M otherwise)
855  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
856  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
857 
858  size_t i( 0UL );
859 
// Row blocks of 8*IT::size, one column at a time
860  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
861  for( size_t j=0UL; j<N; ++j )
862  {
863  const size_t kbegin( ( IsLower<MT5>::value )
864  ?( ( IsUpper<MT4>::value )
865  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
866  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
867  :( IsUpper<MT4>::value ? i : 0UL ) );
868  const size_t kend( ( IsUpper<MT5>::value )
869  ?( ( IsLower<MT4>::value )
870  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
871  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
872  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
873 
874  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
875 
876  for( size_t k=kbegin; k<kend; ++k ) {
877  const IntrinsicType b1( set( B(k,j) ) );
878  xmm1 = xmm1 + A.load(i ,k) * b1;
879  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
880  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
881  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
882  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
883  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
884  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
885  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
886  }
887 
888  (~C).store( i , j, xmm1 );
889  (~C).store( i+IT::size , j, xmm2 );
890  (~C).store( i+IT::size*2UL, j, xmm3 );
891  (~C).store( i+IT::size*3UL, j, xmm4 );
892  (~C).store( i+IT::size*4UL, j, xmm5 );
893  (~C).store( i+IT::size*5UL, j, xmm6 );
894  (~C).store( i+IT::size*6UL, j, xmm7 );
895  (~C).store( i+IT::size*7UL, j, xmm8 );
896  }
897  }
898 
// Row blocks of 4*IT::size, two columns at a time plus a single-column tail
899  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
900  {
901  size_t j( 0UL );
902 
903  for( ; (j+2UL) <= N; j+=2UL )
904  {
905  const size_t kbegin( ( IsLower<MT5>::value )
906  ?( ( IsUpper<MT4>::value )
907  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
908  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
909  :( IsUpper<MT4>::value ? i : 0UL ) );
910  const size_t kend( ( IsUpper<MT5>::value )
911  ?( ( IsLower<MT4>::value )
912  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
913  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
914  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
915 
// xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1
916  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
917 
918  for( size_t k=kbegin; k<kend; ++k ) {
919  const IntrinsicType a1( A.load(i ,k) );
920  const IntrinsicType a2( A.load(i+IT::size ,k) );
921  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
922  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
923  const IntrinsicType b1( set( B(k,j ) ) );
924  const IntrinsicType b2( set( B(k,j+1UL) ) );
925  xmm1 = xmm1 + a1 * b1;
926  xmm2 = xmm2 + a2 * b1;
927  xmm3 = xmm3 + a3 * b1;
928  xmm4 = xmm4 + a4 * b1;
929  xmm5 = xmm5 + a1 * b2;
930  xmm6 = xmm6 + a2 * b2;
931  xmm7 = xmm7 + a3 * b2;
932  xmm8 = xmm8 + a4 * b2;
933  }
934 
935  (~C).store( i , j , xmm1 );
936  (~C).store( i+IT::size , j , xmm2 );
937  (~C).store( i+IT::size*2UL, j , xmm3 );
938  (~C).store( i+IT::size*3UL, j , xmm4 );
939  (~C).store( i , j+1UL, xmm5 );
940  (~C).store( i+IT::size , j+1UL, xmm6 );
941  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
942  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
943  }
944 
// Single remaining column (odd N)
945  if( j < N )
946  {
947  const size_t kbegin( ( IsLower<MT5>::value )
948  ?( ( IsUpper<MT4>::value )
949  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
950  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
951  :( IsUpper<MT4>::value ? i : 0UL ) );
952  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
953 
954  IntrinsicType xmm1, xmm2, xmm3, xmm4;
955 
956  for( size_t k=kbegin; k<kend; ++k ) {
957  const IntrinsicType b1( set( B(k,j) ) );
958  xmm1 = xmm1 + A.load(i ,k) * b1;
959  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
960  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
961  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
962  }
963 
964  (~C).store( i , j, xmm1 );
965  (~C).store( i+IT::size , j, xmm2 );
966  (~C).store( i+IT::size*2UL, j, xmm3 );
967  (~C).store( i+IT::size*3UL, j, xmm4 );
968  }
969  }
970 
// Row blocks of 2*IT::size, two columns at a time plus a single-column tail
971  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
972  {
973  size_t j( 0UL );
974 
975  for( ; (j+2UL) <= N; j+=2UL )
976  {
977  const size_t kbegin( ( IsLower<MT5>::value )
978  ?( ( IsUpper<MT4>::value )
979  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
980  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
981  :( IsUpper<MT4>::value ? i : 0UL ) );
982  const size_t kend( ( IsUpper<MT5>::value )
983  ?( ( IsLower<MT4>::value )
984  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
985  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
986  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
987 
988  IntrinsicType xmm1, xmm2, xmm3, xmm4;
989 
990  for( size_t k=kbegin; k<kend; ++k ) {
991  const IntrinsicType a1( A.load(i ,k) );
992  const IntrinsicType a2( A.load(i+IT::size,k) );
993  const IntrinsicType b1( set( B(k,j ) ) );
994  const IntrinsicType b2( set( B(k,j+1UL) ) );
995  xmm1 = xmm1 + a1 * b1;
996  xmm2 = xmm2 + a2 * b1;
997  xmm3 = xmm3 + a1 * b2;
998  xmm4 = xmm4 + a2 * b2;
999  }
1000 
1001  (~C).store( i , j , xmm1 );
1002  (~C).store( i+IT::size, j , xmm2 );
1003  (~C).store( i , j+1UL, xmm3 );
1004  (~C).store( i+IT::size, j+1UL, xmm4 );
1005  }
1006 
1007  if( j < N )
1008  {
1009  const size_t kbegin( ( IsLower<MT5>::value )
1010  ?( ( IsUpper<MT4>::value )
1011  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1012  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1013  :( IsUpper<MT4>::value ? i : 0UL ) );
1014  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
1015 
1016  IntrinsicType xmm1, xmm2;
1017 
1018  for( size_t k=kbegin; k<kend; ++k ) {
1019  const IntrinsicType b1( set( B(k,j) ) );
1020  xmm1 = xmm1 + A.load(i ,k) * b1;
1021  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
1022  }
1023 
1024  (~C).store( i , j, xmm1 );
1025  (~C).store( i+IT::size, j, xmm2 );
1026  }
1027  }
1028 
// Row blocks of IT::size, two columns at a time plus a single-column tail
1029  for( ; i<ipos; i+=IT::size )
1030  {
1031  size_t j( 0UL );
1032 
1033  for( ; (j+2UL) <= N; j+=2UL )
1034  {
1035  const size_t kbegin( ( IsLower<MT5>::value )
1036  ?( ( IsUpper<MT4>::value )
1037  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1038  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1039  :( IsUpper<MT4>::value ? i : 0UL ) );
1040  const size_t kend( ( IsUpper<MT5>::value )
1041  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1042  :( K ) );
1043 
1044  IntrinsicType xmm1, xmm2;
1045 
1046  for( size_t k=kbegin; k<kend; ++k ) {
1047  const IntrinsicType a1( A.load(i,k) );
1048  xmm1 = xmm1 + a1 * set( B(k,j ) );
1049  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1050  }
1051 
1052  (~C).store( i, j , xmm1 );
1053  (~C).store( i, j+1UL, xmm2 );
1054  }
1055 
1056  if( j < N )
1057  {
1058  const size_t kbegin( ( IsLower<MT5>::value )
1059  ?( ( IsUpper<MT4>::value )
1060  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1061  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1062  :( IsUpper<MT4>::value ? i : 0UL ) );
1063 
1064  IntrinsicType xmm1;
1065 
1066  for( size_t k=kbegin; k<K; ++k ) {
1067  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
1068  }
1069 
1070  (~C).store( i, j, xmm1 );
1071  }
1072  }
1073 
// Scalar processing of the rows from ipos up to M (only if a remainder loop is required)
1074  for( ; remainder && i<M; ++i )
1075  {
1076  size_t j( 0UL );
1077 
1078  for( ; (j+2UL) <= N; j+=2UL )
1079  {
1080  const size_t kbegin( ( IsLower<MT5>::value )
1081  ?( ( IsUpper<MT4>::value )
1082  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1083  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1084  :( IsUpper<MT4>::value ? i : 0UL ) );
1085  const size_t kend( ( IsUpper<MT5>::value )
1086  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1087  :( K ) );
1088 
1089  ElementType value1 = ElementType();
1090  ElementType value2 = ElementType();
1091 
1092  for( size_t k=kbegin; k<kend; ++k ) {
1093  value1 += A(i,k) * B(k,j );
1094  value2 += A(i,k) * B(k,j+1UL);
1095  }
1096 
1097  (~C)(i,j ) = value1;
1098  (~C)(i,j+1UL) = value2;
1099  }
1100 
1101  if( j < N )
1102  {
1103  const size_t kbegin( ( IsLower<MT5>::value )
1104  ?( ( IsUpper<MT4>::value )
1105  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1106  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1107  :( IsUpper<MT4>::value ? i : 0UL ) );
1108 
1109  ElementType value = ElementType();
1110 
1111  for( size_t k=kbegin; k<K; ++k ) {
1112  value += A(i,k) * B(k,j);
1113  }
1114 
1115  (~C)(i,j) = value;
1116  }
1117  }
1118  }
1120  //**********************************************************************************************
1121 
1122  //**Default assignment to dense matrices (large matrices)***************************************
1136  template< typename MT3 // Type of the left-hand side target matrix
1137  , typename MT4 // Type of the left-hand side matrix operand
1138  , typename MT5 > // Type of the right-hand side matrix operand
1139  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1140  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1141  {
1142  selectDefaultAssignKernel( C, A, B );
1143  }
1145  //**********************************************************************************************
1146 
1147  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
1162  template< typename MT3 // Type of the left-hand side target matrix
1163  , typename MT4 // Type of the left-hand side matrix operand
1164  , typename MT5 > // Type of the right-hand side matrix operand
1165  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1166  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1167  {
1168  selectSmallAssignKernel( ~C, A, B );
1169  }
1171  //**********************************************************************************************
1172 
1173  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
// Vectorized large-matrix assignment kernel for a column-major target (storage
// order flag 'true'). The product A*B is computed block-wise: the rows, columns
// and inner dimension are tiled with TDMATTDMATMULT_IBLOCK_SIZE,
// TDMATTDMATMULT_JBLOCK_SIZE and TDMATTDMATMULT_KBLOCK_SIZE. Every (ii,jj) block
// of C is reset first and then accumulated over all k-blocks, i.e. each tile
// loads the current C values, adds the partial products and stores them back.
//
//   C  The target left-hand side dense matrix.
//   A  The left-hand side multiplication operand.
//   B  The right-hand side multiplication operand.
1188  template< typename MT3 // Type of the left-hand side target matrix
1189  , typename MT4 // Type of the left-hand side matrix operand
1190  , typename MT5 > // Type of the right-hand side matrix operand
1191  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1192  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1193  {
1194  typedef IntrinsicTrait<ElementType> IT;
1195 
1196  const size_t M( A.rows() );
1197  const size_t N( B.columns() );
1198  const size_t K( A.columns() );
1199 
  // A scalar clean-up loop over trailing rows is required unless both C and A are padded
1200  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1201 
  // Blocking over the rows of C
1202  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
1203  {
1204  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
1205 
  // ipos: end of the row range processed with SIMD vectors. With a remainder loop it is
  // iend rounded down to a multiple of IT::size (checked by the assertion below).
1206  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1207  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
1208 
  // Blocking over the columns of C
1209  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
1210  {
1211  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
1212 
  // Clear the current block of C; the k-blocks below accumulate into it
1213  for( size_t j=jj; j<jend; ++j ) {
1214  for( size_t i=ii; i<iend; ++i ) {
1215  reset( (~C)(i,j) );
1216  }
1217  }
1218 
  // Blocking over the inner dimension
1219  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
1220  {
1221  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
1222 
1223  size_t i( ii );
1224 
  // Tiles of four SIMD vectors of rows; two columns per step, then a single left-over column
1225  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
1226  {
1227  const size_t i1( i+IT::size );
1228  const size_t i2( i+IT::size*2UL );
1229  const size_t i3( i+IT::size*3UL );
1230 
1231  size_t j( jj );
1232 
1233  for( ; (j+2UL) <= jend; j+=2UL )
1234  {
  // k-range of this tile inside the current k-block, narrowed via the compile-time
  // IsUpper/IsLower traits of the two operands
1235  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1236  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1237  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
1238  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1239 
1240  IntrinsicType xmm1( (~C).load(i ,j ) );
1241  IntrinsicType xmm2( (~C).load(i1,j ) );
1242  IntrinsicType xmm3( (~C).load(i2,j ) );
1243  IntrinsicType xmm4( (~C).load(i3,j ) );
1244  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
1245  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
1246  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
1247  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
1248 
1249  for( size_t k=kbegin; k<kend; ++k ) {
1250  const IntrinsicType a1( A.load(i ,k) );
1251  const IntrinsicType a2( A.load(i1,k) );
1252  const IntrinsicType a3( A.load(i2,k) );
1253  const IntrinsicType a4( A.load(i3,k) );
1254  const IntrinsicType b1( set( B(k,j ) ) );
1255  const IntrinsicType b2( set( B(k,j+1UL) ) );
1256  xmm1 = xmm1 + a1 * b1;
1257  xmm2 = xmm2 + a2 * b1;
1258  xmm3 = xmm3 + a3 * b1;
1259  xmm4 = xmm4 + a4 * b1;
1260  xmm5 = xmm5 + a1 * b2;
1261  xmm6 = xmm6 + a2 * b2;
1262  xmm7 = xmm7 + a3 * b2;
1263  xmm8 = xmm8 + a4 * b2;
1264  }
1265 
1266  (~C).store( i , j , xmm1 );
1267  (~C).store( i1, j , xmm2 );
1268  (~C).store( i2, j , xmm3 );
1269  (~C).store( i3, j , xmm4 );
1270  (~C).store( i , j+1UL, xmm5 );
1271  (~C).store( i1, j+1UL, xmm6 );
1272  (~C).store( i2, j+1UL, xmm7 );
1273  (~C).store( i3, j+1UL, xmm8 );
1274  }
1275 
1276  if( j < jend )
1277  {
1278  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1279  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1280  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
1281  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1282 
1283  IntrinsicType xmm1( (~C).load(i ,j) );
1284  IntrinsicType xmm2( (~C).load(i1,j) );
1285  IntrinsicType xmm3( (~C).load(i2,j) );
1286  IntrinsicType xmm4( (~C).load(i3,j) );
1287 
1288  for( size_t k=kbegin; k<kend; ++k ) {
1289  const IntrinsicType b1( set( B(k,j) ) );
1290  xmm1 = xmm1 + A.load(i ,k) * b1;
1291  xmm2 = xmm2 + A.load(i1,k) * b1;
1292  xmm3 = xmm3 + A.load(i2,k) * b1;
1293  xmm4 = xmm4 + A.load(i3,k) * b1;
1294  }
1295 
1296  (~C).store( i , j, xmm1 );
1297  (~C).store( i1, j, xmm2 );
1298  (~C).store( i2, j, xmm3 );
1299  (~C).store( i3, j, xmm4 );
1300  }
1301  }
1302 
  // Tiles of two SIMD vectors of rows; columns in steps of four, then two, then one
1303  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
1304  {
1305  const size_t i1( i+IT::size );
1306 
1307  size_t j( jj );
1308 
1309  for( ; (j+4UL) <= jend; j+=4UL )
1310  {
1311  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1312  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1313  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
1314  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
1315 
1316  IntrinsicType xmm1( (~C).load(i ,j ) );
1317  IntrinsicType xmm2( (~C).load(i1,j ) );
1318  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
1319  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
1320  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
1321  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
1322  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
1323  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
1324 
1325  for( size_t k=kbegin; k<kend; ++k ) {
1326  const IntrinsicType a1( A.load(i ,k) );
1327  const IntrinsicType a2( A.load(i1,k) );
1328  const IntrinsicType b1( set( B(k,j ) ) );
1329  const IntrinsicType b2( set( B(k,j+1UL) ) );
1330  const IntrinsicType b3( set( B(k,j+2UL) ) );
1331  const IntrinsicType b4( set( B(k,j+3UL) ) );
1332  xmm1 = xmm1 + a1 * b1;
1333  xmm2 = xmm2 + a2 * b1;
1334  xmm3 = xmm3 + a1 * b2;
1335  xmm4 = xmm4 + a2 * b2;
1336  xmm5 = xmm5 + a1 * b3;
1337  xmm6 = xmm6 + a2 * b3;
1338  xmm7 = xmm7 + a1 * b4;
1339  xmm8 = xmm8 + a2 * b4;
1340  }
1341 
1342  (~C).store( i , j , xmm1 );
1343  (~C).store( i1, j , xmm2 );
1344  (~C).store( i , j+1UL, xmm3 );
1345  (~C).store( i1, j+1UL, xmm4 );
1346  (~C).store( i , j+2UL, xmm5 );
1347  (~C).store( i1, j+2UL, xmm6 );
1348  (~C).store( i , j+3UL, xmm7 );
1349  (~C).store( i1, j+3UL, xmm8 );
1350  }
1351 
1352  for( ; (j+2UL) <= jend; j+=2UL )
1353  {
1354  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1355  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1356  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
1357  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1358 
1359  IntrinsicType xmm1( (~C).load(i ,j ) );
1360  IntrinsicType xmm2( (~C).load(i1,j ) );
1361  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
1362  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
1363 
1364  for( size_t k=kbegin; k<kend; ++k ) {
1365  const IntrinsicType a1( A.load(i ,k) );
1366  const IntrinsicType a2( A.load(i1,k) );
1367  const IntrinsicType b1( set( B(k,j ) ) );
1368  const IntrinsicType b2( set( B(k,j+1UL) ) );
1369  xmm1 = xmm1 + a1 * b1;
1370  xmm2 = xmm2 + a2 * b1;
1371  xmm3 = xmm3 + a1 * b2;
1372  xmm4 = xmm4 + a2 * b2;
1373  }
1374 
1375  (~C).store( i , j , xmm1 );
1376  (~C).store( i1, j , xmm2 );
1377  (~C).store( i , j+1UL, xmm3 );
1378  (~C).store( i1, j+1UL, xmm4 );
1379  }
1380 
1381  if( j < jend )
1382  {
1383  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1384  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1385  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
1386  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1387 
1388  IntrinsicType xmm1( (~C).load(i ,j) );
1389  IntrinsicType xmm2( (~C).load(i1,j) );
1390 
1391  for( size_t k=kbegin; k<kend; ++k ) {
1392  const IntrinsicType b1( set( B(k,j) ) );
1393  xmm1 = xmm1 + A.load(i ,k) * b1;
1394  xmm2 = xmm2 + A.load(i1,k) * b1;
1395  }
1396 
1397  (~C).store( i , j, xmm1 );
1398  (~C).store( i1, j, xmm2 );
1399  }
1400  }
1401 
  // Single SIMD vector of rows, processed column by column
1402  for( ; i<ipos; i+=IT::size )
1403  {
1404  for( size_t j=jj; j<jend; ++j )
1405  {
1406  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1407  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1408  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
1409  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1410 
1411  IntrinsicType xmm1( (~C).load(i,j) );
1412 
1413  for( size_t k=kbegin; k<kend; ++k ) {
1414  const IntrinsicType b1( set( B(k,j) ) );
1415  xmm1 = xmm1 + A.load(i,k) * b1;
1416  }
1417 
1418  (~C).store( i, j, xmm1 );
1419  }
1420  }
1421 
  // Scalar handling of the rows in [ipos,iend); only entered when 'remainder' is set
1422  for( ; remainder && i<iend; ++i )
1423  {
1424  for( size_t j=jj; j<jend; ++j )
1425  {
1426  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1427  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1428  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
1429  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1430 
1431  ElementType value( (~C)(i,j) );
1432 
1433  for( size_t k=kbegin; k<kend; ++k ) {
1434  value += A(i,k) * B(k,j);
1435  }
1436 
1437  (~C)(i,j) = value;
1438  }
1439  }
1440  }
1441  }
1442  }
1443  }
1445  //**********************************************************************************************
1446 
1447  //**BLAS-based assignment to dense matrices (default)*******************************************
1461  template< typename MT3 // Type of the left-hand side target matrix
1462  , typename MT4 // Type of the left-hand side matrix operand
1463  , typename MT5 > // Type of the right-hand side matrix operand
1464  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1465  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1466  {
1467  selectLargeAssignKernel( C, A, B );
1468  }
1470  //**********************************************************************************************
1471 
1472  //**BLAS-based assignment to dense matrices*****************************************************
1473 #if BLAZE_BLAS_MODE
1474 
1487  template< typename MT3 // Type of the left-hand side target matrix
1488  , typename MT4 // Type of the left-hand side matrix operand
1489  , typename MT5 > // Type of the right-hand side matrix operand
1490  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1491  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1492  {
1493  typedef typename MT3::ElementType ET;
1494 
1495  if( IsTriangular<MT4>::value ) {
1496  assign( C, B );
1497  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1498  }
1499  else if( IsTriangular<MT5>::value ) {
1500  assign( C, A );
1501  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1502  }
1503  else {
1504  gemm( C, A, B, ET(1), ET(0) );
1505  }
1506  }
1508 #endif
1509  //**********************************************************************************************
1510 
1511  //**Assignment to sparse matrices***************************************************************
1524  template< typename MT // Type of the target sparse matrix
1525  , bool SO > // Storage order of the target sparse matrix
1526  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1527  assign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1528  {
1530 
1531  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
1532 
1539 
1540  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1541  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1542 
1543  const TmpType tmp( serial( rhs ) );
1544  assign( ~lhs, tmp );
1545  }
1547  //**********************************************************************************************
1548 
1549  //**Restructuring assignment to row-major matrices**********************************************
1564  template< typename MT > // Type of the target matrix
1565  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1566  assign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
1567  {
1569 
1571 
1572  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1573  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1574 
1575  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
1576  assign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
1577  else if( IsSymmetric<MT1>::value )
1578  assign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
1579  else
1580  assign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
1581  }
1583  //**********************************************************************************************
1584 
1585  //**Addition assignment to dense matrices*******************************************************
1598  template< typename MT // Type of the target dense matrix
1599  , bool SO > // Storage order of the target dense matrix
1600  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1601  addAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1602  {
1604 
1605  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1606  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1607 
1608  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1609  return;
1610  }
1611 
1612  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1613  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1614 
1615  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1616  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1617  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1618  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1619  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1620  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1621 
1622  TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1623  }
1625  //**********************************************************************************************
1626 
1627  //**Addition assignment to dense matrices (kernel selection)************************************
1638  template< typename MT3 // Type of the left-hand side target matrix
1639  , typename MT4 // Type of the left-hand side matrix operand
1640  , typename MT5 > // Type of the right-hand side matrix operand
1641  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1642  {
1643  if( ( IsDiagonal<MT4>::value ) ||
1644  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1645  selectSmallAddAssignKernel( C, A, B );
1646  else
1647  selectBlasAddAssignKernel( C, A, B );
1648  }
1650  //**********************************************************************************************
1651 
1652  //**Default addition assignment to dense matrices (general/general)*****************************
1666  template< typename MT3 // Type of the left-hand side target matrix
1667  , typename MT4 // Type of the left-hand side matrix operand
1668  , typename MT5 > // Type of the right-hand side matrix operand
1669  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1670  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1671  {
1672  const size_t M( A.rows() );
1673  const size_t N( B.columns() );
1674  const size_t K( A.columns() );
1675 
1676  for( size_t j=0UL; j<N; ++j )
1677  {
1678  const size_t kbegin( ( IsLower<MT5>::value )
1679  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1680  :( 0UL ) );
1681  const size_t kend( ( IsUpper<MT5>::value )
1682  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1683  :( K ) );
1684  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1685 
1686  for( size_t k=kbegin; k<kend; ++k )
1687  {
1688  const size_t ibegin( ( IsLower<MT4>::value )
1689  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
1690  :( 0UL ) );
1691  const size_t iend( ( IsUpper<MT4>::value )
1692  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
1693  :( M ) );
1694  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1695 
1696  const size_t inum( iend - ibegin );
1697  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1698 
1699  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1700  C(i ,j) += A(i ,k) * B(k,j);
1701  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1702  }
1703  if( ipos < iend ) {
1704  C(ipos,j) += A(ipos,k) * B(k,j);
1705  }
1706  }
1707  }
1708  }
1710  //**********************************************************************************************
1711 
1712  //**Default addition assignment to dense matrices (general/diagonal)****************************
1726  template< typename MT3 // Type of the left-hand side target matrix
1727  , typename MT4 // Type of the left-hand side matrix operand
1728  , typename MT5 > // Type of the right-hand side matrix operand
1729  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1730  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1731  {
1733 
1734  const size_t M( A.rows() );
1735  const size_t N( B.columns() );
1736 
1737  for( size_t j=0UL; j<N; ++j )
1738  {
1739  const size_t ibegin( ( IsLower<MT4>::value )
1740  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
1741  :( 0UL ) );
1742  const size_t iend( ( IsUpper<MT4>::value )
1743  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
1744  :( M ) );
1745  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1746 
1747  const size_t inum( iend - ibegin );
1748  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1749 
1750  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1751  C(i ,j) += A(i ,j) * B(j,j);
1752  C(i+1UL,j) += A(i+1UL,j) * B(j,j);
1753  }
1754  if( ipos < iend ) {
1755  C(ipos,j) += A(ipos,j) * B(j,j);
1756  }
1757  }
1758  }
1760  //**********************************************************************************************
1761 
1762  //**Default addition assignment to dense matrices (diagonal/general)****************************
1776  template< typename MT3 // Type of the left-hand side target matrix
1777  , typename MT4 // Type of the left-hand side matrix operand
1778  , typename MT5 > // Type of the right-hand side matrix operand
1779  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
1780  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1781  {
1783 
1784  const size_t M( A.rows() );
1785  const size_t N( B.columns() );
1786 
1787  for( size_t j=0UL; j<N; ++j )
1788  {
1789  const size_t ibegin( ( IsLower<MT5>::value )
1790  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1791  :( 0UL ) );
1792  const size_t iend( ( IsUpper<MT5>::value )
1793  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1794  :( M ) );
1795  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1796 
1797  const size_t inum( iend - ibegin );
1798  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1799 
1800  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1801  C(i ,j) += A(i ,i ) * B(i ,j);
1802  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
1803  }
1804  if( ipos < iend ) {
1805  C(ipos,j) += A(ipos,ipos) * B(ipos,j);
1806  }
1807  }
1808  }
1810  //**********************************************************************************************
1811 
1812  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
1826  template< typename MT3 // Type of the left-hand side target matrix
1827  , typename MT4 // Type of the left-hand side matrix operand
1828  , typename MT5 > // Type of the right-hand side matrix operand
1829  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
1830  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1831  {
1833 
1834  for( size_t i=0UL; i<A.rows(); ++i ) {
1835  C(i,i) += A(i,i) * B(i,i);
1836  }
1837  }
1839  //**********************************************************************************************
1840 
1841  //**Default addition assignment to dense matrices (small matrices)******************************
1855  template< typename MT3 // Type of the left-hand side target matrix
1856  , typename MT4 // Type of the left-hand side matrix operand
1857  , typename MT5 > // Type of the right-hand side matrix operand
1858  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1859  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1860  {
1861  selectDefaultAddAssignKernel( C, A, B );
1862  }
1864  //**********************************************************************************************
1865 
1866  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
1881  template< typename MT3 // Type of the left-hand side target matrix
1882  , typename MT4 // Type of the left-hand side matrix operand
1883  , typename MT5 > // Type of the right-hand side matrix operand
1884  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1885  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1886  {
1891 
1892  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1893  const typename MT5::OppositeType tmp( serial( B ) );
1894  addAssign( ~C, A * tmp );
1895  }
1896  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1897  const typename MT4::OppositeType tmp( serial( A ) );
1898  addAssign( ~C, tmp * B );
1899  }
1900  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
1901  const typename MT5::OppositeType tmp( serial( B ) );
1902  addAssign( ~C, A * tmp );
1903  }
1904  else {
1905  const typename MT4::OppositeType tmp( serial( A ) );
1906  addAssign( ~C, tmp * B );
1907  }
1908  }
1910  //**********************************************************************************************
1911 
1912  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized small-matrix addition assignment kernel for a column-major target
// (storage order flag 'true'). Every tile loads the current values of C into
// SIMD accumulators, adds A.load(i,k) * set( B(k,j) ) for all k of the tile's
// k-range and stores the accumulators back, i.e. the product is added to C.
// The rows are processed in tiles of eight, four, two and one SIMD vector(s),
// followed by a scalar loop for the rows beyond 'ipos'.
//
//   C  The target left-hand side dense matrix.
//   A  The left-hand side multiplication operand.
//   B  The right-hand side multiplication operand.
1927  template< typename MT3 // Type of the left-hand side target matrix
1928  , typename MT4 // Type of the left-hand side matrix operand
1929  , typename MT5 > // Type of the right-hand side matrix operand
1930  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1931  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1932  {
1933  typedef IntrinsicTrait<ElementType> IT;
1934 
1935  const size_t M( A.rows() );
1936  const size_t N( B.columns() );
1937  const size_t K( A.columns() );
1938 
  // A scalar clean-up loop over trailing rows is required unless both C and A are padded
1939  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1940 
  // ipos: end of the row range processed with SIMD vectors. With a remainder loop it is
  // M rounded down to a multiple of IT::size (checked by the assertion below).
1941  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
1942  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
1943 
1944  size_t i( 0UL );
1945 
  // Tiles of eight SIMD vectors of rows, one column per step
1946  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
1947  for( size_t j=0UL; j<N; ++j )
1948  {
  // k-range of this tile, narrowed via the compile-time IsLower/IsUpper (and strict)
  // traits of the two operands
1949  const size_t kbegin( ( IsLower<MT5>::value )
1950  ?( ( IsUpper<MT4>::value )
1951  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1952  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1953  :( IsUpper<MT4>::value ? i : 0UL ) );
1954  const size_t kend( ( IsUpper<MT5>::value )
1955  ?( ( IsLower<MT4>::value )
1956  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1957  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1958  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
1959 
1960  IntrinsicType xmm1( (~C).load(i ,j) );
1961  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
1962  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
1963  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
1964  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
1965  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
1966  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
1967  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
1968 
1969  for( size_t k=kbegin; k<kend; ++k ) {
1970  const IntrinsicType b1( set( B(k,j) ) );
1971  xmm1 = xmm1 + A.load(i ,k) * b1;
1972  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1973  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1974  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1975  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
1976  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
1977  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
1978  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
1979  }
1980 
1981  (~C).store( i , j, xmm1 );
1982  (~C).store( i+IT::size , j, xmm2 );
1983  (~C).store( i+IT::size*2UL, j, xmm3 );
1984  (~C).store( i+IT::size*3UL, j, xmm4 );
1985  (~C).store( i+IT::size*4UL, j, xmm5 );
1986  (~C).store( i+IT::size*5UL, j, xmm6 );
1987  (~C).store( i+IT::size*6UL, j, xmm7 );
1988  (~C).store( i+IT::size*7UL, j, xmm8 );
1989  }
1990  }
1991 
  // Tiles of four SIMD vectors of rows; two columns per step, then a single left-over column
1992  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
1993  {
1994  size_t j( 0UL );
1995 
1996  for( ; (j+2UL) <= N; j+=2UL )
1997  {
1998  const size_t kbegin( ( IsLower<MT5>::value )
1999  ?( ( IsUpper<MT4>::value )
2000  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2001  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2002  :( IsUpper<MT4>::value ? i : 0UL ) );
2003  const size_t kend( ( IsUpper<MT5>::value )
2004  ?( ( IsLower<MT4>::value )
2005  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2006  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2007  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
2008 
2009  IntrinsicType xmm1( (~C).load(i ,j ) );
2010  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
2011  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
2012  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
2013  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
2014  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
2015  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
2016  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
2017 
2018  for( size_t k=kbegin; k<kend; ++k ) {
2019  const IntrinsicType a1( A.load(i ,k) );
2020  const IntrinsicType a2( A.load(i+IT::size ,k) );
2021  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
2022  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
2023  const IntrinsicType b1( set( B(k,j ) ) );
2024  const IntrinsicType b2( set( B(k,j+1UL) ) );
2025  xmm1 = xmm1 + a1 * b1;
2026  xmm2 = xmm2 + a2 * b1;
2027  xmm3 = xmm3 + a3 * b1;
2028  xmm4 = xmm4 + a4 * b1;
2029  xmm5 = xmm5 + a1 * b2;
2030  xmm6 = xmm6 + a2 * b2;
2031  xmm7 = xmm7 + a3 * b2;
2032  xmm8 = xmm8 + a4 * b2;
2033  }
2034 
2035  (~C).store( i , j , xmm1 );
2036  (~C).store( i+IT::size , j , xmm2 );
2037  (~C).store( i+IT::size*2UL, j , xmm3 );
2038  (~C).store( i+IT::size*3UL, j , xmm4 );
2039  (~C).store( i , j+1UL, xmm5 );
2040  (~C).store( i+IT::size , j+1UL, xmm6 );
2041  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
2042  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
2043  }
2044 
2045  if( j < N )
2046  {
2047  const size_t kbegin( ( IsLower<MT5>::value )
2048  ?( ( IsUpper<MT4>::value )
2049  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2050  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2051  :( IsUpper<MT4>::value ? i : 0UL ) );
2052  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
2053 
2054  IntrinsicType xmm1( (~C).load(i ,j) );
2055  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
2056  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
2057  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
2058 
2059  for( size_t k=kbegin; k<kend; ++k ) {
2060  const IntrinsicType b1( set( B(k,j) ) );
2061  xmm1 = xmm1 + A.load(i ,k) * b1;
2062  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
2063  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
2064  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
2065  }
2066 
2067  (~C).store( i , j, xmm1 );
2068  (~C).store( i+IT::size , j, xmm2 );
2069  (~C).store( i+IT::size*2UL, j, xmm3 );
2070  (~C).store( i+IT::size*3UL, j, xmm4 );
2071  }
2072  }
2073 
  // Tiles of two SIMD vectors of rows; two columns per step, then a single left-over column
2074  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
2075  {
2076  size_t j( 0UL );
2077 
2078  for( ; (j+2UL) <= N; j+=2UL )
2079  {
2080  const size_t kbegin( ( IsLower<MT5>::value )
2081  ?( ( IsUpper<MT4>::value )
2082  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2083  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2084  :( IsUpper<MT4>::value ? i : 0UL ) );
2085  const size_t kend( ( IsUpper<MT5>::value )
2086  ?( ( IsLower<MT4>::value )
2087  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2088  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2089  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
2090 
2091  IntrinsicType xmm1( (~C).load(i ,j ) );
2092  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
2093  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2094  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
2095 
2096  for( size_t k=kbegin; k<kend; ++k ) {
2097  const IntrinsicType a1( A.load(i ,k) );
2098  const IntrinsicType a2( A.load(i+IT::size,k) );
2099  const IntrinsicType b1( set( B(k,j ) ) );
2100  const IntrinsicType b2( set( B(k,j+1UL) ) );
2101  xmm1 = xmm1 + a1 * b1;
2102  xmm2 = xmm2 + a2 * b1;
2103  xmm3 = xmm3 + a1 * b2;
2104  xmm4 = xmm4 + a2 * b2;
2105  }
2106 
2107  (~C).store( i , j , xmm1 );
2108  (~C).store( i+IT::size, j , xmm2 );
2109  (~C).store( i , j+1UL, xmm3 );
2110  (~C).store( i+IT::size, j+1UL, xmm4 );
2111  }
2112 
2113  if( j < N )
2114  {
2115  const size_t kbegin( ( IsLower<MT5>::value )
2116  ?( ( IsUpper<MT4>::value )
2117  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2118  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2119  :( IsUpper<MT4>::value ? i : 0UL ) );
2120  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
2121 
2122  IntrinsicType xmm1( (~C).load(i ,j) );
2123  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
2124 
2125  for( size_t k=kbegin; k<kend; ++k ) {
2126  const IntrinsicType b1( set( B(k,j) ) );
2127  xmm1 = xmm1 + A.load(i ,k) * b1;
2128  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
2129  }
2130 
2131  (~C).store( i , j, xmm1 );
2132  (~C).store( i+IT::size, j, xmm2 );
2133  }
2134  }
2135 
  // Single SIMD vector of rows; two columns per step, then a single left-over column
2136  for( ; i<ipos; i+=IT::size )
2137  {
2138  size_t j( 0UL );
2139 
2140  for( ; (j+2UL) <= N; j+=2UL )
2141  {
2142  const size_t kbegin( ( IsLower<MT5>::value )
2143  ?( ( IsUpper<MT4>::value )
2144  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2145  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2146  :( IsUpper<MT4>::value ? i : 0UL ) );
2147  const size_t kend( ( IsUpper<MT5>::value )
2148  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2149  :( K ) );
2150 
2151  IntrinsicType xmm1( (~C).load(i,j ) );
2152  IntrinsicType xmm2( (~C).load(i,j+1UL) );
2153 
2154  for( size_t k=kbegin; k<kend; ++k ) {
2155  const IntrinsicType a1( A.load(i,k) );
2156  xmm1 = xmm1 + a1 * set( B(k,j ) );
2157  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
2158  }
2159 
2160  (~C).store( i, j , xmm1 );
2161  (~C).store( i, j+1UL, xmm2 );
2162  }
2163 
2164  if( j < N )
2165  {
2166  const size_t kbegin( ( IsLower<MT5>::value )
2167  ?( ( IsUpper<MT4>::value )
2168  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2169  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2170  :( IsUpper<MT4>::value ? i : 0UL ) );
2171 
2172  IntrinsicType xmm1( (~C).load(i,j) );
2173 
  // The left-over column runs up to K; no upper bound is derived from the traits here
2174  for( size_t k=kbegin; k<K; ++k ) {
2175  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
2176  }
2177 
2178  (~C).store( i, j, xmm1 );
2179  }
2180  }
2181 
  // Scalar handling of the rows in [ipos,M); only entered when 'remainder' is set
2182  for( ; remainder && i<M; ++i )
2183  {
2184  size_t j( 0UL );
2185 
2186  for( ; (j+2UL) <= N; j+=2UL )
2187  {
2188  const size_t kbegin( ( IsLower<MT5>::value )
2189  ?( ( IsUpper<MT4>::value )
2190  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2191  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2192  :( IsUpper<MT4>::value ? i : 0UL ) );
2193  const size_t kend( ( IsUpper<MT5>::value )
2194  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2195  :( K ) );
2196 
2197  ElementType value1( (~C)(i,j ) );
2198  ElementType value2( (~C)(i,j+1UL) );
2199 
2200  for( size_t k=kbegin; k<kend; ++k ) {
2201  value1 += A(i,k) * B(k,j );
2202  value2 += A(i,k) * B(k,j+1UL);
2203  }
2204 
2205  (~C)(i,j ) = value1;
2206  (~C)(i,j+1UL) = value2;
2207  }
2208 
2209  if( j < N )
2210  {
2211  const size_t kbegin( ( IsLower<MT5>::value )
2212  ?( ( IsUpper<MT4>::value )
2213  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2214  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2215  :( IsUpper<MT4>::value ? i : 0UL ) );
2216 
2217  ElementType value( (~C)(i,j) );
2218 
2219  for( size_t k=kbegin; k<K; ++k ) {
2220  value += A(i,k) * B(k,j);
2221  }
2222 
2223  (~C)(i,j) = value;
2224  }
2225  }
2226  }
2228  //**********************************************************************************************
2229 
2230  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix addition assignment kernel, non-vectorized fallback.
//
// This overload is guarded by DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >, i.e. it is
// the counterpart of the EnableIf-guarded vectorized overloads of the same name. It contains no
// blocking logic of its own and simply forwards to selectDefaultAddAssignKernel().
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
2244  template< typename MT3 // Type of the left-hand side target matrix
2245  , typename MT4 // Type of the left-hand side matrix operand
2246  , typename MT5 > // Type of the right-hand side matrix operand
2247  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2248  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2249  {
      // Delegate unchanged to the default (scalar) addition assignment kernel.
2250  selectDefaultAddAssignKernel( C, A, B );
2251  }
2253  //**********************************************************************************************
2254 
2255  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
// Large-matrix vectorized addition assignment kernel for row-major targets
// (DenseMatrix<MT3,false>).
//
// There is no dedicated blocked implementation for this storage order in this overload; the
// call is forwarded to selectSmallAddAssignKernel().
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
2270  template< typename MT3 // Type of the left-hand side target matrix
2271  , typename MT4 // Type of the left-hand side matrix operand
2272  , typename MT5 > // Type of the right-hand side matrix operand
2273  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2274  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2275  {
      // NOTE(review): ~C presumably converts the DenseMatrix base reference back to the
      // concrete matrix type MT3 before forwarding - confirm against DenseMatrix.
2276  selectSmallAddAssignKernel( ~C, A, B );
2277  }
2279  //**********************************************************************************************
2280 
2281  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
// Large-matrix vectorized addition assignment kernel for column-major targets
// (DenseMatrix<MT3,true>).
//
// Accumulates the product A*B into C: every accumulator is loaded from C, updated with
// A.load(..) * set( B(..) ) terms and stored back to C. The iteration space is traversed in
// blocks of TDMATTDMATMULT_IBLOCK_SIZE rows (ii), TDMATTDMATMULT_JBLOCK_SIZE columns (jj) and
// TDMATTDMATMULT_KBLOCK_SIZE inner indices (kk). Since kk is the innermost block loop, the
// partial sums of one (ii,jj) block are carried through C from one kk block to the next.
//
// Within a block, rows are processed in tiles of 4, 2 and 1 load()/store() units (IT::size rows
// each), followed by an optional scalar pass over the rows that do not fill a whole unit.
//
// C: target matrix (updated in place), A: left-hand side operand, B: right-hand side operand.
// Enabled only via EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >.
2296  template< typename MT3 // Type of the left-hand side target matrix
2297  , typename MT4 // Type of the left-hand side matrix operand
2298  , typename MT5 > // Type of the right-hand side matrix operand
2299  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2300  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2301  {
2302  typedef IntrinsicTrait<ElementType> IT;
2303 
2304  const size_t M( A.rows() );
2305  const size_t N( B.columns() );
2306  const size_t K( A.columns() );
2307 
      // A scalar clean-up pass over trailing rows is required unless both C and A are padded.
2308  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
2309 
      // Row blocks [ii,iend).
2310  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
2311  {
2312  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
2313 
      // End of the row range handled by the vectorized loops. With a remainder pass, iend is
      // rounded down to a multiple of IT::size (the bit mask presumes IT::size is a power of
      // two; the assertion below cross-checks it against the modulo form). Without a remainder
      // pass the vectorized loops run up to iend itself.
2314  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
2315  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
2316 
      // Column blocks [jj,jend).
2317  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
2318  {
2319  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
2320 
      // Inner-dimension blocks [kk,ktmp).
2321  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
2322  {
2323  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
2324 
      // The row cursor i is shared by all tile loops below; each loop continues where the
      // previous one stopped.
2325  size_t i( ii );
2326 
      // Tiles of 4 load/store units of rows.
2327  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
2328  {
2329  const size_t i1( i+IT::size );
2330  const size_t i2( i+IT::size*2UL );
2331  const size_t i3( i+IT::size*3UL );
2332 
2333  size_t j( jj );
2334 
      // 4 units x 2 columns: eight accumulators.
2335  for( ; (j+2UL) <= jend; j+=2UL )
2336  {
      // Clip the k range of the current block by the operands' triangular traits: it starts
      // no earlier than i for an upper A and no earlier than j for a lower B; it ends no
      // later than the last row of the tile for a lower A and no later than the last column
      // of the tile for an upper B.
      // NOTE(review): presumably this skips products with structurally zero elements - confirm.
2337  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2338  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2339  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
2340  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2341 
      // Start from the current contents of C (addition assignment).
2342  IntrinsicType xmm1( (~C).load(i ,j ) );
2343  IntrinsicType xmm2( (~C).load(i1,j ) );
2344  IntrinsicType xmm3( (~C).load(i2,j ) );
2345  IntrinsicType xmm4( (~C).load(i3,j ) );
2346  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
2347  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
2348  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
2349  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
2350 
      // Each loaded part of column k of A is reused for both columns of B.
2351  for( size_t k=kbegin; k<kend; ++k ) {
2352  const IntrinsicType a1( A.load(i ,k) );
2353  const IntrinsicType a2( A.load(i1,k) );
2354  const IntrinsicType a3( A.load(i2,k) );
2355  const IntrinsicType a4( A.load(i3,k) );
2356  const IntrinsicType b1( set( B(k,j ) ) );
2357  const IntrinsicType b2( set( B(k,j+1UL) ) );
2358  xmm1 = xmm1 + a1 * b1;
2359  xmm2 = xmm2 + a2 * b1;
2360  xmm3 = xmm3 + a3 * b1;
2361  xmm4 = xmm4 + a4 * b1;
2362  xmm5 = xmm5 + a1 * b2;
2363  xmm6 = xmm6 + a2 * b2;
2364  xmm7 = xmm7 + a3 * b2;
2365  xmm8 = xmm8 + a4 * b2;
2366  }
2367 
2368  (~C).store( i , j , xmm1 );
2369  (~C).store( i1, j , xmm2 );
2370  (~C).store( i2, j , xmm3 );
2371  (~C).store( i3, j , xmm4 );
2372  (~C).store( i , j+1UL, xmm5 );
2373  (~C).store( i1, j+1UL, xmm6 );
2374  (~C).store( i2, j+1UL, xmm7 );
2375  (~C).store( i3, j+1UL, xmm8 );
2376  }
2377 
      // 4 units x 1 column: left-over column when the block has an odd number of columns.
2378  if( j < jend )
2379  {
2380  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2381  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2382  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
2383  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2384 
2385  IntrinsicType xmm1( (~C).load(i ,j) );
2386  IntrinsicType xmm2( (~C).load(i1,j) );
2387  IntrinsicType xmm3( (~C).load(i2,j) );
2388  IntrinsicType xmm4( (~C).load(i3,j) );
2389 
2390  for( size_t k=kbegin; k<kend; ++k ) {
2391  const IntrinsicType b1( set( B(k,j) ) );
2392  xmm1 = xmm1 + A.load(i ,k) * b1;
2393  xmm2 = xmm2 + A.load(i1,k) * b1;
2394  xmm3 = xmm3 + A.load(i2,k) * b1;
2395  xmm4 = xmm4 + A.load(i3,k) * b1;
2396  }
2397 
2398  (~C).store( i , j, xmm1 );
2399  (~C).store( i1, j, xmm2 );
2400  (~C).store( i2, j, xmm3 );
2401  (~C).store( i3, j, xmm4 );
2402  }
2403  }
2404 
      // Tiles of 2 load/store units of rows.
2405  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
2406  {
2407  const size_t i1( i+IT::size );
2408 
2409  size_t j( jj );
2410 
      // 2 units x 4 columns: eight accumulators.
2411  for( ; (j+4UL) <= jend; j+=4UL )
2412  {
2413  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2414  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2415  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
2416  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
2417 
2418  IntrinsicType xmm1( (~C).load(i ,j ) );
2419  IntrinsicType xmm2( (~C).load(i1,j ) );
2420  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2421  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
2422  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
2423  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
2424  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
2425  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
2426 
2427  for( size_t k=kbegin; k<kend; ++k ) {
2428  const IntrinsicType a1( A.load(i ,k) );
2429  const IntrinsicType a2( A.load(i1,k) );
2430  const IntrinsicType b1( set( B(k,j ) ) );
2431  const IntrinsicType b2( set( B(k,j+1UL) ) );
2432  const IntrinsicType b3( set( B(k,j+2UL) ) );
2433  const IntrinsicType b4( set( B(k,j+3UL) ) );
2434  xmm1 = xmm1 + a1 * b1;
2435  xmm2 = xmm2 + a2 * b1;
2436  xmm3 = xmm3 + a1 * b2;
2437  xmm4 = xmm4 + a2 * b2;
2438  xmm5 = xmm5 + a1 * b3;
2439  xmm6 = xmm6 + a2 * b3;
2440  xmm7 = xmm7 + a1 * b4;
2441  xmm8 = xmm8 + a2 * b4;
2442  }
2443 
2444  (~C).store( i , j , xmm1 );
2445  (~C).store( i1, j , xmm2 );
2446  (~C).store( i , j+1UL, xmm3 );
2447  (~C).store( i1, j+1UL, xmm4 );
2448  (~C).store( i , j+2UL, xmm5 );
2449  (~C).store( i1, j+2UL, xmm6 );
2450  (~C).store( i , j+3UL, xmm7 );
2451  (~C).store( i1, j+3UL, xmm8 );
2452  }
2453 
      // 2 units x 2 columns.
2454  for( ; (j+2UL) <= jend; j+=2UL )
2455  {
2456  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2457  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2458  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
2459  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2460 
2461  IntrinsicType xmm1( (~C).load(i ,j ) );
2462  IntrinsicType xmm2( (~C).load(i1,j ) );
2463  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2464  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
2465 
2466  for( size_t k=kbegin; k<kend; ++k ) {
2467  const IntrinsicType a1( A.load(i ,k) );
2468  const IntrinsicType a2( A.load(i1,k) );
2469  const IntrinsicType b1( set( B(k,j ) ) );
2470  const IntrinsicType b2( set( B(k,j+1UL) ) );
2471  xmm1 = xmm1 + a1 * b1;
2472  xmm2 = xmm2 + a2 * b1;
2473  xmm3 = xmm3 + a1 * b2;
2474  xmm4 = xmm4 + a2 * b2;
2475  }
2476 
2477  (~C).store( i , j , xmm1 );
2478  (~C).store( i1, j , xmm2 );
2479  (~C).store( i , j+1UL, xmm3 );
2480  (~C).store( i1, j+1UL, xmm4 );
2481  }
2482 
      // 2 units x 1 column: left-over column.
2483  if( j < jend )
2484  {
2485  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2486  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2487  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
2488  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2489 
2490  IntrinsicType xmm1( (~C).load(i ,j) );
2491  IntrinsicType xmm2( (~C).load(i1,j) );
2492 
2493  for( size_t k=kbegin; k<kend; ++k ) {
2494  const IntrinsicType b1( set( B(k,j) ) );
2495  xmm1 = xmm1 + A.load(i ,k) * b1;
2496  xmm2 = xmm2 + A.load(i1,k) * b1;
2497  }
2498 
2499  (~C).store( i , j, xmm1 );
2500  (~C).store( i1, j, xmm2 );
2501  }
2502  }
2503 
      // Single load/store unit of rows, one column at a time.
2504  for( ; i<ipos; i+=IT::size )
2505  {
2506  for( size_t j=jj; j<jend; ++j )
2507  {
2508  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2509  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2510  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
2511  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2512 
2513  IntrinsicType xmm1( (~C).load(i,j) );
2514 
2515  for( size_t k=kbegin; k<kend; ++k ) {
2516  const IntrinsicType b1( set( B(k,j) ) );
2517  xmm1 = xmm1 + A.load(i,k) * b1;
2518  }
2519 
2520  (~C).store( i, j, xmm1 );
2521  }
2522  }
2523 
      // Scalar clean-up for the rows [ipos,iend); only runs when remainder is true.
2524  for( ; remainder && i<iend; ++i )
2525  {
2526  for( size_t j=jj; j<jend; ++j )
2527  {
2528  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2529  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2530  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
2531  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2532 
2533  ElementType value( (~C)(i,j) );
2534 
2535  for( size_t k=kbegin; k<kend; ++k ) {
2536  value += A(i,k) * B(k,j);
2537  }
2538 
2539  (~C)(i,j) = value;
2540  }
2541  }
2542  }
2543  }
2544  }
2545  }
2547  //**********************************************************************************************
2548 
2549  //**BLAS-based addition assignment to dense matrices (default)**********************************
// BLAS-based addition assignment kernel, fallback overload.
//
// Guarded by DisableIf< UseBlasKernel<MT3,MT4,MT5> >, i.e. the counterpart of the
// EnableIf-guarded BLAS overload of the same name. No BLAS routine is called here; the work is
// handed to selectLargeAddAssignKernel().
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
2563  template< typename MT3 // Type of the left-hand side target matrix
2564  , typename MT4 // Type of the left-hand side matrix operand
2565  , typename MT5 > // Type of the right-hand side matrix operand
2566  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2567  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2568  {
      // Delegate unchanged to the large-matrix kernel.
2569  selectLargeAddAssignKernel( C, A, B );
2570  }
2572  //**********************************************************************************************
2573 
2574  //**BLAS-based addition assignment to dense matrices********************************************
2575 #if BLAZE_BLAS_MODE
2576 
2589  template< typename MT3 // Type of the left-hand side target matrix
2590  , typename MT4 // Type of the left-hand side matrix operand
2591  , typename MT5 > // Type of the right-hand side matrix operand
2592  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2593  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2594  {
2595  typedef typename MT3::ElementType ET;
2596 
2597  if( IsTriangular<MT4>::value ) {
2598  typename MT3::ResultType tmp( serial( B ) );
2599  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2600  addAssign( C, tmp );
2601  }
2602  else if( IsTriangular<MT5>::value ) {
2603  typename MT3::ResultType tmp( serial( A ) );
2604  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2605  addAssign( C, tmp );
2606  }
2607  else {
2608  gemm( C, A, B, ET(1), ET(1) );
2609  }
2610  }
2612 #endif
2613  //**********************************************************************************************
2614 
2615  //**Restructuring addition assignment to row-major matrices*************************************
// Restructuring addition assignment of the matrix product to a row-major matrix.
//
// Instead of evaluating the product directly, the expression is rewritten with trans() applied
// to one or both operands and the addition assignment is re-dispatched on the rewritten
// expression:
//   both operands symmetric    ->  trans( lhs_ ) * trans( rhs_ )
//   only the left one symmetric ->  trans( lhs_ ) * rhs_
//   otherwise                   ->  lhs_ * trans( rhs_ )
// The overload is enabled only if CanExploitSymmetry<MT,MT1,MT2> is satisfied.
// NOTE(review): the final branch presumably relies on the right operand being symmetric
// whenever CanExploitSymmetry holds and the left operand is not - confirm against the trait.
//
// lhs: target matrix, rhs: the multiplication expression to be added.
2630  template< typename MT > // Type of the target matrix
2631  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
2632  addAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
2633  {
2635 
2637 
      // Target and expression must already agree in size.
2638  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2639  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2640 
2641  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
2642  addAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
2643  else if( IsSymmetric<MT1>::value )
2644  addAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
2645  else
2646  addAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
2647  }
2649  //**********************************************************************************************
2650 
2651  //**Addition assignment to sparse matrices******************************************************
2652  // No special implementation for the addition assignment to sparse matrices.
2653  //**********************************************************************************************
2654 
2655  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of the matrix product to a dense matrix of either storage order.
//
// Steps: verify the sizes, return early for empty problems, evaluate both operands into the
// local objects A and B, and hand over to selectSubAssignKernel().
// The overload is disabled if CanExploitSymmetry<MT,MT1,MT2> is satisfied (DisableIf).
//
// lhs: target dense matrix, rhs: the multiplication expression to be subtracted.
2668  template< typename MT // Type of the target dense matrix
2669  , bool SO > // Storage order of the target dense matrix
2670  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
2671  subAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
2672  {
2674 
2675  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2676  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2677 
      // Nothing to do if the target has no rows or no columns, or if the inner dimension of the
      // product is zero.
2678  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2679  return;
2680  }
2681 
      // LT and RT are defined outside this function. NOTE(review): serial() presumably forces
      // a non-parallel evaluation of the operand - confirm.
2682  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2683  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2684 
      // The evaluated operands must have kept their sizes and must match the target.
2685  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2686  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2687  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2688  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2689  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2690  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2691 
2692  TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
2693  }
2695  //**********************************************************************************************
2696 
2697  //**Subtraction assignment to dense matrices (kernel selection)*********************************
2708  template< typename MT3 // Type of the left-hand side target matrix
2709  , typename MT4 // Type of the left-hand side matrix operand
2710  , typename MT5 > // Type of the right-hand side matrix operand
2711  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2712  {
2713  if( ( IsDiagonal<MT4>::value ) ||
2714  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
2715  selectSmallSubAssignKernel( C, A, B );
2716  else
2717  selectBlasSubAssignKernel( C, A, B );
2718  }
2720  //**********************************************************************************************
2721 
2722  //**Default subtraction assignment to dense matrices (general/general)**************************
2736  template< typename MT3 // Type of the left-hand side target matrix
2737  , typename MT4 // Type of the left-hand side matrix operand
2738  , typename MT5 > // Type of the right-hand side matrix operand
2739  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2740  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2741  {
2742  const size_t M( A.rows() );
2743  const size_t N( B.columns() );
2744  const size_t K( A.columns() );
2745 
2746  for( size_t j=0UL; j<N; ++j )
2747  {
2748  const size_t kbegin( ( IsLower<MT5>::value )
2749  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2750  :( 0UL ) );
2751  const size_t kend( ( IsUpper<MT5>::value )
2752  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2753  :( K ) );
2754  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2755 
2756  for( size_t k=kbegin; k<kend; ++k )
2757  {
2758  const size_t ibegin( ( IsLower<MT4>::value )
2759  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2760  :( 0UL ) );
2761  const size_t iend( ( IsUpper<MT4>::value )
2762  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
2763  :( M ) );
2764  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2765 
2766  const size_t inum( iend - ibegin );
2767  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2768 
2769  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2770  C(i ,j) -= A(i ,k) * B(k,j);
2771  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
2772  }
2773  if( ipos < iend ) {
2774  C(ipos,j) -= A(ipos,k) * B(k,j);
2775  }
2776  }
2777  }
2778  }
2780  //**********************************************************************************************
2781 
2782  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
// Default subtraction assignment kernel for a non-diagonal left operand and a diagonal right
// operand.
//
// Only the diagonal element B(j,j) of the right operand is read: for every column j the term
// A(i,j) * B(j,j) is subtracted from C(i,j) over the active row range, which is derived from
// the lower/upper traits of A.
//
// C: target matrix (updated in place), A: left-hand side operand, B: right-hand side operand.
2796  template< typename MT3 // Type of the left-hand side target matrix
2797  , typename MT4 // Type of the left-hand side matrix operand
2798  , typename MT5 > // Type of the right-hand side matrix operand
2799  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2800  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2801  {
2803 
2804  const size_t M( A.rows() );
2805  const size_t N( B.columns() );
2806 
2807  for( size_t j=0UL; j<N; ++j )
2808  {
      // Row range of column j: starts at j (j+1 if strictly lower) for a lower A, ends at j+1
      // (j if strictly upper) for an upper A, otherwise spans all M rows.
2809  const size_t ibegin( ( IsLower<MT4>::value )
2810  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2811  :( 0UL ) );
2812  const size_t iend( ( IsUpper<MT4>::value )
2813  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
2814  :( M ) );
2815  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2816 
      // ipos ends the part of the range processed two rows at a time: the range length is
      // rounded down to an even number by masking out the lowest bit.
2817  const size_t inum( iend - ibegin );
2818  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2819 
2820  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2821  C(i ,j) -= A(i ,j) * B(j,j);
2822  C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
2823  }
      // Trailing single row of an odd-length range.
2824  if( ipos < iend ) {
2825  C(ipos,j) -= A(ipos,j) * B(j,j);
2826  }
2827  }
2828  }
2830  //**********************************************************************************************
2831 
2832  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
// Default subtraction assignment kernel for a diagonal left operand and a non-diagonal right
// operand.
//
// Only the diagonal element A(i,i) of the left operand is read: for every column j the term
// A(i,i) * B(i,j) is subtracted from C(i,j) over the active row range, which is derived from
// the lower/upper traits of B.
//
// C: target matrix (updated in place), A: left-hand side operand, B: right-hand side operand.
2846  template< typename MT3 // Type of the left-hand side target matrix
2847  , typename MT4 // Type of the left-hand side matrix operand
2848  , typename MT5 > // Type of the right-hand side matrix operand
2849  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2850  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2851  {
2853 
2854  const size_t M( A.rows() );
2855  const size_t N( B.columns() );
2856 
2857  for( size_t j=0UL; j<N; ++j )
2858  {
      // Row range of column j: starts at j (j+1 if strictly lower) for a lower B, ends at j+1
      // (j if strictly upper) for an upper B, otherwise spans all M rows.
2859  const size_t ibegin( ( IsLower<MT5>::value )
2860  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2861  :( 0UL ) );
2862  const size_t iend( ( IsUpper<MT5>::value )
2863  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2864  :( M ) );
2865  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2866 
      // ipos ends the part of the range processed two rows at a time: the range length is
      // rounded down to an even number by masking out the lowest bit.
2867  const size_t inum( iend - ibegin );
2868  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2869 
2870  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2871  C(i ,j) -= A(i ,i ) * B(i ,j);
2872  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
2873  }
      // Trailing single row of an odd-length range.
2874  if( ipos < iend ) {
2875  C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
2876  }
2877  }
2878  }
2880  //**********************************************************************************************
2881 
2882  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default subtraction assignment kernel for two diagonal operands.
//
// Only diagonal elements are touched: C(i,i) -= A(i,i) * B(i,i) for every row i of A.
//
// C: target matrix (updated in place), A: left-hand side operand, B: right-hand side operand.
2896  template< typename MT3 // Type of the left-hand side target matrix
2897  , typename MT4 // Type of the left-hand side matrix operand
2898  , typename MT5 > // Type of the right-hand side matrix operand
2899  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2900  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2901  {
2903 
      // A.rows() is used as the length of the diagonal.
2904  for( size_t i=0UL; i<A.rows(); ++i ) {
2905  C(i,i) -= A(i,i) * B(i,i);
2906  }
2907  }
2909  //**********************************************************************************************
2910 
2911  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix subtraction assignment kernel, non-vectorized fallback.
//
// Guarded by DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >, i.e. the counterpart of the
// EnableIf-guarded vectorized overloads of the same name. It simply forwards to
// selectDefaultSubAssignKernel().
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
2925  template< typename MT3 // Type of the left-hand side target matrix
2926  , typename MT4 // Type of the left-hand side matrix operand
2927  , typename MT5 > // Type of the right-hand side matrix operand
2928  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2929  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2930  {
      // Delegate unchanged to the default (scalar) subtraction assignment kernel.
2931  selectDefaultSubAssignKernel( C, A, B );
2932  }
2934  //**********************************************************************************************
2935 
2936  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Small-matrix vectorized subtraction assignment kernel for row-major targets
// (DenseMatrix<MT3,false>).
//
// No arithmetic is done here. Exactly one operand is copied into a temporary of its
// OppositeType and the subtraction assignment is re-dispatched on the product of the
// temporary and the untouched operand.
// NOTE(review): OppositeType is presumably the same matrix with the opposite storage order -
// confirm.
//
// Which operand is converted:
//   A resizable, B not          -> B
//   A not resizable, B is       -> A
//   otherwise                   -> B if it has at most as many elements as A, else A
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
2951  template< typename MT3 // Type of the left-hand side target matrix
2952  , typename MT4 // Type of the left-hand side matrix operand
2953  , typename MT5 > // Type of the right-hand side matrix operand
2954  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2955  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2956  {
2961 
2962  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2963  const typename MT5::OppositeType tmp( serial( B ) );
2964  subAssign( ~C, A * tmp );
2965  }
2966  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2967  const typename MT4::OppositeType tmp( serial( A ) );
2968  subAssign( ~C, tmp * B );
2969  }
      // Both or neither resizable: convert the operand with fewer elements (B on a tie).
2970  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2971  const typename MT5::OppositeType tmp( serial( B ) );
2972  subAssign( ~C, A * tmp );
2973  }
2974  else {
2975  const typename MT4::OppositeType tmp( serial( A ) );
2976  subAssign( ~C, tmp * B );
2977  }
2978  }
2980  //**********************************************************************************************
2981 
2982  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
2997  template< typename MT3 // Type of the left-hand side target matrix
2998  , typename MT4 // Type of the left-hand side matrix operand
2999  , typename MT5 > // Type of the right-hand side matrix operand
3000  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3001  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3002  {
3003  typedef IntrinsicTrait<ElementType> IT;
3004 
3005  const size_t M( A.rows() );
3006  const size_t N( B.columns() );
3007  const size_t K( A.columns() );
3008 
3009  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3010 
3011  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
3012  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
3013 
3014  size_t i( 0UL );
3015 
3016  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
3017  for( size_t j=0UL; j<N; ++j )
3018  {
3019  const size_t kbegin( ( IsLower<MT5>::value )
3020  ?( ( IsUpper<MT4>::value )
3021  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3022  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3023  :( IsUpper<MT4>::value ? i : 0UL ) );
3024  const size_t kend( ( IsUpper<MT5>::value )
3025  ?( ( IsLower<MT4>::value )
3026  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3027  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3028  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
3029 
3030  IntrinsicType xmm1( (~C).load(i ,j) );
3031  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
3032  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
3033  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
3034  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
3035  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
3036  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
3037  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
3038 
3039  for( size_t k=kbegin; k<kend; ++k ) {
3040  const IntrinsicType b1( set( B(k,j) ) );
3041  xmm1 = xmm1 - A.load(i ,k) * b1;
3042  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
3043  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
3044  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
3045  xmm5 = xmm5 - A.load(i+IT::size*4UL,k) * b1;
3046  xmm6 = xmm6 - A.load(i+IT::size*5UL,k) * b1;
3047  xmm7 = xmm7 - A.load(i+IT::size*6UL,k) * b1;
3048  xmm8 = xmm8 - A.load(i+IT::size*7UL,k) * b1;
3049  }
3050 
3051  (~C).store( i , j, xmm1 );
3052  (~C).store( i+IT::size , j, xmm2 );
3053  (~C).store( i+IT::size*2UL, j, xmm3 );
3054  (~C).store( i+IT::size*3UL, j, xmm4 );
3055  (~C).store( i+IT::size*4UL, j, xmm5 );
3056  (~C).store( i+IT::size*5UL, j, xmm6 );
3057  (~C).store( i+IT::size*6UL, j, xmm7 );
3058  (~C).store( i+IT::size*7UL, j, xmm8 );
3059  }
3060  }
3061 
3062  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
3063  {
3064  size_t j( 0UL );
3065 
3066  for( ; (j+2UL) <= N; j+=2UL )
3067  {
3068  const size_t kbegin( ( IsLower<MT5>::value )
3069  ?( ( IsUpper<MT4>::value )
3070  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3071  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3072  :( IsUpper<MT4>::value ? i : 0UL ) );
3073  const size_t kend( ( IsUpper<MT5>::value )
3074  ?( ( IsLower<MT4>::value )
3075  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3076  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3077  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
3078 
3079  IntrinsicType xmm1( (~C).load(i ,j ) );
3080  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
3081  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
3082  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
3083  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3084  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
3085  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
3086  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
3087 
3088  for( size_t k=kbegin; k<kend; ++k ) {
3089  const IntrinsicType a1( A.load(i ,k) );
3090  const IntrinsicType a2( A.load(i+IT::size ,k) );
3091  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
3092  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
3093  const IntrinsicType b1( set( B(k,j ) ) );
3094  const IntrinsicType b2( set( B(k,j+1UL) ) );
3095  xmm1 = xmm1 - a1 * b1;
3096  xmm2 = xmm2 - a2 * b1;
3097  xmm3 = xmm3 - a3 * b1;
3098  xmm4 = xmm4 - a4 * b1;
3099  xmm5 = xmm5 - a1 * b2;
3100  xmm6 = xmm6 - a2 * b2;
3101  xmm7 = xmm7 - a3 * b2;
3102  xmm8 = xmm8 - a4 * b2;
3103  }
3104 
3105  (~C).store( i , j , xmm1 );
3106  (~C).store( i+IT::size , j , xmm2 );
3107  (~C).store( i+IT::size*2UL, j , xmm3 );
3108  (~C).store( i+IT::size*3UL, j , xmm4 );
3109  (~C).store( i , j+1UL, xmm5 );
3110  (~C).store( i+IT::size , j+1UL, xmm6 );
3111  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
3112  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
3113  }
3114 
3115  if( j < N )
3116  {
3117  const size_t kbegin( ( IsLower<MT5>::value )
3118  ?( ( IsUpper<MT4>::value )
3119  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3120  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3121  :( IsUpper<MT4>::value ? i : 0UL ) );
3122  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
3123 
3124  IntrinsicType xmm1( (~C).load(i ,j) );
3125  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
3126  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
3127  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
3128 
3129  for( size_t k=kbegin; k<kend; ++k ) {
3130  const IntrinsicType b1( set( B(k,j) ) );
3131  xmm1 = xmm1 - A.load(i ,k) * b1;
3132  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
3133  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
3134  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
3135  }
3136 
3137  (~C).store( i , j, xmm1 );
3138  (~C).store( i+IT::size , j, xmm2 );
3139  (~C).store( i+IT::size*2UL, j, xmm3 );
3140  (~C).store( i+IT::size*3UL, j, xmm4 );
3141  }
3142  }
3143 
3144  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
3145  {
3146  size_t j( 0UL );
3147 
3148  for( ; (j+2UL) <= N; j+=2UL )
3149  {
3150  const size_t kbegin( ( IsLower<MT5>::value )
3151  ?( ( IsUpper<MT4>::value )
3152  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3153  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3154  :( IsUpper<MT4>::value ? i : 0UL ) );
3155  const size_t kend( ( IsUpper<MT5>::value )
3156  ?( ( IsLower<MT4>::value )
3157  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3158  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3159  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
3160 
3161  IntrinsicType xmm1( (~C).load(i ,j ) );
3162  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
3163  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3164  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
3165 
3166  for( size_t k=kbegin; k<kend; ++k ) {
3167  const IntrinsicType a1( A.load(i ,k) );
3168  const IntrinsicType a2( A.load(i+IT::size,k) );
3169  const IntrinsicType b1( set( B(k,j ) ) );
3170  const IntrinsicType b2( set( B(k,j+1UL) ) );
3171  xmm1 = xmm1 - a1 * b1;
3172  xmm2 = xmm2 - a2 * b1;
3173  xmm3 = xmm3 - a1 * b2;
3174  xmm4 = xmm4 - a2 * b2;
3175  }
3176 
3177  (~C).store( i , j , xmm1 );
3178  (~C).store( i+IT::size, j , xmm2 );
3179  (~C).store( i , j+1UL, xmm3 );
3180  (~C).store( i+IT::size, j+1UL, xmm4 );
3181  }
3182 
3183  if( j < N )
3184  {
3185  const size_t kbegin( ( IsLower<MT5>::value )
3186  ?( ( IsUpper<MT4>::value )
3187  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3188  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3189  :( IsUpper<MT4>::value ? i : 0UL ) );
3190  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
3191 
3192  IntrinsicType xmm1( (~C).load(i ,j) );
3193  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
3194 
3195  for( size_t k=kbegin; k<kend; ++k ) {
3196  const IntrinsicType b1( set( B(k,j) ) );
3197  xmm1 = xmm1 - A.load(i ,k) * b1;
3198  xmm2 = xmm2 - A.load(i+IT::size,k) * b1;
3199  }
3200 
3201  (~C).store( i , j, xmm1 );
3202  (~C).store( i+IT::size, j, xmm2 );
3203  }
3204  }
3205 
3206  for( ; i<ipos; i+=IT::size )
3207  {
3208  size_t j( 0UL );
3209 
3210  for( ; (j+2UL) <= N; j+=2UL )
3211  {
3212  const size_t kbegin( ( IsLower<MT5>::value )
3213  ?( ( IsUpper<MT4>::value )
3214  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3215  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3216  :( IsUpper<MT4>::value ? i : 0UL ) );
3217  const size_t kend( ( IsUpper<MT5>::value )
3218  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3219  :( K ) );
3220 
3221  IntrinsicType xmm1( (~C).load(i,j ) );
3222  IntrinsicType xmm2( (~C).load(i,j+1UL) );
3223 
3224  for( size_t k=kbegin; k<kend; ++k ) {
3225  const IntrinsicType a1( A.load(i,k) );
3226  xmm1 = xmm1 - a1 * set( B(k,j ) );
3227  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
3228  }
3229 
3230  (~C).store( i, j , xmm1 );
3231  (~C).store( i, j+1UL, xmm2 );
3232  }
3233 
3234  if( j < N )
3235  {
3236  const size_t kbegin( ( IsLower<MT5>::value )
3237  ?( ( IsUpper<MT4>::value )
3238  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3239  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3240  :( IsUpper<MT4>::value ? i : 0UL ) );
3241 
3242  IntrinsicType xmm1( (~C).load(i,j) );
3243 
3244  for( size_t k=kbegin; k<K; ++k ) {
3245  xmm1 = xmm1 - A.load(i,k) * set( B(k,j) );
3246  }
3247 
3248  (~C).store( i, j, xmm1 );
3249  }
3250  }
3251 
3252  for( ; remainder && i<M; ++i )
3253  {
3254  size_t j( 0UL );
3255 
3256  for( ; (j+2UL) <= N; j+=2UL )
3257  {
3258  const size_t kbegin( ( IsLower<MT5>::value )
3259  ?( ( IsUpper<MT4>::value )
3260  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3261  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3262  :( IsUpper<MT4>::value ? i : 0UL ) );
3263  const size_t kend( ( IsUpper<MT5>::value )
3264  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3265  :( K ) );
3266 
3267  ElementType value1( (~C)(i,j ) );
3268  ElementType value2( (~C)(i,j+1UL) );
3269 
3270  for( size_t k=kbegin; k<kend; ++k ) {
3271  value1 -= A(i,k) * B(k,j );
3272  value2 -= A(i,k) * B(k,j+1UL);
3273  }
3274 
3275  (~C)(i,j ) = value1;
3276  (~C)(i,j+1UL) = value2;
3277  }
3278 
3279  if( j < N )
3280  {
3281  const size_t kbegin( ( IsLower<MT5>::value )
3282  ?( ( IsUpper<MT4>::value )
3283  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3284  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3285  :( IsUpper<MT4>::value ? i : 0UL ) );
3286 
3287  ElementType value( (~C)(i,j) );
3288 
3289  for( size_t k=kbegin; k<K; ++k ) {
3290  value -= A(i,k) * B(k,j);
3291  }
3292 
3293  (~C)(i,j) = value;
3294  }
3295  }
3296  }
3298  //**********************************************************************************************
3299 
3300  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix subtraction assignment, fallback overload.
// This overload participates only when UseVectorizedDefaultKernel<MT3,MT4,MT5> is false
// (DisableIf). It performs no work of its own and forwards all three arguments unchanged
// to selectDefaultSubAssignKernel().
//   C : target matrix that is updated
//   A : left-hand side operand of the product
//   B : right-hand side operand of the product
3314  template< typename MT3 // Type of the left-hand side target matrix
3315  , typename MT4 // Type of the left-hand side matrix operand
3316  , typename MT5 > // Type of the right-hand side matrix operand
3317  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3318  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3319  {
3320  selectDefaultSubAssignKernel( C, A, B );
3321  }
3323  //**********************************************************************************************
3324 
3325  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
// Large-matrix subtraction assignment for targets passed as DenseMatrix<MT3,false>
// (row-major according to the section heading above).
// Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true. There is no dedicated
// blocked implementation in this overload: the call is forwarded to
// selectSmallSubAssignKernel() with the unwrapped target (~C).
//   C : target matrix that is updated
//   A : left-hand side operand of the product
//   B : right-hand side operand of the product
3340  template< typename MT3 // Type of the left-hand side target matrix
3341  , typename MT4 // Type of the left-hand side matrix operand
3342  , typename MT5 > // Type of the right-hand side matrix operand
3343  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3344  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3345  {
3346  selectSmallSubAssignKernel( ~C, A, B );
3347  }
3349  //**********************************************************************************************
3350 
3351  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
3366  template< typename MT3 // Type of the left-hand side target matrix
3367  , typename MT4 // Type of the left-hand side matrix operand
3368  , typename MT5 > // Type of the right-hand side matrix operand
3369  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3370  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3371  {
3372  typedef IntrinsicTrait<ElementType> IT;
3373 
3374  const size_t M( A.rows() );
3375  const size_t N( B.columns() );
3376  const size_t K( A.columns() );
3377 
3378  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
3379 
3380  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
3381  {
3382  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
3383 
3384  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3385  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
3386 
3387  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
3388  {
3389  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
3390 
3391  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
3392  {
3393  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
3394 
3395  size_t i( ii );
3396 
3397  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
3398  {
3399  const size_t i1( i+IT::size );
3400  const size_t i2( i+IT::size*2UL );
3401  const size_t i3( i+IT::size*3UL );
3402 
3403  size_t j( jj );
3404 
3405  for( ; (j+2UL) <= jend; j+=2UL )
3406  {
3407  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3408  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3409  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
3410  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3411 
3412  IntrinsicType xmm1( (~C).load(i ,j ) );
3413  IntrinsicType xmm2( (~C).load(i1,j ) );
3414  IntrinsicType xmm3( (~C).load(i2,j ) );
3415  IntrinsicType xmm4( (~C).load(i3,j ) );
3416  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3417  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
3418  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
3419  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
3420 
3421  for( size_t k=kbegin; k<kend; ++k ) {
3422  const IntrinsicType a1( A.load(i ,k) );
3423  const IntrinsicType a2( A.load(i1,k) );
3424  const IntrinsicType a3( A.load(i2,k) );
3425  const IntrinsicType a4( A.load(i3,k) );
3426  const IntrinsicType b1( set( B(k,j ) ) );
3427  const IntrinsicType b2( set( B(k,j+1UL) ) );
3428  xmm1 = xmm1 - a1 * b1;
3429  xmm2 = xmm2 - a2 * b1;
3430  xmm3 = xmm3 - a3 * b1;
3431  xmm4 = xmm4 - a4 * b1;
3432  xmm5 = xmm5 - a1 * b2;
3433  xmm6 = xmm6 - a2 * b2;
3434  xmm7 = xmm7 - a3 * b2;
3435  xmm8 = xmm8 - a4 * b2;
3436  }
3437 
3438  (~C).store( i , j , xmm1 );
3439  (~C).store( i1, j , xmm2 );
3440  (~C).store( i2, j , xmm3 );
3441  (~C).store( i3, j , xmm4 );
3442  (~C).store( i , j+1UL, xmm5 );
3443  (~C).store( i1, j+1UL, xmm6 );
3444  (~C).store( i2, j+1UL, xmm7 );
3445  (~C).store( i3, j+1UL, xmm8 );
3446  }
3447 
3448  if( j < jend )
3449  {
3450  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3451  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3452  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
3453  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3454 
3455  IntrinsicType xmm1( (~C).load(i ,j) );
3456  IntrinsicType xmm2( (~C).load(i1,j) );
3457  IntrinsicType xmm3( (~C).load(i2,j) );
3458  IntrinsicType xmm4( (~C).load(i3,j) );
3459 
3460  for( size_t k=kbegin; k<kend; ++k ) {
3461  const IntrinsicType b1( set( B(k,j) ) );
3462  xmm1 = xmm1 - A.load(i ,k) * b1;
3463  xmm2 = xmm2 - A.load(i1,k) * b1;
3464  xmm3 = xmm3 - A.load(i2,k) * b1;
3465  xmm4 = xmm4 - A.load(i3,k) * b1;
3466  }
3467 
3468  (~C).store( i , j, xmm1 );
3469  (~C).store( i1, j, xmm2 );
3470  (~C).store( i2, j, xmm3 );
3471  (~C).store( i3, j, xmm4 );
3472  }
3473  }
3474 
3475  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
3476  {
3477  const size_t i1( i+IT::size );
3478 
3479  size_t j( jj );
3480 
3481  for( ; (j+4UL) <= jend; j+=4UL )
3482  {
3483  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3484  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3485  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
3486  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3487 
3488  IntrinsicType xmm1( (~C).load(i ,j ) );
3489  IntrinsicType xmm2( (~C).load(i1,j ) );
3490  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3491  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3492  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
3493  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
3494  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
3495  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
3496 
3497  for( size_t k=kbegin; k<kend; ++k ) {
3498  const IntrinsicType a1( A.load(i ,k) );
3499  const IntrinsicType a2( A.load(i1,k) );
3500  const IntrinsicType b1( set( B(k,j ) ) );
3501  const IntrinsicType b2( set( B(k,j+1UL) ) );
3502  const IntrinsicType b3( set( B(k,j+2UL) ) );
3503  const IntrinsicType b4( set( B(k,j+3UL) ) );
3504  xmm1 = xmm1 - a1 * b1;
3505  xmm2 = xmm2 - a2 * b1;
3506  xmm3 = xmm3 - a1 * b2;
3507  xmm4 = xmm4 - a2 * b2;
3508  xmm5 = xmm5 - a1 * b3;
3509  xmm6 = xmm6 - a2 * b3;
3510  xmm7 = xmm7 - a1 * b4;
3511  xmm8 = xmm8 - a2 * b4;
3512  }
3513 
3514  (~C).store( i , j , xmm1 );
3515  (~C).store( i1, j , xmm2 );
3516  (~C).store( i , j+1UL, xmm3 );
3517  (~C).store( i1, j+1UL, xmm4 );
3518  (~C).store( i , j+2UL, xmm5 );
3519  (~C).store( i1, j+2UL, xmm6 );
3520  (~C).store( i , j+3UL, xmm7 );
3521  (~C).store( i1, j+3UL, xmm8 );
3522  }
3523 
3524  for( ; (j+2UL) <= jend; j+=2UL )
3525  {
3526  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3527  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3528  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
3529  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3530 
3531  IntrinsicType xmm1( (~C).load(i ,j ) );
3532  IntrinsicType xmm2( (~C).load(i1,j ) );
3533  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3534  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3535 
3536  for( size_t k=kbegin; k<kend; ++k ) {
3537  const IntrinsicType a1( A.load(i ,k) );
3538  const IntrinsicType a2( A.load(i1,k) );
3539  const IntrinsicType b1( set( B(k,j ) ) );
3540  const IntrinsicType b2( set( B(k,j+1UL) ) );
3541  xmm1 = xmm1 - a1 * b1;
3542  xmm2 = xmm2 - a2 * b1;
3543  xmm3 = xmm3 - a1 * b2;
3544  xmm4 = xmm4 - a2 * b2;
3545  }
3546 
3547  (~C).store( i , j , xmm1 );
3548  (~C).store( i1, j , xmm2 );
3549  (~C).store( i , j+1UL, xmm3 );
3550  (~C).store( i1, j+1UL, xmm4 );
3551  }
3552 
3553  if( j < jend )
3554  {
3555  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3556  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3557  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
3558  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3559 
3560  IntrinsicType xmm1( (~C).load(i ,j) );
3561  IntrinsicType xmm2( (~C).load(i1,j) );
3562 
3563  for( size_t k=kbegin; k<kend; ++k ) {
3564  const IntrinsicType b1( set( B(k,j) ) );
3565  xmm1 = xmm1 - A.load(i ,k) * b1;
3566  xmm2 = xmm2 - A.load(i1,k) * b1;
3567  }
3568 
3569  (~C).store( i , j, xmm1 );
3570  (~C).store( i1, j, xmm2 );
3571  }
3572  }
3573 
3574  for( ; i<ipos; i+=IT::size )
3575  {
3576  for( size_t j=jj; j<jend; ++j )
3577  {
3578  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3579  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3580  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
3581  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3582 
3583  IntrinsicType xmm1( (~C).load(i,j) );
3584 
3585  for( size_t k=kbegin; k<kend; ++k ) {
3586  const IntrinsicType b1( set( B(k,j) ) );
3587  xmm1 = xmm1 - A.load(i,k) * b1;
3588  }
3589 
3590  (~C).store( i, j, xmm1 );
3591  }
3592  }
3593 
3594  for( ; remainder && i<iend; ++i )
3595  {
3596  for( size_t j=jj; j<jend; ++j )
3597  {
3598  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3599  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3600  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
3601  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3602 
3603  ElementType value( (~C)(i,j) );
3604 
3605  for( size_t k=kbegin; k<kend; ++k ) {
3606  value -= A(i,k) * B(k,j);
3607  }
3608 
3609  (~C)(i,j) = value;
3610  }
3611  }
3612  }
3613  }
3614  }
3615  }
3617  //**********************************************************************************************
3618 
3619  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS-based subtraction assignment, fallback overload.
// This overload participates only when UseBlasKernel<MT3,MT4,MT5> is false (DisableIf).
// It makes no BLAS call; the three arguments are forwarded unchanged to
// selectLargeSubAssignKernel().
//   C : target matrix that is updated
//   A : left-hand side operand of the product
//   B : right-hand side operand of the product
3633  template< typename MT3 // Type of the left-hand side target matrix
3634  , typename MT4 // Type of the left-hand side matrix operand
3635  , typename MT5 > // Type of the right-hand side matrix operand
3636  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3637  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3638  {
3639  selectLargeSubAssignKernel( C, A, B );
3640  }
3642  //**********************************************************************************************
3643 
3644  //**BLAS-based subtraction assignment to dense matrices*****************************************
3645 #if BLAZE_BLAS_MODE
3646 
// BLAS-based subtraction assignment; compiled only when BLAZE_BLAS_MODE is non-zero and
// enabled only when UseBlasKernel<MT3,MT4,MT5> is true.
//   C : target matrix that is updated
//   A : left-hand side operand of the product
//   B : right-hand side operand of the product
// Three compile-time cases are distinguished:
//   * A is triangular: B is copied (serially) into a temporary of type MT3::ResultType,
//     trmm() is applied to it with A on the left, and the temporary is subtracted from C.
//   * otherwise, B is triangular: same scheme with the roles swapped (copy of A, trmm()
//     with B on the right).
//   * otherwise: a single gemm() call with the scalar factors ET(-1) and ET(1).
// NOTE(review): presumably trmm() overwrites 'tmp' with the triangular product and gemm()
// computes C = ET(-1)*A*B + ET(1)*C -- confirm against the wrappers in blaze/math/blas.
3659  template< typename MT3 // Type of the left-hand side target matrix
3660  , typename MT4 // Type of the left-hand side matrix operand
3661  , typename MT5 > // Type of the right-hand side matrix operand
3662  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3663  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3664  {
3665  typedef typename MT3::ElementType ET;
3666 
3667  if( IsTriangular<MT4>::value ) {
3668  typename MT3::ResultType tmp( serial( B ) );
3669  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3670  subAssign( C, tmp );
3671  }
3672  else if( IsTriangular<MT5>::value ) {
3673  typename MT3::ResultType tmp( serial( A ) );
3674  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3675  subAssign( C, tmp );
3676  }
3677  else {
3678  gemm( C, A, B, ET(-1), ET(1) );
3679  }
3680  }
3682 #endif
3683  //**********************************************************************************************
3684 
3685  //**Restructuring subtraction assignment to row-major matrices**********************************
// Restructuring subtraction assignment for targets passed as Matrix<MT,false>.
// Enabled only when CanExploitSymmetry<MT,MT1,MT2> is true.
//   lhs : target matrix
//   rhs : the multiplication expression to be subtracted
// Instead of evaluating the product directly, the expression is rebuilt with the
// symmetric operand(s) wrapped in trans() and handed to subAssign() again:
//   both operands symmetric   -> trans(lhs_) * trans(rhs_)
//   only the left one (MT1)   -> trans(lhs_) * rhs_
//   otherwise                 -> lhs_ * trans(rhs_)
// The final branch transposes the right operand without testing it; this relies on the
// enabling condition to guarantee that MT2 is symmetric whenever MT1 is not
// (NOTE(review): CanExploitSymmetry of this class is defined outside this excerpt -- confirm).
3701  template< typename MT > // Type of the target matrix
3702  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3703  subAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
3704  {
3706 
3708 
3709  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3710  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3711 
3712  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3713  subAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3714  else if( IsSymmetric<MT1>::value )
3715  subAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3716  else
3717  subAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3718  }
3720  //**********************************************************************************************
3721 
3722  //**Subtraction assignment to sparse matrices***************************************************
3723  // No special implementation for the subtraction assignment to sparse matrices.
3724  //**********************************************************************************************
3725 
3726  //**Multiplication assignment to dense matrices*************************************************
3727  // No special implementation for the multiplication assignment to dense matrices.
3728  //**********************************************************************************************
3729 
3730  //**Multiplication assignment to sparse matrices************************************************
3731  // No special implementation for the multiplication assignment to sparse matrices.
3732  //**********************************************************************************************
3733 
3734  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the multiplication expression to a dense matrix.
// Enabled only when IsEvaluationRequired<MT,MT1,MT2> is true.
//   lhs : target dense matrix (either storage order)
//   rhs : the multiplication expression to be assigned
// Behaviour:
//   * a target with zero rows or zero columns is left untouched;
//   * if the left operand has zero columns (empty inner dimension) the target is reset();
//   * otherwise both operands are evaluated into the local objects A and B (types LT and
//     RT, which are defined outside this excerpt) and the product A * B is passed on to
//     smpAssign().
3750  template< typename MT // Type of the target dense matrix
3751  , bool SO > // Storage order of the target dense matrix
3752  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3753  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3754  {
3756 
3757  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3758  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3759 
3760  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
3761  return;
3762  }
3763  else if( rhs.lhs_.columns() == 0UL ) {
3764  reset( ~lhs );
3765  return;
3766  }
3767 
3768  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3769  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3770 
3771  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3772  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3773  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3774  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3775  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3776  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3777 
3778  smpAssign( ~lhs, A * B );
3779  }
3781  //**********************************************************************************************
3782 
3783  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the multiplication expression to a sparse matrix.
// Enabled only when IsEvaluationRequired<MT,MT1,MT2> is true.
//   lhs : target sparse matrix (either storage order)
//   rhs : the multiplication expression to be assigned
// The expression is first evaluated into a temporary of type TmpType, which is then passed
// to smpAssign(). TmpType is chosen between ResultType and OppositeType based on SO
// (presumably ResultType when SO is true, so that the temporary matches the storage order
// of the target -- confirm against SelectType).
3799  template< typename MT // Type of the target sparse matrix
3800  , bool SO > // Storage order of the target sparse matrix
3801  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3802  smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3803  {
3805 
3806  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3807 
3814 
3815  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3816  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3817 
3818  const TmpType tmp( rhs );
3819  smpAssign( ~lhs, tmp );
3820  }
3822  //**********************************************************************************************
3823 
3824  //**Restructuring SMP assignment to row-major matrices******************************************
// Restructuring SMP assignment for targets passed as Matrix<MT,false>.
// Enabled only when CanExploitSymmetry<MT,MT1,MT2> is true.
//   lhs : target matrix
//   rhs : the multiplication expression to be assigned
// The product is rebuilt with the symmetric operand(s) wrapped in trans() and passed to
// smpAssign() again:
//   both operands symmetric   -> trans(lhs_) * trans(rhs_)
//   only the left one (MT1)   -> trans(lhs_) * rhs_
//   otherwise                 -> lhs_ * trans(rhs_)
3839  template< typename MT > // Type of the target matrix
3840  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3841  smpAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
3842  {
3844 
3846 
3847  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3848  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3849 
3850  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3851  smpAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3852  else if( IsSymmetric<MT1>::value )
3853  smpAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3854  else
3855  smpAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3856  }
3858  //**********************************************************************************************
3859 
3860  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the multiplication expression to a dense matrix.
// Enabled only when IsEvaluationRequired<MT,MT1,MT2> is true.
//   lhs : target dense matrix (either storage order)
//   rhs : the multiplication expression to be added
// Nothing is done when the target has zero rows or columns or when the left operand has
// zero columns (empty inner dimension). Otherwise both operands are evaluated into the
// local objects A and B (types LT and RT, defined outside this excerpt) and the product
// A * B is passed on to smpAddAssign().
3876  template< typename MT // Type of the target dense matrix
3877  , bool SO > // Storage order of the target dense matrix
3878  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3879  smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3880  {
3882 
3883  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3884  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3885 
3886  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3887  return;
3888  }
3889 
3890  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3891  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3892 
3893  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3894  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3895  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3896  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3897  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3898  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3899 
3900  smpAddAssign( ~lhs, A * B );
3901  }
3903  //**********************************************************************************************
3904 
3905  //**Restructuring SMP addition assignment to row-major matrices*********************************
// Restructuring SMP addition assignment for targets passed as Matrix<MT,false>.
// Enabled only when CanExploitSymmetry<MT,MT1,MT2> is true.
//   lhs : target matrix
//   rhs : the multiplication expression to be added
// The product is rebuilt with the symmetric operand(s) wrapped in trans() and passed to
// smpAddAssign() again:
//   both operands symmetric   -> trans(lhs_) * trans(rhs_)
//   only the left one (MT1)   -> trans(lhs_) * rhs_
//   otherwise                 -> lhs_ * trans(rhs_)
3921  template< typename MT > // Type of the target matrix
3922  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3923  smpAddAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
3924  {
3926 
3928 
3929  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3930  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3931 
3932  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3933  smpAddAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3934  else if( IsSymmetric<MT1>::value )
3935  smpAddAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3936  else
3937  smpAddAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3938  }
3940  //**********************************************************************************************
3941 
3942  //**SMP addition assignment to sparse matrices**************************************************
3943  // No special implementation for the SMP addition assignment to sparse matrices.
3944  //**********************************************************************************************
3945 
3946  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the multiplication expression to a dense matrix.
// Enabled only when IsEvaluationRequired<MT,MT1,MT2> is true.
//   lhs : target dense matrix (either storage order)
//   rhs : the multiplication expression to be subtracted
// Nothing is done when the target has zero rows or columns or when the left operand has
// zero columns (empty inner dimension). Otherwise both operands are evaluated into the
// local objects A and B (types LT and RT, defined outside this excerpt) and the product
// A * B is passed on to smpSubAssign().
3962  template< typename MT // Type of the target dense matrix
3963  , bool SO > // Storage order of the target dense matrix
3964  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3965  smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3966  {
3968 
3969  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3970  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3971 
3972  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3973  return;
3974  }
3975 
3976  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3977  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3978 
3979  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3980  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3981  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3982  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3983  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3984  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3985 
3986  smpSubAssign( ~lhs, A * B );
3987  }
3989  //**********************************************************************************************
3990 
3991  //**Restructuring SMP subtraction assignment to row-major matrices******************************
// Restructuring SMP subtraction assignment for targets passed as Matrix<MT,false>.
// Enabled only when CanExploitSymmetry<MT,MT1,MT2> is true.
//   lhs : target matrix
//   rhs : the multiplication expression to be subtracted
// The product is rebuilt with the symmetric operand(s) wrapped in trans() and passed to
// smpSubAssign() again:
//   both operands symmetric   -> trans(lhs_) * trans(rhs_)
//   only the left one (MT1)   -> trans(lhs_) * rhs_
//   otherwise                 -> lhs_ * trans(rhs_)
4007  template< typename MT > // Type of the target matrix
4008  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4009  smpSubAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4010  {
4012 
4014 
4015  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4016  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4017 
4018  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
4019  smpSubAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
4020  else if( IsSymmetric<MT1>::value )
4021  smpSubAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
4022  else
4023  smpSubAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
4024  }
4026  //**********************************************************************************************
4027 
4028  //**SMP subtraction assignment to sparse matrices***********************************************
4029  // No special implementation for the SMP subtraction assignment to sparse matrices.
4030  //**********************************************************************************************
4031 
4032  //**SMP multiplication assignment to dense matrices*********************************************
4033  // No special implementation for the SMP multiplication assignment to dense matrices.
4034  //**********************************************************************************************
4035 
4036  //**SMP multiplication assignment to sparse matrices********************************************
4037  // No special implementation for the SMP multiplication assignment to sparse matrices.
4038  //**********************************************************************************************
4039 
4040  //**Compile time checks*************************************************************************
4048  //**********************************************************************************************
4049 };
4050 //*************************************************************************************************
4051 
4052 
4053 
4054 
4055 //=================================================================================================
4056 //
4057 // DMATSCALARMULTEXPR SPECIALIZATION
4058 //
4059 //=================================================================================================
4060 
4061 //*************************************************************************************************
4069 template< typename MT1 // Type of the left-hand side dense matrix
4070  , typename MT2 // Type of the right-hand side dense matrix
4071  , typename ST > // Type of the right-hand side scalar value
4072 class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >
4073  : public DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, true >
4074  , private MatScalarMultExpr
4075  , private Computation
4076 {
4077  private:
4078  //**Type definitions****************************************************************************
4079  typedef TDMatTDMatMultExpr<MT1,MT2> MMM;
4080  typedef typename MMM::ResultType RES;
4081  typedef typename MT1::ResultType RT1;
4082  typedef typename MT2::ResultType RT2;
4083  typedef typename RT1::ElementType ET1;
4084  typedef typename RT2::ElementType ET2;
4085  typedef typename MT1::CompositeType CT1;
4086  typedef typename MT2::CompositeType CT2;
4087  //**********************************************************************************************
4088 
4089  //**********************************************************************************************
4091  enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
4092  //**********************************************************************************************
4093 
4094  //**********************************************************************************************
4096  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
4097  //**********************************************************************************************
4098 
4099  //**********************************************************************************************
4101 
// Compile-time helper: 'value' is non-zero when the target type T1 is a row-major matrix
// and at least one of the two operand types T2 / T3 is symmetric.
4106  template< typename T1, typename T2, typename T3 >
4107  struct CanExploitSymmetry {
4108  enum { value = IsRowMajorMatrix<T1>::value &&
4109  ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
4110  };
4111  //**********************************************************************************************
4112 
4113  //**********************************************************************************************
4115 
// Compile-time helper: 'value' is non-zero when at least one operand has to be evaluated
// (evaluateLeft or evaluateRight is set) and the symmetry-based restructuring described by
// CanExploitSymmetry<T1,T2,T3> does not apply.
4118  template< typename T1, typename T2, typename T3 >
4119  struct IsEvaluationRequired {
4120  enum { value = ( evaluateLeft || evaluateRight ) &&
4121  !CanExploitSymmetry<T1,T2,T3>::value };
4122  };
4123  //**********************************************************************************************
4124 
4125  //**********************************************************************************************
4127 
// Compile-time helper deciding whether a BLAS kernel may be used for the types
// T1 (target), T2 and T3 (operands) and T4 (presumably the scalar type -- confirm at the
// call sites, which are outside this excerpt). 'value' is non-zero only if all of the
// following hold:
//   * BLAZE_BLAS_MODE is enabled;
//   * T1 offers mutable and T2/T3 offer constant data access;
//   * neither T2 nor T3 is diagonal;
//   * all three matrix types are vectorizable;
//   * all three element types are BLAS compatible and identical to each other;
//   * it is not the case that T1's element type is builtin while T4 is complex.
4129  template< typename T1, typename T2, typename T3, typename T4 >
4130  struct UseBlasKernel {
4131  enum { value = BLAZE_BLAS_MODE &&
4132  HasMutableDataAccess<T1>::value &&
4133  HasConstDataAccess<T2>::value &&
4134  HasConstDataAccess<T3>::value &&
4135  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4136  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4137  IsBlasCompatible<typename T1::ElementType>::value &&
4138  IsBlasCompatible<typename T2::ElementType>::value &&
4139  IsBlasCompatible<typename T3::ElementType>::value &&
4140  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
4141  IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
4142  !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
4143  };
4144  //**********************************************************************************************
4145 
4146  //**********************************************************************************************
4148 
// Compile-time helper deciding whether the vectorized default kernels may be used for the
// types T1 (target), T2 and T3 (operands) and T4 (presumably the scalar type -- confirm at
// the call sites, which are outside this excerpt). 'value' is non-zero only if all of the
// following hold:
//   * useOptimizedKernels is set;
//   * T2 is not diagonal;
//   * all three matrix types are vectorizable;
//   * the element types of T1, T2 and T3 are identical and equal to T4;
//   * IntrinsicTrait of that element type reports addition, subtraction and multiplication.
4150  template< typename T1, typename T2, typename T3, typename T4 >
4151  struct UseVectorizedDefaultKernel {
4152  enum { value = useOptimizedKernels &&
4153  !IsDiagonal<T2>::value &&
4154  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4155  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
4156  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
4157  IsSame<typename T1::ElementType,T4>::value &&
4158  IntrinsicTrait<typename T1::ElementType>::addition &&
4159  IntrinsicTrait<typename T1::ElementType>::subtraction &&
4160  IntrinsicTrait<typename T1::ElementType>::multiplication };
4161  };
4162  //**********************************************************************************************
4163 
4164  public:
4165  //**Type definitions****************************************************************************
// Public type aliases of the scaled matrix/matrix multiplication expression.
// This          : type of this expression specialization.
// ResultType    : MultTrait<RES,ST>::Type, i.e. the result of scaling RES by ST.
// OppositeType / TransposeType / ElementType : taken from ResultType.
// IntrinsicType : IntrinsicTrait<ElementType>::Type, used by the vectorized kernels.
// ReturnType    : value returned by operator() and at() (const ElementType, by value).
// CompositeType : const ResultType.
4166  typedef DMatScalarMultExpr<MMM,ST,true> This;
4167  typedef typename MultTrait<RES,ST>::Type ResultType;
4168  typedef typename ResultType::OppositeType OppositeType;
4169  typedef typename ResultType::TransposeType TransposeType;
4170  typedef typename ResultType::ElementType ElementType;
4171  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType;
4172  typedef const ElementType ReturnType;
4173  typedef const ResultType CompositeType;
4174 
      // Type of the left-hand side operand: the wrapped matrix/matrix multiplication expression.
4176  typedef const TDMatTDMatMultExpr<MT1,MT2> LeftOperand;
4177 
      // Type of the right-hand side operand: the scalar.
4179  typedef ST RightOperand;
4180 
      // Type used for the left matrix operand inside assign(): const RT1 if evaluateLeft is set, CT1 otherwise.
4182  typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type LT;
4183 
      // Type used for the right matrix operand inside assign(): const RT2 if evaluateRight is set, CT2 otherwise.
4185  typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type RT;
4186  //**********************************************************************************************
4187 
4188  //**Compilation flags***************************************************************************
// Compile-time flag 'vectorizable': set when MT1 is not diagonal, both MT1 and MT2 are
// vectorizable, ET1 equals both ET2 and the scalar type ST, and IntrinsicTrait<ET1> reports
// addition and multiplication.
4190  enum { vectorizable = !IsDiagonal<MT1>::value &&
4191  MT1::vectorizable && MT2::vectorizable &&
4192  IsSame<ET1,ET2>::value &&
4193  IsSame<ET1,ST>::value &&
4194  IntrinsicTrait<ET1>::addition &&
4195  IntrinsicTrait<ET1>::multiplication };
4196 
      // Compile-time flag 'smpAssignable': set only when neither operand needs evaluation
      // (evaluateLeft / evaluateRight both false) and both MT1 and MT2 are SMP-assignable.
4198  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4199  !evaluateRight && MT2::smpAssignable };
4200  //**********************************************************************************************
4201 
4202  //**Constructor*********************************************************************************
// Constructor: stores the matrix/matrix multiplication expression and the scalar it is scaled by.
//  matrix : left-hand side multiplication expression
//  scalar : right-hand side scalar
4208  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4209  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4210  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4211  {}
4212  //**********************************************************************************************
4213 
4214  //**Access operator*****************************************************************************
// Element access: returns matrix_(i,j) multiplied by the scalar.
//  i : row index, must be smaller than matrix_.rows()
//  j : column index, must be smaller than matrix_.columns()
// The indices are only verified through BLAZE_INTERNAL_ASSERT; use at() for a checked access.
4221  inline ReturnType operator()( size_t i, size_t j ) const {
4222  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4223  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4224  return matrix_(i,j) * scalar_;
4225  }
4226  //**********************************************************************************************
4227 
4228  //**At function*********************************************************************************
// Checked element access: validates both indices and then forwards to operator().
//  i : row index
//  j : column index
// Invokes BLAZE_THROW_OUT_OF_RANGE when i >= rows or j >= columns of the wrapped expression.
4236  inline ReturnType at( size_t i, size_t j ) const {
4237  if( i >= matrix_.rows() ) {
4238  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4239  }
4240  if( j >= matrix_.columns() ) {
4241  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4242  }
4243  return (*this)(i,j);
4244  }
4245  //**********************************************************************************************
4246 
4247  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, i.e. the rows of the wrapped multiplication.
4252  inline size_t rows() const {
4253  return matrix_.rows();
4254  }
4255  //**********************************************************************************************
4256 
4257  //**Columns function****************************************************************************
// Returns the number of columns of the expression, i.e. the columns of the wrapped multiplication.
4262  inline size_t columns() const {
4263  return matrix_.columns();
4264  }
4265  //**********************************************************************************************
4266 
4267  //**Left operand access*************************************************************************
// Returns the left-hand side operand: the stored matrix/matrix multiplication expression.
4272  inline LeftOperand leftOperand() const {
4273  return matrix_;
4274  }
4275  //**********************************************************************************************
4276 
4277  //**Right operand access************************************************************************
// Returns the right-hand side operand: the stored scalar.
4282  inline RightOperand rightOperand() const {
4283  return scalar_;
4284  }
4285  //**********************************************************************************************
4286 
4287  //**********************************************************************************************
// Reports whether the expression can alias with the given address.
//  alias : address to check
// The query is delegated unchanged to the wrapped multiplication expression.
4293  template< typename T >
4294  inline bool canAlias( const T* alias ) const {
4295  return matrix_.canAlias( alias );
4296  }
4297  //**********************************************************************************************
4298 
4299  //**********************************************************************************************
// Reports whether the expression is aliased with the given address.
//  alias : address to check
// The query is delegated unchanged to the wrapped multiplication expression.
4305  template< typename T >
4306  inline bool isAliased( const T* alias ) const {
4307  return matrix_.isAliased( alias );
4308  }
4309  //**********************************************************************************************
4310 
4311  //**********************************************************************************************
// Reports the alignment state of the expression by delegating to the wrapped multiplication expression.
4316  inline bool isAligned() const {
4317  return matrix_.isAligned();
4318  }
4319  //**********************************************************************************************
4320 
4321  //**********************************************************************************************
// Reports whether the expression may be used in an SMP assignment.
// Returns true when both conditions hold:
//  - BLAS is not parallel, or rows()*columns() is below TDMATTDMATMULT_THRESHOLD, and
//  - the right operand of the multiplication has more than SMP_TDMATTDMATMULT_THRESHOLD columns.
// NOTE(review): the first condition presumably avoids nesting SMP around a parallel BLAS
// call for large products — confirm against the kernel selection.
4326  inline bool canSMPAssign() const {
4327  typename MMM::RightOperand B( matrix_.rightOperand() );
4328  return ( !BLAZE_BLAS_IS_PARALLEL ||
4329  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4330  ( B.columns() > SMP_TDMATTDMATMULT_THRESHOLD );
4331  }
4332  //**********************************************************************************************
4333 
4334  private:
4335  //**Member variables****************************************************************************
// Operands of the scaling expression, set once by the constructor.
4336  LeftOperand matrix_;  // Left-hand side matrix/matrix multiplication expression
4337  RightOperand scalar_;  // Right-hand side scalar
4338  //**********************************************************************************************
4339 
4340  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled matrix/matrix multiplication (lhs = A*B*scalar) to a dense matrix.
//  lhs : target dense matrix; its size must already match the expression
//  rhs : the scaled multiplication expression
// The overload is disabled (DisableIf) when CanExploitSymmetry<MT,MT1,MT2> holds.
4352  template< typename MT // Type of the target dense matrix
4353  , bool SO > // Storage order of the target dense matrix
4354  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4355  assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4356  {
4358 
4359  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4360  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4361 
      // Fetch both operands of the wrapped multiplication expression.
4362  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4363  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4364 
      // Empty target: nothing to do. Empty inner dimension: only reset( ~lhs ) is called.
4365  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4366  return;
4367  }
4368  else if( left.columns() == 0UL ) {
4369  reset( ~lhs );
4370  return;
4371  }
4372 
      // LT / RT are either const result types (operand gets evaluated) or the composite types,
      // depending on the class-level evaluateLeft / evaluateRight flags.
4373  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4374  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4375 
4376  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4377  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4378  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4379  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4380  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4381  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4382 
      // Hand over to the kernel selection with the prepared operands and the scalar.
4383  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4384  }
4385  //**********************************************************************************************
4386 
4387  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for C = A * B * scalar.
//  C      : target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
// The small-matrix kernel is chosen when the left operand type is diagonal or when the
// number of target elements is below TDMATTDMATMULT_THRESHOLD; otherwise the call goes to
// selectBlasAssignKernel.
4398  template< typename MT3 // Type of the left-hand side target matrix
4399  , typename MT4 // Type of the left-hand side matrix operand
4400  , typename MT5 // Type of the right-hand side matrix operand
4401  , typename ST2 > // Type of the scalar value
4402  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4403  {
4404  if( ( IsDiagonal<MT4>::value ) ||
4405  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
4406  selectSmallAssignKernel( C, A, B, scalar );
4407  else
4408  selectBlasAssignKernel( C, A, B, scalar );
4409  }
4410  //**********************************************************************************************
4411 
4412  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) kernel for C = A * B * scalar; enabled when neither A nor B is diagonal.
//  C      : target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
// The target is processed column by column. For each column j the product is accumulated
// over k, with the k and i ranges narrowed at compile time according to the
// lower/upper/strictly-triangular traits of MT4 and MT5; finally the column is scaled.
4426  template< typename MT3 // Type of the left-hand side target matrix
4427  , typename MT4 // Type of the left-hand side matrix operand
4428  , typename MT5 // Type of the right-hand side matrix operand
4429  , typename ST2 > // Type of the scalar value
4430  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4431  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4432  {
4433  const size_t M( A.rows() );
4434  const size_t N( B.columns() );
4435  const size_t K( A.columns() );
4436 
4437  for( size_t j=0UL; j<N; ++j )
4438  {
      // k range for column j of B, derived from the triangular traits of MT5.
4439  const size_t kbegin( ( IsLower<MT5>::value )
4440  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4441  :( 0UL ) );
4442  const size_t kend( ( IsUpper<MT5>::value )
4443  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4444  :( K ) );
4445  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4446 
      // Strictly triangular B with an empty k range: the whole column of C is reset.
4447  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
4448  for( size_t i=0UL; i<M; ++i ) {
4449  reset( (~C)(i,j) );
4450  }
4451  continue;
4452  }
4453 
      // First k iteration (k == kbegin): initializes column j of C by plain assignment and
      // resets the elements outside the [ibegin,iend) range where the traits call for it.
4454  {
4455  const size_t ibegin( ( IsLower<MT4>::value )
4456  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
4457  :( 0UL ) );
4458  const size_t iend( ( IsUpper<MT4>::value )
4459  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
4460  :( M ) );
4461  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4462 
4463  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
4464  for( size_t i=0UL; i<ibegin; ++i ) {
4465  reset( C(i,j) );
4466  }
4467  }
4468  else if( IsStrictlyLower<MT4>::value ) {
4469  reset( C(0UL,j) );
4470  }
4471  for( size_t i=ibegin; i<iend; ++i ) {
4472  C(i,j) = A(i,kbegin) * B(kbegin,j);
4473  }
4474  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
4475  for( size_t i=iend; i<M; ++i ) {
4476  reset( C(i,j) );
4477  }
4478  }
4479  else if( IsStrictlyUpper<MT4>::value ) {
4480  reset( C(M-1UL,j) );
4481  }
4482  }
4483 
      // Remaining k iterations: accumulate into the already initialized elements.
4484  for( size_t k=kbegin+1UL; k<kend; ++k )
4485  {
4486  const size_t ibegin( ( IsLower<MT4>::value )
4487  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4488  :( 0UL ) );
4489  const size_t iend( ( IsUpper<MT4>::value )
4490  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
4491  :( M ) );
4492  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4493 
4494  for( size_t i=ibegin; i<iend; ++i ) {
4495  C(i,j) += A(i,k) * B(k,j);
4496  }
      // For an upper A the element at row iend is touched for the first time in this
      // iteration, hence it is assigned instead of accumulated.
4497  if( IsUpper<MT4>::value ) {
4498  C(iend,j) = A(iend,k) * B(k,j);
4499  }
4500  }
4501 
      // Final pass: scale the elements of column j inside the row range selected by the
      // combined traits of MT4 and MT5.
4502  {
4503  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
4504  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
4505  :( 0UL ) );
4506  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4507  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
4508  :( M ) );
4509  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4510 
4511  for( size_t i=ibegin; i<iend; ++i ) {
4512  C(i,j) *= scalar;
4513  }
4514  }
4515  }
4516  }
4517  //**********************************************************************************************
4518 
4519  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default kernel for C = A * B * scalar; enabled when A is not diagonal and B is diagonal.
//  C      : target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
// Only the diagonal element B(j,j) is read, so each element is C(i,j) = A(i,j) * B(j,j) * scalar.
4533  template< typename MT3 // Type of the left-hand side target matrix
4534  , typename MT4 // Type of the left-hand side matrix operand
4535  , typename MT5 // Type of the right-hand side matrix operand
4536  , typename ST2 > // Type of the scalar value
4537  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4538  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4539  {
4541 
4542  const size_t M( A.rows() );
4543  const size_t N( B.columns() );
4544 
4545  for( size_t j=0UL; j<N; ++j )
4546  {
      // Row range of column j, derived from the triangular traits of A.
4547  const size_t ibegin( ( IsLower<MT4>::value )
4548  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4549  :( 0UL ) );
4550  const size_t iend( ( IsUpper<MT4>::value )
4551  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
4552  :( M ) );
4553  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4554 
      // Rows before ibegin are reset for a lower A, rows from iend on for an upper A.
4555  if( IsLower<MT4>::value ) {
4556  for( size_t i=0UL; i<ibegin; ++i ) {
4557  reset( C(i,j) );
4558  }
4559  }
4560  for( size_t i=ibegin; i<iend; ++i ) {
4561  C(i,j) = A(i,j) * B(j,j) * scalar;
4562  }
4563  if( IsUpper<MT4>::value ) {
4564  for( size_t i=iend; i<M; ++i ) {
4565  reset( C(i,j) );
4566  }
4567  }
4568  }
4569  }
4570  //**********************************************************************************************
4571 
4572  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default kernel for C = A * B * scalar; enabled when A is diagonal and B is not diagonal.
//  C      : target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
// Only the diagonal element A(i,i) is read, so each element is C(i,j) = A(i,i) * B(i,j) * scalar.
4586  template< typename MT3 // Type of the left-hand side target matrix
4587  , typename MT4 // Type of the left-hand side matrix operand
4588  , typename MT5 // Type of the right-hand side matrix operand
4589  , typename ST2 > // Type of the scalar value
4590  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4591  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4592  {
4594 
4595  const size_t M( A.rows() );
4596  const size_t N( B.columns() );
4597 
4598  for( size_t j=0UL; j<N; ++j )
4599  {
      // Row range of column j, derived from the triangular traits of B.
4600  const size_t ibegin( ( IsLower<MT5>::value )
4601  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4602  :( 0UL ) );
4603  const size_t iend( ( IsUpper<MT5>::value )
4604  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4605  :( M ) );
4606  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4607 
      // The reset loops are guarded by the traits of MT4 (the diagonal operand) while the
      // bounds come from MT5; with a non-triangular B both loops have an empty range.
4608  if( IsLower<MT4>::value ) {
4609  for( size_t i=0UL; i<ibegin; ++i ) {
4610  reset( C(i,j) );
4611  }
4612  }
4613  for( size_t i=ibegin; i<iend; ++i ) {
4614  C(i,j) = A(i,i) * B(i,j) * scalar;
4615  }
4616  if( IsUpper<MT4>::value ) {
4617  for( size_t i=iend; i<M; ++i ) {
4618  reset( C(i,j) );
4619  }
4620  }
4621  }
4622  }
4623  //**********************************************************************************************
4624 
4625  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default kernel for C = A * B * scalar; enabled when both A and B are diagonal.
//  C      : target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
// The target is reset as a whole and then only its diagonal is written.
4639  template< typename MT3 // Type of the left-hand side target matrix
4640  , typename MT4 // Type of the left-hand side matrix operand
4641  , typename MT5 // Type of the right-hand side matrix operand
4642  , typename ST2 > // Type of the scalar value
4643  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4644  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4645  {
4647 
4648  reset( C );
4649 
4650  for( size_t i=0UL; i<A.rows(); ++i ) {
4651  C(i,i) = A(i,i) * B(i,i) * scalar;
4652  }
4653  }
4654  //**********************************************************************************************
4655 
4656  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix kernel for type combinations that do NOT qualify for the vectorized default
// kernel (DisableIf on UseVectorizedDefaultKernel): forwards to selectDefaultAssignKernel.
//  C      : target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
4670  template< typename MT3 // Type of the left-hand side target matrix
4671  , typename MT4 // Type of the left-hand side matrix operand
4672  , typename MT5 // Type of the right-hand side matrix operand
4673  , typename ST2 > // Type of the scalar value
4674  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4675  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4676  {
4677  selectDefaultAssignKernel( C, A, B, scalar );
4678  }
4679  //**********************************************************************************************
4680 
4681  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Vectorized small-matrix kernel for a row-major target (DenseMatrix<MT3,false>).
//  C      : row-major target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
// No loops are run here: one operand is copied into a temporary of its OppositeType and the
// product is re-dispatched through assign() as a new expression.
// Choice of the operand to convert:
//  - only A resizable  -> convert B,
//  - only B resizable  -> convert A,
//  - otherwise         -> convert B when it has no more elements than A, else convert A.
// NOTE(review): OppositeType is presumably the same matrix with opposite storage order, and
// the resizable checks presumably prefer converting the statically sized operand — confirm.
4696  template< typename MT3 // Type of the left-hand side target matrix
4697  , typename MT4 // Type of the left-hand side matrix operand
4698  , typename MT5 // Type of the right-hand side matrix operand
4699  , typename ST2 > // Type of the scalar value
4700  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4701  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4702  {
4707 
4708  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
4709  const typename MT5::OppositeType tmp( serial( B ) );
4710  assign( ~C, A * tmp * scalar );
4711  }
4712  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
4713  const typename MT4::OppositeType tmp( serial( A ) );
4714  assign( ~C, tmp * B * scalar );
4715  }
4716  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
4717  const typename MT5::OppositeType tmp( serial( B ) );
4718  assign( ~C, A * tmp * scalar );
4719  }
4720  else {
4721  const typename MT4::OppositeType tmp( serial( A ) );
4722  assign( ~C, tmp * B * scalar );
4723  }
4724  }
4725  //**********************************************************************************************
4726 
4727  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized small-matrix kernel for a column-major target (DenseMatrix<MT3,true>).
//  C      : column-major target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
// Rows of C are processed in chunks of 8, 4, 2 and 1 times IT::size via A.load()/(~C).store(),
// followed by an element-wise loop for the rows beyond ipos. In every chunk the k range
// [kbegin,kend) is narrowed according to the lower/upper traits of MT4 and MT5, and the
// accumulated result is multiplied by the scalar when it is stored.
// NOTE(review): the xmm accumulators are declared without an initializer; the kernel relies
// on IntrinsicType's default construction yielding zero — confirm in the intrinsics headers.
4742  template< typename MT3 // Type of the left-hand side target matrix
4743  , typename MT4 // Type of the left-hand side matrix operand
4744  , typename MT5 // Type of the right-hand side matrix operand
4745  , typename ST2 > // Type of the scalar value
4746  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4747  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
4748  {
4749  typedef IntrinsicTrait<ElementType> IT;
4750 
4751  const size_t M( A.rows() );
4752  const size_t N( B.columns() );
4753  const size_t K( A.columns() );
4754 
      // An element-wise remainder loop is needed unless both C and A are padded.
4755  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
4756 
      // ipos: end of the vectorized row range; with a remainder it is M rounded down to a
      // multiple of IT::size (verified by the assertion below), otherwise M itself.
4757  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
4758  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
4759 
      // The scalar broadcast into an intrinsic value, applied on every vectorized store.
4760  const IntrinsicType factor( set( scalar ) );
4761 
4762  size_t i( 0UL );
4763 
      // Chunk of 8*IT::size rows, one column of C per inner iteration.
4764  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
4765  for( size_t j=0UL; j<N; ++j )
4766  {
4767  const size_t kbegin( ( IsLower<MT5>::value )
4768  ?( ( IsUpper<MT4>::value )
4769  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4770  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4771  :( IsUpper<MT4>::value ? i : 0UL ) );
4772  const size_t kend( ( IsUpper<MT5>::value )
4773  ?( ( IsLower<MT4>::value )
4774  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4775  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4776  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
4777 
4778  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4779 
4780  for( size_t k=kbegin; k<kend; ++k ) {
4781  const IntrinsicType b1( set( B(k,j) ) );
4782  xmm1 = xmm1 + A.load(i ,k) * b1;
4783  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4784  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4785  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4786  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
4787  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
4788  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
4789  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
4790  }
4791 
4792  (~C).store( i , j, xmm1 * factor );
4793  (~C).store( i+IT::size , j, xmm2 * factor );
4794  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
4795  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
4796  (~C).store( i+IT::size*4UL, j, xmm5 * factor );
4797  (~C).store( i+IT::size*5UL, j, xmm6 * factor );
4798  (~C).store( i+IT::size*6UL, j, xmm7 * factor );
4799  (~C).store( i+IT::size*7UL, j, xmm8 * factor );
4800  }
4801  }
4802 
      // Chunk of 4*IT::size rows, two columns of C at a time plus a single trailing column.
4803  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
4804  {
4805  size_t j( 0UL );
4806 
4807  for( ; (j+2UL) <= N; j+=2UL )
4808  {
4809  const size_t kbegin( ( IsLower<MT5>::value )
4810  ?( ( IsUpper<MT4>::value )
4811  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4812  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4813  :( IsUpper<MT4>::value ? i : 0UL ) );
4814  const size_t kend( ( IsUpper<MT5>::value )
4815  ?( ( IsLower<MT4>::value )
4816  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4817  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4818  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
4819 
4820  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4821 
4822  for( size_t k=kbegin; k<kend; ++k ) {
4823  const IntrinsicType a1( A.load(i ,k) );
4824  const IntrinsicType a2( A.load(i+IT::size ,k) );
4825  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
4826  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
4827  const IntrinsicType b1( set( B(k,j ) ) );
4828  const IntrinsicType b2( set( B(k,j+1UL) ) );
4829  xmm1 = xmm1 + a1 * b1;
4830  xmm2 = xmm2 + a2 * b1;
4831  xmm3 = xmm3 + a3 * b1;
4832  xmm4 = xmm4 + a4 * b1;
4833  xmm5 = xmm5 + a1 * b2;
4834  xmm6 = xmm6 + a2 * b2;
4835  xmm7 = xmm7 + a3 * b2;
4836  xmm8 = xmm8 + a4 * b2;
4837  }
4838 
4839  (~C).store( i , j , xmm1 * factor );
4840  (~C).store( i+IT::size , j , xmm2 * factor );
4841  (~C).store( i+IT::size*2UL, j , xmm3 * factor );
4842  (~C).store( i+IT::size*3UL, j , xmm4 * factor );
4843  (~C).store( i , j+1UL, xmm5 * factor );
4844  (~C).store( i+IT::size , j+1UL, xmm6 * factor );
4845  (~C).store( i+IT::size*2UL, j+1UL, xmm7 * factor );
4846  (~C).store( i+IT::size*3UL, j+1UL, xmm8 * factor );
4847  }
4848 
      // Trailing column when N is odd.
4849  if( j < N )
4850  {
4851  const size_t kbegin( ( IsLower<MT5>::value )
4852  ?( ( IsUpper<MT4>::value )
4853  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4854  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4855  :( IsUpper<MT4>::value ? i : 0UL ) );
4856  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
4857 
4858  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4859 
4860  for( size_t k=kbegin; k<kend; ++k ) {
4861  const IntrinsicType b1( set( B(k,j) ) );
4862  xmm1 = xmm1 + A.load(i ,k) * b1;
4863  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4864  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4865  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4866  }
4867 
4868  (~C).store( i , j, xmm1 * factor );
4869  (~C).store( i+IT::size , j, xmm2 * factor );
4870  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
4871  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
4872  }
4873  }
4874 
      // Chunk of 2*IT::size rows, two columns of C at a time plus a single trailing column.
4875  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
4876  {
4877  size_t j( 0UL );
4878 
4879  for( ; (j+2UL) <= N; j+=2UL )
4880  {
4881  const size_t kbegin( ( IsLower<MT5>::value )
4882  ?( ( IsUpper<MT4>::value )
4883  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4884  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4885  :( IsUpper<MT4>::value ? i : 0UL ) );
4886  const size_t kend( ( IsUpper<MT5>::value )
4887  ?( ( IsLower<MT4>::value )
4888  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4889  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4890  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
4891 
4892  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4893 
4894  for( size_t k=kbegin; k<kend; ++k ) {
4895  const IntrinsicType a1( A.load(i ,k) );
4896  const IntrinsicType a2( A.load(i+IT::size,k) );
4897  const IntrinsicType b1( set( B(k,j ) ) );
4898  const IntrinsicType b2( set( B(k,j+1UL) ) );
4899  xmm1 = xmm1 + a1 * b1;
4900  xmm2 = xmm2 + a2 * b1;
4901  xmm3 = xmm3 + a1 * b2;
4902  xmm4 = xmm4 + a2 * b2;
4903  }
4904 
4905  (~C).store( i , j , xmm1 * factor );
4906  (~C).store( i+IT::size, j , xmm2 * factor );
4907  (~C).store( i , j+1UL, xmm3 * factor );
4908  (~C).store( i+IT::size, j+1UL, xmm4 * factor );
4909  }
4910 
      // Trailing column when N is odd.
4911  if( j < N )
4912  {
4913  const size_t kbegin( ( IsLower<MT5>::value )
4914  ?( ( IsUpper<MT4>::value )
4915  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4916  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4917  :( IsUpper<MT4>::value ? i : 0UL ) );
4918  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
4919 
4920  IntrinsicType xmm1, xmm2;
4921 
4922  for( size_t k=kbegin; k<kend; ++k ) {
4923  const IntrinsicType b1( set( B(k,j) ) );
4924  xmm1 = xmm1 + A.load(i ,k) * b1;
4925  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
4926  }
4927 
4928  (~C).store( i , j, xmm1 * factor );
4929  (~C).store( i+IT::size, j, xmm2 * factor );
4930  }
4931  }
4932 
      // Chunk of IT::size rows, two columns of C at a time plus a single trailing column.
      // Here kend is not limited by the lower trait of MT4, unlike in the wider chunks.
4933  for( ; i<ipos; i+=IT::size )
4934  {
4935  size_t j( 0UL );
4936 
4937  for( ; (j+2UL) <= N; j+=2UL )
4938  {
4939  const size_t kbegin( ( IsLower<MT5>::value )
4940  ?( ( IsUpper<MT4>::value )
4941  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4942  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4943  :( IsUpper<MT4>::value ? i : 0UL ) );
4944  const size_t kend( ( IsUpper<MT5>::value )
4945  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4946  :( K ) );
4947 
4948  IntrinsicType xmm1, xmm2;
4949 
4950  for( size_t k=kbegin; k<kend; ++k ) {
4951  const IntrinsicType a1( A.load(i,k) );
4952  xmm1 = xmm1 + a1 * set( B(k,j ) );
4953  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
4954  }
4955 
4956  (~C).store( i, j , xmm1 * factor );
4957  (~C).store( i, j+1UL, xmm2 * factor );
4958  }
4959 
      // Trailing column when N is odd; k runs up to K.
4960  if( j < N )
4961  {
4962  const size_t kbegin( ( IsLower<MT5>::value )
4963  ?( ( IsUpper<MT4>::value )
4964  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4965  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4966  :( IsUpper<MT4>::value ? i : 0UL ) );
4967 
4968  IntrinsicType xmm1;
4969 
4970  for( size_t k=kbegin; k<K; ++k ) {
4971  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
4972  }
4973 
4974  (~C).store( i, j, xmm1 * factor );
4975  }
4976  }
4977 
      // Element-wise remainder: rows from ipos up to M, only executed when 'remainder' is set.
      // Accumulators start from a value-initialized ElementType and are scaled on assignment.
4978  for( ; remainder && i<M; ++i )
4979  {
4980  size_t j( 0UL );
4981 
4982  for( ; (j+2UL) <= N; j+=2UL )
4983  {
4984  const size_t kbegin( ( IsLower<MT5>::value )
4985  ?( ( IsUpper<MT4>::value )
4986  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4987  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4988  :( IsUpper<MT4>::value ? i : 0UL ) );
4989  const size_t kend( ( IsUpper<MT5>::value )
4990  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4991  :( K ) );
4992 
4993  ElementType value1 = ElementType();
4994  ElementType value2 = ElementType();
4995 
4996  for( size_t k=kbegin; k<kend; ++k ) {
4997  value1 += A(i,k) * B(k,j );
4998  value2 += A(i,k) * B(k,j+1UL);
4999  }
5000 
5001  (~C)(i,j ) = value1 * scalar;
5002  (~C)(i,j+1UL) = value2 * scalar;
5003  }
5004 
      // Trailing column when N is odd.
5005  if( j < N )
5006  {
5007  const size_t kbegin( ( IsLower<MT5>::value )
5008  ?( ( IsUpper<MT4>::value )
5009  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5010  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5011  :( IsUpper<MT4>::value ? i : 0UL ) );
5012 
5013  ElementType value = ElementType();
5014 
5015  for( size_t k=kbegin; k<K; ++k ) {
5016  value += A(i,k) * B(k,j);
5017  }
5018 
5019  (~C)(i,j) = value * scalar;
5020  }
5021  }
5022  }
5023  //**********************************************************************************************
5024 
5025  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix kernel for type combinations that do NOT qualify for the vectorized default
// kernel (DisableIf on UseVectorizedDefaultKernel): forwards to selectDefaultAssignKernel.
//  C      : target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
5039  template< typename MT3 // Type of the left-hand side target matrix
5040  , typename MT4 // Type of the left-hand side matrix operand
5041  , typename MT5 // Type of the right-hand side matrix operand
5042  , typename ST2 > // Type of the scalar value
5043  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5044  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5045  {
5046  selectDefaultAssignKernel( C, A, B, scalar );
5047  }
5048  //**********************************************************************************************
5049 
5050  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
// Vectorized large-matrix kernel for a row-major target (DenseMatrix<MT3,false>).
//  C      : row-major target matrix
//  A, B   : left and right matrix operands
//  scalar : scaling factor
// There is no dedicated blocked implementation for this case: the call is forwarded to
// selectSmallAssignKernel with the unwrapped target (~C).
5065  template< typename MT3 // Type of the left-hand side target matrix
5066  , typename MT4 // Type of the left-hand side matrix operand
5067  , typename MT5 // Type of the right-hand side matrix operand
5068  , typename ST2 > // Type of the scalar value
5069  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5070  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5071  {
5072  selectSmallAssignKernel( ~C, A, B, scalar );
5073  }
5074  //**********************************************************************************************
5075 
5076  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
5091  template< typename MT3 // Type of the left-hand side target matrix
5092  , typename MT4 // Type of the left-hand side matrix operand
5093  , typename MT5 // Type of the right-hand side matrix operand
5094  , typename ST2 > // Type of the scalar value
5095  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5096  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5097  {
5098  typedef IntrinsicTrait<ElementType> IT;
5099 
5100  const size_t M( A.rows() );
5101  const size_t N( B.columns() );
5102  const size_t K( A.columns() );
5103 
5104  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
5105 
5106  const IntrinsicType factor( set( scalar ) );
5107 
5108  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
5109  {
5110  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
5111 
5112  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
5113  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
5114 
5115  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
5116  {
5117  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
5118 
5119  for( size_t j=jj; j<jend; ++j ) {
5120  for( size_t i=ii; i<iend; ++i ) {
5121  reset( (~C)(i,j) );
5122  }
5123  }
5124 
5125  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
5126  {
5127  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
5128 
5129  size_t i( ii );
5130 
5131  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
5132  {
5133  const size_t i1( i+IT::size );
5134  const size_t i2( i+IT::size*2UL );
5135  const size_t i3( i+IT::size*3UL );
5136 
5137  size_t j( jj );
5138 
5139  for( ; (j+2UL) <= jend; j+=2UL )
5140  {
5141  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5142  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5143  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
5144  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5145 
5146  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5147 
5148  for( size_t k=kbegin; k<kend; ++k ) {
5149  const IntrinsicType a1( A.load(i ,k) );
5150  const IntrinsicType a2( A.load(i1,k) );
5151  const IntrinsicType a3( A.load(i2,k) );
5152  const IntrinsicType a4( A.load(i3,k) );
5153  const IntrinsicType b1( set( B(k,j ) ) );
5154  const IntrinsicType b2( set( B(k,j+1UL) ) );
5155  xmm1 = xmm1 + a1 * b1;
5156  xmm2 = xmm2 + a2 * b1;
5157  xmm3 = xmm3 + a3 * b1;
5158  xmm4 = xmm4 + a4 * b1;
5159  xmm5 = xmm5 + a1 * b2;
5160  xmm6 = xmm6 + a2 * b2;
5161  xmm7 = xmm7 + a3 * b2;
5162  xmm8 = xmm8 + a4 * b2;
5163  }
5164 
5165  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5166  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5167  (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
5168  (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
5169  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
5170  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
5171  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
5172  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
5173  }
5174 
5175  if( j < jend )
5176  {
5177  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5178  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5179  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
5180  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5181 
5182  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5183 
5184  for( size_t k=kbegin; k<kend; ++k ) {
5185  const IntrinsicType b1( set( B(k,j) ) );
5186  xmm1 = xmm1 + A.load(i ,k) * b1;
5187  xmm2 = xmm2 + A.load(i1,k) * b1;
5188  xmm3 = xmm3 + A.load(i2,k) * b1;
5189  xmm4 = xmm4 + A.load(i3,k) * b1;
5190  }
5191 
5192  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5193  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
5194  (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
5195  (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
5196  }
5197  }
5198 
5199  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
5200  {
5201  const size_t i1( i+IT::size );
5202 
5203  size_t j( jj );
5204 
5205  for( ; (j+4UL) <= jend; j+=4UL )
5206  {
5207  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5208  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5209  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
5210  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5211 
5212  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5213 
5214  for( size_t k=kbegin; k<kend; ++k ) {
5215  const IntrinsicType a1( A.load(i ,k) );
5216  const IntrinsicType a2( A.load(i1,k) );
5217  const IntrinsicType b1( set( B(k,j ) ) );
5218  const IntrinsicType b2( set( B(k,j+1UL) ) );
5219  const IntrinsicType b3( set( B(k,j+2UL) ) );
5220  const IntrinsicType b4( set( B(k,j+3UL) ) );
5221  xmm1 = xmm1 + a1 * b1;
5222  xmm2 = xmm2 + a2 * b1;
5223  xmm3 = xmm3 + a1 * b2;
5224  xmm4 = xmm4 + a2 * b2;
5225  xmm5 = xmm5 + a1 * b3;
5226  xmm6 = xmm6 + a2 * b3;
5227  xmm7 = xmm7 + a1 * b4;
5228  xmm8 = xmm8 + a2 * b4;
5229  }
5230 
5231  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5232  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5233  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5234  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
5235  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
5236  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
5237  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
5238  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
5239  }
5240 
5241  for( ; (j+2UL) <= jend; j+=2UL )
5242  {
5243  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5244  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5245  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
5246  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5247 
5248  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5249 
5250  for( size_t k=kbegin; k<kend; ++k ) {
5251  const IntrinsicType a1( A.load(i ,k) );
5252  const IntrinsicType a2( A.load(i1,k) );
5253  const IntrinsicType b1( set( B(k,j ) ) );
5254  const IntrinsicType b2( set( B(k,j+1UL) ) );
5255  xmm1 = xmm1 + a1 * b1;
5256  xmm2 = xmm2 + a2 * b1;
5257  xmm3 = xmm3 + a1 * b2;
5258  xmm4 = xmm4 + a2 * b2;
5259  }
5260 
5261  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5262  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5263  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5264  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
5265  }
5266 
5267  if( j < jend )
5268  {
5269  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5270  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5271  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
5272  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5273 
5274  IntrinsicType xmm1, xmm2;
5275 
5276  for( size_t k=kbegin; k<kend; ++k ) {
5277  const IntrinsicType b1( set( B(k,j) ) );
5278  xmm1 = xmm1 + A.load(i ,k) * b1;
5279  xmm2 = xmm2 + A.load(i1,k) * b1;
5280  }
5281 
5282  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5283  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
5284  }
5285  }
5286 
5287  for( ; i<ipos; i+=IT::size )
5288  {
5289  for( size_t j=jj; j<jend; ++j )
5290  {
5291  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5292  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5293  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
5294  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5295 
5296  IntrinsicType xmm1;
5297 
5298  for( size_t k=kbegin; k<kend; ++k ) {
5299  const IntrinsicType b1( set( B(k,j) ) );
5300  xmm1 = xmm1 + A.load(i,k) * b1;
5301  }
5302 
5303  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
5304  }
5305  }
5306 
5307  for( ; remainder && i<iend; ++i )
5308  {
5309  for( size_t j=jj; j<jend; ++j )
5310  {
5311  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5312  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5313  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
5314  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5315 
5316  ElementType value = ElementType();
5317 
5318  for( size_t k=kbegin; k<kend; ++k ) {
5319  value += A(i,k) * B(k,j);
5320  }
5321 
5322  (~C)(i,j) += value * scalar;
5323  }
5324  }
5325  }
5326  }
5327  }
5328  }
5329  //**********************************************************************************************
5330 
5331  //**BLAS-based assignment to dense matrices (default)*******************************************
5345  template< typename MT3 // Type of the left-hand side target matrix
5346  , typename MT4 // Type of the left-hand side matrix operand
5347  , typename MT5 // Type of the right-hand side matrix operand
5348  , typename ST2 > // Type of the scalar value
5349  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5350  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5351  {
5352  selectLargeAssignKernel( C, A, B, scalar );
5353  }
5354  //**********************************************************************************************
5355 
5356  //**BLAS-based assignment to dense matrices*****************************************************
5357 #if BLAZE_BLAS_MODE
5358 
5371  template< typename MT3 // Type of the left-hand side target matrix
5372  , typename MT4 // Type of the left-hand side matrix operand
5373  , typename MT5 // Type of the right-hand side matrix operand
5374  , typename ST2 > // Type of the scalar value
5375  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5376  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5377  {
5378  typedef typename MT3::ElementType ET;
5379 
5380  if( IsTriangular<MT4>::value ) {
5381  assign( C, B );
5382  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5383  }
5384  else if( IsTriangular<MT5>::value ) {
5385  assign( C, A );
5386  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5387  }
5388  else {
5389  gemm( C, A, B, ET(scalar), ET(0) );
5390  }
5391  }
5392 #endif
5393  //**********************************************************************************************
5394 
5395  //**Assignment to sparse matrices***************************************************************
// Assignment of the scaled matrix product to a sparse matrix.
//
// lhs : the target sparse matrix
// rhs : the scaled product expression to be assigned
//
// Only viable when CanExploitSymmetry<MT,MT1,MT2> does not hold (DisableIf). The
// expression is first evaluated serially into a temporary whose type is chosen by the
// storage order flag SO, and that temporary is then assigned to the target.
// NOTE(review): the listing numbering skips entries inside this body (5412, 5416-5421);
// statements may have been dropped by the extraction -- compare with the real header.
5407  template< typename MT // Type of the target sparse matrix
5408  , bool SO > // Storage order of the target sparse matrix
5409  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5410  assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5411  {
5413 
// SelectType picks between ResultType and OppositeType depending on SO.
5414  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
5415 
5422 
5423  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5424  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5425 
// Evaluate the whole expression serially into the temporary, then assign it.
5426  const TmpType tmp( serial( rhs ) );
5427  assign( ~lhs, tmp );
5428  }
5429  //**********************************************************************************************
5430 
5431  //**Restructuring assignment to row-major matrices**********************************************
// Restructuring assignment of the scaled matrix product to a row-major matrix.
//
// lhs : the target row-major matrix (storage order flag false)
// rhs : the scaled product expression to be assigned
//
// Only viable when CanExploitSymmetry<MT,MT1,MT2> holds (EnableIf). The product is
// re-expressed with trans() applied to one or both operands and assigned in that form.
// NOTE(review): the listing numbering skips entries inside this body (5449, 5451);
// statements may have been dropped by the extraction -- compare with the real header.
5445  template< typename MT > // Type of the target matrix
5446  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5447  assign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
5448  {
5450 
5452 
5453  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5454  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5455 
// Operands of the wrapped matrix product.
5456  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5457  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
5458 
// trans() is applied to every operand whose type is symmetric. The final branch
// transposes only the right operand; presumably MT2 is the symmetric one there,
// since CanExploitSymmetry is required -- TODO confirm.
5459  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
5460  assign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
5461  else if( IsSymmetric<MT1>::value )
5462  assign( ~lhs, trans( left ) * right * rhs.scalar_ );
5463  else
5464  assign( ~lhs, left * trans( right ) * rhs.scalar_ );
5465  }
5466  //**********************************************************************************************
5467 
5468  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the scaled matrix product to a dense matrix.
//
// lhs : the target dense matrix
// rhs : the scaled product expression to be added
//
// Only viable when CanExploitSymmetry<MT,MT1,MT2> does not hold (DisableIf). Both
// operands of the product are evaluated serially, then the work is handed to
// selectAddAssignKernel() together with the scalar of the expression.
// NOTE(review): the listing numbering skips entry 5485 inside this body; a statement
// may have been dropped by the extraction -- compare with the real header.
5480  template< typename MT // Type of the target dense matrix
5481  , bool SO > // Storage order of the target dense matrix
5482  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5483  addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5484  {
5486 
5487  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5488  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5489 
5490  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5491  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
5492 
// Nothing to add when the target is empty or the inner dimension is zero.
5493  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
5494  return;
5495  }
5496 
5497  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5498  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5499 
// The evaluated operands must keep the dimensions of the unevaluated ones and
// must match the target.
5500  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5501  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5502  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5503  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5504  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5505  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
5506 
5507  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
5508  }
5509  //**********************************************************************************************
5510 
5511  //**Addition assignment to dense matrices (kernel selection)************************************
5522  template< typename MT3 // Type of the left-hand side target matrix
5523  , typename MT4 // Type of the left-hand side matrix operand
5524  , typename MT5 // Type of the right-hand side matrix operand
5525  , typename ST2 > // Type of the scalar value
5526  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5527  {
5528  if( ( IsDiagonal<MT4>::value ) ||
5529  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5530  selectSmallAddAssignKernel( C, A, B, scalar );
5531  else
5532  selectBlasAddAssignKernel( C, A, B, scalar );
5533  }
5534  //**********************************************************************************************
5535 
5536  //**Default addition assignment to dense matrices (general/general)*****************************
5550  template< typename MT3 // Type of the left-hand side target matrix
5551  , typename MT4 // Type of the left-hand side matrix operand
5552  , typename MT5 // Type of the right-hand side matrix operand
5553  , typename ST2 > // Type of the scalar value
5554  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
5555  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5556  {
5557  const ResultType tmp( serial( A * B * scalar ) );
5558  addAssign( C, tmp );
5559  }
5560  //**********************************************************************************************
5561 
5562  //**Default addition assignment to dense matrices (general/diagonal)****************************
// Default addition assignment kernel for a non-diagonal left and a diagonal right operand.
//
// C      : target matrix of the addition assignment
// A      : left-hand side operand (not diagonal)
// B      : right-hand side operand (diagonal); only its elements B(j,j) are read
// scalar : scaling factor of the product
//
// For every column j the update C(i,j) += A(i,j) * B(j,j) * scalar is applied to the
// rows in [ibegin,iend), processed two at a time with a single trailing row if needed.
// NOTE(review): the listing numbering skips entry 5583 inside this body; a statement
// may have been dropped by the extraction -- compare with the real header.
5576  template< typename MT3 // Type of the left-hand side target matrix
5577  , typename MT4 // Type of the left-hand side matrix operand
5578  , typename MT5 // Type of the right-hand side matrix operand
5579  , typename ST2 > // Type of the scalar value
5580  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5581  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5582  {
5584 
5585  const size_t M( A.rows() );
5586  const size_t N( B.columns() );
5587 
5588  for( size_t j=0UL; j<N; ++j )
5589  {
// Row range of column j, narrowed by the lower/upper (strict or not) traits of A.
5590  const size_t ibegin( ( IsLower<MT4>::value )
5591  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
5592  :( 0UL ) );
5593  const size_t iend( ( IsUpper<MT4>::value )
5594  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
5595  :( M ) );
5596  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5597 
// ipos marks the end of the part of the range that can be handled in pairs
// (the row count rounded down to an even number).
5598  const size_t inum( iend - ibegin );
5599  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
5600 
5601  for( size_t i=ibegin; i<ipos; i+=2UL ) {
5602  C(i ,j) += A(i ,j) * B(j,j) * scalar;
5603  C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
5604  }
// Odd number of rows: handle the last one separately.
5605  if( ipos < iend ) {
5606  C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
5607  }
5608  }
5609  }
5610  //**********************************************************************************************
5611 
5612  //**Default addition assignment to dense matrices (diagonal/general)****************************
// Default addition assignment kernel for a diagonal left and a non-diagonal right operand.
//
// C      : target matrix of the addition assignment
// A      : left-hand side operand (diagonal); only its elements A(i,i) are read
// B      : right-hand side operand (not diagonal)
// scalar : scaling factor of the product
//
// For every column j the update C(i,j) += A(i,i) * B(i,j) * scalar is applied to the
// rows in [ibegin,iend), processed two at a time with a single trailing row if needed.
// NOTE(review): the listing numbering skips entry 5633 inside this body; a statement
// may have been dropped by the extraction -- compare with the real header.
5626  template< typename MT3 // Type of the left-hand side target matrix
5627  , typename MT4 // Type of the left-hand side matrix operand
5628  , typename MT5 // Type of the right-hand side matrix operand
5629  , typename ST2 > // Type of the scalar value
5630  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5631  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5632  {
5634 
5635  const size_t M( A.rows() );
5636  const size_t N( B.columns() );
5637 
5638  for( size_t j=0UL; j<N; ++j )
5639  {
// Row range of column j, narrowed by the lower/upper (strict or not) traits of B.
5640  const size_t ibegin( ( IsLower<MT5>::value )
5641  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5642  :( 0UL ) );
5643  const size_t iend( ( IsUpper<MT5>::value )
5644  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
5645  :( M ) );
5646  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5647 
// ipos marks the end of the part of the range that can be handled in pairs
// (the row count rounded down to an even number).
5648  const size_t inum( iend - ibegin );
5649  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
5650 
5651  for( size_t i=ibegin; i<ipos; i+=2UL ) {
5652  C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5653  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
5654  }
// Odd number of rows: handle the last one separately.
5655  if( ipos < iend ) {
5656  C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
5657  }
5658  }
5659  }
5660  //**********************************************************************************************
5661 
5662  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default addition assignment kernel for two diagonal operands.
//
// C      : target matrix of the addition assignment
// A      : left-hand side operand (diagonal)
// B      : right-hand side operand (diagonal)
// scalar : scaling factor of the product
//
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every
// row index i of A.
// NOTE(review): the listing numbering skips entry 5683 inside this body; a statement
// may have been dropped by the extraction -- compare with the real header.
5676  template< typename MT3 // Type of the left-hand side target matrix
5677  , typename MT4 // Type of the left-hand side matrix operand
5678  , typename MT5 // Type of the right-hand side matrix operand
5679  , typename ST2 > // Type of the scalar value
5680  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
5681  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5682  {
5684 
5685  for( size_t i=0UL; i<A.rows(); ++i ) {
5686  C(i,i) += A(i,i) * B(i,i) * scalar;
5687  }
5688  }
5689  //**********************************************************************************************
5690 
5691  //**Default addition assignment to dense matrices (small matrices)******************************
5705  template< typename MT3 // Type of the left-hand side target matrix
5706  , typename MT4 // Type of the left-hand side matrix operand
5707  , typename MT5 // Type of the right-hand side matrix operand
5708  , typename ST2 > // Type of the scalar value
5709  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5710  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5711  {
5712  selectDefaultAddAssignKernel( C, A, B, scalar );
5713  }
5714  //**********************************************************************************************
5715 
5716  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized small-matrix addition assignment kernel for a row-major target.
//
// C      : target row-major dense matrix (storage order flag false)
// A      : left-hand side operand of the product
// B      : right-hand side operand of the product
// scalar : scaling factor of the product
//
// Only viable when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf). One
// of the two operands is copied serially into a temporary of its OppositeType and the
// product is then re-issued through addAssign() with that temporary in its place.
// NOTE(review): the listing numbering skips entries inside this body (5738-5741);
// statements may have been dropped by the extraction -- compare with the real header.
5731  template< typename MT3 // Type of the left-hand side target matrix
5732  , typename MT4 // Type of the left-hand side matrix operand
5733  , typename MT5 // Type of the right-hand side matrix operand
5734  , typename ST2 > // Type of the scalar value
5735  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5736  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5737  {
5742 
// Exactly one operand type is resizable: the non-resizable operand is the one converted.
5743  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
5744  const typename MT5::OppositeType tmp( serial( B ) );
5745  addAssign( ~C, A * tmp * scalar );
5746  }
5747  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
5748  const typename MT4::OppositeType tmp( serial( A ) );
5749  addAssign( ~C, tmp * B * scalar );
5750  }
// Otherwise the operand with fewer elements is converted; B is taken on a tie.
5751  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5752  const typename MT5::OppositeType tmp( serial( B ) );
5753  addAssign( ~C, A * tmp * scalar );
5754  }
5755  else {
5756  const typename MT4::OppositeType tmp( serial( A ) );
5757  addAssign( ~C, tmp * B * scalar );
5758  }
5759  }
5760  //**********************************************************************************************
5761 
5762  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized small-matrix addition assignment kernel for a column-major target.
//
// C      : target column-major dense matrix (storage order flag true)
// A      : left-hand side operand; read with A.load(i,k) in the vectorized loops
// B      : right-hand side operand; single elements B(k,j) are broadcast with set()
// scalar : scaling factor of the product
//
// Only viable when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf). The
// kernel adds scalar * (A * B) to C. Rows are processed in tiers of 8, 4, 2 and 1
// intrinsic vectors (IT::size elements each), followed by an element-wise loop for
// rows that do not fill a whole vector. In every tier the k range [kbegin,kend) is
// narrowed according to the lower/upper (strict or not) traits of the operand types.
// NOTE(review): the xmm accumulators are default-constructed and then summed into;
// this assumes IntrinsicType's default constructor yields zero -- TODO confirm.
5777  template< typename MT3 // Type of the left-hand side target matrix
5778  , typename MT4 // Type of the left-hand side matrix operand
5779  , typename MT5 // Type of the right-hand side matrix operand
5780  , typename ST2 > // Type of the scalar value
5781  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5782  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5783  {
5784  typedef IntrinsicTrait<ElementType> IT;
5785 
5786  const size_t M( A.rows() );
5787  const size_t N( B.columns() );
5788  const size_t K( A.columns() );
5789 
// A scalar remainder loop is required unless both the target and A are padded types.
5790  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
5791 
// With remainder handling, ipos is M rounded down to a multiple of IT::size
// (checked by the assertion below); otherwise the vectorized loops cover all M rows.
5792  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
5793  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
5794 
// The scalar broadcast into an intrinsic vector, applied once per store.
5795  const IntrinsicType factor( set( scalar ) );
5796 
5797  size_t i( 0UL );
5798 
// Tier 1: eight intrinsic vectors of rows, one column at a time.
5799  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
5800  for( size_t j=0UL; j<N; ++j )
5801  {
5802  const size_t kbegin( ( IsLower<MT5>::value )
5803  ?( ( IsUpper<MT4>::value )
5804  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5805  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5806  :( IsUpper<MT4>::value ? i : 0UL ) );
5807  const size_t kend( ( IsUpper<MT5>::value )
5808  ?( ( IsLower<MT4>::value )
5809  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
5810  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
5811  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
5812 
5813  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5814 
5815  for( size_t k=kbegin; k<kend; ++k ) {
5816  const IntrinsicType b1( set( B(k,j) ) );
5817  xmm1 = xmm1 + A.load(i ,k) * b1;
5818  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
5819  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
5820  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
5821  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
5822  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
5823  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
5824  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
5825  }
5826 
// Add the scaled partial sums to the current content of C.
5827  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5828  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
5829  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
5830  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
5831  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) + xmm5 * factor );
5832  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) + xmm6 * factor );
5833  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) + xmm7 * factor );
5834  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) + xmm8 * factor );
5835  }
5836  }
5837 
// Tier 2: four intrinsic vectors of rows, two columns at a time plus one trailing column.
5838  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
5839  {
5840  size_t j( 0UL );
5841 
5842  for( ; (j+2UL) <= N; j+=2UL )
5843  {
5844  const size_t kbegin( ( IsLower<MT5>::value )
5845  ?( ( IsUpper<MT4>::value )
5846  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5847  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5848  :( IsUpper<MT4>::value ? i : 0UL ) );
5849  const size_t kend( ( IsUpper<MT5>::value )
5850  ?( ( IsLower<MT4>::value )
5851  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5852  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5853  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
5854 
5855  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5856 
5857  for( size_t k=kbegin; k<kend; ++k ) {
5858  const IntrinsicType a1( A.load(i ,k) );
5859  const IntrinsicType a2( A.load(i+IT::size ,k) );
5860  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
5861  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
5862  const IntrinsicType b1( set( B(k,j ) ) );
5863  const IntrinsicType b2( set( B(k,j+1UL) ) );
5864  xmm1 = xmm1 + a1 * b1;
5865  xmm2 = xmm2 + a2 * b1;
5866  xmm3 = xmm3 + a3 * b1;
5867  xmm4 = xmm4 + a4 * b1;
5868  xmm5 = xmm5 + a1 * b2;
5869  xmm6 = xmm6 + a2 * b2;
5870  xmm7 = xmm7 + a3 * b2;
5871  xmm8 = xmm8 + a4 * b2;
5872  }
5873 
5874  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5875  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) + xmm2 * factor );
5876  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) + xmm3 * factor );
5877  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) + xmm4 * factor );
5878  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
5879  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) + xmm6 * factor );
5880  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) + xmm7 * factor );
5881  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) + xmm8 * factor );
5882  }
5883 
// Trailing single column when N is odd.
5884  if( j < N )
5885  {
5886  const size_t kbegin( ( IsLower<MT5>::value )
5887  ?( ( IsUpper<MT4>::value )
5888  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5889  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5890  :( IsUpper<MT4>::value ? i : 0UL ) );
5891  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
5892 
5893  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5894 
5895  for( size_t k=kbegin; k<kend; ++k ) {
5896  const IntrinsicType b1( set( B(k,j) ) );
5897  xmm1 = xmm1 + A.load(i ,k) * b1;
5898  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
5899  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
5900  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
5901  }
5902 
5903  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5904  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
5905  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
5906  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
5907  }
5908  }
5909 
// Tier 3: two intrinsic vectors of rows, two columns at a time plus one trailing column.
5910  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
5911  {
5912  size_t j( 0UL );
5913 
5914  for( ; (j+2UL) <= N; j+=2UL )
5915  {
5916  const size_t kbegin( ( IsLower<MT5>::value )
5917  ?( ( IsUpper<MT4>::value )
5918  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5919  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5920  :( IsUpper<MT4>::value ? i : 0UL ) );
5921  const size_t kend( ( IsUpper<MT5>::value )
5922  ?( ( IsLower<MT4>::value )
5923  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5924  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5925  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
5926 
5927  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5928 
5929  for( size_t k=kbegin; k<kend; ++k ) {
5930  const IntrinsicType a1( A.load(i ,k) );
5931  const IntrinsicType a2( A.load(i+IT::size,k) );
5932  const IntrinsicType b1( set( B(k,j ) ) );
5933  const IntrinsicType b2( set( B(k,j+1UL) ) );
5934  xmm1 = xmm1 + a1 * b1;
5935  xmm2 = xmm2 + a2 * b1;
5936  xmm3 = xmm3 + a1 * b2;
5937  xmm4 = xmm4 + a2 * b2;
5938  }
5939 
5940  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5941  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) + xmm2 * factor );
5942  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5943  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) + xmm4 * factor );
5944  }
5945 
// Trailing single column when N is odd.
5946  if( j < N )
5947  {
5948  const size_t kbegin( ( IsLower<MT5>::value )
5949  ?( ( IsUpper<MT4>::value )
5950  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5951  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5952  :( IsUpper<MT4>::value ? i : 0UL ) );
5953  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
5954 
5955  IntrinsicType xmm1, xmm2;
5956 
5957  for( size_t k=kbegin; k<kend; ++k ) {
5958  const IntrinsicType b1( set( B(k,j) ) );
5959  xmm1 = xmm1 + A.load(i ,k) * b1;
5960  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
5961  }
5962 
5963  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5964  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) + xmm2 * factor );
5965  }
5966  }
5967 
// Tier 4: a single intrinsic vector of rows, two columns at a time plus one trailing column.
// In this tier kend is not narrowed by the lower trait of A.
5968  for( ; i<ipos; i+=IT::size )
5969  {
5970  size_t j( 0UL );
5971 
5972  for( ; (j+2UL) <= N; j+=2UL )
5973  {
5974  const size_t kbegin( ( IsLower<MT5>::value )
5975  ?( ( IsUpper<MT4>::value )
5976  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5977  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5978  :( IsUpper<MT4>::value ? i : 0UL ) );
5979  const size_t kend( ( IsUpper<MT5>::value )
5980  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5981  :( K ) );
5982 
5983  IntrinsicType xmm1, xmm2;
5984 
5985  for( size_t k=kbegin; k<kend; ++k ) {
5986  const IntrinsicType a1( A.load(i,k) );
5987  xmm1 = xmm1 + a1 * set( B(k,j ) );
5988  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
5989  }
5990 
5991  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5992  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
5993  }
5994 
// Trailing single column when N is odd; k runs up to K here.
5995  if( j < N )
5996  {
5997  const size_t kbegin( ( IsLower<MT5>::value )
5998  ?( ( IsUpper<MT4>::value )
5999  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6000  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6001  :( IsUpper<MT4>::value ? i : 0UL ) );
6002 
6003  IntrinsicType xmm1;
6004 
6005  for( size_t k=kbegin; k<K; ++k ) {
6006  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
6007  }
6008 
6009  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
6010  }
6011  }
6012 
// Remainder: rows past ipos are processed element-wise with scalar arithmetic.
// The loop is skipped entirely when no remainder handling is required.
6013  for( ; remainder && i<M; ++i )
6014  {
6015  size_t j( 0UL );
6016 
6017  for( ; (j+2UL) <= N; j+=2UL )
6018  {
6019  const size_t kbegin( ( IsLower<MT5>::value )
6020  ?( ( IsUpper<MT4>::value )
6021  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6022  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6023  :( IsUpper<MT4>::value ? i : 0UL ) );
6024  const size_t kend( ( IsUpper<MT5>::value )
6025  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6026  :( K ) );
6027 
6028  ElementType value1 = ElementType();
6029  ElementType value2 = ElementType();
6030 
6031  for( size_t k=kbegin; k<kend; ++k ) {
6032  value1 += A(i,k) * B(k,j );
6033  value2 += A(i,k) * B(k,j+1UL);
6034  }
6035 
6036  (~C)(i,j ) += value1 * scalar;
6037  (~C)(i,j+1UL) += value2 * scalar;
6038  }
6039 
// Trailing single column when N is odd.
6040  if( j < N )
6041  {
6042  const size_t kbegin( ( IsLower<MT5>::value )
6043  ?( ( IsUpper<MT4>::value )
6044  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6045  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6046  :( IsUpper<MT4>::value ? i : 0UL ) );
6047 
6048  ElementType value = ElementType();
6049 
6050  for( size_t k=kbegin; k<K; ++k ) {
6051  value += A(i,k) * B(k,j);
6052  }
6053 
6054  (~C)(i,j) += value * scalar;
6055  }
6056  }
6057  }
6058  //**********************************************************************************************
6059 
6060  //**Default addition assignment to dense matrices (large matrices)******************************
6074  template< typename MT3 // Type of the left-hand side target matrix
6075  , typename MT4 // Type of the left-hand side matrix operand
6076  , typename MT5 // Type of the right-hand side matrix operand
6077  , typename ST2 > // Type of the scalar value
6078  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6079  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6080  {
6081  selectDefaultAddAssignKernel( C, A, B, scalar );
6082  }
6083  //**********************************************************************************************
6084 
6085  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
6100  template< typename MT3 // Type of the left-hand side target matrix
6101  , typename MT4 // Type of the left-hand side matrix operand
6102  , typename MT5 // Type of the right-hand side matrix operand
6103  , typename ST2 > // Type of the scalar value
6104  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6105  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6106  {
6107  selectSmallAddAssignKernel( ~C, A, B, scalar );
6108  }
6109  //**********************************************************************************************
6110 
6111  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
      // Large-matrix addition assignment kernel for column-major targets (DenseMatrix<MT3,true>).
      //
      // This overload is selected (via EnableIf) when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. It adds scalar * ( A * B ) to C.
      //
      // C      : column-major target matrix of the addition assignment
      // A      : left-hand side multiplication operand (read with A.load(i,k) and A(i,k))
      // B      : right-hand side multiplication operand (read element-wise with B(k,j))
      // scalar : scaling factor applied to every accumulated sum before it is added to C
      //
      // Structure:
      //  - The (i,j,k) iteration space is tiled with TDMATTDMATMULT_IBLOCK_SIZE,
      //    TDMATTDMATMULT_JBLOCK_SIZE and TDMATTDMATMULT_KBLOCK_SIZE.
      //  - Within a tile the rows are processed in groups of 4, 2 and 1 vectors of IT::size
      //    elements, followed by a scalar loop for the rows that do not fill a vector.
      //  - Each partial sum over the current k tile is multiplied by the scalar and added to
      //    the value already stored in C, so the contributions of all k tiles accumulate.
      //
      // NOTE(review): the xmm accumulators are declared without an explicit initializer; the
      // code relies on the default constructor of IntrinsicType (presumably zero-initializing -
      // confirm).
6126  template< typename MT3 // Type of the left-hand side target matrix
6127  , typename MT4 // Type of the left-hand side matrix operand
6128  , typename MT5 // Type of the right-hand side matrix operand
6129  , typename ST2 > // Type of the scalar value
6130  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6131  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6132  {
6133  typedef IntrinsicTrait<ElementType> IT;
6134 
6135  const size_t M( A.rows() );
6136  const size_t N( B.columns() );
6137  const size_t K( A.columns() );
6138 
      // A scalar clean-up loop over the trailing rows is only needed if C or A is not padded.
6139  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
6140 
      // The scaling factor broadcast into a vector.
6141  const IntrinsicType factor( set( scalar ) );
6142 
      // Tiling of the row range [0,M).
6143  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
6144  {
6145  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
6146 
      // End of the vectorized row range: iend rounded down to a multiple of IT::size when a
      // remainder loop exists (see the assertion below), otherwise iend itself.
6147  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
6148  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
6149 
      // Tiling of the column range [0,N).
6150  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
6151  {
6152  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
6153 
      // Tiling of the inner dimension [0,K).
6154  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
6155  {
6156  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
6157 
6158  size_t i( ii );
6159 
      // Row groups of 4 vectors (4*IT::size rows).
6160  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
6161  {
6162  const size_t i1( i+IT::size );
6163  const size_t i2( i+IT::size*2UL );
6164  const size_t i3( i+IT::size*3UL );
6165 
6166  size_t j( jj );
6167 
      // Two columns of C per iteration.
6168  for( ; (j+2UL) <= jend; j+=2UL )
6169  {
      // The k range is clipped to the current k tile [kk,ktmp) and narrowed further
      // depending on the IsUpper/IsLower traits of the operands (presumably to skip
      // elements known to be zero - confirm against the trait definitions).
6170  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6171  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6172  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
6173  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
6174 
6175  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6176 
      // xmm1-4 accumulate column j, xmm5-8 accumulate column j+1.
6177  for( size_t k=kbegin; k<kend; ++k ) {
6178  const IntrinsicType a1( A.load(i ,k) );
6179  const IntrinsicType a2( A.load(i1,k) );
6180  const IntrinsicType a3( A.load(i2,k) );
6181  const IntrinsicType a4( A.load(i3,k) );
6182  const IntrinsicType b1( set( B(k,j ) ) );
6183  const IntrinsicType b2( set( B(k,j+1UL) ) );
6184  xmm1 = xmm1 + a1 * b1;
6185  xmm2 = xmm2 + a2 * b1;
6186  xmm3 = xmm3 + a3 * b1;
6187  xmm4 = xmm4 + a4 * b1;
6188  xmm5 = xmm5 + a1 * b2;
6189  xmm6 = xmm6 + a2 * b2;
6190  xmm7 = xmm7 + a3 * b2;
6191  xmm8 = xmm8 + a4 * b2;
6192  }
6193 
      // Scale the partial sums and add them to the values currently stored in C.
6194  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6195  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6196  (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
6197  (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
6198  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
6199  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
6200  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
6201  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
6202  }
6203 
      // Single left-over column of the j tile.
6204  if( j < jend )
6205  {
6206  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6207  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6208  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
6209  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6210 
6211  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6212 
6213  for( size_t k=kbegin; k<kend; ++k ) {
6214  const IntrinsicType b1( set( B(k,j) ) );
6215  xmm1 = xmm1 + A.load(i ,k) * b1;
6216  xmm2 = xmm2 + A.load(i1,k) * b1;
6217  xmm3 = xmm3 + A.load(i2,k) * b1;
6218  xmm4 = xmm4 + A.load(i3,k) * b1;
6219  }
6220 
6221  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6222  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
6223  (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
6224  (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
6225  }
6226  }
6227 
      // Row groups of 2 vectors (2*IT::size rows).
6228  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
6229  {
6230  const size_t i1( i+IT::size );
6231 
6232  size_t j( jj );
6233 
      // Four columns of C per iteration.
6234  for( ; (j+4UL) <= jend; j+=4UL )
6235  {
6236  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6237  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6238  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
6239  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
6240 
6241  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6242 
      // Accumulator pairs (xmm1,xmm2) ... (xmm7,xmm8) belong to columns j ... j+3.
6243  for( size_t k=kbegin; k<kend; ++k ) {
6244  const IntrinsicType a1( A.load(i ,k) );
6245  const IntrinsicType a2( A.load(i1,k) );
6246  const IntrinsicType b1( set( B(k,j ) ) );
6247  const IntrinsicType b2( set( B(k,j+1UL) ) );
6248  const IntrinsicType b3( set( B(k,j+2UL) ) );
6249  const IntrinsicType b4( set( B(k,j+3UL) ) );
6250  xmm1 = xmm1 + a1 * b1;
6251  xmm2 = xmm2 + a2 * b1;
6252  xmm3 = xmm3 + a1 * b2;
6253  xmm4 = xmm4 + a2 * b2;
6254  xmm5 = xmm5 + a1 * b3;
6255  xmm6 = xmm6 + a2 * b3;
6256  xmm7 = xmm7 + a1 * b4;
6257  xmm8 = xmm8 + a2 * b4;
6258  }
6259 
6260  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6261  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6262  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6263  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
6264  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6265  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
6266  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
6267  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
6268  }
6269 
      // Two columns of C per iteration.
6270  for( ; (j+2UL) <= jend; j+=2UL )
6271  {
6272  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6273  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6274  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
6275  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
6276 
6277  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6278 
6279  for( size_t k=kbegin; k<kend; ++k ) {
6280  const IntrinsicType a1( A.load(i ,k) );
6281  const IntrinsicType a2( A.load(i1,k) );
6282  const IntrinsicType b1( set( B(k,j ) ) );
6283  const IntrinsicType b2( set( B(k,j+1UL) ) );
6284  xmm1 = xmm1 + a1 * b1;
6285  xmm2 = xmm2 + a2 * b1;
6286  xmm3 = xmm3 + a1 * b2;
6287  xmm4 = xmm4 + a2 * b2;
6288  }
6289 
6290  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6291  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6292  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6293  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
6294  }
6295 
      // Single left-over column of the j tile.
6296  if( j < jend )
6297  {
6298  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6299  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6300  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
6301  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6302 
6303  IntrinsicType xmm1, xmm2;
6304 
6305  for( size_t k=kbegin; k<kend; ++k ) {
6306  const IntrinsicType b1( set( B(k,j) ) );
6307  xmm1 = xmm1 + A.load(i ,k) * b1;
6308  xmm2 = xmm2 + A.load(i1,k) * b1;
6309  }
6310 
6311  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6312  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
6313  }
6314  }
6315 
      // Remaining single vectors of IT::size rows, one column at a time.
6316  for( ; i<ipos; i+=IT::size )
6317  {
6318  for( size_t j=jj; j<jend; ++j )
6319  {
6320  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6321  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6322  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
6323  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6324 
6325  IntrinsicType xmm1;
6326 
6327  for( size_t k=kbegin; k<kend; ++k ) {
6328  const IntrinsicType b1( set( B(k,j) ) );
6329  xmm1 = xmm1 + A.load(i,k) * b1;
6330  }
6331 
6332  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
6333  }
6334  }
6335 
      // Scalar clean-up for the rows in [ipos,iend); only executed if 'remainder' is set.
6336  for( ; remainder && i<iend; ++i )
6337  {
6338  for( size_t j=jj; j<jend; ++j )
6339  {
6340  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6341  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6342  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
6343  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6344 
6345  ElementType value = ElementType();
6346 
6347  for( size_t k=kbegin; k<kend; ++k ) {
6348  value += A(i,k) * B(k,j);
6349  }
6350 
6351  (~C)(i,j) += value * scalar;
6352  }
6353  }
6354  }
6355  }
6356  }
6357  }
6358  //**********************************************************************************************
6359 
6360  //**BLAS-based addition assignment to dense matrices (default)**********************************
      // Fallback of the BLAS-based addition assignment kernel.
      //
      // This overload is selected (via DisableIf) whenever UseBlasKernel<MT3,MT4,MT5,ST2>
      // does NOT hold.
      //
      // C      : target matrix of the addition assignment
      // A      : left-hand side multiplication operand
      // B      : right-hand side multiplication operand
      // scalar : scaling factor
      //
      // No BLAS routine is called; the arguments are forwarded unchanged to
      // selectLargeAddAssignKernel().
6375  template< typename MT3 // Type of the left-hand side target matrix
6376  , typename MT4 // Type of the left-hand side matrix operand
6377  , typename MT5 // Type of the right-hand side matrix operand
6378  , typename ST2 > // Type of the scalar value
6379  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6380  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6381  {
6382  selectLargeAddAssignKernel( C, A, B, scalar );
6383  }
6384  //**********************************************************************************************
6385 
6386  //**BLAS-based addition assignment to dense matrices********************************************
      // BLAS-based addition assignment kernel; only compiled if BLAZE_BLAS_MODE is non-zero.
      //
      // This overload is selected (via EnableIf) when UseBlasKernel<MT3,MT4,MT5,ST2> holds.
      //
      // C      : target matrix of the addition assignment
      // A      : left-hand side multiplication operand
      // B      : right-hand side multiplication operand
      // scalar : scaling factor (converted to the element type ET of the target matrix)
      //
      // Dispatch:
      //  - A triangular: B is copied into a temporary, trmm() is applied to the temporary with
      //    A on the left side, and the temporary is added to C.
      //  - otherwise B triangular: the same with a copy of A and B on the right side.
      //  - otherwise: a single gemm() call with the factors ET(scalar) and ET(1).
      //
      // NOTE(review): the exact contract of the trmm()/gemm() wrappers is not visible here;
      // presumably trmm() overwrites the temporary with the scaled product and gemm() computes
      // C = scalar*A*B + 1*C - confirm against the BLAS wrapper headers.
6387 #if BLAZE_BLAS_MODE
6388 
6401  template< typename MT3 // Type of the left-hand side target matrix
6402  , typename MT4 // Type of the left-hand side matrix operand
6403  , typename MT5 // Type of the right-hand side matrix operand
6404  , typename ST2 > // Type of the scalar value
6405  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6406  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6407  {
6408  typedef typename MT3::ElementType ET;
6409 
      // Triangular left operand: work on a copy of B, then add the result to C.
6410  if( IsTriangular<MT4>::value ) {
6411  typename MT3::ResultType tmp( serial( B ) );
6412  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6413  addAssign( C, tmp );
6414  }
      // Triangular right operand: work on a copy of A, then add the result to C.
6415  else if( IsTriangular<MT5>::value ) {
6416  typename MT3::ResultType tmp( serial( A ) );
6417  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6418  addAssign( C, tmp );
6419  }
      // General case: no temporary is created, gemm() operates directly on C.
6420  else {
6421  gemm( C, A, B, ET(scalar), ET(1) );
6422  }
6423  }
6424 #endif
6425  //**********************************************************************************************
6426 
6427  //**Restructuring addition assignment to row-major matrices*************************************
      // Addition assignment of the scaled matrix product to a row-major matrix
      // (Matrix<MT,false>), enabled only if CanExploitSymmetry<MT,MT1,MT2> holds.
      //
      // lhs : target matrix of the addition assignment
      // rhs : scaled multiplication expression to be added; its operands are obtained via
      //       rhs.matrix_.leftOperand() / rhs.matrix_.rightOperand(), its factor is rhs.scalar_
      //
      // The expression is not evaluated here. Instead it is rebuilt with transposed operands
      // and handed to addAssign() again:
      //  - MT1 and MT2 symmetric : trans(left) * trans(right) * scalar
      //  - only MT1 symmetric    : trans(left) * right        * scalar
      //  - otherwise             : left        * trans(right) * scalar
      // NOTE(review): presumably the transposition of a symmetric operand keeps its values and
      // only changes the storage order seen by the subsequent kernel selection - confirm
      // against CanExploitSymmetry.
6442  template< typename MT > // Type of the target matrix
6443  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6444  addAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
6445  {
6447 
6449 
6450  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6451  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6452 
6453  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6454  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
6455 
6456  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
6457  addAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
6458  else if( IsSymmetric<MT1>::value )
6459  addAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
6460  else
6461  addAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
6462  }
6463  //**********************************************************************************************
6464 
6465  //**Addition assignment to sparse matrices******************************************************
6466  // No special implementation for the addition assignment to sparse matrices.
6467  //**********************************************************************************************
6468 
6469  //**Subtraction assignment to dense matrices****************************************************
      // Subtraction assignment of the scaled matrix product to a dense matrix of arbitrary
      // storage order, enabled only if CanExploitSymmetry<MT,MT1,MT2> does NOT hold.
      //
      // lhs : target dense matrix of the subtraction assignment
      // rhs : scaled multiplication expression to be subtracted
      //
      // Steps:
      //  1. Return immediately if the target has no rows or no columns, or if the left
      //     operand has no columns.
      //  2. Evaluate both operands (wrapped in serial()) into objects of type LT and RT.
      //  3. Hand the evaluated operands and rhs.scalar_ to selectSubAssignKernel().
6481  template< typename MT // Type of the target dense matrix
6482  , bool SO > // Storage order of the target dense matrix
6483  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6484  subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6485  {
6487 
6488  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6489  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6490 
6491  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6492  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
6493 
      // Nothing to subtract for empty targets or an empty inner dimension.
6494  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6495  return;
6496  }
6497 
6498  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6499  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6500 
6501  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6502  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6503  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6504  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6505  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6506  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6507 
6508  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
6509  }
6510  //**********************************************************************************************
6511 
6512  //**Subtraction assignment to dense matrices (kernel selection)*********************************
      // Selects the kernel used for the subtraction assignment of the scaled matrix product.
      //
      // C      : target matrix of the subtraction assignment
      // A      : left-hand side multiplication operand
      // B      : right-hand side multiplication operand
      // scalar : scaling factor
      //
      // The small-matrix kernel is used if the left operand type is diagonal or if the number
      // of elements of C (rows * columns) is below TDMATTDMATMULT_THRESHOLD; in all other cases
      // the BLAS kernel selection is used.
6523  template< typename MT3 // Type of the left-hand side target matrix
6524  , typename MT4 // Type of the left-hand side matrix operand
6525  , typename MT5 // Type of the right-hand side matrix operand
6526  , typename ST2 > // Type of the scalar value
6527  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6528  {
6529  if( ( IsDiagonal<MT4>::value ) ||
6530  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6531  selectSmallSubAssignKernel( C, A, B, scalar );
6532  else
6533  selectBlasSubAssignKernel( C, A, B, scalar );
6534  }
6535  //**********************************************************************************************
6536 
6537  //**Default subtraction assignment to dense matrices (general/general)**************************
      // Default subtraction assignment kernel for the case that neither operand type is
      // diagonal (enabled via And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > >).
      //
      // C      : target matrix of the subtraction assignment
      // A      : left-hand side multiplication operand
      // B      : right-hand side multiplication operand
      // scalar : scaling factor
      //
      // The scaled product A * B * scalar is first evaluated (wrapped in serial()) into a
      // temporary of type ResultType, which is then subtracted from C via subAssign().
6551  template< typename MT3 // Type of the left-hand side target matrix
6552  , typename MT4 // Type of the left-hand side matrix operand
6553  , typename MT5 // Type of the right-hand side matrix operand
6554  , typename ST2 > // Type of the scalar value
6555  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6556  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6557  {
6558  const ResultType tmp( serial( A * B * scalar ) );
6559  subAssign( C, tmp );
6560  }
6561  //**********************************************************************************************
6562 
6563  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
      // Default subtraction assignment kernel for a non-diagonal left operand and a diagonal
      // right operand (enabled via And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> >).
      //
      // C      : target matrix of the subtraction assignment
      // A      : left-hand side (non-diagonal) multiplication operand
      // B      : right-hand side (diagonal) multiplication operand
      // scalar : scaling factor
      //
      // Only the diagonal element B(j,j) is read per column, so every element is updated as
      // C(i,j) -= A(i,j) * B(j,j) * scalar. The row loop is unrolled by a factor of two.
6577  template< typename MT3 // Type of the left-hand side target matrix
6578  , typename MT4 // Type of the left-hand side matrix operand
6579  , typename MT5 // Type of the right-hand side matrix operand
6580  , typename ST2 > // Type of the scalar value
6581  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6582  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6583  {
6585 
6586  const size_t M( A.rows() );
6587  const size_t N( B.columns() );
6588 
6589  for( size_t j=0UL; j<N; ++j )
6590  {
      // Row range of column j, restricted according to the lower/upper traits of A
      // (strict variants additionally exclude the diagonal element).
6591  const size_t ibegin( ( IsLower<MT4>::value )
6592  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6593  :( 0UL ) );
6594  const size_t iend( ( IsUpper<MT4>::value )
6595  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
6596  :( M ) );
6597  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6598 
      // ipos marks the end of the part of the range that can be processed in pairs
      // (the element count is rounded down to an even number).
6599  const size_t inum( iend - ibegin );
6600  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6601 
6602  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6603  C(i ,j) -= A(i ,j) * B(j,j) * scalar;
6604  C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
6605  }
      // Odd element count: handle the last row separately.
6606  if( ipos < iend ) {
6607  C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
6608  }
6609  }
6610  }
6611  //**********************************************************************************************
6612 
6613  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
      // Default subtraction assignment kernel for a diagonal left operand and a non-diagonal
      // right operand (enabled via And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > >).
      //
      // C      : target matrix of the subtraction assignment
      // A      : left-hand side (diagonal) multiplication operand
      // B      : right-hand side (non-diagonal) multiplication operand
      // scalar : scaling factor
      //
      // Only the diagonal element A(i,i) is read per row, so every element is updated as
      // C(i,j) -= A(i,i) * B(i,j) * scalar. The row loop is unrolled by a factor of two.
6627  template< typename MT3 // Type of the left-hand side target matrix
6628  , typename MT4 // Type of the left-hand side matrix operand
6629  , typename MT5 // Type of the right-hand side matrix operand
6630  , typename ST2 > // Type of the scalar value
6631  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6632  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6633  {
6635 
6636  const size_t M( A.rows() );
6637  const size_t N( B.columns() );
6638 
6639  for( size_t j=0UL; j<N; ++j )
6640  {
      // Row range of column j, restricted according to the lower/upper traits of B
      // (strict variants additionally exclude the diagonal element).
6641  const size_t ibegin( ( IsLower<MT5>::value )
6642  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6643  :( 0UL ) );
6644  const size_t iend( ( IsUpper<MT5>::value )
6645  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
6646  :( M ) );
6647  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6648 
      // ipos marks the end of the part of the range that can be processed in pairs
      // (the element count is rounded down to an even number).
6649  const size_t inum( iend - ibegin );
6650  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6651 
6652  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6653  C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
6654  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6655  }
      // Odd element count: handle the last row separately.
6656  if( ipos < iend ) {
6657  C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
6658  }
6659  }
6660  }
6661  //**********************************************************************************************
6662 
6663  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
      // Default subtraction assignment kernel for two diagonal operands
      // (enabled via And< IsDiagonal<MT4>, IsDiagonal<MT5> >).
      //
      // C      : target matrix of the subtraction assignment
      // A      : left-hand side (diagonal) multiplication operand
      // B      : right-hand side (diagonal) multiplication operand
      // scalar : scaling factor
      //
      // Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) * scalar for all rows of A.
6677  template< typename MT3 // Type of the left-hand side target matrix
6678  , typename MT4 // Type of the left-hand side matrix operand
6679  , typename MT5 // Type of the right-hand side matrix operand
6680  , typename ST2 > // Type of the scalar value
6681  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6682  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6683  {
6685 
6686  for( size_t i=0UL; i<A.rows(); ++i ) {
6687  C(i,i) -= A(i,i) * B(i,i) * scalar;
6688  }
6689  }
6690  //**********************************************************************************************
6691 
6692  //**Default subtraction assignment to dense matrices (small matrices)***************************
      // Small-matrix subtraction assignment kernel for the scaled matrix product.
      //
      // This overload is selected (via DisableIf) whenever
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold.
      //
      // C      : target matrix of the subtraction assignment
      // A      : left-hand side multiplication operand
      // B      : right-hand side multiplication operand
      // scalar : scaling factor
      //
      // The arguments are forwarded unchanged to selectDefaultSubAssignKernel().
6706  template< typename MT3 // Type of the left-hand side target matrix
6707  , typename MT4 // Type of the left-hand side matrix operand
6708  , typename MT5 // Type of the right-hand side matrix operand
6709  , typename ST2 > // Type of the scalar value
6710  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6711  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6712  {
6713  selectDefaultSubAssignKernel( C, A, B, scalar );
6714  }
6715  //**********************************************************************************************
6716 
6717  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
      // Small-matrix subtraction assignment kernel for row-major targets
      // (DenseMatrix<MT3,false>), selected (via EnableIf) when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
      //
      // C      : row-major target matrix of the subtraction assignment
      // A      : left-hand side multiplication operand
      // B      : right-hand side multiplication operand
      // scalar : scaling factor
      //
      // No computation is done here directly. Exactly one operand is copied into a temporary
      // of its OppositeType and the resulting expression is passed to subAssign() again.
      // Choice of the operand to convert:
      //  - A resizable, B not resizable : convert B
      //  - A not resizable, B resizable : convert A
      //  - otherwise                    : convert B if it has at most as many elements as A
      //                                   (rows * columns), else convert A
      // NOTE(review): OppositeType is presumably the same matrix with the opposite storage
      // order - confirm against the matrix type definitions.
6732  template< typename MT3 // Type of the left-hand side target matrix
6733  , typename MT4 // Type of the left-hand side matrix operand
6734  , typename MT5 // Type of the right-hand side matrix operand
6735  , typename ST2 > // Type of the scalar value
6736  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6737  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6738  {
6743 
6744  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
6745  const typename MT5::OppositeType tmp( serial( B ) );
6746  subAssign( ~C, A * tmp * scalar );
6747  }
6748  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
6749  const typename MT4::OppositeType tmp( serial( A ) );
6750  subAssign( ~C, tmp * B * scalar );
6751  }
      // Both or neither operand resizable: convert the operand with fewer elements.
6752  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6753  const typename MT5::OppositeType tmp( serial( B ) );
6754  subAssign( ~C, A * tmp * scalar );
6755  }
6756  else {
6757  const typename MT4::OppositeType tmp( serial( A ) );
6758  subAssign( ~C, tmp * B * scalar );
6759  }
6760  }
6761  //**********************************************************************************************
6762 
6763  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
      // Small-matrix subtraction assignment kernel for column-major targets
      // (DenseMatrix<MT3,true>), selected (via EnableIf) when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. It subtracts scalar * ( A * B )
      // from C.
      //
      // C      : column-major target matrix of the subtraction assignment
      // A      : left-hand side multiplication operand (read with A.load(i,k) and A(i,k))
      // B      : right-hand side multiplication operand (read element-wise with B(k,j))
      // scalar : scaling factor applied to every accumulated sum before it is subtracted
      //
      // Structure: the rows are processed in groups of 8, 4, 2 and 1 vectors of IT::size
      // elements, followed by a scalar loop for the rows that do not fill a vector. In contrast
      // to the large-matrix kernels no tiling is applied; every sum runs over the complete
      // (trait-restricted) k range.
      //
      // NOTE(review): the xmm accumulators are declared without an explicit initializer; the
      // code relies on the default constructor of IntrinsicType (presumably zero-initializing -
      // confirm).
6778  template< typename MT3 // Type of the left-hand side target matrix
6779  , typename MT4 // Type of the left-hand side matrix operand
6780  , typename MT5 // Type of the right-hand side matrix operand
6781  , typename ST2 > // Type of the scalar value
6782  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6783  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6784  {
6785  typedef IntrinsicTrait<ElementType> IT;
6786 
6787  const size_t M( A.rows() );
6788  const size_t N( B.columns() );
6789  const size_t K( A.columns() );
6790 
      // A scalar clean-up loop over the trailing rows is only needed if C or A is not padded.
6791  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
6792 
      // End of the vectorized row range: M rounded down to a multiple of IT::size when a
      // remainder loop exists (see the assertion below), otherwise M itself.
6793  const size_t ipos( remainder ? ( M & size_t(-IT::size) ) : M );
6794  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % IT::size ) ) == ipos, "Invalid end calculation" );
6795 
      // The scaling factor broadcast into a vector.
6796  const IntrinsicType factor( set( scalar ) );
6797 
6798  size_t i( 0UL );
6799 
      // Row groups of 8 vectors (8*IT::size rows), one column of C per iteration.
6800  for( ; (i+IT::size*7UL) < ipos; i+=IT::size*8UL ) {
6801  for( size_t j=0UL; j<N; ++j )
6802  {
      // The k range is narrowed depending on the lower/upper traits of the operands
      // (presumably to skip elements known to be zero - confirm against the traits).
6803  const size_t kbegin( ( IsLower<MT5>::value )
6804  ?( ( IsUpper<MT4>::value )
6805  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6806  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6807  :( IsUpper<MT4>::value ? i : 0UL ) );
6808  const size_t kend( ( IsUpper<MT5>::value )
6809  ?( ( IsLower<MT4>::value )
6810  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6811  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6812  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
6813 
6814  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6815 
6816  for( size_t k=kbegin; k<kend; ++k ) {
6817  const IntrinsicType b1( set( B(k,j) ) );
6818  xmm1 = xmm1 + A.load(i ,k) * b1;
6819  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
6820  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
6821  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
6822  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
6823  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
6824  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
6825  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
6826  }
6827 
      // Scale the sums and subtract them from the values currently stored in C.
6828  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6829  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
6830  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
6831  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
6832  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) - xmm5 * factor );
6833  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) - xmm6 * factor );
6834  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) - xmm7 * factor );
6835  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) - xmm8 * factor );
6836  }
6837  }
6838 
      // Row groups of 4 vectors (4*IT::size rows).
6839  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
6840  {
6841  size_t j( 0UL );
6842 
      // Two columns of C per iteration.
6843  for( ; (j+2UL) <= N; j+=2UL )
6844  {
6845  const size_t kbegin( ( IsLower<MT5>::value )
6846  ?( ( IsUpper<MT4>::value )
6847  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6848  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6849  :( IsUpper<MT4>::value ? i : 0UL ) );
6850  const size_t kend( ( IsUpper<MT5>::value )
6851  ?( ( IsLower<MT4>::value )
6852  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6853  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6854  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
6855 
6856  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6857 
      // xmm1-4 accumulate column j, xmm5-8 accumulate column j+1.
6858  for( size_t k=kbegin; k<kend; ++k ) {
6859  const IntrinsicType a1( A.load(i ,k) );
6860  const IntrinsicType a2( A.load(i+IT::size ,k) );
6861  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
6862  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
6863  const IntrinsicType b1( set( B(k,j ) ) );
6864  const IntrinsicType b2( set( B(k,j+1UL) ) );
6865  xmm1 = xmm1 + a1 * b1;
6866  xmm2 = xmm2 + a2 * b1;
6867  xmm3 = xmm3 + a3 * b1;
6868  xmm4 = xmm4 + a4 * b1;
6869  xmm5 = xmm5 + a1 * b2;
6870  xmm6 = xmm6 + a2 * b2;
6871  xmm7 = xmm7 + a3 * b2;
6872  xmm8 = xmm8 + a4 * b2;
6873  }
6874 
6875  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6876  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) - xmm2 * factor );
6877  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) - xmm3 * factor );
6878  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) - xmm4 * factor );
6879  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
6880  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) - xmm6 * factor );
6881  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) - xmm7 * factor );
6882  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) - xmm8 * factor );
6883  }
6884 
      // Single left-over column (odd N). No upper bound derived from B is applied to k here.
6885  if( j < N )
6886  {
6887  const size_t kbegin( ( IsLower<MT5>::value )
6888  ?( ( IsUpper<MT4>::value )
6889  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6890  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6891  :( IsUpper<MT4>::value ? i : 0UL ) );
6892  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
6893 
6894  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6895 
6896  for( size_t k=kbegin; k<kend; ++k ) {
6897  const IntrinsicType b1( set( B(k,j) ) );
6898  xmm1 = xmm1 + A.load(i ,k) * b1;
6899  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
6900  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
6901  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
6902  }
6903 
6904  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6905  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
6906  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
6907  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
6908  }
6909  }
6910 
      // Row groups of 2 vectors (2*IT::size rows).
6911  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
6912  {
6913  size_t j( 0UL );
6914 
      // Two columns of C per iteration.
6915  for( ; (j+2UL) <= N; j+=2UL )
6916  {
6917  const size_t kbegin( ( IsLower<MT5>::value )
6918  ?( ( IsUpper<MT4>::value )
6919  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6920  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6921  :( IsUpper<MT4>::value ? i : 0UL ) );
6922  const size_t kend( ( IsUpper<MT5>::value )
6923  ?( ( IsLower<MT4>::value )
6924  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6925  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6926  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
6927 
6928  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6929 
6930  for( size_t k=kbegin; k<kend; ++k ) {
6931  const IntrinsicType a1( A.load(i ,k) );
6932  const IntrinsicType a2( A.load(i+IT::size,k) );
6933  const IntrinsicType b1( set( B(k,j ) ) );
6934  const IntrinsicType b2( set( B(k,j+1UL) ) );
6935  xmm1 = xmm1 + a1 * b1;
6936  xmm2 = xmm2 + a2 * b1;
6937  xmm3 = xmm3 + a1 * b2;
6938  xmm4 = xmm4 + a2 * b2;
6939  }
6940 
6941  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6942  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) - xmm2 * factor );
6943  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
6944  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) - xmm4 * factor );
6945  }
6946 
      // Single left-over column (odd N).
6947  if( j < N )
6948  {
6949  const size_t kbegin( ( IsLower<MT5>::value )
6950  ?( ( IsUpper<MT4>::value )
6951  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6952  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6953  :( IsUpper<MT4>::value ? i : 0UL ) );
6954  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
6955 
6956  IntrinsicType xmm1, xmm2;
6957 
6958  for( size_t k=kbegin; k<kend; ++k ) {
6959  const IntrinsicType b1( set( B(k,j) ) );
6960  xmm1 = xmm1 + A.load(i ,k) * b1;
6961  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
6962  }
6963 
6964  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6965  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) - xmm2 * factor );
6966  }
6967  }
6968 
      // Remaining single vectors of IT::size rows. In this loop the upper bound of k depends
      // only on B (no restriction derived from A is applied).
6969  for( ; i<ipos; i+=IT::size )
6970  {
6971  size_t j( 0UL );
6972 
6973  for( ; (j+2UL) <= N; j+=2UL )
6974  {
6975  const size_t kbegin( ( IsLower<MT5>::value )
6976  ?( ( IsUpper<MT4>::value )
6977  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6978  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6979  :( IsUpper<MT4>::value ? i : 0UL ) );
6980  const size_t kend( ( IsUpper<MT5>::value )
6981  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6982  :( K ) );
6983 
6984  IntrinsicType xmm1, xmm2;
6985 
6986  for( size_t k=kbegin; k<kend; ++k ) {
6987  const IntrinsicType a1( A.load(i,k) );
6988  xmm1 = xmm1 + a1 * set( B(k,j ) );
6989  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
6990  }
6991 
6992  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6993  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
6994  }
6995 
      // Single left-over column (odd N); k always runs up to K.
6996  if( j < N )
6997  {
6998  const size_t kbegin( ( IsLower<MT5>::value )
6999  ?( ( IsUpper<MT4>::value )
7000  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7001  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7002  :( IsUpper<MT4>::value ? i : 0UL ) );
7003 
7004  IntrinsicType xmm1;
7005 
7006  for( size_t k=kbegin; k<K; ++k ) {
7007  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
7008  }
7009 
7010  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
7011  }
7012  }
7013 
      // Scalar clean-up for the rows in [ipos,M); only executed if 'remainder' is set.
7014  for( ; remainder && i<M; ++i )
7015  {
7016  size_t j( 0UL );
7017 
      // Two columns of C per iteration.
7018  for( ; (j+2UL) <= N; j+=2UL )
7019  {
7020  const size_t kbegin( ( IsLower<MT5>::value )
7021  ?( ( IsUpper<MT4>::value )
7022  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7023  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7024  :( IsUpper<MT4>::value ? i : 0UL ) );
7025  const size_t kend( ( IsUpper<MT5>::value )
7026  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7027  :( K ) );
7028 
7029  ElementType value1 = ElementType();
7030  ElementType value2 = ElementType();
7031 
7032  for( size_t k=kbegin; k<kend; ++k ) {
7033  value1 += A(i,k) * B(k,j );
7034  value2 += A(i,k) * B(k,j+1UL);
7035  }
7036 
7037  (~C)(i,j ) -= value1 * scalar;
7038  (~C)(i,j+1UL) -= value2 * scalar;
7039  }
7040 
      // Single left-over column (odd N); k always runs up to K.
7041  if( j < N )
7042  {
7043  const size_t kbegin( ( IsLower<MT5>::value )
7044  ?( ( IsUpper<MT4>::value )
7045  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7046  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7047  :( IsUpper<MT4>::value ? i : 0UL ) );
7048 
7049  ElementType value = ElementType();
7050 
7051  for( size_t k=kbegin; k<K; ++k ) {
7052  value += A(i,k) * B(k,j);
7053  }
7054 
7055  (~C)(i,j) -= value * scalar;
7056  }
7057  }
7058  }
7059  //**********************************************************************************************
7060 
7061  //**Default subtraction assignment to dense matrices (large matrices)***************************
      // Large-matrix subtraction assignment kernel for the scaled matrix product.
      //
      // This overload is selected (via DisableIf) whenever
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold.
      //
      // C      : target matrix of the subtraction assignment
      // A      : left-hand side multiplication operand
      // B      : right-hand side multiplication operand
      // scalar : scaling factor
      //
      // There is no dedicated large-matrix code path for this case; all arguments are
      // forwarded unchanged to selectDefaultSubAssignKernel().
7075  template< typename MT3 // Type of the left-hand side target matrix
7076  , typename MT4 // Type of the left-hand side matrix operand
7077  , typename MT5 // Type of the right-hand side matrix operand
7078  , typename ST2 > // Type of the scalar value
7079  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7080  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7081  {
7082  selectDefaultSubAssignKernel( C, A, B, scalar );
7083  }
7084  //**********************************************************************************************
7085 
7086  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
// Large-matrix subtraction assignment kernel chosen when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> applies (EnableIf) and the target
// is a DenseMatrix<MT3,false> (the row-major case per the section header).
// No separate blocked implementation exists for this target: the call is
// forwarded to selectSmallSubAssignKernel() with the unwrapped target (~C).
7101  template< typename MT3 // Type of the left-hand side target matrix
7102  , typename MT4 // Type of the left-hand side matrix operand
7103  , typename MT5 // Type of the right-hand side matrix operand
7104  , typename ST2 > // Type of the scalar value
7105  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7106  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7107  {
7108  selectSmallSubAssignKernel( ~C, A, B, scalar );
7109  }
7110  //**********************************************************************************************
7111 
7112  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
// Blocked, vectorized subtraction assignment kernel for a DenseMatrix<MT3,true>
// target (the column-major case per the section header). It performs
// C -= ( A * B ) * scalar.
//
// The iteration space is tiled by TDMATTDMATMULT_IBLOCK_SIZE (rows of A/C),
// TDMATTDMATMULT_JBLOCK_SIZE (columns of B/C) and TDMATTDMATMULT_KBLOCK_SIZE
// (inner dimension). Inside a tile, rows are processed in groups of
// 4, 2 and 1 vectors of IT::size elements, followed by a scalar loop for the
// rows that do not fill a whole vector. For triangular operands the k range is
// narrowed via IsUpper/IsLower so that only the relevant part is visited.
//
// NOTE(review): the xmm accumulators below are declared without an initializer
// and are immediately used as "xmm = xmm + ..."; this presumably relies on the
// default constructor of IntrinsicType producing a zero vector - confirm.
7127  template< typename MT3 // Type of the left-hand side target matrix
7128  , typename MT4 // Type of the left-hand side matrix operand
7129  , typename MT5 // Type of the right-hand side matrix operand
7130  , typename ST2 > // Type of the scalar value
7131  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7132  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7133  {
7134  typedef IntrinsicTrait<ElementType> IT;
7135 
      // M x K times K x N product.
7136  const size_t M( A.rows() );
7137  const size_t N( B.columns() );
7138  const size_t K( A.columns() );
7139 
      // A scalar tail loop over rows is only needed if the target or the left operand is not padded.
7140  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
7141 
      // The scalar broadcast into a vector, applied once per store.
7142  const IntrinsicType factor( set( scalar ) );
7143 
7144  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
7145  {
7146  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
7147 
      // End of the vectorized row range: with a remainder, iend is rounded down to a multiple
      // of IT::size (the mask form is checked against the modulo form by the assertion below).
7148  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
7149  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % IT::size ) ) == ipos, "Invalid end calculation" );
7150 
7151  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
7152  {
7153  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
7154 
7155  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
7156  {
7157  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
7158 
      // The row index i is shared by all of the following loops; each loop continues where
      // the previous one stopped.
7159  size_t i( ii );
7160 
      // Four vectors of rows at a time.
7161  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL )
7162  {
7163  const size_t i1( i+IT::size );
7164  const size_t i2( i+IT::size*2UL );
7165  const size_t i3( i+IT::size*3UL );
7166 
7167  size_t j( jj );
7168 
      // 4 row vectors x 2 columns.
7169  for( ; (j+2UL) <= jend; j+=2UL )
7170  {
      // k range of the current k-block, narrowed for an upper A (k >= i), a lower B (k >= j),
      // a lower A (k < i+4*IT::size) and an upper B (k < j+2).
7171  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7172  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7173  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
7174  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7175 
7176  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7177 
7178  for( size_t k=kbegin; k<kend; ++k ) {
7179  const IntrinsicType a1( A.load(i ,k) );
7180  const IntrinsicType a2( A.load(i1,k) );
7181  const IntrinsicType a3( A.load(i2,k) );
7182  const IntrinsicType a4( A.load(i3,k) );
7183  const IntrinsicType b1( set( B(k,j ) ) );
7184  const IntrinsicType b2( set( B(k,j+1UL) ) );
7185  xmm1 = xmm1 + a1 * b1;
7186  xmm2 = xmm2 + a2 * b1;
7187  xmm3 = xmm3 + a3 * b1;
7188  xmm4 = xmm4 + a4 * b1;
7189  xmm5 = xmm5 + a1 * b2;
7190  xmm6 = xmm6 + a2 * b2;
7191  xmm7 = xmm7 + a3 * b2;
7192  xmm8 = xmm8 + a4 * b2;
7193  }
7194 
      // Subtract the scaled partial products from the target.
7195  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7196  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7197  (~C).store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
7198  (~C).store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
7199  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
7200  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
7201  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
7202  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
7203  }
7204 
      // 4 row vectors x the single remaining column of the j-block.
7205  if( j < jend )
7206  {
7207  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7208  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7209  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
7210  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7211 
7212  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7213 
7214  for( size_t k=kbegin; k<kend; ++k ) {
7215  const IntrinsicType b1( set( B(k,j) ) );
7216  xmm1 = xmm1 + A.load(i ,k) * b1;
7217  xmm2 = xmm2 + A.load(i1,k) * b1;
7218  xmm3 = xmm3 + A.load(i2,k) * b1;
7219  xmm4 = xmm4 + A.load(i3,k) * b1;
7220  }
7221 
7222  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7223  (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
7224  (~C).store( i2, j, (~C).load(i2,j) - xmm3 * factor );
7225  (~C).store( i3, j, (~C).load(i3,j) - xmm4 * factor );
7226  }
7227  }
7228 
      // Two vectors of rows at a time.
7229  for( ; (i+IT::size) < ipos; i+=IT::size*2UL )
7230  {
7231  const size_t i1( i+IT::size );
7232 
7233  size_t j( jj );
7234 
      // 2 row vectors x 4 columns.
7235  for( ; (j+4UL) <= jend; j+=4UL )
7236  {
7237  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7238  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7239  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
7240  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7241 
7242  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7243 
7244  for( size_t k=kbegin; k<kend; ++k ) {
7245  const IntrinsicType a1( A.load(i ,k) );
7246  const IntrinsicType a2( A.load(i1,k) );
7247  const IntrinsicType b1( set( B(k,j ) ) );
7248  const IntrinsicType b2( set( B(k,j+1UL) ) );
7249  const IntrinsicType b3( set( B(k,j+2UL) ) );
7250  const IntrinsicType b4( set( B(k,j+3UL) ) );
7251  xmm1 = xmm1 + a1 * b1;
7252  xmm2 = xmm2 + a2 * b1;
7253  xmm3 = xmm3 + a1 * b2;
7254  xmm4 = xmm4 + a2 * b2;
7255  xmm5 = xmm5 + a1 * b3;
7256  xmm6 = xmm6 + a2 * b3;
7257  xmm7 = xmm7 + a1 * b4;
7258  xmm8 = xmm8 + a2 * b4;
7259  }
7260 
7261  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7262  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7263  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7264  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
7265  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
7266  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
7267  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
7268  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
7269  }
7270 
      // 2 row vectors x 2 columns.
7271  for( ; (j+2UL) <= jend; j+=2UL )
7272  {
7273  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7274  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7275  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
7276  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7277 
7278  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7279 
7280  for( size_t k=kbegin; k<kend; ++k ) {
7281  const IntrinsicType a1( A.load(i ,k) );
7282  const IntrinsicType a2( A.load(i1,k) );
7283  const IntrinsicType b1( set( B(k,j ) ) );
7284  const IntrinsicType b2( set( B(k,j+1UL) ) );
7285  xmm1 = xmm1 + a1 * b1;
7286  xmm2 = xmm2 + a2 * b1;
7287  xmm3 = xmm3 + a1 * b2;
7288  xmm4 = xmm4 + a2 * b2;
7289  }
7290 
7291  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7292  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7293  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7294  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
7295  }
7296 
      // 2 row vectors x the single remaining column of the j-block.
7297  if( j < jend )
7298  {
7299  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7300  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7301  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
7302  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7303 
7304  IntrinsicType xmm1, xmm2;
7305 
7306  for( size_t k=kbegin; k<kend; ++k ) {
7307  const IntrinsicType b1( set( B(k,j) ) );
7308  xmm1 = xmm1 + A.load(i ,k) * b1;
7309  xmm2 = xmm2 + A.load(i1,k) * b1;
7310  }
7311 
7312  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7313  (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
7314  }
7315  }
7316 
      // One vector of rows at a time, one column at a time.
7317  for( ; i<ipos; i+=IT::size )
7318  {
7319  for( size_t j=jj; j<jend; ++j )
7320  {
7321  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7322  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7323  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
7324  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7325 
7326  IntrinsicType xmm1;
7327 
7328  for( size_t k=kbegin; k<kend; ++k ) {
7329  const IntrinsicType b1( set( B(k,j) ) );
7330  xmm1 = xmm1 + A.load(i,k) * b1;
7331  }
7332 
7333  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
7334  }
7335  }
7336 
      // Scalar tail: rows in [ipos, iend) that do not fill a complete vector. Only executed
      // when 'remainder' is set.
7337  for( ; remainder && i<iend; ++i )
7338  {
7339  for( size_t j=jj; j<jend; ++j )
7340  {
7341  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7342  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7343  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
7344  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7345 
7346  ElementType value = ElementType();
7347 
7348  for( size_t k=kbegin; k<kend; ++k ) {
7349  value += A(i,k) * B(k,j);
7350  }
7351 
7352  (~C)(i,j) -= value * scalar;
7353  }
7354  }
7355  }
7356  }
7357  }
7358  }
7359  //**********************************************************************************************
7360 
7361  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback of the BLAS kernel selection: chosen when
// UseBlasKernel<MT3,MT4,MT5,ST2> does not apply (DisableIf). The call is
// forwarded unchanged to selectLargeSubAssignKernel().
7376  template< typename MT3 // Type of the left-hand side target matrix
7377  , typename MT4 // Type of the left-hand side matrix operand
7378  , typename MT5 // Type of the right-hand side matrix operand
7379  , typename ST2 > // Type of the scalar value
7380  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7381  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7382  {
7383  selectLargeSubAssignKernel( C, A, B, scalar );
7384  }
7385  //**********************************************************************************************
7386 
7387  //**BLAS-based subtraction assignment to dense matrices*****************************************
7388 #if BLAZE_BLAS_MODE
7389 
// BLAS-backed subtraction assignment kernel, chosen when
// UseBlasKernel<MT3,MT4,MT5,ST2> applies (EnableIf). Only compiled when
// BLAZE_BLAS_MODE is enabled (see the surrounding #if).
//
// Three cases, tested in this order:
//   - A (MT4) is triangular: B is copied into a temporary of type
//     MT3::ResultType, trmm() is applied with A on the left side, and the
//     temporary is subtracted from C.
//   - B (MT5) is triangular: A is copied into the temporary, trmm() is applied
//     with B on the right side, and the temporary is subtracted from C.
//   - otherwise: a single gemm() call with the factors ET(-scalar) and ET(1).
// NOTE(review): presumably trmm() overwrites tmp with the scaled triangular
// product and gemm() computes C = alpha*A*B + beta*C as in BLAS - confirm
// against the wrappers in blaze/math/blas.
7402  template< typename MT3 // Type of the left-hand side target matrix
7403  , typename MT4 // Type of the left-hand side matrix operand
7404  , typename MT5 // Type of the right-hand side matrix operand
7405  , typename ST2 > // Type of the scalar value
7406  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7407  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7408  {
      // The scalar is converted to the element type of the target before it is handed on.
7409  typedef typename MT3::ElementType ET;
7410 
7411  if( IsTriangular<MT4>::value ) {
7412  typename MT3::ResultType tmp( serial( B ) );
7413  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7414  subAssign( C, tmp );
7415  }
7416  else if( IsTriangular<MT5>::value ) {
7417  typename MT3::ResultType tmp( serial( A ) );
7418  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7419  subAssign( C, tmp );
7420  }
7421  else {
      // The negated scalar turns the accumulation into a subtraction from C.
7422  gemm( C, A, B, ET(-scalar), ET(1) );
7423  }
7424  }
7425 #endif
7426  //**********************************************************************************************
7427 
7428  //**Restructuring subtraction assignment to row-major matrices**********************************
// Restructuring subtraction assignment of a scaled matrix product to a
// Matrix<MT,false> target (the row-major case per the section header).
// Enabled only if CanExploitSymmetry<MT,MT1,MT2> applies.
//
// The product operands are taken from rhs.matrix_ and the subtraction
// assignment is re-dispatched with the symmetric operand(s) wrapped in
// trans(); the scalar rhs.scalar_ is re-applied to the new product.
// Transposing a symmetric operand does not change its value, so the
// re-dispatched expression denotes the same product.
7442  template< typename MT > // Type of the target matrix
7443  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7444  subAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7445  {
7447 
7449 
7450  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7451  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7452 
7453  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7454  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7455 
      // Both symmetric: transpose both. Only MT1 symmetric: transpose the left operand.
      // Otherwise (presumably only MT2 symmetric, given the enabling condition): transpose the
      // right operand.
7456  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7457  subAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7458  else if( IsSymmetric<MT1>::value )
7459  subAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7460  else
7461  subAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7462  }
7463  //**********************************************************************************************
7464 
7465  //**Subtraction assignment to sparse matrices***************************************************
7466  // No special implementation for the subtraction assignment to sparse matrices.
7467  //**********************************************************************************************
7468 
7469  //**Multiplication assignment to dense matrices*************************************************
7470  // No special implementation for the multiplication assignment to dense matrices.
7471  //**********************************************************************************************
7472 
7473  //**Multiplication assignment to sparse matrices************************************************
7474  // No special implementation for the multiplication assignment to sparse matrices.
7475  //**********************************************************************************************
7476 
7477  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a scaled matrix product to a dense matrix of either
// storage order. Enabled only if IsEvaluationRequired<MT,MT1,MT2> applies.
//
// Both operands of the product are evaluated into the local objects A and B
// first; the assignment is then re-dispatched as smpAssign( ~lhs, A*B*scalar ).
7492  template< typename MT // Type of the target dense matrix
7493  , bool SO > // Storage order of the target dense matrix
7494  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7495  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7496  {
7498 
7499  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7500  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7501 
7502  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7503  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7504 
      // Empty target: nothing to do. Empty inner dimension: the target is reset instead of
      // computing the product.
7505  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
7506  return;
7507  }
7508  else if( left.columns() == 0UL ) {
7509  reset( ~lhs );
7510  return;
7511  }
7512 
7513  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7514  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7515 
7516  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7517  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7518  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7519  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7520  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7521  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7522 
7523  smpAssign( ~lhs, A * B * rhs.scalar_ );
7524  }
7525  //**********************************************************************************************
7526 
7527  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a scaled matrix product to a sparse matrix of either
// storage order. Enabled only if IsEvaluationRequired<MT,MT1,MT2> applies.
//
// The expression is first evaluated into a temporary whose type is selected
// between ResultType and OppositeType depending on SO; the temporary is then
// assigned to the target via smpAssign().
7542  template< typename MT // Type of the target sparse matrix
7543  , bool SO > // Storage order of the target sparse matrix
7544  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7545  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7546  {
7548 
7549  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
7550 
7557 
7558  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7559  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7560 
7561  const TmpType tmp( rhs );
7562  smpAssign( ~lhs, tmp );
7563  }
7564  //**********************************************************************************************
7565 
7566  //**Restructuring SMP assignment to row-major matrices******************************************
// Restructuring SMP assignment of a scaled matrix product to a
// Matrix<MT,false> target (the row-major case per the section header).
// Enabled only if CanExploitSymmetry<MT,MT1,MT2> applies.
//
// The product operands are taken from rhs.matrix_ and the assignment is
// re-dispatched with the symmetric operand(s) wrapped in trans(); the scalar
// rhs.scalar_ is re-applied to the new product. Transposing a symmetric
// operand does not change its value.
7580  template< typename MT > // Type of the target matrix
7581  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7582  smpAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7583  {
7585 
7587 
7588  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7589  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7590 
7591  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7592  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7593 
      // Both symmetric: transpose both. Only MT1 symmetric: transpose the left operand.
      // Otherwise (presumably only MT2 symmetric): transpose the right operand.
7594  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7595  smpAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7596  else if( IsSymmetric<MT1>::value )
7597  smpAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7598  else
7599  smpAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7600  }
7601  //**********************************************************************************************
7602 
7603  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a scaled matrix product to a dense matrix of
// either storage order. Enabled only if IsEvaluationRequired<MT,MT1,MT2>
// applies.
//
// Both operands of the product are evaluated into the local objects A and B
// first; the operation is then re-dispatched as
// smpAddAssign( ~lhs, A*B*scalar ).
7618  template< typename MT // Type of the target dense matrix
7619  , bool SO > // Storage order of the target dense matrix
7620  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7621  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7622  {
7624 
7625  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7626  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7627 
7628  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7629  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7630 
      // Empty target or empty inner dimension: the target is left untouched.
7631  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7632  return;
7633  }
7634 
7635  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7636  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7637 
7638  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7639  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7640  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7641  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7642  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7643  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7644 
7645  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
7646  }
7647  //**********************************************************************************************
7648 
7649  //**Restructuring SMP addition assignment to row-major matrices*********************************
// Restructuring SMP addition assignment of a scaled matrix product to a
// Matrix<MT,false> target (the row-major case per the section header).
// Enabled only if CanExploitSymmetry<MT,MT1,MT2> applies.
//
// The product operands are taken from rhs.matrix_ and the addition assignment
// is re-dispatched with the symmetric operand(s) wrapped in trans(); the
// scalar rhs.scalar_ is re-applied to the new product. Transposing a symmetric
// operand does not change its value.
7664  template< typename MT > // Type of the target matrix
7665  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7666  smpAddAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7667  {
7669 
7671 
7672  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7673  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7674 
7675  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7676  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7677 
      // Both symmetric: transpose both. Only MT1 symmetric: transpose the left operand.
      // Otherwise (presumably only MT2 symmetric): transpose the right operand.
7678  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7679  smpAddAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7680  else if( IsSymmetric<MT1>::value )
7681  smpAddAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7682  else
7683  smpAddAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7684  }
7685  //**********************************************************************************************
7686 
7687  //**SMP addition assignment to sparse matrices**************************************************
7688  // No special implementation for the SMP addition assignment to sparse matrices.
7689  //**********************************************************************************************
7690 
7691  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a scaled matrix product to a dense matrix of
// either storage order. Enabled only if IsEvaluationRequired<MT,MT1,MT2>
// applies.
//
// Both operands of the product are evaluated into the local objects A and B
// first; the operation is then re-dispatched as
// smpSubAssign( ~lhs, A*B*scalar ).
7706  template< typename MT // Type of the target dense matrix
7707  , bool SO > // Storage order of the target dense matrix
7708  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7709  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7710  {
7712 
7713  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7714  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7715 
7716  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7717  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7718 
      // Empty target or empty inner dimension: the target is left untouched.
7719  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7720  return;
7721  }
7722 
7723  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7724  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7725 
7726  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7727  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7728  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7729  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7730  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7731  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7732 
7733  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
7734  }
7735  //**********************************************************************************************
7736 
7737  //**Restructuring SMP subtraction assignment to row-major matrices******************************
// Restructuring SMP subtraction assignment of a scaled matrix product to a
// Matrix<MT,false> target (the row-major case per the section header).
// Enabled only if CanExploitSymmetry<MT,MT1,MT2> applies.
//
// The product operands are taken from rhs.matrix_ and the subtraction
// assignment is re-dispatched with the symmetric operand(s) wrapped in
// trans(); the scalar rhs.scalar_ is re-applied to the new product.
// Transposing a symmetric operand does not change its value.
7752  template< typename MT > // Type of the target matrix
7753  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7754  smpSubAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7755  {
7757 
7759 
7760  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7761  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7762 
7763  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7764  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7765 
      // Both symmetric: transpose both. Only MT1 symmetric: transpose the left operand.
      // Otherwise (presumably only MT2 symmetric): transpose the right operand.
7766  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7767  smpSubAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7768  else if( IsSymmetric<MT1>::value )
7769  smpSubAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7770  else
7771  smpSubAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7772  }
7773  //**********************************************************************************************
7774 
7775  //**SMP subtraction assignment to sparse matrices***********************************************
7776  // No special implementation for the SMP subtraction assignment to sparse matrices.
7777  //**********************************************************************************************
7778 
7779  //**SMP multiplication assignment to dense matrices*********************************************
7780  // No special implementation for the SMP multiplication assignment to dense matrices.
7781  //**********************************************************************************************
7782 
7783  //**SMP multiplication assignment to sparse matrices********************************************
7784  // No special implementation for the SMP multiplication assignment to sparse matrices.
7785  //**********************************************************************************************
7786 
7787  //**Compile time checks*************************************************************************
// Compile-time check: the scalar type ST and the RightOperand type of this
// expression have to be the same type.
7795  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
7796  //**********************************************************************************************
7797 };
7799 //*************************************************************************************************
7800 
7801 
7802 
7803 
7804 //=================================================================================================
7805 //
7806 // GLOBAL BINARY ARITHMETIC OPERATORS
7807 //
7808 //=================================================================================================
7809 
7810 //*************************************************************************************************
// Multiplication operator for the product of two column-major dense matrices
// (lhs * rhs).
//
//   lhs : the left-hand side dense matrix of the product
//   rhs : the right-hand side dense matrix of the product
//   returns the expression object TDMatTDMatMultExpr<T1,T2> built from the two
//   unwrapped operands; the product itself is not computed here.
//   throws (via BLAZE_THROW_INVALID_ARGUMENT) if the number of columns of lhs
//   differs from the number of rows of rhs.
//
// The declarator line (source line 7839) was missing from this listing, which
// left the definition without a function name and without the lhs/rhs
// parameters used by the body; it is restored here. The storage-order flag
// 'true' matches the DenseMatrix<..., true> base of TDMatTDMatMultExpr.
7836 template< typename T1 // Type of the left-hand side dense matrix
7837  , typename T2 > // Type of the right-hand side dense matrix
7838 inline const TDMatTDMatMultExpr<T1,T2>
7839  operator*( const DenseMatrix<T1,true>& lhs, const DenseMatrix<T2,true>& rhs )
7840 {
7842 
      // The inner dimensions of the product have to agree.
7843  if( (~lhs).columns() != (~rhs).rows() ) {
7844  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
7845  }
7846 
7847  return TDMatTDMatMultExpr<T1,T2>( ~lhs, ~rhs );
7848 }
7849 //*************************************************************************************************
7850 
7851 
7852 
7853 
7854 //=================================================================================================
7855 //
7856 // ROWS SPECIALIZATIONS
7857 //
7858 //=================================================================================================
7859 
7860 //*************************************************************************************************
// The Rows trait of the product expression is taken from the left-hand side
// operand type: Rows< TDMatTDMatMultExpr<MT1,MT2> > derives from Rows<MT1>.
7862 template< typename MT1, typename MT2 >
7863 struct Rows< TDMatTDMatMultExpr<MT1,MT2> > : public Rows<MT1>
7864 {};
7866 //*************************************************************************************************
7867 
7868 
7869 
7870 
7871 //=================================================================================================
7872 //
7873 // COLUMNS SPECIALIZATIONS
7874 //
7875 //=================================================================================================
7876 
7877 //*************************************************************************************************
// The Columns trait of the product expression is taken from the right-hand
// side operand type: Columns< TDMatTDMatMultExpr<MT1,MT2> > derives from
// Columns<MT2>.
7879 template< typename MT1, typename MT2 >
7880 struct Columns< TDMatTDMatMultExpr<MT1,MT2> > : public Columns<MT2>
7881 {};
7883 //*************************************************************************************************
7884 
7885 
7886 
7887 
7888 //=================================================================================================
7889 //
7890 // ISALIGNED SPECIALIZATIONS
7891 //
7892 //=================================================================================================
7893 
7894 //*************************************************************************************************
// The product expression is flagged as aligned only if both operand types are
// flagged as aligned (IsAligned<MT1> and IsAligned<MT2>).
7896 template< typename MT1, typename MT2 >
7897 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2> >
7898  : public IsTrue< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7899 {};
7901 //*************************************************************************************************
7902 
7903 
7904 
7905 
7906 //=================================================================================================
7907 //
7908 // ISLOWER SPECIALIZATIONS
7909 //
7910 //=================================================================================================
7911 
7912 //*************************************************************************************************
// The product expression is flagged as lower only if both operand types are
// flagged as lower (IsLower<MT1> and IsLower<MT2>).
7914 template< typename MT1, typename MT2 >
7915 struct IsLower< TDMatTDMatMultExpr<MT1,MT2> >
7916  : public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
7917 {};
7919 //*************************************************************************************************
7920 
7921 
7922 
7923 
7924 //=================================================================================================
7925 //
7926 // ISUNILOWER SPECIALIZATIONS
7927 //
7928 //=================================================================================================
7929 
7930 //*************************************************************************************************
// The product expression is flagged as unilower only if both operand types are
// flagged as unilower (IsUniLower<MT1> and IsUniLower<MT2>).
7932 template< typename MT1, typename MT2 >
7933 struct IsUniLower< TDMatTDMatMultExpr<MT1,MT2> >
7934  : public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
7935 {};
7937 //*************************************************************************************************
7938 
7939 
7940 
7941 
7942 //=================================================================================================
7943 //
7944 // ISSTRICTLYLOWER SPECIALIZATIONS
7945 //
7946 //=================================================================================================
7947 
7948 //*************************************************************************************************
// The product expression is flagged as strictly lower if one operand type is
// strictly lower and the other one is lower: either
// (IsStrictlyLower<MT1> and IsLower<MT2>) or (IsStrictlyLower<MT2> and IsLower<MT1>).
7950 template< typename MT1, typename MT2 >
7951 struct IsStrictlyLower< TDMatTDMatMultExpr<MT1,MT2> >
7952  : public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7953  , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
7954 {};
7956 //*************************************************************************************************
7957 
7958 
7959 
7960 
7961 //=================================================================================================
7962 //
7963 // ISUPPER SPECIALIZATIONS
7964 //
7965 //=================================================================================================
7966 
7967 //*************************************************************************************************
// The product expression is flagged as upper only if both operand types are
// flagged as upper (IsUpper<MT1> and IsUpper<MT2>).
7969 template< typename MT1, typename MT2 >
7970 struct IsUpper< TDMatTDMatMultExpr<MT1,MT2> >
7971  : public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
7972 {};
7974 //*************************************************************************************************
7975 
7976 
7977 
7978 
7979 //=================================================================================================
7980 //
7981 // ISUNIUPPER SPECIALIZATIONS
7982 //
7983 //=================================================================================================
7984 
7985 //*************************************************************************************************
// The product expression is flagged as uniupper only if both operand types are
// flagged as uniupper (IsUniUpper<MT1> and IsUniUpper<MT2>).
7987 template< typename MT1, typename MT2 >
7988 struct IsUniUpper< TDMatTDMatMultExpr<MT1,MT2> >
7989  : public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
7990 {};
7992 //*************************************************************************************************
7993 
7994 
7995 
7996 
7997 //=================================================================================================
7998 //
7999 // ISSTRICTLYUPPER SPECIALIZATIONS
8000 //
8001 //=================================================================================================
8002 
8003 //*************************************************************************************************
// The product expression is flagged as strictly upper if one operand type is
// strictly upper and the other one is upper: either
// (IsStrictlyUpper<MT1> and IsUpper<MT2>) or (IsStrictlyUpper<MT2> and IsUpper<MT1>).
8005 template< typename MT1, typename MT2 >
8006 struct IsStrictlyUpper< TDMatTDMatMultExpr<MT1,MT2> >
8007  : public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8008  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
8009 {};
8011 //*************************************************************************************************
8012 
8013 
8014 
8015 
8016 //=================================================================================================
8017 //
8018 // EXPRESSION TRAIT SPECIALIZATIONS
8019 //
8020 //=================================================================================================
8021 
8022 //*************************************************************************************************
// Specialization of TDMatDVecMultExprTrait for a TDMatTDMatMultExpr on the left-hand side,
// i.e. for the expression type of ( MT1 * MT2 ) * VT.
// The nested Type is computed as if the expression were regrouped to MT1 * ( MT2 * VT ):
// first the trait is applied to (MT2,VT), then to MT1 and that intermediate result type.
// This Type is selected only if MT1 and MT2 are both dense, column-major matrix types and VT
// is a dense column vector type; in every other case Type is INVALID_TYPE.
8024 template< typename MT1, typename MT2, typename VT >
8025 struct TDMatDVecMultExprTrait< TDMatTDMatMultExpr<MT1,MT2>, VT >
8026 {
8027  public:
8028  //**********************************************************************************************
// SelectType arguments: (1) validity condition, (2) regrouped result type, (3) fallback type.
8029  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8030  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8031  IsDenseVector<VT>::value && IsColumnVector<VT>::value
8032  , typename TDMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
8033  , INVALID_TYPE >::Type Type;
8034  //**********************************************************************************************
8035 };
8037 //*************************************************************************************************
8038 
8039 
8040 //*************************************************************************************************
// Specialization of TDMatSVecMultExprTrait for a TDMatTDMatMultExpr on the left-hand side,
// i.e. for the expression type of ( MT1 * MT2 ) * VT with a sparse vector VT.
// The nested Type is computed as if the expression were regrouped to MT1 * ( MT2 * VT ):
// the inner step uses TDMatSVecMultExprTrait on (MT2,VT), the outer step uses
// TDMatDVecMultExprTrait on MT1 and that intermediate result type.
// This Type is selected only if MT1 and MT2 are both dense, column-major matrix types and VT
// is a sparse column vector type; in every other case Type is INVALID_TYPE.
8042 template< typename MT1, typename MT2, typename VT >
8043 struct TDMatSVecMultExprTrait< TDMatTDMatMultExpr<MT1,MT2>, VT >
8044 {
8045  public:
8046  //**********************************************************************************************
// SelectType arguments: (1) validity condition, (2) regrouped result type, (3) fallback type.
8047  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8048  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8049  IsSparseVector<VT>::value && IsColumnVector<VT>::value
8050  , typename TDMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
8051  , INVALID_TYPE >::Type Type;
8052  //**********************************************************************************************
8053 };
8055 //*************************************************************************************************
8056 
8057 
8058 //*************************************************************************************************
// Specialization of TDVecTDMatMultExprTrait for a TDMatTDMatMultExpr on the right-hand side,
// i.e. for the expression type of VT * ( MT1 * MT2 ).
// The nested Type is computed as if the expression were regrouped to ( VT * MT1 ) * MT2:
// first the trait is applied to (VT,MT1), then to that intermediate result type and MT2.
// This Type is selected only if VT is a dense row vector type and MT1 and MT2 are both dense,
// column-major matrix types; in every other case Type is INVALID_TYPE.
8060 template< typename VT, typename MT1, typename MT2 >
8061 struct TDVecTDMatMultExprTrait< VT, TDMatTDMatMultExpr<MT1,MT2> >
8062 {
8063  public:
8064  //**********************************************************************************************
// SelectType arguments: (1) validity condition, (2) regrouped result type, (3) fallback type.
8065  typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
8066  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8067  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8068  , typename TDVecTDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8069  , INVALID_TYPE >::Type Type;
8070  //**********************************************************************************************
8071 };
8073 //*************************************************************************************************
8074 
8075 
8076 //*************************************************************************************************
// Specialization of TSVecTDMatMultExprTrait for a TDMatTDMatMultExpr on the right-hand side,
// i.e. for the expression type of VT * ( MT1 * MT2 ) with a sparse vector VT.
// The nested Type is computed as if the expression were regrouped to ( VT * MT1 ) * MT2:
// the inner step uses TSVecTDMatMultExprTrait on (VT,MT1), the outer step uses
// TDVecTDMatMultExprTrait on that intermediate result type and MT2.
// This Type is selected only if VT is a sparse row vector type and MT1 and MT2 are both dense,
// column-major matrix types; in every other case Type is INVALID_TYPE.
8078 template< typename VT, typename MT1, typename MT2 >
8079 struct TSVecTDMatMultExprTrait< VT, TDMatTDMatMultExpr<MT1,MT2> >
8080 {
8081  public:
8082  //**********************************************************************************************
// SelectType arguments: (1) validity condition, (2) regrouped result type, (3) fallback type.
8083  typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
8084  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8085  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8086  , typename TDVecTDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8087  , INVALID_TYPE >::Type Type;
8088  //**********************************************************************************************
8089 };
8091 //*************************************************************************************************
8092 
8093 
8094 //*************************************************************************************************
// Specialization of SubmatrixExprTrait for the TDMatTDMatMultExpr expression type.
// The nested Type is the MultExprTrait result of the submatrix types of both operands:
// SubmatrixExprTrait is applied to const MT1 and to const MT2, each with the same AF flag
// that was passed to this specialization, and the two resulting types are multiplied.
// NOTE(review): AF is presumably the alignment flag of the submatrix view - confirm against
// the primary SubmatrixExprTrait template.
8096 template< typename MT1, typename MT2, bool AF >
8097 struct SubmatrixExprTrait< TDMatTDMatMultExpr<MT1,MT2>, AF >
8098 {
8099  public:
8100  //**********************************************************************************************
8101  typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
8102  , typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
8103  //**********************************************************************************************
8104 };
8106 //*************************************************************************************************
8107 
8108 
8109 //*************************************************************************************************
// Specialization of RowExprTrait for the TDMatTDMatMultExpr expression type.
// The nested Type is the MultExprTrait result of the row type of the left-hand side operand
// (RowExprTrait applied to const MT1) and the complete right-hand side operand type MT2,
// i.e. a row of the product is expressed as ( row of MT1 ) * MT2.
8111 template< typename MT1, typename MT2 >
8112 struct RowExprTrait< TDMatTDMatMultExpr<MT1,MT2> >
8113 {
8114  public:
8115  //**********************************************************************************************
8116  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
8117  //**********************************************************************************************
8118 };
8120 //*************************************************************************************************
8121 
8122 
8123 //*************************************************************************************************
// Specialization of ColumnExprTrait for the TDMatTDMatMultExpr expression type.
// The nested Type is the MultExprTrait result of the complete left-hand side operand type MT1
// and the column type of the right-hand side operand (ColumnExprTrait applied to const MT2),
// i.e. a column of the product is expressed as MT1 * ( column of MT2 ).
8125 template< typename MT1, typename MT2 >
8126 struct ColumnExprTrait< TDMatTDMatMultExpr<MT1,MT2> >
8127 {
8128  public:
8129  //**********************************************************************************************
8130  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
8131  //**********************************************************************************************
8132 };
8134 //*************************************************************************************************
8135 
8136 } // namespace blaze
8137 
8138 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:295
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1729
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, simd_int16_t >::Type set(T value)
Sets all values in the vector to the given 2-byte integral value.
Definition: Set.h:73
Data type constraint.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:246
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:252
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:7820
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:249
Header file for basic type definitions.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:240
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:150
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:351
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatTDMatMultExpr.h:244
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
Header file for the And class template.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:450
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:257
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Expression object for transpose dense matrix-transpose dense matrix multiplications. The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:144
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:241
Header file for the IsUniLower type trait.
CompressedMatrix< Type, false > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:2584
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:153
TDMatTDMatMultExpr< MT1, MT2 > This
Type of this TDMatTDMatMultExpr instance.
Definition: TDMatTDMatMultExpr.h:239
Header file for the IsComplexDouble type trait.
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:377
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:397
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1682
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:245
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:431
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:421
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:148
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:280
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:149
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:116
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:441
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:138
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
Constraint on the data type.
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:151
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:122
Header file for the IsDenseVector type trait.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:367
Header file for all intrinsic functionality.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:409
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:243
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:451
Header file for the IsRowMajorMatrix type trait.
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:944
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:387
Header file for the complex data type.
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:258
Header file for the IsUpper type trait.
Header file for exception macros.
Header file for the IsColumnVector type trait.
Constraint on the data type.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:242
Header file for the IsResizable type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:255
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:152