DMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/Exception.h>
60 #include <blaze/math/Functions.h>
61 #include <blaze/math/shims/Reset.h>
62 #include <blaze/math/SIMD.h>
103 #include <blaze/system/BLAS.h>
104 #include <blaze/system/Blocking.h>
106 #include <blaze/system/Thresholds.h>
107 #include <blaze/util/Assert.h>
108 #include <blaze/util/Complex.h>
112 #include <blaze/util/DisableIf.h>
113 #include <blaze/util/EnableIf.h>
115 #include <blaze/util/InvalidType.h>
117 #include <blaze/util/mpl/And.h>
118 #include <blaze/util/mpl/If.h>
119 #include <blaze/util/mpl/Not.h>
120 #include <blaze/util/mpl/Or.h>
121 #include <blaze/util/Types.h>
130 
131 
132 namespace blaze {
133 
134 //=================================================================================================
135 //
136 // CLASS DMATDMATMULTEXPR
137 //
138 //=================================================================================================
139 
140 //*************************************************************************************************
147 template< typename MT1 // Type of the left-hand side dense matrix
148  , typename MT2 > // Type of the right-hand side dense matrix
149 class DMatDMatMultExpr : public DenseMatrix< DMatDMatMultExpr<MT1,MT2>, false >
150  , private MatMatMultExpr
151  , private Computation
152 {
153  private:
154  //**Type definitions****************************************************************************
161  //**********************************************************************************************
162 
163  //**********************************************************************************************
// Compile-time flag: set when the left-hand side operand type MT1 is itself a computation
// or reports (via RequiresEvaluation) that it needs an intermediate evaluation.
165  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
166  //**********************************************************************************************
167 
168  //**********************************************************************************************
// Compile-time flag: same test as evaluateLeft, applied to the right-hand side operand type MT2.
170  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
171  //**********************************************************************************************
172 
173  //**********************************************************************************************
175 
// Helper trait: 'value' is true when T1 is a column-major matrix type and at least one of
// T2 and T3 is a symmetric matrix type.
// NOTE(review): presumably T1 is the assignment target and T2/T3 the two operands - confirm
// against the call sites.
181  template< typename T1, typename T2, typename T3 >
182  struct CanExploitSymmetry {
183  enum : bool { value = IsColumnMajorMatrix<T1>::value &&
184  ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
185  };
187  //**********************************************************************************************
188 
189  //**********************************************************************************************
191 
// Helper trait: 'value' is true when at least one operand must be evaluated
// (evaluateLeft or evaluateRight) and CanExploitSymmetry<T1,T2,T3> does not apply.
195  template< typename T1, typename T2, typename T3 >
196  struct IsEvaluationRequired {
197  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
198  !CanExploitSymmetry<T1,T2,T3>::value };
199  };
201  //**********************************************************************************************
202 
203  //**********************************************************************************************
205 
208  template< typename T1, typename T2, typename T3 >
209  struct UseBlasKernel {
211  HasMutableDataAccess<T1>::value &&
212  HasConstDataAccess<T2>::value &&
213  HasConstDataAccess<T3>::value &&
214  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
215  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
216  IsBLASCompatible< ElementType_<T1> >::value &&
217  IsBLASCompatible< ElementType_<T2> >::value &&
218  IsBLASCompatible< ElementType_<T3> >::value &&
219  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
220  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
221  };
223  //**********************************************************************************************
224 
225  //**********************************************************************************************
227 
// Helper trait: 'value' is true when the vectorized default kernels may be used for the
// three matrix types. Requires that optimized kernels are enabled (useOptimizedKernels),
// that T3 is not diagonal, that all three types are SIMD enabled, that the three element
// types are SIMD combinable, and that SIMD addition and multiplication are available for
// the element types of T2 and T3.
230  template< typename T1, typename T2, typename T3 >
231  struct UseVectorizedDefaultKernel {
232  enum : bool { value = useOptimizedKernels &&
233  !IsDiagonal<T3>::value &&
234  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
235  AreSIMDCombinable< ElementType_<T1>
236  , ElementType_<T2>
237  , ElementType_<T3> >::value &&
238  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
239  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
240  };
242  //**********************************************************************************************
243 
244  public:
245  //**Type definitions****************************************************************************
252  typedef const ElementType ReturnType;
253  typedef const ResultType CompositeType;
254 
256  typedef If_< IsExpression<MT1>, const MT1, const MT1& > LeftOperand;
257 
259  typedef If_< IsExpression<MT2>, const MT2, const MT2& > RightOperand;
260 
263 
266  //**********************************************************************************************
267 
268  //**Compilation flags***************************************************************************
270  enum : bool { simdEnabled = !IsDiagonal<MT2>::value &&
271  MT1::simdEnabled && MT2::simdEnabled &&
274 
// Compilation flag for SMP assignments: true only if neither operand needs an intermediate
// evaluation and both operand types are themselves SMP assignable.
276  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
277  !evaluateRight && MT2::smpAssignable };
278  //**********************************************************************************************
279 
280  //**SIMD properties*****************************************************************************
// Value of SIMDTrait<ElementType>::size; used by the vectorized kernels as the column step
// of one SIMD load/store.
282  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
283  //**********************************************************************************************
284 
285  //**Constructor*********************************************************************************
// Constructs the multiplication expression from its two operands.
//  lhs: left-hand side operand of the multiplication expression.
//  rhs: right-hand side operand of the multiplication expression.
// The operands are stored as LeftOperand/RightOperand. The inner dimensions are only
// checked by an internal assertion, not by a thrown exception.
291  explicit inline DMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
292  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
293  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
294  {
295  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
296  }
297  //**********************************************************************************************
298 
299  //**Access operator*****************************************************************************
306  inline ReturnType operator()( size_t i, size_t j ) const {
307  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
308  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
309 
310  if( IsDiagonal<MT1>::value ) {
311  return lhs_(i,i) * rhs_(i,j);
312  }
313  else if( IsDiagonal<MT2>::value ) {
314  return lhs_(i,j) * rhs_(j,j);
315  }
317  const size_t begin( ( IsUpper<MT1>::value )
318  ?( ( IsLower<MT2>::value )
319  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
320  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
321  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
322  :( ( IsLower<MT2>::value )
323  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
324  :( 0UL ) ) );
325  const size_t end( ( IsLower<MT1>::value )
326  ?( ( IsUpper<MT2>::value )
327  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
328  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
329  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
330  :( ( IsUpper<MT2>::value )
331  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
332  :( lhs_.columns() ) ) );
333 
334  if( begin >= end ) return ElementType();
335 
336  const size_t n( end - begin );
337 
338  return subvector( row( lhs_, i ), begin, n ) * subvector( column( rhs_, j ), begin, n );
339  }
340  else {
341  return row( lhs_, i ) * column( rhs_, j );
342  }
343  }
344  //**********************************************************************************************
345 
346  //**At function*********************************************************************************
354  inline ReturnType at( size_t i, size_t j ) const {
355  if( i >= lhs_.rows() ) {
356  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
357  }
358  if( j >= rhs_.columns() ) {
359  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
360  }
361  return (*this)(i,j);
362  }
363  //**********************************************************************************************
364 
365  //**Rows function*******************************************************************************
// Returns the number of rows of the product, i.e. the number of rows of the left operand.
370  inline size_t rows() const noexcept {
371  return lhs_.rows();
372  }
373  //**********************************************************************************************
374 
375  //**Columns function****************************************************************************
// Returns the number of columns of the product, i.e. the number of columns of the right operand.
380  inline size_t columns() const noexcept {
381  return rhs_.columns();
382  }
383  //**********************************************************************************************
384 
385  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand (lhs_).
390  inline LeftOperand leftOperand() const noexcept {
391  return lhs_;
392  }
393  //**********************************************************************************************
394 
395  //**Right operand access************************************************************************
// Returns the stored right-hand side operand (rhs_).
400  inline RightOperand rightOperand() const noexcept {
401  return rhs_;
402  }
403  //**********************************************************************************************
404 
405  //**********************************************************************************************
411  template< typename T >
412  inline bool canAlias( const T* alias ) const noexcept {
413  return ( lhs_.canAlias( alias ) || rhs_.canAlias( alias ) );
414  }
415  //**********************************************************************************************
416 
417  //**********************************************************************************************
423  template< typename T >
424  inline bool isAliased( const T* alias ) const noexcept {
425  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
426  }
427  //**********************************************************************************************
428 
429  //**********************************************************************************************
// Returns true only if both operands report that they are aligned.
434  inline bool isAligned() const noexcept {
435  return lhs_.isAligned() && rhs_.isAligned();
436  }
437  //**********************************************************************************************
438 
439  //**********************************************************************************************
444  inline bool canSMPAssign() const noexcept {
445  return ( !BLAZE_BLAS_IS_PARALLEL ||
446  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
447  ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD );
448  }
449  //**********************************************************************************************
450 
451  private:
452  //**Member variables****************************************************************************
453  LeftOperand lhs_; // Left-hand side operand of the multiplication expression.
454  RightOperand rhs_; // Right-hand side operand of the multiplication expression.
455  //**********************************************************************************************
456 
457  //**Assignment to dense matrices****************************************************************
470  template< typename MT // Type of the target dense matrix
471  , bool SO > // Storage order of the target dense matrix
473  assign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
474  {
476 
477  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
478  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
479 
480  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
481  return;
482  }
483  else if( rhs.lhs_.columns() == 0UL ) {
484  reset( ~lhs );
485  return;
486  }
487 
488  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
489  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
490 
491  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
492  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
493  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
494  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
495  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
496  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
497 
498  DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
499  }
501  //**********************************************************************************************
502 
503  //**Assignment to dense matrices (kernel selection)*********************************************
514  template< typename MT3 // Type of the left-hand side target matrix
515  , typename MT4 // Type of the left-hand side matrix operand
516  , typename MT5 > // Type of the right-hand side matrix operand
517  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
518  {
519  if( ( IsDiagonal<MT5>::value ) ||
520  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
521  selectSmallAssignKernel( C, A, B );
522  else
523  selectBlasAssignKernel( C, A, B );
524  }
526  //**********************************************************************************************
527 
528  //**Default assignment to dense matrices (general/general)**************************************
// Default (scalar) assignment kernel C = A * B for the case that neither A nor B is diagonal.
//  C: the target matrix.
//  A: the left-hand side operand.
//  B: the right-hand side operand.
// Works row by row: the first k iteration assigns into row i of C, the remaining
// k iterations accumulate into it. Upper/lower structure of A narrows the k range,
// upper/lower structure of B narrows the j range.
542  template< typename MT3 // Type of the left-hand side target matrix
543  , typename MT4 // Type of the left-hand side matrix operand
544  , typename MT5 > // Type of the right-hand side matrix operand
545  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
546  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
547  {
548  const size_t M( A.rows() );
549  const size_t N( B.columns() );
550  const size_t K( A.columns() );
551 
552  for( size_t i=0UL; i<M; ++i )
553  {
// Range [kbegin,kend) of summation indices used for row i, selected by the structure of A.
554  const size_t kbegin( ( IsUpper<MT4>::value )
555  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
556  :( 0UL ) );
557  const size_t kend( ( IsLower<MT4>::value )
558  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
559  :( K ) );
560  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
561 
// For a strictly triangular A an empty k range means the whole row i of C is reset.
562  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
563  for( size_t j=0UL; j<N; ++j ) {
564  reset( (~C)(i,j) );
565  }
566  continue;
567  }
568 
// First summation index (k == kbegin): assigns to row i instead of accumulating and
// resets the columns outside [jbegin,jend) in the cases handled below.
569  {
570  const size_t jbegin( ( IsUpper<MT5>::value )
571  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
572  :( 0UL ) );
573  const size_t jend( ( IsLower<MT5>::value )
574  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
575  :( N ) );
576  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
577 
578  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
579  for( size_t j=0UL; j<jbegin; ++j ) {
580  reset( C(i,j) );
581  }
582  }
583  else if( IsStrictlyUpper<MT5>::value ) {
584  reset( C(i,0UL) );
585  }
586  for( size_t j=jbegin; j<jend; ++j ) {
587  C(i,j) = A(i,kbegin) * B(kbegin,j);
588  }
589  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
590  for( size_t j=jend; j<N; ++j ) {
591  reset( C(i,j) );
592  }
593  }
594  else if( IsStrictlyLower<MT5>::value ) {
595  reset( C(i,N-1UL) );
596  }
597  }
598 
// Remaining summation indices: accumulate into row i.
599  for( size_t k=kbegin+1UL; k<kend; ++k )
600  {
601  const size_t jbegin( ( IsUpper<MT5>::value )
602  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
603  :( 0UL ) );
604  const size_t jend( ( IsLower<MT5>::value )
605  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
606  :( N ) );
607  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
608 
609  for( size_t j=jbegin; j<jend; ++j ) {
610  C(i,j) += A(i,k) * B(k,j);
611  }
// For a lower B the element in column jend is assigned rather than accumulated.
// NOTE(review): presumably because this k is the first to contribute to that column - confirm.
612  if( IsLower<MT5>::value ) {
613  C(i,jend) = A(i,k) * B(k,jend);
614  }
615  }
616  }
617  }
619  //**********************************************************************************************
620 
621  //**Default assignment to dense matrices (general/diagonal)*************************************
635  template< typename MT3 // Type of the left-hand side target matrix
636  , typename MT4 // Type of the left-hand side matrix operand
637  , typename MT5 > // Type of the right-hand side matrix operand
638  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
639  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
640  {
642 
643  const size_t M( A.rows() );
644  const size_t N( B.columns() );
645 
646  for( size_t i=0UL; i<M; ++i )
647  {
648  const size_t jbegin( ( IsUpper<MT4>::value )
649  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
650  :( 0UL ) );
651  const size_t jend( ( IsLower<MT4>::value )
652  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
653  :( N ) );
654  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
655 
656  if( IsUpper<MT4>::value ) {
657  for( size_t j=0UL; j<jbegin; ++j ) {
658  reset( C(i,j) );
659  }
660  }
661  for( size_t j=jbegin; j<jend; ++j ) {
662  C(i,j) = A(i,j) * B(j,j);
663  }
664  if( IsLower<MT4>::value ) {
665  for( size_t j=jend; j<N; ++j ) {
666  reset( C(i,j) );
667  }
668  }
669  }
670  }
672  //**********************************************************************************************
673 
674  //**Default assignment to dense matrices (diagonal/general)*************************************
688  template< typename MT3 // Type of the left-hand side target matrix
689  , typename MT4 // Type of the left-hand side matrix operand
690  , typename MT5 > // Type of the right-hand side matrix operand
691  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
692  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
693  {
695 
696  const size_t M( A.rows() );
697  const size_t N( B.columns() );
698 
699  for( size_t i=0UL; i<M; ++i )
700  {
701  const size_t jbegin( ( IsUpper<MT5>::value )
702  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
703  :( 0UL ) );
704  const size_t jend( ( IsLower<MT5>::value )
705  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
706  :( N ) );
707  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
708 
709  if( IsUpper<MT5>::value ) {
710  for( size_t j=0UL; j<jbegin; ++j ) {
711  reset( C(i,j) );
712  }
713  }
714  for( size_t j=jbegin; j<jend; ++j ) {
715  C(i,j) = A(i,i) * B(i,j);
716  }
717  if( IsLower<MT5>::value ) {
718  for( size_t j=jend; j<N; ++j ) {
719  reset( C(i,j) );
720  }
721  }
722  }
723  }
725  //**********************************************************************************************
726 
727  //**Default assignment to dense matrices (diagonal/diagonal)************************************
741  template< typename MT3 // Type of the left-hand side target matrix
742  , typename MT4 // Type of the left-hand side matrix operand
743  , typename MT5 > // Type of the right-hand side matrix operand
744  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
745  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
746  {
748 
749  reset( C );
750 
751  for( size_t i=0UL; i<A.rows(); ++i ) {
752  C(i,i) = A(i,i) * B(i,i);
753  }
754  }
756  //**********************************************************************************************
757 
758  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel for type combinations that do not satisfy
// UseVectorizedDefaultKernel: forwards to the scalar default kernel.
771  template< typename MT3 // Type of the left-hand side target matrix
772  , typename MT4 // Type of the left-hand side matrix operand
773  , typename MT5 > // Type of the right-hand side matrix operand
774  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
775  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
776  {
777  selectDefaultAssignKernel( C, A, B );
778  }
780  //**********************************************************************************************
781 
782  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Vectorized small-matrix assignment kernel C = A * B for a row-major target.
//  C: the target matrix (row-major dense matrix).
//  A: the left-hand side operand.
//  B: the right-hand side operand.
// The columns of C are processed in panels of 8, 4, 2 and 1 SIMD vectors, followed by a
// scalar loop for the columns the SIMD loops could not cover. Apart from the 8-vector
// panel, two rows of C are processed at a time with a single-row tail. In every panel the
// summation range [kbegin,kend) is narrowed according to the upper/lower structure of A
// and B.
// NOTE(review): the SIMD accumulators are default-constructed and then added to; this
// relies on SIMDType default-initializing to zero - confirm.
797  template< typename MT3 // Type of the left-hand side target matrix
798  , typename MT4 // Type of the left-hand side matrix operand
799  , typename MT5 > // Type of the right-hand side matrix operand
800  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
801  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
802  {
803  const size_t M( A.rows() );
804  const size_t N( B.columns() );
805  const size_t K( A.columns() );
806 
// A scalar remainder loop is needed unless both C and B are padded.
807  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
808 
// End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE if a
// remainder loop is needed (the assertion below checks the rounding), N otherwise.
809  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
810  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
811 
812  size_t j( 0UL );
813 
// Panels of 8 SIMD vectors, one row of C at a time.
814  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
815  for( size_t i=0UL; i<M; ++i )
816  {
817  const size_t kbegin( ( IsUpper<MT4>::value )
818  ?( ( IsLower<MT5>::value )
819  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
820  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
821  :( IsLower<MT5>::value ? j : 0UL ) );
822  const size_t kend( ( IsLower<MT4>::value )
823  ?( ( IsUpper<MT5>::value )
824  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
825  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
826  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
827 
828  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
829 
830  for( size_t k=kbegin; k<kend; ++k ) {
831  const SIMDType a1( set( A(i,k) ) );
832  xmm1 = xmm1 + a1 * B.load(k,j );
833  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
834  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
835  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
836  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
837  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
838  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
839  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
840  }
841 
842  (~C).store( i, j , xmm1 );
843  (~C).store( i, j+SIMDSIZE , xmm2 );
844  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
845  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
846  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
847  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
848  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
849  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
850  }
851  }
852 
// Panels of 4 SIMD vectors, two rows of C at a time.
853  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
854  {
855  size_t i( 0UL );
856 
857  for( ; (i+2UL) <= M; i+=2UL )
858  {
859  const size_t kbegin( ( IsUpper<MT4>::value )
860  ?( ( IsLower<MT5>::value )
861  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
862  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
863  :( IsLower<MT5>::value ? j : 0UL ) );
864  const size_t kend( ( IsLower<MT4>::value )
865  ?( ( IsUpper<MT5>::value )
866  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
867  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
868  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
869 
// xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1.
870  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
871 
872  for( size_t k=kbegin; k<kend; ++k ) {
873  const SIMDType a1( set( A(i ,k) ) );
874  const SIMDType a2( set( A(i+1UL,k) ) );
875  const SIMDType b1( B.load(k,j ) );
876  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
877  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
878  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
879  xmm1 = xmm1 + a1 * b1;
880  xmm2 = xmm2 + a1 * b2;
881  xmm3 = xmm3 + a1 * b3;
882  xmm4 = xmm4 + a1 * b4;
883  xmm5 = xmm5 + a2 * b1;
884  xmm6 = xmm6 + a2 * b2;
885  xmm7 = xmm7 + a2 * b3;
886  xmm8 = xmm8 + a2 * b4;
887  }
888 
889  (~C).store( i , j , xmm1 );
890  (~C).store( i , j+SIMDSIZE , xmm2 );
891  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
892  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
893  (~C).store( i+1UL, j , xmm5 );
894  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
895  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
896  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
897  }
898 
// Single remaining row when M is odd.
899  if( i < M )
900  {
901  const size_t kbegin( ( IsUpper<MT4>::value )
902  ?( ( IsLower<MT5>::value )
903  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
904  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
905  :( IsLower<MT5>::value ? j : 0UL ) );
906  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
907 
908  SIMDType xmm1, xmm2, xmm3, xmm4;
909 
910  for( size_t k=kbegin; k<kend; ++k ) {
911  const SIMDType a1( set( A(i,k) ) );
912  xmm1 = xmm1 + a1 * B.load(k,j );
913  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
914  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
915  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
916  }
917 
918  (~C).store( i, j , xmm1 );
919  (~C).store( i, j+SIMDSIZE , xmm2 );
920  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
921  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
922  }
923  }
924 
// Panels of 2 SIMD vectors, two rows of C at a time.
925  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
926  {
927  size_t i( 0UL );
928 
929  for( ; (i+2UL) <= M; i+=2UL )
930  {
931  const size_t kbegin( ( IsUpper<MT4>::value )
932  ?( ( IsLower<MT5>::value )
933  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
934  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
935  :( IsLower<MT5>::value ? j : 0UL ) );
936  const size_t kend( ( IsLower<MT4>::value )
937  ?( ( IsUpper<MT5>::value )
938  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
939  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
940  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
941 
// xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1.
942  SIMDType xmm1, xmm2, xmm3, xmm4;
943 
944  for( size_t k=kbegin; k<kend; ++k ) {
945  const SIMDType a1( set( A(i ,k) ) );
946  const SIMDType a2( set( A(i+1UL,k) ) );
947  const SIMDType b1( B.load(k,j ) );
948  const SIMDType b2( B.load(k,j+SIMDSIZE) );
949  xmm1 = xmm1 + a1 * b1;
950  xmm2 = xmm2 + a1 * b2;
951  xmm3 = xmm3 + a2 * b1;
952  xmm4 = xmm4 + a2 * b2;
953  }
954 
955  (~C).store( i , j , xmm1 );
956  (~C).store( i , j+SIMDSIZE, xmm2 );
957  (~C).store( i+1UL, j , xmm3 );
958  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
959  }
960 
// Single remaining row when M is odd.
961  if( i < M )
962  {
963  const size_t kbegin( ( IsUpper<MT4>::value )
964  ?( ( IsLower<MT5>::value )
965  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
966  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
967  :( IsLower<MT5>::value ? j : 0UL ) );
968  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
969 
970  SIMDType xmm1, xmm2;
971 
972  for( size_t k=kbegin; k<kend; ++k ) {
973  const SIMDType a1( set( A(i,k) ) );
974  xmm1 = xmm1 + a1 * B.load(k,j );
975  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
976  }
977 
978  (~C).store( i, j , xmm1 );
979  (~C).store( i, j+SIMDSIZE, xmm2 );
980  }
981  }
982 
// Single SIMD vector columns, two rows of C at a time.
983  for( ; j<jpos; j+=SIMDSIZE )
984  {
985  size_t i( 0UL );
986 
987  for( ; (i+2UL) <= M; i+=2UL )
988  {
989  const size_t kbegin( ( IsUpper<MT4>::value )
990  ?( ( IsLower<MT5>::value )
991  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
992  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
993  :( IsLower<MT5>::value ? j : 0UL ) );
994  const size_t kend( ( IsLower<MT4>::value )
995  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
996  :( K ) );
997 
998  SIMDType xmm1, xmm2;
999 
1000  for( size_t k=kbegin; k<kend; ++k ) {
1001  const SIMDType b1( B.load(k,j) );
1002  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1003  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1004  }
1005 
1006  (~C).store( i , j, xmm1 );
1007  (~C).store( i+1UL, j, xmm2 );
1008  }
1009 
// Single remaining row when M is odd; the summation runs up to K.
1010  if( i < M )
1011  {
1012  const size_t kbegin( ( IsUpper<MT4>::value )
1013  ?( ( IsLower<MT5>::value )
1014  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1015  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1016  :( IsLower<MT5>::value ? j : 0UL ) );
1017 
1018  SIMDType xmm1;
1019 
1020  for( size_t k=kbegin; k<K; ++k ) {
1021  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
1022  }
1023 
1024  (~C).store( i, j, xmm1 );
1025  }
1026  }
1027 
// Scalar loop over the columns not covered by the SIMD loops (only if 'remainder' is set).
1028  for( ; remainder && j<N; ++j )
1029  {
1030  size_t i( 0UL );
1031 
1032  for( ; (i+2UL) <= M; i+=2UL )
1033  {
1034  const size_t kbegin( ( IsUpper<MT4>::value )
1035  ?( ( IsLower<MT5>::value )
1036  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1037  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1038  :( IsLower<MT5>::value ? j : 0UL ) );
1039  const size_t kend( ( IsLower<MT4>::value )
1040  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1041  :( K ) );
1042 
1043  ElementType value1 = ElementType();
1044  ElementType value2 = ElementType();
1045 
1046  for( size_t k=kbegin; k<kend; ++k ) {
1047  value1 += A(i ,k) * B(k,j);
1048  value2 += A(i+1UL,k) * B(k,j);
1049  }
1050 
1051  (~C)(i ,j) = value1;
1052  (~C)(i+1UL,j) = value2;
1053  }
1054 
// Single remaining row when M is odd.
1055  if( i < M )
1056  {
1057  const size_t kbegin( ( IsUpper<MT4>::value )
1058  ?( ( IsLower<MT5>::value )
1059  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1060  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1061  :( IsLower<MT5>::value ? j : 0UL ) );
1062 
1063  ElementType value = ElementType();
1064 
1065  for( size_t k=kbegin; k<K; ++k ) {
1066  value += A(i,k) * B(k,j);
1067  }
1068 
1069  (~C)(i,j) = value;
1070  }
1071  }
1072  }
1074  //**********************************************************************************************
1075 
1076  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1091  template< typename MT3 // Type of the left-hand side target matrix
1092  , typename MT4 // Type of the left-hand side matrix operand
1093  , typename MT5 > // Type of the right-hand side matrix operand
1094  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1095  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1096  {
1101 
1102  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1103  const OppositeType_<MT4> tmp( serial( A ) );
1104  assign( ~C, tmp * B );
1105  }
1106  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1107  const OppositeType_<MT5> tmp( serial( B ) );
1108  assign( ~C, A * tmp );
1109  }
1110  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1111  const OppositeType_<MT4> tmp( serial( A ) );
1112  assign( ~C, tmp * B );
1113  }
1114  else {
1115  const OppositeType_<MT5> tmp( serial( B ) );
1116  assign( ~C, A * tmp );
1117  }
1118  }
1120  //**********************************************************************************************
1121 
1122  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel for type combinations that do not satisfy
// UseVectorizedDefaultKernel: forwards to the scalar default kernel.
1135  template< typename MT3 // Type of the left-hand side target matrix
1136  , typename MT4 // Type of the left-hand side matrix operand
1137  , typename MT5 > // Type of the right-hand side matrix operand
1138  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1139  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1140  {
1141  selectDefaultAssignKernel( C, A, B );
1142  }
1144  //**********************************************************************************************
1145 
1146  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
// Vectorized default assignment of a large dense matrix-dense matrix product to a row-major
// target matrix (C = A * B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// The iteration space is tiled in blocks of DMATDMATMULT_JBLOCK_SIZE columns,
// DMATDMATMULT_IBLOCK_SIZE rows and DMATDMATMULT_KBLOCK_SIZE inner indices. Every (jj,ii) tile
// of C is reset once and is then accumulated into across all kk blocks: the current values of
// C are loaded, the partial products of the kk block are added and the sums are stored back.
// Within a tile the columns are processed in strips of 4, 2 and 1 SIMD vectors, followed by an
// element-wise loop for the columns that are left over.
1161  template< typename MT3 // Type of the left-hand side target matrix
1162  , typename MT4 // Type of the left-hand side matrix operand
1163  , typename MT5 > // Type of the right-hand side matrix operand
1164  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1165  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1166  {
1167  const size_t M( A.rows() );
1168  const size_t N( B.columns() );
1169  const size_t K( A.columns() );
1170 
// An element-wise column tail is only required if C or B is not padded.
1171  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1172 
// Outermost tiling: blocks of columns [jj,jend).
1173  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
1174  {
1175  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
1176 
// End of the SIMD-processed columns of this block: jend rounded down to a multiple of SIMDSIZE
// if a tail loop is required (checked by the assertion below), jend itself otherwise.
1177  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1178  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1179 
// Second tiling level: blocks of rows [ii,iend).
1180  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
1181  {
1182  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
1183 
// Reset the current tile of C before any partial product is accumulated into it.
1184  for( size_t i=ii; i<iend; ++i ) {
1185  for( size_t j=jj; j<jend; ++j ) {
1186  reset( (~C)(i,j) );
1187  }
1188  }
1189 
// Third tiling level: blocks of inner indices [kk,ktmp).
1190  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
1191  {
1192  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
1193 
// The column index j is shared by all strip loops below; each loop continues where the
// previous one stopped.
1194  size_t j( jj );
1195 
// Strips of four SIMD vectors (columns j, j1, j2, j3).
1196  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1197  {
1198  const size_t j1( j+SIMDSIZE );
1199  const size_t j2( j+SIMDSIZE*2UL );
1200  const size_t j3( j+SIMDSIZE*3UL );
1201 
1202  size_t i( ii );
1203 
// Two rows at a time: 2x4 accumulators.
1204  for( ; (i+2UL) <= iend; i+=2UL )
1205  {
// The k range of the current block is narrowed depending on the IsUpper/IsLower traits of A
// and B (presumably to skip structurally zero elements of triangular operands). The same
// scheme is used for all tiles below, with the row and column extents of the respective tile.
1206  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1207  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1208  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1209  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
1210 
// Start from the values accumulated so far by the previous kk blocks.
1211  SIMDType xmm1( (~C).load(i ,j ) );
1212  SIMDType xmm2( (~C).load(i ,j1) );
1213  SIMDType xmm3( (~C).load(i ,j2) );
1214  SIMDType xmm4( (~C).load(i ,j3) );
1215  SIMDType xmm5( (~C).load(i+1UL,j ) );
1216  SIMDType xmm6( (~C).load(i+1UL,j1) );
1217  SIMDType xmm7( (~C).load(i+1UL,j2) );
1218  SIMDType xmm8( (~C).load(i+1UL,j3) );
1219 
// set() presumably broadcasts the scalar element of A to all lanes of a SIMD vector.
1220  for( size_t k=kbegin; k<kend; ++k ) {
1221  const SIMDType a1( set( A(i ,k) ) );
1222  const SIMDType a2( set( A(i+1UL,k) ) );
1223  const SIMDType b1( B.load(k,j ) );
1224  const SIMDType b2( B.load(k,j1) );
1225  const SIMDType b3( B.load(k,j2) );
1226  const SIMDType b4( B.load(k,j3) );
1227  xmm1 = xmm1 + a1 * b1;
1228  xmm2 = xmm2 + a1 * b2;
1229  xmm3 = xmm3 + a1 * b3;
1230  xmm4 = xmm4 + a1 * b4;
1231  xmm5 = xmm5 + a2 * b1;
1232  xmm6 = xmm6 + a2 * b2;
1233  xmm7 = xmm7 + a2 * b3;
1234  xmm8 = xmm8 + a2 * b4;
1235  }
1236 
1237  (~C).store( i , j , xmm1 );
1238  (~C).store( i , j1, xmm2 );
1239  (~C).store( i , j2, xmm3 );
1240  (~C).store( i , j3, xmm4 );
1241  (~C).store( i+1UL, j , xmm5 );
1242  (~C).store( i+1UL, j1, xmm6 );
1243  (~C).store( i+1UL, j2, xmm7 );
1244  (~C).store( i+1UL, j3, xmm8 );
1245  }
1246 
// Single leftover row of the row block: 1x4 accumulators.
1247  if( i < iend )
1248  {
1249  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1250  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1251  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1252  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
1253 
1254  SIMDType xmm1( (~C).load(i,j ) );
1255  SIMDType xmm2( (~C).load(i,j1) );
1256  SIMDType xmm3( (~C).load(i,j2) );
1257  SIMDType xmm4( (~C).load(i,j3) );
1258 
1259  for( size_t k=kbegin; k<kend; ++k ) {
1260  const SIMDType a1( set( A(i,k) ) );
1261  xmm1 = xmm1 + a1 * B.load(k,j );
1262  xmm2 = xmm2 + a1 * B.load(k,j1);
1263  xmm3 = xmm3 + a1 * B.load(k,j2);
1264  xmm4 = xmm4 + a1 * B.load(k,j3);
1265  }
1266 
1267  (~C).store( i, j , xmm1 );
1268  (~C).store( i, j1, xmm2 );
1269  (~C).store( i, j2, xmm3 );
1270  (~C).store( i, j3, xmm4 );
1271  }
1272  }
1273 
// Strips of two SIMD vectors (columns j, j1).
1274  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1275  {
1276  const size_t j1( j+SIMDSIZE );
1277 
1278  size_t i( ii );
1279 
// Four rows at a time: 4x2 accumulators.
1280  for( ; (i+4UL) <= iend; i+=4UL )
1281  {
1282  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1283  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1284  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1285  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1286 
1287  SIMDType xmm1( (~C).load(i ,j ) );
1288  SIMDType xmm2( (~C).load(i ,j1) );
1289  SIMDType xmm3( (~C).load(i+1UL,j ) );
1290  SIMDType xmm4( (~C).load(i+1UL,j1) );
1291  SIMDType xmm5( (~C).load(i+2UL,j ) );
1292  SIMDType xmm6( (~C).load(i+2UL,j1) );
1293  SIMDType xmm7( (~C).load(i+3UL,j ) );
1294  SIMDType xmm8( (~C).load(i+3UL,j1) );
1295 
1296  for( size_t k=kbegin; k<kend; ++k ) {
1297  const SIMDType a1( set( A(i ,k) ) );
1298  const SIMDType a2( set( A(i+1UL,k) ) );
1299  const SIMDType a3( set( A(i+2UL,k) ) );
1300  const SIMDType a4( set( A(i+3UL,k) ) );
1301  const SIMDType b1( B.load(k,j ) );
1302  const SIMDType b2( B.load(k,j1) );
1303  xmm1 = xmm1 + a1 * b1;
1304  xmm2 = xmm2 + a1 * b2;
1305  xmm3 = xmm3 + a2 * b1;
1306  xmm4 = xmm4 + a2 * b2;
1307  xmm5 = xmm5 + a3 * b1;
1308  xmm6 = xmm6 + a3 * b2;
1309  xmm7 = xmm7 + a4 * b1;
1310  xmm8 = xmm8 + a4 * b2;
1311  }
1312 
1313  (~C).store( i , j , xmm1 );
1314  (~C).store( i , j1, xmm2 );
1315  (~C).store( i+1UL, j , xmm3 );
1316  (~C).store( i+1UL, j1, xmm4 );
1317  (~C).store( i+2UL, j , xmm5 );
1318  (~C).store( i+2UL, j1, xmm6 );
1319  (~C).store( i+3UL, j , xmm7 );
1320  (~C).store( i+3UL, j1, xmm8 );
1321  }
1322 
// Two rows at a time: 2x2 accumulators.
1323  for( ; (i+2UL) <= iend; i+=2UL )
1324  {
1325  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1326  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1327  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1328  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1329 
1330  SIMDType xmm1( (~C).load(i ,j ) );
1331  SIMDType xmm2( (~C).load(i ,j1) );
1332  SIMDType xmm3( (~C).load(i+1UL,j ) );
1333  SIMDType xmm4( (~C).load(i+1UL,j1) );
1334 
1335  for( size_t k=kbegin; k<kend; ++k ) {
1336  const SIMDType a1( set( A(i ,k) ) );
1337  const SIMDType a2( set( A(i+1UL,k) ) );
1338  const SIMDType b1( B.load(k,j ) );
1339  const SIMDType b2( B.load(k,j1) );
1340  xmm1 = xmm1 + a1 * b1;
1341  xmm2 = xmm2 + a1 * b2;
1342  xmm3 = xmm3 + a2 * b1;
1343  xmm4 = xmm4 + a2 * b2;
1344  }
1345 
1346  (~C).store( i , j , xmm1 );
1347  (~C).store( i , j1, xmm2 );
1348  (~C).store( i+1UL, j , xmm3 );
1349  (~C).store( i+1UL, j1, xmm4 );
1350  }
1351 
// Single leftover row of the row block: 1x2 accumulators.
1352  if( i < iend )
1353  {
1354  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1355  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1356  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1357  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1358 
1359  SIMDType xmm1( (~C).load(i,j ) );
1360  SIMDType xmm2( (~C).load(i,j1) );
1361 
1362  for( size_t k=kbegin; k<kend; ++k ) {
1363  const SIMDType a1( set( A(i,k) ) );
1364  xmm1 = xmm1 + a1 * B.load(k,j );
1365  xmm2 = xmm2 + a1 * B.load(k,j1);
1366  }
1367 
1368  (~C).store( i, j , xmm1 );
1369  (~C).store( i, j1, xmm2 );
1370  }
1371  }
1372 
// Strips of a single SIMD vector, one row at a time.
1373  for( ; j<jpos; j+=SIMDSIZE )
1374  {
1375  for( size_t i=ii; i<iend; ++i )
1376  {
1377  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1378  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1379  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1380  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
1381 
1382  SIMDType xmm1( (~C).load(i,j) );
1383 
1384  for( size_t k=kbegin; k<kend; ++k ) {
1385  const SIMDType a1( set( A(i,k) ) );
1386  xmm1 = xmm1 + a1 * B.load(k,j);
1387  }
1388 
1389  (~C).store( i, j, xmm1 );
1390  }
1391  }
1392 
// Element-wise tail for the columns [jpos,jend); only entered if C or B is not padded.
1393  for( ; remainder && j<jend; ++j )
1394  {
1395  for( size_t i=ii; i<iend; ++i )
1396  {
1397  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1398  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1399  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1400  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
1401 
1402  ElementType value( (~C)(i,j) );
1403 
1404  for( size_t k=kbegin; k<kend; ++k ) {
1405  value += A(i,k) * B(k,j);
1406  }
1407 
1408  (~C)(i,j) = value;
1409  }
1410  }
1411  }
1412  }
1413  }
1414  }
1416  //**********************************************************************************************
1417 
1418  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
1432  template< typename MT3 // Type of the left-hand side target matrix
1433  , typename MT4 // Type of the left-hand side matrix operand
1434  , typename MT5 > // Type of the right-hand side matrix operand
1435  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1436  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1437  {
1438  selectSmallAssignKernel( ~C, A, B );
1439  }
1441  //**********************************************************************************************
1442 
1443  //**BLAS-based assignment to dense matrices (default)*******************************************
1456  template< typename MT3 // Type of the left-hand side target matrix
1457  , typename MT4 // Type of the left-hand side matrix operand
1458  , typename MT5 > // Type of the right-hand side matrix operand
1459  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
1460  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1461  {
1462  selectLargeAssignKernel( C, A, B );
1463  }
1465  //**********************************************************************************************
1466 
1467  //**BLAS-based assignment to dense matrices*****************************************************
1468 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1469 
1481  template< typename MT3 // Type of the left-hand side target matrix
1482  , typename MT4 // Type of the left-hand side matrix operand
1483  , typename MT5 > // Type of the right-hand side matrix operand
1484  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
1485  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1486  {
1487  typedef ElementType_<MT3> ET;
1488 
1489  if( IsTriangular<MT4>::value ) {
1490  assign( C, B );
1491  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1492  }
1493  else if( IsTriangular<MT5>::value ) {
1494  assign( C, A );
1495  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1496  }
1497  else {
1498  gemm( C, A, B, ET(1), ET(0) );
1499  }
1500  }
1502 #endif
1503  //**********************************************************************************************
1504 
1505  //**Assignment to sparse matrices***************************************************************
1518  template< typename MT // Type of the target sparse matrix
1519  , bool SO > // Storage order of the target sparse matrix
1520  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
1521  assign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1522  {
1524 
1525  typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
1526 
1532  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
1533 
1534  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1535  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1536 
1537  const TmpType tmp( serial( rhs ) );
1538  assign( ~lhs, tmp );
1539  }
1541  //**********************************************************************************************
1542 
1543  //**Restructuring assignment to column-major matrices*******************************************
1558  template< typename MT > // Type of the target matrix
1559  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
1560  assign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
1561  {
1563 
1565 
1566  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1567  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1568 
1569  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
1570  assign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
1571  else if( IsSymmetric<MT1>::value )
1572  assign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
1573  else
1574  assign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
1575  }
1577  //**********************************************************************************************
1578 
1579  //**Addition assignment to dense matrices*******************************************************
1592  template< typename MT // Type of the target dense matrix
1593  , bool SO > // Storage order of the target dense matrix
1594  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
1595  addAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1596  {
1598 
1599  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1600  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1601 
1602  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1603  return;
1604  }
1605 
1606  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1607  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1608 
1609  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1610  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1611  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1612  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1613  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1614  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1615 
1616  DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1617  }
1619  //**********************************************************************************************
1620 
1621  //**Addition assignment to dense matrices (kernel selection)************************************
1632  template< typename MT3 // Type of the left-hand side target matrix
1633  , typename MT4 // Type of the left-hand side matrix operand
1634  , typename MT5 > // Type of the right-hand side matrix operand
1635  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1636  {
1637  if( ( IsDiagonal<MT5>::value ) ||
1638  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1639  selectSmallAddAssignKernel( C, A, B );
1640  else
1641  selectBlasAddAssignKernel( C, A, B );
1642  }
1644  //**********************************************************************************************
1645 
1646  //**Default addition assignment to dense matrices (general/general)*****************************
1660  template< typename MT3 // Type of the left-hand side target matrix
1661  , typename MT4 // Type of the left-hand side matrix operand
1662  , typename MT5 > // Type of the right-hand side matrix operand
1663  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
1664  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1665  {
1666  const size_t M( A.rows() );
1667  const size_t N( B.columns() );
1668  const size_t K( A.columns() );
1669 
1670  for( size_t i=0UL; i<M; ++i )
1671  {
1672  const size_t kbegin( ( IsUpper<MT4>::value )
1673  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1674  :( 0UL ) );
1675  const size_t kend( ( IsLower<MT4>::value )
1676  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
1677  :( K ) );
1678  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1679 
1680  for( size_t k=kbegin; k<kend; ++k )
1681  {
1682  const size_t jbegin( ( IsUpper<MT5>::value )
1683  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
1684  :( 0UL ) );
1685  const size_t jend( ( IsLower<MT5>::value )
1686  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
1687  :( N ) );
1688  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1689 
1690  const size_t jnum( jend - jbegin );
1691  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1692 
1693  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1694  C(i,j ) += A(i,k) * B(k,j );
1695  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1696  }
1697  if( jpos < jend ) {
1698  C(i,jpos) += A(i,k) * B(k,jpos);
1699  }
1700  }
1701  }
1702  }
1704  //**********************************************************************************************
1705 
1706  //**Default addition assignment to dense matrices (general/diagonal)****************************
1720  template< typename MT3 // Type of the left-hand side target matrix
1721  , typename MT4 // Type of the left-hand side matrix operand
1722  , typename MT5 > // Type of the right-hand side matrix operand
1723  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
1724  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1725  {
1727 
1728  const size_t M( A.rows() );
1729  const size_t N( B.columns() );
1730 
1731  for( size_t i=0UL; i<M; ++i )
1732  {
1733  const size_t jbegin( ( IsUpper<MT4>::value )
1734  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1735  :( 0UL ) );
1736  const size_t jend( ( IsLower<MT4>::value )
1737  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
1738  :( N ) );
1739  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1740 
1741  const size_t jnum( jend - jbegin );
1742  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1743 
1744  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1745  C(i,j ) += A(i,j ) * B(j ,j );
1746  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
1747  }
1748  if( jpos < jend ) {
1749  C(i,jpos) += A(i,jpos) * B(jpos,jpos);
1750  }
1751  }
1752  }
1754  //**********************************************************************************************
1755 
1756  //**Default addition assignment to dense matrices (diagonal/general)****************************
1770  template< typename MT3 // Type of the left-hand side target matrix
1771  , typename MT4 // Type of the left-hand side matrix operand
1772  , typename MT5 > // Type of the right-hand side matrix operand
1773  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
1774  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1775  {
1777 
1778  const size_t M( A.rows() );
1779  const size_t N( B.columns() );
1780 
1781  for( size_t i=0UL; i<M; ++i )
1782  {
1783  const size_t jbegin( ( IsUpper<MT5>::value )
1784  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
1785  :( 0UL ) );
1786  const size_t jend( ( IsLower<MT5>::value )
1787  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
1788  :( N ) );
1789  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1790 
1791  const size_t jnum( jend - jbegin );
1792  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1793 
1794  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1795  C(i,j ) += A(i,i) * B(i,j );
1796  C(i,j+1UL) += A(i,i) * B(i,j+1UL);
1797  }
1798  if( jpos < jend ) {
1799  C(i,jpos) += A(i,i) * B(i,jpos);
1800  }
1801  }
1802  }
1804  //**********************************************************************************************
1805 
1806  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
1820  template< typename MT3 // Type of the left-hand side target matrix
1821  , typename MT4 // Type of the left-hand side matrix operand
1822  , typename MT5 > // Type of the right-hand side matrix operand
1823  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
1824  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1825  {
1827 
1828  for( size_t i=0UL; i<A.rows(); ++i ) {
1829  C(i,i) += A(i,i) * B(i,i);
1830  }
1831  }
1833  //**********************************************************************************************
1834 
1835  //**Default addition assignment to dense matrices (small matrices)******************************
1849  template< typename MT3 // Type of the left-hand side target matrix
1850  , typename MT4 // Type of the left-hand side matrix operand
1851  , typename MT5 > // Type of the right-hand side matrix operand
1852  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1853  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1854  {
1855  selectDefaultAddAssignKernel( C, A, B );
1856  }
1858  //**********************************************************************************************
1859 
1860  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized default addition assignment of a small dense matrix-dense matrix product to a
// row-major target matrix (C += A * B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// The columns of C are processed in strips of 8, 4, 2 and 1 SIMD vectors, followed by an
// element-wise loop for the columns that are left over. Within every strip the current values
// of C are loaded, the products over the inner index k are added and the sums are stored back.
// The k range of each tile is narrowed depending on the IsUpper/IsStrictlyUpper/IsLower/
// IsStrictlyLower traits of A and on the IsUpper/IsLower traits of B (presumably to skip
// structurally zero elements of triangular operands).
1875  template< typename MT3 // Type of the left-hand side target matrix
1876  , typename MT4 // Type of the left-hand side matrix operand
1877  , typename MT5 > // Type of the right-hand side matrix operand
1878  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1879  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1880  {
1881  const size_t M( A.rows() );
1882  const size_t N( B.columns() );
1883  const size_t K( A.columns() );
1884 
// An element-wise column tail is only required if C or B is not padded.
1885  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1886 
// End of the SIMD-processed columns: N rounded down to a multiple of SIMDSIZE if a tail loop
// is required (checked by the assertion below), N itself otherwise.
1887  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1888  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1889 
// The column index j is shared by all strip loops below; each loop continues where the
// previous one stopped.
1890  size_t j( 0UL );
1891 
// Strips of eight SIMD vectors, one row at a time.
1892  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1893  for( size_t i=0UL; i<M; ++i )
1894  {
1895  const size_t kbegin( ( IsUpper<MT4>::value )
1896  ?( ( IsLower<MT5>::value )
1897  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1898  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1899  :( IsLower<MT5>::value ? j : 0UL ) );
1900  const size_t kend( ( IsLower<MT4>::value )
1901  ?( ( IsUpper<MT5>::value )
1902  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1903  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1904  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
1905 
// Start from the current values of C, since the product is added to the target.
1906  SIMDType xmm1( (~C).load(i,j ) );
1907  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
1908  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
1909  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
1910  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
1911  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
1912  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
1913  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
1914 
// set() presumably broadcasts the scalar element of A to all lanes of a SIMD vector.
1915  for( size_t k=kbegin; k<kend; ++k ) {
1916  const SIMDType a1( set( A(i,k) ) );
1917  xmm1 = xmm1 + a1 * B.load(k,j );
1918  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
1919  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
1920  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
1921  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
1922  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
1923  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
1924  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
1925  }
1926 
1927  (~C).store( i, j , xmm1 );
1928  (~C).store( i, j+SIMDSIZE , xmm2 );
1929  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1930  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1931  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1932  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1933  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1934  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
1935  }
1936  }
1937 
// Strips of four SIMD vectors.
1938  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1939  {
1940  size_t i( 0UL );
1941 
// Two rows at a time: 2x4 accumulators.
1942  for( ; (i+2UL) <= M; i+=2UL )
1943  {
1944  const size_t kbegin( ( IsUpper<MT4>::value )
1945  ?( ( IsLower<MT5>::value )
1946  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1947  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1948  :( IsLower<MT5>::value ? j : 0UL ) );
1949  const size_t kend( ( IsLower<MT4>::value )
1950  ?( ( IsUpper<MT5>::value )
1951  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1952  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1953  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
1954 
1955  SIMDType xmm1( (~C).load(i ,j ) );
1956  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
1957  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
1958  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
1959  SIMDType xmm5( (~C).load(i+1UL,j ) );
1960  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
1961  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
1962  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
1963 
1964  for( size_t k=kbegin; k<kend; ++k ) {
1965  const SIMDType a1( set( A(i ,k) ) );
1966  const SIMDType a2( set( A(i+1UL,k) ) );
1967  const SIMDType b1( B.load(k,j ) );
1968  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1969  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1970  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1971  xmm1 = xmm1 + a1 * b1;
1972  xmm2 = xmm2 + a1 * b2;
1973  xmm3 = xmm3 + a1 * b3;
1974  xmm4 = xmm4 + a1 * b4;
1975  xmm5 = xmm5 + a2 * b1;
1976  xmm6 = xmm6 + a2 * b2;
1977  xmm7 = xmm7 + a2 * b3;
1978  xmm8 = xmm8 + a2 * b4;
1979  }
1980 
1981  (~C).store( i , j , xmm1 );
1982  (~C).store( i , j+SIMDSIZE , xmm2 );
1983  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1984  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1985  (~C).store( i+1UL, j , xmm5 );
1986  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1987  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1988  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1989  }
1990 
// Single leftover row. Note that the upper bound of k is not narrowed by the IsLower trait of
// A here; only the IsUpper trait of B is taken into account.
1991  if( i < M )
1992  {
1993  const size_t kbegin( ( IsUpper<MT4>::value )
1994  ?( ( IsLower<MT5>::value )
1995  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1996  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1997  :( IsLower<MT5>::value ? j : 0UL ) );
1998  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1999 
2000  SIMDType xmm1( (~C).load(i,j ) );
2001  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2002  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2003  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2004 
2005  for( size_t k=kbegin; k<kend; ++k ) {
2006  const SIMDType a1( set( A(i,k) ) );
2007  xmm1 = xmm1 + a1 * B.load(k,j );
2008  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
2009  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
2010  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
2011  }
2012 
2013  (~C).store( i, j , xmm1 );
2014  (~C).store( i, j+SIMDSIZE , xmm2 );
2015  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2016  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2017  }
2018  }
2019 
// Strips of two SIMD vectors.
2020  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2021  {
2022  size_t i( 0UL );
2023 
// Two rows at a time: 2x2 accumulators.
2024  for( ; (i+2UL) <= M; i+=2UL )
2025  {
2026  const size_t kbegin( ( IsUpper<MT4>::value )
2027  ?( ( IsLower<MT5>::value )
2028  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2029  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2030  :( IsLower<MT5>::value ? j : 0UL ) );
2031  const size_t kend( ( IsLower<MT4>::value )
2032  ?( ( IsUpper<MT5>::value )
2033  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
2034  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2035  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
2036 
2037  SIMDType xmm1( (~C).load(i ,j ) );
2038  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2039  SIMDType xmm3( (~C).load(i+1UL,j ) );
2040  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2041 
2042  for( size_t k=kbegin; k<kend; ++k ) {
2043  const SIMDType a1( set( A(i ,k) ) );
2044  const SIMDType a2( set( A(i+1UL,k) ) );
2045  const SIMDType b1( B.load(k,j ) );
2046  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2047  xmm1 = xmm1 + a1 * b1;
2048  xmm2 = xmm2 + a1 * b2;
2049  xmm3 = xmm3 + a2 * b1;
2050  xmm4 = xmm4 + a2 * b2;
2051  }
2052 
2053  (~C).store( i , j , xmm1 );
2054  (~C).store( i , j+SIMDSIZE, xmm2 );
2055  (~C).store( i+1UL, j , xmm3 );
2056  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
2057  }
2058 
// Single leftover row: 1x2 accumulators.
2059  if( i < M )
2060  {
2061  const size_t kbegin( ( IsUpper<MT4>::value )
2062  ?( ( IsLower<MT5>::value )
2063  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2064  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2065  :( IsLower<MT5>::value ? j : 0UL ) );
2066  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
2067 
2068  SIMDType xmm1( (~C).load(i,j ) );
2069  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
2070 
2071  for( size_t k=kbegin; k<kend; ++k ) {
2072  const SIMDType a1( set( A(i,k) ) );
2073  xmm1 = xmm1 + a1 * B.load(k,j );
2074  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
2075  }
2076 
2077  (~C).store( i, j , xmm1 );
2078  (~C).store( i, j+SIMDSIZE, xmm2 );
2079  }
2080  }
2081 
// Strips of a single SIMD vector.
2082  for( ; j<jpos; j+=SIMDSIZE )
2083  {
2084  size_t i( 0UL );
2085 
// Two rows at a time; the upper bound of k depends only on the IsLower trait of A.
2086  for( ; (i+2UL) <= M; i+=2UL )
2087  {
2088  const size_t kbegin( ( IsUpper<MT4>::value )
2089  ?( ( IsLower<MT5>::value )
2090  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2091  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2092  :( IsLower<MT5>::value ? j : 0UL ) );
2093  const size_t kend( ( IsLower<MT4>::value )
2094  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2095  :( K ) );
2096 
2097  SIMDType xmm1( (~C).load(i ,j) );
2098  SIMDType xmm2( (~C).load(i+1UL,j) );
2099 
2100  for( size_t k=kbegin; k<kend; ++k ) {
2101  const SIMDType b1( B.load(k,j) );
2102  xmm1 = xmm1 + set( A(i ,k) ) * b1;
2103  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
2104  }
2105 
2106  (~C).store( i , j, xmm1 );
2107  (~C).store( i+1UL, j, xmm2 );
2108  }
2109 
// Single leftover row; k runs up to K without any narrowing of the upper bound.
2110  if( i < M )
2111  {
2112  const size_t kbegin( ( IsUpper<MT4>::value )
2113  ?( ( IsLower<MT5>::value )
2114  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2115  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2116  :( IsLower<MT5>::value ? j : 0UL ) );
2117 
2118  SIMDType xmm1( (~C).load(i,j) );
2119 
2120  for( size_t k=kbegin; k<K; ++k ) {
2121  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
2122  }
2123 
2124  (~C).store( i, j, xmm1 );
2125  }
2126  }
2127 
// Element-wise tail for the columns [jpos,N); only entered if C or B is not padded.
2128  for( ; remainder && j<N; ++j )
2129  {
2130  size_t i( 0UL );
2131 
// Two rows at a time.
2132  for( ; (i+2UL) <= M; i+=2UL )
2133  {
2134  const size_t kbegin( ( IsUpper<MT4>::value )
2135  ?( ( IsLower<MT5>::value )
2136  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2137  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2138  :( IsLower<MT5>::value ? j : 0UL ) );
2139  const size_t kend( ( IsLower<MT4>::value )
2140  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2141  :( K ) );
2142 
2143  ElementType value1( (~C)(i ,j) );
2144  ElementType value2( (~C)(i+1UL,j) );;
2145 
2146  for( size_t k=kbegin; k<kend; ++k ) {
2147  value1 += A(i ,k) * B(k,j);
2148  value2 += A(i+1UL,k) * B(k,j);
2149  }
2150 
2151  (~C)(i ,j) = value1;
2152  (~C)(i+1UL,j) = value2;
2153  }
2154 
// Single leftover row; k runs up to K without any narrowing of the upper bound.
2155  if( i < M )
2156  {
2157  const size_t kbegin( ( IsUpper<MT4>::value )
2158  ?( ( IsLower<MT5>::value )
2159  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2160  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2161  :( IsLower<MT5>::value ? j : 0UL ) );
2162 
2163  ElementType value( (~C)(i,j) );
2164 
2165  for( size_t k=kbegin; k<K; ++k ) {
2166  value += A(i,k) * B(k,j);
2167  }
2168 
2169  (~C)(i,j) = value;
2170  }
2171  }
2172  }
2174  //**********************************************************************************************
2175 
2176  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2191  template< typename MT3 // Type of the left-hand side target matrix
2192  , typename MT4 // Type of the left-hand side matrix operand
2193  , typename MT5 > // Type of the right-hand side matrix operand
2194  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2195  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2196  {
2201 
2202  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2203  const OppositeType_<MT4> tmp( serial( A ) );
2204  addAssign( ~C, tmp * B );
2205  }
2206  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2207  const OppositeType_<MT5> tmp( serial( B ) );
2208  addAssign( ~C, A * tmp );
2209  }
2210  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2211  const OppositeType_<MT4> tmp( serial( A ) );
2212  addAssign( ~C, tmp * B );
2213  }
2214  else {
2215  const OppositeType_<MT5> tmp( serial( B ) );
2216  addAssign( ~C, A * tmp );
2217  }
2218  }
2220  //**********************************************************************************************
2221 
2222  //**Default addition assignment to dense matrices (large matrices)******************************
2236  template< typename MT3 // Type of the left-hand side target matrix
2237  , typename MT4 // Type of the left-hand side matrix operand
2238  , typename MT5 > // Type of the right-hand side matrix operand
2239  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2240  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2241  {
2242  selectDefaultAddAssignKernel( C, A, B );
2243  }
2245  //**********************************************************************************************
2246 
2247  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
// Vectorized, blocked kernel for the addition assignment C += A * B to a row-major dense
// matrix (large matrices); enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// C is the target matrix, A the left-hand side operand, B the right-hand side operand.
//
// The loops are tiled by DMATDMATMULT_JBLOCK_SIZE (columns of C), DMATDMATMULT_IBLOCK_SIZE
// (rows of C) and DMATDMATMULT_KBLOCK_SIZE (inner dimension). Within a tile the columns are
// processed in strips of four, two and one SIMD vectors. Every strip loads the current
// values of C, accumulates the products over the k range of the tile and stores the result
// back to C. Columns that do not fill a complete SIMD vector are handled element-wise.
2262  template< typename MT3 // Type of the left-hand side target matrix
2263  , typename MT4 // Type of the left-hand side matrix operand
2264  , typename MT5 > // Type of the right-hand side matrix operand
2265  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2266  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2267  {
2268  const size_t M( A.rows() );
2269  const size_t N( B.columns() );
2270  const size_t K( A.columns() );
2271 
// The element-wise clean-up loop at the end is only entered if C or B is not padded.
2272  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
2273 
2274  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
2275  {
2276  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
2277 
// jpos is the end of the SIMD-processed column range of this column block. With a
// remainder it is jend rounded down to a multiple of SIMDSIZE (checked by the assertion).
2278  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2279  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2280 
2281  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
2282  {
2283  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
2284 
2285  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
2286  {
2287  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
2288 
2289  size_t j( jj );
2290 
// Strips of four SIMD vectors: two rows of C per iteration, then a possible last row.
2291  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2292  {
2293  const size_t j1( j+SIMDSIZE );
2294  const size_t j2( j+SIMDSIZE*2UL );
2295  const size_t j3( j+SIMDSIZE*3UL );
2296 
2297  size_t i( ii );
2298 
2299  for( ; (i+2UL) <= iend; i+=2UL )
2300  {
// k range of this step: clipped to the current k block [kk,ktmp) and narrowed by the
// structure traits of the operands (k >= i for upper A, k >= j for lower B,
// k < i+2 for lower A, k < j+SIMDSIZE*4 for upper B). The same scheme, adapted to the
// number of rows and the strip width, is used by all kbegin/kend pairs below.
2301  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2302  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2303  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
2304  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
2305 
// Accumulators start from the current content of C (addition assignment).
2306  SIMDType xmm1( (~C).load(i ,j ) );
2307  SIMDType xmm2( (~C).load(i ,j1) );
2308  SIMDType xmm3( (~C).load(i ,j2) );
2309  SIMDType xmm4( (~C).load(i ,j3) );
2310  SIMDType xmm5( (~C).load(i+1UL,j ) );
2311  SIMDType xmm6( (~C).load(i+1UL,j1) );
2312  SIMDType xmm7( (~C).load(i+1UL,j2) );
2313  SIMDType xmm8( (~C).load(i+1UL,j3) );
2314 
2315  for( size_t k=kbegin; k<kend; ++k ) {
2316  const SIMDType a1( set( A(i ,k) ) );
2317  const SIMDType a2( set( A(i+1UL,k) ) );
2318  const SIMDType b1( B.load(k,j ) );
2319  const SIMDType b2( B.load(k,j1) );
2320  const SIMDType b3( B.load(k,j2) );
2321  const SIMDType b4( B.load(k,j3) );
2322  xmm1 = xmm1 + a1 * b1;
2323  xmm2 = xmm2 + a1 * b2;
2324  xmm3 = xmm3 + a1 * b3;
2325  xmm4 = xmm4 + a1 * b4;
2326  xmm5 = xmm5 + a2 * b1;
2327  xmm6 = xmm6 + a2 * b2;
2328  xmm7 = xmm7 + a2 * b3;
2329  xmm8 = xmm8 + a2 * b4;
2330  }
2331 
2332  (~C).store( i , j , xmm1 );
2333  (~C).store( i , j1, xmm2 );
2334  (~C).store( i , j2, xmm3 );
2335  (~C).store( i , j3, xmm4 );
2336  (~C).store( i+1UL, j , xmm5 );
2337  (~C).store( i+1UL, j1, xmm6 );
2338  (~C).store( i+1UL, j2, xmm7 );
2339  (~C).store( i+1UL, j3, xmm8 );
2340  }
2341 
// Last row of the row block if the number of rows in the block is odd.
2342  if( i < iend )
2343  {
2344  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2345  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2346  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2347  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
2348 
2349  SIMDType xmm1( (~C).load(i,j ) );
2350  SIMDType xmm2( (~C).load(i,j1) );
2351  SIMDType xmm3( (~C).load(i,j2) );
2352  SIMDType xmm4( (~C).load(i,j3) );
2353 
2354  for( size_t k=kbegin; k<kend; ++k ) {
2355  const SIMDType a1( set( A(i,k) ) );
2356  xmm1 = xmm1 + a1 * B.load(k,j );
2357  xmm2 = xmm2 + a1 * B.load(k,j1);
2358  xmm3 = xmm3 + a1 * B.load(k,j2);
2359  xmm4 = xmm4 + a1 * B.load(k,j3);
2360  }
2361 
2362  (~C).store( i, j , xmm1 );
2363  (~C).store( i, j1, xmm2 );
2364  (~C).store( i, j2, xmm3 );
2365  (~C).store( i, j3, xmm4 );
2366  }
2367  }
2368 
// Strips of two SIMD vectors: four rows, then two rows, then a possible last row.
2369  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2370  {
2371  const size_t j1( j+SIMDSIZE );
2372 
2373  size_t i( ii );
2374 
2375  for( ; (i+4UL) <= iend; i+=4UL )
2376  {
2377  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2378  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2379  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
2380  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
2381 
2382  SIMDType xmm1( (~C).load(i ,j ) );
2383  SIMDType xmm2( (~C).load(i ,j1) );
2384  SIMDType xmm3( (~C).load(i+1UL,j ) );
2385  SIMDType xmm4( (~C).load(i+1UL,j1) );
2386  SIMDType xmm5( (~C).load(i+2UL,j ) );
2387  SIMDType xmm6( (~C).load(i+2UL,j1) );
2388  SIMDType xmm7( (~C).load(i+3UL,j ) );
2389  SIMDType xmm8( (~C).load(i+3UL,j1) );
2390 
2391  for( size_t k=kbegin; k<kend; ++k ) {
2392  const SIMDType a1( set( A(i ,k) ) );
2393  const SIMDType a2( set( A(i+1UL,k) ) );
2394  const SIMDType a3( set( A(i+2UL,k) ) );
2395  const SIMDType a4( set( A(i+3UL,k) ) );
2396  const SIMDType b1( B.load(k,j ) );
2397  const SIMDType b2( B.load(k,j1) );
2398  xmm1 = xmm1 + a1 * b1;
2399  xmm2 = xmm2 + a1 * b2;
2400  xmm3 = xmm3 + a2 * b1;
2401  xmm4 = xmm4 + a2 * b2;
2402  xmm5 = xmm5 + a3 * b1;
2403  xmm6 = xmm6 + a3 * b2;
2404  xmm7 = xmm7 + a4 * b1;
2405  xmm8 = xmm8 + a4 * b2;
2406  }
2407 
2408  (~C).store( i , j , xmm1 );
2409  (~C).store( i , j1, xmm2 );
2410  (~C).store( i+1UL, j , xmm3 );
2411  (~C).store( i+1UL, j1, xmm4 );
2412  (~C).store( i+2UL, j , xmm5 );
2413  (~C).store( i+2UL, j1, xmm6 );
2414  (~C).store( i+3UL, j , xmm7 );
2415  (~C).store( i+3UL, j1, xmm8 );
2416  }
2417 
2418  for( ; (i+2UL) <= iend; i+=2UL )
2419  {
2420  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2421  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2422  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
2423  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
2424 
2425  SIMDType xmm1( (~C).load(i ,j ) );
2426  SIMDType xmm2( (~C).load(i ,j1) );
2427  SIMDType xmm3( (~C).load(i+1UL,j ) );
2428  SIMDType xmm4( (~C).load(i+1UL,j1) );
2429 
2430  for( size_t k=kbegin; k<kend; ++k ) {
2431  const SIMDType a1( set( A(i ,k) ) );
2432  const SIMDType a2( set( A(i+1UL,k) ) );
2433  const SIMDType b1( B.load(k,j ) );
2434  const SIMDType b2( B.load(k,j1) );
2435  xmm1 = xmm1 + a1 * b1;
2436  xmm2 = xmm2 + a1 * b2;
2437  xmm3 = xmm3 + a2 * b1;
2438  xmm4 = xmm4 + a2 * b2;
2439  }
2440 
2441  (~C).store( i , j , xmm1 );
2442  (~C).store( i , j1, xmm2 );
2443  (~C).store( i+1UL, j , xmm3 );
2444  (~C).store( i+1UL, j1, xmm4 );
2445  }
2446 
2447  if( i < iend )
2448  {
2449  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2450  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2451  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2452  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
2453 
2454  SIMDType xmm1( (~C).load(i,j ) );
2455  SIMDType xmm2( (~C).load(i,j1) );
2456 
2457  for( size_t k=kbegin; k<kend; ++k ) {
2458  const SIMDType a1( set( A(i,k) ) );
2459  xmm1 = xmm1 + a1 * B.load(k,j );
2460  xmm2 = xmm2 + a1 * B.load(k,j1);
2461  }
2462 
2463  (~C).store( i, j , xmm1 );
2464  (~C).store( i, j1, xmm2 );
2465  }
2466  }
2467 
// Remaining single SIMD vector strips, processed row by row.
2468  for( ; j<jpos; j+=SIMDSIZE )
2469  {
2470  for( size_t i=ii; i<iend; ++i )
2471  {
2472  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2473  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2474  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2475  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
2476 
2477  SIMDType xmm1( (~C).load(i,j) );
2478 
2479  for( size_t k=kbegin; k<kend; ++k ) {
2480  const SIMDType a1( set( A(i,k) ) );
2481  xmm1 = xmm1 + a1 * B.load(k,j);
2482  }
2483 
2484  (~C).store( i, j, xmm1 );
2485  }
2486  }
2487 
// Element-wise clean-up for the columns in [jpos,jend); skipped entirely if there is
// no remainder.
2488  for( ; remainder && j<jend; ++j )
2489  {
2490  for( size_t i=ii; i<iend; ++i )
2491  {
2492  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2493  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2494  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2495  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
2496 
2497  ElementType value( (~C)(i,j) );
2498 
2499  for( size_t k=kbegin; k<kend; ++k ) {
2500  value += A(i,k) * B(k,j);
2501  }
2502 
2503  (~C)(i,j) = value;
2504  }
2505  }
2506  }
2507  }
2508  }
2509  }
2511  //**********************************************************************************************
2512 
2513  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
2527  template< typename MT3 // Type of the left-hand side target matrix
2528  , typename MT4 // Type of the left-hand side matrix operand
2529  , typename MT5 > // Type of the right-hand side matrix operand
2530  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2531  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2532  {
2533  selectSmallAddAssignKernel( ~C, A, B );
2534  }
2536  //**********************************************************************************************
2537 
2538  //**BLAS-based addition assignment to dense matrices (default)**********************************
2552  template< typename MT3 // Type of the left-hand side target matrix
2553  , typename MT4 // Type of the left-hand side matrix operand
2554  , typename MT5 > // Type of the right-hand side matrix operand
2555  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
2556  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2557  {
2558  selectLargeAddAssignKernel( C, A, B );
2559  }
2561  //**********************************************************************************************
2562 
2563  //**BLAS-based addition assignment to dense matrices********************************************
2564 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2565 
2578  template< typename MT3 // Type of the left-hand side target matrix
2579  , typename MT4 // Type of the left-hand side matrix operand
2580  , typename MT5 > // Type of the right-hand side matrix operand
2581  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
2582  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2583  {
2584  typedef ElementType_<MT3> ET;
2585 
2586  if( IsTriangular<MT4>::value ) {
2587  ResultType_<MT3> tmp( serial( B ) );
2588  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2589  addAssign( C, tmp );
2590  }
2591  else if( IsTriangular<MT5>::value ) {
2592  ResultType_<MT3> tmp( serial( A ) );
2593  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2594  addAssign( C, tmp );
2595  }
2596  else {
2597  gemm( C, A, B, ET(1), ET(1) );
2598  }
2599  }
2601 #endif
2602  //**********************************************************************************************
2603 
2604  //**Restructuring addition assignment to column-major matrices**********************************
2619  template< typename MT > // Type of the target matrix
2620  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
2621  addAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
2622  {
2624 
2626 
2627  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2628  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2629 
2630  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
2631  addAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
2632  else if( IsSymmetric<MT1>::value )
2633  addAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
2634  else
2635  addAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
2636  }
2638  //**********************************************************************************************
2639 
2640  //**Addition assignment to sparse matrices******************************************************
2641  // No special implementation for the addition assignment to sparse matrices.
2642  //**********************************************************************************************
2643 
2644  //**Subtraction assignment to dense matrices****************************************************
2657  template< typename MT // Type of the target dense matrix
2658  , bool SO > // Storage order of the target dense matrix
2659  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
2660  subAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
2661  {
2663 
2664  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2665  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2666 
2667  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2668  return;
2669  }
2670 
2671  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2672  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2673 
2674  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2675  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2676  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2677  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2678  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2679  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2680 
2681  DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
2682  }
2684  //**********************************************************************************************
2685 
2686  //**Subtraction assignment to dense matrices (kernel selection)*********************************
2697  template< typename MT3 // Type of the left-hand side target matrix
2698  , typename MT4 // Type of the left-hand side matrix operand
2699  , typename MT5 > // Type of the right-hand side matrix operand
2700  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2701  {
2702  if( ( IsDiagonal<MT5>::value ) ||
2703  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
2704  selectSmallSubAssignKernel( C, A, B );
2705  else
2706  selectBlasSubAssignKernel( C, A, B );
2707  }
2709  //**********************************************************************************************
2710 
2711  //**Default subtraction assignment to dense matrices (general/general)**************************
2725  template< typename MT3 // Type of the left-hand side target matrix
2726  , typename MT4 // Type of the left-hand side matrix operand
2727  , typename MT5 > // Type of the right-hand side matrix operand
2728  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2729  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2730  {
2731  const size_t M( A.rows() );
2732  const size_t N( B.columns() );
2733  const size_t K( A.columns() );
2734 
2735  for( size_t i=0UL; i<M; ++i )
2736  {
2737  const size_t kbegin( ( IsUpper<MT4>::value )
2738  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2739  :( 0UL ) );
2740  const size_t kend( ( IsLower<MT4>::value )
2741  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2742  :( K ) );
2743  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2744 
2745  for( size_t k=kbegin; k<kend; ++k )
2746  {
2747  const size_t jbegin( ( IsUpper<MT5>::value )
2748  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2749  :( 0UL ) );
2750  const size_t jend( ( IsLower<MT5>::value )
2751  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
2752  :( N ) );
2753  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2754 
2755  const size_t jnum( jend - jbegin );
2756  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2757 
2758  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2759  C(i,j ) -= A(i,k) * B(k,j );
2760  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
2761  }
2762  if( jpos < jend ) {
2763  C(i,jpos) -= A(i,k) * B(k,jpos);
2764  }
2765  }
2766  }
2767  }
2769  //**********************************************************************************************
2770 
2771  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
2785  template< typename MT3 // Type of the left-hand side target matrix
2786  , typename MT4 // Type of the left-hand side matrix operand
2787  , typename MT5 > // Type of the right-hand side matrix operand
2788  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2789  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2790  {
2792 
2793  const size_t M( A.rows() );
2794  const size_t N( B.columns() );
2795 
2796  for( size_t i=0UL; i<M; ++i )
2797  {
2798  const size_t jbegin( ( IsUpper<MT4>::value )
2799  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2800  :( 0UL ) );
2801  const size_t jend( ( IsLower<MT4>::value )
2802  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2803  :( N ) );
2804  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2805 
2806  const size_t jnum( jend - jbegin );
2807  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2808 
2809  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2810  C(i,j ) -= A(i,j ) * B(j ,j );
2811  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
2812  }
2813  if( jpos < jend ) {
2814  C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
2815  }
2816  }
2817  }
2819  //**********************************************************************************************
2820 
2821  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
2835  template< typename MT3 // Type of the left-hand side target matrix
2836  , typename MT4 // Type of the left-hand side matrix operand
2837  , typename MT5 > // Type of the right-hand side matrix operand
2838  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2839  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2840  {
2842 
2843  const size_t M( A.rows() );
2844  const size_t N( B.columns() );
2845 
2846  for( size_t i=0UL; i<M; ++i )
2847  {
2848  const size_t jbegin( ( IsUpper<MT5>::value )
2849  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2850  :( 0UL ) );
2851  const size_t jend( ( IsLower<MT5>::value )
2852  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
2853  :( N ) );
2854  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2855 
2856  const size_t jnum( jend - jbegin );
2857  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2858 
2859  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2860  C(i,j ) -= A(i,i) * B(i,j );
2861  C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
2862  }
2863  if( jpos < jend ) {
2864  C(i,jpos) -= A(i,i) * B(i,jpos);
2865  }
2866  }
2867  }
2869  //**********************************************************************************************
2870 
2871  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
2885  template< typename MT3 // Type of the left-hand side target matrix
2886  , typename MT4 // Type of the left-hand side matrix operand
2887  , typename MT5 > // Type of the right-hand side matrix operand
2888  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2889  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2890  {
2892 
2893  for( size_t i=0UL; i<A.rows(); ++i ) {
2894  C(i,i) -= A(i,i) * B(i,i);
2895  }
2896  }
2898  //**********************************************************************************************
2899 
2900  //**Default subtraction assignment to dense matrices (small matrices)***************************
2914  template< typename MT3 // Type of the left-hand side target matrix
2915  , typename MT4 // Type of the left-hand side matrix operand
2916  , typename MT5 > // Type of the right-hand side matrix operand
2917  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2918  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2919  {
2920  selectDefaultSubAssignKernel( C, A, B );
2921  }
2923  //**********************************************************************************************
2924 
2925  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Vectorized kernel for the subtraction assignment C -= A * B to a row-major dense matrix
// (small matrices); enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// C is the target matrix, A the left-hand side operand, B the right-hand side operand.
//
// The columns of C are processed in strips of eight, four, two and one SIMD vectors,
// followed by an element-wise loop over the columns that do not fill a complete SIMD
// vector. Every step loads the current values of C, subtracts the products over the
// selected k range and stores the result back to C.
2940  template< typename MT3 // Type of the left-hand side target matrix
2941  , typename MT4 // Type of the left-hand side matrix operand
2942  , typename MT5 > // Type of the right-hand side matrix operand
2943  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2944  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2945  {
2946  const size_t M( A.rows() );
2947  const size_t N( B.columns() );
2948  const size_t K( A.columns() );
2949 
// The element-wise clean-up loop at the end is only entered if C or B is not padded.
2950  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
2951 
// jpos is the end of the SIMD-processed column range. With a remainder it is N rounded
// down to a multiple of SIMDSIZE (checked by the assertion).
2952  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2953  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2954 
2955  size_t j( 0UL );
2956 
// Strips of eight SIMD vectors, one row of C per iteration.
2957  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2958  for( size_t i=0UL; i<M; ++i )
2959  {
// k range of this step, narrowed by the structure traits of the operands:
// the lower bound depends on IsUpper/IsStrictlyUpper of A and IsLower of B,
// the upper bound on IsLower/IsStrictlyLower of A and IsUpper of B.
// The same scheme is used for the kbegin/kend values of the following loops.
2960  const size_t kbegin( ( IsUpper<MT4>::value )
2961  ?( ( IsLower<MT5>::value )
2962  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2963  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2964  :( IsLower<MT5>::value ? j : 0UL ) );
2965  const size_t kend( ( IsLower<MT4>::value )
2966  ?( ( IsUpper<MT5>::value )
2967  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
2968  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2969  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
2970 
// Accumulators start from the current content of C (subtraction assignment).
2971  SIMDType xmm1( (~C).load(i,j ) );
2972  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2973  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2974  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2975  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2976  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
2977  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
2978  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
2979 
2980  for( size_t k=kbegin; k<kend; ++k ) {
2981  const SIMDType a1( set( A(i,k) ) );
2982  xmm1 = xmm1 - a1 * B.load(k,j );
2983  xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE );
2984  xmm3 = xmm3 - a1 * B.load(k,j+SIMDSIZE*2UL);
2985  xmm4 = xmm4 - a1 * B.load(k,j+SIMDSIZE*3UL);
2986  xmm5 = xmm5 - a1 * B.load(k,j+SIMDSIZE*4UL);
2987  xmm6 = xmm6 - a1 * B.load(k,j+SIMDSIZE*5UL);
2988  xmm7 = xmm7 - a1 * B.load(k,j+SIMDSIZE*6UL);
2989  xmm8 = xmm8 - a1 * B.load(k,j+SIMDSIZE*7UL);
2990  }
2991 
2992  (~C).store( i, j , xmm1 );
2993  (~C).store( i, j+SIMDSIZE , xmm2 );
2994  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2995  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2996  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2997  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
2998  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
2999  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
3000  }
3001  }
3002 
// Strips of four SIMD vectors: two rows of C per iteration, then a possible last row.
3003  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3004  {
3005  size_t i( 0UL );
3006 
3007  for( ; (i+2UL) <= M; i+=2UL )
3008  {
3009  const size_t kbegin( ( IsUpper<MT4>::value )
3010  ?( ( IsLower<MT5>::value )
3011  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3012  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3013  :( IsLower<MT5>::value ? j : 0UL ) );
3014  const size_t kend( ( IsLower<MT4>::value )
3015  ?( ( IsUpper<MT5>::value )
3016  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3017  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3018  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
3019 
3020  SIMDType xmm1( (~C).load(i ,j ) );
3021  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3022  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3023  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
3024  SIMDType xmm5( (~C).load(i+1UL,j ) );
3025  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
3026  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3027  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3028 
3029  for( size_t k=kbegin; k<kend; ++k ) {
3030  const SIMDType a1( set( A(i ,k) ) );
3031  const SIMDType a2( set( A(i+1UL,k) ) );
3032  const SIMDType b1( B.load(k,j ) );
3033  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3034  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3035  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3036  xmm1 = xmm1 - a1 * b1;
3037  xmm2 = xmm2 - a1 * b2;
3038  xmm3 = xmm3 - a1 * b3;
3039  xmm4 = xmm4 - a1 * b4;
3040  xmm5 = xmm5 - a2 * b1;
3041  xmm6 = xmm6 - a2 * b2;
3042  xmm7 = xmm7 - a2 * b3;
3043  xmm8 = xmm8 - a2 * b4;
3044  }
3045 
3046  (~C).store( i , j , xmm1 );
3047  (~C).store( i , j+SIMDSIZE , xmm2 );
3048  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3049  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3050  (~C).store( i+1UL, j , xmm5 );
3051  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
3052  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3053  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3054  }
3055 
// Last row if M is odd. Note that the upper k bound of this step only depends on
// IsUpper of B; it is not narrowed by IsLower of A.
3056  if( i < M )
3057  {
3058  const size_t kbegin( ( IsUpper<MT4>::value )
3059  ?( ( IsLower<MT5>::value )
3060  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3061  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3062  :( IsLower<MT5>::value ? j : 0UL ) );
3063  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3064 
3065  SIMDType xmm1( (~C).load(i,j ) );
3066  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3067  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3068  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3069 
3070  for( size_t k=kbegin; k<kend; ++k ) {
3071  const SIMDType a1( set( A(i,k) ) );
3072  xmm1 = xmm1 - a1 * B.load(k,j );
3073  xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE );
3074  xmm3 = xmm3 - a1 * B.load(k,j+SIMDSIZE*2UL);
3075  xmm4 = xmm4 - a1 * B.load(k,j+SIMDSIZE*3UL);
3076  }
3077 
3078  (~C).store( i, j , xmm1 );
3079  (~C).store( i, j+SIMDSIZE , xmm2 );
3080  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3081  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3082  }
3083  }
3084 
// Strips of two SIMD vectors: two rows of C per iteration, then a possible last row.
3085  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3086  {
3087  size_t i( 0UL );
3088 
3089  for( ; (i+2UL) <= M; i+=2UL )
3090  {
3091  const size_t kbegin( ( IsUpper<MT4>::value )
3092  ?( ( IsLower<MT5>::value )
3093  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3094  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3095  :( IsLower<MT5>::value ? j : 0UL ) );
3096  const size_t kend( ( IsLower<MT4>::value )
3097  ?( ( IsUpper<MT5>::value )
3098  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3099  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3100  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3101 
3102  SIMDType xmm1( (~C).load(i ,j ) );
3103  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3104  SIMDType xmm3( (~C).load(i+1UL,j ) );
3105  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3106 
3107  for( size_t k=kbegin; k<kend; ++k ) {
3108  const SIMDType a1( set( A(i ,k) ) );
3109  const SIMDType a2( set( A(i+1UL,k) ) );
3110  const SIMDType b1( B.load(k,j ) );
3111  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3112  xmm1 = xmm1 - a1 * b1;
3113  xmm2 = xmm2 - a1 * b2;
3114  xmm3 = xmm3 - a2 * b1;
3115  xmm4 = xmm4 - a2 * b2;
3116  }
3117 
3118  (~C).store( i , j , xmm1 );
3119  (~C).store( i , j+SIMDSIZE, xmm2 );
3120  (~C).store( i+1UL, j , xmm3 );
3121  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3122  }
3123 
3124  if( i < M )
3125  {
3126  const size_t kbegin( ( IsUpper<MT4>::value )
3127  ?( ( IsLower<MT5>::value )
3128  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3129  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3130  :( IsLower<MT5>::value ? j : 0UL ) );
3131  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
3132 
3133  SIMDType xmm1( (~C).load(i,j ) );
3134  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3135 
3136  for( size_t k=kbegin; k<kend; ++k ) {
3137  const SIMDType a1( set( A(i,k) ) );
3138  xmm1 = xmm1 - a1 * B.load(k,j );
3139  xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE);
3140  }
3141 
3142  (~C).store( i, j , xmm1 );
3143  (~C).store( i, j+SIMDSIZE, xmm2 );
3144  }
3145  }
3146 
// Remaining single SIMD vector strips: two rows per iteration, then a possible last row.
// The upper k bound of these steps only depends on the traits of A.
3147  for( ; j<jpos; j+=SIMDSIZE )
3148  {
3149  size_t i( 0UL );
3150 
3151  for( ; (i+2UL) <= M; i+=2UL )
3152  {
3153  const size_t kbegin( ( IsUpper<MT4>::value )
3154  ?( ( IsLower<MT5>::value )
3155  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3156  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3157  :( IsLower<MT5>::value ? j : 0UL ) );
3158  const size_t kend( ( IsLower<MT4>::value )
3159  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3160  :( K ) );
3161 
3162  SIMDType xmm1( (~C).load(i ,j) );
3163  SIMDType xmm2( (~C).load(i+1UL,j) );
3164 
3165  for( size_t k=kbegin; k<kend; ++k ) {
3166  const SIMDType b1( B.load(k,j) );
3167  xmm1 = xmm1 - set( A(i ,k) ) * b1;
3168  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
3169  }
3170 
3171  (~C).store( i , j, xmm1 );
3172  (~C).store( i+1UL, j, xmm2 );
3173  }
3174 
// Last row if M is odd; the k loop runs up to K without an upper-bound restriction.
3175  if( i < M )
3176  {
3177  const size_t kbegin( ( IsUpper<MT4>::value )
3178  ?( ( IsLower<MT5>::value )
3179  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3180  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3181  :( IsLower<MT5>::value ? j : 0UL ) );
3182 
3183  SIMDType xmm1( (~C).load(i,j) );
3184 
3185  for( size_t k=kbegin; k<K; ++k ) {
3186  xmm1 = xmm1 - set( A(i,k) ) * B.load(k,j);
3187  }
3188 
3189  (~C).store( i, j, xmm1 );
3190  }
3191  }
3192 
// Element-wise clean-up for the columns in [jpos,N); skipped entirely if there is
// no remainder.
3193  for( ; remainder && j<N; ++j )
3194  {
3195  size_t i( 0UL );
3196 
3197  for( ; (i+2UL) <= M; i+=2UL )
3198  {
3199  const size_t kbegin( ( IsUpper<MT4>::value )
3200  ?( ( IsLower<MT5>::value )
3201  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3202  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3203  :( IsLower<MT5>::value ? j : 0UL ) );
3204  const size_t kend( ( IsLower<MT4>::value )
3205  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3206  :( K ) );
3207 
3208  ElementType value1( (~C)(i ,j) );
3209  ElementType value2( (~C)(i+1UL,j) );
3210 
3211  for( size_t k=kbegin; k<kend; ++k ) {
3212  value1 -= A(i ,k) * B(k,j);
3213  value2 -= A(i+1UL,k) * B(k,j);
3214  }
3215 
3216  (~C)(i ,j) = value1;
3217  (~C)(i+1UL,j) = value2;
3218  }
3219 
3220  if( i < M )
3221  {
3222  const size_t kbegin( ( IsUpper<MT4>::value )
3223  ?( ( IsLower<MT5>::value )
3224  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3225  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3226  :( IsLower<MT5>::value ? j : 0UL ) );
3227 
3228  ElementType value( (~C)(i,j) );
3229 
3230  for( size_t k=kbegin; k<K; ++k ) {
3231  value -= A(i,k) * B(k,j);
3232  }
3233 
3234  (~C)(i,j) = value;
3235  }
3236  }
3237  }
3239  //**********************************************************************************************
3240 
3241  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Vectorized small-matrix kernel for the subtraction assignment to a column-major dense matrix.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Enabled through EnableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5>. The kernel does not
// compute the product itself: it copies exactly one operand into a temporary of its
// OppositeType_ (presumably the opposite storage order -- confirm against the alias) and hands
// the resulting product expression to subAssign(). Operand selection:
//   - only B is resizable -> A is converted,
//   - only A is resizable -> B is converted,
//   - otherwise           -> the operand with fewer elements is converted (A on a tie).
3256  template< typename MT3 // Type of the left-hand side target matrix
3257  , typename MT4 // Type of the left-hand side matrix operand
3258  , typename MT5 > // Type of the right-hand side matrix operand
3259  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3260  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3261  {
3266 
      // The first two branches depend only on compile-time traits of the operand types.
3267  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3268  const OppositeType_<MT4> tmp( serial( A ) );
3269  subAssign( ~C, tmp * B );
3270  }
3271  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3272  const OppositeType_<MT5> tmp( serial( B ) );
3273  subAssign( ~C, A * tmp );
3274  }
      // Both or neither operand resizable: decide at run time by element count.
3275  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3276  const OppositeType_<MT4> tmp( serial( A ) );
3277  subAssign( ~C, tmp * B );
3278  }
3279  else {
3280  const OppositeType_<MT5> tmp( serial( B ) );
3281  subAssign( ~C, A * tmp );
3282  }
3283  }
3285  //**********************************************************************************************
3286 
3287  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Fallback large-matrix kernel for the subtraction assignment to dense matrices.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Selected through DisableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5>; it only forwards its
// arguments unchanged to selectDefaultSubAssignKernel().
3301  template< typename MT3 // Type of the left-hand side target matrix
3302  , typename MT4 // Type of the left-hand side matrix operand
3303  , typename MT5 > // Type of the right-hand side matrix operand
3304  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3305  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3306  {
3307  selectDefaultSubAssignKernel( C, A, B );
3308  }
3310  //**********************************************************************************************
3311 
3312  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
// Vectorized large-matrix kernel for the subtraction assignment to a row-major dense matrix.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Enabled through EnableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5>. The iteration space is
// tiled: columns in steps of DMATDMATMULT_JBLOCK_SIZE (jj), rows in steps of
// DMATDMATMULT_IBLOCK_SIZE (ii) and the inner dimension in steps of DMATDMATMULT_KBLOCK_SIZE
// (kk). Inside one tile the columns are consumed by panels of four, two and one SIMD vector(s),
// in that order, followed by a scalar loop for the columns beyond jpos. Every micro-kernel
// loads the current values of C, subtracts the partial products for k in [kbegin,kend) and
// stores the result back, so the contributions of successive kk tiles accumulate in C.
//
// kbegin/kend clip the k range of a micro-kernel to the current kk tile and, depending on the
// IsUpper/IsLower traits of MT4 and MT5, to the rows/columns covered by that micro-kernel
// (presumably to skip the structural zeros of triangular operands -- confirm against the
// trait definitions).
3327  template< typename MT3 // Type of the left-hand side target matrix
3328  , typename MT4 // Type of the left-hand side matrix operand
3329  , typename MT5 > // Type of the right-hand side matrix operand
3330  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3331  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3332  {
3333  const size_t M( A.rows() );
3334  const size_t N( B.columns() );
3335  const size_t K( A.columns() );
3336 
      // The scalar clean-up loop at the end of each tile only runs when C or B is not padded.
3337  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
3338 
3339  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
3340  {
3341  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
3342 
      // With a remainder, jpos is jend rounded down to a multiple of SIMDSIZE (verified by the
      // assertion below); otherwise the vectorized loops run all the way up to jend.
3343  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3344  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3345 
3346  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
3347  {
3348  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
3349 
3350  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
3351  {
3352  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
3353 
3354  size_t j( jj );
3355 
      // Column panel of four SIMD vectors.
3356  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3357  {
3358  const size_t j1( j+SIMDSIZE );
3359  const size_t j2( j+SIMDSIZE*2UL );
3360  const size_t j3( j+SIMDSIZE*3UL );
3361 
3362  size_t i( ii );
3363 
      // Two rows x four SIMD vectors per iteration (eight accumulators).
3364  for( ; (i+2UL) <= iend; i+=2UL )
3365  {
3366  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3367  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3368  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3369  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
3370 
3371  SIMDType xmm1( (~C).load(i ,j ) );
3372  SIMDType xmm2( (~C).load(i ,j1) );
3373  SIMDType xmm3( (~C).load(i ,j2) );
3374  SIMDType xmm4( (~C).load(i ,j3) );
3375  SIMDType xmm5( (~C).load(i+1UL,j ) );
3376  SIMDType xmm6( (~C).load(i+1UL,j1) );
3377  SIMDType xmm7( (~C).load(i+1UL,j2) );
3378  SIMDType xmm8( (~C).load(i+1UL,j3) );
3379 
3380  for( size_t k=kbegin; k<kend; ++k ) {
3381  const SIMDType a1( set( A(i ,k) ) );
3382  const SIMDType a2( set( A(i+1UL,k) ) );
3383  const SIMDType b1( B.load(k,j ) );
3384  const SIMDType b2( B.load(k,j1) );
3385  const SIMDType b3( B.load(k,j2) );
3386  const SIMDType b4( B.load(k,j3) );
3387  xmm1 = xmm1 - a1 * b1;
3388  xmm2 = xmm2 - a1 * b2;
3389  xmm3 = xmm3 - a1 * b3;
3390  xmm4 = xmm4 - a1 * b4;
3391  xmm5 = xmm5 - a2 * b1;
3392  xmm6 = xmm6 - a2 * b2;
3393  xmm7 = xmm7 - a2 * b3;
3394  xmm8 = xmm8 - a2 * b4;
3395  }
3396 
3397  (~C).store( i , j , xmm1 );
3398  (~C).store( i , j1, xmm2 );
3399  (~C).store( i , j2, xmm3 );
3400  (~C).store( i , j3, xmm4 );
3401  (~C).store( i+1UL, j , xmm5 );
3402  (~C).store( i+1UL, j1, xmm6 );
3403  (~C).store( i+1UL, j2, xmm7 );
3404  (~C).store( i+1UL, j3, xmm8 );
3405  }
3406 
      // At most one row of the tile is left over after the two-row loop.
3407  if( i < iend )
3408  {
3409  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3410  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3411  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3412  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
3413 
3414  SIMDType xmm1( (~C).load(i,j ) );
3415  SIMDType xmm2( (~C).load(i,j1) );
3416  SIMDType xmm3( (~C).load(i,j2) );
3417  SIMDType xmm4( (~C).load(i,j3) );
3418 
3419  for( size_t k=kbegin; k<kend; ++k ) {
3420  const SIMDType a1( set( A(i,k) ) );
3421  xmm1 = xmm1 - a1 * B.load(k,j );
3422  xmm2 = xmm2 - a1 * B.load(k,j1);
3423  xmm3 = xmm3 - a1 * B.load(k,j2);
3424  xmm4 = xmm4 - a1 * B.load(k,j3);
3425  }
3426 
3427  (~C).store( i, j , xmm1 );
3428  (~C).store( i, j1, xmm2 );
3429  (~C).store( i, j2, xmm3 );
3430  (~C).store( i, j3, xmm4 );
3431  }
3432  }
3433 
      // Column panel of two SIMD vectors.
3434  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3435  {
3436  const size_t j1( j+SIMDSIZE );
3437 
3438  size_t i( ii );
3439 
      // Four rows x two SIMD vectors per iteration (eight accumulators).
3440  for( ; (i+4UL) <= iend; i+=4UL )
3441  {
3442  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3443  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3444  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3445  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3446 
3447  SIMDType xmm1( (~C).load(i ,j ) );
3448  SIMDType xmm2( (~C).load(i ,j1) );
3449  SIMDType xmm3( (~C).load(i+1UL,j ) );
3450  SIMDType xmm4( (~C).load(i+1UL,j1) );
3451  SIMDType xmm5( (~C).load(i+2UL,j ) );
3452  SIMDType xmm6( (~C).load(i+2UL,j1) );
3453  SIMDType xmm7( (~C).load(i+3UL,j ) );
3454  SIMDType xmm8( (~C).load(i+3UL,j1) );
3455 
3456  for( size_t k=kbegin; k<kend; ++k ) {
3457  const SIMDType a1( set( A(i ,k) ) );
3458  const SIMDType a2( set( A(i+1UL,k) ) );
3459  const SIMDType a3( set( A(i+2UL,k) ) );
3460  const SIMDType a4( set( A(i+3UL,k) ) );
3461  const SIMDType b1( B.load(k,j ) );
3462  const SIMDType b2( B.load(k,j1) );
3463  xmm1 = xmm1 - a1 * b1;
3464  xmm2 = xmm2 - a1 * b2;
3465  xmm3 = xmm3 - a2 * b1;
3466  xmm4 = xmm4 - a2 * b2;
3467  xmm5 = xmm5 - a3 * b1;
3468  xmm6 = xmm6 - a3 * b2;
3469  xmm7 = xmm7 - a4 * b1;
3470  xmm8 = xmm8 - a4 * b2;
3471  }
3472 
3473  (~C).store( i , j , xmm1 );
3474  (~C).store( i , j1, xmm2 );
3475  (~C).store( i+1UL, j , xmm3 );
3476  (~C).store( i+1UL, j1, xmm4 );
3477  (~C).store( i+2UL, j , xmm5 );
3478  (~C).store( i+2UL, j1, xmm6 );
3479  (~C).store( i+3UL, j , xmm7 );
3480  (~C).store( i+3UL, j1, xmm8 );
3481  }
3482 
      // Two rows x two SIMD vectors for what the four-row loop left over.
3483  for( ; (i+2UL) <= iend; i+=2UL )
3484  {
3485  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3486  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3487  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3488  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3489 
3490  SIMDType xmm1( (~C).load(i ,j ) );
3491  SIMDType xmm2( (~C).load(i ,j1) );
3492  SIMDType xmm3( (~C).load(i+1UL,j ) );
3493  SIMDType xmm4( (~C).load(i+1UL,j1) );
3494 
3495  for( size_t k=kbegin; k<kend; ++k ) {
3496  const SIMDType a1( set( A(i ,k) ) );
3497  const SIMDType a2( set( A(i+1UL,k) ) );
3498  const SIMDType b1( B.load(k,j ) );
3499  const SIMDType b2( B.load(k,j1) );
3500  xmm1 = xmm1 - a1 * b1;
3501  xmm2 = xmm2 - a1 * b2;
3502  xmm3 = xmm3 - a2 * b1;
3503  xmm4 = xmm4 - a2 * b2;
3504  }
3505 
3506  (~C).store( i , j , xmm1 );
3507  (~C).store( i , j1, xmm2 );
3508  (~C).store( i+1UL, j , xmm3 );
3509  (~C).store( i+1UL, j1, xmm4 );
3510  }
3511 
      // Single remaining row x two SIMD vectors.
3512  if( i < iend )
3513  {
3514  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3515  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3516  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3517  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3518 
3519  SIMDType xmm1( (~C).load(i,j ) );
3520  SIMDType xmm2( (~C).load(i,j1) );
3521 
3522  for( size_t k=kbegin; k<kend; ++k ) {
3523  const SIMDType a1( set( A(i,k) ) );
3524  xmm1 = xmm1 - a1 * B.load(k,j );
3525  xmm2 = xmm2 - a1 * B.load(k,j1);
3526  }
3527 
3528  (~C).store( i, j , xmm1 );
3529  (~C).store( i, j1, xmm2 );
3530  }
3531  }
3532 
      // Column panel of a single SIMD vector, one row at a time.
3533  for( ; j<jpos; j+=SIMDSIZE )
3534  {
3535  for( size_t i=ii; i<iend; ++i )
3536  {
3537  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3538  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3539  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3540  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
3541 
3542  SIMDType xmm1( (~C).load(i,j) );
3543 
3544  for( size_t k=kbegin; k<kend; ++k ) {
3545  const SIMDType a1( set( A(i,k) ) );
3546  xmm1 = xmm1 - a1 * B.load(k,j);
3547  }
3548 
3549  (~C).store( i, j, xmm1 );
3550  }
3551  }
3552 
      // Scalar clean-up for the columns in [jpos,jend); skipped entirely when remainder is false.
3553  for( ; remainder && j<jend; ++j )
3554  {
3555  for( size_t i=ii; i<iend; ++i )
3556  {
3557  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3558  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3559  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3560  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
3561 
3562  ElementType value( (~C)(i,j) );
3563 
3564  for( size_t k=kbegin; k<kend; ++k ) {
3565  value -= A(i,k) * B(k,j);
3566  }
3567 
3568  (~C)(i,j) = value;
3569  }
3570  }
3571  }
3572  }
3573  }
3574  }
3576  //**********************************************************************************************
3577 
3578  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
// Vectorized large-matrix kernel for the subtraction assignment to a column-major dense matrix.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Enabled through EnableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5>. There is no dedicated
// large-matrix implementation for this case: the call is forwarded to
// selectSmallSubAssignKernel().
3592  template< typename MT3 // Type of the left-hand side target matrix
3593  , typename MT4 // Type of the left-hand side matrix operand
3594  , typename MT5 > // Type of the right-hand side matrix operand
3595  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3596  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3597  {
3598  selectSmallSubAssignKernel( ~C, A, B );
3599  }
3601  //**********************************************************************************************
3602 
3603  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback for the BLAS-based subtraction assignment to dense matrices.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Selected through DisableIf_ on UseBlasKernel<MT3,MT4,MT5>; it only forwards its arguments
// unchanged to selectLargeSubAssignKernel().
3617  template< typename MT3 // Type of the left-hand side target matrix
3618  , typename MT4 // Type of the left-hand side matrix operand
3619  , typename MT5 > // Type of the right-hand side matrix operand
3620  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
3621  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3622  {
3623  selectLargeSubAssignKernel( C, A, B );
3624  }
3626  //**********************************************************************************************
3627 
3628  //**BLAS-based subtraction assignment to dense matrices*****************************************
3629 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
3630 
// BLAS-based subtraction assignment to dense matrices.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Enabled through EnableIf_ on UseBlasKernel<MT3,MT4,MT5> and only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set. Three cases:
//   - A is triangular: B is copied into a temporary, trmm() is applied with A on the left
//     and the temporary is passed to subAssign() together with C.
//   - B is triangular: A is copied into a temporary, trmm() is applied with B on the right
//     and the temporary is passed to subAssign() together with C.
//   - otherwise: a single gemm() call with the scalars ET(-1) and ET(1).
// NOTE(review): the exact argument contracts of the trmm()/gemm() wrappers are not visible
// here; the description assumes the usual BLAS meaning (in-place triangular product, and
// C = alpha*A*B + beta*C) -- confirm against the wrapper declarations.
3643  template< typename MT3 // Type of the left-hand side target matrix
3644  , typename MT4 // Type of the left-hand side matrix operand
3645  , typename MT5 > // Type of the right-hand side matrix operand
3646  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
3647  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3648  {
3649  typedef ElementType_<MT3> ET;
3650 
3651  if( IsTriangular<MT4>::value ) {
3652  ResultType_<MT3> tmp( serial( B ) );
3653  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3654  subAssign( C, tmp );
3655  }
3656  else if( IsTriangular<MT5>::value ) {
3657  ResultType_<MT3> tmp( serial( A ) );
3658  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3659  subAssign( C, tmp );
3660  }
3661  else {
3662  gemm( C, A, B, ET(-1), ET(1) );
3663  }
3664  }
3666 #endif
3667  //**********************************************************************************************
3668 
3669  //**Restructuring subtraction assignment to column-major matrices*******************************
// Restructuring subtraction assignment of a dense matrix-dense matrix multiplication to a
// column-major matrix.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
// \return void
//
// Enabled through EnableIf_ on CanExploitSymmetry<MT,MT1,MT2>. The product is rebuilt with
// trans() applied to the symmetric operand(s) and handed to subAssign() again:
// both operands when both are symmetric, only the left one when only MT1 is symmetric, and
// only the right one otherwise.
3684  template< typename MT > // Type of the target matrix
3685  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3686  subAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
3687  {
3689 
3691 
3692  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3693  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3694 
3695  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3696  subAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3697  else if( IsSymmetric<MT1>::value )
3698  subAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3699  else
3700  subAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3701  }
3703  //**********************************************************************************************
3704 
3705  //**Subtraction assignment to sparse matrices***************************************************
3706  // No special implementation for the subtraction assignment to sparse matrices.
3707  //**********************************************************************************************
3708 
3709  //**Multiplication assignment to dense matrices*************************************************
3710  // No special implementation for the multiplication assignment to dense matrices.
3711  //**********************************************************************************************
3712 
3713  //**Multiplication assignment to sparse matrices************************************************
3714  // No special implementation for the multiplication assignment to sparse matrices.
3715  //**********************************************************************************************
3716 
3717  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a dense matrix-dense matrix multiplication to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
// \return void
//
// Enabled through EnableIf_ on IsEvaluationRequired<MT,MT1,MT2>. Both operands are first
// materialized as objects of type LT and RT (declared elsewhere in the class); the product of
// these two objects is then forwarded to smpAssign().
3732  template< typename MT // Type of the target dense matrix
3733  , bool SO > // Storage order of the target dense matrix
3734  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3735  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3736  {
3738 
3739  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3740  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3741 
      // Empty target: nothing to do. Empty inner dimension: the target is reset() instead.
3742  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
3743  return;
3744  }
3745  else if( rhs.lhs_.columns() == 0UL ) {
3746  reset( ~lhs );
3747  return;
3748  }
3749 
3750  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3751  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3752 
3753  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3754  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3755  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3756  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3757  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3758  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3759 
3760  smpAssign( ~lhs, A * B );
3761  }
3763  //**********************************************************************************************
3764 
3765  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a dense matrix-dense matrix multiplication to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
// \return void
//
// Enabled through EnableIf_ on IsEvaluationRequired<MT,MT1,MT2>. The expression is evaluated
// into a temporary whose type is OppositeType when SO is true and ResultType otherwise; the
// temporary is then forwarded to smpAssign().
3780  template< typename MT // Type of the target sparse matrix
3781  , bool SO > // Storage order of the target sparse matrix
3782  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3783  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3784  {
3786 
3787  typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
3788 
3794  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
3795 
3796  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3797  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3798 
3799  const TmpType tmp( rhs );
3800  smpAssign( ~lhs, tmp );
3801  }
3803  //**********************************************************************************************
3804 
3805  //**Restructuring SMP assignment to column-major matrices***************************************
// Restructuring SMP assignment of a dense matrix-dense matrix multiplication to a
// column-major matrix.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
// \return void
//
// Enabled through EnableIf_ on CanExploitSymmetry<MT,MT1,MT2>. The product is rebuilt with
// trans() applied to the symmetric operand(s) and handed to smpAssign() again:
// both operands when both are symmetric, only the left one when only MT1 is symmetric, and
// only the right one otherwise.
3820  template< typename MT > // Type of the target matrix
3821  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3822  smpAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
3823  {
3825 
3827 
3828  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3829  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3830 
3831  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3832  smpAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3833  else if( IsSymmetric<MT1>::value )
3834  smpAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3835  else
3836  smpAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3837  }
3839  //**********************************************************************************************
3840 
3841  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a dense matrix-dense matrix multiplication to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be added.
// \return void
//
// Enabled through EnableIf_ on IsEvaluationRequired<MT,MT1,MT2>. Both operands are first
// materialized as objects of type LT and RT (declared elsewhere in the class); the product of
// these two objects is then forwarded to smpAddAssign().
3857  template< typename MT // Type of the target dense matrix
3858  , bool SO > // Storage order of the target dense matrix
3859  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3860  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3861  {
3863 
3864  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3865  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3866 
      // Empty target or empty inner dimension: the target is left untouched.
3867  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3868  return;
3869  }
3870 
3871  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3872  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3873 
3874  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3875  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3876  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3877  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3878  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3879  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3880 
3881  smpAddAssign( ~lhs, A * B );
3882  }
3884  //**********************************************************************************************
3885 
3886  //**Restructuring SMP addition assignment to column-major matrices******************************
// Restructuring SMP addition assignment of a dense matrix-dense matrix multiplication to a
// column-major matrix.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side multiplication expression to be added.
// \return void
//
// Enabled through EnableIf_ on CanExploitSymmetry<MT,MT1,MT2>. The product is rebuilt with
// trans() applied to the symmetric operand(s) and handed to smpAddAssign() again:
// both operands when both are symmetric, only the left one when only MT1 is symmetric, and
// only the right one otherwise.
3901  template< typename MT > // Type of the target matrix
3902  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3903  smpAddAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
3904  {
3906 
3908 
3909  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3910  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3911 
3912  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3913  smpAddAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3914  else if( IsSymmetric<MT1>::value )
3915  smpAddAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3916  else
3917  smpAddAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3918  }
3920  //**********************************************************************************************
3921 
3922  //**SMP addition assignment to sparse matrices**************************************************
3923  // No special implementation for the SMP addition assignment to sparse matrices.
3924  //**********************************************************************************************
3925 
3926  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a dense matrix-dense matrix multiplication to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
// \return void
//
// Enabled through EnableIf_ on IsEvaluationRequired<MT,MT1,MT2>. Both operands are first
// materialized as objects of type LT and RT (declared elsewhere in the class); the product of
// these two objects is then forwarded to smpSubAssign().
3942  template< typename MT // Type of the target dense matrix
3943  , bool SO > // Storage order of the target dense matrix
3944  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3945  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3946  {
3948 
3949  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3950  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3951 
      // Empty target or empty inner dimension: the target is left untouched.
3952  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3953  return;
3954  }
3955 
3956  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3957  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3958 
3959  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3960  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3961  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3962  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3963  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3964  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3965 
3966  smpSubAssign( ~lhs, A * B );
3967  }
3969  //**********************************************************************************************
3970 
3971  //**Restructuring SMP subtraction assignment to column-major matrices***************************
// Restructuring SMP subtraction assignment of a dense matrix-dense matrix multiplication to a
// column-major matrix.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
// \return void
//
// Enabled through EnableIf_ on CanExploitSymmetry<MT,MT1,MT2>. The product is rebuilt with
// trans() applied to the symmetric operand(s) and handed to smpSubAssign() again:
// both operands when both are symmetric, only the left one when only MT1 is symmetric, and
// only the right one otherwise.
3986  template< typename MT > // Type of the target matrix
3987  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3988  smpSubAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
3989  {
3991 
3993 
3994  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3995  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3996 
3997  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3998  smpSubAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3999  else if( IsSymmetric<MT1>::value )
4000  smpSubAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
4001  else
4002  smpSubAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
4003  }
4005  //**********************************************************************************************
4006 
4007  //**SMP subtraction assignment to sparse matrices***********************************************
4008  // No special implementation for the SMP subtraction assignment to sparse matrices.
4009  //**********************************************************************************************
4010 
4011  //**SMP multiplication assignment to dense matrices*********************************************
4012  // No special implementation for the SMP multiplication assignment to dense matrices.
4013  //**********************************************************************************************
4014 
4015  //**SMP multiplication assignment to sparse matrices********************************************
4016  // No special implementation for the SMP multiplication assignment to sparse matrices.
4017  //**********************************************************************************************
4018 
4019  //**Compile time checks*************************************************************************
4027  //**********************************************************************************************
4028 };
4029 //*************************************************************************************************
4030 
4031 
4032 
4033 
4034 //=================================================================================================
4035 //
4036 // DMATSCALARMULTEXPR SPECIALIZATION
4037 //
4038 //=================================================================================================
4039 
4040 //*************************************************************************************************
4048 template< typename MT1 // Type of the left-hand side dense matrix
4049  , typename MT2 // Type of the right-hand side dense matrix
4050  , typename ST > // Type of the right-hand side scalar value
4051 class DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >
4052  : public DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >, false >
4053  , private MatScalarMultExpr
4054  , private Computation
4055 {
4056  private:
4057  //**Type definitions****************************************************************************
// Private shorthand types derived from the two matrix operand types MT1 and MT2.
4058  typedef DMatDMatMultExpr<MT1,MT2> MMM; // Type of the dense matrix multiplication expression
4059  typedef ResultType_<MMM> RES; // Result type of the dense matrix multiplication expression
4060  typedef ResultType_<MT1> RT1; // Result type of the left-hand side dense matrix expression
4061  typedef ResultType_<MT2> RT2; // Result type of the right-hand side dense matrix expression
4062  typedef ElementType_<RT1> ET1; // Element type of the left-hand side dense matrix expression
4063  typedef ElementType_<RT2> ET2; // Element type of the right-hand side dense matrix expression
4064  typedef CompositeType_<MT1> CT1; // Composite type of the left-hand side dense matrix expression
4065  typedef CompositeType_<MT2> CT2; // Composite type of the right-hand side dense matrix expression
4066  //**********************************************************************************************
4067 
4068  //**********************************************************************************************
// Compile-time flag: true when MT1 is a computation or requires an evaluation. It selects the
// type LT used to hold the left-hand side operand.
4070  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
4071  //**********************************************************************************************
4072 
4073  //**********************************************************************************************
      // Compile-time flag: true when MT2 is a computation or requires an evaluation. It selects
      // the type RT used to hold the right-hand side operand.
4075  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
4076  //**********************************************************************************************
4077 
4078  //**********************************************************************************************
4080 
// Helper trait: value is true when T1 is a column-major matrix type and at least one of T2
// and T3 is a symmetric matrix type.
4085  template< typename T1, typename T2, typename T3 >
4086  struct CanExploitSymmetry {
4087  enum : bool { value = IsColumnMajorMatrix<T1>::value &&
4088  ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
4089  };
4090  //**********************************************************************************************
4091 
4092  //**********************************************************************************************
4094 
// Helper trait: value is true when at least one operand has to be evaluated (evaluateLeft or
// evaluateRight) and CanExploitSymmetry<T1,T2,T3> does not apply.
4097  template< typename T1, typename T2, typename T3 >
4098  struct IsEvaluationRequired {
4099  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
4100  !CanExploitSymmetry<T1,T2,T3>::value };
4101  };
4102  //**********************************************************************************************
4103 
4104  //**********************************************************************************************
4106 
// Helper trait deciding whether a BLAS kernel may be used for the types T1 (target), T2 and T3
// (operands) and T4 (scalar). The visible conditions require mutable data access on T1, const
// data access on T2 and T3, non-diagonal operands, SIMD-enabled types, BLAS-compatible and
// identical element types, and exclude the combination of a built-in element type with a
// complex scalar T4.
// NOTE(review): listing line 4110, which must open the `enum : bool { value = ...` initializer
// closed at listing line 4121, is missing from this copy of the file -- restore it from the
// original header before compiling.
4108  template< typename T1, typename T2, typename T3, typename T4 >
4109  struct UseBlasKernel {
4111  HasMutableDataAccess<T1>::value &&
4112  HasConstDataAccess<T2>::value &&
4113  HasConstDataAccess<T3>::value &&
4114  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4115  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4116  IsBLASCompatible< ElementType_<T1> >::value &&
4117  IsBLASCompatible< ElementType_<T2> >::value &&
4118  IsBLASCompatible< ElementType_<T3> >::value &&
4119  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
4120  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
4121  !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
4122  };
4123  //**********************************************************************************************
4124 
4125  //**********************************************************************************************
4127 
// Helper trait deciding whether the vectorized default kernels may be used for the types T1
// (target), T2 and T3 (operands) and T4 (scalar). It requires useOptimizedKernels, a
// non-diagonal T3, SIMD-enabled types, SIMD-combinable element/scalar types, and SIMD addition
// and multiplication support for the element types of T2 and T3.
4129  template< typename T1, typename T2, typename T3, typename T4 >
4130  struct UseVectorizedDefaultKernel {
4131  enum : bool { value = useOptimizedKernels &&
4132  !IsDiagonal<T3>::value &&
4133  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4134  AreSIMDCombinable< ElementType_<T1>
4135  , ElementType_<T2>
4136  , ElementType_<T3>
4137  , T4 >::value &&
4138  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
4139  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
4140  };
4141  //**********************************************************************************************
4142 
4143  public:
4144  //**Type definitions****************************************************************************
// Public type definitions of the scaled matrix multiplication expression.
4145  typedef DMatScalarMultExpr<MMM,ST,false> This; // Type of this DMatScalarMultExpr instance
4146  typedef MultTrait_<RES,ST> ResultType; // Result type of multiplying RES by the scalar type ST
4147  typedef OppositeType_<ResultType> OppositeType; // OppositeType_ of the result type
4148  typedef TransposeType_<ResultType> TransposeType; // TransposeType_ of the result type
4149  typedef ElementType_<ResultType> ElementType; // Element type of the result type
4150  typedef SIMDTrait_<ElementType> SIMDType; // SIMD type associated with the element type
4151  typedef const ElementType ReturnType; // Return type of operator() and at()
4152  typedef const ResultType CompositeType; // Composite type: the expression is used by value as a result matrix
4153 
      // Type of the left-hand side operand: the wrapped matrix multiplication expression.
4155  typedef const DMatDMatMultExpr<MT1,MT2> LeftOperand;
4156 
      // Type of the right-hand side operand: the scalar.
4158  typedef ST RightOperand;
4159 
      // Type holding the left matrix operand: const RT1 if evaluateLeft is set, CT1 otherwise.
4161  typedef IfTrue_< evaluateLeft, const RT1, CT1 > LT;
4162 
      // Type holding the right matrix operand: const RT2 if evaluateRight is set, CT2 otherwise.
4164  typedef IfTrue_< evaluateRight, const RT2, CT2 > RT;
4165  //**********************************************************************************************
4166 
4167  //**Compilation flags***************************************************************************
// Compilation flag for SIMD optimization: set when MT2 is not diagonal, both operand types are
// SIMD enabled, ET1/ET2/ST are SIMD combinable and SIMD addition and multiplication are
// available for ET1 and ET2.
4169  enum : bool { simdEnabled = !IsDiagonal<MT2>::value &&
4170  MT1::simdEnabled && MT2::simdEnabled &&
4171  AreSIMDCombinable<ET1,ET2,ST>::value &&
4172  HasSIMDAdd<ET1,ET2>::value &&
4173  HasSIMDMult<ET1,ET2>::value };
4174 
      // Compilation flag for SMP assignments: set only when neither operand needs an evaluation
      // and both operand types are themselves SMP assignable.
4176  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4177  !evaluateRight && MT2::smpAssignable };
4178  //**********************************************************************************************
4179 
4180  //**SIMD properties*****************************************************************************
// The size member of SIMDTrait<ElementType>, exposed as a class constant.
4182  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
4183  //**********************************************************************************************
4184 
4185  //**Constructor*********************************************************************************
// Constructor for the DMatScalarMultExpr specialization.
//
// \param matrix The left-hand side dense matrix multiplication expression.
// \param scalar The right-hand side scalar of the multiplication expression.
//
// Both arguments are stored in the members matrix_ and scalar_.
4191  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4192  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4193  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4194  {}
4195  //**********************************************************************************************
4196 
4197  //**Access operator*****************************************************************************
// 2D access to the elements of the scaled product.
//
// \param i Access index for the row. Must be smaller than matrix_.rows().
// \param j Access index for the column. Must be smaller than matrix_.columns().
// \return The element (i,j) of the wrapped expression multiplied by the scalar.
//
// The indices are only checked through BLAZE_INTERNAL_ASSERT; use at() for checked access.
4204  inline ReturnType operator()( size_t i, size_t j ) const {
4205  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4206  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4207  return matrix_(i,j) * scalar_;
4208  }
4209  //**********************************************************************************************
4210 
4211  //**At function*********************************************************************************
4219  inline ReturnType at( size_t i, size_t j ) const {
4220  if( i >= matrix_.rows() ) {
4221  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4222  }
4223  if( j >= matrix_.columns() ) {
4224  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4225  }
4226  return (*this)(i,j);
4227  }
4228  //**********************************************************************************************
4229 
4230  //**Rows function*******************************************************************************
4235  inline size_t rows() const {
4236  return matrix_.rows();
4237  }
4238  //**********************************************************************************************
4239 
4240  //**Columns function****************************************************************************
4245  inline size_t columns() const {
4246  return matrix_.columns();
4247  }
4248  //**********************************************************************************************
4249 
4250  //**Left operand access*************************************************************************
4255  inline LeftOperand leftOperand() const {
4256  return matrix_;
4257  }
4258  //**********************************************************************************************
4259 
4260  //**Right operand access************************************************************************
4265  inline RightOperand rightOperand() const {
4266  return scalar_;
4267  }
4268  //**********************************************************************************************
4269 
4270  //**********************************************************************************************
4276  template< typename T >
4277  inline bool canAlias( const T* alias ) const {
4278  return matrix_.canAlias( alias );
4279  }
4280  //**********************************************************************************************
4281 
4282  //**********************************************************************************************
4288  template< typename T >
4289  inline bool isAliased( const T* alias ) const {
4290  return matrix_.isAliased( alias );
4291  }
4292  //**********************************************************************************************
4293 
4294  //**********************************************************************************************
4299  inline bool isAligned() const {
4300  return matrix_.isAligned();
4301  }
4302  //**********************************************************************************************
4303 
4304  //**********************************************************************************************
4309  inline bool canSMPAssign() const noexcept {
4310  return ( !BLAZE_BLAS_IS_PARALLEL ||
4311  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
4312  ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD );
4313  }
4314  //**********************************************************************************************
4315 
4316  private:
4317  //**Member variables****************************************************************************
// Left-hand side operand: the dense matrix/dense matrix multiplication expression being scaled.
4318  LeftOperand matrix_;
// Right-hand side operand: the scalar every element of the product is multiplied with.
4319  RightOperand scalar_;
4320  //**********************************************************************************************
4321 
4322  //**Assignment to dense matrices****************************************************************
4334  template< typename MT // Type of the target dense matrix
4335  , bool SO > // Storage order of the target dense matrix
4336  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
4337  assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4338  {
4340 
4341  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4342  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4343 
4344  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
4345  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
4346 
4347  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4348  return;
4349  }
4350  else if( left.columns() == 0UL ) {
4351  reset( ~lhs );
4352  return;
4353  }
4354 
4355  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4356  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4357 
4358  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4359  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4360  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4361  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4362  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4363  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4364 
4365  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4366  }
4367  //**********************************************************************************************
4368 
4369  //**Assignment to dense matrices (kernel selection)*********************************************
4380  template< typename MT3 // Type of the left-hand side target matrix
4381  , typename MT4 // Type of the left-hand side matrix operand
4382  , typename MT5 // Type of the right-hand side matrix operand
4383  , typename ST2 > // Type of the scalar value
4384  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4385  {
4386  if( ( IsDiagonal<MT5>::value ) ||
4387  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
4388  selectSmallAssignKernel( C, A, B, scalar );
4389  else
4390  selectBlasAssignKernel( C, A, B, scalar );
4391  }
4392  //**********************************************************************************************
4393 
4394  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) kernel computing C = (A*B)*scalar for the general/general case.
// The overload is enabled only if neither MT4 nor MT5 is diagonal (see the EnableIf_ condition).
//   C      - target matrix, written row by row
//   A, B   - left and right multiplication operand
//   scalar - factor applied to every computed row of C at the end of the row
// For each row i the k range is narrowed by the compile-time structure traits of MT4. The
// first contribution (k == kbegin) is assigned, the remaining contributions are accumulated,
// and finally the relevant column range of the row is scaled.
4408  template< typename MT3 // Type of the left-hand side target matrix
4409  , typename MT4 // Type of the left-hand side matrix operand
4410  , typename MT5 // Type of the right-hand side matrix operand
4411  , typename ST2 > // Type of the scalar value
4412  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4413  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4414  {
4415  const size_t M( A.rows() );
4416  const size_t N( B.columns() );
4417  const size_t K( A.columns() );
4418 
4419  for( size_t i=0UL; i<M; ++i )
4420  {
      // Range [kbegin,kend) of inner indices used for row i, derived from the
      // IsUpper/IsLower/IsStrictly* traits of the left operand type
4421  const size_t kbegin( ( IsUpper<MT4>::value )
4422  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4423  :( 0UL ) );
4424  const size_t kend( ( IsLower<MT4>::value )
4425  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4426  :( K ) );
4427  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4428 
      // Empty inner range (only tested for strictly triangular MT4): reset the whole row and move on
4429  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
4430  for( size_t j=0UL; j<N; ++j ) {
4431  reset( (~C)(i,j) );
4432  }
4433  continue;
4434  }
4435 
      // First contribution (k == kbegin): elements in [jbegin,jend) are assigned rather than
      // accumulated; elements outside that range are reset where the structure traits require it
4436  {
4437  const size_t jbegin( ( IsUpper<MT5>::value )
4438  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
4439  :( 0UL ) );
4440  const size_t jend( ( IsLower<MT5>::value )
4441  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
4442  :( N ) );
4443  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4444 
4445  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
4446  for( size_t j=0UL; j<jbegin; ++j ) {
4447  reset( C(i,j) );
4448  }
4449  }
4450  else if( IsStrictlyUpper<MT5>::value ) {
4451  reset( C(i,0UL) );
4452  }
4453  for( size_t j=jbegin; j<jend; ++j ) {
4454  C(i,j) = A(i,kbegin) * B(kbegin,j);
4455  }
4456  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
4457  for( size_t j=jend; j<N; ++j ) {
4458  reset( C(i,j) );
4459  }
4460  }
4461  else if( IsStrictlyLower<MT5>::value ) {
4462  reset( C(i,N-1UL) );
4463  }
4464  }
4465 
      // Remaining contributions (k > kbegin) are accumulated. For a lower MT5 the element
      // in column jend is assigned (not accumulated) by the current k
4466  for( size_t k=kbegin+1UL; k<kend; ++k )
4467  {
4468  const size_t jbegin( ( IsUpper<MT5>::value )
4469  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4470  :( 0UL ) );
4471  const size_t jend( ( IsLower<MT5>::value )
4472  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
4473  :( N ) );
4474  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4475 
4476  for( size_t j=jbegin; j<jend; ++j ) {
4477  C(i,j) += A(i,k) * B(k,j);
4478  }
4479  if( IsLower<MT5>::value ) {
4480  C(i,jend) = A(i,k) * B(k,jend);
4481  }
4482  }
4483 
      // Scaling of row i; the column range is narrowed only if both operand types are
      // upper (lower bound) or both are lower (upper bound)
4484  {
4485  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4486  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
4487  :( 0UL ) );
4488  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
4489  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
4490  :( N ) );
4491  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4492 
4493  for( size_t j=jbegin; j<jend; ++j ) {
4494  C(i,j) *= scalar;
4495  }
4496  }
4497  }
4498  }
4499  //**********************************************************************************************
4500 
4501  //**Default assignment to dense matrices (general/diagonal)*************************************
4515  template< typename MT3 // Type of the left-hand side target matrix
4516  , typename MT4 // Type of the left-hand side matrix operand
4517  , typename MT5 // Type of the right-hand side matrix operand
4518  , typename ST2 > // Type of the scalar value
4519  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4520  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4521  {
4523 
4524  const size_t M( A.rows() );
4525  const size_t N( B.columns() );
4526 
4527  for( size_t i=0UL; i<M; ++i )
4528  {
4529  const size_t jbegin( ( IsUpper<MT4>::value )
4530  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4531  :( 0UL ) );
4532  const size_t jend( ( IsLower<MT4>::value )
4533  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4534  :( N ) );
4535  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4536 
4537  if( IsUpper<MT4>::value ) {
4538  for( size_t j=0UL; j<jbegin; ++j ) {
4539  reset( C(i,j) );
4540  }
4541  }
4542  for( size_t j=jbegin; j<jend; ++j ) {
4543  C(i,j) = A(i,j) * B(j,j) * scalar;
4544  }
4545  if( IsLower<MT4>::value ) {
4546  for( size_t j=jend; j<N; ++j ) {
4547  reset( C(i,j) );
4548  }
4549  }
4550  }
4551  }
4552  //**********************************************************************************************
4553 
4554  //**Default assignment to dense matrices (diagonal/general)*************************************
4568  template< typename MT3 // Type of the left-hand side target matrix
4569  , typename MT4 // Type of the left-hand side matrix operand
4570  , typename MT5 // Type of the right-hand side matrix operand
4571  , typename ST2 > // Type of the scalar value
4572  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4573  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4574  {
4576 
4577  const size_t M( A.rows() );
4578  const size_t N( B.columns() );
4579 
4580  for( size_t i=0UL; i<M; ++i )
4581  {
4582  const size_t jbegin( ( IsUpper<MT5>::value )
4583  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4584  :( 0UL ) );
4585  const size_t jend( ( IsLower<MT5>::value )
4586  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
4587  :( N ) );
4588  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4589 
4590  if( IsUpper<MT5>::value ) {
4591  for( size_t j=0UL; j<jbegin; ++j ) {
4592  reset( C(i,j) );
4593  }
4594  }
4595  for( size_t j=jbegin; j<jend; ++j ) {
4596  C(i,j) = A(i,i) * B(i,j) * scalar;
4597  }
4598  if( IsLower<MT5>::value ) {
4599  for( size_t j=jend; j<N; ++j ) {
4600  reset( C(i,j) );
4601  }
4602  }
4603  }
4604  }
4605  //**********************************************************************************************
4606 
4607  //**Default assignment to dense matrices (diagonal/diagonal)************************************
4621  template< typename MT3 // Type of the left-hand side target matrix
4622  , typename MT4 // Type of the left-hand side matrix operand
4623  , typename MT5 // Type of the right-hand side matrix operand
4624  , typename ST2 > // Type of the scalar value
4625  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4626  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4627  {
4629 
4630  reset( C );
4631 
4632  for( size_t i=0UL; i<A.rows(); ++i ) {
4633  C(i,i) = A(i,i) * B(i,i) * scalar;
4634  }
4635  }
4636  //**********************************************************************************************
4637 
4638  //**Default assignment to dense matrices (small matrices)***************************************
4652  template< typename MT3 // Type of the left-hand side target matrix
4653  , typename MT4 // Type of the left-hand side matrix operand
4654  , typename MT5 // Type of the right-hand side matrix operand
4655  , typename ST2 > // Type of the scalar value
4656  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4657  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4658  {
4659  selectDefaultAssignKernel( C, A, B, scalar );
4660  }
4661  //**********************************************************************************************
4662 
4663  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Vectorized kernel computing C = (A*B)*scalar for small matrices and a row-major target
// (DenseMatrix<MT3,false>). Enabled only if UseVectorizedDefaultKernel holds for the types.
//   C      - target matrix; results are written via store() or, in the scalar tail, element-wise
//   A, B   - left and right multiplication operand; B is read via load()
//   scalar - factor applied to every accumulated result before it is stored
// The columns of C are processed in panels of 8, 4, 2 and 1 SIMD vectors, followed by an
// element-wise loop for leftover columns. In the 4/2/1-vector panels and the element-wise
// loop, rows are handled two at a time with a separate step for a remaining single row.
// In every step the inner index range [kbegin,kend) is narrowed via the
// IsUpper/IsLower/IsStrictly* traits of MT4 and MT5.
// NOTE(review): the SIMD accumulators (xmm*) are default-constructed and then accumulated
// into; this presumably relies on SIMDType default-initializing to zero - confirm.
4678  template< typename MT3 // Type of the left-hand side target matrix
4679  , typename MT4 // Type of the left-hand side matrix operand
4680  , typename MT5 // Type of the right-hand side matrix operand
4681  , typename ST2 > // Type of the scalar value
4682  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4683  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4684  {
4685  const size_t M( A.rows() );
4686  const size_t N( B.columns() );
4687  const size_t K( A.columns() );
4688 
      // An element-wise tail loop is needed unless IsPadded holds for both MT3 and MT5
4689  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
4690 
      // End of the vectorized column range: N rounded down to a multiple of SIMDSIZE
      // if a tail loop is used (see the assertion below), N otherwise
4691  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
4692  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4693 
      // SIMD vector created from the scalar via set()
4694  const SIMDType factor( set( scalar ) );
4695 
4696  size_t j( 0UL );
4697 
      // Column panels of eight SIMD vectors, one row at a time
4698  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
4699  for( size_t i=0UL; i<M; ++i )
4700  {
4701  const size_t kbegin( ( IsUpper<MT4>::value )
4702  ?( ( IsLower<MT5>::value )
4703  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4704  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4705  :( IsLower<MT5>::value ? j : 0UL ) );
4706  const size_t kend( ( IsLower<MT4>::value )
4707  ?( ( IsUpper<MT5>::value )
4708  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
4709  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4710  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
4711 
4712  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4713 
4714  for( size_t k=kbegin; k<kend; ++k ) {
4715  const SIMDType a1( set( A(i,k) ) );
4716  xmm1 = xmm1 + a1 * B.load(k,j );
4717  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
4718  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
4719  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
4720  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
4721  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
4722  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
4723  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
4724  }
4725 
4726  (~C).store( i, j , xmm1 * factor );
4727  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
4728  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
4729  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
4730  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
4731  (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
4732  (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
4733  (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
4734  }
4735  }
4736 
      // Column panels of four SIMD vectors, two rows at a time
4737  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4738  {
4739  size_t i( 0UL );
4740 
4741  for( ; (i+2UL) <= M; i+=2UL )
4742  {
4743  const size_t kbegin( ( IsUpper<MT4>::value )
4744  ?( ( IsLower<MT5>::value )
4745  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4746  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4747  :( IsLower<MT5>::value ? j : 0UL ) );
4748  const size_t kend( ( IsLower<MT4>::value )
4749  ?( ( IsUpper<MT5>::value )
4750  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
4751  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4752  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
4753 
4754  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4755 
4756  for( size_t k=kbegin; k<kend; ++k ) {
4757  const SIMDType a1( set( A(i ,k) ) );
4758  const SIMDType a2( set( A(i+1UL,k) ) );
4759  const SIMDType b1( B.load(k,j ) );
4760  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4761  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4762  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4763  xmm1 = xmm1 + a1 * b1;
4764  xmm2 = xmm2 + a1 * b2;
4765  xmm3 = xmm3 + a1 * b3;
4766  xmm4 = xmm4 + a1 * b4;
4767  xmm5 = xmm5 + a2 * b1;
4768  xmm6 = xmm6 + a2 * b2;
4769  xmm7 = xmm7 + a2 * b3;
4770  xmm8 = xmm8 + a2 * b4;
4771  }
4772 
4773  (~C).store( i , j , xmm1 * factor );
4774  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
4775  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
4776  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
4777  (~C).store( i+1UL, j , xmm5 * factor );
4778  (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
4779  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
4780  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
4781  }
4782 
      // Remaining single row of the four-vector panel
4783  if( i < M )
4784  {
4785  const size_t kbegin( ( IsUpper<MT4>::value )
4786  ?( ( IsLower<MT5>::value )
4787  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4788  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4789  :( IsLower<MT5>::value ? j : 0UL ) );
4790  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
4791 
4792  SIMDType xmm1, xmm2, xmm3, xmm4;
4793 
4794  for( size_t k=kbegin; k<kend; ++k ) {
4795  const SIMDType a1( set( A(i,k) ) );
4796  xmm1 = xmm1 + a1 * B.load(k,j );
4797  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
4798  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
4799  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
4800  }
4801 
4802  (~C).store( i, j , xmm1 * factor );
4803  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
4804  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
4805  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
4806  }
4807  }
4808 
      // Column panels of two SIMD vectors, two rows at a time
4809  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4810  {
4811  size_t i( 0UL );
4812 
4813  for( ; (i+2UL) <= M; i+=2UL )
4814  {
4815  const size_t kbegin( ( IsUpper<MT4>::value )
4816  ?( ( IsLower<MT5>::value )
4817  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4818  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4819  :( IsLower<MT5>::value ? j : 0UL ) );
4820  const size_t kend( ( IsLower<MT4>::value )
4821  ?( ( IsUpper<MT5>::value )
4822  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
4823  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4824  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
4825 
4826  SIMDType xmm1, xmm2, xmm3, xmm4;
4827 
4828  for( size_t k=kbegin; k<kend; ++k ) {
4829  const SIMDType a1( set( A(i ,k) ) );
4830  const SIMDType a2( set( A(i+1UL,k) ) );
4831  const SIMDType b1( B.load(k,j ) );
4832  const SIMDType b2( B.load(k,j+SIMDSIZE) );
4833  xmm1 = xmm1 + a1 * b1;
4834  xmm2 = xmm2 + a1 * b2;
4835  xmm3 = xmm3 + a2 * b1;
4836  xmm4 = xmm4 + a2 * b2;
4837  }
4838 
4839  (~C).store( i , j , xmm1 * factor );
4840  (~C).store( i , j+SIMDSIZE, xmm2 * factor );
4841  (~C).store( i+1UL, j , xmm3 * factor );
4842  (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
4843  }
4844 
      // Remaining single row of the two-vector panel
4845  if( i < M )
4846  {
4847  const size_t kbegin( ( IsUpper<MT4>::value )
4848  ?( ( IsLower<MT5>::value )
4849  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4850  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4851  :( IsLower<MT5>::value ? j : 0UL ) );
4852  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
4853 
4854  SIMDType xmm1, xmm2;
4855 
4856  for( size_t k=kbegin; k<kend; ++k ) {
4857  const SIMDType a1( set( A(i,k) ) );
4858  xmm1 = xmm1 + a1 * B.load(k,j );
4859  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
4860  }
4861 
4862  (~C).store( i, j , xmm1 * factor );
4863  (~C).store( i, j+SIMDSIZE, xmm2 * factor );
4864  }
4865  }
4866 
      // Column panels of a single SIMD vector, two rows at a time
4867  for( ; j<jpos; j+=SIMDSIZE )
4868  {
4869  size_t i( 0UL );
4870 
4871  for( ; (i+2UL) <= M; i+=2UL )
4872  {
4873  const size_t kbegin( ( IsUpper<MT4>::value )
4874  ?( ( IsLower<MT5>::value )
4875  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4876  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4877  :( IsLower<MT5>::value ? j : 0UL ) );
4878  const size_t kend( ( IsLower<MT4>::value )
4879  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4880  :( K ) );
4881 
4882  SIMDType xmm1, xmm2;
4883 
4884  for( size_t k=kbegin; k<kend; ++k ) {
4885  const SIMDType b1( B.load(k,j) );
4886  xmm1 = xmm1 + set( A(i ,k) ) * b1;
4887  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
4888  }
4889 
4890  (~C).store( i , j, xmm1 * factor );
4891  (~C).store( i+1UL, j, xmm2 * factor );
4892  }
4893 
      // Remaining single row of the one-vector panel (inner loop runs up to K)
4894  if( i < M )
4895  {
4896  const size_t kbegin( ( IsUpper<MT4>::value )
4897  ?( ( IsLower<MT5>::value )
4898  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4899  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4900  :( IsLower<MT5>::value ? j : 0UL ) );
4901 
4902  SIMDType xmm1;
4903 
4904  for( size_t k=kbegin; k<K; ++k ) {
4905  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
4906  }
4907 
4908  (~C).store( i, j, xmm1 * factor );
4909  }
4910  }
4911 
      // Element-wise tail for the columns in [jpos,N); only executed if 'remainder' is set
4912  for( ; remainder && j<N; ++j )
4913  {
4914  size_t i( 0UL );
4915 
4916  for( ; (i+2UL) <= M; i+=2UL )
4917  {
4918  const size_t kbegin( ( IsUpper<MT4>::value )
4919  ?( ( IsLower<MT5>::value )
4920  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4921  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4922  :( IsLower<MT5>::value ? j : 0UL ) );
4923  const size_t kend( ( IsLower<MT4>::value )
4924  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4925  :( K ) );
4926 
4927  ElementType value1 = ElementType();
4928  ElementType value2 = ElementType();
4929 
4930  for( size_t k=kbegin; k<kend; ++k ) {
4931  value1 += A(i ,k) * B(k,j);
4932  value2 += A(i+1UL,k) * B(k,j);
4933  }
4934 
4935  (~C)(i ,j) = value1 * scalar;
4936  (~C)(i+1UL,j) = value2 * scalar;
4937  }
4938 
      // Remaining single row of the element-wise tail
4939  if( i < M )
4940  {
4941  const size_t kbegin( ( IsUpper<MT4>::value )
4942  ?( ( IsLower<MT5>::value )
4943  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4944  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4945  :( IsLower<MT5>::value ? j : 0UL ) );
4946 
4947  ElementType value = ElementType();
4948 
4949  for( size_t k=kbegin; k<K; ++k ) {
4950  value += A(i,k) * B(k,j);
4951  }
4952 
4953  (~C)(i,j) = value * scalar;
4954  }
4955  }
4956  }
4957  //**********************************************************************************************
4958 
4959  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
4974  template< typename MT3 // Type of the left-hand side target matrix
4975  , typename MT4 // Type of the left-hand side matrix operand
4976  , typename MT5 // Type of the right-hand side matrix operand
4977  , typename ST2 > // Type of the scalar value
4978  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4979  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
4980  {
4985 
4986  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
4987  const OppositeType_<MT4> tmp( serial( A ) );
4988  assign( ~C, tmp * B * scalar );
4989  }
4990  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
4991  const OppositeType_<MT5> tmp( serial( B ) );
4992  assign( ~C, A * tmp * scalar );
4993  }
4994  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4995  const OppositeType_<MT4> tmp( serial( A ) );
4996  assign( ~C, tmp * B * scalar );
4997  }
4998  else {
4999  const OppositeType_<MT5> tmp( serial( B ) );
5000  assign( ~C, A * tmp * scalar );
5001  }
5002  }
5003  //**********************************************************************************************
5004 
5005  //**Default assignment to dense matrices (large matrices)***************************************
5019  template< typename MT3 // Type of the left-hand side target matrix
5020  , typename MT4 // Type of the left-hand side matrix operand
5021  , typename MT5 // Type of the right-hand side matrix operand
5022  , typename ST2 > // Type of the scalar value
5023  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5024  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5025  {
5026  selectDefaultAssignKernel( C, A, B, scalar );
5027  }
5028  //**********************************************************************************************
5029 
5030  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
5045  template< typename MT3 // Type of the left-hand side target matrix
5046  , typename MT4 // Type of the left-hand side matrix operand
5047  , typename MT5 // Type of the right-hand side matrix operand
5048  , typename ST2 > // Type of the scalar value
5049  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5050  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5051  {
5052  const size_t M( A.rows() );
5053  const size_t N( B.columns() );
5054  const size_t K( A.columns() );
5055 
5056  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5057 
5058  const SIMDType factor( set( scalar ) );
5059 
5060  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
5061  {
5062  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
5063 
5064  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
5065  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5066 
5067  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
5068  {
5069  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
5070 
5071  for( size_t i=ii; i<iend; ++i ) {
5072  for( size_t j=jj; j<jend; ++j ) {
5073  reset( (~C)(i,j) );
5074  }
5075  }
5076 
5077  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
5078  {
5079  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
5080 
5081  size_t j( jj );
5082 
5083  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5084  {
5085  const size_t j1( j+SIMDSIZE );
5086  const size_t j2( j+SIMDSIZE*2UL );
5087  const size_t j3( j+SIMDSIZE*3UL );
5088 
5089  size_t i( ii );
5090 
5091  for( ; (i+2UL) <= iend; i+=2UL )
5092  {
5093  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5094  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5095  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5096  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
5097 
5098  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5099 
5100  for( size_t k=kbegin; k<kend; ++k ) {
5101  const SIMDType a1( set( A(i ,k) ) );
5102  const SIMDType a2( set( A(i+1UL,k) ) );
5103  const SIMDType b1( B.load(k,j ) );
5104  const SIMDType b2( B.load(k,j1) );
5105  const SIMDType b3( B.load(k,j2) );
5106  const SIMDType b4( B.load(k,j3) );
5107  xmm1 = xmm1 + a1 * b1;
5108  xmm2 = xmm2 + a1 * b2;
5109  xmm3 = xmm3 + a1 * b3;
5110  xmm4 = xmm4 + a1 * b4;
5111  xmm5 = xmm5 + a2 * b1;
5112  xmm6 = xmm6 + a2 * b2;
5113  xmm7 = xmm7 + a2 * b3;
5114  xmm8 = xmm8 + a2 * b4;
5115  }
5116 
5117  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5118  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5119  (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
5120  (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
5121  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
5122  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
5123  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
5124  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
5125  }
5126 
5127  if( i < iend )
5128  {
5129  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5130  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5131  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5132  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
5133 
5134  SIMDType xmm1, xmm2, xmm3, xmm4;
5135 
5136  for( size_t k=kbegin; k<kend; ++k ) {
5137  const SIMDType a1( set( A(i,k) ) );
5138  xmm1 = xmm1 + a1 * B.load(k,j );
5139  xmm2 = xmm2 + a1 * B.load(k,j1);
5140  xmm3 = xmm3 + a1 * B.load(k,j2);
5141  xmm4 = xmm4 + a1 * B.load(k,j3);
5142  }
5143 
5144  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5145  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
5146  (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
5147  (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
5148  }
5149  }
5150 
5151  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5152  {
5153  const size_t j1( j+SIMDSIZE );
5154 
5155  size_t i( ii );
5156 
5157  for( ; (i+4UL) <= iend; i+=4UL )
5158  {
5159  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5160  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5161  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5162  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5163 
5164  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5165 
5166  for( size_t k=kbegin; k<kend; ++k ) {
5167  const SIMDType a1( set( A(i ,k) ) );
5168  const SIMDType a2( set( A(i+1UL,k) ) );
5169  const SIMDType a3( set( A(i+2UL,k) ) );
5170  const SIMDType a4( set( A(i+3UL,k) ) );
5171  const SIMDType b1( B.load(k,j ) );
5172  const SIMDType b2( B.load(k,j1) );
5173  xmm1 = xmm1 + a1 * b1;
5174  xmm2 = xmm2 + a1 * b2;
5175  xmm3 = xmm3 + a2 * b1;
5176  xmm4 = xmm4 + a2 * b2;
5177  xmm5 = xmm5 + a3 * b1;
5178  xmm6 = xmm6 + a3 * b2;
5179  xmm7 = xmm7 + a4 * b1;
5180  xmm8 = xmm8 + a4 * b2;
5181  }
5182 
5183  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5184  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5185  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5186  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
5187  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
5188  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
5189  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
5190  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
5191  }
5192 
5193  for( ; (i+2UL) <= iend; i+=2UL )
5194  {
5195  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5196  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5197  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5198  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5199 
5200  SIMDType xmm1, xmm2, xmm3, xmm4;
5201 
5202  for( size_t k=kbegin; k<kend; ++k ) {
5203  const SIMDType a1( set( A(i ,k) ) );
5204  const SIMDType a2( set( A(i+1UL,k) ) );
5205  const SIMDType b1( B.load(k,j ) );
5206  const SIMDType b2( B.load(k,j1) );
5207  xmm1 = xmm1 + a1 * b1;
5208  xmm2 = xmm2 + a1 * b2;
5209  xmm3 = xmm3 + a2 * b1;
5210  xmm4 = xmm4 + a2 * b2;
5211  }
5212 
5213  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5214  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5215  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5216  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
5217  }
5218 
5219  if( i < iend )
5220  {
5221  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5222  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5223  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5224  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5225 
5226  SIMDType xmm1, xmm2;
5227 
5228  for( size_t k=kbegin; k<kend; ++k ) {
5229  const SIMDType a1( set( A(i,k) ) );
5230  xmm1 = xmm1 + a1 * B.load(k,j );
5231  xmm2 = xmm2 + a1 * B.load(k,j1);
5232  }
5233 
5234  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5235  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
5236  }
5237  }
5238 
5239  for( ; j<jpos; j+=SIMDSIZE )
5240  {
5241  for( size_t i=ii; i<iend; ++i )
5242  {
5243  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5244  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5245  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5246  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
5247 
5248  SIMDType xmm1;
5249 
5250  for( size_t k=kbegin; k<kend; ++k ) {
5251  const SIMDType a1( set( A(i,k) ) );
5252  xmm1 = xmm1 + a1 * B.load(k,j);
5253  }
5254 
5255  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
5256  }
5257  }
5258 
5259  for( ; remainder && j<jend; ++j )
5260  {
5261  for( size_t i=ii; i<iend; ++i )
5262  {
5263  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5264  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5265  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5266  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
5267 
5268  ElementType value = ElementType();
5269 
5270  for( size_t k=kbegin; k<kend; ++k ) {
5271  value += A(i,k) * B(k,j);
5272  }
5273 
5274  (~C)(i,j) += value * scalar;
5275  }
5276  }
5277  }
5278  }
5279  }
5280  }
5281  //**********************************************************************************************
5282 
5283  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
// Large-matrix assignment kernel for column-major targets (vectorized default case).
//
// This overload participates only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds and
// the target is passed as DenseMatrix<MT3,true>. It has no blocking logic of its own; the
// call is forwarded unchanged to selectSmallAssignKernel().
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
5297  template< typename MT3 // Type of the left-hand side target matrix
5298  , typename MT4 // Type of the left-hand side matrix operand
5299  , typename MT5 // Type of the right-hand side matrix operand
5300  , typename ST2 > // Type of the scalar value
5301  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5302  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5303  {
      // ~C is passed on instead of C; presumably operator~ unwraps the DenseMatrix base to the
      // concrete MT3 (operator~ is defined outside this chunk -- confirm).
5304  selectSmallAssignKernel( ~C, A, B, scalar );
5305  }
5306  //**********************************************************************************************
5307 
5308  //**BLAS-based assignment to dense matrices (default)*******************************************
// Fallback for the BLAS assignment entry point.
//
// This overload participates whenever UseBlasKernel<MT3,MT4,MT5,ST2> does NOT hold
// (DisableIf_). It performs no BLAS call; it hands the operation to selectLargeAssignKernel().
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
5322  template< typename MT3 // Type of the left-hand side target matrix
5323  , typename MT4 // Type of the left-hand side matrix operand
5324  , typename MT5 // Type of the right-hand side matrix operand
5325  , typename ST2 > // Type of the scalar value
5326  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5327  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5328  {
5329  selectLargeAssignKernel( C, A, B, scalar );
5330  }
5331  //**********************************************************************************************
5332 
5333  //**BLAS-based assignment to dense matrices*****************************************************
5334 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
5335 
// BLAS-backed assignment kernel.
//
// This overload participates only when UseBlasKernel<MT3,MT4,MT5,ST2> holds, and it is compiled
// only inside the enclosing #if (BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION).
// The scalar is converted to the element type of the target (ET) before being handed to the
// BLAS wrappers.
//
// Dispatch:
//  - A triangular:        C is first assigned B, then trmm() is called with A on the left.
//  - else B triangular:   C is first assigned A, then trmm() is called with B on the right.
//  - otherwise:           gemm() is called with A and B.
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
5348  template< typename MT3 // Type of the left-hand side target matrix
5349  , typename MT4 // Type of the left-hand side matrix operand
5350  , typename MT5 // Type of the right-hand side matrix operand
5351  , typename ST2 > // Type of the scalar value
5352  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5353  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5354  {
5355  typedef ElementType_<MT3> ET;
5356 
5357  if( IsTriangular<MT4>::value ) {
      // C is seeded with the non-triangular operand because trmm() receives only C and the
      // triangular matrix; presumably it multiplies in place (confirm in blaze/math/blas/trmm.h).
5358  assign( C, B );
5359  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5360  }
5361  else if( IsTriangular<MT5>::value ) {
5362  assign( C, A );
5363  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5364  }
5365  else {
      // NOTE(review): ET(scalar) / ET(0) look like the usual GEMM alpha / beta factors, i.e.
      // the previous content of C is discarded -- confirm against blaze/math/blas/gemm.h.
5366  gemm( C, A, B, ET(scalar), ET(0) );
5367  }
5368  }
5369 #endif
5370  //**********************************************************************************************
5371 
5372  //**Assignment to sparse matrices***************************************************************
// Assignment of the scaled dense matrix/dense matrix product to a sparse matrix.
//
// This overload participates when CanExploitSymmetry<MT,MT1,MT2> does NOT hold. The expression
// is evaluated via serial() into a temporary and that temporary is assigned to the target.
// The temporary's type is OppositeType when SO is true and ResultType otherwise.
//
// NOTE(review): the listing numbers skip 5389 and 5393-5397 inside this body, so statements
// of the original source may be missing from this extract.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
5384  template< typename MT // Type of the target sparse matrix
5385  , bool SO > // Storage order of the target sparse matrix
5386  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5387  assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5388  {
5390 
5391  typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
5392 
5398  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
5399 
      // Target and expression must already agree in size.
5400  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5401  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5402 
      // Evaluate the whole expression first, then assign the evaluated result to the target.
5403  const TmpType tmp( serial( rhs ) );
5404  assign( ~lhs, tmp );
5405  }
5406  //**********************************************************************************************
5407 
5408  //**Restructuring assignment to column-major matrices*******************************************
// Restructuring assignment of the scaled product to a column-major matrix.
//
// This overload participates only when CanExploitSymmetry<MT,MT1,MT2> holds. Instead of
// evaluating the product directly, it re-issues the assignment with the symmetric operand(s)
// wrapped in trans():
//  - both operands symmetric:  trans(left) * trans(right) * scalar
//  - only the left symmetric:  trans(left) * right        * scalar
//  - otherwise:                left        * trans(right) * scalar
// NOTE(review): the last branch transposes the right operand without testing
// IsSymmetric<MT2>; presumably CanExploitSymmetry guarantees that at least one operand is
// symmetric -- confirm at its definition (outside this chunk).
//
// NOTE(review): the listing numbers skip 5426 and 5428 inside this body, so statements of the
// original source may be missing from this extract.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
5422  template< typename MT > // Type of the target matrix
5423  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5424  assign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
5425  {
5427 
5429 
5430  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5431  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5432 
      // Operands of the wrapped matrix/matrix multiplication expression.
5433  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5434  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
5435 
5436  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
5437  assign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
5438  else if( IsSymmetric<MT1>::value )
5439  assign( ~lhs, trans( left ) * right * rhs.scalar_ );
5440  else
5441  assign( ~lhs, left * trans( right ) * rhs.scalar_ );
5442  }
5443  //**********************************************************************************************
5444 
5445  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the scaled dense matrix/dense matrix product to a dense matrix.
//
// This overload participates when CanExploitSymmetry<MT,MT1,MT2> does NOT hold. It returns
// early when the target has no rows or no columns, or when the left operand has no columns.
// Otherwise both operands are evaluated via serial() into the operand types LT and RT
// (declared outside this chunk) and the work is delegated to selectAddAssignKernel().
//
// NOTE(review): listing number 5462 is skipped inside this body, so a statement of the
// original source may be missing from this extract.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
5457  template< typename MT // Type of the target dense matrix
5458  , bool SO > // Storage order of the target dense matrix
5459  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5460  addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5461  {
5463 
5464  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5465  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5466 
5467  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5468  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
5469 
      // Nothing to add if the result is empty or the inner dimension is zero.
5470  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
5471  return;
5472  }
5473 
5474  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5475  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5476 
      // The evaluated operands must keep the operand sizes and match the target.
5477  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5478  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5479  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5480  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5481  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5482  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
5483 
5484  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
5485  }
5486  //**********************************************************************************************
5487 
5488  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment C += scalar * A * B.
//
// The small-matrix kernel is used when the right operand type is diagonal (IsDiagonal<MT5>)
// or when the number of target elements (rows * columns) is below DMATDMATMULT_THRESHOLD.
// In every other case the BLAS entry point selectBlasAddAssignKernel() is used.
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
5499  template< typename MT3 // Type of the left-hand side target matrix
5500  , typename MT4 // Type of the left-hand side matrix operand
5501  , typename MT5 // Type of the right-hand side matrix operand
5502  , typename ST2 > // Type of the scalar value
5503  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5504  {
5505  if( ( IsDiagonal<MT5>::value ) ||
5506  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5507  selectSmallAddAssignKernel( C, A, B, scalar );
5508  else
5509  selectBlasAddAssignKernel( C, A, B, scalar );
5510  }
5511  //**********************************************************************************************
5512 
5513  //**Default addition assignment to dense matrices (general/general)*****************************
// Default addition assignment kernel for two non-diagonal operands.
//
// This overload participates when neither MT4 nor MT5 is diagonal. The scaled product
// A * B * scalar is evaluated via serial() into a ResultType temporary, which is then added
// to C with addAssign().
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
5527  template< typename MT3 // Type of the left-hand side target matrix
5528  , typename MT4 // Type of the left-hand side matrix operand
5529  , typename MT5 // Type of the right-hand side matrix operand
5530  , typename ST2 > // Type of the scalar value
5531  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
5532  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5533  {
5534  const ResultType tmp( serial( A * B * scalar ) );
5535  addAssign( C, tmp );
5536  }
5537  //**********************************************************************************************
5538 
5539  //**Default addition assignment to dense matrices (general/diagonal)****************************
// Default addition assignment kernel for a non-diagonal left and a diagonal right operand.
//
// With a diagonal B only B(j,j) contributes to column j, so every target element is updated
// as C(i,j) += A(i,j) * B(j,j) * scalar. The column range of each row is narrowed according
// to the IsUpper / IsLower / IsStrictlyUpper / IsStrictlyLower traits of A.
//
// NOTE(review): listing number 5560 is skipped inside this body, so a statement of the
// original source may be missing from this extract.
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
5553  template< typename MT3 // Type of the left-hand side target matrix
5554  , typename MT4 // Type of the left-hand side matrix operand
5555  , typename MT5 // Type of the right-hand side matrix operand
5556  , typename ST2 > // Type of the scalar value
5557  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
5558  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5559  {
5561 
5562  const size_t M( A.rows() );
5563  const size_t N( B.columns() );
5564 
5565  for( size_t i=0UL; i<M; ++i )
5566  {
      // First column of row i: i (i+1 if strictly upper) for an upper A, else 0.
5567  const size_t jbegin( ( IsUpper<MT4>::value )
5568  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5569  :( 0UL ) );
      // One past the last column of row i: i+1 (i if strictly lower) for a lower A, else N.
5570  const size_t jend( ( IsLower<MT4>::value )
5571  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
5572  :( N ) );
5573  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5574 
      // jpos ends the two-way unrolled part: the column count is rounded down to an even
      // number by clearing its lowest bit (size_t(-2) has every bit set except bit 0).
5575  const size_t jnum( jend - jbegin );
5576  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
5577 
5578  for( size_t j=jbegin; j<jpos; j+=2UL ) {
5579  C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5580  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
5581  }
      // Left-over column when the count is odd.
5582  if( jpos < jend ) {
5583  C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
5584  }
5585  }
5586  }
5587  //**********************************************************************************************
5588 
5589  //**Default addition assignment to dense matrices (diagonal/general)****************************
// Default addition assignment kernel for a diagonal left and a non-diagonal right operand.
//
// With a diagonal A only A(i,i) contributes to row i, so every target element is updated as
// C(i,j) += A(i,i) * B(i,j) * scalar. The column range of each row is narrowed according to
// the IsUpper / IsLower / IsStrictlyUpper / IsStrictlyLower traits of B.
//
// NOTE(review): listing number 5610 is skipped inside this body, so a statement of the
// original source may be missing from this extract.
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side (diagonal) multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
5603  template< typename MT3 // Type of the left-hand side target matrix
5604  , typename MT4 // Type of the left-hand side matrix operand
5605  , typename MT5 // Type of the right-hand side matrix operand
5606  , typename ST2 > // Type of the scalar value
5607  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
5608  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5609  {
5611 
5612  const size_t M( A.rows() );
5613  const size_t N( B.columns() );
5614 
5615  for( size_t i=0UL; i<M; ++i )
5616  {
      // First column of row i: i (i+1 if strictly upper) for an upper B, else 0.
5617  const size_t jbegin( ( IsUpper<MT5>::value )
5618  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
5619  :( 0UL ) );
      // One past the last column of row i: i+1 (i if strictly lower) for a lower B, else N.
5620  const size_t jend( ( IsLower<MT5>::value )
5621  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
5622  :( N ) );
5623  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5624 
      // jpos ends the two-way unrolled part (column count rounded down to an even number).
5625  const size_t jnum( jend - jbegin );
5626  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
5627 
5628  for( size_t j=jbegin; j<jpos; j+=2UL ) {
5629  C(i,j ) += A(i,i) * B(i,j ) * scalar;
5630  C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
5631  }
      // Left-over column when the count is odd.
5632  if( jpos < jend ) {
5633  C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
5634  }
5635  }
5636  }
5637  //**********************************************************************************************
5638 
5639  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default addition assignment kernel for two diagonal operands.
//
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every row of A.
//
// NOTE(review): listing number 5660 is skipped inside this body, so a statement of the
// original source may be missing from this extract.
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side (diagonal) multiplication operand.
// \param B      The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
5653  template< typename MT3 // Type of the left-hand side target matrix
5654  , typename MT4 // Type of the left-hand side matrix operand
5655  , typename MT5 // Type of the right-hand side matrix operand
5656  , typename ST2 > // Type of the scalar value
5657  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
5658  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5659  {
5661 
5662  for( size_t i=0UL; i<A.rows(); ++i ) {
5663  C(i,i) += A(i,i) * B(i,i) * scalar;
5664  }
5665  }
5666  //**********************************************************************************************
5667 
5668  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix addition assignment kernel, non-vectorized fallback.
//
// This overload participates whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT
// hold. It forwards to selectDefaultAddAssignKernel().
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
5682  template< typename MT3 // Type of the left-hand side target matrix
5683  , typename MT4 // Type of the left-hand side matrix operand
5684  , typename MT5 // Type of the right-hand side matrix operand
5685  , typename ST2 > // Type of the scalar value
5686  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5687  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5688  {
5689  selectDefaultAddAssignKernel( C, A, B, scalar );
5690  }
5691  //**********************************************************************************************
5692 
5693  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized small-matrix addition assignment kernel for row-major targets:
// C += scalar * A * B.
//
// This overload participates only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds and
// the target is passed as DenseMatrix<MT3,false>. The columns of C are processed in
// progressively narrower tiles, each loop picking up where the previous one stopped:
//   1. 8 SIMD vectors wide, one row at a time
//   2. 4 SIMD vectors wide, two rows at a time (plus a single-row tail)
//   3. 2 SIMD vectors wide, two rows at a time (plus a single-row tail)
//   4. 1 SIMD vector wide,  two rows at a time (plus a single-row tail)
//   5. scalar remainder columns, two rows at a time (plus a single-row tail)
// In every tile the products are accumulated over k in local accumulators, and the sum is
// scaled by the scalar once before being added to the existing values of C.
//
// The k-range [kbegin,kend) of each tile is narrowed using the IsUpper / IsLower /
// IsStrictlyUpper / IsStrictlyLower traits of A (MT4) and B (MT5).
//
// NOTE(review): the SIMD accumulators (xmm*) are declared without an initializer and are
// read in the first iteration; this relies on SIMDType's default constructor producing a
// zero vector -- confirm in the SIMD type definitions (outside this chunk).
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
5708  template< typename MT3 // Type of the left-hand side target matrix
5709  , typename MT4 // Type of the left-hand side matrix operand
5710  , typename MT5 // Type of the right-hand side matrix operand
5711  , typename ST2 > // Type of the scalar value
5712  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5713  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5714  {
5715  const size_t M( A.rows() );
5716  const size_t N( B.columns() );
5717  const size_t K( A.columns() );
5718 
      // A scalar remainder loop is needed unless both the target type and the B operand type
      // are padded.
5719  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5720 
      // End of the SIMD-processed columns: N rounded down to a multiple of SIMDSIZE when a
      // remainder loop is needed (the assertion below checks this rounding), else N itself.
5721  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
5722  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5723 
      // SIMD vector built from the scalar via set(); presumably a broadcast to all lanes.
5724  const SIMDType factor( set( scalar ) );
5725 
5726  size_t j( 0UL );
5727 
      // Tile 1: eight SIMD vectors per row, one row per iteration.
5728  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5729  for( size_t i=0UL; i<M; ++i )
5730  {
      // Lower k bound: i (i+1 if strictly upper) when A is upper, j when B is lower, and
      // the larger of the two when both apply; 0 otherwise.
5731  const size_t kbegin( ( IsUpper<MT4>::value )
5732  ?( ( IsLower<MT5>::value )
5733  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5734  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5735  :( IsLower<MT5>::value ? j : 0UL ) );
      // Upper k bound: i+1 (i if strictly lower) when A is lower, the end of the column
      // tile (capped at K) when B is upper, and the smaller of the two when both apply;
      // K otherwise.
5736  const size_t kend( ( IsLower<MT4>::value )
5737  ?( ( IsUpper<MT5>::value )
5738  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5739  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
5740  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
5741 
5742  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5743 
5744  for( size_t k=kbegin; k<kend; ++k ) {
5745  const SIMDType a1( set( A(i,k) ) );
5746  xmm1 = xmm1 + a1 * B.load(k,j );
5747  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
5748  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
5749  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
5750  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
5751  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
5752  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
5753  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
5754  }
5755 
      // Scale the accumulated sums and add them to the current content of C.
5756  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5757  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
5758  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
5759  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
5760  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
5761  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
5762  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
5763  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
5764  }
5765  }
5766 
      // Tile 2: four SIMD vectors per row, two rows per iteration.
5767  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5768  {
5769  size_t i( 0UL );
5770 
5771  for( ; (i+2UL) <= M; i+=2UL )
5772  {
5773  const size_t kbegin( ( IsUpper<MT4>::value )
5774  ?( ( IsLower<MT5>::value )
5775  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5776  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5777  :( IsLower<MT5>::value ? j : 0UL ) );
      // The row-based upper bound is taken from row i+1 (the second row of the pair).
5778  const size_t kend( ( IsLower<MT4>::value )
5779  ?( ( IsUpper<MT5>::value )
5780  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5781  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5782  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
5783 
      // xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1.
5784  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5785 
5786  for( size_t k=kbegin; k<kend; ++k ) {
5787  const SIMDType a1( set( A(i ,k) ) );
5788  const SIMDType a2( set( A(i+1UL,k) ) );
5789  const SIMDType b1( B.load(k,j ) );
5790  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5791  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5792  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5793  xmm1 = xmm1 + a1 * b1;
5794  xmm2 = xmm2 + a1 * b2;
5795  xmm3 = xmm3 + a1 * b3;
5796  xmm4 = xmm4 + a1 * b4;
5797  xmm5 = xmm5 + a2 * b1;
5798  xmm6 = xmm6 + a2 * b2;
5799  xmm7 = xmm7 + a2 * b3;
5800  xmm8 = xmm8 + a2 * b4;
5801  }
5802 
5803  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5804  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
5805  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
5806  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
5807  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
5808  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
5809  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
5810  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
5811  }
5812 
      // Single-row tail when M is odd. The upper k bound here depends only on B; it is not
      // tightened by IsLower<MT4>.
5813  if( i < M )
5814  {
5815  const size_t kbegin( ( IsUpper<MT4>::value )
5816  ?( ( IsLower<MT5>::value )
5817  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5818  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5819  :( IsLower<MT5>::value ? j : 0UL ) );
5820  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5821 
5822  SIMDType xmm1, xmm2, xmm3, xmm4;
5823 
5824  for( size_t k=kbegin; k<kend; ++k ) {
5825  const SIMDType a1( set( A(i,k) ) );
5826  xmm1 = xmm1 + a1 * B.load(k,j );
5827  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
5828  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
5829  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
5830  }
5831 
5832  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5833  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
5834  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
5835  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
5836  }
5837  }
5838 
      // Tile 3: two SIMD vectors per row, two rows per iteration.
5839  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5840  {
5841  size_t i( 0UL );
5842 
5843  for( ; (i+2UL) <= M; i+=2UL )
5844  {
5845  const size_t kbegin( ( IsUpper<MT4>::value )
5846  ?( ( IsLower<MT5>::value )
5847  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5848  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5849  :( IsLower<MT5>::value ? j : 0UL ) );
5850  const size_t kend( ( IsLower<MT4>::value )
5851  ?( ( IsUpper<MT5>::value )
5852  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
5853  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5854  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5855 
      // xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1.
5856  SIMDType xmm1, xmm2, xmm3, xmm4;
5857 
5858  for( size_t k=kbegin; k<kend; ++k ) {
5859  const SIMDType a1( set( A(i ,k) ) );
5860  const SIMDType a2( set( A(i+1UL,k) ) );
5861  const SIMDType b1( B.load(k,j ) );
5862  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5863  xmm1 = xmm1 + a1 * b1;
5864  xmm2 = xmm2 + a1 * b2;
5865  xmm3 = xmm3 + a2 * b1;
5866  xmm4 = xmm4 + a2 * b2;
5867  }
5868 
5869  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5870  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
5871  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5872  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
5873  }
5874 
      // Single-row tail when M is odd.
5875  if( i < M )
5876  {
5877  const size_t kbegin( ( IsUpper<MT4>::value )
5878  ?( ( IsLower<MT5>::value )
5879  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5880  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5881  :( IsLower<MT5>::value ? j : 0UL ) );
5882  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
5883 
5884  SIMDType xmm1, xmm2;
5885 
5886  for( size_t k=kbegin; k<kend; ++k ) {
5887  const SIMDType a1( set( A(i,k) ) );
5888  xmm1 = xmm1 + a1 * B.load(k,j );
5889  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
5890  }
5891 
5892  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5893  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + xmm2 * factor );
5894  }
5895  }
5896 
      // Tile 4: one SIMD vector per row, two rows per iteration. The upper k bound is no
      // longer limited by the column tile, only by IsLower<MT4>.
5897  for( ; j<jpos; j+=SIMDSIZE )
5898  {
5899  size_t i( 0UL );
5900 
5901  for( ; (i+2UL) <= M; i+=2UL )
5902  {
5903  const size_t kbegin( ( IsUpper<MT4>::value )
5904  ?( ( IsLower<MT5>::value )
5905  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5906  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5907  :( IsLower<MT5>::value ? j : 0UL ) );
5908  const size_t kend( ( IsLower<MT4>::value )
5909  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5910  :( K ) );
5911 
5912  SIMDType xmm1, xmm2;
5913 
5914  for( size_t k=kbegin; k<kend; ++k ) {
5915  const SIMDType b1( B.load(k,j) );
5916  xmm1 = xmm1 + set( A(i ,k) ) * b1;
5917  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
5918  }
5919 
5920  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5921  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
5922  }
5923 
      // Single-row tail when M is odd; k always runs up to K here.
5924  if( i < M )
5925  {
5926  const size_t kbegin( ( IsUpper<MT4>::value )
5927  ?( ( IsLower<MT5>::value )
5928  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5929  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5930  :( IsLower<MT5>::value ? j : 0UL ) );
5931 
5932  SIMDType xmm1;
5933 
5934  for( size_t k=kbegin; k<K; ++k ) {
5935  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
5936  }
5937 
5938  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
5939  }
5940  }
5941 
      // Tile 5: scalar processing of the columns not covered by the SIMD loops; runs only
      // when 'remainder' is set. Accumulators start from a value-initialized ElementType.
5942  for( ; remainder && j<N; ++j )
5943  {
5944  size_t i( 0UL );
5945 
5946  for( ; (i+2UL) <= M; i+=2UL )
5947  {
5948  const size_t kbegin( ( IsUpper<MT4>::value )
5949  ?( ( IsLower<MT5>::value )
5950  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5951  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5952  :( IsLower<MT5>::value ? j : 0UL ) );
5953  const size_t kend( ( IsLower<MT4>::value )
5954  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5955  :( K ) );
5956 
5957  ElementType value1 = ElementType();
5958  ElementType value2 = ElementType();
5959 
5960  for( size_t k=kbegin; k<kend; ++k ) {
5961  value1 += A(i ,k) * B(k,j);
5962  value2 += A(i+1UL,k) * B(k,j);
5963  }
5964 
5965  (~C)(i ,j) += value1 * scalar;
5966  (~C)(i+1UL,j) += value2 * scalar;
5967  }
5968 
      // Single-row tail when M is odd.
5969  if( i < M )
5970  {
5971  const size_t kbegin( ( IsUpper<MT4>::value )
5972  ?( ( IsLower<MT5>::value )
5973  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5974  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5975  :( IsLower<MT5>::value ? j : 0UL ) );
5976 
5977  ElementType value = ElementType();
5978 
5979  for( size_t k=kbegin; k<K; ++k ) {
5980  value += A(i,k) * B(k,j);
5981  }
5982 
5983  (~C)(i,j) += value * scalar;
5984  }
5985  }
5986  }
5987  //**********************************************************************************************
5988 
5989  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized small-matrix addition assignment kernel for column-major targets.
//
// This overload participates only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds and
// the target is passed as DenseMatrix<MT3,true>. It has no loop nest of its own: one operand
// is evaluated via serial() into a temporary of its OppositeType and the addition assignment
// is re-issued with that temporary in place of the operand.
//
// Choice of the operand to convert:
//  - A not resizable, B resizable:  A is converted.
//  - A resizable, B not resizable:  B is converted.
//  - otherwise:                     the operand with fewer elements (rows * columns) is
//                                   converted; A on a tie.
// NOTE(review): preferring the non-resizable operand presumably keeps the temporary cheap
// (no dynamic allocation) -- the rationale is not visible in this chunk.
//
// NOTE(review): the listing numbers skip 6011-6014 inside this body, so statements of the
// original source may be missing from this extract.
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
6004  template< typename MT3 // Type of the left-hand side target matrix
6005  , typename MT4 // Type of the left-hand side matrix operand
6006  , typename MT5 // Type of the right-hand side matrix operand
6007  , typename ST2 > // Type of the scalar value
6008  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6009  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6010  {
6015 
6016  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
6017  const OppositeType_<MT4> tmp( serial( A ) );
6018  addAssign( ~C, tmp * B * scalar );
6019  }
6020  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
6021  const OppositeType_<MT5> tmp( serial( B ) );
6022  addAssign( ~C, A * tmp * scalar );
6023  }
      // Same resizability on both sides: convert the smaller operand.
6024  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6025  const OppositeType_<MT4> tmp( serial( A ) );
6026  addAssign( ~C, tmp * B * scalar );
6027  }
6028  else {
6029  const OppositeType_<MT5> tmp( serial( B ) );
6030  addAssign( ~C, A * tmp * scalar );
6031  }
6032  }
6033  //**********************************************************************************************
6034 
6035  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix addition assignment kernel, non-vectorized fallback.
//
// This overload participates whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT
// hold. It forwards to selectDefaultAddAssignKernel().
//
// \param C      The target left-hand side dense matrix.
// \param A      The left-hand side multiplication operand.
// \param B      The right-hand side multiplication operand.
// \param scalar The scaling factor.
6049  template< typename MT3 // Type of the left-hand side target matrix
6050  , typename MT4 // Type of the left-hand side matrix operand
6051  , typename MT5 // Type of the right-hand side matrix operand
6052  , typename ST2 > // Type of the scalar value
6053  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6054  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6055  {
6056  selectDefaultAddAssignKernel( C, A, B, scalar );
6057  }
6058  //**********************************************************************************************
6059 
6060  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
6075  template< typename MT3 // Type of the left-hand side target matrix
6076  , typename MT4 // Type of the left-hand side matrix operand
6077  , typename MT5 // Type of the right-hand side matrix operand
6078  , typename ST2 > // Type of the scalar value
6079  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6080  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6081  {
6082  const size_t M( A.rows() );
6083  const size_t N( B.columns() );
6084  const size_t K( A.columns() );
6085 
6086  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
6087 
6088  const SIMDType factor( set( scalar ) );
6089 
6090  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
6091  {
6092  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
6093 
6094  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
6095  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
6096 
6097  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
6098  {
6099  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
6100 
6101  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
6102  {
6103  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
6104 
6105  size_t j( jj );
6106 
6107  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6108  {
6109  const size_t j1( j+SIMDSIZE );
6110  const size_t j2( j+SIMDSIZE*2UL );
6111  const size_t j3( j+SIMDSIZE*3UL );
6112 
6113  size_t i( ii );
6114 
6115  for( ; (i+2UL) <= iend; i+=2UL )
6116  {
6117  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6118  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6119  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
6120  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
6121 
6122  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6123 
6124  for( size_t k=kbegin; k<kend; ++k ) {
6125  const SIMDType a1( set( A(i ,k) ) );
6126  const SIMDType a2( set( A(i+1UL,k) ) );
6127  const SIMDType b1( B.load(k,j ) );
6128  const SIMDType b2( B.load(k,j1) );
6129  const SIMDType b3( B.load(k,j2) );
6130  const SIMDType b4( B.load(k,j3) );
6131  xmm1 = xmm1 + a1 * b1;
6132  xmm2 = xmm2 + a1 * b2;
6133  xmm3 = xmm3 + a1 * b3;
6134  xmm4 = xmm4 + a1 * b4;
6135  xmm5 = xmm5 + a2 * b1;
6136  xmm6 = xmm6 + a2 * b2;
6137  xmm7 = xmm7 + a2 * b3;
6138  xmm8 = xmm8 + a2 * b4;
6139  }
6140 
6141  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6142  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6143  (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
6144  (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
6145  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
6146  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
6147  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
6148  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
6149  }
6150 
6151  if( i < iend )
6152  {
6153  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6154  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6155  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6156  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
6157 
6158  SIMDType xmm1, xmm2, xmm3, xmm4;
6159 
6160  for( size_t k=kbegin; k<kend; ++k ) {
6161  const SIMDType a1( set( A(i,k) ) );
6162  xmm1 = xmm1 + a1 * B.load(k,j );
6163  xmm2 = xmm2 + a1 * B.load(k,j1);
6164  xmm3 = xmm3 + a1 * B.load(k,j2);
6165  xmm4 = xmm4 + a1 * B.load(k,j3);
6166  }
6167 
6168  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6169  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
6170  (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
6171  (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
6172  }
6173  }
6174 
6175  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6176  {
6177  const size_t j1( j+SIMDSIZE );
6178 
6179  size_t i( ii );
6180 
6181  for( ; (i+4UL) <= iend; i+=4UL )
6182  {
6183  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6184  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6185  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
6186  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
6187 
6188  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6189 
6190  for( size_t k=kbegin; k<kend; ++k ) {
6191  const SIMDType a1( set( A(i ,k) ) );
6192  const SIMDType a2( set( A(i+1UL,k) ) );
6193  const SIMDType a3( set( A(i+2UL,k) ) );
6194  const SIMDType a4( set( A(i+3UL,k) ) );
6195  const SIMDType b1( B.load(k,j ) );
6196  const SIMDType b2( B.load(k,j1) );
6197  xmm1 = xmm1 + a1 * b1;
6198  xmm2 = xmm2 + a1 * b2;
6199  xmm3 = xmm3 + a2 * b1;
6200  xmm4 = xmm4 + a2 * b2;
6201  xmm5 = xmm5 + a3 * b1;
6202  xmm6 = xmm6 + a3 * b2;
6203  xmm7 = xmm7 + a4 * b1;
6204  xmm8 = xmm8 + a4 * b2;
6205  }
6206 
6207  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6208  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6209  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6210  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
6211  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
6212  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
6213  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
6214  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
6215  }
6216 
6217  for( ; (i+2UL) <= iend; i+=2UL )
6218  {
6219  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6220  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6221  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
6222  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
6223 
6224  SIMDType xmm1, xmm2, xmm3, xmm4;
6225 
6226  for( size_t k=kbegin; k<kend; ++k ) {
6227  const SIMDType a1( set( A(i ,k) ) );
6228  const SIMDType a2( set( A(i+1UL,k) ) );
6229  const SIMDType b1( B.load(k,j ) );
6230  const SIMDType b2( B.load(k,j1) );
6231  xmm1 = xmm1 + a1 * b1;
6232  xmm2 = xmm2 + a1 * b2;
6233  xmm3 = xmm3 + a2 * b1;
6234  xmm4 = xmm4 + a2 * b2;
6235  }
6236 
6237  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6238  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6239  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6240  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
6241  }
6242 
6243  if( i < iend )
6244  {
6245  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6246  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6247  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6248  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
6249 
6250  SIMDType xmm1, xmm2;
6251 
6252  for( size_t k=kbegin; k<kend; ++k ) {
6253  const SIMDType a1( set( A(i,k) ) );
6254  xmm1 = xmm1 + a1 * B.load(k,j );
6255  xmm2 = xmm2 + a1 * B.load(k,j1);
6256  }
6257 
6258  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6259  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
6260  }
6261  }
6262 
6263  for( ; j<jpos; j+=SIMDSIZE )
6264  {
6265  for( size_t i=ii; i<iend; ++i )
6266  {
6267  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6268  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6269  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6270  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
6271 
6272  SIMDType xmm1;
6273 
6274  for( size_t k=kbegin; k<kend; ++k ) {
6275  const SIMDType a1( set( A(i,k) ) );
6276  xmm1 = xmm1 + a1 * B.load(k,j);
6277  }
6278 
6279  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
6280  }
6281  }
6282 
6283  for( ; remainder && j<jend; ++j )
6284  {
6285  for( size_t i=ii; i<iend; ++i )
6286  {
6287  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6288  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6289  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6290  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
6291 
6292  ElementType value = ElementType();
6293 
6294  for( size_t k=kbegin; k<kend; ++k ) {
6295  value += A(i,k) * B(k,j);
6296  }
6297 
6298  (~C)(i,j) += value * scalar;
6299  }
6300  }
6301  }
6302  }
6303  }
6304  }
6305  //**********************************************************************************************
6306 
6307  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
6321  template< typename MT3 // Type of the left-hand side target matrix
6322  , typename MT4 // Type of the left-hand side matrix operand
6323  , typename MT5 // Type of the right-hand side matrix operand
6324  , typename ST2 > // Type of the scalar value
6325  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6326  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6327  {
6328  selectSmallAddAssignKernel( ~C, A, B, scalar );
6329  }
6330  //**********************************************************************************************
6331 
6332  //**BLAS-based addition assignment to dense matrices (default)**********************************
6346  template< typename MT3 // Type of the left-hand side target matrix
6347  , typename MT4 // Type of the left-hand side matrix operand
6348  , typename MT5 // Type of the right-hand side matrix operand
6349  , typename ST2 > // Type of the scalar value
6350  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6351  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6352  {
6353  selectLargeAddAssignKernel( C, A, B, scalar );
6354  }
6355  //**********************************************************************************************
6356 
6357  //**BLAS-based addition assignment to dense matrices********************************************
6358 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6359 
6372  template< typename MT3 // Type of the left-hand side target matrix
6373  , typename MT4 // Type of the left-hand side matrix operand
6374  , typename MT5 // Type of the right-hand side matrix operand
6375  , typename ST2 > // Type of the scalar value
6376  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6377  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6378  {
6379  typedef ElementType_<MT3> ET;
6380 
6381  if( IsTriangular<MT4>::value ) {
6382  ResultType_<MT3> tmp( serial( B ) );
6383  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6384  addAssign( C, tmp );
6385  }
6386  else if( IsTriangular<MT5>::value ) {
6387  ResultType_<MT3> tmp( serial( A ) );
6388  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6389  addAssign( C, tmp );
6390  }
6391  else {
6392  gemm( C, A, B, ET(scalar), ET(1) );
6393  }
6394  }
6395 #endif
6396  //**********************************************************************************************
6397 
6398  //**Restructuring addition assignment to column-major matrices**********************************
6412  template< typename MT > // Type of the target matrix
6413  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
6414  addAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
6415  {
6417 
6419 
6420  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6421  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6422 
6423  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6424  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6425 
6426  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
6427  addAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
6428  else if( IsSymmetric<MT1>::value )
6429  addAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
6430  else
6431  addAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
6432  }
6433  //**********************************************************************************************
6434 
6435  //**Addition assignment to sparse matrices******************************************************
6436  // No special implementation for the addition assignment to sparse matrices.
6437  //**********************************************************************************************
6438 
6439  //**Subtraction assignment to dense matrices****************************************************
6451  template< typename MT // Type of the target dense matrix
6452  , bool SO > // Storage order of the target dense matrix
6453  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
6454  subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6455  {
6457 
6458  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6459  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6460 
6461  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6462  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6463 
6464  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6465  return;
6466  }
6467 
6468  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6469  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6470 
6471  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6472  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6473  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6474  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6475  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6476  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6477 
6478  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
6479  }
6480  //**********************************************************************************************
6481 
6482  //**Subtraction assignment to dense matrices (kernel selection)*********************************
6493  template< typename MT3 // Type of the left-hand side target matrix
6494  , typename MT4 // Type of the left-hand side matrix operand
6495  , typename MT5 // Type of the right-hand side matrix operand
6496  , typename ST2 > // Type of the scalar value
6497  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6498  {
6499  if( ( IsDiagonal<MT5>::value ) ||
6500  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6501  selectSmallSubAssignKernel( C, A, B, scalar );
6502  else
6503  selectBlasSubAssignKernel( C, A, B, scalar );
6504  }
6505  //**********************************************************************************************
6506 
6507  //**Default subtraction assignment to dense matrices (general/general)**************************
6521  template< typename MT3 // Type of the left-hand side target matrix
6522  , typename MT4 // Type of the left-hand side matrix operand
6523  , typename MT5 // Type of the right-hand side matrix operand
6524  , typename ST2 > // Type of the scalar value
6525  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6526  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6527  {
6528  const ResultType tmp( serial( A * B * scalar ) );
6529  subAssign( C, tmp );
6530  }
6531  //**********************************************************************************************
6532 
6533  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
6547  template< typename MT3 // Type of the left-hand side target matrix
6548  , typename MT4 // Type of the left-hand side matrix operand
6549  , typename MT5 // Type of the right-hand side matrix operand
6550  , typename ST2 > // Type of the scalar value
6551  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6552  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6553  {
6555 
6556  const size_t M( A.rows() );
6557  const size_t N( B.columns() );
6558 
6559  for( size_t i=0UL; i<M; ++i )
6560  {
6561  const size_t jbegin( ( IsUpper<MT4>::value )
6562  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6563  :( 0UL ) );
6564  const size_t jend( ( IsLower<MT4>::value )
6565  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
6566  :( N ) );
6567  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6568 
6569  const size_t jnum( jend - jbegin );
6570  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6571 
6572  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6573  C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6574  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6575  }
6576  if( jpos < jend ) {
6577  C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
6578  }
6579  }
6580  }
6581  //**********************************************************************************************
6582 
6583  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
6597  template< typename MT3 // Type of the left-hand side target matrix
6598  , typename MT4 // Type of the left-hand side matrix operand
6599  , typename MT5 // Type of the right-hand side matrix operand
6600  , typename ST2 > // Type of the scalar value
6601  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6602  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6603  {
6605 
6606  const size_t M( A.rows() );
6607  const size_t N( B.columns() );
6608 
6609  for( size_t i=0UL; i<M; ++i )
6610  {
6611  const size_t jbegin( ( IsUpper<MT5>::value )
6612  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6613  :( 0UL ) );
6614  const size_t jend( ( IsLower<MT5>::value )
6615  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
6616  :( N ) );
6617  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6618 
6619  const size_t jnum( jend - jbegin );
6620  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6621 
6622  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6623  C(i,j ) -= A(i,i) * B(i,j ) * scalar;
6624  C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
6625  }
6626  if( jpos < jend ) {
6627  C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
6628  }
6629  }
6630  }
6631  //**********************************************************************************************
6632 
6633  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
6647  template< typename MT3 // Type of the left-hand side target matrix
6648  , typename MT4 // Type of the left-hand side matrix operand
6649  , typename MT5 // Type of the right-hand side matrix operand
6650  , typename ST2 > // Type of the scalar value
6651  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6652  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6653  {
6655 
6656  for( size_t i=0UL; i<A.rows(); ++i ) {
6657  C(i,i) -= A(i,i) * B(i,i) * scalar;
6658  }
6659  }
6660  //**********************************************************************************************
6661 
6662  //**Default subtraction assignment to dense matrices (small matrices)***************************
6676  template< typename MT3 // Type of the left-hand side target matrix
6677  , typename MT4 // Type of the left-hand side matrix operand
6678  , typename MT5 // Type of the right-hand side matrix operand
6679  , typename ST2 > // Type of the scalar value
6680  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6681  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6682  {
6683  selectDefaultSubAssignKernel( C, A, B, scalar );
6684  }
6685  //**********************************************************************************************
6686 
6687  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
/*!\brief Vectorized default subtraction assignment of a scaled dense matrix-dense matrix
//        multiplication to a row-major dense matrix (small matrices).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// For every target element the kernel accumulates the products A(i,k)*B(k,j) over a range of k
// and subtracts the accumulated value times \a scalar from C(i,j). The columns of C are
// processed in chunks of 8, 4, 2 and finally 1 SIMD vector(s); the chunks of 4, 2 and 1 vectors
// handle two rows per iteration plus a single leftover row. If C or B is not padded, the
// columns at and beyond jpos are processed element by element in a trailing scalar loop.
// The k range of each accumulation is narrowed according to the IsUpper, IsLower,
// IsStrictlyUpper and IsStrictlyLower traits of the two operand types.
*/
6702  template< typename MT3 // Type of the left-hand side target matrix
6703  , typename MT4 // Type of the left-hand side matrix operand
6704  , typename MT5 // Type of the right-hand side matrix operand
6705  , typename ST2 > // Type of the scalar value
6706  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6707  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6708  {
      // M x N is the extent of the result that is written, K is the inner (summation) dimension
6709  const size_t M( A.rows() );
6710  const size_t N( B.columns() );
6711  const size_t K( A.columns() );
6712 
      // The scalar tail loop at the end is only active if C or B is not padded
6713  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
6714 
      // End of the column range handled by the SIMD loops: with a remainder N is reduced to
      // N - ( N % SIMDSIZE ) (see the assertion), otherwise all N columns are vectorized
6715  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
6716  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
6717 
      // SIMD value built from the scalar via set() (presumably a broadcast - confirm)
6718  const SIMDType factor( set( scalar ) );
6719 
      // Column index shared by all following loops; each loop continues where the previous stopped
6720  size_t j( 0UL );
6721 
      // Chunks of eight SIMD vectors, one row of C per iteration
6722  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6723  for( size_t i=0UL; i<M; ++i )
6724  {
      // First k of the summation: at least i (i+1 if strictly upper) for an upper A,
      // at least j for a lower B, 0 otherwise
6725  const size_t kbegin( ( IsUpper<MT4>::value )
6726  ?( ( IsLower<MT5>::value )
6727  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6728  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6729  :( IsLower<MT5>::value ? j : 0UL ) );
      // One past the last k: at most i+1 (i if strictly lower) for a lower A,
      // at most the end of the current column chunk for an upper B, and never more than K
6730  const size_t kend( ( IsLower<MT4>::value )
6731  ?( ( IsUpper<MT5>::value )
6732  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
6733  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6734  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
6735 
      // Accumulators start from SIMDType's default-constructed value
      // NOTE(review): presumably zero-initialized - confirm against the SIMD type definition
6736  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6737 
6738  for( size_t k=kbegin; k<kend; ++k ) {
6739  const SIMDType a1( set( A(i,k) ) );
6740  xmm1 = xmm1 + a1 * B.load(k,j );
6741  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
6742  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
6743  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
6744  xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
6745  xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
6746  xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
6747  xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
6748  }
6749 
      // Read-modify-write of the target: C = C - accumulator * factor
6750  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6751  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
6752  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
6753  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
6754  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
6755  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
6756  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
6757  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
6758  }
6759  }
6760 
      // Chunks of four SIMD vectors, two rows of C per iteration
6761  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6762  {
6763  size_t i( 0UL );
6764 
6765  for( ; (i+2UL) <= M; i+=2UL )
6766  {
      // The k range covers both rows: the lower bound is taken from row i, the upper bound
      // for a lower A from row i+1
6767  const size_t kbegin( ( IsUpper<MT4>::value )
6768  ?( ( IsLower<MT5>::value )
6769  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6770  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6771  :( IsLower<MT5>::value ? j : 0UL ) );
6772  const size_t kend( ( IsLower<MT4>::value )
6773  ?( ( IsUpper<MT5>::value )
6774  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
6775  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6776  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
6777 
      // xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1
6778  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6779 
6780  for( size_t k=kbegin; k<kend; ++k ) {
6781  const SIMDType a1( set( A(i ,k) ) );
6782  const SIMDType a2( set( A(i+1UL,k) ) );
6783  const SIMDType b1( B.load(k,j ) );
6784  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6785  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6786  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6787  xmm1 = xmm1 + a1 * b1;
6788  xmm2 = xmm2 + a1 * b2;
6789  xmm3 = xmm3 + a1 * b3;
6790  xmm4 = xmm4 + a1 * b4;
6791  xmm5 = xmm5 + a2 * b1;
6792  xmm6 = xmm6 + a2 * b2;
6793  xmm7 = xmm7 + a2 * b3;
6794  xmm8 = xmm8 + a2 * b4;
6795  }
6796 
6797  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6798  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
6799  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
6800  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
6801  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
6802  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
6803  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
6804  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
6805  }
6806 
      // Leftover single row (only reached for an odd M)
6807  if( i < M )
6808  {
6809  const size_t kbegin( ( IsUpper<MT4>::value )
6810  ?( ( IsLower<MT5>::value )
6811  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6812  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6813  :( IsLower<MT5>::value ? j : 0UL ) );
      // For this row the upper bound of k depends only on the structure of B
6814  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
6815 
6816  SIMDType xmm1, xmm2, xmm3, xmm4;
6817 
6818  for( size_t k=kbegin; k<kend; ++k ) {
6819  const SIMDType a1( set( A(i,k) ) );
6820  xmm1 = xmm1 + a1 * B.load(k,j );
6821  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
6822  xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
6823  xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
6824  }
6825 
6826  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6827  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
6828  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
6829  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
6830  }
6831  }
6832 
      // Chunks of two SIMD vectors, two rows of C per iteration
6833  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6834  {
6835  size_t i( 0UL );
6836 
6837  for( ; (i+2UL) <= M; i+=2UL )
6838  {
6839  const size_t kbegin( ( IsUpper<MT4>::value )
6840  ?( ( IsLower<MT5>::value )
6841  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6842  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6843  :( IsLower<MT5>::value ? j : 0UL ) );
6844  const size_t kend( ( IsLower<MT4>::value )
6845  ?( ( IsUpper<MT5>::value )
6846  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
6847  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6848  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
6849 
      // xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1
6850  SIMDType xmm1, xmm2, xmm3, xmm4;
6851 
6852  for( size_t k=kbegin; k<kend; ++k ) {
6853  const SIMDType a1( set( A(i ,k) ) );
6854  const SIMDType a2( set( A(i+1UL,k) ) );
6855  const SIMDType b1( B.load(k,j ) );
6856  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6857  xmm1 = xmm1 + a1 * b1;
6858  xmm2 = xmm2 + a1 * b2;
6859  xmm3 = xmm3 + a2 * b1;
6860  xmm4 = xmm4 + a2 * b2;
6861  }
6862 
6863  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6864  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
6865  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
6866  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
6867  }
6868 
      // Leftover single row (only reached for an odd M)
6869  if( i < M )
6870  {
6871  const size_t kbegin( ( IsUpper<MT4>::value )
6872  ?( ( IsLower<MT5>::value )
6873  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6874  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6875  :( IsLower<MT5>::value ? j : 0UL ) );
6876  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
6877 
6878  SIMDType xmm1, xmm2;
6879 
6880  for( size_t k=kbegin; k<kend; ++k ) {
6881  const SIMDType a1( set( A(i,k) ) );
6882  xmm1 = xmm1 + a1 * B.load(k,j );
6883  xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
6884  }
6885 
6886  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6887  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - xmm2 * factor );
6888  }
6889  }
6890 
      // Single SIMD vectors, two rows of C per iteration
6891  for( ; j<jpos; j+=SIMDSIZE )
6892  {
6893  size_t i( 0UL );
6894 
6895  for( ; (i+2UL) <= M; i+=2UL )
6896  {
6897  const size_t kbegin( ( IsUpper<MT4>::value )
6898  ?( ( IsLower<MT5>::value )
6899  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6900  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6901  :( IsLower<MT5>::value ? j : 0UL ) );
      // Here the upper bound of k is derived from the structure of A only
6902  const size_t kend( ( IsLower<MT4>::value )
6903  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6904  :( K ) );
6905 
6906  SIMDType xmm1, xmm2;
6907 
6908  for( size_t k=kbegin; k<kend; ++k ) {
6909  const SIMDType b1( B.load(k,j) );
6910  xmm1 = xmm1 + set( A(i ,k) ) * b1;
6911  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
6912  }
6913 
6914  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6915  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
6916  }
6917 
      // Leftover single row (only reached for an odd M); the summation runs up to K
6918  if( i < M )
6919  {
6920  const size_t kbegin( ( IsUpper<MT4>::value )
6921  ?( ( IsLower<MT5>::value )
6922  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6923  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6924  :( IsLower<MT5>::value ? j : 0UL ) );
6925 
6926  SIMDType xmm1;
6927 
6928  for( size_t k=kbegin; k<K; ++k ) {
6929  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
6930  }
6931 
6932  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
6933  }
6934  }
6935 
      // Scalar tail: remaining columns [jpos,N) one element at a time, two rows per iteration;
      // skipped entirely if both C and B are padded
6936  for( ; remainder && j<N; ++j )
6937  {
6938  size_t i( 0UL );
6939 
6940  for( ; (i+2UL) <= M; i+=2UL )
6941  {
6942  const size_t kbegin( ( IsUpper<MT4>::value )
6943  ?( ( IsLower<MT5>::value )
6944  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6945  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6946  :( IsLower<MT5>::value ? j : 0UL ) );
6947  const size_t kend( ( IsLower<MT4>::value )
6948  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6949  :( K ) );
6950 
      // Scalar accumulators are explicitly value-initialized
6951  ElementType value1 = ElementType();
6952  ElementType value2 = ElementType();
6953 
6954  for( size_t k=kbegin; k<kend; ++k ) {
6955  value1 += A(i ,k) * B(k,j);
6956  value2 += A(i+1UL,k) * B(k,j);
6957  }
6958 
6959  (~C)(i ,j) -= value1 * scalar;
6960  (~C)(i+1UL,j) -= value2 * scalar;
6961  }
6962 
      // Leftover single row (only reached for an odd M); the summation runs up to K
6963  if( i < M )
6964  {
6965  const size_t kbegin( ( IsUpper<MT4>::value )
6966  ?( ( IsLower<MT5>::value )
6967  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6968  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6969  :( IsLower<MT5>::value ? j : 0UL ) );
6970 
6971  ElementType value = ElementType();
6972 
6973  for( size_t k=kbegin; k<K; ++k ) {
6974  value += A(i,k) * B(k,j);
6975  }
6976 
6977  (~C)(i,j) -= value * scalar;
6978  }
6979  }
6980  }
6981  //**********************************************************************************************
6982 
6983  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
6997  template< typename MT3 // Type of the left-hand side target matrix
6998  , typename MT4 // Type of the left-hand side matrix operand
6999  , typename MT5 // Type of the right-hand side matrix operand
7000  , typename ST2 > // Type of the scalar value
7001  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7002  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7003  {
7008 
7009  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
7010  const OppositeType_<MT4> tmp( serial( A ) );
7011  subAssign( ~C, tmp * B * scalar );
7012  }
7013  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
7014  const OppositeType_<MT5> tmp( serial( B ) );
7015  subAssign( ~C, A * tmp * scalar );
7016  }
7017  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7018  const OppositeType_<MT4> tmp( serial( A ) );
7019  subAssign( ~C, tmp * B * scalar );
7020  }
7021  else {
7022  const OppositeType_<MT5> tmp( serial( B ) );
7023  subAssign( ~C, A * tmp * scalar );
7024  }
7025  }
7026  //**********************************************************************************************
7027 
7028  //**Default subtraction assignment to dense matrices (large matrices)***************************
7042  template< typename MT3 // Type of the left-hand side target matrix
7043  , typename MT4 // Type of the left-hand side matrix operand
7044  , typename MT5 // Type of the right-hand side matrix operand
7045  , typename ST2 > // Type of the scalar value
7046  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7047  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7048  {
7049  selectDefaultSubAssignKernel( C, A, B, scalar );
7050  }
7051  //**********************************************************************************************
7052 
7053  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
7068  template< typename MT3 // Type of the left-hand side target matrix
7069  , typename MT4 // Type of the left-hand side matrix operand
7070  , typename MT5 // Type of the right-hand side matrix operand
7071  , typename ST2 > // Type of the scalar value
7072  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7073  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7074  {
7075  const size_t M( A.rows() );
7076  const size_t N( B.columns() );
7077  const size_t K( A.columns() );
7078 
7079  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
7080 
7081  const SIMDType factor( set( scalar ) );
7082 
7083  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
7084  {
7085  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
7086 
7087  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
7088  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
7089 
7090  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
7091  {
7092  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
7093 
7094  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
7095  {
7096  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
7097 
7098  size_t j( jj );
7099 
7100  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7101  {
7102  const size_t j1( j+SIMDSIZE );
7103  const size_t j2( j+SIMDSIZE*2UL );
7104  const size_t j3( j+SIMDSIZE*3UL );
7105 
7106  size_t i( ii );
7107 
7108  for( ; (i+2UL) <= iend; i+=2UL )
7109  {
7110  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7111  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7112  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7113  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
7114 
7115  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7116 
7117  for( size_t k=kbegin; k<kend; ++k ) {
7118  const SIMDType a1( set( A(i ,k) ) );
7119  const SIMDType a2( set( A(i+1UL,k) ) );
7120  const SIMDType b1( B.load(k,j ) );
7121  const SIMDType b2( B.load(k,j1) );
7122  const SIMDType b3( B.load(k,j2) );
7123  const SIMDType b4( B.load(k,j3) );
7124  xmm1 = xmm1 + a1 * b1;
7125  xmm2 = xmm2 + a1 * b2;
7126  xmm3 = xmm3 + a1 * b3;
7127  xmm4 = xmm4 + a1 * b4;
7128  xmm5 = xmm5 + a2 * b1;
7129  xmm6 = xmm6 + a2 * b2;
7130  xmm7 = xmm7 + a2 * b3;
7131  xmm8 = xmm8 + a2 * b4;
7132  }
7133 
7134  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7135  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7136  (~C).store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
7137  (~C).store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
7138  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
7139  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
7140  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
7141  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
7142  }
7143 
7144  if( i < iend )
7145  {
7146  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7147  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7148  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7149  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
7150 
7151  SIMDType xmm1, xmm2, xmm3, xmm4;
7152 
7153  for( size_t k=kbegin; k<kend; ++k ) {
7154  const SIMDType a1( set( A(i,k) ) );
7155  xmm1 = xmm1 + a1 * B.load(k,j );
7156  xmm2 = xmm2 + a1 * B.load(k,j1);
7157  xmm3 = xmm3 + a1 * B.load(k,j2);
7158  xmm4 = xmm4 + a1 * B.load(k,j3);
7159  }
7160 
7161  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7162  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
7163  (~C).store( i, j2, (~C).load(i,j2) - xmm3 * factor );
7164  (~C).store( i, j3, (~C).load(i,j3) - xmm4 * factor );
7165  }
7166  }
7167 
7168  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
7169  {
7170  const size_t j1( j+SIMDSIZE );
7171 
7172  size_t i( ii );
7173 
7174  for( ; (i+4UL) <= iend; i+=4UL )
7175  {
7176  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7177  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7178  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7179  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7180 
7181  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7182 
7183  for( size_t k=kbegin; k<kend; ++k ) {
7184  const SIMDType a1( set( A(i ,k) ) );
7185  const SIMDType a2( set( A(i+1UL,k) ) );
7186  const SIMDType a3( set( A(i+2UL,k) ) );
7187  const SIMDType a4( set( A(i+3UL,k) ) );
7188  const SIMDType b1( B.load(k,j ) );
7189  const SIMDType b2( B.load(k,j1) );
7190  xmm1 = xmm1 + a1 * b1;
7191  xmm2 = xmm2 + a1 * b2;
7192  xmm3 = xmm3 + a2 * b1;
7193  xmm4 = xmm4 + a2 * b2;
7194  xmm5 = xmm5 + a3 * b1;
7195  xmm6 = xmm6 + a3 * b2;
7196  xmm7 = xmm7 + a4 * b1;
7197  xmm8 = xmm8 + a4 * b2;
7198  }
7199 
7200  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7201  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7202  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7203  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
7204  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
7205  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
7206  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
7207  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
7208  }
7209 
7210  for( ; (i+2UL) <= iend; i+=2UL )
7211  {
7212  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7213  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7214  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7215  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7216 
7217  SIMDType xmm1, xmm2, xmm3, xmm4;
7218 
7219  for( size_t k=kbegin; k<kend; ++k ) {
7220  const SIMDType a1( set( A(i ,k) ) );
7221  const SIMDType a2( set( A(i+1UL,k) ) );
7222  const SIMDType b1( B.load(k,j ) );
7223  const SIMDType b2( B.load(k,j1) );
7224  xmm1 = xmm1 + a1 * b1;
7225  xmm2 = xmm2 + a1 * b2;
7226  xmm3 = xmm3 + a2 * b1;
7227  xmm4 = xmm4 + a2 * b2;
7228  }
7229 
7230  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7231  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7232  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7233  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
7234  }
7235 
7236  if( i < iend )
7237  {
7238  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7239  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7240  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7241  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7242 
7243  SIMDType xmm1, xmm2;
7244 
7245  for( size_t k=kbegin; k<kend; ++k ) {
7246  const SIMDType a1( set( A(i,k) ) );
7247  xmm1 = xmm1 + a1 * B.load(k,j );
7248  xmm2 = xmm2 + a1 * B.load(k,j1);
7249  }
7250 
7251  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7252  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
7253  }
7254  }
7255 
7256  for( ; j<jpos; j+=SIMDSIZE )
7257  {
7258  for( size_t i=ii; i<iend; ++i )
7259  {
7260  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7261  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7262  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7263  ( IsUpper<MT5>::value )?( min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
7264 
7265  SIMDType xmm1;
7266 
7267  for( size_t k=kbegin; k<kend; ++k ) {
7268  const SIMDType a1( set( A(i,k) ) );
7269  xmm1 = xmm1 + a1 * B.load(k,j);
7270  }
7271 
7272  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
7273  }
7274  }
7275 
7276  for( ; remainder && j<jend; ++j )
7277  {
7278  for( size_t i=ii; i<iend; ++i )
7279  {
7280  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7281  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7282  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7283  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
7284 
7285  ElementType value = ElementType();
7286 
7287  for( size_t k=kbegin; k<kend; ++k ) {
7288  value += A(i,k) * B(k,j);
7289  }
7290 
7291  (~C)(i,j) -= value * scalar;
7292  }
7293  }
7294  }
7295  }
7296  }
7297  }
7298  //**********************************************************************************************
7299 
7300  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
// Vectorized default "large" subtraction-assignment kernel for targets passed as
// DenseMatrix<MT3,true> (the column-major case named in the section header).
// Participates in overload resolution only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds. The body consists solely of a forward to selectSmallSubAssignKernel.
//   C      - target dense matrix
//   A, B   - left-hand / right-hand side matrix operands
//   scalar - scaling factor, handed through unchanged
7314  template< typename MT3 // Type of the left-hand side target matrix
7315  , typename MT4 // Type of the left-hand side matrix operand
7316  , typename MT5 // Type of the right-hand side matrix operand
7317  , typename ST2 > // Type of the scalar value
7318  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7319  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7320  {
7321  selectSmallSubAssignKernel( ~C, A, B, scalar ); // ~C unwraps the DenseMatrix base to the concrete matrix
7322  }
7323  //**********************************************************************************************
7324 
7325  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Default overload of the BLAS subtraction-assignment kernel.
// Selected when UseBlasKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf_); it performs no
// BLAS call and simply forwards all arguments to selectLargeSubAssignKernel.
//   C      - target matrix
//   A, B   - left-hand / right-hand side matrix operands
//   scalar - scaling factor, handed through unchanged
7339  template< typename MT3 // Type of the left-hand side target matrix
7340  , typename MT4 // Type of the left-hand side matrix operand
7341  , typename MT5 // Type of the right-hand side matrix operand
7342  , typename ST2 > // Type of the scalar value
7343  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7344  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7345  {
7346  selectLargeSubAssignKernel( C, A, B, scalar );
7347  }
7348  //**********************************************************************************************
7349 
7350  //**BLAS-based subtraction assignment to dense matrices******************************************
7351 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7352 
// BLAS-backed subtraction-assignment kernel, selected when UseBlasKernel<MT3,MT4,MT5,ST2>
// holds (EnableIf_). Three cases are distinguished at compile time:
//   - A is triangular: a temporary is built from B, passed to trmm with A on the left
//     side, and the temporary is then subtracted from C via subAssign.
//   - B is triangular: a temporary is built from A, passed to trmm with B on the right
//     side, and the temporary is then subtracted from C via subAssign.
//   - otherwise: a single gemm call updates C directly.
// NOTE(review): the exact semantics of the trmm/gemm wrappers (presumably
// tmp = scalar*A*tmp resp. C = alpha*A*B + beta*C) are defined in blaze/math/blas/ and
// are not visible here - confirm there.
//   C      - target matrix
//   A, B   - left-hand / right-hand side matrix operands
//   scalar - scaling factor; converted to the element type of C before use
7365  template< typename MT3 // Type of the left-hand side target matrix
7366  , typename MT4 // Type of the left-hand side matrix operand
7367  , typename MT5 // Type of the right-hand side matrix operand
7368  , typename ST2 > // Type of the scalar value
7369  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7370  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7371  {
7372  typedef ElementType_<MT3> ET;
7373 
7374  if( IsTriangular<MT4>::value ) {
7375  ResultType_<MT3> tmp( serial( B ) ); // temporary initialized from B; trmm works on it, not on C
7376  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7377  subAssign( C, tmp );
7378  }
7379  else if( IsTriangular<MT5>::value ) {
7380  ResultType_<MT3> tmp( serial( A ) ); // temporary initialized from A; trmm works on it, not on C
7381  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7382  subAssign( C, tmp );
7383  }
7384  else {
7385  gemm( C, A, B, ET(-scalar), ET(1) ); // negated scalar and factor 1: the subtraction is folded into the call
7386  }
7387  }
7388 #endif
7389  //**********************************************************************************************
7390 
7391  //**Restructuring subtraction assignment to column-major matrices*******************************
// Restructuring subtraction assignment of a scaled dense matrix product to a target passed
// as Matrix<MT,true> (the column-major case named in the section header).
// Participates in overload resolution only when CanExploitSymmetry<MT,MT1,MT2> holds.
// The product is re-expressed with transposed operands, depending on which operand type
// is symmetric, and subAssign is dispatched again on the restructured expression.
//   lhs - target matrix
//   rhs - scaled matrix product to be subtracted
// MMM is a typedef declared outside this view.
7405  template< typename MT > // Type of the target matrix
7406  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7407  subAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7408  {
7410 
7412 
7413  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7414  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7415 
7416  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7417  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7418 
7419  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value ) // both operands symmetric: transpose both
7420  subAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7421  else if( IsSymmetric<MT1>::value ) // only the left operand symmetric: transpose it
7422  subAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7423  else // remaining case: transpose the right operand
7424  subAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7425  }
7426  //**********************************************************************************************
7427 
7428  //**Subtraction assignment to sparse matrices***************************************************
7429  // No special implementation for the subtraction assignment to sparse matrices.
7430  //**********************************************************************************************
7431 
7432  //**Multiplication assignment to dense matrices*************************************************
7433  // No special implementation for the multiplication assignment to dense matrices.
7434  //**********************************************************************************************
7435 
7436  //**Multiplication assignment to sparse matrices************************************************
7437  // No special implementation for the multiplication assignment to sparse matrices.
7438  //**********************************************************************************************
7439 
7440  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a scaled dense matrix product to a dense matrix of either storage order.
// Participates in overload resolution only when IsEvaluationRequired<MT,MT1,MT2> holds.
// Steps: handle the degenerate sizes, evaluate both operands into A and B, then dispatch
// smpAssign again on the expression A * B * scalar built from the evaluated operands.
//   lhs - target dense matrix
//   rhs - scaled matrix product to be assigned
// LT, RT and MMM are typedefs declared outside this view.
7455  template< typename MT // Type of the target dense matrix
7456  , bool SO > // Storage order of the target dense matrix
7457  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7458  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7459  {
7461 
7462  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7463  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7464 
7465  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7466  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7467 
7468  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) { // empty target: nothing to assign
7469  return;
7470  }
7471  else if( left.columns() == 0UL ) { // zero inner dimension: the target is reset instead of computed
7472  reset( ~lhs );
7473  return;
7474  }
7475 
7476  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7477  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7478 
7479  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7480  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7481  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7482  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7483  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7484  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7485 
7486  smpAssign( ~lhs, A * B * rhs.scalar_ );
7487  }
7488  //**********************************************************************************************
7489 
7490  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a scaled dense matrix product to a sparse matrix of either storage order.
// Participates in overload resolution only when IsEvaluationRequired<MT,MT1,MT2> holds.
// The expression is first materialized into a temporary of type TmpType, which is then
// SMP-assigned to the target.
//   lhs - target sparse matrix
//   rhs - scaled matrix product to be assigned
// NOTE(review): TmpType is IfTrue_<SO,OppositeType,ResultType>, presumably OppositeType
// for SO == true and ResultType otherwise - confirm against the IfTrue_ definition.
7505  template< typename MT // Type of the target sparse matrix
7506  , bool SO > // Storage order of the target sparse matrix
7507  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7508  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7509  {
7511 
7512  typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
7513 
7519  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
7520 
7521  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7522  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7523 
7524  const TmpType tmp( rhs ); // the whole expression is evaluated into the temporary here
7525  smpAssign( ~lhs, tmp );
7526  }
7527  //**********************************************************************************************
7528 
7529  //**Restructuring SMP assignment to column-major matrices***************************************
// Restructuring SMP assignment of a scaled dense matrix product to a target passed as
// Matrix<MT,true> (the column-major case named in the section header).
// Participates in overload resolution only when CanExploitSymmetry<MT,MT1,MT2> holds.
// The product is re-expressed with transposed operands, depending on which operand type
// is symmetric, and smpAssign is dispatched again on the restructured expression.
//   lhs - target matrix
//   rhs - scaled matrix product to be assigned
// MMM is a typedef declared outside this view.
7543  template< typename MT > // Type of the target matrix
7544  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7545  smpAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7546  {
7548 
7550 
7551  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7552  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7553 
7554  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7555  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7556 
7557  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value ) // both operands symmetric: transpose both
7558  smpAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7559  else if( IsSymmetric<MT1>::value ) // only the left operand symmetric: transpose it
7560  smpAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7561  else // remaining case: transpose the right operand
7562  smpAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7563  }
7564  //**********************************************************************************************
7565 
7566  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a scaled dense matrix product to a dense matrix of either
// storage order. Participates in overload resolution only when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// Steps: return early for degenerate sizes, evaluate both operands into A and B, then
// dispatch smpAddAssign again on the expression A * B * scalar.
//   lhs - target dense matrix
//   rhs - scaled matrix product to be added
// LT, RT and MMM are typedefs declared outside this view.
7581  template< typename MT // Type of the target dense matrix
7582  , bool SO > // Storage order of the target dense matrix
7583  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7584  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7585  {
7587 
7588  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7589  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7590 
7591  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7592  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7593 
7594  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) { // any zero dimension: target left untouched
7595  return;
7596  }
7597 
7598  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7599  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7600 
7601  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7602  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7603  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7604  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7605  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7606  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7607 
7608  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
7609  }
7610  //**********************************************************************************************
7611 
7612  //**Restructuring SMP addition assignment to column-major matrices******************************
// Restructuring SMP addition assignment of a scaled dense matrix product to a target
// passed as Matrix<MT,true> (the column-major case named in the section header).
// Participates in overload resolution only when CanExploitSymmetry<MT,MT1,MT2> holds.
// The product is re-expressed with transposed operands, depending on which operand type
// is symmetric, and smpAddAssign is dispatched again on the restructured expression.
//   lhs - target matrix
//   rhs - scaled matrix product to be added
// MMM is a typedef declared outside this view.
7626  template< typename MT > // Type of the target matrix
7627  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7628  smpAddAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7629  {
7631 
7633 
7634  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7635  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7636 
7637  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7638  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7639 
7640  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value ) // both operands symmetric: transpose both
7641  smpAddAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7642  else if( IsSymmetric<MT1>::value ) // only the left operand symmetric: transpose it
7643  smpAddAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7644  else // remaining case: transpose the right operand
7645  smpAddAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7646  }
7647  //**********************************************************************************************
7648 
7649  //**SMP addition assignment to sparse matrices**************************************************
7650  // No special implementation for the SMP addition assignment to sparse matrices.
7651  //**********************************************************************************************
7652 
7653  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a scaled dense matrix product to a dense matrix of either
// storage order. Participates in overload resolution only when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// Steps: return early for degenerate sizes, evaluate both operands into A and B, then
// dispatch smpSubAssign again on the expression A * B * scalar.
//   lhs - target dense matrix
//   rhs - scaled matrix product to be subtracted
// LT, RT and MMM are typedefs declared outside this view.
7668  template< typename MT // Type of the target dense matrix
7669  , bool SO > // Storage order of the target dense matrix
7670  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7671  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7672  {
7674 
7675  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7676  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7677 
7678  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7679  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7680 
7681  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) { // any zero dimension: target left untouched
7682  return;
7683  }
7684 
7685  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7686  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7687 
7688  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7689  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7690  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7691  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7692  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7693  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7694 
7695  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
7696  }
7697  //**********************************************************************************************
7698 
7699  //**Restructuring SMP subtraction assignment to column-major matrices***************************
// Restructuring SMP subtraction assignment of a scaled dense matrix product to a target
// passed as Matrix<MT,true> (the column-major case named in the section header).
// Participates in overload resolution only when CanExploitSymmetry<MT,MT1,MT2> holds.
// The product is re-expressed with transposed operands, depending on which operand type
// is symmetric, and smpSubAssign is dispatched again on the restructured expression.
//   lhs - target matrix
//   rhs - scaled matrix product to be subtracted
// MMM is a typedef declared outside this view.
7713  template< typename MT > // Type of the target matrix
7714  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7715  smpSubAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7716  {
7718 
7720 
7721  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7722  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7723 
7724  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7725  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7726 
7727  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value ) // both operands symmetric: transpose both
7728  smpSubAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7729  else if( IsSymmetric<MT1>::value ) // only the left operand symmetric: transpose it
7730  smpSubAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7731  else // remaining case: transpose the right operand
7732  smpSubAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7733  }
7734  //**********************************************************************************************
7735 
7736  //**SMP subtraction assignment to sparse matrices***********************************************
7737  // No special implementation for the SMP subtraction assignment to sparse matrices.
7738  //**********************************************************************************************
7739 
7740  //**SMP multiplication assignment to dense matrices*********************************************
7741  // No special implementation for the SMP multiplication assignment to dense matrices.
7742  //**********************************************************************************************
7743 
7744  //**SMP multiplication assignment to sparse matrices********************************************
7745  // No special implementation for the SMP multiplication assignment to sparse matrices.
7746  //**********************************************************************************************
7747 
7748  //**Compile time checks*************************************************************************
// Compile-time constraint: the scalar type ST and the RightOperand type must be the same type.
7756  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
7757  //**********************************************************************************************
7758 };
7760 //*************************************************************************************************
7761 
7762 
7763 
7764 
7765 //=================================================================================================
7766 //
7767 // GLOBAL BINARY ARITHMETIC OPERATORS
7768 //
7769 //=================================================================================================
7770 
7771 //*************************************************************************************************
/*!\brief Multiplication operator for the multiplication of two row-major dense matrices.
//
// \param lhs The left-hand side matrix for the multiplication.
// \param rhs The right-hand side matrix for the multiplication.
// \return A DMatDMatMultExpr expression object constructed from the two operands.
// \exception std::invalid_argument Matrix sizes do not match.
//
// The number of columns of \a lhs must equal the number of rows of \a rhs; otherwise
// BLAZE_THROW_INVALID_ARGUMENT is triggered. The declarator line (7800) was missing from
// this listing and is restored from the cross-reference entry for DMatDMatMultExpr.h:7800.
*/
7797 template< typename T1 // Type of the left-hand side dense matrix
7798  , typename T2 > // Type of the right-hand side dense matrix
7799 inline const DMatDMatMultExpr<T1,T2>
7800  operator*( const DenseMatrix<T1,false>& lhs, const DenseMatrix<T2,false>& rhs )
7801 {
7803 
7804  if( (~lhs).columns() != (~rhs).rows() ) { // inner dimensions must agree
7805  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
7806  }
7807 
7808  return DMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
7809 }
7810 //*************************************************************************************************
7811 
7812 
7813 
7814 
7815 //=================================================================================================
7816 //
7817 // ROWS SPECIALIZATIONS
7818 //
7819 //=================================================================================================
7820 
7821 //*************************************************************************************************
// Rows specialization for DMatDMatMultExpr: inherits from Rows<MT1>, i.e. the trait of
// the product is taken from the left-hand side operand type.
7823 template< typename MT1, typename MT2 >
7824 struct Rows< DMatDMatMultExpr<MT1,MT2> > : public Rows<MT1>
7825 {};
7827 //*************************************************************************************************
7828 
7829 
7830 
7831 
7832 //=================================================================================================
7833 //
7834 // COLUMNS SPECIALIZATIONS
7835 //
7836 //=================================================================================================
7837 
7838 //*************************************************************************************************
// Columns specialization for DMatDMatMultExpr: inherits from Columns<MT2>, i.e. the trait
// of the product is taken from the right-hand side operand type.
7840 template< typename MT1, typename MT2 >
7841 struct Columns< DMatDMatMultExpr<MT1,MT2> > : public Columns<MT2>
7842 {};
7844 //*************************************************************************************************
7845 
7846 
7847 
7848 
7849 //=================================================================================================
7850 //
7851 // ISALIGNED SPECIALIZATIONS
7852 //
7853 //=================================================================================================
7854 
7855 //*************************************************************************************************
// IsAligned specialization for DMatDMatMultExpr: true exactly when IsAligned holds for
// both operand types MT1 and MT2.
7857 template< typename MT1, typename MT2 >
7858 struct IsAligned< DMatDMatMultExpr<MT1,MT2> >
7859  : public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7860 {};
7862 //*************************************************************************************************
7863 
7864 
7865 
7866 
7867 //=================================================================================================
7868 //
7869 // ISLOWER SPECIALIZATIONS
7870 //
7871 //=================================================================================================
7872 
7873 //*************************************************************************************************
// IsLower specialization for DMatDMatMultExpr: true exactly when IsLower holds for both
// operand types MT1 and MT2.
7875 template< typename MT1, typename MT2 >
7876 struct IsLower< DMatDMatMultExpr<MT1,MT2> >
7877  : public BoolConstant< And< IsLower<MT1>, IsLower<MT2> >::value >
7878 {};
7880 //*************************************************************************************************
7881 
7882 
7883 
7884 
7885 //=================================================================================================
7886 //
7887 // ISUNILOWER SPECIALIZATIONS
7888 //
7889 //=================================================================================================
7890 
7891 //*************************************************************************************************
// IsUniLower specialization for DMatDMatMultExpr: true exactly when IsUniLower holds for
// both operand types MT1 and MT2.
7893 template< typename MT1, typename MT2 >
7894 struct IsUniLower< DMatDMatMultExpr<MT1,MT2> >
7895  : public BoolConstant< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
7896 {};
7898 //*************************************************************************************************
7899 
7900 
7901 
7902 
7903 //=================================================================================================
7904 //
7905 // ISSTRICTLYLOWER SPECIALIZATIONS
7906 //
7907 //=================================================================================================
7908 
7909 //*************************************************************************************************
// IsStrictlyLower specialization for DMatDMatMultExpr: true when one operand type is
// strictly lower and the other is lower, in either order:
// (IsStrictlyLower<MT1> && IsLower<MT2>) || (IsStrictlyLower<MT2> && IsLower<MT1>).
7911 template< typename MT1, typename MT2 >
7912 struct IsStrictlyLower< DMatDMatMultExpr<MT1,MT2> >
7913  : public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7914  , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
7915 {};
7917 //*************************************************************************************************
7918 
7919 
7920 
7921 
7922 //=================================================================================================
7923 //
7924 // ISUPPER SPECIALIZATIONS
7925 //
7926 //=================================================================================================
7927 
7928 //*************************************************************************************************
// IsUpper specialization for DMatDMatMultExpr: true exactly when IsUpper holds for both
// operand types MT1 and MT2.
7930 template< typename MT1, typename MT2 >
7931 struct IsUpper< DMatDMatMultExpr<MT1,MT2> >
7932  : public BoolConstant< And< IsUpper<MT1>, IsUpper<MT2> >::value >
7933 {};
7935 //*************************************************************************************************
7936 
7937 
7938 
7939 
7940 //=================================================================================================
7941 //
7942 // ISUNIUPPER SPECIALIZATIONS
7943 //
7944 //=================================================================================================
7945 
7946 //*************************************************************************************************
// IsUniUpper specialization for DMatDMatMultExpr: true exactly when IsUniUpper holds for
// both operand types MT1 and MT2.
7948 template< typename MT1, typename MT2 >
7949 struct IsUniUpper< DMatDMatMultExpr<MT1,MT2> >
7950  : public BoolConstant< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
7951 {};
7953 //*************************************************************************************************
7954 
7955 
7956 
7957 
7958 //=================================================================================================
7959 //
7960 // ISSTRICTLYUPPER SPECIALIZATIONS
7961 //
7962 //=================================================================================================
7963 
7964 //*************************************************************************************************
// IsStrictlyUpper specialization for DMatDMatMultExpr: true when one operand type is
// strictly upper and the other is upper, in either order:
// (IsStrictlyUpper<MT1> && IsUpper<MT2>) || (IsStrictlyUpper<MT2> && IsUpper<MT1>).
7966 template< typename MT1, typename MT2 >
7967 struct IsStrictlyUpper< DMatDMatMultExpr<MT1,MT2> >
7968  : public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7969  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
7970 {};
7972 //*************************************************************************************************
7973 
7974 
7975 
7976 
7977 //=================================================================================================
7978 //
7979 // EXPRESSION TRAIT SPECIALIZATIONS
7980 //
7981 //=================================================================================================
7982 
7983 //*************************************************************************************************
// DMatDVecMultExprTrait specialization for (MT1*MT2) * VT.
// If MT1 and MT2 are row-major dense matrices and VT is a dense column vector, Type is
// the trait result for MT1 * ( MT2 * VT ), i.e. the matrix product is regrouped into two
// matrix/vector products. Otherwise Type is INVALID_TYPE.
7985 template< typename MT1, typename MT2, typename VT >
7986 struct DMatDVecMultExprTrait< DMatDMatMultExpr<MT1,MT2>, VT >
7987 {
7988  public:
7989  //**********************************************************************************************
7990  using Type = If_< And< IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
7991  , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2>
7992  , IsDenseVector<VT>, IsColumnVector<VT> >
7993  , DMatDVecMultExprTrait_< MT1, DMatDVecMultExprTrait_<MT2,VT> >
7994  , INVALID_TYPE >;
7995  //**********************************************************************************************
7996 };
7998 //*************************************************************************************************
7999 
8000 
8001 //*************************************************************************************************
// DMatSVecMultExprTrait specialization for (MT1*MT2) * VT.
// If MT1 and MT2 are row-major dense matrices and VT is a sparse column vector, Type is
// the trait result for MT1 * ( MT2 * VT ): the inner product uses DMatSVecMultExprTrait_,
// the outer one DMatDVecMultExprTrait_. Otherwise Type is INVALID_TYPE.
8003 template< typename MT1, typename MT2, typename VT >
8004 struct DMatSVecMultExprTrait< DMatDMatMultExpr<MT1,MT2>, VT >
8005 {
8006  public:
8007  //**********************************************************************************************
8008  using Type = If_< And< IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8009  , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2>
8010  , IsSparseVector<VT>, IsColumnVector<VT> >
8011  , DMatDVecMultExprTrait_< MT1, DMatSVecMultExprTrait_<MT2,VT> >
8012  , INVALID_TYPE >;
8013  //**********************************************************************************************
8014 };
8016 //*************************************************************************************************
8017 
8018 
8019 //*************************************************************************************************
// TDVecDMatMultExprTrait specialization for VT * (MT1*MT2).
// If VT is a dense row vector and MT1 and MT2 are row-major dense matrices, Type is the
// trait result for ( VT * MT1 ) * MT2, i.e. the matrix product is regrouped into two
// vector/matrix products. Otherwise Type is INVALID_TYPE.
8021 template< typename VT, typename MT1, typename MT2 >
8022 struct TDVecDMatMultExprTrait< VT, DMatDMatMultExpr<MT1,MT2> >
8023 {
8024  public:
8025  //**********************************************************************************************
8026  using Type = If_< And< IsDenseVector<VT>, IsRowVector<VT>
8027  , IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8028  , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2> >
8029  , TDVecDMatMultExprTrait_< TDVecDMatMultExprTrait_<VT,MT1>, MT2 >
8030  , INVALID_TYPE >;
8031  //**********************************************************************************************
8032 };
8034 //*************************************************************************************************
8035 
8036 
8037 //*************************************************************************************************
// TSVecDMatMultExprTrait specialization for VT * (MT1*MT2).
// If VT is a sparse row vector and MT1 and MT2 are row-major dense matrices, Type is the
// trait result for ( VT * MT1 ) * MT2: the inner product uses TSVecDMatMultExprTrait_,
// the outer one TDVecDMatMultExprTrait_. Otherwise Type is INVALID_TYPE.
8039 template< typename VT, typename MT1, typename MT2 >
8040 struct TSVecDMatMultExprTrait< VT, DMatDMatMultExpr<MT1,MT2> >
8041 {
8042  public:
8043  //**********************************************************************************************
8044  using Type = If_< And< IsSparseVector<VT>, IsRowVector<VT>
8045  , IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8046  , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2> >
8047  , TDVecDMatMultExprTrait_< TSVecDMatMultExprTrait_<VT,MT1>, MT2 >
8048  , INVALID_TYPE >;
8049  //**********************************************************************************************
8050 };
8052 //*************************************************************************************************
8053 
8054 
8055 //*************************************************************************************************
// SubmatrixExprTrait specialization for DMatDMatMultExpr: Type is the multiplication
// trait applied to the submatrix traits of const MT1 and const MT2, both taken with the
// same AF flag.
8057 template< typename MT1, typename MT2, bool AF >
8058 struct SubmatrixExprTrait< DMatDMatMultExpr<MT1,MT2>, AF >
8059 {
8060  public:
8061  //**********************************************************************************************
8062  using Type = MultExprTrait_< SubmatrixExprTrait_<const MT1,AF>
8063  , SubmatrixExprTrait_<const MT2,AF> >;
8064  //**********************************************************************************************
8065 };
8067 //*************************************************************************************************
8068 
8069 
8070 //*************************************************************************************************
// RowExprTrait specialization for DMatDMatMultExpr: Type is the multiplication trait of
// the row trait of const MT1 with MT2 (only the left operand is reduced to a row).
8072 template< typename MT1, typename MT2 >
8073 struct RowExprTrait< DMatDMatMultExpr<MT1,MT2> >
8074 {
8075  public:
8076  //**********************************************************************************************
8077  using Type = MultExprTrait_< RowExprTrait_<const MT1>, MT2 >;
8078  //**********************************************************************************************
8079 };
8081 //*************************************************************************************************
8082 
8083 
8084 //*************************************************************************************************
// ColumnExprTrait specialization for DMatDMatMultExpr: Type is the multiplication trait
// of MT1 with the column trait of const MT2 (only the right operand is reduced to a column).
8086 template< typename MT1, typename MT2 >
8087 struct ColumnExprTrait< DMatDMatMultExpr<MT1,MT2> >
8088 {
8089  public:
8090  //**********************************************************************************************
8091  using Type = MultExprTrait_< MT1, ColumnExprTrait_<const MT2> >;
8092  //**********************************************************************************************
8093 };
8095 //*************************************************************************************************
8096 
8097 } // namespace blaze
8098 
8099 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:250
Header file for auxiliary alias declarations.
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( \f$ A=B*C \f$ ).
Definition: DMatDMatMultExpr.h:7800
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
Header file for the SparseVector base class.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:291
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
DMatDMatMultExpr< MT1, MT2 > This
Type of this DMatDMatMultExpr instance.
Definition: DMatDMatMultExpr.h:246
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:390
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:160
Header file for the IsRowVector type trait.
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Expression object for dense matrix-dense matrix multiplications.The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:149
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:249
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:251
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:400
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:247
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:248
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:262
If_< IsExpression< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:259
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:156
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:434
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:155
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:254
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:354
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:157
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:306
System settings for the BLAS mode.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:453
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:412
If_< IsExpression< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:256
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Utility type for generic codes.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:253
Constraint on the data type.
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:444
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:370
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:252
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:424
Header file for the AreSIMDCombinable type trait.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:158
Header file for the IsRowMajorMatrix type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:380
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:950
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
Header file for the TSVecDMatMultExprTrait class template.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:265
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:454
Header file for the IsResizable type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:159
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.