DMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
58 #include <blaze/math/Functions.h>
59 #include <blaze/math/Intrinsics.h>
60 #include <blaze/math/shims/Reset.h>
97 #include <blaze/system/BLAS.h>
98 #include <blaze/system/Blocking.h>
100 #include <blaze/system/Thresholds.h>
101 #include <blaze/util/Assert.h>
102 #include <blaze/util/Complex.h>
106 #include <blaze/util/DisableIf.h>
107 #include <blaze/util/EnableIf.h>
108 #include <blaze/util/Exception.h>
109 #include <blaze/util/InvalidType.h>
111 #include <blaze/util/mpl/And.h>
112 #include <blaze/util/mpl/Not.h>
113 #include <blaze/util/mpl/Or.h>
114 #include <blaze/util/SelectType.h>
115 #include <blaze/util/Types.h>
125 
126 
127 namespace blaze {
128 
129 //=================================================================================================
130 //
131 // CLASS DMATDMATMULTEXPR
132 //
133 //=================================================================================================
134 
135 //*************************************************************************************************
142 template< typename MT1 // Type of the left-hand side dense matrix
143  , typename MT2 > // Type of the right-hand side dense matrix
144 class DMatDMatMultExpr : public DenseMatrix< DMatDMatMultExpr<MT1,MT2>, false >
145  , private MatMatMultExpr
146  , private Computation
147 {
148  private:
149  //**Type definitions****************************************************************************
150  typedef typename MT1::ResultType RT1;
151  typedef typename MT2::ResultType RT2;
152  typedef typename RT1::ElementType ET1;
153  typedef typename RT2::ElementType ET2;
154  typedef typename MT1::CompositeType CT1;
155  typedef typename MT2::CompositeType CT2;
156  //**********************************************************************************************
157 
158  //**********************************************************************************************
161  //**********************************************************************************************
162 
163  //**********************************************************************************************
165  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
166  //**********************************************************************************************
167 
168  //**********************************************************************************************
170 
176  template< typename T1, typename T2, typename T3 >
177  struct CanExploitSymmetry {
178  enum { value = IsColumnMajorMatrix<T1>::value &&
179  ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
180  };
182  //**********************************************************************************************
183 
184  //**********************************************************************************************
186 
190  template< typename T1, typename T2, typename T3 >
191  struct IsEvaluationRequired {
192  enum { value = ( evaluateLeft || evaluateRight ) &&
193  !CanExploitSymmetry<T1,T2,T3>::value };
194  };
196  //**********************************************************************************************
197 
198  //**********************************************************************************************
200 
203  template< typename T1, typename T2, typename T3 >
204  struct UseBlasKernel {
205  enum { value = BLAZE_BLAS_MODE &&
206  HasMutableDataAccess<T1>::value &&
207  HasConstDataAccess<T2>::value &&
208  HasConstDataAccess<T3>::value &&
209  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
210  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
211  IsBlasCompatible<typename T1::ElementType>::value &&
212  IsBlasCompatible<typename T2::ElementType>::value &&
213  IsBlasCompatible<typename T3::ElementType>::value &&
214  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
215  IsSame< typename T1::ElementType, typename T3::ElementType >::value };
216  };
218  //**********************************************************************************************
219 
220  //**********************************************************************************************
222 
225  template< typename T1, typename T2, typename T3 >
226  struct UseVectorizedDefaultKernel {
227  enum { value = useOptimizedKernels &&
228  !IsDiagonal<T3>::value &&
229  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
230  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
231  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
232  IntrinsicTrait<typename T1::ElementType>::addition &&
233  IntrinsicTrait<typename T1::ElementType>::subtraction &&
234  IntrinsicTrait<typename T1::ElementType>::multiplication };
235  };
237  //**********************************************************************************************
238 
239  public:
240  //**Type definitions****************************************************************************
247  typedef const ElementType ReturnType;
248  typedef const ResultType CompositeType;
249 
251  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
252 
254  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
255 
258 
261  //**********************************************************************************************
262 
263  //**Compilation flags***************************************************************************
265  enum { vectorizable = !IsDiagonal<MT2>::value &&
266  MT1::vectorizable && MT2::vectorizable &&
270 
272  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
273  !evaluateRight && MT2::smpAssignable };
274  //**********************************************************************************************
275 
276  //**Constructor*********************************************************************************
282  explicit inline DMatDMatMultExpr( const MT1& lhs, const MT2& rhs )
283  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
284  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
285  {
286  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
287  }
288  //**********************************************************************************************
289 
290  //**Access operator*****************************************************************************
297  inline ReturnType operator()( size_t i, size_t j ) const {
298  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
299  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
300 
301  const size_t kbegin( ( IsUpper<MT1>::value )
302  ?( ( IsLower<MT2>::value )
303  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
304  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
305  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
306  :( ( IsLower<MT2>::value )
307  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
308  :( 0UL ) ) );
309  const size_t kend( ( IsLower<MT1>::value )
310  ?( ( IsUpper<MT2>::value )
311  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
312  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
313  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
314  :( ( IsUpper<MT2>::value )
315  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
316  :( lhs_.columns() ) ) );
317 
318  if( lhs_.columns() == 0UL ||
319  ( ( IsTriangular<MT1>::value || IsTriangular<MT2>::value ) && kbegin >= kend ) )
320  return ElementType();
321 
323  return lhs_(i,i) * rhs_(i,j);
324 
326  return lhs_(i,j) * rhs_(j,j);
327 
328  const size_t knum( kend - kbegin );
329  const size_t kpos( kbegin + ( ( knum - 1UL ) & size_t(-2) ) + 1UL );
330 
331  ElementType tmp( lhs_(i,kbegin) * rhs_(kbegin,j) );
332 
333  for( size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
334  tmp += lhs_(i,k ) * rhs_(k ,j);
335  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
336  }
337  if( kpos < kend ) {
338  tmp += lhs_(i,kpos) * rhs_(kpos,j);
339  }
340 
341  return tmp;
342  }
343  //**********************************************************************************************
344 
345  //**At function*********************************************************************************
353  inline ReturnType at( size_t i, size_t j ) const {
354  if( i >= lhs_.rows() ) {
355  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
356  }
357  if( j >= rhs_.columns() ) {
358  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
359  }
360  return (*this)(i,j);
361  }
362  //**********************************************************************************************
363 
364  //**Rows function*******************************************************************************
369  inline size_t rows() const {
370  return lhs_.rows();
371  }
372  //**********************************************************************************************
373 
374  //**Columns function****************************************************************************
379  inline size_t columns() const {
380  return rhs_.columns();
381  }
382  //**********************************************************************************************
383 
384  //**Left operand access*************************************************************************
389  inline LeftOperand leftOperand() const {
390  return lhs_;
391  }
392  //**********************************************************************************************
393 
394  //**Right operand access************************************************************************
399  inline RightOperand rightOperand() const {
400  return rhs_;
401  }
402  //**********************************************************************************************
403 
404  //**********************************************************************************************
410  template< typename T >
411  inline bool canAlias( const T* alias ) const {
412  return ( lhs_.canAlias( alias ) || rhs_.canAlias( alias ) );
413  }
414  //**********************************************************************************************
415 
416  //**********************************************************************************************
422  template< typename T >
423  inline bool isAliased( const T* alias ) const {
424  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
425  }
426  //**********************************************************************************************
427 
428  //**********************************************************************************************
433  inline bool isAligned() const {
434  return lhs_.isAligned() && rhs_.isAligned();
435  }
436  //**********************************************************************************************
437 
438  //**********************************************************************************************
443  inline bool canSMPAssign() const {
444  return ( !BLAZE_BLAS_IS_PARALLEL ||
445  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
446  ( rows() > SMP_DMATDMATMULT_THRESHOLD );
447  }
448  //**********************************************************************************************
449 
450  private:
451  //**Member variables****************************************************************************
452  LeftOperand lhs_;
453  RightOperand rhs_;
454  //**********************************************************************************************
455 
456  //**Assignment to dense matrices****************************************************************
469  template< typename MT // Type of the target dense matrix
470  , bool SO > // Storage order of the target dense matrix
471  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
472  assign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
473  {
475 
476  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
477  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
478 
479  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
480  return;
481  }
482  else if( rhs.lhs_.columns() == 0UL ) {
483  reset( ~lhs );
484  return;
485  }
486 
487  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
488  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
489 
490  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
491  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
492  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
493  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
494  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
495  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
496 
497  DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
498  }
500  //**********************************************************************************************
501 
502  //**Assignment to dense matrices (kernel selection)*********************************************
513  template< typename MT3 // Type of the left-hand side target matrix
514  , typename MT4 // Type of the left-hand side matrix operand
515  , typename MT5 > // Type of the right-hand side matrix operand
516  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
517  {
518  if( ( IsDiagonal<MT5>::value ) ||
519  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
520  selectSmallAssignKernel( C, A, B );
521  else
522  selectBlasAssignKernel( C, A, B );
523  }
525  //**********************************************************************************************
526 
527  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) assignment kernel C = A * B for the case that neither operand
// is diagonal (see the EnableIf condition below).
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
//
// For every row i the range [kbegin,kend) of summation indices is narrowed according to the
// IsUpper/IsLower/IsStrictly... traits of A. The first index kbegin initializes the row of C
// by assignment, all further indices accumulate into it. For each k the column range
// [jbegin,jend) is narrowed according to the corresponding traits of B.
541  template< typename MT3 // Type of the left-hand side target matrix
542  , typename MT4 // Type of the left-hand side matrix operand
543  , typename MT5 > // Type of the right-hand side matrix operand
544  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
545  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
546  {
547  const size_t M( A.rows() );
548  const size_t N( B.columns() );
549  const size_t K( A.columns() );
550 
551  for( size_t i=0UL; i<M; ++i )
552  {
// Summation range for row i, derived from the triangular structure of A
553  const size_t kbegin( ( IsUpper<MT4>::value )
554  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
555  :( 0UL ) );
556  const size_t kend( ( IsLower<MT4>::value )
557  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
558  :( K ) );
559  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
560 
// Empty summation range for a strictly triangular A: the whole row of C is reset
561  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
562  for( size_t j=0UL; j<N; ++j ) {
563  reset( (~C)(i,j) );
564  }
565  continue;
566  }
567 
// First summation index (k == kbegin): row i of C is initialized by assignment.
// Columns outside [jbegin,jend) are reset where the traits of A and B require it.
568  {
569  const size_t jbegin( ( IsUpper<MT5>::value )
570  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
571  :( 0UL ) );
572  const size_t jend( ( IsLower<MT5>::value )
573  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
574  :( N ) );
575  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
576 
577  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
578  for( size_t j=0UL; j<jbegin; ++j ) {
579  reset( C(i,j) );
580  }
581  }
582  else if( IsStrictlyUpper<MT5>::value ) {
583  reset( C(i,0UL) );
584  }
585  for( size_t j=jbegin; j<jend; ++j ) {
586  C(i,j) = A(i,kbegin) * B(kbegin,j);
587  }
588  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
589  for( size_t j=jend; j<N; ++j ) {
590  reset( C(i,j) );
591  }
592  }
593  else if( IsStrictlyLower<MT5>::value ) {
594  reset( C(i,N-1UL) );
595  }
596  }
597 
// Remaining summation indices: accumulate into the already initialized row of C
598  for( size_t k=kbegin+1UL; k<kend; ++k )
599  {
600  const size_t jbegin( ( IsUpper<MT5>::value )
601  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
602  :( 0UL ) );
603  const size_t jend( ( IsLower<MT5>::value )
604  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
605  :( N ) );
606  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
607 
608  for( size_t j=jbegin; j<jend; ++j ) {
609  C(i,j) += A(i,k) * B(k,j);
610  }
// NOTE(review): for a lower B, column jend appears to receive its first contribution in
// this iteration (earlier iterations stop at smaller column indices), hence it is
// assigned instead of accumulated -- confirm.
611  if( IsLower<MT5>::value ) {
612  C(i,jend) = A(i,k) * B(k,jend);
613  }
614  }
615  }
616  }
618  //**********************************************************************************************
619 
620  //**Default assignment to dense matrices (general/diagonal)*************************************
634  template< typename MT3 // Type of the left-hand side target matrix
635  , typename MT4 // Type of the left-hand side matrix operand
636  , typename MT5 > // Type of the right-hand side matrix operand
637  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
638  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
639  {
641 
642  const size_t M( A.rows() );
643  const size_t N( B.columns() );
644 
645  for( size_t i=0UL; i<M; ++i )
646  {
647  const size_t jbegin( ( IsUpper<MT4>::value )
648  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
649  :( 0UL ) );
650  const size_t jend( ( IsLower<MT4>::value )
651  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
652  :( N ) );
653  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
654 
655  if( IsUpper<MT4>::value ) {
656  for( size_t j=0UL; j<jbegin; ++j ) {
657  reset( C(i,j) );
658  }
659  }
660  for( size_t j=jbegin; j<jend; ++j ) {
661  C(i,j) = A(i,j) * B(j,j);
662  }
663  if( IsLower<MT4>::value ) {
664  for( size_t j=jend; j<N; ++j ) {
665  reset( C(i,j) );
666  }
667  }
668  }
669  }
671  //**********************************************************************************************
672 
673  //**Default assignment to dense matrices (diagonal/general)*************************************
687  template< typename MT3 // Type of the left-hand side target matrix
688  , typename MT4 // Type of the left-hand side matrix operand
689  , typename MT5 > // Type of the right-hand side matrix operand
690  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
691  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
692  {
694 
695  const size_t M( A.rows() );
696  const size_t N( B.columns() );
697 
698  for( size_t i=0UL; i<M; ++i )
699  {
700  const size_t jbegin( ( IsUpper<MT5>::value )
701  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
702  :( 0UL ) );
703  const size_t jend( ( IsLower<MT5>::value )
704  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
705  :( N ) );
706  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
707 
708  if( IsUpper<MT5>::value ) {
709  for( size_t j=0UL; j<jbegin; ++j ) {
710  reset( C(i,j) );
711  }
712  }
713  for( size_t j=jbegin; j<jend; ++j ) {
714  C(i,j) = A(i,i) * B(i,j);
715  }
716  if( IsLower<MT5>::value ) {
717  for( size_t j=jend; j<N; ++j ) {
718  reset( C(i,j) );
719  }
720  }
721  }
722  }
724  //**********************************************************************************************
725 
726  //**Default assignment to dense matrices (diagonal/diagonal)************************************
740  template< typename MT3 // Type of the left-hand side target matrix
741  , typename MT4 // Type of the left-hand side matrix operand
742  , typename MT5 > // Type of the right-hand side matrix operand
743  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
744  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
745  {
747 
748  reset( C );
749 
750  for( size_t i=0UL; i<A.rows(); ++i ) {
751  C(i,i) = A(i,i) * B(i,i);
752  }
753  }
755  //**********************************************************************************************
756 
757  //**Default assignment to dense matrices (small matrices)***************************************
770  template< typename MT3 // Type of the left-hand side target matrix
771  , typename MT4 // Type of the left-hand side matrix operand
772  , typename MT5 > // Type of the right-hand side matrix operand
773  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
774  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
775  {
776  selectDefaultAssignKernel( C, A, B );
777  }
779  //**********************************************************************************************
780 
781  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Vectorized default assignment kernel C = A * B for a row-major target and small matrices.
// Selected when UseVectorizedDefaultKernel is satisfied (see the EnableIf condition below).
//
// C: target row-major dense matrix, A: left-hand side operand, B: right-hand side operand.
//
// The columns of C are processed in blocks of 8, 4, 2 and 1 intrinsic vectors of IT::size
// elements each, followed by a scalar loop for the remaining columns. Within each block the
// summation range [kbegin,kend) is narrowed according to the IsUpper/IsLower/IsStrictly...
// traits of A and B.
// NOTE(review): the xmm accumulators are default-constructed and then summed into; this
// presumably relies on IntrinsicType's default constructor zero-initializing -- confirm.
796  template< typename MT3 // Type of the left-hand side target matrix
797  , typename MT4 // Type of the left-hand side matrix operand
798  , typename MT5 > // Type of the right-hand side matrix operand
799  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
800  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
801  {
802  typedef IntrinsicTrait<ElementType> IT;
803 
804  const size_t M( A.rows() );
805  const size_t N( B.columns() );
806  const size_t K( A.columns() );
807 
// A scalar remainder loop is needed unless both C and B are padded
808  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
809 
// End of the vectorized column range: N rounded down to a multiple of IT::size if a
// remainder loop is required (checked by the assertion), N otherwise
810  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
811  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
812 
813  size_t j( 0UL );
814 
// Blocks of 8 intrinsic vectors (8*IT::size columns), one row of C at a time
815  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
816  for( size_t i=0UL; i<M; ++i )
817  {
818  const size_t kbegin( ( IsUpper<MT4>::value )
819  ?( ( IsLower<MT5>::value )
820  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
821  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
822  :( IsLower<MT5>::value ? j : 0UL ) );
823  const size_t kend( ( IsLower<MT4>::value )
824  ?( ( IsUpper<MT5>::value )
825  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
826  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
827  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
828 
829  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
830 
831  for( size_t k=kbegin; k<kend; ++k ) {
832  const IntrinsicType a1( set( A(i,k) ) );
833  xmm1 = xmm1 + a1 * B.load(k,j );
834  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
835  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
836  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
837  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
838  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
839  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
840  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
841  }
842 
843  (~C).store( i, j , xmm1 );
844  (~C).store( i, j+IT::size , xmm2 );
845  (~C).store( i, j+IT::size*2UL, xmm3 );
846  (~C).store( i, j+IT::size*3UL, xmm4 );
847  (~C).store( i, j+IT::size*4UL, xmm5 );
848  (~C).store( i, j+IT::size*5UL, xmm6 );
849  (~C).store( i, j+IT::size*6UL, xmm7 );
850  (~C).store( i, j+IT::size*7UL, xmm8 );
851  }
852  }
853 
// Blocks of 4 intrinsic vectors, two rows of C at a time (plus one trailing row if M is odd)
854  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
855  {
856  size_t i( 0UL );
857 
858  for( ; (i+2UL) <= M; i+=2UL )
859  {
860  const size_t kbegin( ( IsUpper<MT4>::value )
861  ?( ( IsLower<MT5>::value )
862  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
863  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
864  :( IsLower<MT5>::value ? j : 0UL ) );
865  const size_t kend( ( IsLower<MT4>::value )
866  ?( ( IsUpper<MT5>::value )
867  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
868  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
869  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
870 
// xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1
871  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
872 
873  for( size_t k=kbegin; k<kend; ++k ) {
874  const IntrinsicType a1( set( A(i ,k) ) );
875  const IntrinsicType a2( set( A(i+1UL,k) ) );
876  const IntrinsicType b1( B.load(k,j ) );
877  const IntrinsicType b2( B.load(k,j+IT::size ) );
878  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
879  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
880  xmm1 = xmm1 + a1 * b1;
881  xmm2 = xmm2 + a1 * b2;
882  xmm3 = xmm3 + a1 * b3;
883  xmm4 = xmm4 + a1 * b4;
884  xmm5 = xmm5 + a2 * b1;
885  xmm6 = xmm6 + a2 * b2;
886  xmm7 = xmm7 + a2 * b3;
887  xmm8 = xmm8 + a2 * b4;
888  }
889 
890  (~C).store( i , j , xmm1 );
891  (~C).store( i , j+IT::size , xmm2 );
892  (~C).store( i , j+IT::size*2UL, xmm3 );
893  (~C).store( i , j+IT::size*3UL, xmm4 );
894  (~C).store( i+1UL, j , xmm5 );
895  (~C).store( i+1UL, j+IT::size , xmm6 );
896  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
897  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
898  }
899 
// Trailing single row
900  if( i < M )
901  {
902  const size_t kbegin( ( IsUpper<MT4>::value )
903  ?( ( IsLower<MT5>::value )
904  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
905  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
906  :( IsLower<MT5>::value ? j : 0UL ) );
907  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
908 
909  IntrinsicType xmm1, xmm2, xmm3, xmm4;
910 
911  for( size_t k=kbegin; k<kend; ++k ) {
912  const IntrinsicType a1( set( A(i,k) ) );
913  xmm1 = xmm1 + a1 * B.load(k,j );
914  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
915  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
916  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
917  }
918 
919  (~C).store( i, j , xmm1 );
920  (~C).store( i, j+IT::size , xmm2 );
921  (~C).store( i, j+IT::size*2UL, xmm3 );
922  (~C).store( i, j+IT::size*3UL, xmm4 );
923  }
924  }
925 
// Blocks of 2 intrinsic vectors, two rows of C at a time (plus one trailing row if M is odd)
926  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
927  {
928  size_t i( 0UL );
929 
930  for( ; (i+2UL) <= M; i+=2UL )
931  {
932  const size_t kbegin( ( IsUpper<MT4>::value )
933  ?( ( IsLower<MT5>::value )
934  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
935  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
936  :( IsLower<MT5>::value ? j : 0UL ) );
937  const size_t kend( ( IsLower<MT4>::value )
938  ?( ( IsUpper<MT5>::value )
939  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
940  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
941  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
942 
// xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1
943  IntrinsicType xmm1, xmm2, xmm3, xmm4;
944 
945  for( size_t k=kbegin; k<kend; ++k ) {
946  const IntrinsicType a1( set( A(i ,k) ) );
947  const IntrinsicType a2( set( A(i+1UL,k) ) );
948  const IntrinsicType b1( B.load(k,j ) );
949  const IntrinsicType b2( B.load(k,j+IT::size) );
950  xmm1 = xmm1 + a1 * b1;
951  xmm2 = xmm2 + a1 * b2;
952  xmm3 = xmm3 + a2 * b1;
953  xmm4 = xmm4 + a2 * b2;
954  }
955 
956  (~C).store( i , j , xmm1 );
957  (~C).store( i , j+IT::size, xmm2 );
958  (~C).store( i+1UL, j , xmm3 );
959  (~C).store( i+1UL, j+IT::size, xmm4 );
960  }
961 
// Trailing single row
962  if( i < M )
963  {
964  const size_t kbegin( ( IsUpper<MT4>::value )
965  ?( ( IsLower<MT5>::value )
966  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
967  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
968  :( IsLower<MT5>::value ? j : 0UL ) );
969  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
970 
971  IntrinsicType xmm1, xmm2;
972 
973  for( size_t k=kbegin; k<kend; ++k ) {
974  const IntrinsicType a1( set( A(i,k) ) );
975  xmm1 = xmm1 + a1 * B.load(k,j );
976  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
977  }
978 
979  (~C).store( i, j , xmm1 );
980  (~C).store( i, j+IT::size, xmm2 );
981  }
982  }
983 
// Single intrinsic vectors, two rows of C at a time (plus one trailing row if M is odd)
984  for( ; j<jpos; j+=IT::size )
985  {
986  size_t i( 0UL );
987 
988  for( ; (i+2UL) <= M; i+=2UL )
989  {
990  const size_t kbegin( ( IsUpper<MT4>::value )
991  ?( ( IsLower<MT5>::value )
992  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
993  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
994  :( IsLower<MT5>::value ? j : 0UL ) );
995  const size_t kend( ( IsLower<MT4>::value )
996  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
997  :( K ) );
998 
999  IntrinsicType xmm1, xmm2;
1000 
1001  for( size_t k=kbegin; k<kend; ++k ) {
1002  const IntrinsicType b1( B.load(k,j) );
1003  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1004  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1005  }
1006 
1007  (~C).store( i , j, xmm1 );
1008  (~C).store( i+1UL, j, xmm2 );
1009  }
1010 
// Trailing single row; the summation always runs up to K here
1011  if( i < M )
1012  {
1013  const size_t kbegin( ( IsUpper<MT4>::value )
1014  ?( ( IsLower<MT5>::value )
1015  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1016  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1017  :( IsLower<MT5>::value ? j : 0UL ) );
1018 
1019  IntrinsicType xmm1;
1020 
1021  for( size_t k=kbegin; k<K; ++k ) {
1022  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
1023  }
1024 
1025  (~C).store( i, j, xmm1 );
1026  }
1027  }
1028 
// Scalar loop for the remaining columns [jpos,N); only executed if remainder is set
1029  for( ; remainder && j<N; ++j )
1030  {
1031  size_t i( 0UL );
1032 
1033  for( ; (i+2UL) <= M; i+=2UL )
1034  {
1035  const size_t kbegin( ( IsUpper<MT4>::value )
1036  ?( ( IsLower<MT5>::value )
1037  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1038  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1039  :( IsLower<MT5>::value ? j : 0UL ) );
1040  const size_t kend( ( IsLower<MT4>::value )
1041  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1042  :( K ) );
1043 
1044  ElementType value1 = ElementType();
1045  ElementType value2 = ElementType();
1046 
1047  for( size_t k=kbegin; k<kend; ++k ) {
1048  value1 += A(i ,k) * B(k,j);
1049  value2 += A(i+1UL,k) * B(k,j);
1050  }
1051 
1052  (~C)(i ,j) = value1;
1053  (~C)(i+1UL,j) = value2;
1054  }
1055 
// Trailing single row
1056  if( i < M )
1057  {
1058  const size_t kbegin( ( IsUpper<MT4>::value )
1059  ?( ( IsLower<MT5>::value )
1060  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1061  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1062  :( IsLower<MT5>::value ? j : 0UL ) );
1063 
1064  ElementType value = ElementType();
1065 
1066  for( size_t k=kbegin; k<K; ++k ) {
1067  value += A(i,k) * B(k,j);
1068  }
1069 
1070  (~C)(i,j) = value;
1071  }
1072  }
1073  }
1075  //**********************************************************************************************
1076 
1077  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1092  template< typename MT3 // Type of the left-hand side target matrix
1093  , typename MT4 // Type of the left-hand side matrix operand
1094  , typename MT5 > // Type of the right-hand side matrix operand
1095  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1096  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1097  {
1102 
1103  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1104  const typename MT4::OppositeType tmp( serial( A ) );
1105  assign( ~C, tmp * B );
1106  }
1107  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1108  const typename MT5::OppositeType tmp( serial( B ) );
1109  assign( ~C, A * tmp );
1110  }
1111  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1112  const typename MT4::OppositeType tmp( serial( A ) );
1113  assign( ~C, tmp * B );
1114  }
1115  else {
1116  const typename MT5::OppositeType tmp( serial( B ) );
1117  assign( ~C, A * tmp );
1118  }
1119  }
1121  //**********************************************************************************************
1122 
1123  //**Default assignment to dense matrices (large matrices)***************************************
1136  template< typename MT3 // Type of the left-hand side target matrix
1137  , typename MT4 // Type of the left-hand side matrix operand
1138  , typename MT5 > // Type of the right-hand side matrix operand
1139  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1140  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1141  {
1142  selectDefaultAssignKernel( C, A, B );
1143  }
1145  //**********************************************************************************************
1146 
1147  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
// Vectorized, cache-blocked kernel for the assignment C = A * B to a row-major target
// (DenseMatrix<MT3,false>). The overload is enabled via EnableIf whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// The iteration space is tiled in j, i and k by DMATDMATMULT_JBLOCK_SIZE,
// DMATDMATMULT_IBLOCK_SIZE and DMATDMATMULT_KBLOCK_SIZE. Every (ii,jj) tile of C is reset
// once and is then accumulated k-block by k-block: each micro kernel loads the partial
// result from C, adds the contribution of the current k-block and stores it back.
1162  template< typename MT3 // Type of the left-hand side target matrix
1163  , typename MT4 // Type of the left-hand side matrix operand
1164  , typename MT5 > // Type of the right-hand side matrix operand
1165  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1166  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1167  {
1168  typedef IntrinsicTrait<ElementType> IT;
1169 
1170  const size_t M( A.rows() );
1171  const size_t N( B.columns() );
1172  const size_t K( A.columns() );
1173 
      // A scalar remainder loop is needed unless both the target type and the type of the
      // right-hand side operand are padded.
1174  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1175 
1176  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
1177  {
1178  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
1179 
      // End of the vectorized column range of this tile: with a remainder loop it is jend
      // rounded down to a multiple of IT::size (checked by the assertion), otherwise jend.
1180  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1181  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
1182 
1183  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
1184  {
1185  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
1186 
      // Clear the current tile of C; the k-blocks below accumulate into it.
1187  for( size_t i=ii; i<iend; ++i ) {
1188  for( size_t j=jj; j<jend; ++j ) {
1189  reset( (~C)(i,j) );
1190  }
1191  }
1192 
1193  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
1194  {
      // Exclusive upper bound of the current k-block
1195  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
1196 
1197  size_t j( jj );
1198 
      // Column panels of four intrinsic vectors (4*IT::size columns)
1199  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
1200  {
1201  const size_t j1( j+IT::size );
1202  const size_t j2( j+IT::size*2UL );
1203  const size_t j3( j+IT::size*3UL );
1204 
1205  size_t i( ii );
1206 
      // Two rows x four vectors per iteration
1207  for( ; (i+2UL) <= iend; i+=2UL )
1208  {
      // k is restricted to the current k-block [kk,ktmp); the bounds are narrowed further
      // depending on the compile-time IsUpper/IsLower traits of the two operands.
1209  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1210  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1211  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1212  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
1213 
      // Continue from the partial sums already stored in C
1214  IntrinsicType xmm1( (~C).load(i ,j ) );
1215  IntrinsicType xmm2( (~C).load(i ,j1) );
1216  IntrinsicType xmm3( (~C).load(i ,j2) );
1217  IntrinsicType xmm4( (~C).load(i ,j3) );
1218  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1219  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
1220  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
1221  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
1222 
1223  for( size_t k=kbegin; k<kend; ++k ) {
1224  const IntrinsicType a1( set( A(i ,k) ) );
1225  const IntrinsicType a2( set( A(i+1UL,k) ) );
1226  const IntrinsicType b1( B.load(k,j ) );
1227  const IntrinsicType b2( B.load(k,j1) );
1228  const IntrinsicType b3( B.load(k,j2) );
1229  const IntrinsicType b4( B.load(k,j3) );
1230  xmm1 = xmm1 + a1 * b1;
1231  xmm2 = xmm2 + a1 * b2;
1232  xmm3 = xmm3 + a1 * b3;
1233  xmm4 = xmm4 + a1 * b4;
1234  xmm5 = xmm5 + a2 * b1;
1235  xmm6 = xmm6 + a2 * b2;
1236  xmm7 = xmm7 + a2 * b3;
1237  xmm8 = xmm8 + a2 * b4;
1238  }
1239 
1240  (~C).store( i , j , xmm1 );
1241  (~C).store( i , j1, xmm2 );
1242  (~C).store( i , j2, xmm3 );
1243  (~C).store( i , j3, xmm4 );
1244  (~C).store( i+1UL, j , xmm5 );
1245  (~C).store( i+1UL, j1, xmm6 );
1246  (~C).store( i+1UL, j2, xmm7 );
1247  (~C).store( i+1UL, j3, xmm8 );
1248  }
1249 
      // Remaining single row of the tile (odd number of rows)
1250  if( i < iend )
1251  {
1252  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1253  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1254  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1255  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
1256 
1257  IntrinsicType xmm1( (~C).load(i,j ) );
1258  IntrinsicType xmm2( (~C).load(i,j1) );
1259  IntrinsicType xmm3( (~C).load(i,j2) );
1260  IntrinsicType xmm4( (~C).load(i,j3) );
1261 
1262  for( size_t k=kbegin; k<kend; ++k ) {
1263  const IntrinsicType a1( set( A(i,k) ) );
1264  xmm1 = xmm1 + a1 * B.load(k,j );
1265  xmm2 = xmm2 + a1 * B.load(k,j1);
1266  xmm3 = xmm3 + a1 * B.load(k,j2);
1267  xmm4 = xmm4 + a1 * B.load(k,j3);
1268  }
1269 
1270  (~C).store( i, j , xmm1 );
1271  (~C).store( i, j1, xmm2 );
1272  (~C).store( i, j2, xmm3 );
1273  (~C).store( i, j3, xmm4 );
1274  }
1275  }
1276 
      // Column panels of two intrinsic vectors: four rows, then two rows, then a single row
1277  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
1278  {
1279  const size_t j1( j+IT::size );
1280 
1281  size_t i( ii );
1282 
      // Four rows x two vectors per iteration
1283  for( ; (i+4UL) <= iend; i+=4UL )
1284  {
1285  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1286  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1287  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1288  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
1289 
1290  IntrinsicType xmm1( (~C).load(i ,j ) );
1291  IntrinsicType xmm2( (~C).load(i ,j1) );
1292  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1293  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1294  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
1295  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
1296  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
1297  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
1298 
1299  for( size_t k=kbegin; k<kend; ++k ) {
1300  const IntrinsicType a1( set( A(i ,k) ) );
1301  const IntrinsicType a2( set( A(i+1UL,k) ) );
1302  const IntrinsicType a3( set( A(i+2UL,k) ) );
1303  const IntrinsicType a4( set( A(i+3UL,k) ) );
1304  const IntrinsicType b1( B.load(k,j ) );
1305  const IntrinsicType b2( B.load(k,j1) );
1306  xmm1 = xmm1 + a1 * b1;
1307  xmm2 = xmm2 + a1 * b2;
1308  xmm3 = xmm3 + a2 * b1;
1309  xmm4 = xmm4 + a2 * b2;
1310  xmm5 = xmm5 + a3 * b1;
1311  xmm6 = xmm6 + a3 * b2;
1312  xmm7 = xmm7 + a4 * b1;
1313  xmm8 = xmm8 + a4 * b2;
1314  }
1315 
1316  (~C).store( i , j , xmm1 );
1317  (~C).store( i , j1, xmm2 );
1318  (~C).store( i+1UL, j , xmm3 );
1319  (~C).store( i+1UL, j1, xmm4 );
1320  (~C).store( i+2UL, j , xmm5 );
1321  (~C).store( i+2UL, j1, xmm6 );
1322  (~C).store( i+3UL, j , xmm7 );
1323  (~C).store( i+3UL, j1, xmm8 );
1324  }
1325 
      // Two rows x two vectors per iteration
1326  for( ; (i+2UL) <= iend; i+=2UL )
1327  {
1328  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1329  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1330  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1331  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
1332 
1333  IntrinsicType xmm1( (~C).load(i ,j ) );
1334  IntrinsicType xmm2( (~C).load(i ,j1) );
1335  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1336  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1337 
1338  for( size_t k=kbegin; k<kend; ++k ) {
1339  const IntrinsicType a1( set( A(i ,k) ) );
1340  const IntrinsicType a2( set( A(i+1UL,k) ) );
1341  const IntrinsicType b1( B.load(k,j ) );
1342  const IntrinsicType b2( B.load(k,j1) );
1343  xmm1 = xmm1 + a1 * b1;
1344  xmm2 = xmm2 + a1 * b2;
1345  xmm3 = xmm3 + a2 * b1;
1346  xmm4 = xmm4 + a2 * b2;
1347  }
1348 
1349  (~C).store( i , j , xmm1 );
1350  (~C).store( i , j1, xmm2 );
1351  (~C).store( i+1UL, j , xmm3 );
1352  (~C).store( i+1UL, j1, xmm4 );
1353  }
1354 
      // Remaining single row of the tile
1355  if( i < iend )
1356  {
1357  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1358  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1359  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1360  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
1361 
1362  IntrinsicType xmm1( (~C).load(i,j ) );
1363  IntrinsicType xmm2( (~C).load(i,j1) );
1364 
1365  for( size_t k=kbegin; k<kend; ++k ) {
1366  const IntrinsicType a1( set( A(i,k) ) );
1367  xmm1 = xmm1 + a1 * B.load(k,j );
1368  xmm2 = xmm2 + a1 * B.load(k,j1);
1369  }
1370 
1371  (~C).store( i, j , xmm1 );
1372  (~C).store( i, j1, xmm2 );
1373  }
1374  }
1375 
      // Remaining single intrinsic vectors, processed row by row
1376  for( ; j<jpos; j+=IT::size )
1377  {
1378  for( size_t i=ii; i<iend; ++i )
1379  {
1380  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1381  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1382  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1383  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
1384 
1385  IntrinsicType xmm1( (~C).load(i,j) );
1386 
1387  for( size_t k=kbegin; k<kend; ++k ) {
1388  const IntrinsicType a1( set( A(i,k) ) );
1389  xmm1 = xmm1 + a1 * B.load(k,j);
1390  }
1391 
1392  (~C).store( i, j, xmm1 );
1393  }
1394  }
1395 
      // Scalar processing of the columns in [jpos,jend) that the vectorized loops left out
1396  for( ; remainder && j<jend; ++j )
1397  {
1398  for( size_t i=ii; i<iend; ++i )
1399  {
1400  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1401  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1402  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1403  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
1404 
1405  ElementType value( (~C)(i,j) );
1406 
1407  for( size_t k=kbegin; k<kend; ++k ) {
1408  value += A(i,k) * B(k,j);
1409  }
1410 
1411  (~C)(i,j) = value;
1412  }
1413  }
1414  }
1415  }
1416  }
1417  }
1419  //**********************************************************************************************
1420 
1421  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
1435  template< typename MT3 // Type of the left-hand side target matrix
1436  , typename MT4 // Type of the left-hand side matrix operand
1437  , typename MT5 > // Type of the right-hand side matrix operand
1438  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1439  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1440  {
1441  selectSmallAssignKernel( ~C, A, B );
1442  }
1444  //**********************************************************************************************
1445 
1446  //**BLAS-based assignment to dense matrices (default)*******************************************
1459  template< typename MT3 // Type of the left-hand side target matrix
1460  , typename MT4 // Type of the left-hand side matrix operand
1461  , typename MT5 > // Type of the right-hand side matrix operand
1462  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1463  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1464  {
1465  selectLargeAssignKernel( C, A, B );
1466  }
1468  //**********************************************************************************************
1469 
1470  //**BLAS-based assignment to dense matrices*****************************************************
1471 #if BLAZE_BLAS_MODE
1472 
1484  template< typename MT3 // Type of the left-hand side target matrix
1485  , typename MT4 // Type of the left-hand side matrix operand
1486  , typename MT5 > // Type of the right-hand side matrix operand
1487  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1488  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1489  {
1490  typedef typename MT3::ElementType ET;
1491 
1492  if( IsTriangular<MT4>::value ) {
1493  assign( C, B );
1494  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1495  }
1496  else if( IsTriangular<MT5>::value ) {
1497  assign( C, A );
1498  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1499  }
1500  else {
1501  gemm( C, A, B, ET(1), ET(0) );
1502  }
1503  }
1505 #endif
1506  //**********************************************************************************************
1507 
1508  //**Assignment to sparse matrices***************************************************************
1521  template< typename MT // Type of the target sparse matrix
1522  , bool SO > // Storage order of the target sparse matrix
1523  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1524  assign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1525  {
1527 
1528  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
1529 
1536 
1537  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1538  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1539 
1540  const TmpType tmp( serial( rhs ) );
1541  assign( ~lhs, tmp );
1542  }
1544  //**********************************************************************************************
1545 
1546  //**Restructuring assignment to column-major matrices*******************************************
1561  template< typename MT > // Type of the target matrix
1562  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1563  assign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
1564  {
1566 
1568 
1569  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1570  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1571 
1572  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
1573  assign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
1574  else if( IsSymmetric<MT1>::value )
1575  assign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
1576  else
1577  assign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
1578  }
1580  //**********************************************************************************************
1581 
1582  //**Addition assignment to dense matrices*******************************************************
1595  template< typename MT // Type of the target dense matrix
1596  , bool SO > // Storage order of the target dense matrix
1597  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1598  addAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1599  {
1601 
1602  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1603  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1604 
1605  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1606  return;
1607  }
1608 
1609  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1610  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1611 
1612  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1613  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1614  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1615  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1616  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1617  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1618 
1619  DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1620  }
1622  //**********************************************************************************************
1623 
1624  //**Addition assignment to dense matrices (kernel selection)************************************
1635  template< typename MT3 // Type of the left-hand side target matrix
1636  , typename MT4 // Type of the left-hand side matrix operand
1637  , typename MT5 > // Type of the right-hand side matrix operand
1638  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1639  {
1640  if( ( IsDiagonal<MT5>::value ) ||
1641  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1642  selectSmallAddAssignKernel( C, A, B );
1643  else
1644  selectBlasAddAssignKernel( C, A, B );
1645  }
1647  //**********************************************************************************************
1648 
1649  //**Default addition assignment to dense matrices (general/general)*****************************
1663  template< typename MT3 // Type of the left-hand side target matrix
1664  , typename MT4 // Type of the left-hand side matrix operand
1665  , typename MT5 > // Type of the right-hand side matrix operand
1666  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1667  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1668  {
1669  const size_t M( A.rows() );
1670  const size_t N( B.columns() );
1671  const size_t K( A.columns() );
1672 
1673  for( size_t i=0UL; i<M; ++i )
1674  {
1675  const size_t kbegin( ( IsUpper<MT4>::value )
1676  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1677  :( 0UL ) );
1678  const size_t kend( ( IsLower<MT4>::value )
1679  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
1680  :( K ) );
1681  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1682 
1683  for( size_t k=kbegin; k<kend; ++k )
1684  {
1685  const size_t jbegin( ( IsUpper<MT5>::value )
1686  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
1687  :( 0UL ) );
1688  const size_t jend( ( IsLower<MT5>::value )
1689  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
1690  :( N ) );
1691  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1692 
1693  const size_t jnum( jend - jbegin );
1694  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1695 
1696  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1697  C(i,j ) += A(i,k) * B(k,j );
1698  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1699  }
1700  if( jpos < jend ) {
1701  C(i,jpos) += A(i,k) * B(k,jpos);
1702  }
1703  }
1704  }
1705  }
1707  //**********************************************************************************************
1708 
1709  //**Default addition assignment to dense matrices (general/diagonal)****************************
1723  template< typename MT3 // Type of the left-hand side target matrix
1724  , typename MT4 // Type of the left-hand side matrix operand
1725  , typename MT5 > // Type of the right-hand side matrix operand
1726  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1727  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1728  {
1730 
1731  const size_t M( A.rows() );
1732  const size_t N( B.columns() );
1733 
1734  for( size_t i=0UL; i<M; ++i )
1735  {
1736  const size_t jbegin( ( IsUpper<MT4>::value )
1737  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1738  :( 0UL ) );
1739  const size_t jend( ( IsLower<MT4>::value )
1740  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
1741  :( N ) );
1742  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1743 
1744  const size_t jnum( jend - jbegin );
1745  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1746 
1747  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1748  C(i,j ) += A(i,j ) * B(j ,j );
1749  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
1750  }
1751  if( jpos < jend ) {
1752  C(i,jpos) += A(i,jpos) * B(jpos,jpos);
1753  }
1754  }
1755  }
1757  //**********************************************************************************************
1758 
1759  //**Default addition assignment to dense matrices (diagonal/general)****************************
1773  template< typename MT3 // Type of the left-hand side target matrix
1774  , typename MT4 // Type of the left-hand side matrix operand
1775  , typename MT5 > // Type of the right-hand side matrix operand
1776  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
1777  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1778  {
1780 
1781  const size_t M( A.rows() );
1782  const size_t N( B.columns() );
1783 
1784  for( size_t i=0UL; i<M; ++i )
1785  {
1786  const size_t jbegin( ( IsUpper<MT5>::value )
1787  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
1788  :( 0UL ) );
1789  const size_t jend( ( IsLower<MT5>::value )
1790  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
1791  :( N ) );
1792  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1793 
1794  const size_t jnum( jend - jbegin );
1795  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1796 
1797  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1798  C(i,j ) += A(i,i) * B(i,j );
1799  C(i,j+1UL) += A(i,i) * B(i,j+1UL);
1800  }
1801  if( jpos < jend ) {
1802  C(i,jpos) += A(i,i) * B(i,jpos);
1803  }
1804  }
1805  }
1807  //**********************************************************************************************
1808 
1809  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
1823  template< typename MT3 // Type of the left-hand side target matrix
1824  , typename MT4 // Type of the left-hand side matrix operand
1825  , typename MT5 > // Type of the right-hand side matrix operand
1826  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
1827  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1828  {
1830 
1831  for( size_t i=0UL; i<A.rows(); ++i ) {
1832  C(i,i) += A(i,i) * B(i,i);
1833  }
1834  }
1836  //**********************************************************************************************
1837 
1838  //**Default addition assignment to dense matrices (small matrices)******************************
1852  template< typename MT3 // Type of the left-hand side target matrix
1853  , typename MT4 // Type of the left-hand side matrix operand
1854  , typename MT5 > // Type of the right-hand side matrix operand
1855  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1856  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1857  {
1858  selectDefaultAddAssignKernel( C, A, B );
1859  }
1861  //**********************************************************************************************
1862 
1863  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized kernel for the addition assignment C += A * B to a row-major target
// (DenseMatrix<MT3,false>). The overload is enabled via EnableIf whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// The columns of C are processed in panels of eight, four, two and finally one intrinsic
// vector(s), followed by a scalar loop for the columns that do not fill a complete vector.
// Every micro kernel loads the current values of C, accumulates set( A(i,k) ) * B.load(k,j)
// over the inner index k and stores the result back to C. The range of k is narrowed via
// the IsUpper/IsLower/IsStrictlyUpper/IsStrictlyLower traits of the operands.
1878  template< typename MT3 // Type of the left-hand side target matrix
1879  , typename MT4 // Type of the left-hand side matrix operand
1880  , typename MT5 > // Type of the right-hand side matrix operand
1881  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1882  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1883  {
1884  typedef IntrinsicTrait<ElementType> IT;
1885 
1886  const size_t M( A.rows() );
1887  const size_t N( B.columns() );
1888  const size_t K( A.columns() );
1889 
      // A scalar remainder loop is needed unless both the target type and the type of the
      // right-hand side operand are padded.
1890  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1891 
      // End of the vectorized column range: with a remainder loop it is N rounded down to a
      // multiple of IT::size (checked by the assertion), otherwise N.
1892  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
1893  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
1894 
1895  size_t j( 0UL );
1896 
      // Column panels of eight intrinsic vectors, one row per iteration
1897  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
1898  for( size_t i=0UL; i<M; ++i )
1899  {
1900  const size_t kbegin( ( IsUpper<MT4>::value )
1901  ?( ( IsLower<MT5>::value )
1902  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1903  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1904  :( IsLower<MT5>::value ? j : 0UL ) );
1905  const size_t kend( ( IsLower<MT4>::value )
1906  ?( ( IsUpper<MT5>::value )
1907  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
1908  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1909  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
1910 
      // Addition assignment: start from the values currently stored in C
1911  IntrinsicType xmm1( (~C).load(i,j ) );
1912  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1913  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1914  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1915  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
1916  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
1917  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
1918  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
1919 
1920  for( size_t k=kbegin; k<kend; ++k ) {
1921  const IntrinsicType a1( set( A(i,k) ) );
1922  xmm1 = xmm1 + a1 * B.load(k,j );
1923  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1924  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1925  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1926  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1927  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1928  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1929  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1930  }
1931 
1932  (~C).store( i, j , xmm1 );
1933  (~C).store( i, j+IT::size , xmm2 );
1934  (~C).store( i, j+IT::size*2UL, xmm3 );
1935  (~C).store( i, j+IT::size*3UL, xmm4 );
1936  (~C).store( i, j+IT::size*4UL, xmm5 );
1937  (~C).store( i, j+IT::size*5UL, xmm6 );
1938  (~C).store( i, j+IT::size*6UL, xmm7 );
1939  (~C).store( i, j+IT::size*7UL, xmm8 );
1940  }
1941  }
1942 
      // Column panels of four intrinsic vectors: two rows per iteration, then a single row
1943  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
1944  {
1945  size_t i( 0UL );
1946 
1947  for( ; (i+2UL) <= M; i+=2UL )
1948  {
1949  const size_t kbegin( ( IsUpper<MT4>::value )
1950  ?( ( IsLower<MT5>::value )
1951  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1952  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1953  :( IsLower<MT5>::value ? j : 0UL ) );
1954  const size_t kend( ( IsLower<MT4>::value )
1955  ?( ( IsUpper<MT5>::value )
1956  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
1957  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1958  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
1959 
1960  IntrinsicType xmm1( (~C).load(i ,j ) );
1961  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
1962  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
1963  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
1964  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1965  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
1966  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
1967  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
1968 
1969  for( size_t k=kbegin; k<kend; ++k ) {
1970  const IntrinsicType a1( set( A(i ,k) ) );
1971  const IntrinsicType a2( set( A(i+1UL,k) ) );
1972  const IntrinsicType b1( B.load(k,j ) );
1973  const IntrinsicType b2( B.load(k,j+IT::size ) );
1974  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
1975  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
1976  xmm1 = xmm1 + a1 * b1;
1977  xmm2 = xmm2 + a1 * b2;
1978  xmm3 = xmm3 + a1 * b3;
1979  xmm4 = xmm4 + a1 * b4;
1980  xmm5 = xmm5 + a2 * b1;
1981  xmm6 = xmm6 + a2 * b2;
1982  xmm7 = xmm7 + a2 * b3;
1983  xmm8 = xmm8 + a2 * b4;
1984  }
1985 
1986  (~C).store( i , j , xmm1 );
1987  (~C).store( i , j+IT::size , xmm2 );
1988  (~C).store( i , j+IT::size*2UL, xmm3 );
1989  (~C).store( i , j+IT::size*3UL, xmm4 );
1990  (~C).store( i+1UL, j , xmm5 );
1991  (~C).store( i+1UL, j+IT::size , xmm6 );
1992  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
1993  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
1994  }
1995 
      // Remaining single row (odd M). The upper bound of k only takes IsUpper<MT5> into
      // account here.
1996  if( i < M )
1997  {
1998  const size_t kbegin( ( IsUpper<MT4>::value )
1999  ?( ( IsLower<MT5>::value )
2000  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2001  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2002  :( IsLower<MT5>::value ? j : 0UL ) );
2003  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
2004 
2005  IntrinsicType xmm1( (~C).load(i,j ) );
2006  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
2007  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
2008  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
2009 
2010  for( size_t k=kbegin; k<kend; ++k ) {
2011  const IntrinsicType a1( set( A(i,k) ) );
2012  xmm1 = xmm1 + a1 * B.load(k,j );
2013  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2014  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2015  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2016  }
2017 
2018  (~C).store( i, j , xmm1 );
2019  (~C).store( i, j+IT::size , xmm2 );
2020  (~C).store( i, j+IT::size*2UL, xmm3 );
2021  (~C).store( i, j+IT::size*3UL, xmm4 );
2022  }
2023  }
2024 
      // Column panels of two intrinsic vectors: two rows per iteration, then a single row
2025  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
2026  {
2027  size_t i( 0UL );
2028 
2029  for( ; (i+2UL) <= M; i+=2UL )
2030  {
2031  const size_t kbegin( ( IsUpper<MT4>::value )
2032  ?( ( IsLower<MT5>::value )
2033  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2034  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2035  :( IsLower<MT5>::value ? j : 0UL ) );
2036  const size_t kend( ( IsLower<MT4>::value )
2037  ?( ( IsUpper<MT5>::value )
2038  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
2039  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2040  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
2041 
2042  IntrinsicType xmm1( (~C).load(i ,j ) );
2043  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
2044  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2045  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
2046 
2047  for( size_t k=kbegin; k<kend; ++k ) {
2048  const IntrinsicType a1( set( A(i ,k) ) );
2049  const IntrinsicType a2( set( A(i+1UL,k) ) );
2050  const IntrinsicType b1( B.load(k,j ) );
2051  const IntrinsicType b2( B.load(k,j+IT::size) );
2052  xmm1 = xmm1 + a1 * b1;
2053  xmm2 = xmm2 + a1 * b2;
2054  xmm3 = xmm3 + a2 * b1;
2055  xmm4 = xmm4 + a2 * b2;
2056  }
2057 
2058  (~C).store( i , j , xmm1 );
2059  (~C).store( i , j+IT::size, xmm2 );
2060  (~C).store( i+1UL, j , xmm3 );
2061  (~C).store( i+1UL, j+IT::size, xmm4 );
2062  }
2063 
      // Remaining single row (odd M)
2064  if( i < M )
2065  {
2066  const size_t kbegin( ( IsUpper<MT4>::value )
2067  ?( ( IsLower<MT5>::value )
2068  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2069  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2070  :( IsLower<MT5>::value ? j : 0UL ) );
2071  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
2072 
2073  IntrinsicType xmm1( (~C).load(i,j ) );
2074  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
2075 
2076  for( size_t k=kbegin; k<kend; ++k ) {
2077  const IntrinsicType a1( set( A(i,k) ) );
2078  xmm1 = xmm1 + a1 * B.load(k,j );
2079  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
2080  }
2081 
2082  (~C).store( i, j , xmm1 );
2083  (~C).store( i, j+IT::size, xmm2 );
2084  }
2085  }
2086 
      // Remaining single intrinsic vectors: two rows per iteration, then a single row
2087  for( ; j<jpos; j+=IT::size )
2088  {
2089  size_t i( 0UL );
2090 
2091  for( ; (i+2UL) <= M; i+=2UL )
2092  {
2093  const size_t kbegin( ( IsUpper<MT4>::value )
2094  ?( ( IsLower<MT5>::value )
2095  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2096  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2097  :( IsLower<MT5>::value ? j : 0UL ) );
2098  const size_t kend( ( IsLower<MT4>::value )
2099  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2100  :( K ) );
2101 
2102  IntrinsicType xmm1( (~C).load(i ,j) );
2103  IntrinsicType xmm2( (~C).load(i+1UL,j) );
2104 
2105  for( size_t k=kbegin; k<kend; ++k ) {
2106  const IntrinsicType b1( B.load(k,j) );
2107  xmm1 = xmm1 + set( A(i ,k) ) * b1;
2108  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
2109  }
2110 
2111  (~C).store( i , j, xmm1 );
2112  (~C).store( i+1UL, j, xmm2 );
2113  }
2114 
      // Remaining single row (odd M); k runs up to K
2115  if( i < M )
2116  {
2117  const size_t kbegin( ( IsUpper<MT4>::value )
2118  ?( ( IsLower<MT5>::value )
2119  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2120  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2121  :( IsLower<MT5>::value ? j : 0UL ) );
2122 
2123  IntrinsicType xmm1( (~C).load(i,j) );
2124 
2125  for( size_t k=kbegin; k<K; ++k ) {
2126  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
2127  }
2128 
2129  (~C).store( i, j, xmm1 );
2130  }
2131  }
2132 
      // Scalar processing of the columns in [jpos,N) that the vectorized loops left out
2133  for( ; remainder && j<N; ++j )
2134  {
2135  size_t i( 0UL );
2136 
2137  for( ; (i+2UL) <= M; i+=2UL )
2138  {
2139  const size_t kbegin( ( IsUpper<MT4>::value )
2140  ?( ( IsLower<MT5>::value )
2141  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2142  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2143  :( IsLower<MT5>::value ? j : 0UL ) );
2144  const size_t kend( ( IsLower<MT4>::value )
2145  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2146  :( K ) );
2147 
      // NOTE(review): the second ';' on the value2 line is a harmless empty statement.
2148  ElementType value1( (~C)(i ,j) );
2149  ElementType value2( (~C)(i+1UL,j) );;
2150 
2151  for( size_t k=kbegin; k<kend; ++k ) {
2152  value1 += A(i ,k) * B(k,j);
2153  value2 += A(i+1UL,k) * B(k,j);
2154  }
2155 
2156  (~C)(i ,j) = value1;
2157  (~C)(i+1UL,j) = value2;
2158  }
2159 
      // Remaining single row (odd M); k runs up to K
2160  if( i < M )
2161  {
2162  const size_t kbegin( ( IsUpper<MT4>::value )
2163  ?( ( IsLower<MT5>::value )
2164  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2165  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2166  :( IsLower<MT5>::value ? j : 0UL ) );
2167 
2168  ElementType value( (~C)(i,j) );
2169 
2170  for( size_t k=kbegin; k<K; ++k ) {
2171  value += A(i,k) * B(k,j);
2172  }
2173 
2174  (~C)(i,j) = value;
2175  }
2176  }
2177  }
2179  //**********************************************************************************************
2180 
2181  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2196  template< typename MT3 // Type of the left-hand side target matrix
2197  , typename MT4 // Type of the left-hand side matrix operand
2198  , typename MT5 > // Type of the right-hand side matrix operand
2199  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2200  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2201  {
2206 
2207  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2208  const typename MT4::OppositeType tmp( serial( A ) );
2209  addAssign( ~C, tmp * B );
2210  }
2211  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2212  const typename MT5::OppositeType tmp( serial( B ) );
2213  addAssign( ~C, A * tmp );
2214  }
2215  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2216  const typename MT4::OppositeType tmp( serial( A ) );
2217  addAssign( ~C, tmp * B );
2218  }
2219  else {
2220  const typename MT5::OppositeType tmp( serial( B ) );
2221  addAssign( ~C, A * tmp );
2222  }
2223  }
2225  //**********************************************************************************************
2226 
2227  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix kernel for the addition assignment C += A * B, non-vectorized case.
// This overload is selected via DisableIf on UseVectorizedDefaultKernel<MT3,MT4,MT5>
// and does no work of its own: it forwards all three arguments unchanged to
// selectDefaultAddAssignKernel().
//   C  Target matrix of the addition assignment.
//   A  Left-hand side multiplication operand.
//   B  Right-hand side multiplication operand.
2241  template< typename MT3 // Type of the left-hand side target matrix
2242  , typename MT4 // Type of the left-hand side matrix operand
2243  , typename MT5 > // Type of the right-hand side matrix operand
2244  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2245  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2246  {
2247  selectDefaultAddAssignKernel( C, A, B );
2248  }
2250  //**********************************************************************************************
2251 
2252  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
// Blocked, vectorized kernel for the addition assignment C += A * B to a row-major
// dense matrix (large matrices). The iteration space is tiled three ways: column
// blocks of DMATDMATMULT_JBLOCK_SIZE, row blocks of DMATDMATMULT_IBLOCK_SIZE and
// blocks of DMATDMATMULT_KBLOCK_SIZE along the shared dimension. Every tile loads
// the current values of C, accumulates the products of its k-range on top of them
// and stores the result back.
//   C  Target matrix; accessed via load()/store() and, in the scalar tail, via (i,j).
//   A  Left-hand side operand; read element-wise and broadcast with set().
//   B  Right-hand side operand; read via load() and, in the scalar tail, via (i,j).
2267  template< typename MT3 // Type of the left-hand side target matrix
2268  , typename MT4 // Type of the left-hand side matrix operand
2269  , typename MT5 > // Type of the right-hand side matrix operand
2270  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2271  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2272  {
2273  typedef IntrinsicTrait<ElementType> IT;
2274 
2275  const size_t M( A.rows() );
2276  const size_t N( B.columns() );
2277  const size_t K( A.columns() );
2278 
// The scalar clean-up loop at the end of each block is only needed if C or B is not padded.
2279  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
2280 
2281  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
2282  {
2283  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
2284 
// End of the column range handled by the SIMD loops. With a remainder, jend is masked
// with size_t(-IT::size); the assertion below expects this to be jend rounded down to
// a multiple of IT::size.
2285  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
2286  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
2287 
2288  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
2289  {
2290  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
2291 
2292  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
2293  {
2294  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
2295 
2296  size_t j( jj );
2297 
// Stage 1: tiles of four SIMD vectors (columns j, j1, j2, j3) by two rows; a single
// trailing row of the row block is handled separately below.
2298  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
2299  {
2300  const size_t j1( j+IT::size );
2301  const size_t j2( j+IT::size*2UL );
2302  const size_t j3( j+IT::size*3UL );
2303 
2304  size_t i( ii );
2305 
2306  for( ; (i+2UL) <= iend; i+=2UL )
2307  {
// k-range of this tile: the current k-block [kk,ktmp), narrowed by the bounds that
// the IsUpper/IsLower traits of A and B allow. The same scheme is used by every
// tile below, with the bounds adapted to the tile's row and column extent.
2308  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2309  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2310  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
2311  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
2312 
2313  IntrinsicType xmm1( (~C).load(i ,j ) );
2314  IntrinsicType xmm2( (~C).load(i ,j1) );
2315  IntrinsicType xmm3( (~C).load(i ,j2) );
2316  IntrinsicType xmm4( (~C).load(i ,j3) );
2317  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
2318  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
2319  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
2320  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
2321 
2322  for( size_t k=kbegin; k<kend; ++k ) {
2323  const IntrinsicType a1( set( A(i ,k) ) );
2324  const IntrinsicType a2( set( A(i+1UL,k) ) );
2325  const IntrinsicType b1( B.load(k,j ) );
2326  const IntrinsicType b2( B.load(k,j1) );
2327  const IntrinsicType b3( B.load(k,j2) );
2328  const IntrinsicType b4( B.load(k,j3) );
2329  xmm1 = xmm1 + a1 * b1;
2330  xmm2 = xmm2 + a1 * b2;
2331  xmm3 = xmm3 + a1 * b3;
2332  xmm4 = xmm4 + a1 * b4;
2333  xmm5 = xmm5 + a2 * b1;
2334  xmm6 = xmm6 + a2 * b2;
2335  xmm7 = xmm7 + a2 * b3;
2336  xmm8 = xmm8 + a2 * b4;
2337  }
2338 
2339  (~C).store( i , j , xmm1 );
2340  (~C).store( i , j1, xmm2 );
2341  (~C).store( i , j2, xmm3 );
2342  (~C).store( i , j3, xmm4 );
2343  (~C).store( i+1UL, j , xmm5 );
2344  (~C).store( i+1UL, j1, xmm6 );
2345  (~C).store( i+1UL, j2, xmm7 );
2346  (~C).store( i+1UL, j3, xmm8 );
2347  }
2348 
// Single trailing row of the row block (four SIMD vectors wide).
2349  if( i < iend )
2350  {
2351  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2352  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2353  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2354  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
2355 
2356  IntrinsicType xmm1( (~C).load(i,j ) );
2357  IntrinsicType xmm2( (~C).load(i,j1) );
2358  IntrinsicType xmm3( (~C).load(i,j2) );
2359  IntrinsicType xmm4( (~C).load(i,j3) );
2360 
2361  for( size_t k=kbegin; k<kend; ++k ) {
2362  const IntrinsicType a1( set( A(i,k) ) );
2363  xmm1 = xmm1 + a1 * B.load(k,j );
2364  xmm2 = xmm2 + a1 * B.load(k,j1);
2365  xmm3 = xmm3 + a1 * B.load(k,j2);
2366  xmm4 = xmm4 + a1 * B.load(k,j3);
2367  }
2368 
2369  (~C).store( i, j , xmm1 );
2370  (~C).store( i, j1, xmm2 );
2371  (~C).store( i, j2, xmm3 );
2372  (~C).store( i, j3, xmm4 );
2373  }
2374  }
2375 
// Stage 2: tiles of two SIMD vectors (columns j, j1); rows are processed four at a
// time, then two at a time, then a single trailing row.
2376  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
2377  {
2378  const size_t j1( j+IT::size );
2379 
2380  size_t i( ii );
2381 
2382  for( ; (i+4UL) <= iend; i+=4UL )
2383  {
2384  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2385  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2386  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
2387  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
2388 
2389  IntrinsicType xmm1( (~C).load(i ,j ) );
2390  IntrinsicType xmm2( (~C).load(i ,j1) );
2391  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2392  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
2393  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
2394  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
2395  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
2396  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
2397 
2398  for( size_t k=kbegin; k<kend; ++k ) {
2399  const IntrinsicType a1( set( A(i ,k) ) );
2400  const IntrinsicType a2( set( A(i+1UL,k) ) );
2401  const IntrinsicType a3( set( A(i+2UL,k) ) );
2402  const IntrinsicType a4( set( A(i+3UL,k) ) );
2403  const IntrinsicType b1( B.load(k,j ) );
2404  const IntrinsicType b2( B.load(k,j1) );
2405  xmm1 = xmm1 + a1 * b1;
2406  xmm2 = xmm2 + a1 * b2;
2407  xmm3 = xmm3 + a2 * b1;
2408  xmm4 = xmm4 + a2 * b2;
2409  xmm5 = xmm5 + a3 * b1;
2410  xmm6 = xmm6 + a3 * b2;
2411  xmm7 = xmm7 + a4 * b1;
2412  xmm8 = xmm8 + a4 * b2;
2413  }
2414 
2415  (~C).store( i , j , xmm1 );
2416  (~C).store( i , j1, xmm2 );
2417  (~C).store( i+1UL, j , xmm3 );
2418  (~C).store( i+1UL, j1, xmm4 );
2419  (~C).store( i+2UL, j , xmm5 );
2420  (~C).store( i+2UL, j1, xmm6 );
2421  (~C).store( i+3UL, j , xmm7 );
2422  (~C).store( i+3UL, j1, xmm8 );
2423  }
2424 
2425  for( ; (i+2UL) <= iend; i+=2UL )
2426  {
2427  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2428  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2429  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
2430  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
2431 
2432  IntrinsicType xmm1( (~C).load(i ,j ) );
2433  IntrinsicType xmm2( (~C).load(i ,j1) );
2434  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2435  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
2436 
2437  for( size_t k=kbegin; k<kend; ++k ) {
2438  const IntrinsicType a1( set( A(i ,k) ) );
2439  const IntrinsicType a2( set( A(i+1UL,k) ) );
2440  const IntrinsicType b1( B.load(k,j ) );
2441  const IntrinsicType b2( B.load(k,j1) );
2442  xmm1 = xmm1 + a1 * b1;
2443  xmm2 = xmm2 + a1 * b2;
2444  xmm3 = xmm3 + a2 * b1;
2445  xmm4 = xmm4 + a2 * b2;
2446  }
2447 
2448  (~C).store( i , j , xmm1 );
2449  (~C).store( i , j1, xmm2 );
2450  (~C).store( i+1UL, j , xmm3 );
2451  (~C).store( i+1UL, j1, xmm4 );
2452  }
2453 
2454  if( i < iend )
2455  {
2456  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2457  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2458  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2459  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
2460 
2461  IntrinsicType xmm1( (~C).load(i,j ) );
2462  IntrinsicType xmm2( (~C).load(i,j1) );
2463 
2464  for( size_t k=kbegin; k<kend; ++k ) {
2465  const IntrinsicType a1( set( A(i,k) ) );
2466  xmm1 = xmm1 + a1 * B.load(k,j );
2467  xmm2 = xmm2 + a1 * B.load(k,j1);
2468  }
2469 
2470  (~C).store( i, j , xmm1 );
2471  (~C).store( i, j1, xmm2 );
2472  }
2473  }
2474 
// Stage 3: one SIMD vector per step, one row at a time.
2475  for( ; j<jpos; j+=IT::size )
2476  {
2477  for( size_t i=ii; i<iend; ++i )
2478  {
2479  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2480  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2481  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2482  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
2483 
2484  IntrinsicType xmm1( (~C).load(i,j) );
2485 
2486  for( size_t k=kbegin; k<kend; ++k ) {
2487  const IntrinsicType a1( set( A(i,k) ) );
2488  xmm1 = xmm1 + a1 * B.load(k,j);
2489  }
2490 
2491  (~C).store( i, j, xmm1 );
2492  }
2493  }
2494 
// Stage 4: scalar loop over the columns between jpos and jend; only entered when
// 'remainder' is set.
2495  for( ; remainder && j<jend; ++j )
2496  {
2497  for( size_t i=ii; i<iend; ++i )
2498  {
2499  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2500  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2501  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2502  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
2503 
2504  ElementType value( (~C)(i,j) );
2505 
2506  for( size_t k=kbegin; k<kend; ++k ) {
2507  value += A(i,k) * B(k,j);
2508  }
2509 
2510  (~C)(i,j) = value;
2511  }
2512  }
2513  }
2514  }
2515  }
2516  }
2518  //**********************************************************************************************
2519 
2520  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
// Vectorized large-matrix kernel for the addition assignment C += A * B to a
// column-major dense matrix. There is no dedicated blocked implementation for this
// storage order here: the call is forwarded to selectSmallAddAssignKernel() with the
// target passed as ~C and both operands unchanged.
//   C  Column-major target matrix of the addition assignment.
//   A  Left-hand side multiplication operand.
//   B  Right-hand side multiplication operand.
2534  template< typename MT3 // Type of the left-hand side target matrix
2535  , typename MT4 // Type of the left-hand side matrix operand
2536  , typename MT5 > // Type of the right-hand side matrix operand
2537  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2538  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2539  {
2540  selectSmallAddAssignKernel( ~C, A, B );
2541  }
2543  //**********************************************************************************************
2544 
2545  //**BLAS-based addition assignment to dense matrices (default)**********************************
// Fallback of the BLAS-based addition assignment C += A * B. This overload is
// selected via DisableIf on UseBlasKernel<MT3,MT4,MT5>, i.e. when the BLAS kernel is
// not applicable to the given types, and forwards all three arguments unchanged to
// selectLargeAddAssignKernel().
//   C  Target matrix of the addition assignment.
//   A  Left-hand side multiplication operand.
//   B  Right-hand side multiplication operand.
2559  template< typename MT3 // Type of the left-hand side target matrix
2560  , typename MT4 // Type of the left-hand side matrix operand
2561  , typename MT5 > // Type of the right-hand side matrix operand
2562  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2563  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2564  {
2565  selectLargeAddAssignKernel( C, A, B );
2566  }
2568  //**********************************************************************************************
2569 
2570  //**BLAS-based addition assignment to dense matrices********************************************
2571 #if BLAZE_BLAS_MODE
2572 
2585  template< typename MT3 // Type of the left-hand side target matrix
2586  , typename MT4 // Type of the left-hand side matrix operand
2587  , typename MT5 > // Type of the right-hand side matrix operand
2588  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2589  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2590  {
2591  typedef typename MT3::ElementType ET;
2592 
2593  if( IsTriangular<MT4>::value ) {
2594  typename MT3::ResultType tmp( serial( B ) );
2595  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2596  addAssign( C, tmp );
2597  }
2598  else if( IsTriangular<MT5>::value ) {
2599  typename MT3::ResultType tmp( serial( A ) );
2600  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2601  addAssign( C, tmp );
2602  }
2603  else {
2604  gemm( C, A, B, ET(1), ET(1) );
2605  }
2606  }
2608 #endif
2609  //**********************************************************************************************
2610 
2611  //**Restructuring addition assignment to column-major matrices**********************************
2626  template< typename MT > // Type of the target matrix
2627  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
2628  addAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
2629  {
2631 
2633 
2634  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2635  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2636 
2637  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
2638  addAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
2639  else if( IsSymmetric<MT1>::value )
2640  addAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
2641  else
2642  addAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
2643  }
2645  //**********************************************************************************************
2646 
2647  //**Addition assignment to sparse matrices******************************************************
2648  // No special implementation for the addition assignment to sparse matrices.
2649  //**********************************************************************************************
2650 
2651  //**Subtraction assignment to dense matrices****************************************************
2664  template< typename MT // Type of the target dense matrix
2665  , bool SO > // Storage order of the target dense matrix
2666  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
2667  subAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
2668  {
2670 
2671  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2672  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2673 
2674  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2675  return;
2676  }
2677 
2678  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2679  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2680 
2681  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2682  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2683  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2684  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2685  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2686  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2687 
2688  DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
2689  }
2691  //**********************************************************************************************
2692 
2693  //**Subtraction assignment to dense matrices (kernel selection)*********************************
2704  template< typename MT3 // Type of the left-hand side target matrix
2705  , typename MT4 // Type of the left-hand side matrix operand
2706  , typename MT5 > // Type of the right-hand side matrix operand
2707  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2708  {
2709  if( ( IsDiagonal<MT5>::value ) ||
2710  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
2711  selectSmallSubAssignKernel( C, A, B );
2712  else
2713  selectBlasSubAssignKernel( C, A, B );
2714  }
2716  //**********************************************************************************************
2717 
2718  //**Default subtraction assignment to dense matrices (general/general)**************************
2732  template< typename MT3 // Type of the left-hand side target matrix
2733  , typename MT4 // Type of the left-hand side matrix operand
2734  , typename MT5 > // Type of the right-hand side matrix operand
2735  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2736  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2737  {
2738  const size_t M( A.rows() );
2739  const size_t N( B.columns() );
2740  const size_t K( A.columns() );
2741 
2742  for( size_t i=0UL; i<M; ++i )
2743  {
2744  const size_t kbegin( ( IsUpper<MT4>::value )
2745  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2746  :( 0UL ) );
2747  const size_t kend( ( IsLower<MT4>::value )
2748  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2749  :( K ) );
2750  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2751 
2752  for( size_t k=kbegin; k<kend; ++k )
2753  {
2754  const size_t jbegin( ( IsUpper<MT5>::value )
2755  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2756  :( 0UL ) );
2757  const size_t jend( ( IsLower<MT5>::value )
2758  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
2759  :( N ) );
2760  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2761 
2762  const size_t jnum( jend - jbegin );
2763  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2764 
2765  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2766  C(i,j ) -= A(i,k) * B(k,j );
2767  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
2768  }
2769  if( jpos < jend ) {
2770  C(i,jpos) -= A(i,k) * B(k,jpos);
2771  }
2772  }
2773  }
2774  }
2776  //**********************************************************************************************
2777 
2778  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
2792  template< typename MT3 // Type of the left-hand side target matrix
2793  , typename MT4 // Type of the left-hand side matrix operand
2794  , typename MT5 > // Type of the right-hand side matrix operand
2795  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2796  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2797  {
2799 
2800  const size_t M( A.rows() );
2801  const size_t N( B.columns() );
2802 
2803  for( size_t i=0UL; i<M; ++i )
2804  {
2805  const size_t jbegin( ( IsUpper<MT4>::value )
2806  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2807  :( 0UL ) );
2808  const size_t jend( ( IsLower<MT4>::value )
2809  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2810  :( N ) );
2811  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2812 
2813  const size_t jnum( jend - jbegin );
2814  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2815 
2816  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2817  C(i,j ) -= A(i,j ) * B(j ,j );
2818  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
2819  }
2820  if( jpos < jend ) {
2821  C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
2822  }
2823  }
2824  }
2826  //**********************************************************************************************
2827 
2828  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
2842  template< typename MT3 // Type of the left-hand side target matrix
2843  , typename MT4 // Type of the left-hand side matrix operand
2844  , typename MT5 > // Type of the right-hand side matrix operand
2845  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2846  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2847  {
2849 
2850  const size_t M( A.rows() );
2851  const size_t N( B.columns() );
2852 
2853  for( size_t i=0UL; i<M; ++i )
2854  {
2855  const size_t jbegin( ( IsUpper<MT5>::value )
2856  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2857  :( 0UL ) );
2858  const size_t jend( ( IsLower<MT5>::value )
2859  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
2860  :( N ) );
2861  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2862 
2863  const size_t jnum( jend - jbegin );
2864  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2865 
2866  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2867  C(i,j ) -= A(i,i) * B(i,j );
2868  C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
2869  }
2870  if( jpos < jend ) {
2871  C(i,jpos) -= A(i,i) * B(i,jpos);
2872  }
2873  }
2874  }
2876  //**********************************************************************************************
2877 
2878  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
2892  template< typename MT3 // Type of the left-hand side target matrix
2893  , typename MT4 // Type of the left-hand side matrix operand
2894  , typename MT5 > // Type of the right-hand side matrix operand
2895  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2896  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2897  {
2899 
2900  for( size_t i=0UL; i<A.rows(); ++i ) {
2901  C(i,i) -= A(i,i) * B(i,i);
2902  }
2903  }
2905  //**********************************************************************************************
2906 
2907  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix kernel for the subtraction assignment C -= A * B, non-vectorized case.
// This overload is selected via DisableIf on UseVectorizedDefaultKernel<MT3,MT4,MT5>
// and forwards all three arguments unchanged to selectDefaultSubAssignKernel().
//   C  Target matrix of the subtraction assignment.
//   A  Left-hand side multiplication operand.
//   B  Right-hand side multiplication operand.
2921  template< typename MT3 // Type of the left-hand side target matrix
2922  , typename MT4 // Type of the left-hand side matrix operand
2923  , typename MT5 > // Type of the right-hand side matrix operand
2924  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2925  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2926  {
2927  selectDefaultSubAssignKernel( C, A, B );
2928  }
2930  //**********************************************************************************************
2931 
2932  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
2947  template< typename MT3 // Type of the left-hand side target matrix
2948  , typename MT4 // Type of the left-hand side matrix operand
2949  , typename MT5 > // Type of the right-hand side matrix operand
2950  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2951  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2952  {
2953  typedef IntrinsicTrait<ElementType> IT;
2954 
2955  const size_t M( A.rows() );
2956  const size_t N( B.columns() );
2957  const size_t K( A.columns() );
2958 
2959  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
2960 
2961  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
2962  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
2963 
2964  size_t j( 0UL );
2965 
2966  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
2967  for( size_t i=0UL; i<M; ++i )
2968  {
2969  const size_t kbegin( ( IsUpper<MT4>::value )
2970  ?( ( IsLower<MT5>::value )
2971  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2972  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2973  :( IsLower<MT5>::value ? j : 0UL ) );
2974  const size_t kend( ( IsLower<MT4>::value )
2975  ?( ( IsUpper<MT5>::value )
2976  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
2977  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2978  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
2979 
2980  IntrinsicType xmm1( (~C).load(i,j ) );
2981  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
2982  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
2983  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
2984  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
2985  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
2986  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
2987  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
2988 
2989  for( size_t k=kbegin; k<kend; ++k ) {
2990  const IntrinsicType a1( set( A(i,k) ) );
2991  xmm1 = xmm1 - a1 * B.load(k,j );
2992  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
2993  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
2994  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
2995  xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
2996  xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
2997  xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
2998  xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
2999  }
3000 
3001  (~C).store( i, j , xmm1 );
3002  (~C).store( i, j+IT::size , xmm2 );
3003  (~C).store( i, j+IT::size*2UL, xmm3 );
3004  (~C).store( i, j+IT::size*3UL, xmm4 );
3005  (~C).store( i, j+IT::size*4UL, xmm5 );
3006  (~C).store( i, j+IT::size*5UL, xmm6 );
3007  (~C).store( i, j+IT::size*6UL, xmm7 );
3008  (~C).store( i, j+IT::size*7UL, xmm8 );
3009  }
3010  }
3011 
3012  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
3013  {
3014  size_t i( 0UL );
3015 
3016  for( ; (i+2UL) <= M; i+=2UL )
3017  {
3018  const size_t kbegin( ( IsUpper<MT4>::value )
3019  ?( ( IsLower<MT5>::value )
3020  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3021  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3022  :( IsLower<MT5>::value ? j : 0UL ) );
3023  const size_t kend( ( IsLower<MT4>::value )
3024  ?( ( IsUpper<MT5>::value )
3025  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
3026  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3027  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
3028 
3029  IntrinsicType xmm1( (~C).load(i ,j ) );
3030  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
3031  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
3032  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
3033  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
3034  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
3035  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
3036  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
3037 
3038  for( size_t k=kbegin; k<kend; ++k ) {
3039  const IntrinsicType a1( set( A(i ,k) ) );
3040  const IntrinsicType a2( set( A(i+1UL,k) ) );
3041  const IntrinsicType b1( B.load(k,j ) );
3042  const IntrinsicType b2( B.load(k,j+IT::size ) );
3043  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
3044  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
3045  xmm1 = xmm1 - a1 * b1;
3046  xmm2 = xmm2 - a1 * b2;
3047  xmm3 = xmm3 - a1 * b3;
3048  xmm4 = xmm4 - a1 * b4;
3049  xmm5 = xmm5 - a2 * b1;
3050  xmm6 = xmm6 - a2 * b2;
3051  xmm7 = xmm7 - a2 * b3;
3052  xmm8 = xmm8 - a2 * b4;
3053  }
3054 
3055  (~C).store( i , j , xmm1 );
3056  (~C).store( i , j+IT::size , xmm2 );
3057  (~C).store( i , j+IT::size*2UL, xmm3 );
3058  (~C).store( i , j+IT::size*3UL, xmm4 );
3059  (~C).store( i+1UL, j , xmm5 );
3060  (~C).store( i+1UL, j+IT::size , xmm6 );
3061  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
3062  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
3063  }
3064 
3065  if( i < M )
3066  {
3067  const size_t kbegin( ( IsUpper<MT4>::value )
3068  ?( ( IsLower<MT5>::value )
3069  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3070  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3071  :( IsLower<MT5>::value ? j : 0UL ) );
3072  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
3073 
3074  IntrinsicType xmm1( (~C).load(i,j ) );
3075  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
3076  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
3077  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
3078 
3079  for( size_t k=kbegin; k<kend; ++k ) {
3080  const IntrinsicType a1( set( A(i,k) ) );
3081  xmm1 = xmm1 - a1 * B.load(k,j );
3082  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
3083  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
3084  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
3085  }
3086 
3087  (~C).store( i, j , xmm1 );
3088  (~C).store( i, j+IT::size , xmm2 );
3089  (~C).store( i, j+IT::size*2UL, xmm3 );
3090  (~C).store( i, j+IT::size*3UL, xmm4 );
3091  }
3092  }
3093 
3094  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
3095  {
3096  size_t i( 0UL );
3097 
3098  for( ; (i+2UL) <= M; i+=2UL )
3099  {
3100  const size_t kbegin( ( IsUpper<MT4>::value )
3101  ?( ( IsLower<MT5>::value )
3102  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3103  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3104  :( IsLower<MT5>::value ? j : 0UL ) );
3105  const size_t kend( ( IsLower<MT4>::value )
3106  ?( ( IsUpper<MT5>::value )
3107  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
3108  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3109  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
3110 
3111  IntrinsicType xmm1( (~C).load(i ,j ) );
3112  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
3113  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3114  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
3115 
3116  for( size_t k=kbegin; k<kend; ++k ) {
3117  const IntrinsicType a1( set( A(i ,k) ) );
3118  const IntrinsicType a2( set( A(i+1UL,k) ) );
3119  const IntrinsicType b1( B.load(k,j ) );
3120  const IntrinsicType b2( B.load(k,j+IT::size) );
3121  xmm1 = xmm1 - a1 * b1;
3122  xmm2 = xmm2 - a1 * b2;
3123  xmm3 = xmm3 - a2 * b1;
3124  xmm4 = xmm4 - a2 * b2;
3125  }
3126 
3127  (~C).store( i , j , xmm1 );
3128  (~C).store( i , j+IT::size, xmm2 );
3129  (~C).store( i+1UL, j , xmm3 );
3130  (~C).store( i+1UL, j+IT::size, xmm4 );
3131  }
3132 
3133  if( i < M )
3134  {
3135  const size_t kbegin( ( IsUpper<MT4>::value )
3136  ?( ( IsLower<MT5>::value )
3137  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3138  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3139  :( IsLower<MT5>::value ? j : 0UL ) );
3140  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
3141 
3142  IntrinsicType xmm1( (~C).load(i,j ) );
3143  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
3144 
3145  for( size_t k=kbegin; k<kend; ++k ) {
3146  const IntrinsicType a1( set( A(i,k) ) );
3147  xmm1 = xmm1 - a1 * B.load(k,j );
3148  xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
3149  }
3150 
3151  (~C).store( i, j , xmm1 );
3152  (~C).store( i, j+IT::size, xmm2 );
3153  }
3154  }
3155 
3156  for( ; j<jpos; j+=IT::size )
3157  {
3158  size_t i( 0UL );
3159 
3160  for( ; (i+2UL) <= M; i+=2UL )
3161  {
3162  const size_t kbegin( ( IsUpper<MT4>::value )
3163  ?( ( IsLower<MT5>::value )
3164  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3165  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3166  :( IsLower<MT5>::value ? j : 0UL ) );
3167  const size_t kend( ( IsLower<MT4>::value )
3168  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3169  :( K ) );
3170 
3171  IntrinsicType xmm1( (~C).load(i ,j) );
3172  IntrinsicType xmm2( (~C).load(i+1UL,j) );
3173 
3174  for( size_t k=kbegin; k<kend; ++k ) {
3175  const IntrinsicType b1( B.load(k,j) );
3176  xmm1 = xmm1 - set( A(i ,k) ) * b1;
3177  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
3178  }
3179 
3180  (~C).store( i , j, xmm1 );
3181  (~C).store( i+1UL, j, xmm2 );
3182  }
3183 
3184  if( i < M )
3185  {
3186  const size_t kbegin( ( IsUpper<MT4>::value )
3187  ?( ( IsLower<MT5>::value )
3188  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3189  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3190  :( IsLower<MT5>::value ? j : 0UL ) );
3191 
3192  IntrinsicType xmm1( (~C).load(i,j) );
3193 
3194  for( size_t k=kbegin; k<K; ++k ) {
3195  xmm1 = xmm1 - set( A(i,k) ) * B.load(k,j);
3196  }
3197 
3198  (~C).store( i, j, xmm1 );
3199  }
3200  }
3201 
3202  for( ; remainder && j<N; ++j )
3203  {
3204  size_t i( 0UL );
3205 
3206  for( ; (i+2UL) <= M; i+=2UL )
3207  {
3208  const size_t kbegin( ( IsUpper<MT4>::value )
3209  ?( ( IsLower<MT5>::value )
3210  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3211  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3212  :( IsLower<MT5>::value ? j : 0UL ) );
3213  const size_t kend( ( IsLower<MT4>::value )
3214  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3215  :( K ) );
3216 
3217  ElementType value1( (~C)(i ,j) );
3218  ElementType value2( (~C)(i+1UL,j) );
3219 
3220  for( size_t k=kbegin; k<kend; ++k ) {
3221  value1 -= A(i ,k) * B(k,j);
3222  value2 -= A(i+1UL,k) * B(k,j);
3223  }
3224 
3225  (~C)(i ,j) = value1;
3226  (~C)(i+1UL,j) = value2;
3227  }
3228 
3229  if( i < M )
3230  {
3231  const size_t kbegin( ( IsUpper<MT4>::value )
3232  ?( ( IsLower<MT5>::value )
3233  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3234  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3235  :( IsLower<MT5>::value ? j : 0UL ) );
3236 
3237  ElementType value( (~C)(i,j) );
3238 
3239  for( size_t k=kbegin; k<K; ++k ) {
3240  value -= A(i,k) * B(k,j);
3241  }
3242 
3243  (~C)(i,j) = value;
3244  }
3245  }
3246  }
3248  //**********************************************************************************************
3249 
3250  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
3265  template< typename MT3 // Type of the left-hand side target matrix
3266  , typename MT4 // Type of the left-hand side matrix operand
3267  , typename MT5 > // Type of the right-hand side matrix operand
3268  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3269  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3270  {
3275 
3276  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3277  const typename MT4::OppositeType tmp( serial( A ) );
3278  subAssign( ~C, tmp * B );
3279  }
3280  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3281  const typename MT5::OppositeType tmp( serial( B ) );
3282  subAssign( ~C, A * tmp );
3283  }
3284  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3285  const typename MT4::OppositeType tmp( serial( A ) );
3286  subAssign( ~C, tmp * B );
3287  }
3288  else {
3289  const typename MT5::OppositeType tmp( serial( B ) );
3290  subAssign( ~C, A * tmp );
3291  }
3292  }
3294  //**********************************************************************************************
3295 
3296  //**Default subtraction assignment to dense matrices (large matrices)***************************
3310  template< typename MT3 // Type of the left-hand side target matrix
3311  , typename MT4 // Type of the left-hand side matrix operand
3312  , typename MT5 > // Type of the right-hand side matrix operand
3313  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3314  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3315  {
3316  selectDefaultSubAssignKernel( C, A, B );
3317  }
3319  //**********************************************************************************************
3320 
3321  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
/*!\brief Vectorized default subtraction assignment of a large dense matrix-dense matrix
//        multiplication to a row-major dense matrix (C -= A*B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// The iteration space is tiled three times: into column blocks of DMATDMATMULT_JBLOCK_SIZE
// (index jj), row blocks of DMATDMATMULT_IBLOCK_SIZE (index ii) and blocks of the inner
// dimension of DMATDMATMULT_KBLOCK_SIZE (index kk). Inside a tile the affected elements of C
// are loaded, decremented by A(i,k)*B(k,j) for every k of the tile's k range and stored back.
// Columns are processed in groups of four, two and one intrinsic vector(s) of IT::size
// elements, followed by an element-wise loop for the columns that do not fill a vector.
*/
3336  template< typename MT3 // Type of the left-hand side target matrix
3337  , typename MT4 // Type of the left-hand side matrix operand
3338  , typename MT5 > // Type of the right-hand side matrix operand
3339  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3340  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3341  {
3342  typedef IntrinsicTrait<ElementType> IT;
3343 
3344  const size_t M( A.rows() );
3345  const size_t N( B.columns() );
3346  const size_t K( A.columns() );
3347 
      // The element-wise clean-up loop at the end is only entered if C or B is not padded
3348  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
3349 
      // Loop over the column blocks [jj,jend)
3350  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
3351  {
3352  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
3353 
      // End of the vectorized column range: with a remainder loop, jend is rounded down to a
      // multiple of IT::size (the assertion below checks the bit mask against the modulo form)
3354  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3355  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
3356 
      // Loop over the row blocks [ii,iend)
3357  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
3358  {
3359  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
3360 
      // Loop over the blocks [kk,ktmp) of the inner dimension
3361  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
3362  {
3363  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
3364 
3365  size_t j( jj );
3366 
      // Column groups of four intrinsic vectors (columns j, j1, j2, j3)
3367  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
3368  {
3369  const size_t j1( j+IT::size );
3370  const size_t j2( j+IT::size*2UL );
3371  const size_t j3( j+IT::size*3UL );
3372 
3373  size_t i( ii );
3374 
      // Two rows at a time
3375  for( ; (i+2UL) <= iend; i+=2UL )
3376  {
      // The k range is clamped to [kk,ktmp). In addition it starts no earlier than i if MT4
      // is upper and no earlier than j if MT5 is lower, and it ends no later than i+2 if MT4
      // is lower and no later than j+IT::size*4 if MT5 is upper (presumably because the
      // skipped operand elements are known to be zero -- confirm against IsUpper/IsLower).
      // The same pattern, adapted to the tile extents, is used in all tiles below.
3377  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3378  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3379  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3380  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
3381 
      // Current values of the 2x4 vector tile of C
3382  IntrinsicType xmm1( (~C).load(i ,j ) );
3383  IntrinsicType xmm2( (~C).load(i ,j1) );
3384  IntrinsicType xmm3( (~C).load(i ,j2) );
3385  IntrinsicType xmm4( (~C).load(i ,j3) );
3386  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
3387  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
3388  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
3389  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
3390 
      // Each element of A is passed through set() and combined with four vectors of row k of B
3391  for( size_t k=kbegin; k<kend; ++k ) {
3392  const IntrinsicType a1( set( A(i ,k) ) );
3393  const IntrinsicType a2( set( A(i+1UL,k) ) );
3394  const IntrinsicType b1( B.load(k,j ) );
3395  const IntrinsicType b2( B.load(k,j1) );
3396  const IntrinsicType b3( B.load(k,j2) );
3397  const IntrinsicType b4( B.load(k,j3) );
3398  xmm1 = xmm1 - a1 * b1;
3399  xmm2 = xmm2 - a1 * b2;
3400  xmm3 = xmm3 - a1 * b3;
3401  xmm4 = xmm4 - a1 * b4;
3402  xmm5 = xmm5 - a2 * b1;
3403  xmm6 = xmm6 - a2 * b2;
3404  xmm7 = xmm7 - a2 * b3;
3405  xmm8 = xmm8 - a2 * b4;
3406  }
3407 
3408  (~C).store( i , j , xmm1 );
3409  (~C).store( i , j1, xmm2 );
3410  (~C).store( i , j2, xmm3 );
3411  (~C).store( i , j3, xmm4 );
3412  (~C).store( i+1UL, j , xmm5 );
3413  (~C).store( i+1UL, j1, xmm6 );
3414  (~C).store( i+1UL, j2, xmm7 );
3415  (~C).store( i+1UL, j3, xmm8 );
3416  }
3417 
      // Single leftover row of the row block (odd number of rows)
3418  if( i < iend )
3419  {
3420  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3421  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3422  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3423  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
3424 
3425  IntrinsicType xmm1( (~C).load(i,j ) );
3426  IntrinsicType xmm2( (~C).load(i,j1) );
3427  IntrinsicType xmm3( (~C).load(i,j2) );
3428  IntrinsicType xmm4( (~C).load(i,j3) );
3429 
3430  for( size_t k=kbegin; k<kend; ++k ) {
3431  const IntrinsicType a1( set( A(i,k) ) );
3432  xmm1 = xmm1 - a1 * B.load(k,j );
3433  xmm2 = xmm2 - a1 * B.load(k,j1);
3434  xmm3 = xmm3 - a1 * B.load(k,j2);
3435  xmm4 = xmm4 - a1 * B.load(k,j3);
3436  }
3437 
3438  (~C).store( i, j , xmm1 );
3439  (~C).store( i, j1, xmm2 );
3440  (~C).store( i, j2, xmm3 );
3441  (~C).store( i, j3, xmm4 );
3442  }
3443  }
3444 
      // Column groups of two intrinsic vectors (columns j and j1)
3445  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
3446  {
3447  const size_t j1( j+IT::size );
3448 
3449  size_t i( ii );
3450 
      // Four rows at a time
3451  for( ; (i+4UL) <= iend; i+=4UL )
3452  {
3453  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3454  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3455  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3456  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
3457 
3458  IntrinsicType xmm1( (~C).load(i ,j ) );
3459  IntrinsicType xmm2( (~C).load(i ,j1) );
3460  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3461  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3462  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
3463  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
3464  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
3465  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
3466 
3467  for( size_t k=kbegin; k<kend; ++k ) {
3468  const IntrinsicType a1( set( A(i ,k) ) );
3469  const IntrinsicType a2( set( A(i+1UL,k) ) );
3470  const IntrinsicType a3( set( A(i+2UL,k) ) );
3471  const IntrinsicType a4( set( A(i+3UL,k) ) );
3472  const IntrinsicType b1( B.load(k,j ) );
3473  const IntrinsicType b2( B.load(k,j1) );
3474  xmm1 = xmm1 - a1 * b1;
3475  xmm2 = xmm2 - a1 * b2;
3476  xmm3 = xmm3 - a2 * b1;
3477  xmm4 = xmm4 - a2 * b2;
3478  xmm5 = xmm5 - a3 * b1;
3479  xmm6 = xmm6 - a3 * b2;
3480  xmm7 = xmm7 - a4 * b1;
3481  xmm8 = xmm8 - a4 * b2;
3482  }
3483 
3484  (~C).store( i , j , xmm1 );
3485  (~C).store( i , j1, xmm2 );
3486  (~C).store( i+1UL, j , xmm3 );
3487  (~C).store( i+1UL, j1, xmm4 );
3488  (~C).store( i+2UL, j , xmm5 );
3489  (~C).store( i+2UL, j1, xmm6 );
3490  (~C).store( i+3UL, j , xmm7 );
3491  (~C).store( i+3UL, j1, xmm8 );
3492  }
3493 
      // Two rows at a time for what the four-row loop left over
3494  for( ; (i+2UL) <= iend; i+=2UL )
3495  {
3496  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3497  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3498  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3499  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
3500 
3501  IntrinsicType xmm1( (~C).load(i ,j ) );
3502  IntrinsicType xmm2( (~C).load(i ,j1) );
3503  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3504  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3505 
3506  for( size_t k=kbegin; k<kend; ++k ) {
3507  const IntrinsicType a1( set( A(i ,k) ) );
3508  const IntrinsicType a2( set( A(i+1UL,k) ) );
3509  const IntrinsicType b1( B.load(k,j ) );
3510  const IntrinsicType b2( B.load(k,j1) );
3511  xmm1 = xmm1 - a1 * b1;
3512  xmm2 = xmm2 - a1 * b2;
3513  xmm3 = xmm3 - a2 * b1;
3514  xmm4 = xmm4 - a2 * b2;
3515  }
3516 
3517  (~C).store( i , j , xmm1 );
3518  (~C).store( i , j1, xmm2 );
3519  (~C).store( i+1UL, j , xmm3 );
3520  (~C).store( i+1UL, j1, xmm4 );
3521  }
3522 
      // Single leftover row of the row block
3523  if( i < iend )
3524  {
3525  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3526  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3527  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3528  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
3529 
3530  IntrinsicType xmm1( (~C).load(i,j ) );
3531  IntrinsicType xmm2( (~C).load(i,j1) );
3532 
3533  for( size_t k=kbegin; k<kend; ++k ) {
3534  const IntrinsicType a1( set( A(i,k) ) );
3535  xmm1 = xmm1 - a1 * B.load(k,j );
3536  xmm2 = xmm2 - a1 * B.load(k,j1);
3537  }
3538 
3539  (~C).store( i, j , xmm1 );
3540  (~C).store( i, j1, xmm2 );
3541  }
3542  }
3543 
      // Remaining full intrinsic vectors: one vector wide, one row at a time
3544  for( ; j<jpos; j+=IT::size )
3545  {
3546  for( size_t i=ii; i<iend; ++i )
3547  {
3548  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3549  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3550  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3551  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
3552 
3553  IntrinsicType xmm1( (~C).load(i,j) );
3554 
3555  for( size_t k=kbegin; k<kend; ++k ) {
3556  const IntrinsicType a1( set( A(i,k) ) );
3557  xmm1 = xmm1 - a1 * B.load(k,j);
3558  }
3559 
3560  (~C).store( i, j, xmm1 );
3561  }
3562  }
3563 
      // Element-wise clean-up of the columns [jpos,jend); skipped entirely without remainder
3564  for( ; remainder && j<jend; ++j )
3565  {
3566  for( size_t i=ii; i<iend; ++i )
3567  {
3568  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3569  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3570  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3571  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
3572 
3573  ElementType value( (~C)(i,j) );
3574 
3575  for( size_t k=kbegin; k<kend; ++k ) {
3576  value -= A(i,k) * B(k,j);
3577  }
3578 
3579  (~C)(i,j) = value;
3580  }
3581  }
3582  }
3583  }
3584  }
3585  }
3587  //**********************************************************************************************
3588 
3589  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
3603  template< typename MT3 // Type of the left-hand side target matrix
3604  , typename MT4 // Type of the left-hand side matrix operand
3605  , typename MT5 > // Type of the right-hand side matrix operand
3606  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3607  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3608  {
3609  selectSmallSubAssignKernel( ~C, A, B );
3610  }
3612  //**********************************************************************************************
3613 
3614  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
3628  template< typename MT3 // Type of the left-hand side target matrix
3629  , typename MT4 // Type of the left-hand side matrix operand
3630  , typename MT5 > // Type of the right-hand side matrix operand
3631  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3632  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3633  {
3634  selectLargeSubAssignKernel( C, A, B );
3635  }
3637  //**********************************************************************************************
3638 
3639  //**BLAS-based subtraction assignment to dense matrices*****************************************
3640 #if BLAZE_BLAS_MODE
3641 
3654  template< typename MT3 // Type of the left-hand side target matrix
3655  , typename MT4 // Type of the left-hand side matrix operand
3656  , typename MT5 > // Type of the right-hand side matrix operand
3657  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3658  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3659  {
3660  typedef typename MT3::ElementType ET;
3661 
3662  if( IsTriangular<MT4>::value ) {
3663  typename MT3::ResultType tmp( serial( B ) );
3664  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3665  subAssign( C, tmp );
3666  }
3667  else if( IsTriangular<MT5>::value ) {
3668  typename MT3::ResultType tmp( serial( A ) );
3669  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3670  subAssign( C, tmp );
3671  }
3672  else {
3673  gemm( C, A, B, ET(-1), ET(1) );
3674  }
3675  }
3677 #endif
3678  //**********************************************************************************************
3679 
3680  //**Restructuring subtraction assignment to column-major matrices*******************************
3695  template< typename MT > // Type of the target matrix
3696  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3697  subAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
3698  {
3700 
3702 
3703  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3704  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3705 
3706  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3707  subAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3708  else if( IsSymmetric<MT1>::value )
3709  subAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3710  else
3711  subAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3712  }
3714  //**********************************************************************************************
3715 
3716  //**Subtraction assignment to sparse matrices***************************************************
3717  // No special implementation for the subtraction assignment to sparse matrices.
3718  //**********************************************************************************************
3719 
3720  //**Multiplication assignment to dense matrices*************************************************
3721  // No special implementation for the multiplication assignment to dense matrices.
3722  //**********************************************************************************************
3723 
3724  //**Multiplication assignment to sparse matrices************************************************
3725  // No special implementation for the multiplication assignment to sparse matrices.
3726  //**********************************************************************************************
3727 
3728  //**SMP assignment to dense matrices************************************************************
3743  template< typename MT // Type of the target dense matrix
3744  , bool SO > // Storage order of the target dense matrix
3745  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3746  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3747  {
3749 
3750  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3751  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3752 
3753  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
3754  return;
3755  }
3756  else if( rhs.lhs_.columns() == 0UL ) {
3757  reset( ~lhs );
3758  return;
3759  }
3760 
3761  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3762  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3763 
3764  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3765  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3766  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3767  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3768  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3769  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3770 
3771  smpAssign( ~lhs, A * B );
3772  }
3774  //**********************************************************************************************
3775 
3776  //**SMP assignment to sparse matrices***********************************************************
3791  template< typename MT // Type of the target sparse matrix
3792  , bool SO > // Storage order of the target sparse matrix
3793  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3794  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3795  {
3797 
3798  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
3799 
3806 
3807  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3808  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3809 
3810  const TmpType tmp( rhs );
3811  smpAssign( ~lhs, tmp );
3812  }
3814  //**********************************************************************************************
3815 
3816  //**Restructuring SMP assignment to column-major matrices***************************************
3831  template< typename MT > // Type of the target matrix
3832  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3833  smpAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
3834  {
3836 
3838 
3839  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3840  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3841 
3842  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3843  smpAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3844  else if( IsSymmetric<MT1>::value )
3845  smpAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3846  else
3847  smpAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3848  }
3850  //**********************************************************************************************
3851 
3852  //**SMP addition assignment to dense matrices***************************************************
3868  template< typename MT // Type of the target dense matrix
3869  , bool SO > // Storage order of the target dense matrix
3870  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3871  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3872  {
3874 
3875  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3876  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3877 
3878  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3879  return;
3880  }
3881 
3882  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3883  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3884 
3885  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3886  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3887  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3888  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3889  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3890  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3891 
3892  smpAddAssign( ~lhs, A * B );
3893  }
3895  //**********************************************************************************************
3896 
3897  //**Restructuring SMP addition assignment to column-major matrices******************************
3912  template< typename MT > // Type of the target matrix
3913  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3914  smpAddAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
3915  {
3917 
3919 
3920  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3921  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3922 
3923  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3924  smpAddAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3925  else if( IsSymmetric<MT1>::value )
3926  smpAddAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3927  else
3928  smpAddAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3929  }
3931  //**********************************************************************************************
3932 
3933  //**SMP addition assignment to sparse matrices**************************************************
3934  // No special implementation for the SMP addition assignment to sparse matrices.
3935  //**********************************************************************************************
3936 
3937  //**SMP subtraction assignment to dense matrices************************************************
3953  template< typename MT // Type of the target dense matrix
3954  , bool SO > // Storage order of the target dense matrix
3955  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3956  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3957  {
3959 
3960  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3961  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3962 
3963  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3964  return;
3965  }
3966 
3967  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3968  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3969 
3970  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3971  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3972  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3973  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3974  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3975  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3976 
3977  smpSubAssign( ~lhs, A * B );
3978  }
3980  //**********************************************************************************************
3981 
3982  //**Restructuring SMP subtraction assignment to column-major matrices***************************
3997  template< typename MT > // Type of the target matrix
3998  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3999  smpSubAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4000  {
4002 
4004 
4005  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4006  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4007 
4008  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
4009  smpSubAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
4010  else if( IsSymmetric<MT1>::value )
4011  smpSubAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
4012  else
4013  smpSubAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
4014  }
4016  //**********************************************************************************************
4017 
4018  //**SMP subtraction assignment to sparse matrices***********************************************
4019  // No special implementation for the SMP subtraction assignment to sparse matrices.
4020  //**********************************************************************************************
4021 
4022  //**SMP multiplication assignment to dense matrices*********************************************
4023  // No special implementation for the SMP multiplication assignment to dense matrices.
4024  //**********************************************************************************************
4025 
4026  //**SMP multiplication assignment to sparse matrices********************************************
4027  // No special implementation for the SMP multiplication assignment to sparse matrices.
4028  //**********************************************************************************************
4029 
4030  //**Compile time checks*************************************************************************
4038  //**********************************************************************************************
4039 };
4040 //*************************************************************************************************
4041 
4042 
4043 
4044 
4045 //=================================================================================================
4046 //
4047 // DMATSCALARMULTEXPR SPECIALIZATION
4048 //
4049 //=================================================================================================
4050 
4051 //*************************************************************************************************
4059 template< typename MT1 // Type of the left-hand side dense matrix
4060  , typename MT2 // Type of the right-hand side dense matrix
4061  , typename ST > // Type of the right-hand side scalar value
4062 class DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >
4063  : public DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >, false >
4064  , private MatScalarMultExpr
4065  , private Computation
4066 {
4067  private:
4068  //**Type definitions****************************************************************************
4069  typedef DMatDMatMultExpr<MT1,MT2> MMM;
4070  typedef typename MMM::ResultType RES;
4071  typedef typename MT1::ResultType RT1;
4072  typedef typename MT2::ResultType RT2;
4073  typedef typename RT1::ElementType ET1;
4074  typedef typename RT2::ElementType ET2;
4075  typedef typename MT1::CompositeType CT1;
4076  typedef typename MT2::CompositeType CT2;
4077  //**********************************************************************************************
4078 
4079  //**********************************************************************************************
4081  enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
4082  //**********************************************************************************************
4083 
4084  //**********************************************************************************************
4086  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
4087  //**********************************************************************************************
4088 
4089  //**********************************************************************************************
4091 
4096  template< typename T1, typename T2, typename T3 >
4097  struct CanExploitSymmetry {
4098  enum { value = IsColumnMajorMatrix<T1>::value &&
4099  ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
4100  };
4101  //**********************************************************************************************
4102 
4103  //**********************************************************************************************
4105 
4108  template< typename T1, typename T2, typename T3 >
4109  struct IsEvaluationRequired {
4110  enum { value = ( evaluateLeft || evaluateRight ) &&
4111  !CanExploitSymmetry<T1,T2,T3>::value };
4112  };
4113  //**********************************************************************************************
4114 
4115  //**********************************************************************************************
4117 
4119  template< typename T1, typename T2, typename T3, typename T4 >
4120  struct UseBlasKernel {
4121  enum { value = BLAZE_BLAS_MODE &&
4122  HasMutableDataAccess<T1>::value &&
4123  HasConstDataAccess<T2>::value &&
4124  HasConstDataAccess<T3>::value &&
4125  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4126  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4127  IsBlasCompatible<typename T1::ElementType>::value &&
4128  IsBlasCompatible<typename T2::ElementType>::value &&
4129  IsBlasCompatible<typename T3::ElementType>::value &&
4130  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
4131  IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
4132  !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
4133  };
4134  //**********************************************************************************************
4135 
4136  //**********************************************************************************************
4138 
4140  template< typename T1, typename T2, typename T3, typename T4 >
4141  struct UseVectorizedDefaultKernel {
4142  enum { value = useOptimizedKernels &&
4143  !IsDiagonal<T3>::value &&
4144  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4145  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
4146  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
4147  IsSame<typename T1::ElementType,T4>::value &&
4148  IntrinsicTrait<typename T1::ElementType>::addition &&
4149  IntrinsicTrait<typename T1::ElementType>::subtraction &&
4150  IntrinsicTrait<typename T1::ElementType>::multiplication };
4151  };
4152  //**********************************************************************************************
4153 
4154  public:
4155  //**Type definitions****************************************************************************
4156  typedef DMatScalarMultExpr<MMM,ST,false> This; //!< Type of this DMatScalarMultExpr instantiation.
4157  typedef typename MultTrait<RES,ST>::Type ResultType; //!< Result type, obtained from MultTrait<RES,ST>.
4158  typedef typename ResultType::OppositeType OppositeType; //!< OppositeType as published by ResultType.
4159  typedef typename ResultType::TransposeType TransposeType; //!< TransposeType as published by ResultType.
4160  typedef typename ResultType::ElementType ElementType; //!< Element type of ResultType.
4161  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType; //!< Intrinsic type associated with ElementType.
4162  typedef const ElementType ReturnType; //!< Return type of operator() and at(): elements are returned by value.
4163  typedef const ResultType CompositeType; //!< Type used when this expression appears as an operand of another expression.
4164 
      //! Type of the left-hand side operand: the wrapped dense matrix/dense matrix multiplication.
4166  typedef const DMatDMatMultExpr<MT1,MT2> LeftOperand;
4167 
      //! Type of the right-hand side operand: the scalar.
4169  typedef ST RightOperand;
4170 
      //! Type used for the left matrix operand inside the kernels; SelectType picks between
      //! 'const RT1' and 'CT1' depending on evaluateLeft.
4172  typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type LT;
4173 
      //! Type used for the right matrix operand inside the kernels; SelectType picks between
      //! 'const RT2' and 'CT2' depending on evaluateRight.
4175  typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type RT;
4176  //**********************************************************************************************
4177 
4178  //**Compilation flags***************************************************************************
//! Compilation switch for the expression template evaluation strategy.
//! Set only if MT2 is not a diagonal matrix type, both MT1 and MT2 are vectorizable, ET1 is
//! identical to both ET2 and ST, and IntrinsicTrait<ET1> reports addition and multiplication.
4180  enum { vectorizable = !IsDiagonal<MT2>::value &&
4181  MT1::vectorizable && MT2::vectorizable &&
4182  IsSame<ET1,ET2>::value &&
4183  IsSame<ET1,ST>::value &&
4184  IntrinsicTrait<ET1>::addition &&
4185  IntrinsicTrait<ET1>::multiplication };
4186 
      //! Compilation switch for the expression template assignment strategy.
      //! Set only if neither operand is flagged for evaluation (evaluateLeft, evaluateRight)
      //! and both MT1 and MT2 are themselves SMP-assignable.
4188  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4189  !evaluateRight && MT2::smpAssignable };
4190  //**********************************************************************************************
4191 
4192  //**Constructor*********************************************************************************
/*!\brief Constructor for the scaled dense matrix/dense matrix multiplication expression.
//
// \param matrix The left-hand side matrix multiplication expression; copied into matrix_.
// \param scalar The right-hand side scalar; copied into scalar_.
*/
4198  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4199  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4200  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4201  {}
4202  //**********************************************************************************************
4203 
4204  //**Access operator*****************************************************************************
/*!\brief 2D-access to the elements of the scaled matrix product.
//
// \param i Access index for the row; asserted (BLAZE_INTERNAL_ASSERT) to be below matrix_.rows().
// \param j Access index for the column; asserted to be below matrix_.columns().
// \return Element (i,j) of the wrapped matrix expression multiplied by the scalar.
//
// No range check beyond the internal assertions is performed; at() is the checked variant.
*/
4211  inline ReturnType operator()( size_t i, size_t j ) const {
4212  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4213  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4214  return matrix_(i,j) * scalar_;
4215  }
4216  //**********************************************************************************************
4217 
4218  //**At function*********************************************************************************
4226  inline ReturnType at( size_t i, size_t j ) const {
4227  if( i >= matrix_.rows() ) {
4228  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4229  }
4230  if( j >= matrix_.columns() ) {
4231  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4232  }
4233  return (*this)(i,j);
4234  }
4235  //**********************************************************************************************
4236 
4237  //**Rows function*******************************************************************************
/*!\brief Returns the current number of rows of the expression.
//
// \return The number of rows reported by the wrapped matrix multiplication expression.
*/
4242  inline size_t rows() const {
4243  return matrix_.rows();
4244  }
4245  //**********************************************************************************************
4246 
4247  //**Columns function****************************************************************************
/*!\brief Returns the current number of columns of the expression.
//
// \return The number of columns reported by the wrapped matrix multiplication expression.
*/
4252  inline size_t columns() const {
4253  return matrix_.columns();
4254  }
4255  //**********************************************************************************************
4256 
4257  //**Left operand access*************************************************************************
/*!\brief Returns the left-hand side operand of the scaling.
//
// \return The stored dense matrix/dense matrix multiplication expression (matrix_).
*/
4262  inline LeftOperand leftOperand() const {
4263  return matrix_;
4264  }
4265  //**********************************************************************************************
4266 
4267  //**Right operand access************************************************************************
/*!\brief Returns the right-hand side operand of the scaling.
//
// \return The stored scalar (scalar_).
*/
4272  inline RightOperand rightOperand() const {
4273  return scalar_;
4274  }
4275  //**********************************************************************************************
4276 
4277  //**********************************************************************************************
/*!\brief Returns whether the expression can alias with the given address \a alias.
//
// \param alias The address to be checked.
// \return The answer of matrix_.canAlias( alias ); the scalar operand is not consulted.
*/
4283  template< typename T >
4284  inline bool canAlias( const T* alias ) const {
4285  return matrix_.canAlias( alias );
4286  }
4287  //**********************************************************************************************
4288 
4289  //**********************************************************************************************
/*!\brief Returns whether the expression is aliased with the given address \a alias.
//
// \param alias The address to be checked.
// \return The answer of matrix_.isAliased( alias ); the scalar operand is not consulted.
*/
4295  template< typename T >
4296  inline bool isAliased( const T* alias ) const {
4297  return matrix_.isAliased( alias );
4298  }
4299  //**********************************************************************************************
4300 
4301  //**********************************************************************************************
/*!\brief Returns whether the operands of the expression are properly aligned in memory.
//
// \return The answer of matrix_.isAligned().
*/
4306  inline bool isAligned() const {
4307  return matrix_.isAligned();
4308  }
4309  //**********************************************************************************************
4310 
4311  //**********************************************************************************************
4316  inline bool canSMPAssign() const {
4317  typename MMM::LeftOperand A( matrix_.leftOperand() );
4318  return ( !BLAZE_BLAS_IS_PARALLEL ||
4319  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
4320  ( A.rows() > SMP_DMATDMATMULT_THRESHOLD );
4321  }
4322  //**********************************************************************************************
4323 
4324  private:
4325  //**Member variables****************************************************************************
4326  LeftOperand matrix_; //!< Left-hand side dense matrix of the multiplication expression (stored by value).
4327  RightOperand scalar_; //!< Right-hand side scalar of the multiplication expression.
4328  //**********************************************************************************************
4329 
4330  //**Assignment to dense matrices****************************************************************
/*!\brief Assignment of a scaled dense matrix/dense matrix multiplication to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void (the return type is produced by DisableIf)
//
// The overload is removed via DisableIf when CanExploitSymmetry<MT,MT1,MT2> is set. The
// dimensions of \a lhs are only asserted, not adapted. After the early exits for empty
// targets and for an empty inner dimension, both matrix operands are bound to the operand
// types LT and RT and the work is handed to selectAssignKernel() together with the scalar.
*/
4342  template< typename MT // Type of the target dense matrix
4343  , bool SO > // Storage order of the target dense matrix
4344  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4345  assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4346  {
4348 
4349  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4350  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4351 
      // Unpack the two matrix operands of the wrapped multiplication expression
4352  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4353  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4354 
      // Empty target: nothing to write. Empty inner dimension: the target is reset instead.
4355  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4356  return;
4357  }
4358  else if( left.columns() == 0UL ) {
4359  reset( ~lhs );
4360  return;
4361  }
4362 
      // The operands are wrapped in serial() before being bound to LT/RT; this happens only
      // after the early exits above, so no operand is evaluated for the trivial cases.
4363  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4364  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4365 
4366  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4367  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4368  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4369  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4370  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4371  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4372 
4373  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4374  }
4375  //**********************************************************************************************
4376 
4377  //**Assignment to dense matrices (kernel selection)*********************************************
4388  template< typename MT3 // Type of the left-hand side target matrix
4389  , typename MT4 // Type of the left-hand side matrix operand
4390  , typename MT5 // Type of the right-hand side matrix operand
4391  , typename ST2 > // Type of the scalar value
4392  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4393  {
4394  if( ( IsDiagonal<MT5>::value ) ||
4395  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
4396  selectSmallAssignKernel( C, A, B, scalar );
4397  else
4398  selectBlasAssignKernel( C, A, B, scalar );
4399  }
4400  //**********************************************************************************************
4401 
4402  //**Default assignment to dense matrices (general/general)**************************************
/*!\brief Default assignment of a scaled general dense matrix product (C=s*A*B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void (the return type is produced by EnableIf)
//
// This overload is enabled when neither \a MT4 nor \a MT5 is a diagonal matrix type. Each row
// of \a C is built in three phases: the first contributing k initializes the row, the remaining
// k values are accumulated, and finally the written column range is multiplied by \a scalar.
// The IsUpper/IsLower/IsStrictly... traits of the operand types narrow the k and j ranges.
*/
4416  template< typename MT3 // Type of the left-hand side target matrix
4417  , typename MT4 // Type of the left-hand side matrix operand
4418  , typename MT5 // Type of the right-hand side matrix operand
4419  , typename ST2 > // Type of the scalar value
4420  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4421  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4422  {
4423  const size_t M( A.rows() );
4424  const size_t N( B.columns() );
4425  const size_t K( A.columns() );
4426 
4427  for( size_t i=0UL; i<M; ++i )
4428  {
      // Range of k for row i of A, narrowed according to the triangular traits of MT4
4429  const size_t kbegin( ( IsUpper<MT4>::value )
4430  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4431  :( 0UL ) );
4432  const size_t kend( ( IsLower<MT4>::value )
4433  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4434  :( K ) );
4435  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4436 
      // Strictly triangular A with an empty k range: the complete row of C is reset
4437  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
4438  for( size_t j=0UL; j<N; ++j ) {
4439  reset( (~C)(i,j) );
4440  }
4441  continue;
4442  }
4443 
      // Phase 1: initialize row i of C with the product for k == kbegin (plain assignment)
4444  {
4445  const size_t jbegin( ( IsUpper<MT5>::value )
4446  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
4447  :( 0UL ) );
4448  const size_t jend( ( IsLower<MT5>::value )
4449  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
4450  :( N ) );
4451  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4452 
      // Columns in front of jbegin are reset when both operands are upper; for a strictly
      // upper B alone only column 0 is reset
4453  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
4454  for( size_t j=0UL; j<jbegin; ++j ) {
4455  reset( C(i,j) );
4456  }
4457  }
4458  else if( IsStrictlyUpper<MT5>::value ) {
4459  reset( C(i,0UL) );
4460  }
4461  for( size_t j=jbegin; j<jend; ++j ) {
4462  C(i,j) = A(i,kbegin) * B(kbegin,j);
4463  }
      // Columns from jend on are reset when both operands are lower; for a strictly
      // lower B alone only the last column is reset
4464  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
4465  for( size_t j=jend; j<N; ++j ) {
4466  reset( C(i,j) );
4467  }
4468  }
4469  else if( IsStrictlyLower<MT5>::value ) {
4470  reset( C(i,N-1UL) );
4471  }
4472  }
4473 
      // Phase 2: accumulate the contributions of the remaining k values
4474  for( size_t k=kbegin+1UL; k<kend; ++k )
4475  {
4476  const size_t jbegin( ( IsUpper<MT5>::value )
4477  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4478  :( 0UL ) );
4479  const size_t jend( ( IsLower<MT5>::value )
4480  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
4481  :( N ) );
4482  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4483 
4484  for( size_t j=jbegin; j<jend; ++j ) {
4485  C(i,j) += A(i,k) * B(k,j);
4486  }
      // For a lower B the column at index jend is written by assignment, not accumulation:
      // the accumulation loop above stops right before it
4487  if( IsLower<MT5>::value ) {
4488  C(i,jend) = A(i,k) * B(k,jend);
4489  }
4490  }
4491 
      // Phase 3: apply the scalar to the column range of row i selected by the combined
      // triangular traits of both operands
4492  {
4493  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4494  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
4495  :( 0UL ) );
4496  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
4497  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
4498  :( N ) );
4499  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4500 
4501  for( size_t j=jbegin; j<jend; ++j ) {
4502  C(i,j) *= scalar;
4503  }
4504  }
4505  }
4506  }
4507  //**********************************************************************************************
4508 
4509  //**Default assignment to dense matrices (general/diagonal)*************************************
4523  template< typename MT3 // Type of the left-hand side target matrix
4524  , typename MT4 // Type of the left-hand side matrix operand
4525  , typename MT5 // Type of the right-hand side matrix operand
4526  , typename ST2 > // Type of the scalar value
4527  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4528  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4529  {
4531 
4532  const size_t M( A.rows() );
4533  const size_t N( B.columns() );
4534 
4535  for( size_t i=0UL; i<M; ++i )
4536  {
4537  const size_t jbegin( ( IsUpper<MT4>::value )
4538  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4539  :( 0UL ) );
4540  const size_t jend( ( IsLower<MT4>::value )
4541  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4542  :( N ) );
4543  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4544 
4545  if( IsUpper<MT4>::value ) {
4546  for( size_t j=0UL; j<jbegin; ++j ) {
4547  reset( C(i,j) );
4548  }
4549  }
4550  for( size_t j=jbegin; j<jend; ++j ) {
4551  C(i,j) = A(i,j) * B(j,j) * scalar;
4552  }
4553  if( IsLower<MT4>::value ) {
4554  for( size_t j=jend; j<N; ++j ) {
4555  reset( C(i,j) );
4556  }
4557  }
4558  }
4559  }
4560  //**********************************************************************************************
4561 
4562  //**Default assignment to dense matrices (diagonal/general)*************************************
4576  template< typename MT3 // Type of the left-hand side target matrix
4577  , typename MT4 // Type of the left-hand side matrix operand
4578  , typename MT5 // Type of the right-hand side matrix operand
4579  , typename ST2 > // Type of the scalar value
4580  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4581  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4582  {
4584 
4585  const size_t M( A.rows() );
4586  const size_t N( B.columns() );
4587 
4588  for( size_t i=0UL; i<M; ++i )
4589  {
4590  const size_t jbegin( ( IsUpper<MT5>::value )
4591  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4592  :( 0UL ) );
4593  const size_t jend( ( IsLower<MT5>::value )
4594  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
4595  :( N ) );
4596  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4597 
4598  if( IsUpper<MT5>::value ) {
4599  for( size_t j=0UL; j<jbegin; ++j ) {
4600  reset( C(i,j) );
4601  }
4602  }
4603  for( size_t j=jbegin; j<jend; ++j ) {
4604  C(i,j) = A(i,i) * B(i,j) * scalar;
4605  }
4606  if( IsLower<MT5>::value ) {
4607  for( size_t j=jend; j<N; ++j ) {
4608  reset( C(i,j) );
4609  }
4610  }
4611  }
4612  }
4613  //**********************************************************************************************
4614 
4615  //**Default assignment to dense matrices (diagonal/diagonal)************************************
4629  template< typename MT3 // Type of the left-hand side target matrix
4630  , typename MT4 // Type of the left-hand side matrix operand
4631  , typename MT5 // Type of the right-hand side matrix operand
4632  , typename ST2 > // Type of the scalar value
4633  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4634  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4635  {
4637 
4638  reset( C );
4639 
4640  for( size_t i=0UL; i<A.rows(); ++i ) {
4641  C(i,i) = A(i,i) * B(i,i) * scalar;
4642  }
4643  }
4644  //**********************************************************************************************
4645 
4646  //**Default assignment to dense matrices (small matrices)***************************************
/*!\brief Default assignment of a small scaled dense matrix product (C=s*A*B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void (the return type is produced by DisableIf)
//
// This overload is selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not set and
// simply forwards all arguments to selectDefaultAssignKernel().
*/
4660  template< typename MT3 // Type of the left-hand side target matrix
4661  , typename MT4 // Type of the left-hand side matrix operand
4662  , typename MT5 // Type of the right-hand side matrix operand
4663  , typename ST2 > // Type of the scalar value
4664  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4665  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4666  {
4667  selectDefaultAssignKernel( C, A, B, scalar );
4668  }
4669  //**********************************************************************************************
4670 
4671  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
/*!\brief Vectorized default assignment of a small scaled dense matrix product to a row-major
//        dense matrix (C=s*A*B).
//
// \param C The target left-hand side dense matrix (row-major: DenseMatrix<MT3,false>).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void (the return type is produced by EnableIf)
//
// This overload is enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is set. The columns
// of \a C are traversed in strips of 8, 4, 2 and 1 intrinsic vectors (IT::size elements each),
// followed by an element-wise loop for the columns from jpos on. For each strip the sum over k
// of A(i,k)*B(k,j) is accumulated in intrinsic registers, multiplied by the scalar and stored
// into \a C. The k range of every accumulation is narrowed by the IsUpper/IsLower traits of the
// operand types.
*/
4686  template< typename MT3 // Type of the left-hand side target matrix
4687  , typename MT4 // Type of the left-hand side matrix operand
4688  , typename MT5 // Type of the right-hand side matrix operand
4689  , typename ST2 > // Type of the scalar value
4690  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4691  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4692  {
4693  typedef IntrinsicTrait<ElementType> IT;
4694 
4695  const size_t M( A.rows() );
4696  const size_t N( B.columns() );
4697  const size_t K( A.columns() );
4698 
      // An element-wise remainder loop is needed unless both the target and the right operand
      // are padded (IsPadded)
4699  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
4700 
      // End of the vectorized column range: N rounded down to a multiple of IT::size when a
      // remainder has to be handled, otherwise N itself
4701  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
4702  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
4703 
      // Intrinsic-typed representation of the scalar, created via set()
4704  const IntrinsicType factor( set( scalar ) );
4705 
4706  size_t j( 0UL );
4707 
      // Strips of 8 intrinsic vectors, one row of C per inner iteration
4708  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
4709  for( size_t i=0UL; i<M; ++i )
4710  {
      // k range for row i and the current column strip, narrowed by the triangular traits
      // of A (MT4) and B (MT5)
4711  const size_t kbegin( ( IsUpper<MT4>::value )
4712  ?( ( IsLower<MT5>::value )
4713  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4714  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4715  :( IsLower<MT5>::value ? j : 0UL ) );
4716  const size_t kend( ( IsLower<MT4>::value )
4717  ?( ( IsUpper<MT5>::value )
4718  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
4719  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4720  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
4721 
      // NOTE(review): the accumulators rely on IntrinsicType's default construction for
      // their start value (presumably zero) - confirm against the IntrinsicType definition
4722  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4723 
4724  for( size_t k=kbegin; k<kend; ++k ) {
4725  const IntrinsicType a1( set( A(i,k) ) );
4726  xmm1 = xmm1 + a1 * B.load(k,j );
4727  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4728  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4729  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4730  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
4731  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
4732  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
4733  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
4734  }
4735 
      // The scalar is applied once per accumulator when storing
4736  (~C).store( i, j , xmm1 * factor );
4737  (~C).store( i, j+IT::size , xmm2 * factor );
4738  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
4739  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
4740  (~C).store( i, j+IT::size*4UL, xmm5 * factor );
4741  (~C).store( i, j+IT::size*5UL, xmm6 * factor );
4742  (~C).store( i, j+IT::size*6UL, xmm7 * factor );
4743  (~C).store( i, j+IT::size*7UL, xmm8 * factor );
4744  }
4745  }
4746 
      // Strips of 4 intrinsic vectors, two rows of C per inner iteration
4747  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
4748  {
4749  size_t i( 0UL );
4750 
4751  for( ; (i+2UL) <= M; i+=2UL )
4752  {
4753  const size_t kbegin( ( IsUpper<MT4>::value )
4754  ?( ( IsLower<MT5>::value )
4755  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4756  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4757  :( IsLower<MT5>::value ? j : 0UL ) );
4758  const size_t kend( ( IsLower<MT4>::value )
4759  ?( ( IsUpper<MT5>::value )
4760  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
4761  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4762  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
4763 
      // xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1
4764  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4765 
4766  for( size_t k=kbegin; k<kend; ++k ) {
4767  const IntrinsicType a1( set( A(i ,k) ) );
4768  const IntrinsicType a2( set( A(i+1UL,k) ) );
4769  const IntrinsicType b1( B.load(k,j ) );
4770  const IntrinsicType b2( B.load(k,j+IT::size ) );
4771  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
4772  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
4773  xmm1 = xmm1 + a1 * b1;
4774  xmm2 = xmm2 + a1 * b2;
4775  xmm3 = xmm3 + a1 * b3;
4776  xmm4 = xmm4 + a1 * b4;
4777  xmm5 = xmm5 + a2 * b1;
4778  xmm6 = xmm6 + a2 * b2;
4779  xmm7 = xmm7 + a2 * b3;
4780  xmm8 = xmm8 + a2 * b4;
4781  }
4782 
4783  (~C).store( i , j , xmm1 * factor );
4784  (~C).store( i , j+IT::size , xmm2 * factor );
4785  (~C).store( i , j+IT::size*2UL, xmm3 * factor );
4786  (~C).store( i , j+IT::size*3UL, xmm4 * factor );
4787  (~C).store( i+1UL, j , xmm5 * factor );
4788  (~C).store( i+1UL, j+IT::size , xmm6 * factor );
4789  (~C).store( i+1UL, j+IT::size*2UL, xmm7 * factor );
4790  (~C).store( i+1UL, j+IT::size*3UL, xmm8 * factor );
4791  }
4792 
      // Single leftover row when M is odd
4793  if( i < M )
4794  {
4795  const size_t kbegin( ( IsUpper<MT4>::value )
4796  ?( ( IsLower<MT5>::value )
4797  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4798  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4799  :( IsLower<MT5>::value ? j : 0UL ) );
4800  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
4801 
4802  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4803 
4804  for( size_t k=kbegin; k<kend; ++k ) {
4805  const IntrinsicType a1( set( A(i,k) ) );
4806  xmm1 = xmm1 + a1 * B.load(k,j );
4807  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4808  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4809  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4810  }
4811 
4812  (~C).store( i, j , xmm1 * factor );
4813  (~C).store( i, j+IT::size , xmm2 * factor );
4814  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
4815  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
4816  }
4817  }
4818 
      // Strips of 2 intrinsic vectors, two rows of C per inner iteration
4819  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
4820  {
4821  size_t i( 0UL );
4822 
4823  for( ; (i+2UL) <= M; i+=2UL )
4824  {
4825  const size_t kbegin( ( IsUpper<MT4>::value )
4826  ?( ( IsLower<MT5>::value )
4827  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4828  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4829  :( IsLower<MT5>::value ? j : 0UL ) );
4830  const size_t kend( ( IsLower<MT4>::value )
4831  ?( ( IsUpper<MT5>::value )
4832  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
4833  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4834  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
4835 
      // xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1
4836  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4837 
4838  for( size_t k=kbegin; k<kend; ++k ) {
4839  const IntrinsicType a1( set( A(i ,k) ) );
4840  const IntrinsicType a2( set( A(i+1UL,k) ) );
4841  const IntrinsicType b1( B.load(k,j ) );
4842  const IntrinsicType b2( B.load(k,j+IT::size) );
4843  xmm1 = xmm1 + a1 * b1;
4844  xmm2 = xmm2 + a1 * b2;
4845  xmm3 = xmm3 + a2 * b1;
4846  xmm4 = xmm4 + a2 * b2;
4847  }
4848 
4849  (~C).store( i , j , xmm1 * factor );
4850  (~C).store( i , j+IT::size, xmm2 * factor );
4851  (~C).store( i+1UL, j , xmm3 * factor );
4852  (~C).store( i+1UL, j+IT::size, xmm4 * factor );
4853  }
4854 
      // Single leftover row when M is odd
4855  if( i < M )
4856  {
4857  const size_t kbegin( ( IsUpper<MT4>::value )
4858  ?( ( IsLower<MT5>::value )
4859  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4860  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4861  :( IsLower<MT5>::value ? j : 0UL ) );
4862  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
4863 
4864  IntrinsicType xmm1, xmm2;
4865 
4866  for( size_t k=kbegin; k<kend; ++k ) {
4867  const IntrinsicType a1( set( A(i,k) ) );
4868  xmm1 = xmm1 + a1 * B.load(k,j );
4869  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
4870  }
4871 
4872  (~C).store( i, j , xmm1 * factor );
4873  (~C).store( i, j+IT::size, xmm2 * factor );
4874  }
4875  }
4876 
      // Strips of a single intrinsic vector, two rows of C per inner iteration
4877  for( ; j<jpos; j+=IT::size )
4878  {
4879  size_t i( 0UL );
4880 
4881  for( ; (i+2UL) <= M; i+=2UL )
4882  {
4883  const size_t kbegin( ( IsUpper<MT4>::value )
4884  ?( ( IsLower<MT5>::value )
4885  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4886  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4887  :( IsLower<MT5>::value ? j : 0UL ) );
4888  const size_t kend( ( IsLower<MT4>::value )
4889  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4890  :( K ) );
4891 
4892  IntrinsicType xmm1, xmm2;
4893 
4894  for( size_t k=kbegin; k<kend; ++k ) {
4895  const IntrinsicType b1( B.load(k,j) );
4896  xmm1 = xmm1 + set( A(i ,k) ) * b1;
4897  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
4898  }
4899 
4900  (~C).store( i , j, xmm1 * factor );
4901  (~C).store( i+1UL, j, xmm2 * factor );
4902  }
4903 
      // Single leftover row when M is odd; k runs up to K here
4904  if( i < M )
4905  {
4906  const size_t kbegin( ( IsUpper<MT4>::value )
4907  ?( ( IsLower<MT5>::value )
4908  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4909  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4910  :( IsLower<MT5>::value ? j : 0UL ) );
4911 
4912  IntrinsicType xmm1;
4913 
4914  for( size_t k=kbegin; k<K; ++k ) {
4915  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
4916  }
4917 
4918  (~C).store( i, j, xmm1 * factor );
4919  }
4920  }
4921 
      // Element-wise handling of the columns from jpos to N (only if a remainder is possible)
4922  for( ; remainder && j<N; ++j )
4923  {
4924  size_t i( 0UL );
4925 
4926  for( ; (i+2UL) <= M; i+=2UL )
4927  {
4928  const size_t kbegin( ( IsUpper<MT4>::value )
4929  ?( ( IsLower<MT5>::value )
4930  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4931  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4932  :( IsLower<MT5>::value ? j : 0UL ) );
4933  const size_t kend( ( IsLower<MT4>::value )
4934  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4935  :( K ) );
4936 
      // Scalar accumulators start from a value-initialized ElementType
4937  ElementType value1 = ElementType();
4938  ElementType value2 = ElementType();
4939 
4940  for( size_t k=kbegin; k<kend; ++k ) {
4941  value1 += A(i ,k) * B(k,j);
4942  value2 += A(i+1UL,k) * B(k,j);
4943  }
4944 
4945  (~C)(i ,j) = value1 * scalar;
4946  (~C)(i+1UL,j) = value2 * scalar;
4947  }
4948 
      // Single leftover row when M is odd; k runs up to K here
4949  if( i < M )
4950  {
4951  const size_t kbegin( ( IsUpper<MT4>::value )
4952  ?( ( IsLower<MT5>::value )
4953  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4954  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4955  :( IsLower<MT5>::value ? j : 0UL ) );
4956 
4957  ElementType value = ElementType();
4958 
4959  for( size_t k=kbegin; k<K; ++k ) {
4960  value += A(i,k) * B(k,j);
4961  }
4962 
4963  (~C)(i,j) = value * scalar;
4964  }
4965  }
4966  }
4967  //**********************************************************************************************
4968 
4969  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
4984  template< typename MT3 // Type of the left-hand side target matrix
4985  , typename MT4 // Type of the left-hand side matrix operand
4986  , typename MT5 // Type of the right-hand side matrix operand
4987  , typename ST2 > // Type of the scalar value
4988  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4989  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
4990  {
4995 
4996  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
4997  const typename MT4::OppositeType tmp( serial( A ) );
4998  assign( ~C, tmp * B * scalar );
4999  }
5000  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
5001  const typename MT5::OppositeType tmp( serial( B ) );
5002  assign( ~C, A * tmp * scalar );
5003  }
5004  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
5005  const typename MT4::OppositeType tmp( serial( A ) );
5006  assign( ~C, tmp * B * scalar );
5007  }
5008  else {
5009  const typename MT5::OppositeType tmp( serial( B ) );
5010  assign( ~C, A * tmp * scalar );
5011  }
5012  }
5013  //**********************************************************************************************
5014 
5015  //**Default assignment to dense matrices (large matrices)***************************************
/*!\brief Default assignment of a large scaled dense matrix product (C=s*A*B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void (the return type is produced by DisableIf)
//
// This overload is selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is not set and
// simply forwards all arguments to selectDefaultAssignKernel().
*/
5029  template< typename MT3 // Type of the left-hand side target matrix
5030  , typename MT4 // Type of the left-hand side matrix operand
5031  , typename MT5 // Type of the right-hand side matrix operand
5032  , typename ST2 > // Type of the scalar value
5033  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5034  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5035  {
5036  selectDefaultAssignKernel( C, A, B, scalar );
5037  }
5038  //**********************************************************************************************
5039 
5040  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
5055  template< typename MT3 // Type of the left-hand side target matrix
5056  , typename MT4 // Type of the left-hand side matrix operand
5057  , typename MT5 // Type of the right-hand side matrix operand
5058  , typename ST2 > // Type of the scalar value
5059  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5060  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5061  {
5062  typedef IntrinsicTrait<ElementType> IT;
5063 
5064  const size_t M( A.rows() );
5065  const size_t N( B.columns() );
5066  const size_t K( A.columns() );
5067 
5068  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5069 
5070  const IntrinsicType factor( set( scalar ) );
5071 
5072  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
5073  {
5074  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
5075 
5076  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
5077  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
5078 
5079  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
5080  {
5081  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
5082 
5083  for( size_t i=ii; i<iend; ++i ) {
5084  for( size_t j=jj; j<jend; ++j ) {
5085  reset( (~C)(i,j) );
5086  }
5087  }
5088 
5089  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
5090  {
5091  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
5092 
5093  size_t j( jj );
5094 
5095  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
5096  {
5097  const size_t j1( j+IT::size );
5098  const size_t j2( j+IT::size*2UL );
5099  const size_t j3( j+IT::size*3UL );
5100 
5101  size_t i( ii );
5102 
5103  for( ; (i+2UL) <= iend; i+=2UL )
5104  {
5105  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5106  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5107  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5108  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
5109 
5110  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5111 
5112  for( size_t k=kbegin; k<kend; ++k ) {
5113  const IntrinsicType a1( set( A(i ,k) ) );
5114  const IntrinsicType a2( set( A(i+1UL,k) ) );
5115  const IntrinsicType b1( B.load(k,j ) );
5116  const IntrinsicType b2( B.load(k,j1) );
5117  const IntrinsicType b3( B.load(k,j2) );
5118  const IntrinsicType b4( B.load(k,j3) );
5119  xmm1 = xmm1 + a1 * b1;
5120  xmm2 = xmm2 + a1 * b2;
5121  xmm3 = xmm3 + a1 * b3;
5122  xmm4 = xmm4 + a1 * b4;
5123  xmm5 = xmm5 + a2 * b1;
5124  xmm6 = xmm6 + a2 * b2;
5125  xmm7 = xmm7 + a2 * b3;
5126  xmm8 = xmm8 + a2 * b4;
5127  }
5128 
5129  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5130  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5131  (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
5132  (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
5133  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
5134  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
5135  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
5136  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
5137  }
5138 
5139  if( i < iend )
5140  {
5141  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5142  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5143  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5144  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
5145 
5146  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5147 
5148  for( size_t k=kbegin; k<kend; ++k ) {
5149  const IntrinsicType a1( set( A(i,k) ) );
5150  xmm1 = xmm1 + a1 * B.load(k,j );
5151  xmm2 = xmm2 + a1 * B.load(k,j1);
5152  xmm3 = xmm3 + a1 * B.load(k,j2);
5153  xmm4 = xmm4 + a1 * B.load(k,j3);
5154  }
5155 
5156  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5157  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
5158  (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
5159  (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
5160  }
5161  }
5162 
5163  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
5164  {
5165  const size_t j1( j+IT::size );
5166 
5167  size_t i( ii );
5168 
5169  for( ; (i+4UL) <= iend; i+=4UL )
5170  {
5171  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5172  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5173  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5174  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
5175 
5176  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5177 
5178  for( size_t k=kbegin; k<kend; ++k ) {
5179  const IntrinsicType a1( set( A(i ,k) ) );
5180  const IntrinsicType a2( set( A(i+1UL,k) ) );
5181  const IntrinsicType a3( set( A(i+2UL,k) ) );
5182  const IntrinsicType a4( set( A(i+3UL,k) ) );
5183  const IntrinsicType b1( B.load(k,j ) );
5184  const IntrinsicType b2( B.load(k,j1) );
5185  xmm1 = xmm1 + a1 * b1;
5186  xmm2 = xmm2 + a1 * b2;
5187  xmm3 = xmm3 + a2 * b1;
5188  xmm4 = xmm4 + a2 * b2;
5189  xmm5 = xmm5 + a3 * b1;
5190  xmm6 = xmm6 + a3 * b2;
5191  xmm7 = xmm7 + a4 * b1;
5192  xmm8 = xmm8 + a4 * b2;
5193  }
5194 
5195  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5196  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5197  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5198  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
5199  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
5200  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
5201  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
5202  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
5203  }
5204 
5205  for( ; (i+2UL) <= iend; i+=2UL )
5206  {
5207  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5208  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5209  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5210  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
5211 
5212  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5213 
5214  for( size_t k=kbegin; k<kend; ++k ) {
5215  const IntrinsicType a1( set( A(i ,k) ) );
5216  const IntrinsicType a2( set( A(i+1UL,k) ) );
5217  const IntrinsicType b1( B.load(k,j ) );
5218  const IntrinsicType b2( B.load(k,j1) );
5219  xmm1 = xmm1 + a1 * b1;
5220  xmm2 = xmm2 + a1 * b2;
5221  xmm3 = xmm3 + a2 * b1;
5222  xmm4 = xmm4 + a2 * b2;
5223  }
5224 
5225  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5226  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5227  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5228  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
5229  }
5230 
5231  if( i < iend )
5232  {
5233  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5234  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5235  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5236  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
5237 
5238  IntrinsicType xmm1, xmm2;
5239 
5240  for( size_t k=kbegin; k<kend; ++k ) {
5241  const IntrinsicType a1( set( A(i,k) ) );
5242  xmm1 = xmm1 + a1 * B.load(k,j );
5243  xmm2 = xmm2 + a1 * B.load(k,j1);
5244  }
5245 
5246  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5247  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
5248  }
5249  }
5250 
5251  for( ; j<jpos; j+=IT::size )
5252  {
5253  for( size_t i=ii; i<iend; ++i )
5254  {
5255  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5256  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5257  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5258  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
5259 
5260  IntrinsicType xmm1;
5261 
5262  for( size_t k=kbegin; k<kend; ++k ) {
5263  const IntrinsicType a1( set( A(i,k) ) );
5264  xmm1 = xmm1 + a1 * B.load(k,j);
5265  }
5266 
5267  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
5268  }
5269  }
5270 
5271  for( ; remainder && j<jend; ++j )
5272  {
5273  for( size_t i=ii; i<iend; ++i )
5274  {
5275  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5276  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5277  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5278  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
5279 
5280  ElementType value = ElementType();
5281 
5282  for( size_t k=kbegin; k<kend; ++k ) {
5283  value += A(i,k) * B(k,j);
5284  }
5285 
5286  (~C)(i,j) += value * scalar;
5287  }
5288  }
5289  }
5290  }
5291  }
5292  }
5293  //**********************************************************************************************
5294 
5295  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
/*!\brief Large-matrix assignment kernel for column-major targets (vectorized default path).
//
// \param C The column-major target dense matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// This overload is selected via EnableIf on UseVectorizedDefaultKernel. It contains no
// large-matrix implementation of its own: the call is forwarded, with the target unwrapped
// by operator~, to selectSmallAssignKernel().
*/
5309  template< typename MT3 // Type of the left-hand side target matrix
5310  , typename MT4 // Type of the left-hand side matrix operand
5311  , typename MT5 // Type of the right-hand side matrix operand
5312  , typename ST2 > // Type of the scalar value
5313  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5314  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5315  {
      // Delegate to the small-matrix kernel; ~C yields the concrete matrix type MT3.
5316  selectSmallAssignKernel( ~C, A, B, scalar );
5317  }
5318  //**********************************************************************************************
5319 
5320  //**BLAS-based assignment to dense matrices (default)*******************************************
/*!\brief Fallback for the BLAS assignment kernel when the BLAS path is not applicable.
//
// \param C The target dense matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// This overload is selected via DisableIf on UseBlasKernel, i.e. for type combinations for
// which that trait does not hold. It simply forwards all arguments to
// selectLargeAssignKernel().
*/
5334  template< typename MT3 // Type of the left-hand side target matrix
5335  , typename MT4 // Type of the left-hand side matrix operand
5336  , typename MT5 // Type of the right-hand side matrix operand
5337  , typename ST2 > // Type of the scalar value
5338  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5339  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5340  {
      // No BLAS kernel for these types: use the large-matrix kernel instead.
5341  selectLargeAssignKernel( C, A, B, scalar );
5342  }
5343  //**********************************************************************************************
5344 
5345  //**BLAS-based assignment to dense matrices*****************************************************
5346 #if BLAZE_BLAS_MODE
5347 
5360  template< typename MT3 // Type of the left-hand side target matrix
5361  , typename MT4 // Type of the left-hand side matrix operand
5362  , typename MT5 // Type of the right-hand side matrix operand
5363  , typename ST2 > // Type of the scalar value
5364  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5365  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5366  {
5367  typedef typename MT3::ElementType ET;
5368 
5369  if( IsTriangular<MT4>::value ) {
5370  assign( C, B );
5371  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5372  }
5373  else if( IsTriangular<MT5>::value ) {
5374  assign( C, A );
5375  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5376  }
5377  else {
5378  gemm( C, A, B, ET(scalar), ET(0) );
5379  }
5380  }
5381 #endif
5382  //**********************************************************************************************
5383 
5384  //**Assignment to sparse matrices***************************************************************
/*!\brief Assignment of a scaled dense matrix product to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// Selected via DisableIf on CanExploitSymmetry. The expression is first evaluated serially
// into a temporary (OppositeType if \a SO is true, ResultType otherwise) and that
// temporary is then assigned to the sparse target.
// NOTE(review): the listing numbering skips lines inside this body (5401, 5405-5410);
// statements on those lines are not visible here.
*/
5396  template< typename MT // Type of the target sparse matrix
5397  , bool SO > // Storage order of the target sparse matrix
5398  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5399  assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5400  {
5402 
      // Pick the temporary's type according to the storage order flag of the target.
5403  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
5404 
5411 
5412  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5413  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5414 
      // Evaluate the whole expression (serially) into the temporary, then assign it.
5415  const TmpType tmp( serial( rhs ) );
5416  assign( ~lhs, tmp );
5417  }
5418  //**********************************************************************************************
5419 
5420  //**Restructuring assignment to column-major matrices*******************************************
5434  template< typename MT > // Type of the target matrix
5435  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5436  assign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
5437  {
5439 
5441 
5442  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5443  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5444 
5445  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5446  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
5447 
5448  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
5449  assign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
5450  else if( IsSymmetric<MT1>::value )
5451  assign( ~lhs, trans( left ) * right * rhs.scalar_ );
5452  else
5453  assign( ~lhs, left * trans( right ) * rhs.scalar_ );
5454  }
5455  //**********************************************************************************************
5456 
5457  //**Addition assignment to dense matrices*******************************************************
/*!\brief Addition assignment of a scaled dense matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// Selected via DisableIf on CanExploitSymmetry. Returns early, without evaluating the
// operands, if the target has no rows or columns or the left operand has no columns.
// Otherwise both operands are evaluated serially into \a A and \a B and the work is
// dispatched to selectAddAssignKernel() together with the expression's scalar.
// NOTE(review): the listing numbering skips line 5474 inside this body; a statement on
// that line is not visible here.
*/
5469  template< typename MT // Type of the target dense matrix
5470  , bool SO > // Storage order of the target dense matrix
5471  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5472  addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5473  {
5475 
5476  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5477  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5478 
      // Operands of the wrapped matrix/matrix multiplication expression.
5479  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5480  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
5481 
      // Nothing to add for an empty target or an empty inner dimension.
5482  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
5483  return;
5484  }
5485 
5486  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5487  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5488 
      // Sanity checks: evaluation must not change dimensions, and A*B must match the target.
5489  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5490  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5491  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5492  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5493  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5494  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
5495 
5496  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
5497  }
5498  //**********************************************************************************************
5499 
5500  //**Addition assignment to dense matrices (kernel selection)************************************
5511  template< typename MT3 // Type of the left-hand side target matrix
5512  , typename MT4 // Type of the left-hand side matrix operand
5513  , typename MT5 // Type of the right-hand side matrix operand
5514  , typename ST2 > // Type of the scalar value
5515  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5516  {
5517  if( ( IsDiagonal<MT5>::value ) ||
5518  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5519  selectSmallAddAssignKernel( C, A, B, scalar );
5520  else
5521  selectBlasAddAssignKernel( C, A, B, scalar );
5522  }
5523  //**********************************************************************************************
5524 
5525  //**Default addition assignment to dense matrices (general/general)*****************************
/*!\brief Default addition assignment kernel for two non-diagonal operands.
//
// \param C The target dense matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// Selected via EnableIf when neither operand type is diagonal. The scaled product is
// evaluated serially into a temporary of type ResultType, which is then added to \a C.
*/
5539  template< typename MT3 // Type of the left-hand side target matrix
5540  , typename MT4 // Type of the left-hand side matrix operand
5541  , typename MT5 // Type of the right-hand side matrix operand
5542  , typename ST2 > // Type of the scalar value
5543  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
5544  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5545  {
      // Materialize scalar*(A*B) first, then perform a plain matrix addition assignment.
5546  const ResultType tmp( serial( A * B * scalar ) );
5547  addAssign( C, tmp );
5548  }
5549  //**********************************************************************************************
5550 
5551  //**Default addition assignment to dense matrices (general/diagonal)****************************
5565  template< typename MT3 // Type of the left-hand side target matrix
5566  , typename MT4 // Type of the left-hand side matrix operand
5567  , typename MT5 // Type of the right-hand side matrix operand
5568  , typename ST2 > // Type of the scalar value
5569  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5570  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5571  {
5573 
5574  const size_t M( A.rows() );
5575  const size_t N( B.columns() );
5576 
5577  for( size_t i=0UL; i<M; ++i )
5578  {
5579  const size_t jbegin( ( IsUpper<MT4>::value )
5580  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5581  :( 0UL ) );
5582  const size_t jend( ( IsLower<MT4>::value )
5583  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
5584  :( N ) );
5585  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5586 
5587  const size_t jnum( jend - jbegin );
5588  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
5589 
5590  for( size_t j=jbegin; j<jpos; j+=2UL ) {
5591  C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5592  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
5593  }
5594  if( jpos < jend ) {
5595  C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
5596  }
5597  }
5598  }
5599  //**********************************************************************************************
5600 
5601  //**Default addition assignment to dense matrices (diagonal/general)****************************
5615  template< typename MT3 // Type of the left-hand side target matrix
5616  , typename MT4 // Type of the left-hand side matrix operand
5617  , typename MT5 // Type of the right-hand side matrix operand
5618  , typename ST2 > // Type of the scalar value
5619  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5620  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5621  {
5623 
5624  const size_t M( A.rows() );
5625  const size_t N( B.columns() );
5626 
5627  for( size_t i=0UL; i<M; ++i )
5628  {
5629  const size_t jbegin( ( IsUpper<MT5>::value )
5630  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
5631  :( 0UL ) );
5632  const size_t jend( ( IsLower<MT5>::value )
5633  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
5634  :( N ) );
5635  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5636 
5637  const size_t jnum( jend - jbegin );
5638  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
5639 
5640  for( size_t j=jbegin; j<jpos; j+=2UL ) {
5641  C(i,j ) += A(i,i) * B(i,j ) * scalar;
5642  C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
5643  }
5644  if( jpos < jend ) {
5645  C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
5646  }
5647  }
5648  }
5649  //**********************************************************************************************
5650 
5651  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
5665  template< typename MT3 // Type of the left-hand side target matrix
5666  , typename MT4 // Type of the left-hand side matrix operand
5667  , typename MT5 // Type of the right-hand side matrix operand
5668  , typename ST2 > // Type of the scalar value
5669  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
5670  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5671  {
5673 
5674  for( size_t i=0UL; i<A.rows(); ++i ) {
5675  C(i,i) += A(i,i) * B(i,i) * scalar;
5676  }
5677  }
5678  //**********************************************************************************************
5679 
5680  //**Default addition assignment to dense matrices (small matrices)******************************
/*!\brief Small-matrix addition assignment kernel (non-vectorized fallback).
//
// \param C The target dense matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// This overload is selected via DisableIf on UseVectorizedDefaultKernel, i.e. for type
// combinations for which that trait does not hold. It forwards all arguments to
// selectDefaultAddAssignKernel().
*/
5694  template< typename MT3 // Type of the left-hand side target matrix
5695  , typename MT4 // Type of the left-hand side matrix operand
5696  , typename MT5 // Type of the right-hand side matrix operand
5697  , typename ST2 > // Type of the scalar value
5698  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5699  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5700  {
      // No vectorized kernel for these types: use the default kernel.
5701  selectDefaultAddAssignKernel( C, A, B, scalar );
5702  }
5703  //**********************************************************************************************
5704 
5705  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
/*!\brief Vectorized small-matrix addition assignment kernel for row-major targets.
//
// \param C The row-major target dense matrix; updated in place.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor applied to the product.
//
// Selected via EnableIf on UseVectorizedDefaultKernel. Adds scalar*(A*B) to \a C. The
// columns of \a C are processed in panels that are 8, 4, 2 and finally 1 SIMD vectors wide
// (IT::size elements per vector); if \a remainder is set, a scalar loop handles the columns
// from \a jpos up to N. In every panel the summation range [kbegin,kend) is narrowed
// according to the IsUpper/IsLower/IsStrictly* traits of the operand types.
// NOTE(review): the xmm accumulators are default-constructed and then used directly in
// sums, so this presumably relies on IntrinsicType's default constructor yielding zeros -
// confirm against the IntrinsicType definition.
*/
5720  template< typename MT3 // Type of the left-hand side target matrix
5721  , typename MT4 // Type of the left-hand side matrix operand
5722  , typename MT5 // Type of the right-hand side matrix operand
5723  , typename ST2 > // Type of the scalar value
5724  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5725  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5726  {
5727  typedef IntrinsicTrait<ElementType> IT;
5728 
5729  const size_t M( A.rows() );
5730  const size_t N( B.columns() );
5731  const size_t K( A.columns() );
5732 
      // A scalar tail loop is needed unless both the target type and B's type are padded.
5733  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5734 
      // End of the vectorized column range: N rounded down to a multiple of IT::size when a
      // remainder loop is used (see the assertion below), otherwise N itself.
5735  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
5736  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
5737 
      // The scalar, expanded to a SIMD vector via set().
5738  const IntrinsicType factor( set( scalar ) );
5739 
5740  size_t j( 0UL );
5741 
      // Panels of 8 SIMD vectors, one row of C at a time.
5742  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
5743  for( size_t i=0UL; i<M; ++i )
5744  {
      // Lower bound of k: driven by row i when A is (strictly) upper and by column j when
      // B is lower.
5745  const size_t kbegin( ( IsUpper<MT4>::value )
5746  ?( ( IsLower<MT5>::value )
5747  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5748  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5749  :( IsLower<MT5>::value ? j : 0UL ) );
      // Upper bound of k: driven by row i when A is (strictly) lower and by the end of the
      // current column panel when B is upper; never larger than K.
5750  const size_t kend( ( IsLower<MT4>::value )
5751  ?( ( IsUpper<MT5>::value )
5752  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
5753  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
5754  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
5755 
5756  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5757 
      // Accumulate A(i,k) times eight consecutive vectors of row k of B.
5758  for( size_t k=kbegin; k<kend; ++k ) {
5759  const IntrinsicType a1( set( A(i,k) ) );
5760  xmm1 = xmm1 + a1 * B.load(k,j );
5761  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
5762  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
5763  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
5764  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
5765  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
5766  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
5767  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
5768  }
5769 
      // C += accumulator * scalar, one vector at a time.
5770  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5771  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
5772  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
5773  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
5774  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
5775  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
5776  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
5777  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
5778  }
5779  }
5780 
      // Panels of 4 SIMD vectors, two rows of C at a time.
5781  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
5782  {
5783  size_t i( 0UL );
5784 
5785  for( ; (i+2UL) <= M; i+=2UL )
5786  {
5787  const size_t kbegin( ( IsUpper<MT4>::value )
5788  ?( ( IsLower<MT5>::value )
5789  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5790  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5791  :( IsLower<MT5>::value ? j : 0UL ) );
      // The row-driven bound is one larger than in the single-row case because rows i and
      // i+1 share the same k range.
5792  const size_t kend( ( IsLower<MT4>::value )
5793  ?( ( IsUpper<MT5>::value )
5794  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
5795  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5796  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
5797 
5798  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5799 
      // xmm1-4 accumulate row i, xmm5-8 accumulate row i+1; each B vector is loaded once.
5800  for( size_t k=kbegin; k<kend; ++k ) {
5801  const IntrinsicType a1( set( A(i ,k) ) );
5802  const IntrinsicType a2( set( A(i+1UL,k) ) );
5803  const IntrinsicType b1( B.load(k,j ) );
5804  const IntrinsicType b2( B.load(k,j+IT::size ) );
5805  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
5806  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
5807  xmm1 = xmm1 + a1 * b1;
5808  xmm2 = xmm2 + a1 * b2;
5809  xmm3 = xmm3 + a1 * b3;
5810  xmm4 = xmm4 + a1 * b4;
5811  xmm5 = xmm5 + a2 * b1;
5812  xmm6 = xmm6 + a2 * b2;
5813  xmm7 = xmm7 + a2 * b3;
5814  xmm8 = xmm8 + a2 * b4;
5815  }
5816 
5817  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5818  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
5819  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
5820  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
5821  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
5822  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
5823  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
5824  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
5825  }
5826 
      // Leftover single row when M is odd.
5827  if( i < M )
5828  {
5829  const size_t kbegin( ( IsUpper<MT4>::value )
5830  ?( ( IsLower<MT5>::value )
5831  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5832  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5833  :( IsLower<MT5>::value ? j : 0UL ) );
      // Here the upper bound depends on B only; no row-driven bound is applied.
5834  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
5835 
5836  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5837 
5838  for( size_t k=kbegin; k<kend; ++k ) {
5839  const IntrinsicType a1( set( A(i,k) ) );
5840  xmm1 = xmm1 + a1 * B.load(k,j );
5841  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
5842  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
5843  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
5844  }
5845 
5846  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5847  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
5848  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
5849  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
5850  }
5851  }
5852 
      // Panels of 2 SIMD vectors, two rows of C at a time.
5853  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
5854  {
5855  size_t i( 0UL );
5856 
5857  for( ; (i+2UL) <= M; i+=2UL )
5858  {
5859  const size_t kbegin( ( IsUpper<MT4>::value )
5860  ?( ( IsLower<MT5>::value )
5861  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5862  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5863  :( IsLower<MT5>::value ? j : 0UL ) );
5864  const size_t kend( ( IsLower<MT4>::value )
5865  ?( ( IsUpper<MT5>::value )
5866  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
5867  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5868  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
5869 
5870  IntrinsicType xmm1, xmm2, xmm3, xmm4;
5871 
      // xmm1-2 accumulate row i, xmm3-4 accumulate row i+1.
5872  for( size_t k=kbegin; k<kend; ++k ) {
5873  const IntrinsicType a1( set( A(i ,k) ) );
5874  const IntrinsicType a2( set( A(i+1UL,k) ) );
5875  const IntrinsicType b1( B.load(k,j ) );
5876  const IntrinsicType b2( B.load(k,j+IT::size) );
5877  xmm1 = xmm1 + a1 * b1;
5878  xmm2 = xmm2 + a1 * b2;
5879  xmm3 = xmm3 + a2 * b1;
5880  xmm4 = xmm4 + a2 * b2;
5881  }
5882 
5883  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5884  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
5885  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5886  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
5887  }
5888 
      // Leftover single row when M is odd.
5889  if( i < M )
5890  {
5891  const size_t kbegin( ( IsUpper<MT4>::value )
5892  ?( ( IsLower<MT5>::value )
5893  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5894  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5895  :( IsLower<MT5>::value ? j : 0UL ) );
5896  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
5897 
5898  IntrinsicType xmm1, xmm2;
5899 
5900  for( size_t k=kbegin; k<kend; ++k ) {
5901  const IntrinsicType a1( set( A(i,k) ) );
5902  xmm1 = xmm1 + a1 * B.load(k,j );
5903  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
5904  }
5905 
5906  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5907  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
5908  }
5909  }
5910 
      // Remaining single SIMD vector panels, two rows of C at a time.
5911  for( ; j<jpos; j+=IT::size )
5912  {
5913  size_t i( 0UL );
5914 
5915  for( ; (i+2UL) <= M; i+=2UL )
5916  {
5917  const size_t kbegin( ( IsUpper<MT4>::value )
5918  ?( ( IsLower<MT5>::value )
5919  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5920  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5921  :( IsLower<MT5>::value ? j : 0UL ) );
      // Only the row-driven bound is applied here; B's traits do not limit kend.
5922  const size_t kend( ( IsLower<MT4>::value )
5923  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5924  :( K ) );
5925 
5926  IntrinsicType xmm1, xmm2;
5927 
5928  for( size_t k=kbegin; k<kend; ++k ) {
5929  const IntrinsicType b1( B.load(k,j) );
5930  xmm1 = xmm1 + set( A(i ,k) ) * b1;
5931  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
5932  }
5933 
5934  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5935  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
5936  }
5937 
      // Leftover single row when M is odd; k runs up to K without an upper restriction.
5938  if( i < M )
5939  {
5940  const size_t kbegin( ( IsUpper<MT4>::value )
5941  ?( ( IsLower<MT5>::value )
5942  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5943  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5944  :( IsLower<MT5>::value ? j : 0UL ) );
5945 
5946  IntrinsicType xmm1;
5947 
5948  for( size_t k=kbegin; k<K; ++k ) {
5949  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
5950  }
5951 
5952  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
5953  }
5954  }
5955 
      // Scalar tail: columns from jpos up to N, handled element by element (only entered
      // when remainder is set), two rows of C at a time.
5956  for( ; remainder && j<N; ++j )
5957  {
5958  size_t i( 0UL );
5959 
5960  for( ; (i+2UL) <= M; i+=2UL )
5961  {
5962  const size_t kbegin( ( IsUpper<MT4>::value )
5963  ?( ( IsLower<MT5>::value )
5964  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5965  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5966  :( IsLower<MT5>::value ? j : 0UL ) );
5967  const size_t kend( ( IsLower<MT4>::value )
5968  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5969  :( K ) );
5970 
      // Scalar accumulators are explicitly value-initialized.
5971  ElementType value1 = ElementType();
5972  ElementType value2 = ElementType();
5973 
5974  for( size_t k=kbegin; k<kend; ++k ) {
5975  value1 += A(i ,k) * B(k,j);
5976  value2 += A(i+1UL,k) * B(k,j);
5977  }
5978 
5979  (~C)(i ,j) += value1 * scalar;
5980  (~C)(i+1UL,j) += value2 * scalar;
5981  }
5982 
      // Leftover single row when M is odd.
5983  if( i < M )
5984  {
5985  const size_t kbegin( ( IsUpper<MT4>::value )
5986  ?( ( IsLower<MT5>::value )
5987  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5988  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5989  :( IsLower<MT5>::value ? j : 0UL ) );
5990 
5991  ElementType value = ElementType();
5992 
5993  for( size_t k=kbegin; k<K; ++k ) {
5994  value += A(i,k) * B(k,j);
5995  }
5996 
5997  (~C)(i,j) += value * scalar;
5998  }
5999  }
6000  }
6001  //**********************************************************************************************
6002 
6003  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
6018  template< typename MT3 // Type of the left-hand side target matrix
6019  , typename MT4 // Type of the left-hand side matrix operand
6020  , typename MT5 // Type of the right-hand side matrix operand
6021  , typename ST2 > // Type of the scalar value
6022  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6023  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6024  {
6029 
6030  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
6031  const typename MT4::OppositeType tmp( serial( A ) );
6032  addAssign( ~C, tmp * B * scalar );
6033  }
6034  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
6035  const typename MT5::OppositeType tmp( serial( B ) );
6036  addAssign( ~C, A * tmp * scalar );
6037  }
6038  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6039  const typename MT4::OppositeType tmp( serial( A ) );
6040  addAssign( ~C, tmp * B * scalar );
6041  }
6042  else {
6043  const typename MT5::OppositeType tmp( serial( B ) );
6044  addAssign( ~C, A * tmp * scalar );
6045  }
6046  }
6047  //**********************************************************************************************
6048 
6049  //**Default addition assignment to dense matrices (large matrices)******************************
/*!\brief Large-matrix addition assignment kernel (non-vectorized fallback).
//
// \param C The target dense matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor.
//
// This overload is selected via DisableIf on UseVectorizedDefaultKernel, i.e. for type
// combinations for which that trait does not hold. It forwards all arguments to
// selectDefaultAddAssignKernel().
*/
6063  template< typename MT3 // Type of the left-hand side target matrix
6064  , typename MT4 // Type of the left-hand side matrix operand
6065  , typename MT5 // Type of the right-hand side matrix operand
6066  , typename ST2 > // Type of the scalar value
6067  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6068  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6069  {
      // No vectorized kernel for these types: use the default kernel.
6070  selectDefaultAddAssignKernel( C, A, B, scalar );
6071  }
6072  //**********************************************************************************************
6073 
6074  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
/*!\brief Vectorized, cache-blocked addition assignment kernel for row-major target matrices.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The iteration space is tiled by DMATDMATMULT_JBLOCK_SIZE (columns of C), DMATDMATMULT_IBLOCK_SIZE
// (rows of C) and DMATDMATMULT_KBLOCK_SIZE (inner dimension). Inside each tile the columns are
// consumed in panels of four, two and one SIMD vectors (IT::size elements each) using B.load()
// and (~C).load()/(~C).store(); leftover columns are handled element by element when
// \a remainder is set. The partial dot products of every kk-tile are multiplied by the scaling
// factor and added onto the current contents of C.
//
// NOTE(review): the xmm accumulators are merely default-constructed before being summed into;
// this presumably relies on IntrinsicType's default constructor zero-initializing - confirm.
*/
6089  template< typename MT3 // Type of the left-hand side target matrix
6090  , typename MT4 // Type of the left-hand side matrix operand
6091  , typename MT5 // Type of the right-hand side matrix operand
6092  , typename ST2 > // Type of the scalar value
6093  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6094  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6095  {
6096  typedef IntrinsicTrait<ElementType> IT;
6097 
6098  const size_t M( A.rows() );
6099  const size_t N( B.columns() );
6100  const size_t K( A.columns() );
6101 
 // Element-wise clean-up columns are only required if C or B is not padded
6102  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
6103 
 // SIMD representation of the scaling factor (set() presumably broadcasts it to all lanes)
6104  const IntrinsicType factor( set( scalar ) );
6105 
 // Tiling over the columns of C and B
6106  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
6107  {
6108  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
6109 
 // End of the vectorized column range of this tile; with remainder handling it is asserted
 // to equal jend rounded down to a multiple of IT::size
6110  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
6111  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
6112 
 // Tiling over the rows of C and A
6113  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
6114  {
6115  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
6116 
 // Tiling over the inner dimension; each kk-tile adds its partial products onto C
6117  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
6118  {
6119  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
6120 
6121  size_t j( jj );
6122 
 // Column panels of four SIMD vectors
6123  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
6124  {
6125  const size_t j1( j+IT::size );
6126  const size_t j2( j+IT::size*2UL );
6127  const size_t j3( j+IT::size*3UL );
6128 
6129  size_t i( ii );
6130 
 // Two rows of C per iteration
6131  for( ; (i+2UL) <= iend; i+=2UL )
6132  {
 // k-range of the current tile, narrowed according to the IsUpper/IsLower traits of A and B
6133  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6134  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6135  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
6136  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
6137 
6138  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6139 
6140  for( size_t k=kbegin; k<kend; ++k ) {
6141  const IntrinsicType a1( set( A(i ,k) ) );
6142  const IntrinsicType a2( set( A(i+1UL,k) ) );
6143  const IntrinsicType b1( B.load(k,j ) );
6144  const IntrinsicType b2( B.load(k,j1) );
6145  const IntrinsicType b3( B.load(k,j2) );
6146  const IntrinsicType b4( B.load(k,j3) );
6147  xmm1 = xmm1 + a1 * b1;
6148  xmm2 = xmm2 + a1 * b2;
6149  xmm3 = xmm3 + a1 * b3;
6150  xmm4 = xmm4 + a1 * b4;
6151  xmm5 = xmm5 + a2 * b1;
6152  xmm6 = xmm6 + a2 * b2;
6153  xmm7 = xmm7 + a2 * b3;
6154  xmm8 = xmm8 + a2 * b4;
6155  }
6156 
 // Scale the partial sums and add them onto the existing values of C
6157  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6158  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6159  (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
6160  (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
6161  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
6162  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
6163  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
6164  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
6165  }
6166 
 // Single leftover row of the row tile
6167  if( i < iend )
6168  {
6169  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6170  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6171  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6172  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
6173 
6174  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6175 
6176  for( size_t k=kbegin; k<kend; ++k ) {
6177  const IntrinsicType a1( set( A(i,k) ) );
6178  xmm1 = xmm1 + a1 * B.load(k,j );
6179  xmm2 = xmm2 + a1 * B.load(k,j1);
6180  xmm3 = xmm3 + a1 * B.load(k,j2);
6181  xmm4 = xmm4 + a1 * B.load(k,j3);
6182  }
6183 
6184  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6185  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
6186  (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
6187  (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
6188  }
6189  }
6190 
 // Column panels of two SIMD vectors
6191  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
6192  {
6193  const size_t j1( j+IT::size );
6194 
6195  size_t i( ii );
6196 
 // Four rows of C per iteration
6197  for( ; (i+4UL) <= iend; i+=4UL )
6198  {
6199  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6200  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6201  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
6202  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
6203 
6204  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6205 
6206  for( size_t k=kbegin; k<kend; ++k ) {
6207  const IntrinsicType a1( set( A(i ,k) ) );
6208  const IntrinsicType a2( set( A(i+1UL,k) ) );
6209  const IntrinsicType a3( set( A(i+2UL,k) ) );
6210  const IntrinsicType a4( set( A(i+3UL,k) ) );
6211  const IntrinsicType b1( B.load(k,j ) );
6212  const IntrinsicType b2( B.load(k,j1) );
6213  xmm1 = xmm1 + a1 * b1;
6214  xmm2 = xmm2 + a1 * b2;
6215  xmm3 = xmm3 + a2 * b1;
6216  xmm4 = xmm4 + a2 * b2;
6217  xmm5 = xmm5 + a3 * b1;
6218  xmm6 = xmm6 + a3 * b2;
6219  xmm7 = xmm7 + a4 * b1;
6220  xmm8 = xmm8 + a4 * b2;
6221  }
6222 
6223  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6224  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6225  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6226  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
6227  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
6228  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
6229  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
6230  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
6231  }
6232 
 // Two rows of C per iteration
6233  for( ; (i+2UL) <= iend; i+=2UL )
6234  {
6235  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6236  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6237  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
6238  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
6239 
6240  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6241 
6242  for( size_t k=kbegin; k<kend; ++k ) {
6243  const IntrinsicType a1( set( A(i ,k) ) );
6244  const IntrinsicType a2( set( A(i+1UL,k) ) );
6245  const IntrinsicType b1( B.load(k,j ) );
6246  const IntrinsicType b2( B.load(k,j1) );
6247  xmm1 = xmm1 + a1 * b1;
6248  xmm2 = xmm2 + a1 * b2;
6249  xmm3 = xmm3 + a2 * b1;
6250  xmm4 = xmm4 + a2 * b2;
6251  }
6252 
6253  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6254  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6255  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6256  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
6257  }
6258 
 // Single leftover row of the row tile
6259  if( i < iend )
6260  {
6261  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6262  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6263  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6264  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
6265 
6266  IntrinsicType xmm1, xmm2;
6267 
6268  for( size_t k=kbegin; k<kend; ++k ) {
6269  const IntrinsicType a1( set( A(i,k) ) );
6270  xmm1 = xmm1 + a1 * B.load(k,j );
6271  xmm2 = xmm2 + a1 * B.load(k,j1);
6272  }
6273 
6274  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6275  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
6276  }
6277  }
6278 
 // Column panels of a single SIMD vector, one row at a time
6279  for( ; j<jpos; j+=IT::size )
6280  {
6281  for( size_t i=ii; i<iend; ++i )
6282  {
6283  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6284  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6285  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6286  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
6287 
6288  IntrinsicType xmm1;
6289 
6290  for( size_t k=kbegin; k<kend; ++k ) {
6291  const IntrinsicType a1( set( A(i,k) ) );
6292  xmm1 = xmm1 + a1 * B.load(k,j);
6293  }
6294 
6295  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
6296  }
6297  }
6298 
 // Element-wise clean-up of the columns in [jpos,jend); only entered if remainder is true
6299  for( ; remainder && j<jend; ++j )
6300  {
6301  for( size_t i=ii; i<iend; ++i )
6302  {
6303  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6304  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6305  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6306  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
6307 
6308  ElementType value = ElementType();
6309 
6310  for( size_t k=kbegin; k<kend; ++k ) {
6311  value += A(i,k) * B(k,j);
6312  }
6313 
6314  (~C)(i,j) += value * scalar;
6315  }
6316  }
6317  }
6318  }
6319  }
6320  }
6321  //**********************************************************************************************
6322 
6323  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
6337  template< typename MT3 // Type of the left-hand side target matrix
6338  , typename MT4 // Type of the left-hand side matrix operand
6339  , typename MT5 // Type of the right-hand side matrix operand
6340  , typename ST2 > // Type of the scalar value
6341  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6342  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6343  {
6344  selectSmallAddAssignKernel( ~C, A, B, scalar );
6345  }
6346  //**********************************************************************************************
6347 
6348  //**BLAS-based addition assignment to dense matrices (default)**********************************
6362  template< typename MT3 // Type of the left-hand side target matrix
6363  , typename MT4 // Type of the left-hand side matrix operand
6364  , typename MT5 // Type of the right-hand side matrix operand
6365  , typename ST2 > // Type of the scalar value
6366  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6367  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6368  {
6369  selectLargeAddAssignKernel( C, A, B, scalar );
6370  }
6371  //**********************************************************************************************
6372 
6373  //**BLAS-based addition assignment to dense matrices********************************************
6374 #if BLAZE_BLAS_MODE
6375 
6388  template< typename MT3 // Type of the left-hand side target matrix
6389  , typename MT4 // Type of the left-hand side matrix operand
6390  , typename MT5 // Type of the right-hand side matrix operand
6391  , typename ST2 > // Type of the scalar value
6392  static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6393  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6394  {
6395  typedef typename MT3::ElementType ET;
6396 
6397  if( IsTriangular<MT4>::value ) {
6398  typename MT3::ResultType tmp( serial( B ) );
6399  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6400  addAssign( C, tmp );
6401  }
6402  else if( IsTriangular<MT5>::value ) {
6403  typename MT3::ResultType tmp( serial( A ) );
6404  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6405  addAssign( C, tmp );
6406  }
6407  else {
6408  gemm( C, A, B, ET(scalar), ET(1) );
6409  }
6410  }
6411 #endif
6412  //**********************************************************************************************
6413 
6414  //**Restructuring addition assignment to column-major matrices**********************************
6428  template< typename MT > // Type of the target matrix
6429  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6430  addAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
6431  {
6433 
6435 
6436  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6437  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6438 
6439  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6440  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
6441 
6442  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
6443  addAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
6444  else if( IsSymmetric<MT1>::value )
6445  addAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
6446  else
6447  addAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
6448  }
6449  //**********************************************************************************************
6450 
6451  //**Addition assignment to sparse matrices******************************************************
6452  // No special implementation for the addition assignment to sparse matrices.
6453  //**********************************************************************************************
6454 
6455  //**Subtraction assignment to dense matrices****************************************************
6467  template< typename MT // Type of the target dense matrix
6468  , bool SO > // Storage order of the target dense matrix
6469  friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6470  subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6471  {
6473 
6474  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6475  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6476 
6477  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6478  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
6479 
6480  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6481  return;
6482  }
6483 
6484  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6485  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6486 
6487  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6488  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6489  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6490  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6491  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6492  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6493 
6494  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
6495  }
6496  //**********************************************************************************************
6497 
6498  //**Subtraction assignment to dense matrices (kernel selection)*********************************
6509  template< typename MT3 // Type of the left-hand side target matrix
6510  , typename MT4 // Type of the left-hand side matrix operand
6511  , typename MT5 // Type of the right-hand side matrix operand
6512  , typename ST2 > // Type of the scalar value
6513  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6514  {
6515  if( ( IsDiagonal<MT5>::value ) ||
6516  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6517  selectSmallSubAssignKernel( C, A, B, scalar );
6518  else
6519  selectBlasSubAssignKernel( C, A, B, scalar );
6520  }
6521  //**********************************************************************************************
6522 
6523  //**Default subtraction assignment to dense matrices (general/general)**************************
6537  template< typename MT3 // Type of the left-hand side target matrix
6538  , typename MT4 // Type of the left-hand side matrix operand
6539  , typename MT5 // Type of the right-hand side matrix operand
6540  , typename ST2 > // Type of the scalar value
6541  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6542  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6543  {
6544  const ResultType tmp( serial( A * B * scalar ) );
6545  subAssign( C, tmp );
6546  }
6547  //**********************************************************************************************
6548 
6549  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
6563  template< typename MT3 // Type of the left-hand side target matrix
6564  , typename MT4 // Type of the left-hand side matrix operand
6565  , typename MT5 // Type of the right-hand side matrix operand
6566  , typename ST2 > // Type of the scalar value
6567  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6568  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6569  {
6571 
6572  const size_t M( A.rows() );
6573  const size_t N( B.columns() );
6574 
6575  for( size_t i=0UL; i<M; ++i )
6576  {
6577  const size_t jbegin( ( IsUpper<MT4>::value )
6578  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6579  :( 0UL ) );
6580  const size_t jend( ( IsLower<MT4>::value )
6581  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
6582  :( N ) );
6583  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6584 
6585  const size_t jnum( jend - jbegin );
6586  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6587 
6588  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6589  C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6590  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6591  }
6592  if( jpos < jend ) {
6593  C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
6594  }
6595  }
6596  }
6597  //**********************************************************************************************
6598 
6599  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
6613  template< typename MT3 // Type of the left-hand side target matrix
6614  , typename MT4 // Type of the left-hand side matrix operand
6615  , typename MT5 // Type of the right-hand side matrix operand
6616  , typename ST2 > // Type of the scalar value
6617  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6618  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6619  {
6621 
6622  const size_t M( A.rows() );
6623  const size_t N( B.columns() );
6624 
6625  for( size_t i=0UL; i<M; ++i )
6626  {
6627  const size_t jbegin( ( IsUpper<MT5>::value )
6628  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6629  :( 0UL ) );
6630  const size_t jend( ( IsLower<MT5>::value )
6631  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
6632  :( N ) );
6633  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6634 
6635  const size_t jnum( jend - jbegin );
6636  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6637 
6638  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6639  C(i,j ) -= A(i,i) * B(i,j ) * scalar;
6640  C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
6641  }
6642  if( jpos < jend ) {
6643  C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
6644  }
6645  }
6646  }
6647  //**********************************************************************************************
6648 
6649  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
6663  template< typename MT3 // Type of the left-hand side target matrix
6664  , typename MT4 // Type of the left-hand side matrix operand
6665  , typename MT5 // Type of the right-hand side matrix operand
6666  , typename ST2 > // Type of the scalar value
6667  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6668  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6669  {
6671 
6672  for( size_t i=0UL; i<A.rows(); ++i ) {
6673  C(i,i) -= A(i,i) * B(i,i) * scalar;
6674  }
6675  }
6676  //**********************************************************************************************
6677 
6678  //**Default subtraction assignment to dense matrices (small matrices)***************************
6692  template< typename MT3 // Type of the left-hand side target matrix
6693  , typename MT4 // Type of the left-hand side matrix operand
6694  , typename MT5 // Type of the right-hand side matrix operand
6695  , typename ST2 > // Type of the scalar value
6696  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6697  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6698  {
6699  selectDefaultSubAssignKernel( C, A, B, scalar );
6700  }
6701  //**********************************************************************************************
6702 
6703  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
/*!\brief Vectorized subtraction assignment kernel for small row-major target matrices.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The columns of C are consumed in panels of eight, four, two and one SIMD vectors (IT::size
// elements each) using B.load() and (~C).load()/(~C).store(); leftover columns are handled
// element by element when \a remainder is set. For every panel the dot products over the inner
// dimension are accumulated, multiplied by the scaling factor and subtracted from the current
// contents of C. The k-range of each dot product is narrowed according to the (strictly)
// upper/lower traits of A and the upper/lower traits of B.
//
// NOTE(review): the xmm accumulators are merely default-constructed before being summed into;
// this presumably relies on IntrinsicType's default constructor zero-initializing - confirm.
*/
6718  template< typename MT3 // Type of the left-hand side target matrix
6719  , typename MT4 // Type of the left-hand side matrix operand
6720  , typename MT5 // Type of the right-hand side matrix operand
6721  , typename ST2 > // Type of the scalar value
6722  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6723  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6724  {
6725  typedef IntrinsicTrait<ElementType> IT;
6726 
6727  const size_t M( A.rows() );
6728  const size_t N( B.columns() );
6729  const size_t K( A.columns() );
6730 
 // Element-wise clean-up columns are only required if C or B is not padded
6731  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
6732 
 // End of the vectorized column range; with remainder handling it is asserted to equal N
 // rounded down to a multiple of IT::size
6733  const size_t jpos( remainder ? ( N & size_t(-IT::size) ) : N );
6734  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % IT::size ) ) == jpos, "Invalid end calculation" );
6735 
 // SIMD representation of the scaling factor (set() presumably broadcasts it to all lanes)
6736  const IntrinsicType factor( set( scalar ) );
6737 
6738  size_t j( 0UL );
6739 
 // Column panels of eight SIMD vectors, one row at a time
6740  for( ; (j+IT::size*7UL) < jpos; j+=IT::size*8UL ) {
6741  for( size_t i=0UL; i<M; ++i )
6742  {
6743  const size_t kbegin( ( IsUpper<MT4>::value )
6744  ?( ( IsLower<MT5>::value )
6745  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6746  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6747  :( IsLower<MT5>::value ? j : 0UL ) );
6748  const size_t kend( ( IsLower<MT4>::value )
6749  ?( ( IsUpper<MT5>::value )
6750  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
6751  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6752  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
6753 
6754  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6755 
6756  for( size_t k=kbegin; k<kend; ++k ) {
6757  const IntrinsicType a1( set( A(i,k) ) );
6758  xmm1 = xmm1 + a1 * B.load(k,j );
6759  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
6760  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
6761  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
6762  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
6763  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
6764  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
6765  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
6766  }
6767 
 // Scale the sums and subtract them from the existing values of C
6768  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6769  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
6770  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
6771  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
6772  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
6773  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
6774  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
6775  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
6776  }
6777  }
6778 
 // Column panels of four SIMD vectors
6779  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
6780  {
6781  size_t i( 0UL );
6782 
 // Two rows of C per iteration
6783  for( ; (i+2UL) <= M; i+=2UL )
6784  {
6785  const size_t kbegin( ( IsUpper<MT4>::value )
6786  ?( ( IsLower<MT5>::value )
6787  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6788  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6789  :( IsLower<MT5>::value ? j : 0UL ) );
6790  const size_t kend( ( IsLower<MT4>::value )
6791  ?( ( IsUpper<MT5>::value )
6792  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
6793  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6794  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
6795 
6796  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6797 
6798  for( size_t k=kbegin; k<kend; ++k ) {
6799  const IntrinsicType a1( set( A(i ,k) ) );
6800  const IntrinsicType a2( set( A(i+1UL,k) ) );
6801  const IntrinsicType b1( B.load(k,j ) );
6802  const IntrinsicType b2( B.load(k,j+IT::size ) );
6803  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
6804  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
6805  xmm1 = xmm1 + a1 * b1;
6806  xmm2 = xmm2 + a1 * b2;
6807  xmm3 = xmm3 + a1 * b3;
6808  xmm4 = xmm4 + a1 * b4;
6809  xmm5 = xmm5 + a2 * b1;
6810  xmm6 = xmm6 + a2 * b2;
6811  xmm7 = xmm7 + a2 * b3;
6812  xmm8 = xmm8 + a2 * b4;
6813  }
6814 
6815  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6816  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
6817  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
6818  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
6819  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
6820  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
6821  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
6822  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
6823  }
6824 
 // Single leftover row; the upper k-bound here is only narrowed by the traits of B
6825  if( i < M )
6826  {
6827  const size_t kbegin( ( IsUpper<MT4>::value )
6828  ?( ( IsLower<MT5>::value )
6829  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6830  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6831  :( IsLower<MT5>::value ? j : 0UL ) );
6832  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
6833 
6834  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6835 
6836  for( size_t k=kbegin; k<kend; ++k ) {
6837  const IntrinsicType a1( set( A(i,k) ) );
6838  xmm1 = xmm1 + a1 * B.load(k,j );
6839  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
6840  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
6841  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
6842  }
6843 
6844  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6845  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
6846  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
6847  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
6848  }
6849  }
6850 
 // Column panels of two SIMD vectors
6851  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
6852  {
6853  size_t i( 0UL );
6854 
 // Two rows of C per iteration
6855  for( ; (i+2UL) <= M; i+=2UL )
6856  {
6857  const size_t kbegin( ( IsUpper<MT4>::value )
6858  ?( ( IsLower<MT5>::value )
6859  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6860  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6861  :( IsLower<MT5>::value ? j : 0UL ) );
6862  const size_t kend( ( IsLower<MT4>::value )
6863  ?( ( IsUpper<MT5>::value )
6864  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
6865  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6866  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
6867 
6868  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6869 
6870  for( size_t k=kbegin; k<kend; ++k ) {
6871  const IntrinsicType a1( set( A(i ,k) ) );
6872  const IntrinsicType a2( set( A(i+1UL,k) ) );
6873  const IntrinsicType b1( B.load(k,j ) );
6874  const IntrinsicType b2( B.load(k,j+IT::size) );
6875  xmm1 = xmm1 + a1 * b1;
6876  xmm2 = xmm2 + a1 * b2;
6877  xmm3 = xmm3 + a2 * b1;
6878  xmm4 = xmm4 + a2 * b2;
6879  }
6880 
6881  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6882  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
6883  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
6884  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
6885  }
6886 
 // Single leftover row; the upper k-bound here is only narrowed by the traits of B
6887  if( i < M )
6888  {
6889  const size_t kbegin( ( IsUpper<MT4>::value )
6890  ?( ( IsLower<MT5>::value )
6891  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6892  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6893  :( IsLower<MT5>::value ? j : 0UL ) );
6894  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
6895 
6896  IntrinsicType xmm1, xmm2;
6897 
6898  for( size_t k=kbegin; k<kend; ++k ) {
6899  const IntrinsicType a1( set( A(i,k) ) );
6900  xmm1 = xmm1 + a1 * B.load(k,j );
6901  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
6902  }
6903 
6904  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6905  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
6906  }
6907  }
6908 
 // Column panels of a single SIMD vector
6909  for( ; j<jpos; j+=IT::size )
6910  {
6911  size_t i( 0UL );
6912 
 // Two rows of C per iteration; the upper k-bound here is only narrowed by the traits of A
6913  for( ; (i+2UL) <= M; i+=2UL )
6914  {
6915  const size_t kbegin( ( IsUpper<MT4>::value )
6916  ?( ( IsLower<MT5>::value )
6917  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6918  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6919  :( IsLower<MT5>::value ? j : 0UL ) );
6920  const size_t kend( ( IsLower<MT4>::value )
6921  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6922  :( K ) );
6923 
6924  IntrinsicType xmm1, xmm2;
6925 
6926  for( size_t k=kbegin; k<kend; ++k ) {
6927  const IntrinsicType b1( B.load(k,j) );
6928  xmm1 = xmm1 + set( A(i ,k) ) * b1;
6929  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
6930  }
6931 
6932  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6933  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
6934  }
6935 
 // Single leftover row; k runs up to the full inner dimension K
6936  if( i < M )
6937  {
6938  const size_t kbegin( ( IsUpper<MT4>::value )
6939  ?( ( IsLower<MT5>::value )
6940  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6941  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6942  :( IsLower<MT5>::value ? j : 0UL ) );
6943 
6944  IntrinsicType xmm1;
6945 
6946  for( size_t k=kbegin; k<K; ++k ) {
6947  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
6948  }
6949 
6950  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
6951  }
6952  }
6953 
 // Element-wise clean-up of the columns in [jpos,N); only entered if remainder is true
6954  for( ; remainder && j<N; ++j )
6955  {
6956  size_t i( 0UL );
6957 
 // Two rows of C per iteration
6958  for( ; (i+2UL) <= M; i+=2UL )
6959  {
6960  const size_t kbegin( ( IsUpper<MT4>::value )
6961  ?( ( IsLower<MT5>::value )
6962  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6963  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6964  :( IsLower<MT5>::value ? j : 0UL ) );
6965  const size_t kend( ( IsLower<MT4>::value )
6966  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6967  :( K ) );
6968 
6969  ElementType value1 = ElementType();
6970  ElementType value2 = ElementType();
6971 
6972  for( size_t k=kbegin; k<kend; ++k ) {
6973  value1 += A(i ,k) * B(k,j);
6974  value2 += A(i+1UL,k) * B(k,j);
6975  }
6976 
6977  (~C)(i ,j) -= value1 * scalar;
6978  (~C)(i+1UL,j) -= value2 * scalar;
6979  }
6980 
 // Single leftover row; k runs up to the full inner dimension K
6981  if( i < M )
6982  {
6983  const size_t kbegin( ( IsUpper<MT4>::value )
6984  ?( ( IsLower<MT5>::value )
6985  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6986  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6987  :( IsLower<MT5>::value ? j : 0UL ) );
6988 
6989  ElementType value = ElementType();
6990 
6991  for( size_t k=kbegin; k<K; ++k ) {
6992  value += A(i,k) * B(k,j);
6993  }
6994 
6995  (~C)(i,j) -= value * scalar;
6996  }
6997  }
6998  }
6999  //**********************************************************************************************
7000 
7001  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
7015  template< typename MT3 // Type of the left-hand side target matrix
7016  , typename MT4 // Type of the left-hand side matrix operand
7017  , typename MT5 // Type of the right-hand side matrix operand
7018  , typename ST2 > // Type of the scalar value
7019  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7020  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7021  {
7026 
7027  if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
7028  const typename MT4::OppositeType tmp( serial( A ) );
7029  subAssign( ~C, tmp * B * scalar );
7030  }
7031  else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
7032  const typename MT5::OppositeType tmp( serial( B ) );
7033  subAssign( ~C, A * tmp * scalar );
7034  }
7035  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7036  const typename MT4::OppositeType tmp( serial( A ) );
7037  subAssign( ~C, tmp * B * scalar );
7038  }
7039  else {
7040  const typename MT5::OppositeType tmp( serial( B ) );
7041  subAssign( ~C, A * tmp * scalar );
7042  }
7043  }
7044  //**********************************************************************************************
7045 
7046  //**Default subtraction assignment to dense matrices (large matrices)***************************
7060  template< typename MT3 // Type of the left-hand side target matrix
7061  , typename MT4 // Type of the left-hand side matrix operand
7062  , typename MT5 // Type of the right-hand side matrix operand
7063  , typename ST2 > // Type of the scalar value
7064  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7065  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7066  {
7067  selectDefaultSubAssignKernel( C, A, B, scalar );
7068  }
7069  //**********************************************************************************************
7070 
7071  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
//! Vectorized, cache-blocked subtraction assignment of a scaled dense matrix-dense matrix
//! multiplication to a row-major dense matrix for large matrices (C -= A*B*scalar).
//!
//! \param C The target left-hand side dense matrix.
//! \param A The left-hand side multiplication operand.
//! \param B The right-hand side multiplication operand.
//! \param scalar The scaling factor.
//!
//! The iteration space is tiled in blocks of DMATDMATMULT_JBLOCK_SIZE columns,
//! DMATDMATMULT_IBLOCK_SIZE rows and DMATDMATMULT_KBLOCK_SIZE inner indices. Within a tile the
//! columns are processed in strips of 4*IT::size, 2*IT::size and IT::size elements, followed by
//! a scalar loop for any columns left over. Since every k-tile subtracts its partial product
//! from C directly, the partial results of all k-tiles accumulate in the target.
//!
//! NOTE(review): the accumulators xmm1..xmm8 are default-constructed and immediately used on the
//! right-hand side of '+'; this presumably relies on IntrinsicType's default constructor
//! zero-initializing the value -- confirm against the IntrinsicType definition.
7086  template< typename MT3 // Type of the left-hand side target matrix
7087  , typename MT4 // Type of the left-hand side matrix operand
7088  , typename MT5 // Type of the right-hand side matrix operand
7089  , typename ST2 > // Type of the scalar value
7090  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7091  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7092  {
7093  typedef IntrinsicTrait<ElementType> IT;
7094 
7095  const size_t M( A.rows() );
7096  const size_t N( B.columns() );
7097  const size_t K( A.columns() );
7098 
 // A scalar clean-up loop over trailing columns is only needed if C or B is not padded
7099  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
7100 
 // Vector form of the scaling factor (presumably all lanes set to 'scalar')
7101  const IntrinsicType factor( set( scalar ) );
7102 
 // Outermost tiling: blocks of columns of B and C
7103  for( size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
7104  {
7105  const size_t jend( min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
7106 
 // jpos: end of the vectorizable column range; with a remainder it is jend rounded down
 // to a multiple of IT::size (checked by the assertion below)
7107  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
7108  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % IT::size ) ) == jpos, "Invalid end calculation" );
7109 
 // Blocks of rows of A and C
7110  for( size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
7111  {
7112  const size_t iend( min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
7113 
 // Blocks of the inner (summation) dimension
7114  for( size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
7115  {
7116  const size_t ktmp( min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
7117 
7118  size_t j( jj );
7119 
 // Column strips of width 4*IT::size
7120  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL )
7121  {
7122  const size_t j1( j+IT::size );
7123  const size_t j2( j+IT::size*2UL );
7124  const size_t j3( j+IT::size*3UL );
7125 
7126  size_t i( ii );
7127 
 // Two rows at a time (2 x 4 vector accumulators)
7128  for( ; (i+2UL) <= iend; i+=2UL )
7129  {
 // Restrict the k-range for upper/lower operands; both bounds stay within the
 // current k-tile [kk,ktmp). If kbegin >= kend the loop below does not execute.
7130  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7131  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7132  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7133  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
7134 
7135  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7136 
7137  for( size_t k=kbegin; k<kend; ++k ) {
7138  const IntrinsicType a1( set( A(i ,k) ) );
7139  const IntrinsicType a2( set( A(i+1UL,k) ) );
7140  const IntrinsicType b1( B.load(k,j ) );
7141  const IntrinsicType b2( B.load(k,j1) );
7142  const IntrinsicType b3( B.load(k,j2) );
7143  const IntrinsicType b4( B.load(k,j3) );
7144  xmm1 = xmm1 + a1 * b1;
7145  xmm2 = xmm2 + a1 * b2;
7146  xmm3 = xmm3 + a1 * b3;
7147  xmm4 = xmm4 + a1 * b4;
7148  xmm5 = xmm5 + a2 * b1;
7149  xmm6 = xmm6 + a2 * b2;
7150  xmm7 = xmm7 + a2 * b3;
7151  xmm8 = xmm8 + a2 * b4;
7152  }
7153 
 // Subtract the scaled partial products from the target (xmm1-4: row i, xmm5-8: row i+1)
7154  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7155  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7156  (~C).store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
7157  (~C).store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
7158  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
7159  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
7160  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
7161  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
7162  }
7163 
 // Single leftover row of the current row block
7164  if( i < iend )
7165  {
7166  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7167  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7168  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7169  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
7170 
7171  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7172 
7173  for( size_t k=kbegin; k<kend; ++k ) {
7174  const IntrinsicType a1( set( A(i,k) ) );
7175  xmm1 = xmm1 + a1 * B.load(k,j );
7176  xmm2 = xmm2 + a1 * B.load(k,j1);
7177  xmm3 = xmm3 + a1 * B.load(k,j2);
7178  xmm4 = xmm4 + a1 * B.load(k,j3);
7179  }
7180 
7181  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7182  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
7183  (~C).store( i, j2, (~C).load(i,j2) - xmm3 * factor );
7184  (~C).store( i, j3, (~C).load(i,j3) - xmm4 * factor );
7185  }
7186  }
7187 
 // Column strips of width 2*IT::size
7188  for( ; (j+IT::size) < jpos; j+=IT::size*2UL )
7189  {
7190  const size_t j1( j+IT::size );
7191 
7192  size_t i( ii );
7193 
 // Four rows at a time (4 x 2 vector accumulators)
7194  for( ; (i+4UL) <= iend; i+=4UL )
7195  {
7196  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7197  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7198  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7199  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
7200 
7201  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7202 
7203  for( size_t k=kbegin; k<kend; ++k ) {
7204  const IntrinsicType a1( set( A(i ,k) ) );
7205  const IntrinsicType a2( set( A(i+1UL,k) ) );
7206  const IntrinsicType a3( set( A(i+2UL,k) ) );
7207  const IntrinsicType a4( set( A(i+3UL,k) ) );
7208  const IntrinsicType b1( B.load(k,j ) );
7209  const IntrinsicType b2( B.load(k,j1) );
7210  xmm1 = xmm1 + a1 * b1;
7211  xmm2 = xmm2 + a1 * b2;
7212  xmm3 = xmm3 + a2 * b1;
7213  xmm4 = xmm4 + a2 * b2;
7214  xmm5 = xmm5 + a3 * b1;
7215  xmm6 = xmm6 + a3 * b2;
7216  xmm7 = xmm7 + a4 * b1;
7217  xmm8 = xmm8 + a4 * b2;
7218  }
7219 
 // Accumulator pairs map to rows i, i+1, i+2, i+3 (columns j and j1 each)
7220  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7221  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7222  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7223  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
7224  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
7225  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
7226  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
7227  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
7228  }
7229 
 // Two rows at a time
7230  for( ; (i+2UL) <= iend; i+=2UL )
7231  {
7232  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7233  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7234  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7235  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
7236 
7237  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7238 
7239  for( size_t k=kbegin; k<kend; ++k ) {
7240  const IntrinsicType a1( set( A(i ,k) ) );
7241  const IntrinsicType a2( set( A(i+1UL,k) ) );
7242  const IntrinsicType b1( B.load(k,j ) );
7243  const IntrinsicType b2( B.load(k,j1) );
7244  xmm1 = xmm1 + a1 * b1;
7245  xmm2 = xmm2 + a1 * b2;
7246  xmm3 = xmm3 + a2 * b1;
7247  xmm4 = xmm4 + a2 * b2;
7248  }
7249 
7250  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7251  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7252  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7253  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
7254  }
7255 
 // Single leftover row
7256  if( i < iend )
7257  {
7258  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7259  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7260  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7261  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
7262 
7263  IntrinsicType xmm1, xmm2;
7264 
7265  for( size_t k=kbegin; k<kend; ++k ) {
7266  const IntrinsicType a1( set( A(i,k) ) );
7267  xmm1 = xmm1 + a1 * B.load(k,j );
7268  xmm2 = xmm2 + a1 * B.load(k,j1);
7269  }
7270 
7271  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7272  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
7273  }
7274  }
7275 
 // Column strips of width IT::size, one row at a time
7276  for( ; j<jpos; j+=IT::size )
7277  {
7278  for( size_t i=ii; i<iend; ++i )
7279  {
7280  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7281  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7282  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7283  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
7284 
7285  IntrinsicType xmm1;
7286 
7287  for( size_t k=kbegin; k<kend; ++k ) {
7288  const IntrinsicType a1( set( A(i,k) ) );
7289  xmm1 = xmm1 + a1 * B.load(k,j);
7290  }
7291 
7292  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
7293  }
7294  }
7295 
 // Scalar clean-up for the columns in [jpos,jend); only entered if 'remainder' is set
7296  for( ; remainder && j<jend; ++j )
7297  {
7298  for( size_t i=ii; i<iend; ++i )
7299  {
7300  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7301  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7302  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7303  ( IsUpper<MT5>::value )?( min( j+1UL, ktmp ) ):( ktmp ) ) );
7304 
7305  ElementType value = ElementType();
7306 
7307  for( size_t k=kbegin; k<kend; ++k ) {
7308  value += A(i,k) * B(k,j);
7309  }
7310 
7311  (~C)(i,j) -= value * scalar;
7312  }
7313  }
7314  }
7315  }
7316  }
7317  }
7318  //**********************************************************************************************
7319 
7320  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
7334  template< typename MT3 // Type of the left-hand side target matrix
7335  , typename MT4 // Type of the left-hand side matrix operand
7336  , typename MT5 // Type of the right-hand side matrix operand
7337  , typename ST2 > // Type of the scalar value
7338  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7339  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7340  {
7341  selectSmallSubAssignKernel( ~C, A, B, scalar );
7342  }
7343  //**********************************************************************************************
7344 
7345  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
7359  template< typename MT3 // Type of the left-hand side target matrix
7360  , typename MT4 // Type of the left-hand side matrix operand
7361  , typename MT5 // Type of the right-hand side matrix operand
7362  , typename ST2 > // Type of the scalar value
7363  static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7364  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7365  {
7366  selectLargeSubAssignKernel( C, A, B, scalar );
7367  }
7368  //**********************************************************************************************
7369 
7370  //**BLAS-based subtraction assignment to dense matrices*****************************************
#if BLAZE_BLAS_MODE

//! BLAS-based subtraction assignment of a scaled dense matrix-dense matrix multiplication
//! (C -= A*B*scalar).
//!
//! \param C The target left-hand side dense matrix.
//! \param A The left-hand side multiplication operand.
//! \param B The right-hand side multiplication operand.
//! \param scalar The scaling factor.
//!
//! If one of the operands is triangular, the other operand is copied into a temporary of the
//! target's ResultType, multiplied in place via trmm() and subtracted from C. Otherwise gemm()
//! is called with the negated scalar and a factor of 1 for C.
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
   selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   typedef typename MT3::ElementType  TargetElement;
   typedef typename MT3::ResultType   Scratch;

   // Triangular left-hand side operand: product = scalar * A * B via trmm from the left
   if( IsTriangular<MT4>::value )
   {
      Scratch product( serial( B ) );
      trmm( product, A, CblasLeft
          , ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper )
          , TargetElement(scalar) );
      subAssign( C, product );
      return;
   }

   // Triangular right-hand side operand: product = scalar * A * B via trmm from the right
   if( IsTriangular<MT5>::value )
   {
      Scratch product( serial( A ) );
      trmm( product, B, CblasRight
          , ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper )
          , TargetElement(scalar) );
      subAssign( C, product );
      return;
   }

   // General case: the subtraction is expressed through the negated scaling factor
   gemm( C, A, B, TargetElement(-scalar), TargetElement(1) );
}
#endif
7409  //**********************************************************************************************
7410 
7411  //**Restructuring subtraction assignment to column-major matrices*******************************
7425  template< typename MT > // Type of the target matrix
7426  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7427  subAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7428  {
7430 
7432 
7433  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7434  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7435 
7436  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7437  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7438 
7439  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7440  subAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7441  else if( IsSymmetric<MT1>::value )
7442  subAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7443  else
7444  subAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7445  }
7446  //**********************************************************************************************
7447 
7448  //**Subtraction assignment to sparse matrices***************************************************
7449  // No special implementation for the subtraction assignment to sparse matrices.
7450  //**********************************************************************************************
7451 
7452  //**Multiplication assignment to dense matrices*************************************************
7453  // No special implementation for the multiplication assignment to dense matrices.
7454  //**********************************************************************************************
7455 
7456  //**Multiplication assignment to sparse matrices************************************************
7457  // No special implementation for the multiplication assignment to sparse matrices.
7458  //**********************************************************************************************
7459 
7460  //**SMP assignment to dense matrices************************************************************
7475  template< typename MT // Type of the target dense matrix
7476  , bool SO > // Storage order of the target dense matrix
7477  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7478  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7479  {
7481 
7482  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7483  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7484 
7485  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7486  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7487 
7488  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
7489  return;
7490  }
7491  else if( left.columns() == 0UL ) {
7492  reset( ~lhs );
7493  return;
7494  }
7495 
7496  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7497  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7498 
7499  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7500  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7501  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7502  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7503  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7504  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7505 
7506  smpAssign( ~lhs, A * B * rhs.scalar_ );
7507  }
7508  //**********************************************************************************************
7509 
7510  //**SMP assignment to sparse matrices***********************************************************
7525  template< typename MT // Type of the target sparse matrix
7526  , bool SO > // Storage order of the target sparse matrix
7527  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7528  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7529  {
7531 
7532  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
7533 
7540 
7541  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7542  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7543 
7544  const TmpType tmp( rhs );
7545  smpAssign( ~lhs, tmp );
7546  }
7547  //**********************************************************************************************
7548 
7549  //**Restructuring SMP assignment to column-major matrices***************************************
7563  template< typename MT > // Type of the target matrix
7564  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7565  smpAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7566  {
7568 
7570 
7571  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7572  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7573 
7574  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7575  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7576 
7577  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7578  smpAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7579  else if( IsSymmetric<MT1>::value )
7580  smpAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7581  else
7582  smpAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7583  }
7584  //**********************************************************************************************
7585 
7586  //**SMP addition assignment to dense matrices***************************************************
7601  template< typename MT // Type of the target dense matrix
7602  , bool SO > // Storage order of the target dense matrix
7603  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7604  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7605  {
7607 
7608  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7609  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7610 
7611  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7612  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7613 
7614  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7615  return;
7616  }
7617 
7618  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7619  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7620 
7621  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7622  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7623  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7624  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7625  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7626  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7627 
7628  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
7629  }
7630  //**********************************************************************************************
7631 
7632  //**Restructuring SMP addition assignment to column-major matrices******************************
7646  template< typename MT > // Type of the target matrix
7647  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7648  smpAddAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7649  {
7651 
7653 
7654  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7655  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7656 
7657  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7658  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7659 
7660  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7661  smpAddAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7662  else if( IsSymmetric<MT1>::value )
7663  smpAddAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7664  else
7665  smpAddAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7666  }
7667  //**********************************************************************************************
7668 
7669  //**SMP addition assignment to sparse matrices**************************************************
7670  // No special implementation for the SMP addition assignment to sparse matrices.
7671  //**********************************************************************************************
7672 
7673  //**SMP subtraction assignment to dense matrices************************************************
7688  template< typename MT // Type of the target dense matrix
7689  , bool SO > // Storage order of the target dense matrix
7690  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7691  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7692  {
7694 
7695  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7696  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7697 
7698  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7699  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7700 
7701  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7702  return;
7703  }
7704 
7705  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7706  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7707 
7708  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7709  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7710  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7711  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7712  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7713  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7714 
7715  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
7716  }
7717  //**********************************************************************************************
7718 
7719  //**Restructuring SMP subtraction assignment to column-major matrices***************************
7733  template< typename MT > // Type of the target matrix
7734  friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7735  smpSubAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7736  {
7738 
7740 
7741  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7742  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7743 
7744  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7745  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7746 
7747  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7748  smpSubAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7749  else if( IsSymmetric<MT1>::value )
7750  smpSubAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7751  else
7752  smpSubAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7753  }
7754  //**********************************************************************************************
7755 
7756  //**SMP subtraction assignment to sparse matrices***********************************************
7757  // No special implementation for the SMP subtraction assignment to sparse matrices.
7758  //**********************************************************************************************
7759 
7760  //**SMP multiplication assignment to dense matrices*********************************************
7761  // No special implementation for the SMP multiplication assignment to dense matrices.
7762  //**********************************************************************************************
7763 
7764  //**SMP multiplication assignment to sparse matrices********************************************
7765  // No special implementation for the SMP multiplication assignment to sparse matrices.
7766  //**********************************************************************************************
7767 
7768  //**Compile time checks*************************************************************************
7776  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
7777  //**********************************************************************************************
7778 };
7780 //*************************************************************************************************
7781 
7782 
7783 
7784 
7785 //=================================================================================================
7786 //
7787 // GLOBAL BINARY ARITHMETIC OPERATORS
7788 //
7789 //=================================================================================================
7790 
7791 //*************************************************************************************************
7817 template< typename T1 // Type of the left-hand side dense matrix
7818  , typename T2 > // Type of the right-hand side dense matrix
7819 inline const DMatDMatMultExpr<T1,T2>
7821 {
7823 
7824  if( (~lhs).columns() != (~rhs).rows() ) {
7825  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
7826  }
7827 
7828  return DMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
7829 }
7830 //*************************************************************************************************
7831 
7832 
7833 
7834 
7835 //=================================================================================================
7836 //
7837 // ROWS SPECIALIZATIONS
7838 //
7839 //=================================================================================================
7840 
7841 //*************************************************************************************************
7843 template< typename MT1, typename MT2 >
7844 struct Rows< DMatDMatMultExpr<MT1,MT2> > : public Rows<MT1>
7845 {};
7847 //*************************************************************************************************
7848 
7849 
7850 
7851 
7852 //=================================================================================================
7853 //
7854 // COLUMNS SPECIALIZATIONS
7855 //
7856 //=================================================================================================
7857 
7858 //*************************************************************************************************
7860 template< typename MT1, typename MT2 >
7861 struct Columns< DMatDMatMultExpr<MT1,MT2> > : public Columns<MT2>
7862 {};
7864 //*************************************************************************************************
7865 
7866 
7867 
7868 
7869 //=================================================================================================
7870 //
7871 // ISALIGNED SPECIALIZATIONS
7872 //
7873 //=================================================================================================
7874 
7875 //*************************************************************************************************
7877 template< typename MT1, typename MT2 >
7878 struct IsAligned< DMatDMatMultExpr<MT1,MT2> >
7879  : public IsTrue< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7880 {};
7882 //*************************************************************************************************
7883 
7884 
7885 
7886 
7887 //=================================================================================================
7888 //
7889 // ISLOWER SPECIALIZATIONS
7890 //
7891 //=================================================================================================
7892 
7893 //*************************************************************************************************
7895 template< typename MT1, typename MT2 >
7896 struct IsLower< DMatDMatMultExpr<MT1,MT2> >
7897  : public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
7898 {};
7900 //*************************************************************************************************
7901 
7902 
7903 
7904 
7905 //=================================================================================================
7906 //
7907 // ISUNILOWER SPECIALIZATIONS
7908 //
7909 //=================================================================================================
7910 
7911 //*************************************************************************************************
7913 template< typename MT1, typename MT2 >
7914 struct IsUniLower< DMatDMatMultExpr<MT1,MT2> >
7915  : public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
7916 {};
7918 //*************************************************************************************************
7919 
7920 
7921 
7922 
7923 //=================================================================================================
7924 //
7925 // ISSTRICTLYLOWER SPECIALIZATIONS
7926 //
7927 //=================================================================================================
7928 
7929 //*************************************************************************************************
7931 template< typename MT1, typename MT2 >
7932 struct IsStrictlyLower< DMatDMatMultExpr<MT1,MT2> >
7933  : public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7934  , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
7935 {};
7937 //*************************************************************************************************
7938 
7939 
7940 
7941 
7942 //=================================================================================================
7943 //
7944 // ISUPPER SPECIALIZATIONS
7945 //
7946 //=================================================================================================
7947 
7948 //*************************************************************************************************
7950 template< typename MT1, typename MT2 >
7951 struct IsUpper< DMatDMatMultExpr<MT1,MT2> >
7952  : public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
7953 {};
7955 //*************************************************************************************************
7956 
7957 
7958 
7959 
7960 //=================================================================================================
7961 //
7962 // ISUNIUPPER SPECIALIZATIONS
7963 //
7964 //=================================================================================================
7965 
7966 //*************************************************************************************************
7968 template< typename MT1, typename MT2 >
7969 struct IsUniUpper< DMatDMatMultExpr<MT1,MT2> >
7970  : public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
7971 {};
7973 //*************************************************************************************************
7974 
7975 
7976 
7977 
7978 //=================================================================================================
7979 //
7980 // ISSTRICTLYUPPER SPECIALIZATIONS
7981 //
7982 //=================================================================================================
7983 
7984 //*************************************************************************************************
7986 template< typename MT1, typename MT2 >
7987 struct IsStrictlyUpper< DMatDMatMultExpr<MT1,MT2> >
7988  : public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7989  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
7990 {};
7992 //*************************************************************************************************
7993 
7994 
7995 
7996 
7997 //=================================================================================================
7998 //
7999 // EXPRESSION TRAIT SPECIALIZATIONS
8000 //
8001 //=================================================================================================
8002 
8003 //*************************************************************************************************
8005 template< typename MT1, typename MT2, typename VT >
8006 struct DMatDVecMultExprTrait< DMatDMatMultExpr<MT1,MT2>, VT >
8007 {
8008  public:
8009  //**********************************************************************************************
8010  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8011  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
8012  IsDenseVector<VT>::value && IsColumnVector<VT>::value
8013  , typename DMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
8014  , INVALID_TYPE >::Type Type;
8015  //**********************************************************************************************
8016 };
8018 //*************************************************************************************************
8019 
8020 
8021 //*************************************************************************************************
8023 template< typename MT1, typename MT2, typename VT >
8024 struct DMatSVecMultExprTrait< DMatDMatMultExpr<MT1,MT2>, VT >
8025 {
8026  public:
8027  //**********************************************************************************************
8028  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8029  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
8030  IsSparseVector<VT>::value && IsColumnVector<VT>::value
8031  , typename DMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
8032  , INVALID_TYPE >::Type Type;
8033  //**********************************************************************************************
8034 };
8036 //*************************************************************************************************
8037 
8038 
8039 //*************************************************************************************************
8041 template< typename VT, typename MT1, typename MT2 >
8042 struct TDVecDMatMultExprTrait< VT, DMatDMatMultExpr<MT1,MT2> >
8043 {
8044  public:
8045  //**********************************************************************************************
8046  typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
8047  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8048  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
8049  , typename TDVecDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8050  , INVALID_TYPE >::Type Type;
8051  //**********************************************************************************************
8052 };
8054 //*************************************************************************************************
8055 
8056 
8057 //*************************************************************************************************
8059 template< typename VT, typename MT1, typename MT2 >
8060 struct TSVecDMatMultExprTrait< VT, DMatDMatMultExpr<MT1,MT2> >
8061 {
8062  public:
8063  //**********************************************************************************************
8064  typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
8065  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8066  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
8067  , typename TDVecDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8068  , INVALID_TYPE >::Type Type;
8069  //**********************************************************************************************
8070 };
8072 //*************************************************************************************************
8073 
8074 
8075 //*************************************************************************************************
8077 template< typename MT1, typename MT2, bool AF >
8078 struct SubmatrixExprTrait< DMatDMatMultExpr<MT1,MT2>, AF >
8079 {
8080  public:
8081  //**********************************************************************************************
8082  typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
8083  , typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
8084  //**********************************************************************************************
8085 };
8087 //*************************************************************************************************
8088 
8089 
8090 //*************************************************************************************************
8092 template< typename MT1, typename MT2 >
8093 struct RowExprTrait< DMatDMatMultExpr<MT1,MT2> >
8094 {
8095  public:
8096  //**********************************************************************************************
8097  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
8098  //**********************************************************************************************
8099 };
8101 //*************************************************************************************************
8102 
8103 
8104 //*************************************************************************************************
8106 template< typename MT1, typename MT2 >
8107 struct ColumnExprTrait< DMatDMatMultExpr<MT1,MT2> >
8108 {
8109  public:
8110  //**********************************************************************************************
8111  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
8112  //**********************************************************************************************
8113 };
8115 //*************************************************************************************************
8116 
8117 } // namespace blaze
8118 
8119 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1729
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:155
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, simd_int16_t >::Type set(T value)
Sets all values in the vector to the given 2-byte integral value.
Definition: Set.h:73
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A=B*C$).
Definition: DMatDMatMultExpr.h:7820
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
Header file for basic type definitions.
Header file for the SparseVector base class.
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:152
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:254
size_t rows() const
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:369
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
DMatDMatMultExpr< MT1, MT2 > This
Type of this DMatDMatMultExpr instance.
Definition: DMatDMatMultExpr.h:241
Header file for the IsSame and IsStrictlySame type traits.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:245
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:257
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:251
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Expression object for dense matrix-dense matrix multiplications.The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:144
Header file for the IsUniLower type trait.
CompressedMatrix< Type, false > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:2584
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
Header file for the IsComplexDouble type trait.
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:153
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:389
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
Header file for the Or class template.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatDMatMultExpr.h:246
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:433
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1682
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:282
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:443
Header file for the IsDenseMatrix type trait.
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:243
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:411
Header file for the serial shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:353
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:399
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:297
System settings for the BLAS mode.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:452
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:116
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:242
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:138
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:248
Constraint on the data type.
Constraints on the storage order of matrix types.
size_t columns() const
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:379
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:244
Header file for the HasMutableDataAccess type trait.
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:260
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
Substitution Failure Is Not An Error (SFINAE) class.The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:122
Header file for the IsDenseVector type trait.
Header file for all intrinsic functionality.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:257
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:247
Header file for the IsRowMajorMatrix type trait.
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:944
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for BLAS general matrix/matrix multiplication functions (gemm)
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:150
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
Header file for the TSVecDMatMultExprTrait class template.
Header file for the complex data type.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:154
Header file for the IsUpper type trait.
Header file for exception macros.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:151
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:453
Header file for the IsResizable type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:423
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.