TDMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/Exception.h>
58 #include <blaze/math/Functions.h>
59 #include <blaze/math/shims/Reset.h>
61 #include <blaze/math/SIMD.h>
101 #include <blaze/system/BLAS.h>
102 #include <blaze/system/Blocking.h>
104 #include <blaze/system/Thresholds.h>
105 #include <blaze/util/Assert.h>
106 #include <blaze/util/Complex.h>
110 #include <blaze/util/DisableIf.h>
111 #include <blaze/util/EnableIf.h>
113 #include <blaze/util/InvalidType.h>
115 #include <blaze/util/mpl/And.h>
116 #include <blaze/util/mpl/If.h>
117 #include <blaze/util/mpl/Not.h>
118 #include <blaze/util/mpl/Or.h>
119 #include <blaze/util/Types.h>
128 
129 
130 namespace blaze {
131 
132 //=================================================================================================
133 //
134 // CLASS TDMATTDMATMULTEXPR
135 //
136 //=================================================================================================
137 
138 //*************************************************************************************************
145 template< typename MT1 // Type of the left-hand side dense matrix
146  , typename MT2 > // Type of the right-hand side dense matrix
147 class TDMatTDMatMultExpr : public DenseMatrix< TDMatTDMatMultExpr<MT1,MT2>, true >
148  , private MatMatMultExpr
149  , private Computation
150 {
151  private:
152  //**Type definitions****************************************************************************
159  //**********************************************************************************************
160 
161  //**********************************************************************************************
// Compilation switch for the left-hand side operand: true when MT1 is flagged by
// IsComputation or by RequiresEvaluation. It is read by IsEvaluationRequired and smpAssignable.
163  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
164  //**********************************************************************************************
165 
166  //**********************************************************************************************
// Compilation switch for the right-hand side operand: true when MT2 is flagged by
// IsComputation or by RequiresEvaluation. It is read by IsEvaluationRequired and smpAssignable.
168  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
169  //**********************************************************************************************
170 
171  //**********************************************************************************************
173 
// Helper trait for SFINAE-based overload selection.
// value is true when T1 is a row-major matrix and at least one of T2 / T3 is symmetric.
// NOTE(review): T1 is presumably the assignment target and T2/T3 the two operands - confirm
// against the call sites, which are not all visible in this listing.
179  template< typename T1, typename T2, typename T3 >
180  struct CanExploitSymmetry {
181  enum : bool { value = IsRowMajorMatrix<T1>::value &&
182  ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
183  };
185  //**********************************************************************************************
186 
187  //**********************************************************************************************
189 
193  template< typename T1, typename T2, typename T3 >
194  struct IsEvaluationRequired {
195  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
196  CanExploitSymmetry<T1,T2,T3>::value };
197  };
199  //**********************************************************************************************
200 
201  //**********************************************************************************************
203 
206  template< typename T1, typename T2, typename T3 >
207  struct UseBlasKernel {
209  HasMutableDataAccess<T1>::value &&
210  HasConstDataAccess<T2>::value &&
211  HasConstDataAccess<T3>::value &&
212  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
213  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
214  IsBLASCompatible< ElementType_<T1> >::value &&
215  IsBLASCompatible< ElementType_<T2> >::value &&
216  IsBLASCompatible< ElementType_<T3> >::value &&
217  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
218  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
219  };
221  //**********************************************************************************************
222 
223  //**********************************************************************************************
225 
// Helper trait for SFINAE-based kernel selection.
// value is true when optimized kernels are enabled (useOptimizedKernels), T2 is not diagonal,
// all three types are SIMD enabled, their element types are SIMD combinable, and SIMD addition
// and multiplication exist for the element types of T2 and T3.
228  template< typename T1, typename T2, typename T3 >
229  struct UseVectorizedDefaultKernel {
230  enum : bool { value = useOptimizedKernels &&
231  !IsDiagonal<T2>::value &&
232  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
233  AreSIMDCombinable< ElementType_<T1>
234  , ElementType_<T2>
235  , ElementType_<T3> >::value &&
236  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
237  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
238  };
240  //**********************************************************************************************
241 
242  public:
243  //**Type definitions****************************************************************************
250  typedef const ElementType ReturnType;
251  typedef const ResultType CompositeType;
252 
254  typedef If_< IsExpression<MT1>, const MT1, const MT1& > LeftOperand;
255 
257  typedef If_< IsExpression<MT2>, const MT2, const MT2& > RightOperand;
258 
261 
264  //**********************************************************************************************
265 
266  //**Compilation flags***************************************************************************
268  enum : bool { simdEnabled = !IsDiagonal<MT1>::value &&
269  MT1::simdEnabled && MT2::simdEnabled &&
272 
// Compilation switch for SMP assignment: true only when neither operand needs an
// intermediate evaluation and both operand types are themselves SMP assignable.
274  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
275  !evaluateRight && MT2::smpAssignable };
276  //**********************************************************************************************
277 
278  //**SIMD properties*****************************************************************************
// Number of elements per SIMD vector for ElementType, taken from SIMDTrait.
280  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
281  //**********************************************************************************************
282 
283  //**Constructor*********************************************************************************
// Constructor for the matrix/matrix multiplication expression.
//
// \param lhs The left-hand side operand of the multiplication expression.
// \param rhs The right-hand side operand of the multiplication expression.
//
// Only stores the two operands. The inner dimensions (lhs.columns() and rhs.rows()) are
// checked solely via BLAZE_INTERNAL_ASSERT, so the caller must pass conforming operands.
289  explicit inline TDMatTDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
290  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
291  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
292  {
293  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
294  }
295  //**********************************************************************************************
296 
297  //**Access operator*****************************************************************************
304  inline ReturnType operator()( size_t i, size_t j ) const {
305  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
306  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
307 
308  if( IsDiagonal<MT1>::value ) {
309  return lhs_(i,i) * rhs_(i,j);
310  }
311  else if( IsDiagonal<MT2>::value ) {
312  return lhs_(i,j) * rhs_(j,j);
313  }
315  const size_t begin( ( IsUpper<MT1>::value )
316  ?( ( IsLower<MT2>::value )
317  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
318  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
319  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
320  :( ( IsLower<MT2>::value )
321  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
322  :( 0UL ) ) );
323  const size_t end( ( IsLower<MT1>::value )
324  ?( ( IsUpper<MT2>::value )
325  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
326  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
327  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
328  :( ( IsUpper<MT2>::value )
329  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
330  :( lhs_.columns() ) ) );
331 
332  if( begin >= end ) return ElementType();
333 
334  const size_t n( end - begin );
335 
336  return subvector( row( lhs_, i ), begin, n ) * subvector( column( rhs_, j ), begin, n );
337  }
338  else {
339  return row( lhs_, i ) * column( rhs_, j );
340  }
341  }
342  //**********************************************************************************************
343 
344  //**At function*********************************************************************************
352  inline ReturnType at( size_t i, size_t j ) const {
353  if( i >= lhs_.rows() ) {
354  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
355  }
356  if( j >= rhs_.columns() ) {
357  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
358  }
359  return (*this)(i,j);
360  }
361  //**********************************************************************************************
362 
363  //**Rows function*******************************************************************************
// Returns the number of rows of the product, which is the row count of the left operand.
368  inline size_t rows() const noexcept {
369  return lhs_.rows();
370  }
371  //**********************************************************************************************
372 
373  //**Columns function****************************************************************************
// Returns the number of columns of the product, which is the column count of the right operand.
378  inline size_t columns() const noexcept {
379  return rhs_.columns();
380  }
381  //**********************************************************************************************
382 
383  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand (by value or const reference, see LeftOperand).
388  inline LeftOperand leftOperand() const noexcept {
389  return lhs_;
390  }
391  //**********************************************************************************************
392 
393  //**Right operand access************************************************************************
// Returns the stored right-hand side operand (by value or const reference, see RightOperand).
398  inline RightOperand rightOperand() const noexcept {
399  return rhs_;
400  }
401  //**********************************************************************************************
402 
403  //**********************************************************************************************
409  template< typename T >
410  inline bool canAlias( const T* alias ) const noexcept {
411  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
412  }
413  //**********************************************************************************************
414 
415  //**********************************************************************************************
421  template< typename T >
422  inline bool isAliased( const T* alias ) const noexcept {
423  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
424  }
425  //**********************************************************************************************
426 
427  //**********************************************************************************************
432  inline bool isAligned() const noexcept {
433  return lhs_.isAligned() && rhs_.isAligned();
434  }
435  //**********************************************************************************************
436 
437  //**********************************************************************************************
442  inline bool canSMPAssign() const noexcept {
443  return ( !BLAZE_BLAS_IS_PARALLEL ||
444  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
445  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
446  }
447  //**********************************************************************************************
448 
449  private:
450  //**Member variables****************************************************************************
// Left-hand side dense matrix of the multiplication expression.
451  LeftOperand lhs_;
// Right-hand side dense matrix of the multiplication expression.
452  RightOperand rhs_;
453  //**********************************************************************************************
454 
455  //**Assignment to dense matrices****************************************************************
468  template< typename MT // Type of the target dense matrix
469  , bool SO > // Storage order of the target dense matrix
471  assign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
472  {
474 
475  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
476  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
477 
478  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
479  return;
480  }
481  else if( rhs.lhs_.columns() == 0UL ) {
482  reset( ~lhs );
483  return;
484  }
485 
486  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
487  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
488 
489  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
490  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
491  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
492  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
493  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
494  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
495 
496  TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
497  }
499  //**********************************************************************************************
500 
501  //**Assignment to dense matrices (kernel selection)*********************************************
512  template< typename MT3 // Type of the left-hand side target matrix
513  , typename MT4 // Type of the left-hand side matrix operand
514  , typename MT5 > // Type of the right-hand side matrix operand
515  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
516  {
517  if( ( IsDiagonal<MT4>::value ) ||
518  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
519  selectSmallAssignKernel( C, A, B );
520  else
521  selectBlasAssignKernel( C, A, B );
522  }
524  //**********************************************************************************************
525 
526  //**Default assignment to dense matrices (general/general)**************************************
// Default (scalar) assignment kernel for C=A*B where neither A nor B is diagonal.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// The product is built column by column. For each column j the inner index k is limited to
// [kbegin,kend) by the triangular structure of B (MT5), and for each k the row range is
// limited to [ibegin,iend) by the triangular structure of A (MT4).
540  template< typename MT3 // Type of the left-hand side target matrix
541  , typename MT4 // Type of the left-hand side matrix operand
542  , typename MT5 > // Type of the right-hand side matrix operand
543  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
544  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
545  {
546  const size_t M( A.rows() );
547  const size_t N( B.columns() );
548  const size_t K( A.columns() );
549 
550  for( size_t j=0UL; j<N; ++j )
551  {
  // Range of k for which B(k,j) can be non-zero according to B's structure.
552  const size_t kbegin( ( IsLower<MT5>::value )
553  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
554  :( 0UL ) );
555  const size_t kend( ( IsUpper<MT5>::value )
556  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
557  :( K ) );
558  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
559 
  // Strictly triangular B with an empty k range: the whole column j of C is reset.
560  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
561  for( size_t i=0UL; i<M; ++i ) {
562  reset( (~C)(i,j) );
563  }
564  continue;
565  }
566 
  // First inner index (k == kbegin): column j is initialised by plain assignment, and the
  // entries outside [ibegin,iend) are reset where the structure of the operands requires it.
567  {
568  const size_t ibegin( ( IsLower<MT4>::value )
569  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
570  :( 0UL ) );
571  const size_t iend( ( IsUpper<MT4>::value )
572  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
573  :( M ) );
574  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
575 
576  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
577  for( size_t i=0UL; i<ibegin; ++i ) {
578  reset( C(i,j) );
579  }
580  }
581  else if( IsStrictlyLower<MT4>::value ) {
582  reset( C(0UL,j) );
583  }
584  for( size_t i=ibegin; i<iend; ++i ) {
585  C(i,j) = A(i,kbegin) * B(kbegin,j);
586  }
587  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
588  for( size_t i=iend; i<M; ++i ) {
589  reset( C(i,j) );
590  }
591  }
592  else if( IsStrictlyUpper<MT4>::value ) {
593  reset( C(M-1UL,j) );
594  }
595  }
596 
  // Remaining inner indices: accumulate into the already initialised entries.
597  for( size_t k=kbegin+1UL; k<kend; ++k )
598  {
599  const size_t ibegin( ( IsLower<MT4>::value )
600  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
601  :( 0UL ) );
602  const size_t iend( ( IsUpper<MT4>::value )
603  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
604  :( M ) );
605  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
606 
607  for( size_t i=ibegin; i<iend; ++i ) {
608  C(i,j) += A(i,k) * B(k,j);
609  }
  // For an upper A, row iend is written by assignment here rather than by '+='.
610  if( IsUpper<MT4>::value ) {
611  C(iend,j) = A(iend,k) * B(k,j);
612  }
613  }
614  }
615  }
617  //**********************************************************************************************
618 
619  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default (scalar) assignment kernel for C=A*B where A is not diagonal and B is diagonal.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
//
// Each element is a single product, C(i,j) = A(i,j) * B(j,j). The row range [ibegin,iend) of
// column j follows the triangular structure of A (MT4); entries outside it are reset.
633  template< typename MT3 // Type of the left-hand side target matrix
634  , typename MT4 // Type of the left-hand side matrix operand
635  , typename MT5 > // Type of the right-hand side matrix operand
636  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
637  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
638  {
640 
641  const size_t M( A.rows() );
642  const size_t N( B.columns() );
643 
644  for( size_t j=0UL; j<N; ++j )
645  {
646  const size_t ibegin( ( IsLower<MT4>::value )
647  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
648  :( 0UL ) );
649  const size_t iend( ( IsUpper<MT4>::value )
650  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
651  :( M ) );
652  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
653 
  // Rows above the structural range of a lower A are reset.
654  if( IsLower<MT4>::value ) {
655  for( size_t i=0UL; i<ibegin; ++i ) {
656  reset( C(i,j) );
657  }
658  }
659  for( size_t i=ibegin; i<iend; ++i ) {
660  C(i,j) = A(i,j) * B(j,j);
661  }
  // Rows below the structural range of an upper A are reset.
662  if( IsUpper<MT4>::value ) {
663  for( size_t i=iend; i<M; ++i ) {
664  reset( C(i,j) );
665  }
666  }
667  }
668  }
670  //**********************************************************************************************
671 
672  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default (scalar) assignment kernel for C=A*B where A is diagonal and B is not diagonal.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Each element is a single product, C(i,j) = A(i,i) * B(i,j). The row range [ibegin,iend) of
// column j follows the triangular structure of B (MT5); entries outside it are reset.
// NOTE(review): the two reset guards test MT4 (the diagonal operand) while the index range is
// derived from MT5. Presumably a diagonal MT4 satisfies both IsLower and IsUpper, so the
// guards are always taken and the loops are bounded by ibegin/iend - confirm.
686  template< typename MT3 // Type of the left-hand side target matrix
687  , typename MT4 // Type of the left-hand side matrix operand
688  , typename MT5 > // Type of the right-hand side matrix operand
689  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
690  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
691  {
693 
694  const size_t M( A.rows() );
695  const size_t N( B.columns() );
696 
697  for( size_t j=0UL; j<N; ++j )
698  {
699  const size_t ibegin( ( IsLower<MT5>::value )
700  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
701  :( 0UL ) );
702  const size_t iend( ( IsUpper<MT5>::value )
703  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
704  :( M ) );
705  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
706 
707  if( IsLower<MT4>::value ) {
708  for( size_t i=0UL; i<ibegin; ++i ) {
709  reset( C(i,j) );
710  }
711  }
712  for( size_t i=ibegin; i<iend; ++i ) {
713  C(i,j) = A(i,i) * B(i,j);
714  }
715  if( IsUpper<MT4>::value ) {
716  for( size_t i=iend; i<M; ++i ) {
717  reset( C(i,j) );
718  }
719  }
720  }
721  }
723  //**********************************************************************************************
724 
725  //**Default assignment to dense matrices (diagonal/diagonal)************************************
739  template< typename MT3 // Type of the left-hand side target matrix
740  , typename MT4 // Type of the left-hand side matrix operand
741  , typename MT5 > // Type of the right-hand side matrix operand
742  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
743  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
744  {
746 
747  reset( C );
748 
749  for( size_t i=0UL; i<A.rows(); ++i ) {
750  C(i,i) = A(i,i) * B(i,i);
751  }
752  }
754  //**********************************************************************************************
755 
756  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel for C=A*B when the vectorized default kernel cannot be used.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Enabled only if UseVectorizedDefaultKernel is false for the three types. It forwards
// unchanged to the matching selectDefaultAssignKernel() overload.
770  template< typename MT3 // Type of the left-hand side target matrix
771  , typename MT4 // Type of the left-hand side matrix operand
772  , typename MT5 > // Type of the right-hand side matrix operand
773  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
774  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
775  {
776  selectDefaultAssignKernel( C, A, B );
777  }
779  //**********************************************************************************************
780 
781  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
796  template< typename MT3 // Type of the left-hand side target matrix
797  , typename MT4 // Type of the left-hand side matrix operand
798  , typename MT5 > // Type of the right-hand side matrix operand
799  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
800  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
801  {
806 
807  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
808  const OppositeType_<MT5> tmp( serial( B ) );
809  assign( ~C, A * tmp );
810  }
811  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
812  const OppositeType_<MT4> tmp( serial( A ) );
813  assign( ~C, tmp * B );
814  }
815  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
816  const OppositeType_<MT5> tmp( serial( B ) );
817  assign( ~C, A * tmp );
818  }
819  else {
820  const OppositeType_<MT4> tmp( serial( A ) );
821  assign( ~C, tmp * B );
822  }
823  }
825  //**********************************************************************************************
826 
827  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized small-matrix assignment kernel for C=A*B with a column-major target.
//
// \param C The target left-hand side (column-major) dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// The rows of C are processed in blocks of 8, 4, 2 and 1 SIMD vectors (SIMDSIZE elements
// each). For every block the columns of C are visited and the products A(:,k)*B(k,j) are
// accumulated in SIMD registers over the inner index range [kbegin,kend), which is narrowed
// according to the triangular structure of A (MT4) and B (MT5). Rows that do not fill a whole
// SIMD vector are handled by a scalar loop at the end when 'remainder' is set.
// NOTE(review): the xmm accumulators are declared without an initializer; this relies on
// SIMDType default-constructing to zero - confirm.
842  template< typename MT3 // Type of the left-hand side target matrix
843  , typename MT4 // Type of the left-hand side matrix operand
844  , typename MT5 > // Type of the right-hand side matrix operand
845  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
846  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
847  {
848  const size_t M( A.rows() );
849  const size_t N( B.columns() );
850  const size_t K( A.columns() );
851 
  // A scalar remainder loop is needed unless both C and A are padded.
852  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
853 
  // ipos: end of the row range handled by the SIMD loops (M rounded down to a multiple of
  // SIMDSIZE when a remainder loop is used, M otherwise).
854  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
855  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
856 
857  size_t i( 0UL );
858 
  // Row blocks of 8 SIMD vectors, one column of C at a time.
859  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
860  for( size_t j=0UL; j<N; ++j )
861  {
862  const size_t kbegin( ( IsLower<MT5>::value )
863  ?( ( IsUpper<MT4>::value )
864  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
865  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
866  :( IsUpper<MT4>::value ? i : 0UL ) );
867  const size_t kend( ( IsUpper<MT5>::value )
868  ?( ( IsLower<MT4>::value )
869  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
870  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
871  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
872 
873  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
874 
875  for( size_t k=kbegin; k<kend; ++k ) {
876  const SIMDType b1( set( B(k,j) ) );
877  xmm1 = xmm1 + A.load(i ,k) * b1;
878  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
879  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
880  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
881  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
882  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
883  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
884  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
885  }
886 
887  (~C).store( i , j, xmm1 );
888  (~C).store( i+SIMDSIZE , j, xmm2 );
889  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
890  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
891  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
892  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
893  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
894  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
895  }
896  }
897 
  // Row blocks of 4 SIMD vectors, two columns of C at a time plus a single-column tail.
898  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
899  {
900  size_t j( 0UL );
901 
902  for( ; (j+2UL) <= N; j+=2UL )
903  {
904  const size_t kbegin( ( IsLower<MT5>::value )
905  ?( ( IsUpper<MT4>::value )
906  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
907  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
908  :( IsUpper<MT4>::value ? i : 0UL ) );
909  const size_t kend( ( IsUpper<MT5>::value )
910  ?( ( IsLower<MT4>::value )
911  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
912  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
913  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
914 
915  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
916 
917  for( size_t k=kbegin; k<kend; ++k ) {
918  const SIMDType a1( A.load(i ,k) );
919  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
920  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
921  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
922  const SIMDType b1( set( B(k,j ) ) );
923  const SIMDType b2( set( B(k,j+1UL) ) );
924  xmm1 = xmm1 + a1 * b1;
925  xmm2 = xmm2 + a2 * b1;
926  xmm3 = xmm3 + a3 * b1;
927  xmm4 = xmm4 + a4 * b1;
928  xmm5 = xmm5 + a1 * b2;
929  xmm6 = xmm6 + a2 * b2;
930  xmm7 = xmm7 + a3 * b2;
931  xmm8 = xmm8 + a4 * b2;
932  }
933 
934  (~C).store( i , j , xmm1 );
935  (~C).store( i+SIMDSIZE , j , xmm2 );
936  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
937  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
938  (~C).store( i , j+1UL, xmm5 );
939  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
940  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
941  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
942  }
943 
  // Last column when N is odd.
944  if( j < N )
945  {
946  const size_t kbegin( ( IsLower<MT5>::value )
947  ?( ( IsUpper<MT4>::value )
948  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
949  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
950  :( IsUpper<MT4>::value ? i : 0UL ) );
951  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
952 
953  SIMDType xmm1, xmm2, xmm3, xmm4;
954 
955  for( size_t k=kbegin; k<kend; ++k ) {
956  const SIMDType b1( set( B(k,j) ) );
957  xmm1 = xmm1 + A.load(i ,k) * b1;
958  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
959  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
960  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
961  }
962 
963  (~C).store( i , j, xmm1 );
964  (~C).store( i+SIMDSIZE , j, xmm2 );
965  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
966  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
967  }
968  }
969 
  // Row blocks of 2 SIMD vectors, two columns of C at a time plus a single-column tail.
970  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
971  {
972  size_t j( 0UL );
973 
974  for( ; (j+2UL) <= N; j+=2UL )
975  {
976  const size_t kbegin( ( IsLower<MT5>::value )
977  ?( ( IsUpper<MT4>::value )
978  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
979  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
980  :( IsUpper<MT4>::value ? i : 0UL ) );
981  const size_t kend( ( IsUpper<MT5>::value )
982  ?( ( IsLower<MT4>::value )
983  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
984  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
985  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
986 
987  SIMDType xmm1, xmm2, xmm3, xmm4;
988 
989  for( size_t k=kbegin; k<kend; ++k ) {
990  const SIMDType a1( A.load(i ,k) );
991  const SIMDType a2( A.load(i+SIMDSIZE,k) );
992  const SIMDType b1( set( B(k,j ) ) );
993  const SIMDType b2( set( B(k,j+1UL) ) );
994  xmm1 = xmm1 + a1 * b1;
995  xmm2 = xmm2 + a2 * b1;
996  xmm3 = xmm3 + a1 * b2;
997  xmm4 = xmm4 + a2 * b2;
998  }
999 
1000  (~C).store( i , j , xmm1 );
1001  (~C).store( i+SIMDSIZE, j , xmm2 );
1002  (~C).store( i , j+1UL, xmm3 );
1003  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1004  }
1005 
  // Last column when N is odd.
1006  if( j < N )
1007  {
1008  const size_t kbegin( ( IsLower<MT5>::value )
1009  ?( ( IsUpper<MT4>::value )
1010  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1011  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1012  :( IsUpper<MT4>::value ? i : 0UL ) );
1013  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
1014 
1015  SIMDType xmm1, xmm2;
1016 
1017  for( size_t k=kbegin; k<kend; ++k ) {
1018  const SIMDType b1( set( B(k,j) ) );
1019  xmm1 = xmm1 + A.load(i ,k) * b1;
1020  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
1021  }
1022 
1023  (~C).store( i , j, xmm1 );
1024  (~C).store( i+SIMDSIZE, j, xmm2 );
1025  }
1026  }
1027 
  // Single SIMD vector per row block, two columns of C at a time plus a single-column tail.
1028  for( ; i<ipos; i+=SIMDSIZE )
1029  {
1030  size_t j( 0UL );
1031 
1032  for( ; (j+2UL) <= N; j+=2UL )
1033  {
1034  const size_t kbegin( ( IsLower<MT5>::value )
1035  ?( ( IsUpper<MT4>::value )
1036  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1037  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1038  :( IsUpper<MT4>::value ? i : 0UL ) );
1039  const size_t kend( ( IsUpper<MT5>::value )
1040  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1041  :( K ) );
1042 
1043  SIMDType xmm1, xmm2;
1044 
1045  for( size_t k=kbegin; k<kend; ++k ) {
1046  const SIMDType a1( A.load(i,k) );
1047  xmm1 = xmm1 + a1 * set( B(k,j ) );
1048  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1049  }
1050 
1051  (~C).store( i, j , xmm1 );
1052  (~C).store( i, j+1UL, xmm2 );
1053  }
1054 
  // Last column when N is odd.
1055  if( j < N )
1056  {
1057  const size_t kbegin( ( IsLower<MT5>::value )
1058  ?( ( IsUpper<MT4>::value )
1059  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1060  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1061  :( IsUpper<MT4>::value ? i : 0UL ) );
1062 
1063  SIMDType xmm1;
1064 
1065  for( size_t k=kbegin; k<K; ++k ) {
1066  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
1067  }
1068 
1069  (~C).store( i, j, xmm1 );
1070  }
1071  }
1072 
  // Scalar loop over the rows left over after the SIMD loops (only when 'remainder' is set).
1073  for( ; remainder && i<M; ++i )
1074  {
1075  size_t j( 0UL );
1076 
1077  for( ; (j+2UL) <= N; j+=2UL )
1078  {
1079  const size_t kbegin( ( IsLower<MT5>::value )
1080  ?( ( IsUpper<MT4>::value )
1081  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1082  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1083  :( IsUpper<MT4>::value ? i : 0UL ) );
1084  const size_t kend( ( IsUpper<MT5>::value )
1085  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1086  :( K ) );
1087 
1088  ElementType value1 = ElementType();
1089  ElementType value2 = ElementType();
1090 
1091  for( size_t k=kbegin; k<kend; ++k ) {
1092  value1 += A(i,k) * B(k,j );
1093  value2 += A(i,k) * B(k,j+1UL);
1094  }
1095 
1096  (~C)(i,j ) = value1;
1097  (~C)(i,j+1UL) = value2;
1098  }
1099 
  // Last column when N is odd.
1100  if( j < N )
1101  {
1102  const size_t kbegin( ( IsLower<MT5>::value )
1103  ?( ( IsUpper<MT4>::value )
1104  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1105  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1106  :( IsUpper<MT4>::value ? i : 0UL ) );
1107 
1108  ElementType value = ElementType();
1109 
1110  for( size_t k=kbegin; k<K; ++k ) {
1111  value += A(i,k) * B(k,j);
1112  }
1113 
1114  (~C)(i,j) = value;
1115  }
1116  }
1117  }
1119  //**********************************************************************************************
1120 
1121  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel for C=A*B when the vectorized default kernel cannot be used.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Enabled only if UseVectorizedDefaultKernel is false for the three types. It forwards
// unchanged to the matching selectDefaultAssignKernel() overload.
1135  template< typename MT3 // Type of the left-hand side target matrix
1136  , typename MT4 // Type of the left-hand side matrix operand
1137  , typename MT5 > // Type of the right-hand side matrix operand
1138  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1139  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1140  {
1141  selectDefaultAssignKernel( C, A, B );
1142  }
1144  //**********************************************************************************************
1145 
1146  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
1161  template< typename MT3 // Type of the left-hand side target matrix
1162  , typename MT4 // Type of the left-hand side matrix operand
1163  , typename MT5 > // Type of the right-hand side matrix operand
1164  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1165  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1166  {
1167  selectSmallAssignKernel( ~C, A, B );
1168  }
1170  //**********************************************************************************************
1171 
1172  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
// Vectorized, blocked kernel for the assignment C = A * B with a column-major target matrix.
//
//   C  target dense matrix (column-major), always accessed through ~C
//   A  left-hand side multiplication operand
//   B  right-hand side multiplication operand
//
// The iteration space is tiled with TDMATTDMATMULT_IBLOCK_SIZE (rows), TDMATTDMATMULT_JBLOCK_SIZE
// (columns) and TDMATTDMATMULT_KBLOCK_SIZE (inner dimension). Every (ii,jj) tile of C is reset
// once and then accumulated over all kk blocks, i.e. partial sums are loaded from C, updated and
// stored back for each k-block. SIMD vectors run along i (rows of A and C); the scalar B(k,j) is
// broadcast with set(). Row groups of 4, 2 and 1 SIMD vectors are processed in that order, rows
// beyond ipos are handled by a scalar loop.
1187  template< typename MT3 // Type of the left-hand side target matrix
1188  , typename MT4 // Type of the left-hand side matrix operand
1189  , typename MT5 > // Type of the right-hand side matrix operand
1190  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1191  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1192  {
1193  const size_t M( A.rows() );
1194  const size_t N( B.columns() );
1195  const size_t K( A.columns() );
1196 
// A scalar clean-up loop over rows is only required if C or A is not padded.
1197  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1198 
1199  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
1200  {
1201  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
1202 
// End of the rows handled with SIMD in this row block: iend rounded down to a multiple of
// SIMDSIZE when a remainder loop exists, iend itself otherwise.
1203  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1204  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1205 
1206  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
1207  {
1208  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
1209 
// Clear the current tile of C; the k-blocks below accumulate into it.
1210  for( size_t j=jj; j<jend; ++j ) {
1211  for( size_t i=ii; i<iend; ++i ) {
1212  reset( (~C)(i,j) );
1213  }
1214  }
1215 
1216  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
1217  {
1218  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
1219 
// i is shared by all row loops below: each loop continues where the previous one stopped.
1220  size_t i( ii );
1221 
// Row group of 4 SIMD vectors.
1222  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1223  {
1224  const size_t i1( i+SIMDSIZE );
1225  const size_t i2( i+SIMDSIZE*2UL );
1226  const size_t i3( i+SIMDSIZE*3UL );
1227 
1228  size_t j( jj );
1229 
// 4 SIMD vectors x 2 columns per iteration (8 accumulators).
1230  for( ; (j+2UL) <= jend; j+=2UL )
1231  {
// The k-range is clipped to the current k-block [kk,ktmp) and further narrowed according to
// the IsUpper/IsLower traits of A and B (presumably to skip structurally zero elements of
// triangular operands - confirm against the trait definitions).
1232  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1233  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1234  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
1235  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1236 
// Continue from the partial sums already stored in C by earlier k-blocks.
1237  SIMDType xmm1( (~C).load(i ,j ) );
1238  SIMDType xmm2( (~C).load(i1,j ) );
1239  SIMDType xmm3( (~C).load(i2,j ) );
1240  SIMDType xmm4( (~C).load(i3,j ) );
1241  SIMDType xmm5( (~C).load(i ,j+1UL) );
1242  SIMDType xmm6( (~C).load(i1,j+1UL) );
1243  SIMDType xmm7( (~C).load(i2,j+1UL) );
1244  SIMDType xmm8( (~C).load(i3,j+1UL) );
1245 
1246  for( size_t k=kbegin; k<kend; ++k ) {
1247  const SIMDType a1( A.load(i ,k) );
1248  const SIMDType a2( A.load(i1,k) );
1249  const SIMDType a3( A.load(i2,k) );
1250  const SIMDType a4( A.load(i3,k) );
1251  const SIMDType b1( set( B(k,j ) ) );
1252  const SIMDType b2( set( B(k,j+1UL) ) );
1253  xmm1 = xmm1 + a1 * b1;
1254  xmm2 = xmm2 + a2 * b1;
1255  xmm3 = xmm3 + a3 * b1;
1256  xmm4 = xmm4 + a4 * b1;
1257  xmm5 = xmm5 + a1 * b2;
1258  xmm6 = xmm6 + a2 * b2;
1259  xmm7 = xmm7 + a3 * b2;
1260  xmm8 = xmm8 + a4 * b2;
1261  }
1262 
1263  (~C).store( i , j , xmm1 );
1264  (~C).store( i1, j , xmm2 );
1265  (~C).store( i2, j , xmm3 );
1266  (~C).store( i3, j , xmm4 );
1267  (~C).store( i , j+1UL, xmm5 );
1268  (~C).store( i1, j+1UL, xmm6 );
1269  (~C).store( i2, j+1UL, xmm7 );
1270  (~C).store( i3, j+1UL, xmm8 );
1271  }
1272 
// Single leftover column of the tile (odd column count).
1273  if( j < jend )
1274  {
1275  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1276  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1277  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
1278  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1279 
1280  SIMDType xmm1( (~C).load(i ,j) );
1281  SIMDType xmm2( (~C).load(i1,j) );
1282  SIMDType xmm3( (~C).load(i2,j) );
1283  SIMDType xmm4( (~C).load(i3,j) );
1284 
1285  for( size_t k=kbegin; k<kend; ++k ) {
1286  const SIMDType b1( set( B(k,j) ) );
1287  xmm1 = xmm1 + A.load(i ,k) * b1;
1288  xmm2 = xmm2 + A.load(i1,k) * b1;
1289  xmm3 = xmm3 + A.load(i2,k) * b1;
1290  xmm4 = xmm4 + A.load(i3,k) * b1;
1291  }
1292 
1293  (~C).store( i , j, xmm1 );
1294  (~C).store( i1, j, xmm2 );
1295  (~C).store( i2, j, xmm3 );
1296  (~C).store( i3, j, xmm4 );
1297  }
1298  }
1299 
// Row group of 2 SIMD vectors.
1300  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1301  {
1302  const size_t i1( i+SIMDSIZE );
1303 
1304  size_t j( jj );
1305 
// 2 SIMD vectors x 4 columns per iteration (8 accumulators).
1306  for( ; (j+4UL) <= jend; j+=4UL )
1307  {
1308  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1309  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1310  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
1311  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
1312 
1313  SIMDType xmm1( (~C).load(i ,j ) );
1314  SIMDType xmm2( (~C).load(i1,j ) );
1315  SIMDType xmm3( (~C).load(i ,j+1UL) );
1316  SIMDType xmm4( (~C).load(i1,j+1UL) );
1317  SIMDType xmm5( (~C).load(i ,j+2UL) );
1318  SIMDType xmm6( (~C).load(i1,j+2UL) );
1319  SIMDType xmm7( (~C).load(i ,j+3UL) );
1320  SIMDType xmm8( (~C).load(i1,j+3UL) );
1321 
1322  for( size_t k=kbegin; k<kend; ++k ) {
1323  const SIMDType a1( A.load(i ,k) );
1324  const SIMDType a2( A.load(i1,k) );
1325  const SIMDType b1( set( B(k,j ) ) );
1326  const SIMDType b2( set( B(k,j+1UL) ) );
1327  const SIMDType b3( set( B(k,j+2UL) ) );
1328  const SIMDType b4( set( B(k,j+3UL) ) );
1329  xmm1 = xmm1 + a1 * b1;
1330  xmm2 = xmm2 + a2 * b1;
1331  xmm3 = xmm3 + a1 * b2;
1332  xmm4 = xmm4 + a2 * b2;
1333  xmm5 = xmm5 + a1 * b3;
1334  xmm6 = xmm6 + a2 * b3;
1335  xmm7 = xmm7 + a1 * b4;
1336  xmm8 = xmm8 + a2 * b4;
1337  }
1338 
1339  (~C).store( i , j , xmm1 );
1340  (~C).store( i1, j , xmm2 );
1341  (~C).store( i , j+1UL, xmm3 );
1342  (~C).store( i1, j+1UL, xmm4 );
1343  (~C).store( i , j+2UL, xmm5 );
1344  (~C).store( i1, j+2UL, xmm6 );
1345  (~C).store( i , j+3UL, xmm7 );
1346  (~C).store( i1, j+3UL, xmm8 );
1347  }
1348 
// 2 SIMD vectors x 2 columns for what the 4-column loop left over.
1349  for( ; (j+2UL) <= jend; j+=2UL )
1350  {
1351  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1352  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1353  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
1354  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1355 
1356  SIMDType xmm1( (~C).load(i ,j ) );
1357  SIMDType xmm2( (~C).load(i1,j ) );
1358  SIMDType xmm3( (~C).load(i ,j+1UL) );
1359  SIMDType xmm4( (~C).load(i1,j+1UL) );
1360 
1361  for( size_t k=kbegin; k<kend; ++k ) {
1362  const SIMDType a1( A.load(i ,k) );
1363  const SIMDType a2( A.load(i1,k) );
1364  const SIMDType b1( set( B(k,j ) ) );
1365  const SIMDType b2( set( B(k,j+1UL) ) );
1366  xmm1 = xmm1 + a1 * b1;
1367  xmm2 = xmm2 + a2 * b1;
1368  xmm3 = xmm3 + a1 * b2;
1369  xmm4 = xmm4 + a2 * b2;
1370  }
1371 
1372  (~C).store( i , j , xmm1 );
1373  (~C).store( i1, j , xmm2 );
1374  (~C).store( i , j+1UL, xmm3 );
1375  (~C).store( i1, j+1UL, xmm4 );
1376  }
1377 
// Single leftover column.
1378  if( j < jend )
1379  {
1380  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1381  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1382  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
1383  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1384 
1385  SIMDType xmm1( (~C).load(i ,j) );
1386  SIMDType xmm2( (~C).load(i1,j) );
1387 
1388  for( size_t k=kbegin; k<kend; ++k ) {
1389  const SIMDType b1( set( B(k,j) ) );
1390  xmm1 = xmm1 + A.load(i ,k) * b1;
1391  xmm2 = xmm2 + A.load(i1,k) * b1;
1392  }
1393 
1394  (~C).store( i , j, xmm1 );
1395  (~C).store( i1, j, xmm2 );
1396  }
1397  }
1398 
// Remaining single SIMD vectors, one column at a time.
1399  for( ; i<ipos; i+=SIMDSIZE )
1400  {
1401  for( size_t j=jj; j<jend; ++j )
1402  {
1403  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1404  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1405  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
1406  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1407 
1408  SIMDType xmm1( (~C).load(i,j) );
1409 
1410  for( size_t k=kbegin; k<kend; ++k ) {
1411  const SIMDType b1( set( B(k,j) ) );
1412  xmm1 = xmm1 + A.load(i,k) * b1;
1413  }
1414 
1415  (~C).store( i, j, xmm1 );
1416  }
1417  }
1418 
// Scalar clean-up for the rows in [ipos,iend); skipped entirely when no remainder is needed.
1419  for( ; remainder && i<iend; ++i )
1420  {
1421  for( size_t j=jj; j<jend; ++j )
1422  {
1423  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1424  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1425  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
1426  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1427 
1428  ElementType value( (~C)(i,j) );
1429 
1430  for( size_t k=kbegin; k<kend; ++k ) {
1431  value += A(i,k) * B(k,j);
1432  }
1433 
1434  (~C)(i,j) = value;
1435  }
1436  }
1437  }
1438  }
1439  }
1440  }
1442  //**********************************************************************************************
1443 
1444  //**BLAS-based assignment to dense matrices (default)*******************************************
1458  template< typename MT3 // Type of the left-hand side target matrix
1459  , typename MT4 // Type of the left-hand side matrix operand
1460  , typename MT5 > // Type of the right-hand side matrix operand
1461  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
1462  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1463  {
1464  selectLargeAssignKernel( C, A, B );
1465  }
1467  //**********************************************************************************************
1468 
1469  //**BLAS-based assignment to dense matrices*****************************************************
1470 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1471 
1484  template< typename MT3 // Type of the left-hand side target matrix
1485  , typename MT4 // Type of the left-hand side matrix operand
1486  , typename MT5 > // Type of the right-hand side matrix operand
1487  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
1488  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1489  {
1490  typedef ElementType_<MT3> ET;
1491 
1492  if( IsTriangular<MT4>::value ) {
1493  assign( C, B );
1494  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1495  }
1496  else if( IsTriangular<MT5>::value ) {
1497  assign( C, A );
1498  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1499  }
1500  else {
1501  gemm( C, A, B, ET(1), ET(0) );
1502  }
1503  }
1505 #endif
1506  //**********************************************************************************************
1507 
1508  //**Assignment to sparse matrices***************************************************************
1521  template< typename MT // Type of the target sparse matrix
1522  , bool SO > // Storage order of the target sparse matrix
1523  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
1524  assign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1525  {
1527 
1528  typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
1529 
1535  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
1536 
1537  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1538  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1539 
1540  const TmpType tmp( serial( rhs ) );
1541  assign( ~lhs, tmp );
1542  }
1544  //**********************************************************************************************
1545 
1546  //**Restructuring assignment to row-major matrices**********************************************
1561  template< typename MT > // Type of the target matrix
1562  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
1563  assign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
1564  {
1566 
1568 
1569  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1570  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1571 
1572  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
1573  assign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
1574  else if( IsSymmetric<MT1>::value )
1575  assign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
1576  else
1577  assign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
1578  }
1580  //**********************************************************************************************
1581 
1582  //**Addition assignment to dense matrices*******************************************************
1595  template< typename MT // Type of the target dense matrix
1596  , bool SO > // Storage order of the target dense matrix
1597  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
1598  addAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1599  {
1601 
1602  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1603  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1604 
1605  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1606  return;
1607  }
1608 
1609  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1610  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1611 
1612  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1613  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1614  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1615  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1616  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1617  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1618 
1619  TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1620  }
1622  //**********************************************************************************************
1623 
1624  //**Addition assignment to dense matrices (kernel selection)************************************
1635  template< typename MT3 // Type of the left-hand side target matrix
1636  , typename MT4 // Type of the left-hand side matrix operand
1637  , typename MT5 > // Type of the right-hand side matrix operand
1638  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1639  {
1640  if( ( IsDiagonal<MT4>::value ) ||
1641  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1642  selectSmallAddAssignKernel( C, A, B );
1643  else
1644  selectBlasAddAssignKernel( C, A, B );
1645  }
1647  //**********************************************************************************************
1648 
1649  //**Default addition assignment to dense matrices (general/general)*****************************
1663  template< typename MT3 // Type of the left-hand side target matrix
1664  , typename MT4 // Type of the left-hand side matrix operand
1665  , typename MT5 > // Type of the right-hand side matrix operand
1666  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
1667  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1668  {
1669  const size_t M( A.rows() );
1670  const size_t N( B.columns() );
1671  const size_t K( A.columns() );
1672 
1673  for( size_t j=0UL; j<N; ++j )
1674  {
1675  const size_t kbegin( ( IsLower<MT5>::value )
1676  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1677  :( 0UL ) );
1678  const size_t kend( ( IsUpper<MT5>::value )
1679  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1680  :( K ) );
1681  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1682 
1683  for( size_t k=kbegin; k<kend; ++k )
1684  {
1685  const size_t ibegin( ( IsLower<MT4>::value )
1686  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
1687  :( 0UL ) );
1688  const size_t iend( ( IsUpper<MT4>::value )
1689  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
1690  :( M ) );
1691  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1692 
1693  const size_t inum( iend - ibegin );
1694  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1695 
1696  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1697  C(i ,j) += A(i ,k) * B(k,j);
1698  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1699  }
1700  if( ipos < iend ) {
1701  C(ipos,j) += A(ipos,k) * B(k,j);
1702  }
1703  }
1704  }
1705  }
1707  //**********************************************************************************************
1708 
1709  //**Default addition assignment to dense matrices (general/diagonal)****************************
1723  template< typename MT3 // Type of the left-hand side target matrix
1724  , typename MT4 // Type of the left-hand side matrix operand
1725  , typename MT5 > // Type of the right-hand side matrix operand
1726  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
1727  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1728  {
1730 
1731  const size_t M( A.rows() );
1732  const size_t N( B.columns() );
1733 
1734  for( size_t j=0UL; j<N; ++j )
1735  {
1736  const size_t ibegin( ( IsLower<MT4>::value )
1737  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
1738  :( 0UL ) );
1739  const size_t iend( ( IsUpper<MT4>::value )
1740  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
1741  :( M ) );
1742  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1743 
1744  const size_t inum( iend - ibegin );
1745  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1746 
1747  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1748  C(i ,j) += A(i ,j) * B(j,j);
1749  C(i+1UL,j) += A(i+1UL,j) * B(j,j);
1750  }
1751  if( ipos < iend ) {
1752  C(ipos,j) += A(ipos,j) * B(j,j);
1753  }
1754  }
1755  }
1757  //**********************************************************************************************
1758 
1759  //**Default addition assignment to dense matrices (diagonal/general)****************************
1773  template< typename MT3 // Type of the left-hand side target matrix
1774  , typename MT4 // Type of the left-hand side matrix operand
1775  , typename MT5 > // Type of the right-hand side matrix operand
1776  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
1777  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1778  {
1780 
1781  const size_t M( A.rows() );
1782  const size_t N( B.columns() );
1783 
1784  for( size_t j=0UL; j<N; ++j )
1785  {
1786  const size_t ibegin( ( IsLower<MT5>::value )
1787  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1788  :( 0UL ) );
1789  const size_t iend( ( IsUpper<MT5>::value )
1790  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1791  :( M ) );
1792  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1793 
1794  const size_t inum( iend - ibegin );
1795  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1796 
1797  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1798  C(i ,j) += A(i ,i ) * B(i ,j);
1799  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
1800  }
1801  if( ipos < iend ) {
1802  C(ipos,j) += A(ipos,ipos) * B(ipos,j);
1803  }
1804  }
1805  }
1807  //**********************************************************************************************
1808 
1809  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
1823  template< typename MT3 // Type of the left-hand side target matrix
1824  , typename MT4 // Type of the left-hand side matrix operand
1825  , typename MT5 > // Type of the right-hand side matrix operand
1826  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
1827  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1828  {
1830 
1831  for( size_t i=0UL; i<A.rows(); ++i ) {
1832  C(i,i) += A(i,i) * B(i,i);
1833  }
1834  }
1836  //**********************************************************************************************
1837 
1838  //**Default addition assignment to dense matrices (small matrices)******************************
1852  template< typename MT3 // Type of the left-hand side target matrix
1853  , typename MT4 // Type of the left-hand side matrix operand
1854  , typename MT5 > // Type of the right-hand side matrix operand
1855  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1856  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1857  {
1858  selectDefaultAddAssignKernel( C, A, B );
1859  }
1861  //**********************************************************************************************
1862 
1863  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
1878  template< typename MT3 // Type of the left-hand side target matrix
1879  , typename MT4 // Type of the left-hand side matrix operand
1880  , typename MT5 > // Type of the right-hand side matrix operand
1881  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1882  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1883  {
1886  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT4> );
1887  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT5> );
1888 
1889  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1890  const OppositeType_<MT5> tmp( serial( B ) );
1891  addAssign( ~C, A * tmp );
1892  }
1893  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1894  const OppositeType_<MT4> tmp( serial( A ) );
1895  addAssign( ~C, tmp * B );
1896  }
1897  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
1898  const OppositeType_<MT5> tmp( serial( B ) );
1899  addAssign( ~C, A * tmp );
1900  }
1901  else {
1902  const OppositeType_<MT4> tmp( serial( A ) );
1903  addAssign( ~C, tmp * B );
1904  }
1905  }
1907  //**********************************************************************************************
1908 
1909  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized kernel for the addition assignment C += A * B with a column-major target matrix.
//
//   C  target dense matrix (column-major), always accessed through ~C
//   A  left-hand side multiplication operand
//   B  right-hand side multiplication operand
//
// The kernel is not blocked. SIMD vectors run along i (rows of A and C); the scalar B(k,j) is
// broadcast with set(). Row groups of 8, 4, 2 and 1 SIMD vectors are processed in that order,
// rows beyond ipos are handled by a scalar loop. Because this is an addition assignment, every
// accumulator is initialised from the current content of C and written back afterwards.
// In all loops the k-range is narrowed according to the IsLower/IsUpper (and strict) traits of
// A and B (presumably to skip structurally zero elements of triangular operands - confirm
// against the trait definitions).
1924  template< typename MT3 // Type of the left-hand side target matrix
1925  , typename MT4 // Type of the left-hand side matrix operand
1926  , typename MT5 > // Type of the right-hand side matrix operand
1927  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1928  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1929  {
1930  const size_t M( A.rows() );
1931  const size_t N( B.columns() );
1932  const size_t K( A.columns() );
1933 
// A scalar clean-up loop over rows is only required if C or A is not padded.
1934  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1935 
// End of the rows handled with SIMD: M rounded down to a multiple of SIMDSIZE when a remainder
// loop exists, M itself otherwise.
1936  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
1937  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1938 
// i is shared by all row loops below: each loop continues where the previous one stopped.
1939  size_t i( 0UL );
1940 
// Row group of 8 SIMD vectors, one column per iteration.
1941  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1942  for( size_t j=0UL; j<N; ++j )
1943  {
1944  const size_t kbegin( ( IsLower<MT5>::value )
1945  ?( ( IsUpper<MT4>::value )
1946  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1947  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1948  :( IsUpper<MT4>::value ? i : 0UL ) );
1949  const size_t kend( ( IsUpper<MT5>::value )
1950  ?( ( IsLower<MT4>::value )
1951  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1952  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1953  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
1954 
1955  SIMDType xmm1( (~C).load(i ,j) );
1956  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
1957  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
1958  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
1959  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
1960  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
1961  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
1962  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
1963 
1964  for( size_t k=kbegin; k<kend; ++k ) {
1965  const SIMDType b1( set( B(k,j) ) );
1966  xmm1 = xmm1 + A.load(i ,k) * b1;
1967  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
1968  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
1969  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
1970  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
1971  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
1972  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
1973  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
1974  }
1975 
1976  (~C).store( i , j, xmm1 );
1977  (~C).store( i+SIMDSIZE , j, xmm2 );
1978  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1979  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1980  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1981  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1982  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1983  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
1984  }
1985  }
1986 
// Row group of 4 SIMD vectors.
1987  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1988  {
1989  size_t j( 0UL );
1990 
// 4 SIMD vectors x 2 columns per iteration (8 accumulators).
1991  for( ; (j+2UL) <= N; j+=2UL )
1992  {
1993  const size_t kbegin( ( IsLower<MT5>::value )
1994  ?( ( IsUpper<MT4>::value )
1995  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1996  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1997  :( IsUpper<MT4>::value ? i : 0UL ) );
1998  const size_t kend( ( IsUpper<MT5>::value )
1999  ?( ( IsLower<MT4>::value )
2000  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2001  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2002  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
2003 
2004  SIMDType xmm1( (~C).load(i ,j ) );
2005  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2006  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2007  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
2008  SIMDType xmm5( (~C).load(i ,j+1UL) );
2009  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
2010  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2011  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2012 
2013  for( size_t k=kbegin; k<kend; ++k ) {
2014  const SIMDType a1( A.load(i ,k) );
2015  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2016  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2017  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2018  const SIMDType b1( set( B(k,j ) ) );
2019  const SIMDType b2( set( B(k,j+1UL) ) );
2020  xmm1 = xmm1 + a1 * b1;
2021  xmm2 = xmm2 + a2 * b1;
2022  xmm3 = xmm3 + a3 * b1;
2023  xmm4 = xmm4 + a4 * b1;
2024  xmm5 = xmm5 + a1 * b2;
2025  xmm6 = xmm6 + a2 * b2;
2026  xmm7 = xmm7 + a3 * b2;
2027  xmm8 = xmm8 + a4 * b2;
2028  }
2029 
2030  (~C).store( i , j , xmm1 );
2031  (~C).store( i+SIMDSIZE , j , xmm2 );
2032  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2033  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2034  (~C).store( i , j+1UL, xmm5 );
2035  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
2036  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2037  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2038  }
2039 
// Single leftover column (odd N). Note that kend is not narrowed by the traits of B here.
2040  if( j < N )
2041  {
2042  const size_t kbegin( ( IsLower<MT5>::value )
2043  ?( ( IsUpper<MT4>::value )
2044  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2045  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2046  :( IsUpper<MT4>::value ? i : 0UL ) );
2047  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
2048 
2049  SIMDType xmm1( (~C).load(i ,j) );
2050  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2051  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2052  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2053 
2054  for( size_t k=kbegin; k<kend; ++k ) {
2055  const SIMDType b1( set( B(k,j) ) );
2056  xmm1 = xmm1 + A.load(i ,k) * b1;
2057  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
2058  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
2059  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
2060  }
2061 
2062  (~C).store( i , j, xmm1 );
2063  (~C).store( i+SIMDSIZE , j, xmm2 );
2064  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2065  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2066  }
2067  }
2068 
// Row group of 2 SIMD vectors.
2069  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2070  {
2071  size_t j( 0UL );
2072 
// 2 SIMD vectors x 2 columns per iteration (4 accumulators).
2073  for( ; (j+2UL) <= N; j+=2UL )
2074  {
2075  const size_t kbegin( ( IsLower<MT5>::value )
2076  ?( ( IsUpper<MT4>::value )
2077  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2078  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2079  :( IsUpper<MT4>::value ? i : 0UL ) );
2080  const size_t kend( ( IsUpper<MT5>::value )
2081  ?( ( IsLower<MT4>::value )
2082  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2083  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2084  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2085 
2086  SIMDType xmm1( (~C).load(i ,j ) );
2087  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2088  SIMDType xmm3( (~C).load(i ,j+1UL) );
2089  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2090 
2091  for( size_t k=kbegin; k<kend; ++k ) {
2092  const SIMDType a1( A.load(i ,k) );
2093  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2094  const SIMDType b1( set( B(k,j ) ) );
2095  const SIMDType b2( set( B(k,j+1UL) ) );
2096  xmm1 = xmm1 + a1 * b1;
2097  xmm2 = xmm2 + a2 * b1;
2098  xmm3 = xmm3 + a1 * b2;
2099  xmm4 = xmm4 + a2 * b2;
2100  }
2101 
2102  (~C).store( i , j , xmm1 );
2103  (~C).store( i+SIMDSIZE, j , xmm2 );
2104  (~C).store( i , j+1UL, xmm3 );
2105  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2106  }
2107 
// Single leftover column (odd N).
2108  if( j < N )
2109  {
2110  const size_t kbegin( ( IsLower<MT5>::value )
2111  ?( ( IsUpper<MT4>::value )
2112  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2113  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2114  :( IsUpper<MT4>::value ? i : 0UL ) );
2115  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2116 
2117  SIMDType xmm1( (~C).load(i ,j) );
2118  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
2119 
2120  for( size_t k=kbegin; k<kend; ++k ) {
2121  const SIMDType b1( set( B(k,j) ) );
2122  xmm1 = xmm1 + A.load(i ,k) * b1;
2123  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
2124  }
2125 
2126  (~C).store( i , j, xmm1 );
2127  (~C).store( i+SIMDSIZE, j, xmm2 );
2128  }
2129  }
2130 
// Remaining single SIMD vectors, two columns per iteration.
2131  for( ; i<ipos; i+=SIMDSIZE )
2132  {
2133  size_t j( 0UL );
2134 
2135  for( ; (j+2UL) <= N; j+=2UL )
2136  {
2137  const size_t kbegin( ( IsLower<MT5>::value )
2138  ?( ( IsUpper<MT4>::value )
2139  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2140  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2141  :( IsUpper<MT4>::value ? i : 0UL ) );
2142  const size_t kend( ( IsUpper<MT5>::value )
2143  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2144  :( K ) );
2145 
2146  SIMDType xmm1( (~C).load(i,j ) );
2147  SIMDType xmm2( (~C).load(i,j+1UL) );
2148 
2149  for( size_t k=kbegin; k<kend; ++k ) {
2150  const SIMDType a1( A.load(i,k) );
2151  xmm1 = xmm1 + a1 * set( B(k,j ) );
2152  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
2153  }
2154 
2155  (~C).store( i, j , xmm1 );
2156  (~C).store( i, j+1UL, xmm2 );
2157  }
2158 
// Single leftover column (odd N); the k loop runs up to K without an upper clip.
2159  if( j < N )
2160  {
2161  const size_t kbegin( ( IsLower<MT5>::value )
2162  ?( ( IsUpper<MT4>::value )
2163  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2164  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2165  :( IsUpper<MT4>::value ? i : 0UL ) );
2166 
2167  SIMDType xmm1( (~C).load(i,j) );
2168 
2169  for( size_t k=kbegin; k<K; ++k ) {
2170  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
2171  }
2172 
2173  (~C).store( i, j, xmm1 );
2174  }
2175  }
2176 
// Scalar clean-up for the rows in [ipos,M); skipped entirely when no remainder is needed.
2177  for( ; remainder && i<M; ++i )
2178  {
2179  size_t j( 0UL );
2180 
2181  for( ; (j+2UL) <= N; j+=2UL )
2182  {
2183  const size_t kbegin( ( IsLower<MT5>::value )
2184  ?( ( IsUpper<MT4>::value )
2185  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2186  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2187  :( IsUpper<MT4>::value ? i : 0UL ) );
2188  const size_t kend( ( IsUpper<MT5>::value )
2189  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2190  :( K ) );
2191 
2192  ElementType value1( (~C)(i,j ) );
2193  ElementType value2( (~C)(i,j+1UL) );
2194 
2195  for( size_t k=kbegin; k<kend; ++k ) {
2196  value1 += A(i,k) * B(k,j );
2197  value2 += A(i,k) * B(k,j+1UL);
2198  }
2199 
2200  (~C)(i,j ) = value1;
2201  (~C)(i,j+1UL) = value2;
2202  }
2203 
// Single leftover column (odd N).
2204  if( j < N )
2205  {
2206  const size_t kbegin( ( IsLower<MT5>::value )
2207  ?( ( IsUpper<MT4>::value )
2208  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2209  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2210  :( IsUpper<MT4>::value ? i : 0UL ) );
2211 
2212  ElementType value( (~C)(i,j) );
2213 
2214  for( size_t k=kbegin; k<K; ++k ) {
2215  value += A(i,k) * B(k,j);
2216  }
2217 
2218  (~C)(i,j) = value;
2219  }
2220  }
2221  }
2223  //**********************************************************************************************
2224 
2225  //**Default addition assignment to dense matrices (large matrices)******************************
2239  template< typename MT3 // Type of the left-hand side target matrix
2240  , typename MT4 // Type of the left-hand side matrix operand
2241  , typename MT5 > // Type of the right-hand side matrix operand
2242  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2243  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2244  {
2245  selectDefaultAddAssignKernel( C, A, B );
2246  }
2248  //**********************************************************************************************
2249 
2250  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
2265  template< typename MT3 // Type of the left-hand side target matrix
2266  , typename MT4 // Type of the left-hand side matrix operand
2267  , typename MT5 > // Type of the right-hand side matrix operand
2268  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2269  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2270  {
2271  selectSmallAddAssignKernel( ~C, A, B );
2272  }
2274  //**********************************************************************************************
2275 
2276  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
// Vectorized, blocked addition assignment kernel for column-major targets.
//
// C : target matrix; every processed element is loaded, incremented and stored back.
// A : left-hand side operand, read with SIMD loads along its rows index i.
// B : right-hand side operand, read element-wise and broadcast via set().
//
// The (i,j,k) iteration space is tiled with TDMATTDMATMULT_IBLOCK_SIZE,
// TDMATTDMATMULT_JBLOCK_SIZE and TDMATTDMATMULT_KBLOCK_SIZE. Inside one tile the rows are
// consumed in strips of four, two and one SIMD vectors, followed by a scalar loop for rows
// that do not fill a complete SIMD vector.
2291  template< typename MT3 // Type of the left-hand side target matrix
2292  , typename MT4 // Type of the left-hand side matrix operand
2293  , typename MT5 > // Type of the right-hand side matrix operand
2294  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2295  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2296  {
2297  const size_t M( A.rows() );
2298  const size_t N( B.columns() );
2299  const size_t K( A.columns() );
2300 
// A scalar tail loop is only required if C or A is not padded.
2301  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
2302 
// Outer tiling over the rows of the result.
2303  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
2304  {
2305  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
2306 
// End of the SIMD-processable row range of this tile: iend rounded down to a multiple of
// SIMDSIZE when a scalar tail is needed (see the assertion below), otherwise iend itself.
2307  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2308  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
2309 
// Tiling over the columns of the result.
2310  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
2311  {
2312  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
2313 
// Tiling over the inner (summation) dimension; ktmp is the exclusive end of the k-tile.
2314  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
2315  {
2316  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
2317 
2318  size_t i( ii );
2319 
// Strip of four SIMD vectors of rows; columns are handled in pairs plus one trailing column.
2320  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2321  {
2322  const size_t i1( i+SIMDSIZE );
2323  const size_t i2( i+SIMDSIZE*2UL );
2324  const size_t i3( i+SIMDSIZE*3UL );
2325 
2326  size_t j( jj );
2327 
2328  for( ; (j+2UL) <= jend; j+=2UL )
2329  {
// The k-range is clamped to the current k-tile and narrowed further according to the
// IsUpper/IsLower traits of A (row range i..i+4*SIMDSIZE) and B (columns j, j+1);
// presumably the skipped entries are structurally zero. If kbegin >= kend the loop is empty.
2330  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2331  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2332  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
2333  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2334 
// Start from the current contents of C so that the product is accumulated onto it.
2335  SIMDType xmm1( (~C).load(i ,j ) );
2336  SIMDType xmm2( (~C).load(i1,j ) );
2337  SIMDType xmm3( (~C).load(i2,j ) );
2338  SIMDType xmm4( (~C).load(i3,j ) );
2339  SIMDType xmm5( (~C).load(i ,j+1UL) );
2340  SIMDType xmm6( (~C).load(i1,j+1UL) );
2341  SIMDType xmm7( (~C).load(i2,j+1UL) );
2342  SIMDType xmm8( (~C).load(i3,j+1UL) );
2343 
2344  for( size_t k=kbegin; k<kend; ++k ) {
2345  const SIMDType a1( A.load(i ,k) );
2346  const SIMDType a2( A.load(i1,k) );
2347  const SIMDType a3( A.load(i2,k) );
2348  const SIMDType a4( A.load(i3,k) );
2349  const SIMDType b1( set( B(k,j ) ) );
2350  const SIMDType b2( set( B(k,j+1UL) ) );
2351  xmm1 = xmm1 + a1 * b1;
2352  xmm2 = xmm2 + a2 * b1;
2353  xmm3 = xmm3 + a3 * b1;
2354  xmm4 = xmm4 + a4 * b1;
2355  xmm5 = xmm5 + a1 * b2;
2356  xmm6 = xmm6 + a2 * b2;
2357  xmm7 = xmm7 + a3 * b2;
2358  xmm8 = xmm8 + a4 * b2;
2359  }
2360 
2361  (~C).store( i , j , xmm1 );
2362  (~C).store( i1, j , xmm2 );
2363  (~C).store( i2, j , xmm3 );
2364  (~C).store( i3, j , xmm4 );
2365  (~C).store( i , j+1UL, xmm5 );
2366  (~C).store( i1, j+1UL, xmm6 );
2367  (~C).store( i2, j+1UL, xmm7 );
2368  (~C).store( i3, j+1UL, xmm8 );
2369  }
2370 
// Trailing single column of the j-tile (odd tile width).
2371  if( j < jend )
2372  {
2373  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2374  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2375  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
2376  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2377 
2378  SIMDType xmm1( (~C).load(i ,j) );
2379  SIMDType xmm2( (~C).load(i1,j) );
2380  SIMDType xmm3( (~C).load(i2,j) );
2381  SIMDType xmm4( (~C).load(i3,j) );
2382 
2383  for( size_t k=kbegin; k<kend; ++k ) {
2384  const SIMDType b1( set( B(k,j) ) );
2385  xmm1 = xmm1 + A.load(i ,k) * b1;
2386  xmm2 = xmm2 + A.load(i1,k) * b1;
2387  xmm3 = xmm3 + A.load(i2,k) * b1;
2388  xmm4 = xmm4 + A.load(i3,k) * b1;
2389  }
2390 
2391  (~C).store( i , j, xmm1 );
2392  (~C).store( i1, j, xmm2 );
2393  (~C).store( i2, j, xmm3 );
2394  (~C).store( i3, j, xmm4 );
2395  }
2396  }
2397 
// Strip of two SIMD vectors of rows; columns are handled in groups of four, then two, then one.
2398  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2399  {
2400  const size_t i1( i+SIMDSIZE );
2401 
2402  size_t j( jj );
2403 
2404  for( ; (j+4UL) <= jend; j+=4UL )
2405  {
2406  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2407  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2408  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2409  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
2410 
2411  SIMDType xmm1( (~C).load(i ,j ) );
2412  SIMDType xmm2( (~C).load(i1,j ) );
2413  SIMDType xmm3( (~C).load(i ,j+1UL) );
2414  SIMDType xmm4( (~C).load(i1,j+1UL) );
2415  SIMDType xmm5( (~C).load(i ,j+2UL) );
2416  SIMDType xmm6( (~C).load(i1,j+2UL) );
2417  SIMDType xmm7( (~C).load(i ,j+3UL) );
2418  SIMDType xmm8( (~C).load(i1,j+3UL) );
2419 
2420  for( size_t k=kbegin; k<kend; ++k ) {
2421  const SIMDType a1( A.load(i ,k) );
2422  const SIMDType a2( A.load(i1,k) );
2423  const SIMDType b1( set( B(k,j ) ) );
2424  const SIMDType b2( set( B(k,j+1UL) ) );
2425  const SIMDType b3( set( B(k,j+2UL) ) );
2426  const SIMDType b4( set( B(k,j+3UL) ) );
2427  xmm1 = xmm1 + a1 * b1;
2428  xmm2 = xmm2 + a2 * b1;
2429  xmm3 = xmm3 + a1 * b2;
2430  xmm4 = xmm4 + a2 * b2;
2431  xmm5 = xmm5 + a1 * b3;
2432  xmm6 = xmm6 + a2 * b3;
2433  xmm7 = xmm7 + a1 * b4;
2434  xmm8 = xmm8 + a2 * b4;
2435  }
2436 
2437  (~C).store( i , j , xmm1 );
2438  (~C).store( i1, j , xmm2 );
2439  (~C).store( i , j+1UL, xmm3 );
2440  (~C).store( i1, j+1UL, xmm4 );
2441  (~C).store( i , j+2UL, xmm5 );
2442  (~C).store( i1, j+2UL, xmm6 );
2443  (~C).store( i , j+3UL, xmm7 );
2444  (~C).store( i1, j+3UL, xmm8 );
2445  }
2446 
2447  for( ; (j+2UL) <= jend; j+=2UL )
2448  {
2449  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2450  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2451  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2452  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2453 
2454  SIMDType xmm1( (~C).load(i ,j ) );
2455  SIMDType xmm2( (~C).load(i1,j ) );
2456  SIMDType xmm3( (~C).load(i ,j+1UL) );
2457  SIMDType xmm4( (~C).load(i1,j+1UL) );
2458 
2459  for( size_t k=kbegin; k<kend; ++k ) {
2460  const SIMDType a1( A.load(i ,k) );
2461  const SIMDType a2( A.load(i1,k) );
2462  const SIMDType b1( set( B(k,j ) ) );
2463  const SIMDType b2( set( B(k,j+1UL) ) );
2464  xmm1 = xmm1 + a1 * b1;
2465  xmm2 = xmm2 + a2 * b1;
2466  xmm3 = xmm3 + a1 * b2;
2467  xmm4 = xmm4 + a2 * b2;
2468  }
2469 
2470  (~C).store( i , j , xmm1 );
2471  (~C).store( i1, j , xmm2 );
2472  (~C).store( i , j+1UL, xmm3 );
2473  (~C).store( i1, j+1UL, xmm4 );
2474  }
2475 
2476  if( j < jend )
2477  {
2478  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2479  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2480  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2481  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2482 
2483  SIMDType xmm1( (~C).load(i ,j) );
2484  SIMDType xmm2( (~C).load(i1,j) );
2485 
2486  for( size_t k=kbegin; k<kend; ++k ) {
2487  const SIMDType b1( set( B(k,j) ) );
2488  xmm1 = xmm1 + A.load(i ,k) * b1;
2489  xmm2 = xmm2 + A.load(i1,k) * b1;
2490  }
2491 
2492  (~C).store( i , j, xmm1 );
2493  (~C).store( i1, j, xmm2 );
2494  }
2495  }
2496 
// Strip of a single SIMD vector of rows, one column at a time.
2497  for( ; i<ipos; i+=SIMDSIZE )
2498  {
2499  for( size_t j=jj; j<jend; ++j )
2500  {
2501  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2502  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2503  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
2504  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2505 
2506  SIMDType xmm1( (~C).load(i,j) );
2507 
2508  for( size_t k=kbegin; k<kend; ++k ) {
2509  const SIMDType b1( set( B(k,j) ) );
2510  xmm1 = xmm1 + A.load(i,k) * b1;
2511  }
2512 
2513  (~C).store( i, j, xmm1 );
2514  }
2515  }
2516 
// Scalar tail for the rows between ipos and iend; only entered when 'remainder' is true.
2517  for( ; remainder && i<iend; ++i )
2518  {
2519  for( size_t j=jj; j<jend; ++j )
2520  {
2521  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2522  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2523  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
2524  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2525 
2526  ElementType value( (~C)(i,j) );
2527 
2528  for( size_t k=kbegin; k<kend; ++k ) {
2529  value += A(i,k) * B(k,j);
2530  }
2531 
2532  (~C)(i,j) = value;
2533  }
2534  }
2535  }
2536  }
2537  }
2538  }
2540  //**********************************************************************************************
2541 
2542  //**BLAS-based addition assignment to dense matrices (default)**********************************
2556  template< typename MT3 // Type of the left-hand side target matrix
2557  , typename MT4 // Type of the left-hand side matrix operand
2558  , typename MT5 > // Type of the right-hand side matrix operand
2559  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
2560  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2561  {
2562  selectLargeAddAssignKernel( C, A, B );
2563  }
2565  //**********************************************************************************************
2566 
2567  //**BLAS-based addition assignment to dense matrices********************************************
2568 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2569 
2582  template< typename MT3 // Type of the left-hand side target matrix
2583  , typename MT4 // Type of the left-hand side matrix operand
2584  , typename MT5 > // Type of the right-hand side matrix operand
2585  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
2586  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2587  {
2588  typedef ElementType_<MT3> ET;
2589 
2590  if( IsTriangular<MT4>::value ) {
2591  ResultType_<MT3> tmp( serial( B ) );
2592  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2593  addAssign( C, tmp );
2594  }
2595  else if( IsTriangular<MT5>::value ) {
2596  ResultType_<MT3> tmp( serial( A ) );
2597  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2598  addAssign( C, tmp );
2599  }
2600  else {
2601  gemm( C, A, B, ET(1), ET(1) );
2602  }
2603  }
2605 #endif
2606  //**********************************************************************************************
2607 
2608  //**Restructuring addition assignment to row-major matrices*************************************
2623  template< typename MT > // Type of the target matrix
2624  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
2625  addAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
2626  {
2628 
2630 
2631  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2632  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2633 
2634  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
2635  addAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
2636  else if( IsSymmetric<MT1>::value )
2637  addAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
2638  else
2639  addAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
2640  }
2642  //**********************************************************************************************
2643 
2644  //**Addition assignment to sparse matrices******************************************************
2645  // No special implementation for the addition assignment to sparse matrices.
2646  //**********************************************************************************************
2647 
2648  //**Subtraction assignment to dense matrices****************************************************
2661  template< typename MT // Type of the target dense matrix
2662  , bool SO > // Storage order of the target dense matrix
2663  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
2664  subAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
2665  {
2667 
2668  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2669  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2670 
2671  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2672  return;
2673  }
2674 
2675  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2676  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2677 
2678  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2679  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2680  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2681  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2682  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2683  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2684 
2685  TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
2686  }
2688  //**********************************************************************************************
2689 
2690  //**Subtraction assignment to dense matrices (kernel selection)*********************************
2701  template< typename MT3 // Type of the left-hand side target matrix
2702  , typename MT4 // Type of the left-hand side matrix operand
2703  , typename MT5 > // Type of the right-hand side matrix operand
2704  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2705  {
2706  if( ( IsDiagonal<MT4>::value ) ||
2707  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
2708  selectSmallSubAssignKernel( C, A, B );
2709  else
2710  selectBlasSubAssignKernel( C, A, B );
2711  }
2713  //**********************************************************************************************
2714 
2715  //**Default subtraction assignment to dense matrices (general/general)**************************
2729  template< typename MT3 // Type of the left-hand side target matrix
2730  , typename MT4 // Type of the left-hand side matrix operand
2731  , typename MT5 > // Type of the right-hand side matrix operand
2732  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2733  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2734  {
2735  const size_t M( A.rows() );
2736  const size_t N( B.columns() );
2737  const size_t K( A.columns() );
2738 
2739  for( size_t j=0UL; j<N; ++j )
2740  {
2741  const size_t kbegin( ( IsLower<MT5>::value )
2742  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2743  :( 0UL ) );
2744  const size_t kend( ( IsUpper<MT5>::value )
2745  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2746  :( K ) );
2747  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2748 
2749  for( size_t k=kbegin; k<kend; ++k )
2750  {
2751  const size_t ibegin( ( IsLower<MT4>::value )
2752  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2753  :( 0UL ) );
2754  const size_t iend( ( IsUpper<MT4>::value )
2755  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
2756  :( M ) );
2757  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2758 
2759  const size_t inum( iend - ibegin );
2760  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2761 
2762  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2763  C(i ,j) -= A(i ,k) * B(k,j);
2764  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
2765  }
2766  if( ipos < iend ) {
2767  C(ipos,j) -= A(ipos,k) * B(k,j);
2768  }
2769  }
2770  }
2771  }
2773  //**********************************************************************************************
2774 
2775  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
2789  template< typename MT3 // Type of the left-hand side target matrix
2790  , typename MT4 // Type of the left-hand side matrix operand
2791  , typename MT5 > // Type of the right-hand side matrix operand
2792  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2793  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2794  {
2796 
2797  const size_t M( A.rows() );
2798  const size_t N( B.columns() );
2799 
2800  for( size_t j=0UL; j<N; ++j )
2801  {
2802  const size_t ibegin( ( IsLower<MT4>::value )
2803  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2804  :( 0UL ) );
2805  const size_t iend( ( IsUpper<MT4>::value )
2806  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
2807  :( M ) );
2808  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2809 
2810  const size_t inum( iend - ibegin );
2811  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2812 
2813  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2814  C(i ,j) -= A(i ,j) * B(j,j);
2815  C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
2816  }
2817  if( ipos < iend ) {
2818  C(ipos,j) -= A(ipos,j) * B(j,j);
2819  }
2820  }
2821  }
2823  //**********************************************************************************************
2824 
2825  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
2839  template< typename MT3 // Type of the left-hand side target matrix
2840  , typename MT4 // Type of the left-hand side matrix operand
2841  , typename MT5 > // Type of the right-hand side matrix operand
2842  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2843  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2844  {
2846 
2847  const size_t M( A.rows() );
2848  const size_t N( B.columns() );
2849 
2850  for( size_t j=0UL; j<N; ++j )
2851  {
2852  const size_t ibegin( ( IsLower<MT5>::value )
2853  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2854  :( 0UL ) );
2855  const size_t iend( ( IsUpper<MT5>::value )
2856  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2857  :( M ) );
2858  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2859 
2860  const size_t inum( iend - ibegin );
2861  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2862 
2863  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2864  C(i ,j) -= A(i ,i ) * B(i ,j);
2865  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
2866  }
2867  if( ipos < iend ) {
2868  C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
2869  }
2870  }
2871  }
2873  //**********************************************************************************************
2874 
2875  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
2889  template< typename MT3 // Type of the left-hand side target matrix
2890  , typename MT4 // Type of the left-hand side matrix operand
2891  , typename MT5 > // Type of the right-hand side matrix operand
2892  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2893  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2894  {
2896 
2897  for( size_t i=0UL; i<A.rows(); ++i ) {
2898  C(i,i) -= A(i,i) * B(i,i);
2899  }
2900  }
2902  //**********************************************************************************************
2903 
2904  //**Default subtraction assignment to dense matrices (small matrices)***************************
2918  template< typename MT3 // Type of the left-hand side target matrix
2919  , typename MT4 // Type of the left-hand side matrix operand
2920  , typename MT5 > // Type of the right-hand side matrix operand
2921  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2922  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2923  {
2924  selectDefaultSubAssignKernel( C, A, B );
2925  }
2927  //**********************************************************************************************
2928 
2929  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
2944  template< typename MT3 // Type of the left-hand side target matrix
2945  , typename MT4 // Type of the left-hand side matrix operand
2946  , typename MT5 > // Type of the right-hand side matrix operand
2947  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2948  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2949  {
2952  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT4> );
2953  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT5> );
2954 
2955  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2956  const OppositeType_<MT5> tmp( serial( B ) );
2957  subAssign( ~C, A * tmp );
2958  }
2959  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2960  const OppositeType_<MT4> tmp( serial( A ) );
2961  subAssign( ~C, tmp * B );
2962  }
2963  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2964  const OppositeType_<MT5> tmp( serial( B ) );
2965  subAssign( ~C, A * tmp );
2966  }
2967  else {
2968  const OppositeType_<MT4> tmp( serial( A ) );
2969  subAssign( ~C, tmp * B );
2970  }
2971  }
2973  //**********************************************************************************************
2974 
2975  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Vectorized subtraction assignment kernel for small matrices (column-major target).
//
// C : target matrix; every processed element is loaded, decremented and stored back.
// A : left-hand side operand, read with SIMD loads along its row index i.
// B : right-hand side operand, read element-wise and broadcast via set().
//
// The rows are consumed in strips of eight, four, two and one SIMD vectors, followed by a
// scalar loop for the rows that do not fill a complete SIMD vector. In every strip the range
// of the summation index k is narrowed according to the IsLower/IsUpper (and strict) traits of
// A and B; presumably the skipped entries are structurally zero.
2990  template< typename MT3 // Type of the left-hand side target matrix
2991  , typename MT4 // Type of the left-hand side matrix operand
2992  , typename MT5 > // Type of the right-hand side matrix operand
2993  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2994  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2995  {
2996  const size_t M( A.rows() );
2997  const size_t N( B.columns() );
2998  const size_t K( A.columns() );
2999 
// A scalar tail loop is only required if C or A is not padded.
3000  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3001 
// End of the SIMD-processable row range: M rounded down to a multiple of SIMDSIZE when a
// scalar tail is needed (see the assertion below), otherwise M itself.
3002  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3003  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3004 
3005  size_t i( 0UL );
3006 
// Strip of eight SIMD vectors of rows, one column at a time.
3007  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3008  for( size_t j=0UL; j<N; ++j )
3009  {
3010  const size_t kbegin( ( IsLower<MT5>::value )
3011  ?( ( IsUpper<MT4>::value )
3012  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3013  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3014  :( IsUpper<MT4>::value ? i : 0UL ) );
3015  const size_t kend( ( IsUpper<MT5>::value )
3016  ?( ( IsLower<MT4>::value )
3017  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3018  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3019  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
3020 
// Start from the current contents of C so that the product is subtracted from it.
3021  SIMDType xmm1( (~C).load(i ,j) );
3022  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3023  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3024  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3025  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3026  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3027  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3028  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3029 
3030  for( size_t k=kbegin; k<kend; ++k ) {
3031  const SIMDType b1( set( B(k,j) ) );
3032  xmm1 = xmm1 - A.load(i ,k) * b1;
3033  xmm2 = xmm2 - A.load(i+SIMDSIZE ,k) * b1;
3034  xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,k) * b1;
3035  xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,k) * b1;
3036  xmm5 = xmm5 - A.load(i+SIMDSIZE*4UL,k) * b1;
3037  xmm6 = xmm6 - A.load(i+SIMDSIZE*5UL,k) * b1;
3038  xmm7 = xmm7 - A.load(i+SIMDSIZE*6UL,k) * b1;
3039  xmm8 = xmm8 - A.load(i+SIMDSIZE*7UL,k) * b1;
3040  }
3041 
3042  (~C).store( i , j, xmm1 );
3043  (~C).store( i+SIMDSIZE , j, xmm2 );
3044  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3045  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3046  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3047  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3048  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3049  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
3050  }
3051  }
3052 
// Strip of four SIMD vectors of rows; columns are handled in pairs plus one trailing column.
3053  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3054  {
3055  size_t j( 0UL );
3056 
3057  for( ; (j+2UL) <= N; j+=2UL )
3058  {
3059  const size_t kbegin( ( IsLower<MT5>::value )
3060  ?( ( IsUpper<MT4>::value )
3061  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3062  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3063  :( IsUpper<MT4>::value ? i : 0UL ) );
3064  const size_t kend( ( IsUpper<MT5>::value )
3065  ?( ( IsLower<MT4>::value )
3066  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3067  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3068  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
3069 
3070  SIMDType xmm1( (~C).load(i ,j ) );
3071  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3072  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3073  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3074  SIMDType xmm5( (~C).load(i ,j+1UL) );
3075  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3076  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3077  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3078 
3079  for( size_t k=kbegin; k<kend; ++k ) {
3080  const SIMDType a1( A.load(i ,k) );
3081  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3082  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3083  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3084  const SIMDType b1( set( B(k,j ) ) );
3085  const SIMDType b2( set( B(k,j+1UL) ) );
3086  xmm1 = xmm1 - a1 * b1;
3087  xmm2 = xmm2 - a2 * b1;
3088  xmm3 = xmm3 - a3 * b1;
3089  xmm4 = xmm4 - a4 * b1;
3090  xmm5 = xmm5 - a1 * b2;
3091  xmm6 = xmm6 - a2 * b2;
3092  xmm7 = xmm7 - a3 * b2;
3093  xmm8 = xmm8 - a4 * b2;
3094  }
3095 
3096  (~C).store( i , j , xmm1 );
3097  (~C).store( i+SIMDSIZE , j , xmm2 );
3098  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3099  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3100  (~C).store( i , j+1UL, xmm5 );
3101  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3102  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3103  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3104  }
3105 
// Trailing single column (odd N).
// NOTE(review): kend is not narrowed for an upper B here, unlike the paired-column loop above;
// presumably still correct because the extra entries of B are zero - confirm.
3106  if( j < N )
3107  {
3108  const size_t kbegin( ( IsLower<MT5>::value )
3109  ?( ( IsUpper<MT4>::value )
3110  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3111  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3112  :( IsUpper<MT4>::value ? i : 0UL ) );
3113  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3114 
3115  SIMDType xmm1( (~C).load(i ,j) );
3116  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3117  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3118  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3119 
3120  for( size_t k=kbegin; k<kend; ++k ) {
3121  const SIMDType b1( set( B(k,j) ) );
3122  xmm1 = xmm1 - A.load(i ,k) * b1;
3123  xmm2 = xmm2 - A.load(i+SIMDSIZE ,k) * b1;
3124  xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,k) * b1;
3125  xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,k) * b1;
3126  }
3127 
3128  (~C).store( i , j, xmm1 );
3129  (~C).store( i+SIMDSIZE , j, xmm2 );
3130  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3131  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3132  }
3133  }
3134 
// Strip of two SIMD vectors of rows; columns are handled in pairs plus one trailing column.
3135  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3136  {
3137  size_t j( 0UL );
3138 
3139  for( ; (j+2UL) <= N; j+=2UL )
3140  {
3141  const size_t kbegin( ( IsLower<MT5>::value )
3142  ?( ( IsUpper<MT4>::value )
3143  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3144  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3145  :( IsUpper<MT4>::value ? i : 0UL ) );
3146  const size_t kend( ( IsUpper<MT5>::value )
3147  ?( ( IsLower<MT4>::value )
3148  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3149  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3150  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
3151 
3152  SIMDType xmm1( (~C).load(i ,j ) );
3153  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3154  SIMDType xmm3( (~C).load(i ,j+1UL) );
3155  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3156 
3157  for( size_t k=kbegin; k<kend; ++k ) {
3158  const SIMDType a1( A.load(i ,k) );
3159  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3160  const SIMDType b1( set( B(k,j ) ) );
3161  const SIMDType b2( set( B(k,j+1UL) ) );
3162  xmm1 = xmm1 - a1 * b1;
3163  xmm2 = xmm2 - a2 * b1;
3164  xmm3 = xmm3 - a1 * b2;
3165  xmm4 = xmm4 - a2 * b2;
3166  }
3167 
3168  (~C).store( i , j , xmm1 );
3169  (~C).store( i+SIMDSIZE, j , xmm2 );
3170  (~C).store( i , j+1UL, xmm3 );
3171  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3172  }
3173 
// Trailing single column (odd N); kend again only depends on the lower trait of A.
3174  if( j < N )
3175  {
3176  const size_t kbegin( ( IsLower<MT5>::value )
3177  ?( ( IsUpper<MT4>::value )
3178  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3179  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3180  :( IsUpper<MT4>::value ? i : 0UL ) );
3181  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
3182 
3183  SIMDType xmm1( (~C).load(i ,j) );
3184  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3185 
3186  for( size_t k=kbegin; k<kend; ++k ) {
3187  const SIMDType b1( set( B(k,j) ) );
3188  xmm1 = xmm1 - A.load(i ,k) * b1;
3189  xmm2 = xmm2 - A.load(i+SIMDSIZE,k) * b1;
3190  }
3191 
3192  (~C).store( i , j, xmm1 );
3193  (~C).store( i+SIMDSIZE, j, xmm2 );
3194  }
3195  }
3196 
// Strip of a single SIMD vector of rows; columns are handled in pairs plus one trailing column.
// NOTE(review): in this strip kend is not narrowed for a lower A, unlike the wider strips;
// presumably still correct because the extra entries of A are zero - confirm.
3197  for( ; i<ipos; i+=SIMDSIZE )
3198  {
3199  size_t j( 0UL );
3200 
3201  for( ; (j+2UL) <= N; j+=2UL )
3202  {
3203  const size_t kbegin( ( IsLower<MT5>::value )
3204  ?( ( IsUpper<MT4>::value )
3205  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3206  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3207  :( IsUpper<MT4>::value ? i : 0UL ) );
3208  const size_t kend( ( IsUpper<MT5>::value )
3209  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3210  :( K ) );
3211 
3212  SIMDType xmm1( (~C).load(i,j ) );
3213  SIMDType xmm2( (~C).load(i,j+1UL) );
3214 
3215  for( size_t k=kbegin; k<kend; ++k ) {
3216  const SIMDType a1( A.load(i,k) );
3217  xmm1 = xmm1 - a1 * set( B(k,j ) );
3218  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
3219  }
3220 
3221  (~C).store( i, j , xmm1 );
3222  (~C).store( i, j+1UL, xmm2 );
3223  }
3224 
// Trailing single column (odd N); the summation runs up to K without further narrowing.
3225  if( j < N )
3226  {
3227  const size_t kbegin( ( IsLower<MT5>::value )
3228  ?( ( IsUpper<MT4>::value )
3229  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3230  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3231  :( IsUpper<MT4>::value ? i : 0UL ) );
3232 
3233  SIMDType xmm1( (~C).load(i,j) );
3234 
3235  for( size_t k=kbegin; k<K; ++k ) {
3236  xmm1 = xmm1 - A.load(i,k) * set( B(k,j) );
3237  }
3238 
3239  (~C).store( i, j, xmm1 );
3240  }
3241  }
3242 
// Scalar tail for the rows between ipos and M; only entered when 'remainder' is true.
3243  for( ; remainder && i<M; ++i )
3244  {
3245  size_t j( 0UL );
3246 
3247  for( ; (j+2UL) <= N; j+=2UL )
3248  {
3249  const size_t kbegin( ( IsLower<MT5>::value )
3250  ?( ( IsUpper<MT4>::value )
3251  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3252  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3253  :( IsUpper<MT4>::value ? i : 0UL ) );
3254  const size_t kend( ( IsUpper<MT5>::value )
3255  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3256  :( K ) );
3257 
3258  ElementType value1( (~C)(i,j ) );
3259  ElementType value2( (~C)(i,j+1UL) );
3260 
3261  for( size_t k=kbegin; k<kend; ++k ) {
3262  value1 -= A(i,k) * B(k,j );
3263  value2 -= A(i,k) * B(k,j+1UL);
3264  }
3265 
3266  (~C)(i,j ) = value1;
3267  (~C)(i,j+1UL) = value2;
3268  }
3269 
// Trailing single column (odd N) of the scalar tail.
3270  if( j < N )
3271  {
3272  const size_t kbegin( ( IsLower<MT5>::value )
3273  ?( ( IsUpper<MT4>::value )
3274  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3275  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3276  :( IsUpper<MT4>::value ? i : 0UL ) );
3277 
3278  ElementType value( (~C)(i,j) );
3279 
3280  for( size_t k=kbegin; k<K; ++k ) {
3281  value -= A(i,k) * B(k,j);
3282  }
3283 
3284  (~C)(i,j) = value;
3285  }
3286  }
3287  }
3289  //**********************************************************************************************
3290 
3291  //**Default subtraction assignment to dense matrices (large matrices)***************************
3305  template< typename MT3 // Type of the left-hand side target matrix
3306  , typename MT4 // Type of the left-hand side matrix operand
3307  , typename MT5 > // Type of the right-hand side matrix operand
3308  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3309  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3310  {
3311  selectDefaultSubAssignKernel( C, A, B );
3312  }
3314  //**********************************************************************************************
3315 
3316  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
3331  template< typename MT3 // Type of the left-hand side target matrix
3332  , typename MT4 // Type of the left-hand side matrix operand
3333  , typename MT5 > // Type of the right-hand side matrix operand
3334  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3335  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3336  {
3337  selectSmallSubAssignKernel( ~C, A, B );
3338  }
3340  //**********************************************************************************************
3341 
3342  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
3357  template< typename MT3 // Type of the left-hand side target matrix
3358  , typename MT4 // Type of the left-hand side matrix operand
3359  , typename MT5 > // Type of the right-hand side matrix operand
3360  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3361  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3362  {
3363  const size_t M( A.rows() );
3364  const size_t N( B.columns() );
3365  const size_t K( A.columns() );
3366 
3367  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
3368 
3369  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
3370  {
3371  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
3372 
3373  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3374  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3375 
3376  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
3377  {
3378  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
3379 
3380  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
3381  {
3382  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
3383 
3384  size_t i( ii );
3385 
3386  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3387  {
3388  const size_t i1( i+SIMDSIZE );
3389  const size_t i2( i+SIMDSIZE*2UL );
3390  const size_t i3( i+SIMDSIZE*3UL );
3391 
3392  size_t j( jj );
3393 
3394  for( ; (j+2UL) <= jend; j+=2UL )
3395  {
3396  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3397  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3398  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
3399  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3400 
3401  SIMDType xmm1( (~C).load(i ,j ) );
3402  SIMDType xmm2( (~C).load(i1,j ) );
3403  SIMDType xmm3( (~C).load(i2,j ) );
3404  SIMDType xmm4( (~C).load(i3,j ) );
3405  SIMDType xmm5( (~C).load(i ,j+1UL) );
3406  SIMDType xmm6( (~C).load(i1,j+1UL) );
3407  SIMDType xmm7( (~C).load(i2,j+1UL) );
3408  SIMDType xmm8( (~C).load(i3,j+1UL) );
3409 
3410  for( size_t k=kbegin; k<kend; ++k ) {
3411  const SIMDType a1( A.load(i ,k) );
3412  const SIMDType a2( A.load(i1,k) );
3413  const SIMDType a3( A.load(i2,k) );
3414  const SIMDType a4( A.load(i3,k) );
3415  const SIMDType b1( set( B(k,j ) ) );
3416  const SIMDType b2( set( B(k,j+1UL) ) );
3417  xmm1 = xmm1 - a1 * b1;
3418  xmm2 = xmm2 - a2 * b1;
3419  xmm3 = xmm3 - a3 * b1;
3420  xmm4 = xmm4 - a4 * b1;
3421  xmm5 = xmm5 - a1 * b2;
3422  xmm6 = xmm6 - a2 * b2;
3423  xmm7 = xmm7 - a3 * b2;
3424  xmm8 = xmm8 - a4 * b2;
3425  }
3426 
3427  (~C).store( i , j , xmm1 );
3428  (~C).store( i1, j , xmm2 );
3429  (~C).store( i2, j , xmm3 );
3430  (~C).store( i3, j , xmm4 );
3431  (~C).store( i , j+1UL, xmm5 );
3432  (~C).store( i1, j+1UL, xmm6 );
3433  (~C).store( i2, j+1UL, xmm7 );
3434  (~C).store( i3, j+1UL, xmm8 );
3435  }
3436 
3437  if( j < jend )
3438  {
3439  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3440  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3441  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
3442  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3443 
3444  SIMDType xmm1( (~C).load(i ,j) );
3445  SIMDType xmm2( (~C).load(i1,j) );
3446  SIMDType xmm3( (~C).load(i2,j) );
3447  SIMDType xmm4( (~C).load(i3,j) );
3448 
3449  for( size_t k=kbegin; k<kend; ++k ) {
3450  const SIMDType b1( set( B(k,j) ) );
3451  xmm1 = xmm1 - A.load(i ,k) * b1;
3452  xmm2 = xmm2 - A.load(i1,k) * b1;
3453  xmm3 = xmm3 - A.load(i2,k) * b1;
3454  xmm4 = xmm4 - A.load(i3,k) * b1;
3455  }
3456 
3457  (~C).store( i , j, xmm1 );
3458  (~C).store( i1, j, xmm2 );
3459  (~C).store( i2, j, xmm3 );
3460  (~C).store( i3, j, xmm4 );
3461  }
3462  }
3463 
3464  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3465  {
3466  const size_t i1( i+SIMDSIZE );
3467 
3468  size_t j( jj );
3469 
3470  for( ; (j+4UL) <= jend; j+=4UL )
3471  {
3472  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3473  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3474  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3475  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3476 
3477  SIMDType xmm1( (~C).load(i ,j ) );
3478  SIMDType xmm2( (~C).load(i1,j ) );
3479  SIMDType xmm3( (~C).load(i ,j+1UL) );
3480  SIMDType xmm4( (~C).load(i1,j+1UL) );
3481  SIMDType xmm5( (~C).load(i ,j+2UL) );
3482  SIMDType xmm6( (~C).load(i1,j+2UL) );
3483  SIMDType xmm7( (~C).load(i ,j+3UL) );
3484  SIMDType xmm8( (~C).load(i1,j+3UL) );
3485 
3486  for( size_t k=kbegin; k<kend; ++k ) {
3487  const SIMDType a1( A.load(i ,k) );
3488  const SIMDType a2( A.load(i1,k) );
3489  const SIMDType b1( set( B(k,j ) ) );
3490  const SIMDType b2( set( B(k,j+1UL) ) );
3491  const SIMDType b3( set( B(k,j+2UL) ) );
3492  const SIMDType b4( set( B(k,j+3UL) ) );
3493  xmm1 = xmm1 - a1 * b1;
3494  xmm2 = xmm2 - a2 * b1;
3495  xmm3 = xmm3 - a1 * b2;
3496  xmm4 = xmm4 - a2 * b2;
3497  xmm5 = xmm5 - a1 * b3;
3498  xmm6 = xmm6 - a2 * b3;
3499  xmm7 = xmm7 - a1 * b4;
3500  xmm8 = xmm8 - a2 * b4;
3501  }
3502 
3503  (~C).store( i , j , xmm1 );
3504  (~C).store( i1, j , xmm2 );
3505  (~C).store( i , j+1UL, xmm3 );
3506  (~C).store( i1, j+1UL, xmm4 );
3507  (~C).store( i , j+2UL, xmm5 );
3508  (~C).store( i1, j+2UL, xmm6 );
3509  (~C).store( i , j+3UL, xmm7 );
3510  (~C).store( i1, j+3UL, xmm8 );
3511  }
3512 
3513  for( ; (j+2UL) <= jend; j+=2UL )
3514  {
3515  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3516  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3517  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3518  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3519 
3520  SIMDType xmm1( (~C).load(i ,j ) );
3521  SIMDType xmm2( (~C).load(i1,j ) );
3522  SIMDType xmm3( (~C).load(i ,j+1UL) );
3523  SIMDType xmm4( (~C).load(i1,j+1UL) );
3524 
3525  for( size_t k=kbegin; k<kend; ++k ) {
3526  const SIMDType a1( A.load(i ,k) );
3527  const SIMDType a2( A.load(i1,k) );
3528  const SIMDType b1( set( B(k,j ) ) );
3529  const SIMDType b2( set( B(k,j+1UL) ) );
3530  xmm1 = xmm1 - a1 * b1;
3531  xmm2 = xmm2 - a2 * b1;
3532  xmm3 = xmm3 - a1 * b2;
3533  xmm4 = xmm4 - a2 * b2;
3534  }
3535 
3536  (~C).store( i , j , xmm1 );
3537  (~C).store( i1, j , xmm2 );
3538  (~C).store( i , j+1UL, xmm3 );
3539  (~C).store( i1, j+1UL, xmm4 );
3540  }
3541 
3542  if( j < jend )
3543  {
3544  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3545  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3546  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3547  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3548 
3549  SIMDType xmm1( (~C).load(i ,j) );
3550  SIMDType xmm2( (~C).load(i1,j) );
3551 
3552  for( size_t k=kbegin; k<kend; ++k ) {
3553  const SIMDType b1( set( B(k,j) ) );
3554  xmm1 = xmm1 - A.load(i ,k) * b1;
3555  xmm2 = xmm2 - A.load(i1,k) * b1;
3556  }
3557 
3558  (~C).store( i , j, xmm1 );
3559  (~C).store( i1, j, xmm2 );
3560  }
3561  }
3562 
3563  for( ; i<ipos; i+=SIMDSIZE )
3564  {
3565  for( size_t j=jj; j<jend; ++j )
3566  {
3567  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3568  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3569  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
3570  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3571 
3572  SIMDType xmm1( (~C).load(i,j) );
3573 
3574  for( size_t k=kbegin; k<kend; ++k ) {
3575  const SIMDType b1( set( B(k,j) ) );
3576  xmm1 = xmm1 - A.load(i,k) * b1;
3577  }
3578 
3579  (~C).store( i, j, xmm1 );
3580  }
3581  }
3582 
3583  for( ; remainder && i<iend; ++i )
3584  {
3585  for( size_t j=jj; j<jend; ++j )
3586  {
3587  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3588  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3589  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
3590  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3591 
3592  ElementType value( (~C)(i,j) );
3593 
3594  for( size_t k=kbegin; k<kend; ++k ) {
3595  value -= A(i,k) * B(k,j);
3596  }
3597 
3598  (~C)(i,j) = value;
3599  }
3600  }
3601  }
3602  }
3603  }
3604  }
3606  //**********************************************************************************************
3607 
3608  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
3622  template< typename MT3 // Type of the left-hand side target matrix
3623  , typename MT4 // Type of the left-hand side matrix operand
3624  , typename MT5 > // Type of the right-hand side matrix operand
3625  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
3626  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3627  {
3628  selectLargeSubAssignKernel( C, A, B );
3629  }
3631  //**********************************************************************************************
3632 
3633 //**BLAS-based subtraction assignment to dense matrices*****************************************
3634 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
3635 
3648  template< typename MT3 // Type of the left-hand side target matrix
3649  , typename MT4 // Type of the left-hand side matrix operand
3650  , typename MT5 > // Type of the right-hand side matrix operand
3651  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
3652  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3653  {
3654  typedef ElementType_<MT3> ET;
3655 
3656  if( IsTriangular<MT4>::value ) {
3657  ResultType_<MT3> tmp( serial( B ) );
3658  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3659  subAssign( C, tmp );
3660  }
3661  else if( IsTriangular<MT5>::value ) {
3662  ResultType_<MT3> tmp( serial( A ) );
3663  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3664  subAssign( C, tmp );
3665  }
3666  else {
3667  gemm( C, A, B, ET(-1), ET(1) );
3668  }
3669  }
3671 #endif
3672  //**********************************************************************************************
3673 
3674  //**Restructuring subtraction assignment to row-major matrices**********************************
3690  template< typename MT > // Type of the target matrix
3691  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3692  subAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
3693  {
3695 
3697 
3698  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3699  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3700 
3701  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3702  subAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3703  else if( IsSymmetric<MT1>::value )
3704  subAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3705  else
3706  subAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3707  }
3709  //**********************************************************************************************
3710 
3711  //**Subtraction assignment to sparse matrices***************************************************
3712  // No special implementation for the subtraction assignment to sparse matrices.
3713  //**********************************************************************************************
3714 
3715  //**Multiplication assignment to dense matrices*************************************************
3716  // No special implementation for the multiplication assignment to dense matrices.
3717  //**********************************************************************************************
3718 
3719  //**Multiplication assignment to sparse matrices************************************************
3720  // No special implementation for the multiplication assignment to sparse matrices.
3721  //**********************************************************************************************
3722 
3723  //**SMP assignment to dense matrices************************************************************
3739  template< typename MT // Type of the target dense matrix
3740  , bool SO > // Storage order of the target dense matrix
3741  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3742  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3743  {
3745 
3746  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3747  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3748 
3749  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
3750  return;
3751  }
3752  else if( rhs.lhs_.columns() == 0UL ) {
3753  reset( ~lhs );
3754  return;
3755  }
3756 
3757  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3758  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3759 
3760  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3761  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3762  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3763  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3764  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3765  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3766 
3767  smpAssign( ~lhs, A * B );
3768  }
3770  //**********************************************************************************************
3771 
3772  //**SMP assignment to sparse matrices***********************************************************
3788  template< typename MT // Type of the target sparse matrix
3789  , bool SO > // Storage order of the target sparse matrix
3790  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3791  smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3792  {
3794 
3795  typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
3796 
3802  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
3803 
3804  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3805  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3806 
3807  const TmpType tmp( rhs );
3808  smpAssign( ~lhs, tmp );
3809  }
3811  //**********************************************************************************************
3812 
3813  //**Restructuring SMP assignment to row-major matrices******************************************
3828  template< typename MT > // Type of the target matrix
3829  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3830  smpAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
3831  {
3833 
3835 
3836  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3837  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3838 
3839  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3840  smpAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3841  else if( IsSymmetric<MT1>::value )
3842  smpAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3843  else
3844  smpAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3845  }
3847  //**********************************************************************************************
3848 
3849  //**SMP addition assignment to dense matrices***************************************************
3865  template< typename MT // Type of the target dense matrix
3866  , bool SO > // Storage order of the target dense matrix
3867  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3868  smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3869  {
3871 
3872  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3873  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3874 
3875  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3876  return;
3877  }
3878 
3879  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3880  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3881 
3882  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3883  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3884  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3885  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3886  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3887  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3888 
3889  smpAddAssign( ~lhs, A * B );
3890  }
3892  //**********************************************************************************************
3893 
3894  //**Restructuring SMP addition assignment to row-major matrices*********************************
3910  template< typename MT > // Type of the target matrix
3911  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3912  smpAddAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
3913  {
3915 
3917 
3918  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3919  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3920 
3921  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3922  smpAddAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
3923  else if( IsSymmetric<MT1>::value )
3924  smpAddAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
3925  else
3926  smpAddAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
3927  }
3929  //**********************************************************************************************
3930 
3931  //**SMP addition assignment to sparse matrices**************************************************
3932  // No special implementation for the SMP addition assignment to sparse matrices.
3933  //**********************************************************************************************
3934 
3935  //**SMP subtraction assignment to dense matrices************************************************
3951  template< typename MT // Type of the target dense matrix
3952  , bool SO > // Storage order of the target dense matrix
3953  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3954  smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3955  {
3957 
3958  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3959  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3960 
3961  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3962  return;
3963  }
3964 
3965  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
3966  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
3967 
3968  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3969  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3970  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3971  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3972  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3973  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3974 
3975  smpSubAssign( ~lhs, A * B );
3976  }
3978  //**********************************************************************************************
3979 
3980  //**Restructuring SMP subtraction assignment to row-major matrices******************************
3996  template< typename MT > // Type of the target matrix
3997  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3998  smpSubAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
3999  {
4001 
4003 
4004  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4005  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4006 
4007  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
4008  smpSubAssign( ~lhs, trans( rhs.lhs_ ) * trans( rhs.rhs_ ) );
4009  else if( IsSymmetric<MT1>::value )
4010  smpSubAssign( ~lhs, trans( rhs.lhs_ ) * rhs.rhs_ );
4011  else
4012  smpSubAssign( ~lhs, rhs.lhs_ * trans( rhs.rhs_ ) );
4013  }
4015  //**********************************************************************************************
4016 
4017  //**SMP subtraction assignment to sparse matrices***********************************************
4018  // No special implementation for the SMP subtraction assignment to sparse matrices.
4019  //**********************************************************************************************
4020 
4021  //**SMP multiplication assignment to dense matrices*********************************************
4022  // No special implementation for the SMP multiplication assignment to dense matrices.
4023  //**********************************************************************************************
4024 
4025  //**SMP multiplication assignment to sparse matrices********************************************
4026  // No special implementation for the SMP multiplication assignment to sparse matrices.
4027  //**********************************************************************************************
4028 
4029  //**Compile time checks*************************************************************************
4037  //**********************************************************************************************
4038 };
4039 //*************************************************************************************************
4040 
4041 
4042 
4043 
4044 //=================================================================================================
4045 //
4046 // DMATSCALARMULTEXPR SPECIALIZATION
4047 //
4048 //=================================================================================================
4049 
4050 //*************************************************************************************************
4058 template< typename MT1 // Type of the left-hand side dense matrix
4059  , typename MT2 // Type of the right-hand side dense matrix
4060  , typename ST > // Type of the right-hand side scalar value
4061 class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >
4062  : public DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, true >
4063  , private MatScalarMultExpr
4064  , private Computation
4065 {
4066  private:
4067  //**Type definitions****************************************************************************
4068  typedef TDMatTDMatMultExpr<MT1,MT2> MMM;
4069  typedef ResultType_<MMM> RES;
4070  typedef ResultType_<MT1> RT1;
4071  typedef ResultType_<MT2> RT2;
4072  typedef ElementType_<RT1> ET1;
4073  typedef ElementType_<RT2> ET2;
4074  typedef CompositeType_<MT1> CT1;
4075  typedef CompositeType_<MT2> CT2;
4076  //**********************************************************************************************
4077 
4078  //**********************************************************************************************
4080  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
4081  //**********************************************************************************************
4082 
4083  //**********************************************************************************************
4085  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
4086  //**********************************************************************************************
4087 
4088  //**********************************************************************************************
4090 
4095  template< typename T1, typename T2, typename T3 >
4096  struct CanExploitSymmetry {
4097  enum : bool { value = IsRowMajorMatrix<T1>::value &&
4098  ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
4099  };
4100  //**********************************************************************************************
4101 
4102  //**********************************************************************************************
4104 
4107  template< typename T1, typename T2, typename T3 >
4108  struct IsEvaluationRequired {
4109  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
4110  !CanExploitSymmetry<T1,T2,T3>::value };
4111  };
4112  //**********************************************************************************************
4113 
4114  //**********************************************************************************************
4116 
4118  template< typename T1, typename T2, typename T3, typename T4 >
4119  struct UseBlasKernel {
4121  HasMutableDataAccess<T1>::value &&
4122  HasConstDataAccess<T2>::value &&
4123  HasConstDataAccess<T3>::value &&
4124  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4125  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4126  IsBLASCompatible< ElementType_<T1> >::value &&
4127  IsBLASCompatible< ElementType_<T2> >::value &&
4128  IsBLASCompatible< ElementType_<T3> >::value &&
4129  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
4130  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
4131  !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
4132  };
4133  //**********************************************************************************************
4134 
4135  //**********************************************************************************************
4137 
4139  template< typename T1, typename T2, typename T3, typename T4 >
4140  struct UseVectorizedDefaultKernel {
4141  enum : bool { value = useOptimizedKernels &&
4142  !IsDiagonal<T2>::value &&
4143  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4144  AreSIMDCombinable< ElementType_<T1>
4145  , ElementType_<T2>
4146  , ElementType_<T3>
4147  , T4 >::value &&
4148  HasSIMDAdd< ElementType_<T2>, ElementType_<T2> >::value &&
4149  HasSIMDMult< ElementType_<T3>, ElementType_<T3> >::value };
4150  };
4151  //**********************************************************************************************
4152 
4153  public:
4154  //**Type definitions****************************************************************************
4155  typedef DMatScalarMultExpr<MMM,ST,true> This;
4156  typedef MultTrait_<RES,ST> ResultType;
4157  typedef OppositeType_<ResultType> OppositeType;
4158  typedef TransposeType_<ResultType> TransposeType;
4159  typedef ElementType_<ResultType> ElementType;
4160  typedef SIMDTrait_<ElementType> SIMDType;
4161  typedef const ElementType ReturnType;
4162  typedef const ResultType CompositeType;
4163 
4165  typedef const TDMatTDMatMultExpr<MT1,MT2> LeftOperand;
4166 
4168  typedef ST RightOperand;
4169 
4171  typedef IfTrue_< evaluateLeft, const RT1, CT1 > LT;
4172 
4174  typedef IfTrue_< evaluateRight, const RT2, CT2 > RT;
4175  //**********************************************************************************************
4176 
4177  //**Compilation flags***************************************************************************
4179  enum : bool { simdEnabled = !IsDiagonal<MT1>::value &&
4180  MT1::simdEnabled && MT2::simdEnabled &&
4181  AreSIMDCombinable<ET1,ET2,ST>::value &&
4182  HasSIMDAdd<ET1,ET2>::value &&
4183  HasSIMDMult<ET1,ET2>::value };
4184 
4186  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4187  !evaluateRight && MT2::smpAssignable };
4188  //**********************************************************************************************
4189 
4190  //**SIMD properties*****************************************************************************
4192  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
4193  //**********************************************************************************************
4194 
4195  //**Constructor*********************************************************************************
4201  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4202  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4203  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4204  {}
4205  //**********************************************************************************************
4206 
4207  //**Access operator*****************************************************************************
4214  inline ReturnType operator()( size_t i, size_t j ) const {
4215  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4216  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4217  return matrix_(i,j) * scalar_;
4218  }
4219  //**********************************************************************************************
4220 
4221  //**At function*********************************************************************************
4229  inline ReturnType at( size_t i, size_t j ) const {
4230  if( i >= matrix_.rows() ) {
4231  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4232  }
4233  if( j >= matrix_.columns() ) {
4234  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4235  }
4236  return (*this)(i,j);
4237  }
4238  //**********************************************************************************************
4239 
4240  //**Rows function*******************************************************************************
4245  inline size_t rows() const {
4246  return matrix_.rows();
4247  }
4248  //**********************************************************************************************
4249 
4250  //**Columns function****************************************************************************
4255  inline size_t columns() const {
4256  return matrix_.columns();
4257  }
4258  //**********************************************************************************************
4259 
4260  //**Left operand access*************************************************************************
4265  inline LeftOperand leftOperand() const {
4266  return matrix_;
4267  }
4268  //**********************************************************************************************
4269 
4270  //**Right operand access************************************************************************
4275  inline RightOperand rightOperand() const {
4276  return scalar_;
4277  }
4278  //**********************************************************************************************
4279 
4280  //**********************************************************************************************
4286  template< typename T >
4287  inline bool canAlias( const T* alias ) const {
4288  return matrix_.canAlias( alias );
4289  }
4290  //**********************************************************************************************
4291 
4292  //**********************************************************************************************
4298  template< typename T >
4299  inline bool isAliased( const T* alias ) const {
4300  return matrix_.isAliased( alias );
4301  }
4302  //**********************************************************************************************
4303 
4304  //**********************************************************************************************
4309  inline bool isAligned() const {
4310  return matrix_.isAligned();
4311  }
4312  //**********************************************************************************************
4313 
4314  //**********************************************************************************************
4319  inline bool canSMPAssign() const noexcept {
4320  return ( !BLAZE_BLAS_IS_PARALLEL ||
4321  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4322  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
4323  }
4324  //**********************************************************************************************
4325 
4326  private:
4327  //**Member variables****************************************************************************
// The wrapped matrix/matrix multiplication expression (left-hand side operand of the scaling).
4328  LeftOperand matrix_;
// The scalar factor (right-hand side operand of the scaling).
4329  RightOperand scalar_;
4330  //**********************************************************************************************
4331 
4332  //**Assignment to dense matrices****************************************************************
4344  template< typename MT // Type of the target dense matrix
4345  , bool SO > // Storage order of the target dense matrix
4346  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
4347  assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4348  {
4350 
4351  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4352  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4353 
4354  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
4355  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
4356 
4357  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4358  return;
4359  }
4360  else if( left.columns() == 0UL ) {
4361  reset( ~lhs );
4362  return;
4363  }
4364 
4365  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4366  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4367 
4368  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4369  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4370  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4371  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4372  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4373  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4374 
4375  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4376  }
4377  //**********************************************************************************************
4378 
4379  //**Assignment to dense matrices (kernel selection)*********************************************
4390  template< typename MT3 // Type of the left-hand side target matrix
4391  , typename MT4 // Type of the left-hand side matrix operand
4392  , typename MT5 // Type of the right-hand side matrix operand
4393  , typename ST2 > // Type of the scalar value
4394  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4395  {
4396  if( ( IsDiagonal<MT4>::value ) ||
4397  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
4398  selectSmallAssignKernel( C, A, B, scalar );
4399  else
4400  selectBlasAssignKernel( C, A, B, scalar );
4401  }
4402  //**********************************************************************************************
4403 
4404  //**Default assignment to dense matrices (general/general)**************************************
4418  template< typename MT3 // Type of the left-hand side target matrix
4419  , typename MT4 // Type of the left-hand side matrix operand
4420  , typename MT5 // Type of the right-hand side matrix operand
4421  , typename ST2 > // Type of the scalar value
4422  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4423  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4424  {
4425  const size_t M( A.rows() );
4426  const size_t N( B.columns() );
4427  const size_t K( A.columns() );
4428 
4429  for( size_t j=0UL; j<N; ++j )
4430  {
4431  const size_t kbegin( ( IsLower<MT5>::value )
4432  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4433  :( 0UL ) );
4434  const size_t kend( ( IsUpper<MT5>::value )
4435  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4436  :( K ) );
4437  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4438 
4439  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
4440  for( size_t i=0UL; i<M; ++i ) {
4441  reset( (~C)(i,j) );
4442  }
4443  continue;
4444  }
4445 
4446  {
4447  const size_t ibegin( ( IsLower<MT4>::value )
4448  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
4449  :( 0UL ) );
4450  const size_t iend( ( IsUpper<MT4>::value )
4451  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
4452  :( M ) );
4453  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4454 
4455  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
4456  for( size_t i=0UL; i<ibegin; ++i ) {
4457  reset( C(i,j) );
4458  }
4459  }
4460  else if( IsStrictlyLower<MT4>::value ) {
4461  reset( C(0UL,j) );
4462  }
4463  for( size_t i=ibegin; i<iend; ++i ) {
4464  C(i,j) = A(i,kbegin) * B(kbegin,j);
4465  }
4466  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
4467  for( size_t i=iend; i<M; ++i ) {
4468  reset( C(i,j) );
4469  }
4470  }
4471  else if( IsStrictlyUpper<MT4>::value ) {
4472  reset( C(M-1UL,j) );
4473  }
4474  }
4475 
4476  for( size_t k=kbegin+1UL; k<kend; ++k )
4477  {
4478  const size_t ibegin( ( IsLower<MT4>::value )
4479  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4480  :( 0UL ) );
4481  const size_t iend( ( IsUpper<MT4>::value )
4482  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
4483  :( M ) );
4484  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4485 
4486  for( size_t i=ibegin; i<iend; ++i ) {
4487  C(i,j) += A(i,k) * B(k,j);
4488  }
4489  if( IsUpper<MT4>::value ) {
4490  C(iend,j) = A(iend,k) * B(k,j);
4491  }
4492  }
4493 
4494  {
4495  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
4496  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
4497  :( 0UL ) );
4498  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4499  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
4500  :( M ) );
4501  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4502 
4503  for( size_t i=ibegin; i<iend; ++i ) {
4504  C(i,j) *= scalar;
4505  }
4506  }
4507  }
4508  }
4509  //**********************************************************************************************
4510 
4511  //**Default assignment to dense matrices (general/diagonal)*************************************
4525  template< typename MT3 // Type of the left-hand side target matrix
4526  , typename MT4 // Type of the left-hand side matrix operand
4527  , typename MT5 // Type of the right-hand side matrix operand
4528  , typename ST2 > // Type of the scalar value
4529  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4530  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4531  {
4533 
4534  const size_t M( A.rows() );
4535  const size_t N( B.columns() );
4536 
4537  for( size_t j=0UL; j<N; ++j )
4538  {
4539  const size_t ibegin( ( IsLower<MT4>::value )
4540  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4541  :( 0UL ) );
4542  const size_t iend( ( IsUpper<MT4>::value )
4543  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
4544  :( M ) );
4545  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4546 
4547  if( IsLower<MT4>::value ) {
4548  for( size_t i=0UL; i<ibegin; ++i ) {
4549  reset( C(i,j) );
4550  }
4551  }
4552  for( size_t i=ibegin; i<iend; ++i ) {
4553  C(i,j) = A(i,j) * B(j,j) * scalar;
4554  }
4555  if( IsUpper<MT4>::value ) {
4556  for( size_t i=iend; i<M; ++i ) {
4557  reset( C(i,j) );
4558  }
4559  }
4560  }
4561  }
4562  //**********************************************************************************************
4563 
4564  //**Default assignment to dense matrices (diagonal/general)*************************************
4578  template< typename MT3 // Type of the left-hand side target matrix
4579  , typename MT4 // Type of the left-hand side matrix operand
4580  , typename MT5 // Type of the right-hand side matrix operand
4581  , typename ST2 > // Type of the scalar value
4582  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4583  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4584  {
4586 
4587  const size_t M( A.rows() );
4588  const size_t N( B.columns() );
4589 
4590  for( size_t j=0UL; j<N; ++j )
4591  {
4592  const size_t ibegin( ( IsLower<MT5>::value )
4593  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4594  :( 0UL ) );
4595  const size_t iend( ( IsUpper<MT5>::value )
4596  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4597  :( M ) );
4598  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4599 
4600  if( IsLower<MT4>::value ) {
4601  for( size_t i=0UL; i<ibegin; ++i ) {
4602  reset( C(i,j) );
4603  }
4604  }
4605  for( size_t i=ibegin; i<iend; ++i ) {
4606  C(i,j) = A(i,i) * B(i,j) * scalar;
4607  }
4608  if( IsUpper<MT4>::value ) {
4609  for( size_t i=iend; i<M; ++i ) {
4610  reset( C(i,j) );
4611  }
4612  }
4613  }
4614  }
4615  //**********************************************************************************************
4616 
4617  //**Default assignment to dense matrices (diagonal/diagonal)************************************
4631  template< typename MT3 // Type of the left-hand side target matrix
4632  , typename MT4 // Type of the left-hand side matrix operand
4633  , typename MT5 // Type of the right-hand side matrix operand
4634  , typename ST2 > // Type of the scalar value
4635  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4636  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4637  {
4639 
4640  reset( C );
4641 
4642  for( size_t i=0UL; i<A.rows(); ++i ) {
4643  C(i,i) = A(i,i) * B(i,i) * scalar;
4644  }
4645  }
4646  //**********************************************************************************************
4647 
4648  //**Default assignment to dense matrices (small matrices)***************************************
4662  template< typename MT3 // Type of the left-hand side target matrix
4663  , typename MT4 // Type of the left-hand side matrix operand
4664  , typename MT5 // Type of the right-hand side matrix operand
4665  , typename ST2 > // Type of the scalar value
4666  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4667  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4668  {
4669  selectDefaultAssignKernel( C, A, B, scalar );
4670  }
4671  //**********************************************************************************************
4672 
4673  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
4688  template< typename MT3 // Type of the left-hand side target matrix
4689  , typename MT4 // Type of the left-hand side matrix operand
4690  , typename MT5 // Type of the right-hand side matrix operand
4691  , typename ST2 > // Type of the scalar value
4692  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4693  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4694  {
4697  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT4> );
4698  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT5> );
4699 
4700  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
4701  const OppositeType_<MT5> tmp( serial( B ) );
4702  assign( ~C, A * tmp * scalar );
4703  }
4704  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
4705  const OppositeType_<MT4> tmp( serial( A ) );
4706  assign( ~C, tmp * B * scalar );
4707  }
4708  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
4709  const OppositeType_<MT5> tmp( serial( B ) );
4710  assign( ~C, A * tmp * scalar );
4711  }
4712  else {
4713  const OppositeType_<MT4> tmp( serial( A ) );
4714  assign( ~C, tmp * B * scalar );
4715  }
4716  }
4717  //**********************************************************************************************
4718 
4719  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized small-matrix kernel for C = (A*B)*scalar with a column-major target
// (DenseMatrix<MT3,true>), enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// C      : target matrix
// A      : left-hand side operand of the product, read via A.load(i,k) along its rows
// B      : right-hand side operand of the product, read element-wise via B(k,j)
// scalar : scaling factor applied to every computed element
//
// The rows of C are processed in strips of 8, 4, 2 and 1 SIMD vectors (SIMDSIZE rows per
// vector); rows that do not fill a complete vector are handled by a scalar loop at the end.
// In every strip the inner index range [kbegin,kend) is narrowed according to the
// lower/upper structure of A (MT4) and B (MT5).
//
// NOTE(review): the xmm accumulators are default-constructed and then accumulated into;
// this presumably relies on SIMDType's default constructor producing zero - confirm.
4734  template< typename MT3 // Type of the left-hand side target matrix
4735  , typename MT4 // Type of the left-hand side matrix operand
4736  , typename MT5 // Type of the right-hand side matrix operand
4737  , typename ST2 > // Type of the scalar value
4738  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4739  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
4740  {
4741  const size_t M( A.rows() );
4742  const size_t N( B.columns() );
4743  const size_t K( A.columns() );
4744 
// A scalar remainder loop is required unless both MT3 and MT4 are padded.
4745  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
4746 
// End of the row range covered by the SIMD loops: M rounded down to a multiple of SIMDSIZE
// when a remainder loop is required (see the assertion below), otherwise M itself.
4747  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
4748  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
4749 
// SIMD value created from the scalar via set(); multiplied into every result vector on store.
4750  const SIMDType factor( set( scalar ) );
4751 
// Row cursor shared by all of the following loops; each loop continues where the previous stopped.
4752  size_t i( 0UL );
4753 
// Strips of 8 SIMD vectors (rows i .. i+8*SIMDSIZE-1), one column of C per iteration.
4754  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
4755  for( size_t j=0UL; j<N; ++j )
4756  {
// kbegin: lower bound from a lower B (j or j+1) and/or an upper A (i); 0 without structure.
4757  const size_t kbegin( ( IsLower<MT5>::value )
4758  ?( ( IsUpper<MT4>::value )
4759  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4760  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4761  :( IsUpper<MT4>::value ? i : 0UL ) );
// kend: upper bound from an upper B (j or j+1) and/or a lower A (end of the row strip); K without structure.
4762  const size_t kend( ( IsUpper<MT5>::value )
4763  ?( ( IsLower<MT4>::value )
4764  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4765  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4766  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
4767 
4768  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4769 
4770  for( size_t k=kbegin; k<kend; ++k ) {
4771  const SIMDType b1( set( B(k,j) ) );
4772  xmm1 = xmm1 + A.load(i ,k) * b1;
4773  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
4774  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
4775  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
4776  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
4777  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
4778  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
4779  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
4780  }
4781 
4782  (~C).store( i , j, xmm1 * factor );
4783  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
4784  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
4785  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
4786  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
4787  (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
4788  (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
4789  (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
4790  }
4791  }
4792 
// Strips of 4 SIMD vectors, two columns of C per iteration (xmm1-4: column j, xmm5-8: column j+1).
4793  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4794  {
4795  size_t j( 0UL );
4796 
4797  for( ; (j+2UL) <= N; j+=2UL )
4798  {
4799  const size_t kbegin( ( IsLower<MT5>::value )
4800  ?( ( IsUpper<MT4>::value )
4801  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4802  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4803  :( IsUpper<MT4>::value ? i : 0UL ) );
// The upper-B bound is taken for the second column of the pair (j+1), hence j+1 / j+2.
4804  const size_t kend( ( IsUpper<MT5>::value )
4805  ?( ( IsLower<MT4>::value )
4806  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4807  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4808  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
4809 
4810  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4811 
4812  for( size_t k=kbegin; k<kend; ++k ) {
4813  const SIMDType a1( A.load(i ,k) );
4814  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4815  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4816  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4817  const SIMDType b1( set( B(k,j ) ) );
4818  const SIMDType b2( set( B(k,j+1UL) ) );
4819  xmm1 = xmm1 + a1 * b1;
4820  xmm2 = xmm2 + a2 * b1;
4821  xmm3 = xmm3 + a3 * b1;
4822  xmm4 = xmm4 + a4 * b1;
4823  xmm5 = xmm5 + a1 * b2;
4824  xmm6 = xmm6 + a2 * b2;
4825  xmm7 = xmm7 + a3 * b2;
4826  xmm8 = xmm8 + a4 * b2;
4827  }
4828 
4829  (~C).store( i , j , xmm1 * factor );
4830  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
4831  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
4832  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
4833  (~C).store( i , j+1UL, xmm5 * factor );
4834  (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
4835  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
4836  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
4837  }
4838 
// Last column when N is odd; here j == N-1, and kend is bounded only by a lower A.
4839  if( j < N )
4840  {
4841  const size_t kbegin( ( IsLower<MT5>::value )
4842  ?( ( IsUpper<MT4>::value )
4843  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4844  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4845  :( IsUpper<MT4>::value ? i : 0UL ) );
4846  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
4847 
4848  SIMDType xmm1, xmm2, xmm3, xmm4;
4849 
4850  for( size_t k=kbegin; k<kend; ++k ) {
4851  const SIMDType b1( set( B(k,j) ) );
4852  xmm1 = xmm1 + A.load(i ,k) * b1;
4853  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
4854  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
4855  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
4856  }
4857 
4858  (~C).store( i , j, xmm1 * factor );
4859  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
4860  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
4861  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
4862  }
4863  }
4864 
// Strips of 2 SIMD vectors, two columns of C per iteration (xmm1-2: column j, xmm3-4: column j+1).
4865  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4866  {
4867  size_t j( 0UL );
4868 
4869  for( ; (j+2UL) <= N; j+=2UL )
4870  {
4871  const size_t kbegin( ( IsLower<MT5>::value )
4872  ?( ( IsUpper<MT4>::value )
4873  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4874  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4875  :( IsUpper<MT4>::value ? i : 0UL ) );
4876  const size_t kend( ( IsUpper<MT5>::value )
4877  ?( ( IsLower<MT4>::value )
4878  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4879  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4880  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
4881 
4882  SIMDType xmm1, xmm2, xmm3, xmm4;
4883 
4884  for( size_t k=kbegin; k<kend; ++k ) {
4885  const SIMDType a1( A.load(i ,k) );
4886  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4887  const SIMDType b1( set( B(k,j ) ) );
4888  const SIMDType b2( set( B(k,j+1UL) ) );
4889  xmm1 = xmm1 + a1 * b1;
4890  xmm2 = xmm2 + a2 * b1;
4891  xmm3 = xmm3 + a1 * b2;
4892  xmm4 = xmm4 + a2 * b2;
4893  }
4894 
4895  (~C).store( i , j , xmm1 * factor );
4896  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
4897  (~C).store( i , j+1UL, xmm3 * factor );
4898  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
4899  }
4900 
// Last column when N is odd.
4901  if( j < N )
4902  {
4903  const size_t kbegin( ( IsLower<MT5>::value )
4904  ?( ( IsUpper<MT4>::value )
4905  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4906  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4907  :( IsUpper<MT4>::value ? i : 0UL ) );
4908  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
4909 
4910  SIMDType xmm1, xmm2;
4911 
4912  for( size_t k=kbegin; k<kend; ++k ) {
4913  const SIMDType b1( set( B(k,j) ) );
4914  xmm1 = xmm1 + A.load(i ,k) * b1;
4915  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
4916  }
4917 
4918  (~C).store( i , j, xmm1 * factor );
4919  (~C).store( i+SIMDSIZE, j, xmm2 * factor );
4920  }
4921  }
4922 
// Single SIMD vectors, two columns of C per iteration; kend is bounded only by an upper B here.
4923  for( ; i<ipos; i+=SIMDSIZE )
4924  {
4925  size_t j( 0UL );
4926 
4927  for( ; (j+2UL) <= N; j+=2UL )
4928  {
4929  const size_t kbegin( ( IsLower<MT5>::value )
4930  ?( ( IsUpper<MT4>::value )
4931  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4932  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4933  :( IsUpper<MT4>::value ? i : 0UL ) );
4934  const size_t kend( ( IsUpper<MT5>::value )
4935  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4936  :( K ) );
4937 
4938  SIMDType xmm1, xmm2;
4939 
4940  for( size_t k=kbegin; k<kend; ++k ) {
4941  const SIMDType a1( A.load(i,k) );
4942  xmm1 = xmm1 + a1 * set( B(k,j ) );
4943  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
4944  }
4945 
4946  (~C).store( i, j , xmm1 * factor );
4947  (~C).store( i, j+1UL, xmm2 * factor );
4948  }
4949 
// Last column when N is odd; the inner loop runs up to K.
4950  if( j < N )
4951  {
4952  const size_t kbegin( ( IsLower<MT5>::value )
4953  ?( ( IsUpper<MT4>::value )
4954  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4955  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4956  :( IsUpper<MT4>::value ? i : 0UL ) );
4957 
4958  SIMDType xmm1;
4959 
4960  for( size_t k=kbegin; k<K; ++k ) {
4961  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
4962  }
4963 
4964  (~C).store( i, j, xmm1 * factor );
4965  }
4966  }
4967 
// Scalar remainder loop for the rows [ipos,M); only entered when 'remainder' is set.
4968  for( ; remainder && i<M; ++i )
4969  {
4970  size_t j( 0UL );
4971 
4972  for( ; (j+2UL) <= N; j+=2UL )
4973  {
4974  const size_t kbegin( ( IsLower<MT5>::value )
4975  ?( ( IsUpper<MT4>::value )
4976  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4977  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4978  :( IsUpper<MT4>::value ? i : 0UL ) );
4979  const size_t kend( ( IsUpper<MT5>::value )
4980  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4981  :( K ) );
4982 
// Scalar accumulators, explicitly value-initialized.
4983  ElementType value1 = ElementType();
4984  ElementType value2 = ElementType();
4985 
4986  for( size_t k=kbegin; k<kend; ++k ) {
4987  value1 += A(i,k) * B(k,j );
4988  value2 += A(i,k) * B(k,j+1UL);
4989  }
4990 
4991  (~C)(i,j ) = value1 * scalar;
4992  (~C)(i,j+1UL) = value2 * scalar;
4993  }
4994 
// Last column when N is odd.
4995  if( j < N )
4996  {
4997  const size_t kbegin( ( IsLower<MT5>::value )
4998  ?( ( IsUpper<MT4>::value )
4999  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5000  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5001  :( IsUpper<MT4>::value ? i : 0UL ) );
5002 
5003  ElementType value = ElementType();
5004 
5005  for( size_t k=kbegin; k<K; ++k ) {
5006  value += A(i,k) * B(k,j);
5007  }
5008 
5009  (~C)(i,j) = value * scalar;
5010  }
5011  }
5012  }
5013  //**********************************************************************************************
5014 
5015  //**Default assignment to dense matrices (large matrices)***************************************
5029  template< typename MT3 // Type of the left-hand side target matrix
5030  , typename MT4 // Type of the left-hand side matrix operand
5031  , typename MT5 // Type of the right-hand side matrix operand
5032  , typename ST2 > // Type of the scalar value
5033  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5034  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5035  {
5036  selectDefaultAssignKernel( C, A, B, scalar );
5037  }
5038  //**********************************************************************************************
5039 
5040  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
5055  template< typename MT3 // Type of the left-hand side target matrix
5056  , typename MT4 // Type of the left-hand side matrix operand
5057  , typename MT5 // Type of the right-hand side matrix operand
5058  , typename ST2 > // Type of the scalar value
5059  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5060  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5061  {
5062  selectSmallAssignKernel( ~C, A, B, scalar );
5063  }
5064  //**********************************************************************************************
5065 
5066  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
5081  template< typename MT3 // Type of the left-hand side target matrix
5082  , typename MT4 // Type of the left-hand side matrix operand
5083  , typename MT5 // Type of the right-hand side matrix operand
5084  , typename ST2 > // Type of the scalar value
5085  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5086  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5087  {
5088  const size_t M( A.rows() );
5089  const size_t N( B.columns() );
5090  const size_t K( A.columns() );
5091 
5092  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
5093 
5094  const SIMDType factor( set( scalar ) );
5095 
5096  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
5097  {
5098  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
5099 
5100  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
5101  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5102 
5103  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
5104  {
5105  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
5106 
5107  for( size_t j=jj; j<jend; ++j ) {
5108  for( size_t i=ii; i<iend; ++i ) {
5109  reset( (~C)(i,j) );
5110  }
5111  }
5112 
5113  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
5114  {
5115  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
5116 
5117  size_t i( ii );
5118 
5119  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5120  {
5121  const size_t i1( i+SIMDSIZE );
5122  const size_t i2( i+SIMDSIZE*2UL );
5123  const size_t i3( i+SIMDSIZE*3UL );
5124 
5125  size_t j( jj );
5126 
5127  for( ; (j+2UL) <= jend; j+=2UL )
5128  {
5129  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5130  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5131  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
5132  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5133 
5134  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5135 
5136  for( size_t k=kbegin; k<kend; ++k ) {
5137  const SIMDType a1( A.load(i ,k) );
5138  const SIMDType a2( A.load(i1,k) );
5139  const SIMDType a3( A.load(i2,k) );
5140  const SIMDType a4( A.load(i3,k) );
5141  const SIMDType b1( set( B(k,j ) ) );
5142  const SIMDType b2( set( B(k,j+1UL) ) );
5143  xmm1 = xmm1 + a1 * b1;
5144  xmm2 = xmm2 + a2 * b1;
5145  xmm3 = xmm3 + a3 * b1;
5146  xmm4 = xmm4 + a4 * b1;
5147  xmm5 = xmm5 + a1 * b2;
5148  xmm6 = xmm6 + a2 * b2;
5149  xmm7 = xmm7 + a3 * b2;
5150  xmm8 = xmm8 + a4 * b2;
5151  }
5152 
5153  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5154  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5155  (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
5156  (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
5157  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
5158  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
5159  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
5160  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
5161  }
5162 
5163  if( j < jend )
5164  {
5165  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5166  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5167  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
5168  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5169 
5170  SIMDType xmm1, xmm2, xmm3, xmm4;
5171 
5172  for( size_t k=kbegin; k<kend; ++k ) {
5173  const SIMDType b1( set( B(k,j) ) );
5174  xmm1 = xmm1 + A.load(i ,k) * b1;
5175  xmm2 = xmm2 + A.load(i1,k) * b1;
5176  xmm3 = xmm3 + A.load(i2,k) * b1;
5177  xmm4 = xmm4 + A.load(i3,k) * b1;
5178  }
5179 
5180  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5181  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
5182  (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
5183  (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
5184  }
5185  }
5186 
5187  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5188  {
5189  const size_t i1( i+SIMDSIZE );
5190 
5191  size_t j( jj );
5192 
5193  for( ; (j+4UL) <= jend; j+=4UL )
5194  {
5195  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5196  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5197  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5198  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5199 
5200  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5201 
5202  for( size_t k=kbegin; k<kend; ++k ) {
5203  const SIMDType a1( A.load(i ,k) );
5204  const SIMDType a2( A.load(i1,k) );
5205  const SIMDType b1( set( B(k,j ) ) );
5206  const SIMDType b2( set( B(k,j+1UL) ) );
5207  const SIMDType b3( set( B(k,j+2UL) ) );
5208  const SIMDType b4( set( B(k,j+3UL) ) );
5209  xmm1 = xmm1 + a1 * b1;
5210  xmm2 = xmm2 + a2 * b1;
5211  xmm3 = xmm3 + a1 * b2;
5212  xmm4 = xmm4 + a2 * b2;
5213  xmm5 = xmm5 + a1 * b3;
5214  xmm6 = xmm6 + a2 * b3;
5215  xmm7 = xmm7 + a1 * b4;
5216  xmm8 = xmm8 + a2 * b4;
5217  }
5218 
5219  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5220  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5221  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5222  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
5223  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
5224  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
5225  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
5226  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
5227  }
5228 
5229  for( ; (j+2UL) <= jend; j+=2UL )
5230  {
5231  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5232  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5233  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5234  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5235 
5236  SIMDType xmm1, xmm2, xmm3, xmm4;
5237 
5238  for( size_t k=kbegin; k<kend; ++k ) {
5239  const SIMDType a1( A.load(i ,k) );
5240  const SIMDType a2( A.load(i1,k) );
5241  const SIMDType b1( set( B(k,j ) ) );
5242  const SIMDType b2( set( B(k,j+1UL) ) );
5243  xmm1 = xmm1 + a1 * b1;
5244  xmm2 = xmm2 + a2 * b1;
5245  xmm3 = xmm3 + a1 * b2;
5246  xmm4 = xmm4 + a2 * b2;
5247  }
5248 
5249  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5250  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5251  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5252  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
5253  }
5254 
5255  if( j < jend )
5256  {
5257  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5258  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5259  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5260  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5261 
5262  SIMDType xmm1, xmm2;
5263 
5264  for( size_t k=kbegin; k<kend; ++k ) {
5265  const SIMDType b1( set( B(k,j) ) );
5266  xmm1 = xmm1 + A.load(i ,k) * b1;
5267  xmm2 = xmm2 + A.load(i1,k) * b1;
5268  }
5269 
5270  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5271  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
5272  }
5273  }
5274 
5275  for( ; i<ipos; i+=SIMDSIZE )
5276  {
5277  for( size_t j=jj; j<jend; ++j )
5278  {
5279  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5280  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5281  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
5282  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5283 
5284  SIMDType xmm1;
5285 
5286  for( size_t k=kbegin; k<kend; ++k ) {
5287  const SIMDType b1( set( B(k,j) ) );
5288  xmm1 = xmm1 + A.load(i,k) * b1;
5289  }
5290 
5291  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
5292  }
5293  }
5294 
5295  for( ; remainder && i<iend; ++i )
5296  {
5297  for( size_t j=jj; j<jend; ++j )
5298  {
5299  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5300  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5301  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
5302  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5303 
5304  ElementType value = ElementType();
5305 
5306  for( size_t k=kbegin; k<kend; ++k ) {
5307  value += A(i,k) * B(k,j);
5308  }
5309 
5310  (~C)(i,j) += value * scalar;
5311  }
5312  }
5313  }
5314  }
5315  }
5316  }
5317  //**********************************************************************************************
5318 
5319  //**BLAS-based assignment to dense matrices (default)*******************************************
5333  template< typename MT3 // Type of the left-hand side target matrix
5334  , typename MT4 // Type of the left-hand side matrix operand
5335  , typename MT5 // Type of the right-hand side matrix operand
5336  , typename ST2 > // Type of the scalar value
5337  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5338  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5339  {
5340  selectLargeAssignKernel( C, A, B, scalar );
5341  }
5342  //**********************************************************************************************
5343 
5344  //**BLAS-based assignment to dense matrices*****************************************************
5345 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
5346 
5359  template< typename MT3 // Type of the left-hand side target matrix
5360  , typename MT4 // Type of the left-hand side matrix operand
5361  , typename MT5 // Type of the right-hand side matrix operand
5362  , typename ST2 > // Type of the scalar value
5363  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5364  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5365  {
5366  typedef ElementType_<MT3> ET;
5367 
5368  if( IsTriangular<MT4>::value ) {
5369  assign( C, B );
5370  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5371  }
5372  else if( IsTriangular<MT5>::value ) {
5373  assign( C, A );
5374  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5375  }
5376  else {
5377  gemm( C, A, B, ET(scalar), ET(0) );
5378  }
5379  }
5380 #endif
5381  //**********************************************************************************************
5382 
5383  //**Assignment to sparse matrices***************************************************************
5395  template< typename MT // Type of the target sparse matrix
5396  , bool SO > // Storage order of the target sparse matrix
5397  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5398  assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5399  {
5401 
5402  typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
5403 
5409  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
5410 
5411  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5412  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5413 
5414  const TmpType tmp( serial( rhs ) );
5415  assign( ~lhs, tmp );
5416  }
5417  //**********************************************************************************************
5418 
5419  //**Restructuring assignment to row-major matrices**********************************************
5433  template< typename MT > // Type of the target matrix
5434  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5435  assign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
5436  {
5438 
5440 
5441  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5442  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5443 
5444  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5445  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
5446 
5447  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
5448  assign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
5449  else if( IsSymmetric<MT1>::value )
5450  assign( ~lhs, trans( left ) * right * rhs.scalar_ );
5451  else
5452  assign( ~lhs, left * trans( right ) * rhs.scalar_ );
5453  }
5454  //**********************************************************************************************
5455 
5456  //**Addition assignment to dense matrices*******************************************************
5468  template< typename MT // Type of the target dense matrix
5469  , bool SO > // Storage order of the target dense matrix
5470  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5471  addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5472  {
5474 
5475  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5476  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5477 
5478  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5479  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
5480 
5481  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
5482  return;
5483  }
5484 
5485  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5486  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5487 
5488  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5489  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5490  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5491  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5492  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5493  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
5494 
5495  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
5496  }
5497  //**********************************************************************************************
5498 
5499  //**Addition assignment to dense matrices (kernel selection)************************************
5510  template< typename MT3 // Type of the left-hand side target matrix
5511  , typename MT4 // Type of the left-hand side matrix operand
5512  , typename MT5 // Type of the right-hand side matrix operand
5513  , typename ST2 > // Type of the scalar value
5514  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5515  {
5516  if( ( IsDiagonal<MT4>::value ) ||
5517  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5518  selectSmallAddAssignKernel( C, A, B, scalar );
5519  else
5520  selectBlasAddAssignKernel( C, A, B, scalar );
5521  }
5522  //**********************************************************************************************
5523 
5524  //**Default addition assignment to dense matrices (general/general)*****************************
5538  template< typename MT3 // Type of the left-hand side target matrix
5539  , typename MT4 // Type of the left-hand side matrix operand
5540  , typename MT5 // Type of the right-hand side matrix operand
5541  , typename ST2 > // Type of the scalar value
5542  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
5543  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5544  {
5545  const ResultType tmp( serial( A * B * scalar ) );
5546  addAssign( C, tmp );
5547  }
5548  //**********************************************************************************************
5549 
5550  //**Default addition assignment to dense matrices (general/diagonal)****************************
5564  template< typename MT3 // Type of the left-hand side target matrix
5565  , typename MT4 // Type of the left-hand side matrix operand
5566  , typename MT5 // Type of the right-hand side matrix operand
5567  , typename ST2 > // Type of the scalar value
5568  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
5569  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5570  {
5572 
5573  const size_t M( A.rows() );
5574  const size_t N( B.columns() );
5575 
5576  for( size_t j=0UL; j<N; ++j )
5577  {
5578  const size_t ibegin( ( IsLower<MT4>::value )
5579  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
5580  :( 0UL ) );
5581  const size_t iend( ( IsUpper<MT4>::value )
5582  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
5583  :( M ) );
5584  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5585 
5586  const size_t inum( iend - ibegin );
5587  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
5588 
5589  for( size_t i=ibegin; i<ipos; i+=2UL ) {
5590  C(i ,j) += A(i ,j) * B(j,j) * scalar;
5591  C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
5592  }
5593  if( ipos < iend ) {
5594  C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
5595  }
5596  }
5597  }
5598  //**********************************************************************************************
5599 
5600  //**Default addition assignment to dense matrices (diagonal/general)****************************
5614  template< typename MT3 // Type of the left-hand side target matrix
5615  , typename MT4 // Type of the left-hand side matrix operand
5616  , typename MT5 // Type of the right-hand side matrix operand
5617  , typename ST2 > // Type of the scalar value
5618  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
5619  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5620  {
5622 
5623  const size_t M( A.rows() );
5624  const size_t N( B.columns() );
5625 
5626  for( size_t j=0UL; j<N; ++j )
5627  {
5628  const size_t ibegin( ( IsLower<MT5>::value )
5629  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5630  :( 0UL ) );
5631  const size_t iend( ( IsUpper<MT5>::value )
5632  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
5633  :( M ) );
5634  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5635 
5636  const size_t inum( iend - ibegin );
5637  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
5638 
5639  for( size_t i=ibegin; i<ipos; i+=2UL ) {
5640  C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5641  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
5642  }
5643  if( ipos < iend ) {
5644  C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
5645  }
5646  }
5647  }
5648  //**********************************************************************************************
5649 
5650  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
5664  template< typename MT3 // Type of the left-hand side target matrix
5665  , typename MT4 // Type of the left-hand side matrix operand
5666  , typename MT5 // Type of the right-hand side matrix operand
5667  , typename ST2 > // Type of the scalar value
5668  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
5669  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5670  {
5672 
5673  for( size_t i=0UL; i<A.rows(); ++i ) {
5674  C(i,i) += A(i,i) * B(i,i) * scalar;
5675  }
5676  }
5677  //**********************************************************************************************
5678 
5679  //**Default addition assignment to dense matrices (small matrices)******************************
5693  template< typename MT3 // Type of the left-hand side target matrix
5694  , typename MT4 // Type of the left-hand side matrix operand
5695  , typename MT5 // Type of the right-hand side matrix operand
5696  , typename ST2 > // Type of the scalar value
5697  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5698  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5699  {
5700  selectDefaultAddAssignKernel( C, A, B, scalar );
5701  }
5702  //**********************************************************************************************
5703 
5704  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
5719  template< typename MT3 // Type of the left-hand side target matrix
5720  , typename MT4 // Type of the left-hand side matrix operand
5721  , typename MT5 // Type of the right-hand side matrix operand
5722  , typename ST2 > // Type of the scalar value
5723  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5724  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5725  {
5728  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT4> );
5729  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT5> );
5730 
5731  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
5732  const OppositeType_<MT5> tmp( serial( B ) );
5733  addAssign( ~C, A * tmp * scalar );
5734  }
5735  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
5736  const OppositeType_<MT4> tmp( serial( A ) );
5737  addAssign( ~C, tmp * B * scalar );
5738  }
5739  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5740  const OppositeType_<MT5> tmp( serial( B ) );
5741  addAssign( ~C, A * tmp * scalar );
5742  }
5743  else {
5744  const OppositeType_<MT4> tmp( serial( A ) );
5745  addAssign( ~C, tmp * B * scalar );
5746  }
5747  }
5748  //**********************************************************************************************
5749 
5750  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized small matrix kernel that adds scalar * A * B to the column-major target C.
// The overload takes part in overload resolution only if
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf_).
//
// \param C The target left-hand side dense matrix (column-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor applied to the product before it is added to C.
//
// Rows of A and C are accessed in chunks of SIMDSIZE elements via load()/store(). The row range
// is processed in panels of 8, 4, 2 and finally 1 SIMD vector(s); rows in [ipos,M) are handled
// element by element at the end if 'remainder' is set. For every panel the range [kbegin,kend)
// of the inner index k is narrowed according to the IsLower/IsUpper/IsStrictlyLower/
// IsStrictlyUpper traits of MT4 and MT5 (presumably because the skipped elements are
// structurally zero).
// NOTE(review): the xmm accumulators are default-constructed and then accumulated into; this
// relies on the default constructor of SIMDType yielding zero -- confirm.
5765  template< typename MT3 // Type of the left-hand side target matrix
5766  , typename MT4 // Type of the left-hand side matrix operand
5767  , typename MT5 // Type of the right-hand side matrix operand
5768  , typename ST2 > // Type of the scalar value
5769  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5770  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5771  {
5772  const size_t M( A.rows() );
5773  const size_t N( B.columns() );
5774  const size_t K( A.columns() );
5775 
// An element-wise remainder loop is required unless both C and A are padded
5776  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
5777 
// End of the row range handled by the SIMD loops: M rounded down to a multiple of SIMDSIZE
// if a remainder loop is required (see the assertion below), M otherwise
5778  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
5779  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5780 
// SIMD representation of the scaling factor (set() presumably broadcasts the value -- confirm)
5781  const SIMDType factor( set( scalar ) );
5782 
5783  size_t i( 0UL );
5784 
// Panels of 8 SIMD vectors (8*SIMDSIZE rows), one column of C per iteration
5785  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5786  for( size_t j=0UL; j<N; ++j )
5787  {
// Lower bound of k: j (j+1 if strictly lower) for lower B, i for upper A, the larger of both
// if both traits hold, 0 otherwise
5788  const size_t kbegin( ( IsLower<MT5>::value )
5789  ?( ( IsUpper<MT4>::value )
5790  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5791  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5792  :( IsUpper<MT4>::value ? i : 0UL ) );
// Upper bound of k: j+1 (j if strictly upper) for upper B, min(i+8*SIMDSIZE,K) for lower A,
// the smaller of both if both traits hold, K otherwise
5793  const size_t kend( ( IsUpper<MT5>::value )
5794  ?( ( IsLower<MT4>::value )
5795  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
5796  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
5797  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
5798 
5799  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5800 
// Accumulate the products of the 8 row chunks of column k of A with the element B(k,j)
5801  for( size_t k=kbegin; k<kend; ++k ) {
5802  const SIMDType b1( set( B(k,j) ) );
5803  xmm1 = xmm1 + A.load(i ,k) * b1;
5804  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
5805  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
5806  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
5807  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
5808  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
5809  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
5810  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
5811  }
5812 
// Add the scaled accumulators to the current content of C
5813  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5814  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
5815  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
5816  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
5817  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
5818  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
5819  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
5820  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
5821  }
5822  }
5823 
// Panels of 4 SIMD vectors, two columns of C per iteration plus a single-column tail
5824  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5825  {
5826  size_t j( 0UL );
5827 
5828  for( ; (j+2UL) <= N; j+=2UL )
5829  {
5830  const size_t kbegin( ( IsLower<MT5>::value )
5831  ?( ( IsUpper<MT4>::value )
5832  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5833  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5834  :( IsUpper<MT4>::value ? i : 0UL ) );
// The upper bound derived from an upper B covers both columns j and j+1
5835  const size_t kend( ( IsUpper<MT5>::value )
5836  ?( ( IsLower<MT4>::value )
5837  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5838  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5839  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
5840 
// xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1
5841  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5842 
5843  for( size_t k=kbegin; k<kend; ++k ) {
5844  const SIMDType a1( A.load(i ,k) );
5845  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5846  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5847  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5848  const SIMDType b1( set( B(k,j ) ) );
5849  const SIMDType b2( set( B(k,j+1UL) ) );
5850  xmm1 = xmm1 + a1 * b1;
5851  xmm2 = xmm2 + a2 * b1;
5852  xmm3 = xmm3 + a3 * b1;
5853  xmm4 = xmm4 + a4 * b1;
5854  xmm5 = xmm5 + a1 * b2;
5855  xmm6 = xmm6 + a2 * b2;
5856  xmm7 = xmm7 + a3 * b2;
5857  xmm8 = xmm8 + a4 * b2;
5858  }
5859 
5860  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5861  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
5862  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
5863  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
5864  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
5865  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
5866  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
5867  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
5868  }
5869 
// Last column in case of an odd number of columns
5870  if( j < N )
5871  {
5872  const size_t kbegin( ( IsLower<MT5>::value )
5873  ?( ( IsUpper<MT4>::value )
5874  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5875  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5876  :( IsUpper<MT4>::value ? i : 0UL ) );
// Here the upper bound depends on the IsLower trait of MT4 only
5877  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
5878 
5879  SIMDType xmm1, xmm2, xmm3, xmm4;
5880 
5881  for( size_t k=kbegin; k<kend; ++k ) {
5882  const SIMDType b1( set( B(k,j) ) );
5883  xmm1 = xmm1 + A.load(i ,k) * b1;
5884  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
5885  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
5886  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
5887  }
5888 
5889  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5890  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
5891  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
5892  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
5893  }
5894  }
5895 
// Panels of 2 SIMD vectors, two columns of C per iteration plus a single-column tail
5896  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5897  {
5898  size_t j( 0UL );
5899 
5900  for( ; (j+2UL) <= N; j+=2UL )
5901  {
5902  const size_t kbegin( ( IsLower<MT5>::value )
5903  ?( ( IsUpper<MT4>::value )
5904  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5905  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5906  :( IsUpper<MT4>::value ? i : 0UL ) );
5907  const size_t kend( ( IsUpper<MT5>::value )
5908  ?( ( IsLower<MT4>::value )
5909  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5910  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5911  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
5912 
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1
5913  SIMDType xmm1, xmm2, xmm3, xmm4;
5914 
5915  for( size_t k=kbegin; k<kend; ++k ) {
5916  const SIMDType a1( A.load(i ,k) );
5917  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5918  const SIMDType b1( set( B(k,j ) ) );
5919  const SIMDType b2( set( B(k,j+1UL) ) );
5920  xmm1 = xmm1 + a1 * b1;
5921  xmm2 = xmm2 + a2 * b1;
5922  xmm3 = xmm3 + a1 * b2;
5923  xmm4 = xmm4 + a2 * b2;
5924  }
5925 
5926  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5927  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
5928  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5929  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
5930  }
5931 
// Last column in case of an odd number of columns
5932  if( j < N )
5933  {
5934  const size_t kbegin( ( IsLower<MT5>::value )
5935  ?( ( IsUpper<MT4>::value )
5936  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5937  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5938  :( IsUpper<MT4>::value ? i : 0UL ) );
5939  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
5940 
5941  SIMDType xmm1, xmm2;
5942 
5943  for( size_t k=kbegin; k<kend; ++k ) {
5944  const SIMDType b1( set( B(k,j) ) );
5945  xmm1 = xmm1 + A.load(i ,k) * b1;
5946  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
5947  }
5948 
5949  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5950  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + xmm2 * factor );
5951  }
5952  }
5953 
// Single SIMD vectors, two columns of C per iteration plus a single-column tail
5954  for( ; i<ipos; i+=SIMDSIZE )
5955  {
5956  size_t j( 0UL );
5957 
5958  for( ; (j+2UL) <= N; j+=2UL )
5959  {
5960  const size_t kbegin( ( IsLower<MT5>::value )
5961  ?( ( IsUpper<MT4>::value )
5962  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5963  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5964  :( IsUpper<MT4>::value ? i : 0UL ) );
// Here the upper bound depends on the IsUpper/IsStrictlyUpper traits of MT5 only
5965  const size_t kend( ( IsUpper<MT5>::value )
5966  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5967  :( K ) );
5968 
5969  SIMDType xmm1, xmm2;
5970 
5971  for( size_t k=kbegin; k<kend; ++k ) {
5972  const SIMDType a1( A.load(i,k) );
5973  xmm1 = xmm1 + a1 * set( B(k,j ) );
5974  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
5975  }
5976 
5977  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5978  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
5979  }
5980 
// Last column in case of an odd number of columns; k always runs up to K
5981  if( j < N )
5982  {
5983  const size_t kbegin( ( IsLower<MT5>::value )
5984  ?( ( IsUpper<MT4>::value )
5985  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5986  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5987  :( IsUpper<MT4>::value ? i : 0UL ) );
5988 
5989  SIMDType xmm1;
5990 
5991  for( size_t k=kbegin; k<K; ++k ) {
5992  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
5993  }
5994 
5995  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
5996  }
5997  }
5998 
// Element-wise handling of the rows in [ipos,M); skipped entirely if no remainder is required
5999  for( ; remainder && i<M; ++i )
6000  {
6001  size_t j( 0UL );
6002 
6003  for( ; (j+2UL) <= N; j+=2UL )
6004  {
6005  const size_t kbegin( ( IsLower<MT5>::value )
6006  ?( ( IsUpper<MT4>::value )
6007  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6008  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6009  :( IsUpper<MT4>::value ? i : 0UL ) );
6010  const size_t kend( ( IsUpper<MT5>::value )
6011  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6012  :( K ) );
6013 
6014  ElementType value1 = ElementType();
6015  ElementType value2 = ElementType();
6016 
6017  for( size_t k=kbegin; k<kend; ++k ) {
6018  value1 += A(i,k) * B(k,j );
6019  value2 += A(i,k) * B(k,j+1UL);
6020  }
6021 
// The scalar (not the SIMD factor) is applied in the element-wise path
6022  (~C)(i,j ) += value1 * scalar;
6023  (~C)(i,j+1UL) += value2 * scalar;
6024  }
6025 
// Last column in case of an odd number of columns
6026  if( j < N )
6027  {
6028  const size_t kbegin( ( IsLower<MT5>::value )
6029  ?( ( IsUpper<MT4>::value )
6030  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6031  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6032  :( IsUpper<MT4>::value ? i : 0UL ) );
6033 
6034  ElementType value = ElementType();
6035 
6036  for( size_t k=kbegin; k<K; ++k ) {
6037  value += A(i,k) * B(k,j);
6038  }
6039 
6040  (~C)(i,j) += value * scalar;
6041  }
6042  }
6043  }
6044  //**********************************************************************************************
6045 
6046  //**Default addition assignment to dense matrices (large matrices)******************************
6060  template< typename MT3 // Type of the left-hand side target matrix
6061  , typename MT4 // Type of the left-hand side matrix operand
6062  , typename MT5 // Type of the right-hand side matrix operand
6063  , typename ST2 > // Type of the scalar value
6064  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6065  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6066  {
6067  selectDefaultAddAssignKernel( C, A, B, scalar );
6068  }
6069  //**********************************************************************************************
6070 
6071  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
6086  template< typename MT3 // Type of the left-hand side target matrix
6087  , typename MT4 // Type of the left-hand side matrix operand
6088  , typename MT5 // Type of the right-hand side matrix operand
6089  , typename ST2 > // Type of the scalar value
6090  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6091  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6092  {
6093  selectSmallAddAssignKernel( ~C, A, B, scalar );
6094  }
6095  //**********************************************************************************************
6096 
6097  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
6112  template< typename MT3 // Type of the left-hand side target matrix
6113  , typename MT4 // Type of the left-hand side matrix operand
6114  , typename MT5 // Type of the right-hand side matrix operand
6115  , typename ST2 > // Type of the scalar value
6116  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6117  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6118  {
6119  const size_t M( A.rows() );
6120  const size_t N( B.columns() );
6121  const size_t K( A.columns() );
6122 
6123  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
6124 
6125  const SIMDType factor( set( scalar ) );
6126 
6127  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
6128  {
6129  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
6130 
6131  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
6132  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
6133 
6134  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
6135  {
6136  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
6137 
6138  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
6139  {
6140  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
6141 
6142  size_t i( ii );
6143 
6144  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6145  {
6146  const size_t i1( i+SIMDSIZE );
6147  const size_t i2( i+SIMDSIZE*2UL );
6148  const size_t i3( i+SIMDSIZE*3UL );
6149 
6150  size_t j( jj );
6151 
6152  for( ; (j+2UL) <= jend; j+=2UL )
6153  {
6154  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6155  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6156  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
6157  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
6158 
6159  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6160 
6161  for( size_t k=kbegin; k<kend; ++k ) {
6162  const SIMDType a1( A.load(i ,k) );
6163  const SIMDType a2( A.load(i1,k) );
6164  const SIMDType a3( A.load(i2,k) );
6165  const SIMDType a4( A.load(i3,k) );
6166  const SIMDType b1( set( B(k,j ) ) );
6167  const SIMDType b2( set( B(k,j+1UL) ) );
6168  xmm1 = xmm1 + a1 * b1;
6169  xmm2 = xmm2 + a2 * b1;
6170  xmm3 = xmm3 + a3 * b1;
6171  xmm4 = xmm4 + a4 * b1;
6172  xmm5 = xmm5 + a1 * b2;
6173  xmm6 = xmm6 + a2 * b2;
6174  xmm7 = xmm7 + a3 * b2;
6175  xmm8 = xmm8 + a4 * b2;
6176  }
6177 
6178  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6179  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6180  (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
6181  (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
6182  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
6183  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
6184  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
6185  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
6186  }
6187 
6188  if( j < jend )
6189  {
6190  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6191  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6192  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
6193  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6194 
6195  SIMDType xmm1, xmm2, xmm3, xmm4;
6196 
6197  for( size_t k=kbegin; k<kend; ++k ) {
6198  const SIMDType b1( set( B(k,j) ) );
6199  xmm1 = xmm1 + A.load(i ,k) * b1;
6200  xmm2 = xmm2 + A.load(i1,k) * b1;
6201  xmm3 = xmm3 + A.load(i2,k) * b1;
6202  xmm4 = xmm4 + A.load(i3,k) * b1;
6203  }
6204 
6205  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6206  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
6207  (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
6208  (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
6209  }
6210  }
6211 
6212  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6213  {
6214  const size_t i1( i+SIMDSIZE );
6215 
6216  size_t j( jj );
6217 
6218  for( ; (j+4UL) <= jend; j+=4UL )
6219  {
6220  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6221  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6222  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
6223  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
6224 
6225  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6226 
6227  for( size_t k=kbegin; k<kend; ++k ) {
6228  const SIMDType a1( A.load(i ,k) );
6229  const SIMDType a2( A.load(i1,k) );
6230  const SIMDType b1( set( B(k,j ) ) );
6231  const SIMDType b2( set( B(k,j+1UL) ) );
6232  const SIMDType b3( set( B(k,j+2UL) ) );
6233  const SIMDType b4( set( B(k,j+3UL) ) );
6234  xmm1 = xmm1 + a1 * b1;
6235  xmm2 = xmm2 + a2 * b1;
6236  xmm3 = xmm3 + a1 * b2;
6237  xmm4 = xmm4 + a2 * b2;
6238  xmm5 = xmm5 + a1 * b3;
6239  xmm6 = xmm6 + a2 * b3;
6240  xmm7 = xmm7 + a1 * b4;
6241  xmm8 = xmm8 + a2 * b4;
6242  }
6243 
6244  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6245  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6246  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6247  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
6248  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6249  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
6250  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
6251  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
6252  }
6253 
6254  for( ; (j+2UL) <= jend; j+=2UL )
6255  {
6256  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6257  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6258  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
6259  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
6260 
6261  SIMDType xmm1, xmm2, xmm3, xmm4;
6262 
6263  for( size_t k=kbegin; k<kend; ++k ) {
6264  const SIMDType a1( A.load(i ,k) );
6265  const SIMDType a2( A.load(i1,k) );
6266  const SIMDType b1( set( B(k,j ) ) );
6267  const SIMDType b2( set( B(k,j+1UL) ) );
6268  xmm1 = xmm1 + a1 * b1;
6269  xmm2 = xmm2 + a2 * b1;
6270  xmm3 = xmm3 + a1 * b2;
6271  xmm4 = xmm4 + a2 * b2;
6272  }
6273 
6274  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6275  (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6276  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6277  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
6278  }
6279 
6280  if( j < jend )
6281  {
6282  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6283  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6284  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
6285  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6286 
6287  SIMDType xmm1, xmm2;
6288 
6289  for( size_t k=kbegin; k<kend; ++k ) {
6290  const SIMDType b1( set( B(k,j) ) );
6291  xmm1 = xmm1 + A.load(i ,k) * b1;
6292  xmm2 = xmm2 + A.load(i1,k) * b1;
6293  }
6294 
6295  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6296  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
6297  }
6298  }
6299 
6300  for( ; i<ipos; i+=SIMDSIZE )
6301  {
6302  for( size_t j=jj; j<jend; ++j )
6303  {
6304  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6305  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6306  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
6307  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6308 
6309  SIMDType xmm1;
6310 
6311  for( size_t k=kbegin; k<kend; ++k ) {
6312  const SIMDType b1( set( B(k,j) ) );
6313  xmm1 = xmm1 + A.load(i,k) * b1;
6314  }
6315 
6316  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
6317  }
6318  }
6319 
6320  for( ; remainder && i<iend; ++i )
6321  {
6322  for( size_t j=jj; j<jend; ++j )
6323  {
6324  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
6325  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
6326  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
6327  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6328 
6329  ElementType value = ElementType();
6330 
6331  for( size_t k=kbegin; k<kend; ++k ) {
6332  value += A(i,k) * B(k,j);
6333  }
6334 
6335  (~C)(i,j) += value * scalar;
6336  }
6337  }
6338  }
6339  }
6340  }
6341  }
6342  //**********************************************************************************************
6343 
6344  //**BLAS-based addition assignment to dense matrices (default)**********************************
      /*!\brief Fallback for the BLAS-based addition assignment of a scaled matrix product.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload takes part in overload resolution only when UseBlasKernel<MT3,MT4,MT5,ST2>
      // does not hold (DisableIf_). It issues no BLAS call and forwards all arguments unchanged
      // to selectLargeAddAssignKernel().
      */
6359  template< typename MT3 // Type of the left-hand side target matrix
6360  , typename MT4 // Type of the left-hand side matrix operand
6361  , typename MT5 // Type of the right-hand side matrix operand
6362  , typename ST2 > // Type of the scalar value
6363  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6364  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6365  {
6366  selectLargeAddAssignKernel( C, A, B, scalar );
6367  }
6368  //**********************************************************************************************
6369 
6370  //**BLAS-based addition assignment to dense matrices********************************************
      // The overload below is compiled only if both BLAZE_BLAS_MODE and
      // BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to non-zero.
6371 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6372 
      /*!\brief BLAS-based addition assignment of a scaled matrix product.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload takes part in overload resolution only when UseBlasKernel<MT3,MT4,MT5,ST2>
      // holds (EnableIf_). The scalar is converted to the element type of the target matrix
      // before it is handed to the BLAS wrappers. Depending on the operand types one of three
      // paths is taken:
      //  - A is triangular: B is copied into a temporary, trmm() is applied to that temporary
      //    with A on the left side, and the temporary is added to C.
      //  - otherwise, B is triangular: A is copied into a temporary, trmm() is applied to that
      //    temporary with B on the right side, and the temporary is added to C.
      //  - otherwise: a single gemm() call operates directly on C.
      */
6385  template< typename MT3 // Type of the left-hand side target matrix
6386  , typename MT4 // Type of the left-hand side matrix operand
6387  , typename MT5 // Type of the right-hand side matrix operand
6388  , typename ST2 > // Type of the scalar value
6389  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6390  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6391  {
6392  typedef ElementType_<MT3> ET;
6393 
      // Triangular left operand: trmm() works on a copy of B so that C is only touched by the
      // final addAssign(). NOTE(review): presumably trmm() computes tmp = scalar*A*tmp in place;
      // confirm against the wrapper in blaze/math/blas/trmm.h.
6394  if( IsTriangular<MT4>::value ) {
6395  ResultType_<MT3> tmp( serial( B ) );
6396  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6397  addAssign( C, tmp );
6398  }
      // Triangular right operand: same scheme with the roles of A and B exchanged (CblasRight).
6399  else if( IsTriangular<MT5>::value ) {
6400  ResultType_<MT3> tmp( serial( A ) );
6401  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6402  addAssign( C, tmp );
6403  }
      // General case: the last argument ET(1) is presumably the factor applied to the existing
      // content of C (i.e. C = scalar*A*B + 1*C); confirm against blaze/math/blas/gemm.h.
6404  else {
6405  gemm( C, A, B, ET(scalar), ET(1) );
6406  }
6407  }
6408 #endif
6409  //**********************************************************************************************
6410 
6411  //**Restructuring addition assignment to row-major matrices*************************************
      /*!\brief Restructuring addition assignment of a scaled matrix product to a row-major matrix.
      //
      // \param lhs The target left-hand side matrix (row-major).
      // \param rhs The right-hand side scaled multiplication expression to be added.
      // \return void
      //
      // This overload takes part in overload resolution only when CanExploitSymmetry<MT,MT1,MT2>
      // holds (EnableIf_). It rewrites the product by wrapping operands in trans() and forwards
      // the restructured, still scaled expression to addAssign() again:
      //  - both MT1 and MT2 symmetric: trans(left) * trans(right)
      //  - only MT1 symmetric:         trans(left) * right
      //  - otherwise:                  left * trans(right)
      // NOTE(review): the last branch presumably relies on CanExploitSymmetry guaranteeing that
      // MT2 is symmetric whenever MT1 is not; that trait is defined outside this section.
      */
6426  template< typename MT > // Type of the target matrix
6427  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
6428  addAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
6429  {
6431 
6433 
6434  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6435  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6436 
      // Operands of the unscaled multiplication expression wrapped by this scaled expression
6437  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6438  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6439 
6440  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
6441  addAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
6442  else if( IsSymmetric<MT1>::value )
6443  addAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
6444  else
6445  addAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
6446  }
6447  //**********************************************************************************************
6448 
6449  //**Addition assignment to sparse matrices******************************************************
6450  // No special implementation for the addition assignment to sparse matrices.
6451  //**********************************************************************************************
6452 
6453  //**Subtraction assignment to dense matrices****************************************************
      /*!\brief Subtraction assignment of a scaled matrix product to a dense matrix.
      //
      // \param lhs The target left-hand side dense matrix.
      // \param rhs The right-hand side scaled multiplication expression to be subtracted.
      // \return void
      //
      // This overload takes part in overload resolution only when CanExploitSymmetry<MT,MT1,MT2>
      // does not hold (DisableIf_). Both operands of the wrapped multiplication are evaluated
      // via serial() into objects of type LT and RT, and the actual computation is delegated to
      // selectSubAssignKernel() together with the scalar of this expression.
      */
6465  template< typename MT // Type of the target dense matrix
6466  , bool SO > // Storage order of the target dense matrix
6467  friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
6468  subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6469  {
6471 
6472  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6473  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6474 
6475  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6476  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6477 
      // Nothing to subtract if the target is empty or the inner dimension of the product is zero
6478  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6479  return;
6480  }
6481 
6482  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6483  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6484 
      // The evaluated operands must have the same dimensions as the expressions they came from
      // and must fit the target matrix
6485  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6486  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6487  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6488  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6489  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6490  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6491 
6492  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
6493  }
6494  //**********************************************************************************************
6495 
6496  //**Subtraction assignment to dense matrices (kernel selection)*********************************
      /*!\brief Selection of the kernel for a subtraction assignment of a scaled matrix product.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // The small-matrix kernel is used if the left operand type is diagonal or if the number of
      // elements of C (rows times columns) is below TDMATTDMATMULT_THRESHOLD. In all other cases
      // the call is forwarded to selectBlasSubAssignKernel().
      */
6507  template< typename MT3 // Type of the left-hand side target matrix
6508  , typename MT4 // Type of the left-hand side matrix operand
6509  , typename MT5 // Type of the right-hand side matrix operand
6510  , typename ST2 > // Type of the scalar value
6511  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6512  {
6513  if( ( IsDiagonal<MT4>::value ) ||
6514  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6515  selectSmallSubAssignKernel( C, A, B, scalar );
6516  else
6517  selectBlasSubAssignKernel( C, A, B, scalar );
6518  }
6519  //**********************************************************************************************
6520 
6521  //**Default subtraction assignment to dense matrices (general/general)**************************
      /*!\brief Default subtraction assignment of a scaled matrix product (no diagonal operand).
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload is enabled only if neither MT4 nor MT5 is a diagonal matrix type. The
      // scaled product A*B*scalar is first evaluated via serial() into a temporary of type
      // ResultType, and that temporary is then subtracted from C with subAssign().
      */
6535  template< typename MT3 // Type of the left-hand side target matrix
6536  , typename MT4 // Type of the left-hand side matrix operand
6537  , typename MT5 // Type of the right-hand side matrix operand
6538  , typename ST2 > // Type of the scalar value
6539  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6540  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6541  {
6542  const ResultType tmp( serial( A * B * scalar ) );
6543  subAssign( C, tmp );
6544  }
6545  //**********************************************************************************************
6546 
6547  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
      /*!\brief Default subtraction assignment of a scaled matrix product (diagonal right operand).
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand (diagonal).
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload is enabled only if MT4 is not diagonal and MT5 is diagonal. Only the
      // diagonal element B(j,j) is read from B, so every element of column j is updated as
      // C(i,j) -= A(i,j) * B(j,j) * scalar.
      */
6561  template< typename MT3 // Type of the left-hand side target matrix
6562  , typename MT4 // Type of the left-hand side matrix operand
6563  , typename MT5 // Type of the right-hand side matrix operand
6564  , typename ST2 > // Type of the scalar value
6565  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6566  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6567  {
6569 
6570  const size_t M( A.rows() );
6571  const size_t N( B.columns() );
6572 
6573  for( size_t j=0UL; j<N; ++j )
6574  {
      // Row range [ibegin,iend) of column j: narrowed according to the lower/upper (and strictly
      // lower/upper) traits of A, otherwise all rows [0,M) are processed.
6575  const size_t ibegin( ( IsLower<MT4>::value )
6576  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6577  :( 0UL ) );
6578  const size_t iend( ( IsUpper<MT4>::value )
6579  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
6580  :( M ) );
6581  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6582 
      // ipos ends the part of the range that is handled two rows at a time: the row count is
      // rounded down to an even number by masking with size_t(-2).
6583  const size_t inum( iend - ibegin );
6584  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6585 
6586  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6587  C(i ,j) -= A(i ,j) * B(j,j) * scalar;
6588  C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
6589  }
      // Single leftover row in case of an odd row count
6590  if( ipos < iend ) {
6591  C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
6592  }
6593  }
6594  }
6595  //**********************************************************************************************
6596 
6597  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
      /*!\brief Default subtraction assignment of a scaled matrix product (diagonal left operand).
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand (diagonal).
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload is enabled only if MT4 is diagonal and MT5 is not diagonal. Only the
      // diagonal element A(i,i) is read from A, so every element is updated as
      // C(i,j) -= A(i,i) * B(i,j) * scalar.
      */
6611  template< typename MT3 // Type of the left-hand side target matrix
6612  , typename MT4 // Type of the left-hand side matrix operand
6613  , typename MT5 // Type of the right-hand side matrix operand
6614  , typename ST2 > // Type of the scalar value
6615  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6616  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6617  {
6619 
6620  const size_t M( A.rows() );
6621  const size_t N( B.columns() );
6622 
6623  for( size_t j=0UL; j<N; ++j )
6624  {
      // Row range [ibegin,iend) of column j: narrowed according to the lower/upper (and strictly
      // lower/upper) traits of B, otherwise all rows [0,M) are processed.
6625  const size_t ibegin( ( IsLower<MT5>::value )
6626  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6627  :( 0UL ) );
6628  const size_t iend( ( IsUpper<MT5>::value )
6629  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
6630  :( M ) );
6631  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6632 
      // ipos ends the part of the range that is handled two rows at a time: the row count is
      // rounded down to an even number by masking with size_t(-2).
6633  const size_t inum( iend - ibegin );
6634  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6635 
6636  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6637  C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
6638  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6639  }
      // Single leftover row in case of an odd row count
6640  if( ipos < iend ) {
6641  C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
6642  }
6643  }
6644  }
6645  //**********************************************************************************************
6646 
6647  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
      /*!\brief Default subtraction assignment of a scaled matrix product (two diagonal operands).
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand (diagonal).
      // \param B The right-hand side multiplication operand (diagonal).
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload is enabled only if both MT4 and MT5 are diagonal matrix types. Only the
      // diagonal of C is modified: C(i,i) -= A(i,i) * B(i,i) * scalar for every row i of A.
      */
6661  template< typename MT3 // Type of the left-hand side target matrix
6662  , typename MT4 // Type of the left-hand side matrix operand
6663  , typename MT5 // Type of the right-hand side matrix operand
6664  , typename ST2 > // Type of the scalar value
6665  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6666  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6667  {
6669 
6670  for( size_t i=0UL; i<A.rows(); ++i ) {
6671  C(i,i) -= A(i,i) * B(i,i) * scalar;
6672  }
6673  }
6674  //**********************************************************************************************
6675 
6676  //**Default subtraction assignment to dense matrices (small matrices)***************************
      /*!\brief Non-vectorized subtraction assignment of a scaled matrix product (small matrices).
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload takes part in overload resolution only when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_). It forwards all
      // arguments unchanged to selectDefaultSubAssignKernel().
      */
6690  template< typename MT3 // Type of the left-hand side target matrix
6691  , typename MT4 // Type of the left-hand side matrix operand
6692  , typename MT5 // Type of the right-hand side matrix operand
6693  , typename ST2 > // Type of the scalar value
6694  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6695  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6696  {
6697  selectDefaultSubAssignKernel( C, A, B, scalar );
6698  }
6699  //**********************************************************************************************
6700 
6701  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
      /*!\brief Vectorized subtraction assignment of a scaled matrix product to a row-major
      //        dense matrix (small matrices).
      //
      // \param C The target left-hand side dense matrix (row-major).
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload takes part in overload resolution only when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf_). It contains no explicit
      // kernel: exactly one operand is copied via serial() into a temporary of its opposite
      // storage order type (constrained below to be row-major), and the resulting scaled
      // product expression is passed to subAssign(). The operand to convert is chosen as
      // follows:
      //  - MT4 resizable and MT5 not resizable: B is converted
      //  - MT4 not resizable and MT5 resizable: A is converted
      //  - otherwise the operand with fewer elements is converted (B in case of a tie)
      */
6716  template< typename MT3 // Type of the left-hand side target matrix
6717  , typename MT4 // Type of the left-hand side matrix operand
6718  , typename MT5 // Type of the right-hand side matrix operand
6719  , typename ST2 > // Type of the scalar value
6720  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6721  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6722  {
6725  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT4> );
6726  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_<MT5> );
6727 
6728  if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
6729  const OppositeType_<MT5> tmp( serial( B ) );
6730  subAssign( ~C, A * tmp * scalar );
6731  }
6732  else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
6733  const OppositeType_<MT4> tmp( serial( A ) );
6734  subAssign( ~C, tmp * B * scalar );
6735  }
      // Both or neither operand resizable: decide by the number of elements
6736  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6737  const OppositeType_<MT5> tmp( serial( B ) );
6738  subAssign( ~C, A * tmp * scalar );
6739  }
6740  else {
6741  const OppositeType_<MT4> tmp( serial( A ) );
6742  subAssign( ~C, tmp * B * scalar );
6743  }
6744  }
6745  //**********************************************************************************************
6746 
6747  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
      /*!\brief Vectorized subtraction assignment of a scaled matrix product to a column-major
      //        dense matrix (small matrices).
      //
      // \param C The target left-hand side dense matrix (column-major).
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload takes part in overload resolution only when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf_). The rows of C are
      // processed in panels of decreasing height: 8, 4, 2 and 1 SIMD vectors per column,
      // followed by a scalar loop over rows that do not fill a whole SIMD vector. Within a
      // panel the products A(i..,k)*B(k,j) are accumulated over k in SIMD variables and the
      // accumulated sum, multiplied by the broadcast scalar, is subtracted from the loaded
      // elements of C and stored back. The range [kbegin,kend) of k is narrowed at compile time
      // according to the lower/upper (and strictly lower/upper) traits of MT4 and MT5.
      //
      // NOTE(review): the SIMD accumulators (xmm1, xmm2, ...) are declared without initializer;
      // the kernel relies on SIMDType's default constructor to provide the start value of the
      // sum, which is presumably zero. Confirm against the SIMD type definitions.
      */
6762  template< typename MT3 // Type of the left-hand side target matrix
6763  , typename MT4 // Type of the left-hand side matrix operand
6764  , typename MT5 // Type of the right-hand side matrix operand
6765  , typename ST2 > // Type of the scalar value
6766  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6767  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6768  {
6769  const size_t M( A.rows() );
6770  const size_t N( B.columns() );
6771  const size_t K( A.columns() );
6772 
      // The scalar remainder loop at the end is required unless both C and A are padded
6773  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
6774 
      // End of the row range handled by SIMD panels: M rounded down to a multiple of SIMDSIZE
      // if a remainder loop exists, else M itself
6775  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
6776  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
6777 
      // SIMD vector created from the scaling factor via set()
6778  const SIMDType factor( set( scalar ) );
6779 
6780  size_t i( 0UL );
6781 
      // Panels of 8 SIMD vectors (rows i .. i+SIMDSIZE*8-1), one column of C at a time
6782  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6783  for( size_t j=0UL; j<N; ++j )
6784  {
6785  const size_t kbegin( ( IsLower<MT5>::value )
6786  ?( ( IsUpper<MT4>::value )
6787  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6788  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6789  :( IsUpper<MT4>::value ? i : 0UL ) );
6790  const size_t kend( ( IsUpper<MT5>::value )
6791  ?( ( IsLower<MT4>::value )
6792  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6793  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6794  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
6795 
6796  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6797 
6798  for( size_t k=kbegin; k<kend; ++k ) {
6799  const SIMDType b1( set( B(k,j) ) );
6800  xmm1 = xmm1 + A.load(i ,k) * b1;
6801  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
6802  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
6803  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
6804  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
6805  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
6806  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
6807  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
6808  }
6809 
6810  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6811  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
6812  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
6813  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
6814  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
6815  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
6816  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
6817  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
6818  }
6819  }
6820 
      // Panels of 4 SIMD vectors, two columns of C at a time (plus one trailing column)
6821  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6822  {
6823  size_t j( 0UL );
6824 
6825  for( ; (j+2UL) <= N; j+=2UL )
6826  {
6827  const size_t kbegin( ( IsLower<MT5>::value )
6828  ?( ( IsUpper<MT4>::value )
6829  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6830  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6831  :( IsUpper<MT4>::value ? i : 0UL ) );
6832  const size_t kend( ( IsUpper<MT5>::value )
6833  ?( ( IsLower<MT4>::value )
6834  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6835  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6836  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
6837 
      // xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1
6838  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6839 
6840  for( size_t k=kbegin; k<kend; ++k ) {
6841  const SIMDType a1( A.load(i ,k) );
6842  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6843  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6844  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6845  const SIMDType b1( set( B(k,j ) ) );
6846  const SIMDType b2( set( B(k,j+1UL) ) );
6847  xmm1 = xmm1 + a1 * b1;
6848  xmm2 = xmm2 + a2 * b1;
6849  xmm3 = xmm3 + a3 * b1;
6850  xmm4 = xmm4 + a4 * b1;
6851  xmm5 = xmm5 + a1 * b2;
6852  xmm6 = xmm6 + a2 * b2;
6853  xmm7 = xmm7 + a3 * b2;
6854  xmm8 = xmm8 + a4 * b2;
6855  }
6856 
6857  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6858  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
6859  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
6860  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
6861  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
6862  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
6863  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
6864  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
6865  }
6866 
      // Trailing single column (odd N); here kend is not narrowed by the traits of B
6867  if( j < N )
6868  {
6869  const size_t kbegin( ( IsLower<MT5>::value )
6870  ?( ( IsUpper<MT4>::value )
6871  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6872  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6873  :( IsUpper<MT4>::value ? i : 0UL ) );
6874  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
6875 
6876  SIMDType xmm1, xmm2, xmm3, xmm4;
6877 
6878  for( size_t k=kbegin; k<kend; ++k ) {
6879  const SIMDType b1( set( B(k,j) ) );
6880  xmm1 = xmm1 + A.load(i ,k) * b1;
6881  xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
6882  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
6883  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
6884  }
6885 
6886  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6887  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
6888  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
6889  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
6890  }
6891  }
6892 
      // Panels of 2 SIMD vectors, two columns of C at a time (plus one trailing column)
6893  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6894  {
6895  size_t j( 0UL );
6896 
6897  for( ; (j+2UL) <= N; j+=2UL )
6898  {
6899  const size_t kbegin( ( IsLower<MT5>::value )
6900  ?( ( IsUpper<MT4>::value )
6901  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6902  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6903  :( IsUpper<MT4>::value ? i : 0UL ) );
6904  const size_t kend( ( IsUpper<MT5>::value )
6905  ?( ( IsLower<MT4>::value )
6906  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6907  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6908  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6909 
6910  SIMDType xmm1, xmm2, xmm3, xmm4;
6911 
6912  for( size_t k=kbegin; k<kend; ++k ) {
6913  const SIMDType a1( A.load(i ,k) );
6914  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6915  const SIMDType b1( set( B(k,j ) ) );
6916  const SIMDType b2( set( B(k,j+1UL) ) );
6917  xmm1 = xmm1 + a1 * b1;
6918  xmm2 = xmm2 + a2 * b1;
6919  xmm3 = xmm3 + a1 * b2;
6920  xmm4 = xmm4 + a2 * b2;
6921  }
6922 
6923  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6924  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
6925  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
6926  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
6927  }
6928 
      // Trailing single column (odd N); here kend is not narrowed by the traits of B
6929  if( j < N )
6930  {
6931  const size_t kbegin( ( IsLower<MT5>::value )
6932  ?( ( IsUpper<MT4>::value )
6933  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6934  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6935  :( IsUpper<MT4>::value ? i : 0UL ) );
6936  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6937 
6938  SIMDType xmm1, xmm2;
6939 
6940  for( size_t k=kbegin; k<kend; ++k ) {
6941  const SIMDType b1( set( B(k,j) ) );
6942  xmm1 = xmm1 + A.load(i ,k) * b1;
6943  xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
6944  }
6945 
6946  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6947  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - xmm2 * factor );
6948  }
6949  }
6950 
      // Remaining single SIMD vectors, two columns of C at a time (plus one trailing column);
      // in this section kend depends only on the traits of B
6951  for( ; i<ipos; i+=SIMDSIZE )
6952  {
6953  size_t j( 0UL );
6954 
6955  for( ; (j+2UL) <= N; j+=2UL )
6956  {
6957  const size_t kbegin( ( IsLower<MT5>::value )
6958  ?( ( IsUpper<MT4>::value )
6959  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6960  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6961  :( IsUpper<MT4>::value ? i : 0UL ) );
6962  const size_t kend( ( IsUpper<MT5>::value )
6963  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6964  :( K ) );
6965 
6966  SIMDType xmm1, xmm2;
6967 
6968  for( size_t k=kbegin; k<kend; ++k ) {
6969  const SIMDType a1( A.load(i,k) );
6970  xmm1 = xmm1 + a1 * set( B(k,j ) );
6971  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
6972  }
6973 
6974  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6975  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
6976  }
6977 
      // Trailing single column (odd N); k always runs up to K
6978  if( j < N )
6979  {
6980  const size_t kbegin( ( IsLower<MT5>::value )
6981  ?( ( IsUpper<MT4>::value )
6982  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6983  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6984  :( IsUpper<MT4>::value ? i : 0UL ) );
6985 
6986  SIMDType xmm1;
6987 
6988  for( size_t k=kbegin; k<K; ++k ) {
6989  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
6990  }
6991 
6992  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
6993  }
6994  }
6995 
      // Scalar loop over the rows [ipos,M) that do not fill a SIMD vector; only entered if at
      // least one of C and A is not padded. Uses element access and the plain scalar instead of
      // load()/store() and the broadcast factor.
6996  for( ; remainder && i<M; ++i )
6997  {
6998  size_t j( 0UL );
6999 
7000  for( ; (j+2UL) <= N; j+=2UL )
7001  {
7002  const size_t kbegin( ( IsLower<MT5>::value )
7003  ?( ( IsUpper<MT4>::value )
7004  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7005  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7006  :( IsUpper<MT4>::value ? i : 0UL ) );
7007  const size_t kend( ( IsUpper<MT5>::value )
7008  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7009  :( K ) );
7010 
7011  ElementType value1 = ElementType();
7012  ElementType value2 = ElementType();
7013 
7014  for( size_t k=kbegin; k<kend; ++k ) {
7015  value1 += A(i,k) * B(k,j );
7016  value2 += A(i,k) * B(k,j+1UL);
7017  }
7018 
7019  (~C)(i,j ) -= value1 * scalar;
7020  (~C)(i,j+1UL) -= value2 * scalar;
7021  }
7022 
      // Trailing single column (odd N); k always runs up to K
7023  if( j < N )
7024  {
7025  const size_t kbegin( ( IsLower<MT5>::value )
7026  ?( ( IsUpper<MT4>::value )
7027  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7028  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7029  :( IsUpper<MT4>::value ? i : 0UL ) );
7030 
7031  ElementType value = ElementType();
7032 
7033  for( size_t k=kbegin; k<K; ++k ) {
7034  value += A(i,k) * B(k,j);
7035  }
7036 
7037  (~C)(i,j) -= value * scalar;
7038  }
7039  }
7040  }
7041  //**********************************************************************************************
7042 
7043  //**Default subtraction assignment to dense matrices (large matrices)***************************
      /*!\brief Non-vectorized subtraction assignment of a scaled matrix product (large matrices).
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload takes part in overload resolution only when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_). It forwards all
      // arguments unchanged to selectDefaultSubAssignKernel().
      */
7057  template< typename MT3 // Type of the left-hand side target matrix
7058  , typename MT4 // Type of the left-hand side matrix operand
7059  , typename MT5 // Type of the right-hand side matrix operand
7060  , typename ST2 > // Type of the scalar value
7061  static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7062  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7063  {
7064  selectDefaultSubAssignKernel( C, A, B, scalar );
7065  }
7066  //**********************************************************************************************
7067 
7068  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
      /*!\brief Vectorized subtraction assignment of a scaled matrix product to a row-major
      //        dense matrix (large matrices).
      //
      // \param C The target left-hand side dense matrix (row-major).
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // This overload takes part in overload resolution only when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf_). There is no dedicated
      // large-matrix kernel for row-major targets: the call is forwarded to
      // selectSmallSubAssignKernel() with the unwrapped target matrix (~C).
      */
7083  template< typename MT3 // Type of the left-hand side target matrix
7084  , typename MT4 // Type of the left-hand side matrix operand
7085  , typename MT5 // Type of the right-hand side matrix operand
7086  , typename ST2 > // Type of the scalar value
7087  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7088  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7089  {
7090  selectSmallSubAssignKernel( ~C, A, B, scalar );
7091  }
7092  //**********************************************************************************************
7093 
7094  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
/*!\brief Vectorized default subtraction assignment kernel for large matrices; subtracts the
//        scaled product \c scalar*A*B from the column-major target \c C.
//
// \param C The target left-hand side dense matrix; updated in place via load()/store().
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor applied to the product before it is subtracted.
// \return void
//
// The iteration space is tiled with TDMATTDMATMULT_IBLOCK_SIZE (rows), TDMATTDMATMULT_JBLOCK_SIZE
// (columns) and TDMATTDMATMULT_KBLOCK_SIZE (inner dimension). Within a tile the rows are processed
// in SIMD packs of SIMDSIZE elements: first 4 packs x 2 columns (plus a single trailing column),
// then 2 packs x 4, 2 and 1 columns, then 1 pack x 1 column, and finally an element-wise loop for
// the rows that do not fill a complete pack. Since every k-tile subtracts its partial sums from
// \c C directly, the contributions of all k-tiles accumulate in the target.
*/
7109  template< typename MT3 // Type of the left-hand side target matrix
7110  , typename MT4 // Type of the left-hand side matrix operand
7111  , typename MT5 // Type of the right-hand side matrix operand
7112  , typename ST2 > // Type of the scalar value
7113  static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7114  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7115  {
7116  const size_t M( A.rows() );
7117  const size_t N( B.columns() );
7118  const size_t K( A.columns() );
7119 
// An element-wise remainder loop is required unless both the target and the left operand are padded.
7120  const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
7121 
// SIMD pack created from the scalar via set(); multiplied onto every accumulated partial sum.
7122  const SIMDType factor( set( scalar ) );
7123 
7124  for( size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
7125  {
7126  const size_t iend( min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
7127 
// End (exclusive) of the rows handled by the SIMD loops: iend rounded down to a multiple of
// SIMDSIZE when a remainder loop is required (verified by the assertion), iend itself otherwise.
7128  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
7129  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
7130 
7131  for( size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
7132  {
7133  const size_t jend( min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
7134 
7135  for( size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
7136  {
7137  const size_t ktmp( min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
7138 
7139  size_t i( ii );
7140 
// Four SIMD packs of rows per iteration.
7141  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7142  {
7143  const size_t i1( i+SIMDSIZE );
7144  const size_t i2( i+SIMDSIZE*2UL );
7145  const size_t i3( i+SIMDSIZE*3UL );
7146 
7147  size_t j( jj );
7148 
// Two columns per iteration.
7149  for( ; (j+2UL) <= jend; j+=2UL )
7150  {
// The k-range of the current tile is narrowed according to the compile-time structure of the
// operands: an upper A starts at row index i, a lower B at column index j; a lower A ends after
// the last row of the current pack group, an upper B after the last processed column.
// NOTE(review): presumably this skips elements known to be zero for triangular operands - confirm.
7151  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7152  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7153  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
7154  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7155 
// Accumulators for the 4x2 block of partial sums.
// NOTE(review): relies on SIMDType's default construction to start the sums at zero - confirm.
7156  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7157 
7158  for( size_t k=kbegin; k<kend; ++k ) {
7159  const SIMDType a1( A.load(i ,k) );
7160  const SIMDType a2( A.load(i1,k) );
7161  const SIMDType a3( A.load(i2,k) );
7162  const SIMDType a4( A.load(i3,k) );
7163  const SIMDType b1( set( B(k,j ) ) );
7164  const SIMDType b2( set( B(k,j+1UL) ) );
7165  xmm1 = xmm1 + a1 * b1;
7166  xmm2 = xmm2 + a2 * b1;
7167  xmm3 = xmm3 + a3 * b1;
7168  xmm4 = xmm4 + a4 * b1;
7169  xmm5 = xmm5 + a1 * b2;
7170  xmm6 = xmm6 + a2 * b2;
7171  xmm7 = xmm7 + a3 * b2;
7172  xmm8 = xmm8 + a4 * b2;
7173  }
7174 
// Subtract the scaled partial sums from the target.
7175  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7176  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7177  (~C).store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
7178  (~C).store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
7179  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
7180  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
7181  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
7182  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
7183  }
7184 
// Single trailing column of the tile (odd number of columns).
7185  if( j < jend )
7186  {
7187  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7188  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7189  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
7190  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7191 
7192  SIMDType xmm1, xmm2, xmm3, xmm4;
7193 
7194  for( size_t k=kbegin; k<kend; ++k ) {
7195  const SIMDType b1( set( B(k,j) ) );
7196  xmm1 = xmm1 + A.load(i ,k) * b1;
7197  xmm2 = xmm2 + A.load(i1,k) * b1;
7198  xmm3 = xmm3 + A.load(i2,k) * b1;
7199  xmm4 = xmm4 + A.load(i3,k) * b1;
7200  }
7201 
7202  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7203  (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
7204  (~C).store( i2, j, (~C).load(i2,j) - xmm3 * factor );
7205  (~C).store( i3, j, (~C).load(i3,j) - xmm4 * factor );
7206  }
7207  }
7208 
// Two SIMD packs of rows per iteration.
7209  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7210  {
7211  const size_t i1( i+SIMDSIZE );
7212 
7213  size_t j( jj );
7214 
// Four columns per iteration.
7215  for( ; (j+4UL) <= jend; j+=4UL )
7216  {
7217  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7218  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7219  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7220  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7221 
7222  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7223 
7224  for( size_t k=kbegin; k<kend; ++k ) {
7225  const SIMDType a1( A.load(i ,k) );
7226  const SIMDType a2( A.load(i1,k) );
7227  const SIMDType b1( set( B(k,j ) ) );
7228  const SIMDType b2( set( B(k,j+1UL) ) );
7229  const SIMDType b3( set( B(k,j+2UL) ) );
7230  const SIMDType b4( set( B(k,j+3UL) ) );
7231  xmm1 = xmm1 + a1 * b1;
7232  xmm2 = xmm2 + a2 * b1;
7233  xmm3 = xmm3 + a1 * b2;
7234  xmm4 = xmm4 + a2 * b2;
7235  xmm5 = xmm5 + a1 * b3;
7236  xmm6 = xmm6 + a2 * b3;
7237  xmm7 = xmm7 + a1 * b4;
7238  xmm8 = xmm8 + a2 * b4;
7239  }
7240 
7241  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7242  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7243  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7244  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
7245  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
7246  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
7247  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
7248  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
7249  }
7250 
// Two columns per iteration.
7251  for( ; (j+2UL) <= jend; j+=2UL )
7252  {
7253  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7254  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7255  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7256  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7257 
7258  SIMDType xmm1, xmm2, xmm3, xmm4;
7259 
7260  for( size_t k=kbegin; k<kend; ++k ) {
7261  const SIMDType a1( A.load(i ,k) );
7262  const SIMDType a2( A.load(i1,k) );
7263  const SIMDType b1( set( B(k,j ) ) );
7264  const SIMDType b2( set( B(k,j+1UL) ) );
7265  xmm1 = xmm1 + a1 * b1;
7266  xmm2 = xmm2 + a2 * b1;
7267  xmm3 = xmm3 + a1 * b2;
7268  xmm4 = xmm4 + a2 * b2;
7269  }
7270 
7271  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7272  (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7273  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7274  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
7275  }
7276 
// Single trailing column of the tile.
7277  if( j < jend )
7278  {
7279  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7280  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7281  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7282  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7283 
7284  SIMDType xmm1, xmm2;
7285 
7286  for( size_t k=kbegin; k<kend; ++k ) {
7287  const SIMDType b1( set( B(k,j) ) );
7288  xmm1 = xmm1 + A.load(i ,k) * b1;
7289  xmm2 = xmm2 + A.load(i1,k) * b1;
7290  }
7291 
7292  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7293  (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
7294  }
7295  }
7296 
// One SIMD pack of rows, one column at a time.
7297  for( ; i<ipos; i+=SIMDSIZE )
7298  {
7299  for( size_t j=jj; j<jend; ++j )
7300  {
7301  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7302  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7303  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+SIMDSIZE, ktmp ) ):( ktmp ),
7304  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7305 
7306  SIMDType xmm1;
7307 
7308  for( size_t k=kbegin; k<kend; ++k ) {
7309  const SIMDType b1( set( B(k,j) ) );
7310  xmm1 = xmm1 + A.load(i,k) * b1;
7311  }
7312 
7313  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
7314  }
7315  }
7316 
// Element-wise processing of the rows [ipos,iend) that do not fill a complete SIMD pack; only
// entered when the remainder flag is set.
7317  for( ; remainder && i<iend; ++i )
7318  {
7319  for( size_t j=jj; j<jend; ++j )
7320  {
7321  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7322  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7323  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+1UL, ktmp ) ):( ktmp ),
7324  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7325 
7326  ElementType value = ElementType();
7327 
7328  for( size_t k=kbegin; k<kend; ++k ) {
7329  value += A(i,k) * B(k,j);
7330  }
7331 
7332  (~C)(i,j) -= value * scalar;
7333  }
7334  }
7335  }
7336  }
7337  }
7338  }
7339  //**********************************************************************************************
7340 
7341  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
7356  template< typename MT3 // Type of the left-hand side target matrix
7357  , typename MT4 // Type of the left-hand side matrix operand
7358  , typename MT5 // Type of the right-hand side matrix operand
7359  , typename ST2 > // Type of the scalar value
7360  static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7361  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7362  {
7363  selectLargeSubAssignKernel( C, A, B, scalar );
7364  }
7365  //**********************************************************************************************
7366 
7367  //**BLAS-based subtraction assignment to dense matrices*****************************************
7368 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7369 
7382  template< typename MT3 // Type of the left-hand side target matrix
7383  , typename MT4 // Type of the left-hand side matrix operand
7384  , typename MT5 // Type of the right-hand side matrix operand
7385  , typename ST2 > // Type of the scalar value
7386  static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7387  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7388  {
7389  typedef ElementType_<MT3> ET;
7390 
7391  if( IsTriangular<MT4>::value ) {
7392  ResultType_<MT3> tmp( serial( B ) );
7393  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7394  subAssign( C, tmp );
7395  }
7396  else if( IsTriangular<MT5>::value ) {
7397  ResultType_<MT3> tmp( serial( A ) );
7398  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7399  subAssign( C, tmp );
7400  }
7401  else {
7402  gemm( C, A, B, ET(-scalar), ET(1) );
7403  }
7404  }
7405 #endif
7406  //**********************************************************************************************
7407 
7408  //**Restructuring subtraction assignment to row-major matrices**********************************
7422  template< typename MT > // Type of the target matrix
7423  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7424  subAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7425  {
7427 
7429 
7430  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7431  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7432 
7433  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7434  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7435 
7436  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7437  subAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7438  else if( IsSymmetric<MT1>::value )
7439  subAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7440  else
7441  subAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7442  }
7443  //**********************************************************************************************
7444 
7445  //**Subtraction assignment to sparse matrices***************************************************
7446  // No special implementation for the subtraction assignment to sparse matrices.
7447  //**********************************************************************************************
7448 
7449  //**Multiplication assignment to dense matrices*************************************************
7450  // No special implementation for the multiplication assignment to dense matrices.
7451  //**********************************************************************************************
7452 
7453  //**Multiplication assignment to sparse matrices************************************************
7454  // No special implementation for the multiplication assignment to sparse matrices.
7455  //**********************************************************************************************
7456 
7457  //**SMP assignment to dense matrices************************************************************
7472  template< typename MT // Type of the target dense matrix
7473  , bool SO > // Storage order of the target dense matrix
7474  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7475  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7476  {
7478 
7479  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7480  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7481 
7482  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7483  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7484 
7485  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
7486  return;
7487  }
7488  else if( left.columns() == 0UL ) {
7489  reset( ~lhs );
7490  return;
7491  }
7492 
7493  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7494  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7495 
7496  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7497  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7498  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7499  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7500  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7501  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7502 
7503  smpAssign( ~lhs, A * B * rhs.scalar_ );
7504  }
7505  //**********************************************************************************************
7506 
7507  //**SMP assignment to sparse matrices***********************************************************
7522  template< typename MT // Type of the target sparse matrix
7523  , bool SO > // Storage order of the target sparse matrix
7524  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7525  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7526  {
7528 
7529  typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
7530 
7536  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<TmpType> );
7537 
7538  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7539  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7540 
7541  const TmpType tmp( rhs );
7542  smpAssign( ~lhs, tmp );
7543  }
7544  //**********************************************************************************************
7545 
7546  //**Restructuring SMP assignment to row-major matrices******************************************
7560  template< typename MT > // Type of the target matrix
7561  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7562  smpAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7563  {
7565 
7567 
7568  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7569  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7570 
7571  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7572  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7573 
7574  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7575  smpAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7576  else if( IsSymmetric<MT1>::value )
7577  smpAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7578  else
7579  smpAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7580  }
7581  //**********************************************************************************************
7582 
7583  //**SMP addition assignment to dense matrices***************************************************
7598  template< typename MT // Type of the target dense matrix
7599  , bool SO > // Storage order of the target dense matrix
7600  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7601  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7602  {
7604 
7605  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7606  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7607 
7608  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7609  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7610 
7611  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7612  return;
7613  }
7614 
7615  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7616  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7617 
7618  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7619  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7620  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7621  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7622  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7623  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7624 
7625  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
7626  }
7627  //**********************************************************************************************
7628 
7629  //**Restructuring SMP addition assignment to row-major matrices*********************************
7644  template< typename MT > // Type of the target matrix
7645  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7646  smpAddAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7647  {
7649 
7651 
7652  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7653  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7654 
7655  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7656  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7657 
7658  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7659  smpAddAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7660  else if( IsSymmetric<MT1>::value )
7661  smpAddAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7662  else
7663  smpAddAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7664  }
7665  //**********************************************************************************************
7666 
7667  //**SMP addition assignment to sparse matrices**************************************************
7668  // No special implementation for the SMP addition assignment to sparse matrices.
7669  //**********************************************************************************************
7670 
7671  //**SMP subtraction assignment to dense matrices************************************************
7686  template< typename MT // Type of the target dense matrix
7687  , bool SO > // Storage order of the target dense matrix
7688  friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7689  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7690  {
7692 
7693  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7694  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7695 
7696  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7697  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7698 
7699  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7700  return;
7701  }
7702 
7703  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7704  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7705 
7706  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7707  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7708  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7709  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7710  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7711  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7712 
7713  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
7714  }
7715  //**********************************************************************************************
7716 
7717  //**Restructuring SMP subtraction assignment to row-major matrices******************************
7732  template< typename MT > // Type of the target matrix
7733  friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7734  smpSubAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7735  {
7737 
7739 
7740  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7741  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7742 
7743  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7744  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7745 
7746  if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7747  smpSubAssign( ~lhs, trans( left ) * trans( right ) * rhs.scalar_ );
7748  else if( IsSymmetric<MT1>::value )
7749  smpSubAssign( ~lhs, trans( left ) * right * rhs.scalar_ );
7750  else
7751  smpSubAssign( ~lhs, left * trans( right ) * rhs.scalar_ );
7752  }
7753  //**********************************************************************************************
7754 
7755  //**SMP subtraction assignment to sparse matrices***********************************************
7756  // No special implementation for the SMP subtraction assignment to sparse matrices.
7757  //**********************************************************************************************
7758 
7759  //**SMP multiplication assignment to dense matrices*********************************************
7760  // No special implementation for the SMP multiplication assignment to dense matrices.
7761  //**********************************************************************************************
7762 
7763  //**SMP multiplication assignment to sparse matrices********************************************
7764  // No special implementation for the SMP multiplication assignment to sparse matrices.
7765  //**********************************************************************************************
7766 
7767  //**Compile time checks*************************************************************************
7775  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
7776  //**********************************************************************************************
7777 };
7779 //*************************************************************************************************
7780 
7781 
7782 
7783 
7784 //=================================================================================================
7785 //
7786 // GLOBAL BINARY ARITHMETIC OPERATORS
7787 //
7788 //=================================================================================================
7789 
7790 //*************************************************************************************************
7816 template< typename T1 // Type of the left-hand side dense matrix
7817  , typename T2 > // Type of the right-hand side dense matrix
7818 inline const TDMatTDMatMultExpr<T1,T2>
7820 {
7822 
7823  if( (~lhs).columns() != (~rhs).rows() ) {
7824  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
7825  }
7826 
7827  return TDMatTDMatMultExpr<T1,T2>( ~lhs, ~rhs );
7828 }
7829 //*************************************************************************************************
7830 
7831 
7832 
7833 
7834 //=================================================================================================
7835 //
7836 // ROWS SPECIALIZATIONS
7837 //
7838 //=================================================================================================
7839 
7840 //*************************************************************************************************
7842 template< typename MT1, typename MT2 >
7843 struct Rows< TDMatTDMatMultExpr<MT1,MT2> > : public Rows<MT1>
7844 {};
7846 //*************************************************************************************************
7847 
7848 
7849 
7850 
7851 //=================================================================================================
7852 //
7853 // COLUMNS SPECIALIZATIONS
7854 //
7855 //=================================================================================================
7856 
7857 //*************************************************************************************************
7859 template< typename MT1, typename MT2 >
7860 struct Columns< TDMatTDMatMultExpr<MT1,MT2> > : public Columns<MT2>
7861 {};
7863 //*************************************************************************************************
7864 
7865 
7866 
7867 
7868 //=================================================================================================
7869 //
7870 // ISALIGNED SPECIALIZATIONS
7871 //
7872 //=================================================================================================
7873 
7874 //*************************************************************************************************
7876 template< typename MT1, typename MT2 >
7877 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2> >
7878  : public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7879 {};
7881 //*************************************************************************************************
7882 
7883 
7884 
7885 
7886 //=================================================================================================
7887 //
7888 // ISLOWER SPECIALIZATIONS
7889 //
7890 //=================================================================================================
7891 
7892 //*************************************************************************************************
7894 template< typename MT1, typename MT2 >
7895 struct IsLower< TDMatTDMatMultExpr<MT1,MT2> >
7896  : public BoolConstant< And< IsLower<MT1>, IsLower<MT2> >::value >
7897 {};
7899 //*************************************************************************************************
7900 
7901 
7902 
7903 
7904 //=================================================================================================
7905 //
7906 // ISUNILOWER SPECIALIZATIONS
7907 //
7908 //=================================================================================================
7909 
7910 //*************************************************************************************************
7912 template< typename MT1, typename MT2 >
7913 struct IsUniLower< TDMatTDMatMultExpr<MT1,MT2> >
7914  : public BoolConstant< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
7915 {};
7917 //*************************************************************************************************
7918 
7919 
7920 
7921 
7922 //=================================================================================================
7923 //
7924 // ISSTRICTLYLOWER SPECIALIZATIONS
7925 //
7926 //=================================================================================================
7927 
7928 //*************************************************************************************************
7930 template< typename MT1, typename MT2 >
7931 struct IsStrictlyLower< TDMatTDMatMultExpr<MT1,MT2> >
7932  : public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7933  , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
7934 {};
7936 //*************************************************************************************************
7937 
7938 
7939 
7940 
7941 //=================================================================================================
7942 //
7943 // ISUPPER SPECIALIZATIONS
7944 //
7945 //=================================================================================================
7946 
7947 //*************************************************************************************************
7949 template< typename MT1, typename MT2 >
7950 struct IsUpper< TDMatTDMatMultExpr<MT1,MT2> >
7951  : public BoolConstant< And< IsUpper<MT1>, IsUpper<MT2> >::value >
7952 {};
7954 //*************************************************************************************************
7955 
7956 
7957 
7958 
7959 //=================================================================================================
7960 //
7961 // ISUNIUPPER SPECIALIZATIONS
7962 //
7963 //=================================================================================================
7964 
7965 //*************************************************************************************************
7967 template< typename MT1, typename MT2 >
7968 struct IsUniUpper< TDMatTDMatMultExpr<MT1,MT2> >
7969  : public BoolConstant< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
7970 {};
7972 //*************************************************************************************************
7973 
7974 
7975 
7976 
7977 //=================================================================================================
7978 //
7979 // ISSTRICTLYUPPER SPECIALIZATIONS
7980 //
7981 //=================================================================================================
7982 
7983 //*************************************************************************************************
7985 template< typename MT1, typename MT2 >
7986 struct IsStrictlyUpper< TDMatTDMatMultExpr<MT1,MT2> >
7987  : public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7988  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
7989 {};
7991 //*************************************************************************************************
7992 
7993 
7994 
7995 
7996 //=================================================================================================
7997 //
7998 // EXPRESSION TRAIT SPECIALIZATIONS
7999 //
8000 //=================================================================================================
8001 
8002 //*************************************************************************************************
8004 template< typename MT1, typename MT2, typename VT >
8005 struct TDMatDVecMultExprTrait< TDMatTDMatMultExpr<MT1,MT2>, VT >
8006 {
8007  public:
8008  //**********************************************************************************************
8009  using Type = If_< And< IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
8010  , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2>
8011  , IsDenseVector<VT>, IsColumnVector<VT> >
8012  , TDMatDVecMultExprTrait_< MT1, TDMatDVecMultExprTrait_<MT2,VT> >
8013  , INVALID_TYPE >;
8014  //**********************************************************************************************
8015 };
8017 //*************************************************************************************************
8018 
8019 
8020 //*************************************************************************************************
8022 template< typename MT1, typename MT2, typename VT >
8023 struct TDMatSVecMultExprTrait< TDMatTDMatMultExpr<MT1,MT2>, VT >
8024 {
8025  public:
8026  //**********************************************************************************************
8027  using Type = If_< And< IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
8028  , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2>
8029  , IsSparseVector<VT>, IsColumnVector<VT> >
8030  , TDMatDVecMultExprTrait_< MT1, TDMatSVecMultExprTrait_<MT2,VT> >
8031  , INVALID_TYPE >;
8032  //**********************************************************************************************
8033 };
8035 //*************************************************************************************************
8036 
8037 
8038 //*************************************************************************************************
8040 template< typename VT, typename MT1, typename MT2 >
8041 struct TDVecTDMatMultExprTrait< VT, TDMatTDMatMultExpr<MT1,MT2> >
8042 {
8043  public:
8044  //**********************************************************************************************
8045  using Type = If_< And< IsDenseVector<VT>, IsRowVector<VT>
8046  , IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
8047  , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2> >
8048  , TDVecTDMatMultExprTrait_< TDVecTDMatMultExprTrait_<VT,MT1>, MT2 >
8049  , INVALID_TYPE >;
8050  //**********************************************************************************************
8051 };
8053 //*************************************************************************************************
8054 
8055 
8056 //*************************************************************************************************
8058 template< typename VT, typename MT1, typename MT2 >
8059 struct TSVecTDMatMultExprTrait< VT, TDMatTDMatMultExpr<MT1,MT2> >
8060 {
8061  public:
8062  //**********************************************************************************************
8063  using Type = If_< And< IsSparseVector<VT>, IsRowVector<VT>
8064  , IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
8065  , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2> >
8066  , TDVecTDMatMultExprTrait_< TSVecTDMatMultExprTrait_<VT,MT1>, MT2 >
8067  , INVALID_TYPE >;
8068  //**********************************************************************************************
8069 };
8071 //*************************************************************************************************
8072 
8073 
8074 //*************************************************************************************************
8076 template< typename MT1, typename MT2, bool AF >
8077 struct SubmatrixExprTrait< TDMatTDMatMultExpr<MT1,MT2>, AF >
8078 {
8079  public:
8080  //**********************************************************************************************
8081  using Type = MultExprTrait_< SubmatrixExprTrait_<const MT1,AF>
8082  , SubmatrixExprTrait_<const MT2,AF> >;
8083  //**********************************************************************************************
8084 };
8086 //*************************************************************************************************
8087 
8088 
8089 //*************************************************************************************************
8091 template< typename MT1, typename MT2 >
8092 struct RowExprTrait< TDMatTDMatMultExpr<MT1,MT2> >
8093 {
8094  public:
8095  //**********************************************************************************************
8096  using Type = MultExprTrait_< RowExprTrait_<const MT1>, MT2 >;
8097  //**********************************************************************************************
8098 };
8100 //*************************************************************************************************
8101 
8102 
8103 //*************************************************************************************************
8105 template< typename MT1, typename MT2 >
8106 struct ColumnExprTrait< TDMatTDMatMultExpr<MT1,MT2> >
8107 {
8108  public:
8109  //**********************************************************************************************
8110  using Type = MultExprTrait_< MT1, ColumnExprTrait_<const MT2> >;
8111  //**********************************************************************************************
8112 };
8114 //*************************************************************************************************
8115 
8116 } // namespace blaze
8117 
8118 #endif
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:153
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:304
Header file for auxiliary alias declarations.
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:245
Header file for the Rows type trait.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:251
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:7800
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:398
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:432
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:368
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:352
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:249
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:410
Header file for the IsRowVector type trait.
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
Header file for the And class template.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:451
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Expression object for transpose dense matrix-transpose dense matrix multiplications. The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:134
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:378
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
TDMatTDMatMultExpr< MT1, MT2 > This
Type of this TDMatTDMatMultExpr instance.
Definition: TDMatTDMatMultExpr.h:244
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:263
Header file for the IsComplexDouble type trait.
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Header file for the DisableIf class template.
If_< IsExpression< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:257
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
Header file for the TSVecTDMatMultExprTrait class template.
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:250
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:254
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:155
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
System settings for the BLAS mode.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:246
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Utility type for generic codes.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:388
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:442
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:154
Constraint on the data type.
If_< IsExpression< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:254
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:247
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:248
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:452
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:289
Header file for the AreSIMDCombinable type trait.
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:950
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:422
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:260
Header file for the IsColumnVector type trait.
Constraint on the data type.
Header file for the IsResizable type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.