TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <stdexcept>
44 #include <blaze/math/blas/Level3.h>
53 #include <blaze/math/Functions.h>
54 #include <blaze/math/Intrinsics.h>
55 #include <blaze/math/shims/Reset.h>
92 #include <blaze/system/BLAS.h>
94 #include <blaze/util/Assert.h>
95 #include <blaze/util/Complex.h>
99 #include <blaze/util/DisableIf.h>
100 #include <blaze/util/EnableIf.h>
101 #include <blaze/util/InvalidType.h>
103 #include <blaze/util/mpl/And.h>
104 #include <blaze/util/mpl/Not.h>
105 #include <blaze/util/mpl/Or.h>
106 #include <blaze/util/SelectType.h>
107 #include <blaze/util/Types.h>
114 
115 
116 namespace blaze {
117 
118 //=================================================================================================
119 //
120 // CLASS TDMATDMATMULTEXPR
121 //
122 //=================================================================================================
123 
124 //*************************************************************************************************
131 template< typename MT1 // Type of the left-hand side dense matrix
132  , typename MT2 > // Type of the right-hand side dense matrix
133 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
134  , private MatMatMultExpr
135  , private Computation
136 {
137  private:
138  //**Type definitions****************************************************************************
139  typedef typename MT1::ResultType RT1;  //!< Result type of the left-hand side dense matrix expression.
140  typedef typename MT2::ResultType RT2;  //!< Result type of the right-hand side dense matrix expression.
141  typedef typename RT1::ElementType ET1;  //!< Element type of the left-hand side result type.
142  typedef typename RT2::ElementType ET2;  //!< Element type of the right-hand side result type.
143  typedef typename MT1::CompositeType CT1;  //!< Composite type of the left-hand side dense matrix expression.
144  typedef typename MT2::CompositeType CT2;  //!< Composite type of the right-hand side dense matrix expression.
145  //**********************************************************************************************
146 
147  //**********************************************************************************************
150  //**********************************************************************************************
151 
152  //**********************************************************************************************
154  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
155  //**********************************************************************************************
156 
157  //**********************************************************************************************
159 
163  template< typename T1, typename T2, typename T3 >
164  struct IsEvaluationRequired {
165  enum { value = ( evaluateLeft || evaluateRight ) };
166  };
168  //**********************************************************************************************
169 
170  //**********************************************************************************************
172 
/*! Helper structure for the selection of the single precision kernel. \a value is non-zero
//  only if BLAZE_BLAS_MODE is enabled, \a T1 offers mutable and \a T2 / \a T3 constant data
//  access, neither \a T2 nor \a T3 is diagonal, all three types are vectorizable, and all
//  three element types satisfy IsFloat.
*/
template< typename T1, typename T2, typename T3 >
struct UseSinglePrecisionKernel
{
 private:
   //! Direct data access is available for the target and both operands.
   enum { rawAccess = HasMutableDataAccess<T1>::value &&
                      HasConstDataAccess<T2>::value &&
                      HasConstDataAccess<T3>::value };

   //! Neither operand is a diagonal matrix.
   enum { nonDiagonal = !IsDiagonal<T2>::value && !IsDiagonal<T3>::value };

   //! All three matrix types are vectorizable.
   enum { simdCapable = T1::vectorizable && T2::vectorizable && T3::vectorizable };

   //! All three element types are single precision floating point.
   enum { floatOnly = IsFloat<typename T1::ElementType>::value &&
                      IsFloat<typename T2::ElementType>::value &&
                      IsFloat<typename T3::ElementType>::value };

 public:
   enum { value = BLAZE_BLAS_MODE && rawAccess && nonDiagonal && simdCapable && floatOnly };
};
188  //**********************************************************************************************
189 
190  //**********************************************************************************************
192 
195  template< typename T1, typename T2, typename T3 >
196  struct UseDoublePrecisionKernel {
197  enum { value = BLAZE_BLAS_MODE &&
198  HasMutableDataAccess<T1>::value &&
199  HasConstDataAccess<T2>::value &&
200  HasConstDataAccess<T3>::value &&
201  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
202  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
203  IsDouble<typename T1::ElementType>::value &&
204  IsDouble<typename T2::ElementType>::value &&
205  IsDouble<typename T3::ElementType>::value };
206  };
208  //**********************************************************************************************
209 
210  //**********************************************************************************************
212 
216  template< typename T1, typename T2, typename T3 >
217  struct UseSinglePrecisionComplexKernel {
218  typedef complex<float> Type;
219  enum { value = BLAZE_BLAS_MODE &&
220  HasMutableDataAccess<T1>::value &&
221  HasConstDataAccess<T2>::value &&
222  HasConstDataAccess<T3>::value &&
223  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
224  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
225  IsSame<typename T1::ElementType,Type>::value &&
226  IsSame<typename T2::ElementType,Type>::value &&
227  IsSame<typename T3::ElementType,Type>::value };
228  };
230  //**********************************************************************************************
231 
232  //**********************************************************************************************
234 
238  template< typename T1, typename T2, typename T3 >
239  struct UseDoublePrecisionComplexKernel {
240  typedef complex<double> Type;
241  enum { value = BLAZE_BLAS_MODE &&
242  HasMutableDataAccess<T1>::value &&
243  HasConstDataAccess<T2>::value &&
244  HasConstDataAccess<T3>::value &&
245  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
246  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
247  IsSame<typename T1::ElementType,Type>::value &&
248  IsSame<typename T2::ElementType,Type>::value &&
249  IsSame<typename T3::ElementType,Type>::value };
250  };
252  //**********************************************************************************************
253 
254  //**********************************************************************************************
256 
259  template< typename T1, typename T2, typename T3 >
260  struct UseDefaultKernel {
261  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
262  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
263  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
264  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
265  };
267  //**********************************************************************************************
268 
269  //**********************************************************************************************
271 
274  template< typename T1, typename T2, typename T3 >
275  struct UseVectorizedDefaultKernel {
276  enum { value = !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
277  !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
278  !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
279  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
280  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
281  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
282  IntrinsicTrait<typename T1::ElementType>::addition &&
283  IntrinsicTrait<typename T1::ElementType>::subtraction &&
284  IntrinsicTrait<typename T1::ElementType>::multiplication };
285  };
287  //**********************************************************************************************
288 
289  public:
290  //**Type definitions****************************************************************************
// NOTE(review): ElementType and ResultType are used below, but their declarations are not part
// of this listing - confirm against the original header.
297  typedef const ElementType ReturnType;  //!< Return type of the element access operator: a constant ElementType.
298  typedef const ResultType CompositeType;  //!< Composite type of this expression: a constant ResultType.
299 
301  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
302 
304  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
305 
308 
311  //**********************************************************************************************
312 
313  //**Compilation flags***************************************************************************
315  enum { vectorizable = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
316  MT1::vectorizable && MT2::vectorizable &&
320 
322  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
323  !evaluateRight && MT2::smpAssignable };
324  //**********************************************************************************************
325 
326  //**Constructor*********************************************************************************
332  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs )
333  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
334  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
335  {
336  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
337  }
338  //**********************************************************************************************
339 
340  //**Access operator*****************************************************************************
347  inline ReturnType operator()( size_t i, size_t j ) const {
348  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
349  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
350 
351  const size_t kbegin( ( IsUpper<MT1>::value )
352  ?( ( IsLower<MT2>::value )
353  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
354  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
355  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
356  :( ( IsLower<MT2>::value )
357  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
358  :( 0UL ) ) );
359  const size_t kend( ( IsLower<MT1>::value )
360  ?( ( IsUpper<MT2>::value )
361  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
362  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
363  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
364  :( ( IsUpper<MT2>::value )
365  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
366  :( lhs_.columns() ) ) );
367 
368  if( lhs_.columns() == 0UL ||
369  ( ( IsTriangular<MT1>::value || IsTriangular<MT2>::value ) && kbegin >= kend ) )
370  return ElementType();
371 
373  return lhs_(i,i) * rhs_(i,j);
374 
376  return lhs_(i,j) * rhs_(j,j);
377 
378  const size_t knum( kend - kbegin );
379  const size_t kpos( kbegin + ( ( knum - 1UL ) & size_t(-2) ) + 1UL );
380 
381  ElementType tmp( lhs_(i,kbegin) * rhs_(kbegin,j) );
382 
383  for( size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
384  tmp += lhs_(i,k ) * rhs_(k ,j);
385  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
386  }
387  if( kpos < kend ) {
388  tmp += lhs_(i,kpos) * rhs_(kpos,j);
389  }
390 
391  return tmp;
392  }
393  //**********************************************************************************************
394 
395  //**Rows function*******************************************************************************
400  inline size_t rows() const {
401  return lhs_.rows();
402  }
403  //**********************************************************************************************
404 
405  //**Columns function****************************************************************************
410  inline size_t columns() const {
411  return rhs_.columns();
412  }
413  //**********************************************************************************************
414 
415  //**Left operand access*************************************************************************
420  inline LeftOperand leftOperand() const {
421  return lhs_;
422  }
423  //**********************************************************************************************
424 
425  //**Right operand access************************************************************************
430  inline RightOperand rightOperand() const {
431  return rhs_;
432  }
433  //**********************************************************************************************
434 
435  //**********************************************************************************************
441  template< typename T >
442  inline bool canAlias( const T* alias ) const {
443  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
444  }
445  //**********************************************************************************************
446 
447  //**********************************************************************************************
453  template< typename T >
454  inline bool isAliased( const T* alias ) const {
455  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
456  }
457  //**********************************************************************************************
458 
459  //**********************************************************************************************
464  inline bool isAligned() const {
465  return lhs_.isAligned() && rhs_.isAligned();
466  }
467  //**********************************************************************************************
468 
469  //**********************************************************************************************
474  inline bool canSMPAssign() const {
475  return ( !BLAZE_BLAS_IS_PARALLEL ||
476  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
478  }
479  //**********************************************************************************************
480 
481  private:
482  //**Member variables****************************************************************************
483  LeftOperand lhs_;  //!< Left-hand side operand of the multiplication expression.
484  RightOperand rhs_;  //!< Right-hand side operand of the multiplication expression.
485  //**********************************************************************************************
486 
487  //**Assignment to dense matrices****************************************************************
500  template< typename MT // Type of the target dense matrix
501  , bool SO > // Storage order of the target dense matrix
502  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
503  {
505 
506  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
507  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
508 
509  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
510  return;
511  }
512  else if( rhs.lhs_.columns() == 0UL ) {
513  reset( ~lhs );
514  return;
515  }
516 
517  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
518  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
519 
520  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
521  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
522  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
523  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
524  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
525  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
526 
527  TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
528  }
530  //**********************************************************************************************
531 
532  //**Assignment to dense matrices (kernel selection)*********************************************
543  template< typename MT3 // Type of the left-hand side target matrix
544  , typename MT4 // Type of the left-hand side matrix operand
545  , typename MT5 > // Type of the right-hand side matrix operand
546  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
547  {
549  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
550  selectSmallAssignKernel( C, A, B );
551  else
552  selectBlasAssignKernel( C, A, B );
553  }
555  //**********************************************************************************************
556 
557  //**Default assignment to row-major dense matrices (general/general)****************************
//! Default assignment kernel (\f$ C=A*B \f$) for a row-major target, enabled only if neither
//! operand is diagonal. The product is built row by row: for every row \a i the range
//! [kbegin,kend) of admissible summation indices is derived from the triangular structure of
//! \a A, the first term initializes the row with '=', and all further terms are accumulated.
//!
//! \param C The target left-hand side dense matrix.
//! \param A The left-hand side multiplication operand.
//! \param B The right-hand side multiplication operand.
571  template< typename MT3 // Type of the left-hand side target matrix
572  , typename MT4 // Type of the left-hand side matrix operand
573  , typename MT5 > // Type of the right-hand side matrix operand
574  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
575  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
576  {
577  const size_t M( A.rows() );
578  const size_t N( B.columns() );
579  const size_t K( A.columns() );
580 
581  for( size_t i=0UL; i<M; ++i )
582  {
     // Summation range of row i: an upper A starts at the diagonal, a lower A ends at it
     // (shifted by one for the strict variants); a general A uses the full range [0,K)
583  const size_t kbegin( ( IsUpper<MT4>::value )
584  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
585  :( 0UL ) );
586  const size_t kend( ( IsLower<MT4>::value )
587  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
588  :( K ) );
589  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
590 
     // A strictly triangular A with an empty summation range: the whole target row is reset
591  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
592  for( size_t j=0UL; j<N; ++j ) {
593  reset( (~C)(i,j) );
594  }
595  continue;
596  }
597 
     // First term (k == kbegin): written with '=' so that the row is initialized here
598  {
     // Column range of row kbegin of B as restricted by the triangular structure of B
599  const size_t jbegin( ( IsUpper<MT5>::value )
600  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
601  :( 0UL ) );
602  const size_t jend( ( IsLower<MT5>::value )
603  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
604  :( N ) );
605  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
606 
     // Both operands upper: the columns left of jbegin are reset. Otherwise, for a strictly
     // upper B, only the first column is reset.
607  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
608  for( size_t j=0UL; j<jbegin; ++j ) {
609  reset( (~C)(i,j) );
610  }
611  }
612  else if( IsStrictlyUpper<MT5>::value ) {
613  reset( (~C)(i,0UL) );
614  }
615  for( size_t j=jbegin; j<jend; ++j ) {
616  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
617  }
     // Both operands lower: the columns from jend on are reset. Otherwise, for a strictly
     // lower B, only the last column is reset.
618  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
619  for( size_t j=jend; j<N; ++j ) {
620  reset( (~C)(i,j) );
621  }
622  }
623  else if( IsStrictlyLower<MT5>::value ) {
624  reset( (~C)(i,N-1UL) );
625  }
626  }
627 
     // Remaining terms of the summation
628  for( size_t k=kbegin+1UL; k<kend; ++k )
629  {
     // For a lower B, jend excludes the last admissible column of row k of B; that column is
     // handled separately after the accumulation loop
630  const size_t jbegin( ( IsUpper<MT5>::value )
631  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
632  :( 0UL ) );
633  const size_t jend( ( IsLower<MT5>::value )
634  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
635  :( N ) );
636  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
637 
638  for( size_t j=jbegin; j<jend; ++j ) {
639  (~C)(i,j) += A(i,k) * B(k,j);
640  }
     // Column jend is assigned with '=' instead of '+=', so any earlier value is overwritten
641  if( IsLower<MT5>::value ) {
642  (~C)(i,jend) = A(i,k) * B(k,jend);
643  }
644  }
645  }
646  }
648  //**********************************************************************************************
649 
650  //**Default assignment to column-major dense matrices (general/general)*************************
//! Default assignment kernel (\f$ C=A*B \f$) for a column-major target, enabled only if
//! neither operand is diagonal. The product is built column by column: for every column \a j
//! the range [kbegin,kend) of admissible summation indices is derived from the triangular
//! structure of \a B, the first term initializes the column with '=', and all further terms
//! are accumulated.
//!
//! \param C The target left-hand side dense matrix.
//! \param A The left-hand side multiplication operand.
//! \param B The right-hand side multiplication operand.
664  template< typename MT3 // Type of the left-hand side target matrix
665  , typename MT4 // Type of the left-hand side matrix operand
666  , typename MT5 > // Type of the right-hand side matrix operand
667  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
668  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
669  {
670  const size_t M( A.rows() );
671  const size_t N( B.columns() );
672  const size_t K( A.columns() );
673 
674  for( size_t j=0UL; j<N; ++j )
675  {
     // Summation range of column j: a lower B starts at the diagonal, an upper B ends at it
     // (shifted by one for the strict variants); a general B uses the full range [0,K)
676  const size_t kbegin( ( IsLower<MT5>::value )
677  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
678  :( 0UL ) );
679  const size_t kend( ( IsUpper<MT5>::value )
680  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
681  :( K ) );
682  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
683 
     // A strictly triangular B with an empty summation range: the whole target column is reset
684  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
685  for( size_t i=0UL; i<M; ++i ) {
686  reset( (~C)(i,j) );
687  }
688  continue;
689  }
690 
     // First term (k == kbegin): written with '=' so that the column is initialized here
691  {
     // Row range of column kbegin of A as restricted by the triangular structure of A
692  const size_t ibegin( ( IsLower<MT4>::value )
693  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
694  :( 0UL ) );
695  const size_t iend( ( IsUpper<MT4>::value )
696  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
697  :( M ) );
698  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
699 
     // Both operands lower: the rows above ibegin are reset. Otherwise, for a strictly lower A,
     // only the first row is reset.
700  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
701  for( size_t i=0UL; i<ibegin; ++i ) {
702  reset( (~C)(i,j) );
703  }
704  }
705  else if( IsStrictlyLower<MT4>::value ) {
706  reset( (~C)(0UL,j) );
707  }
708  for( size_t i=ibegin; i<iend; ++i ) {
709  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
710  }
     // Both operands upper: the rows from iend on are reset. Otherwise, for a strictly upper A,
     // only the last row is reset.
711  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
712  for( size_t i=iend; i<M; ++i ) {
713  reset( (~C)(i,j) );
714  }
715  }
716  else if( IsStrictlyUpper<MT4>::value ) {
717  reset( (~C)(M-1UL,j) );
718  }
719  }
720 
     // Remaining terms of the summation
721  for( size_t k=kbegin+1UL; k<kend; ++k )
722  {
     // For an upper A, iend excludes the last admissible row of column k of A; that row is
     // handled separately after the accumulation loop
723  const size_t ibegin( ( IsLower<MT4>::value )
724  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
725  :( 0UL ) );
726  const size_t iend( ( IsUpper<MT4>::value )
727  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
728  :( M ) );
729  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
730 
731  for( size_t i=ibegin; i<iend; ++i ) {
732  (~C)(i,j) += A(i,k) * B(k,j);
733  }
     // Row iend is assigned with '=' instead of '+=', so any earlier value is overwritten
734  if( IsUpper<MT4>::value ) {
735  (~C)(iend,j) = A(iend,k) * B(k,j);
736  }
737  }
738  }
739  }
741  //**********************************************************************************************
742 
743  //**Default assignment to row-major dense matrices (general/diagonal)***************************
757  template< typename MT3 // Type of the left-hand side target matrix
758  , typename MT4 // Type of the left-hand side matrix operand
759  , typename MT5 > // Type of the right-hand side matrix operand
760  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
761  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
762  {
763  const size_t M( A.rows() );
764  const size_t N( B.columns() );
765 
766  const size_t block( 16UL );
767 
768  for( size_t ii=0UL; ii<M; ii+=block ) {
769  const size_t iend( min( M, ii+block ) );
770  for( size_t jj=0UL; jj<N; jj+=block ) {
771  const size_t jend( min( N, jj+block ) );
772  for( size_t i=ii; i<iend; ++i )
773  {
774  const size_t jbegin( ( IsUpper<MT4>::value )
775  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
776  :( jj ) );
777  const size_t jpos( ( IsLower<MT4>::value )
778  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
779  :( jend ) );
780 
781  if( IsUpper<MT4>::value ) {
782  for( size_t j=jj; j<jbegin; ++j ) {
783  reset( (~C)(i,j) );
784  }
785  }
786  for( size_t j=jbegin; j<jpos; ++j ) {
787  (~C)(i,j) = A(i,j) * B(j,j);
788  }
789  if( IsLower<MT4>::value ) {
790  for( size_t j=jpos; j<jend; ++j ) {
791  reset( (~C)(i,j) );
792  }
793  }
794  }
795  }
796  }
797  }
799  //**********************************************************************************************
800 
801  //**Default assignment to column-major dense matrices (general/diagonal)************************
815  template< typename MT3 // Type of the left-hand side target matrix
816  , typename MT4 // Type of the left-hand side matrix operand
817  , typename MT5 > // Type of the right-hand side matrix operand
818  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
819  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
820  {
821  const size_t M( A.rows() );
822  const size_t N( B.columns() );
823 
824  for( size_t j=0UL; j<N; ++j )
825  {
826  const size_t ibegin( ( IsLower<MT4>::value )
827  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
828  :( 0UL ) );
829  const size_t iend( ( IsUpper<MT4>::value )
830  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
831  :( M ) );
832  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
833 
834  if( IsLower<MT4>::value ) {
835  for( size_t i=0UL; i<ibegin; ++i ) {
836  reset( (~C)(i,j) );
837  }
838  }
839  for( size_t i=ibegin; i<iend; ++i ) {
840  (~C)(i,j) = A(i,j) * B(j,j);
841  }
842  if( IsUpper<MT4>::value ) {
843  for( size_t i=iend; i<M; ++i ) {
844  reset( (~C)(i,j) );
845  }
846  }
847  }
848  }
850  //**********************************************************************************************
851 
852  //**Default assignment to row-major dense matrices (diagonal/general)***************************
866  template< typename MT3 // Type of the left-hand side target matrix
867  , typename MT4 // Type of the left-hand side matrix operand
868  , typename MT5 > // Type of the right-hand side matrix operand
869  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
870  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
871  {
872  const size_t M( A.rows() );
873  const size_t N( B.columns() );
874 
875  for( size_t i=0UL; i<M; ++i )
876  {
877  const size_t jbegin( ( IsUpper<MT5>::value )
878  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
879  :( 0UL ) );
880  const size_t jend( ( IsLower<MT5>::value )
881  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
882  :( N ) );
883  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
884 
885  if( IsUpper<MT5>::value ) {
886  for( size_t j=0UL; j<jbegin; ++j ) {
887  reset( (~C)(i,j) );
888  }
889  }
890  for( size_t j=jbegin; j<jend; ++j ) {
891  (~C)(i,j) = A(i,i) * B(i,j);
892  }
893  if( IsLower<MT5>::value ) {
894  for( size_t j=jend; j<N; ++j ) {
895  reset( (~C)(i,j) );
896  }
897  }
898  }
899  }
901  //**********************************************************************************************
902 
903  //**Default assignment to column-major dense matrices (diagonal/general)************************
917  template< typename MT3 // Type of the left-hand side target matrix
918  , typename MT4 // Type of the left-hand side matrix operand
919  , typename MT5 > // Type of the right-hand side matrix operand
920  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
921  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
922  {
923  const size_t M( A.rows() );
924  const size_t N( B.columns() );
925 
926  const size_t block( 16UL );
927 
928  for( size_t jj=0UL; jj<N; jj+=block ) {
929  const size_t jend( min( N, jj+block ) );
930  for( size_t ii=0UL; ii<M; ii+=block ) {
931  const size_t iend( min( M, ii+block ) );
932  for( size_t j=jj; j<jend; ++j )
933  {
934  const size_t ibegin( ( IsLower<MT5>::value )
935  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
936  :( ii ) );
937  const size_t ipos( ( IsUpper<MT5>::value )
938  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
939  :( iend ) );
940 
941  if( IsLower<MT5>::value ) {
942  for( size_t i=ii; i<ibegin; ++i ) {
943  reset( (~C)(i,j) );
944  }
945  }
946  for( size_t i=ibegin; i<ipos; ++i ) {
947  (~C)(i,j) = A(i,i) * B(i,j);
948  }
949  if( IsUpper<MT5>::value ) {
950  for( size_t i=ipos; i<iend; ++i ) {
951  reset( (~C)(i,j) );
952  }
953  }
954  }
955  }
956  }
957  }
959  //**********************************************************************************************
960 
961  //**Default assignment to dense matrices (diagonal/diagonal)************************************
975  template< typename MT3 // Type of the left-hand side target matrix
976  , typename MT4 // Type of the left-hand side matrix operand
977  , typename MT5 > // Type of the right-hand side matrix operand
978  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
979  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
980  {
981  reset( C );
982 
983  for( size_t i=0UL; i<A.rows(); ++i ) {
984  C(i,i) = A(i,i) * B(i,i);
985  }
986  }
988  //**********************************************************************************************
989 
990  //**Default assignment to dense matrices (small matrices)***************************************
1004  template< typename MT3 // Type of the left-hand side target matrix
1005  , typename MT4 // Type of the left-hand side matrix operand
1006  , typename MT5 > // Type of the right-hand side matrix operand
1007  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1008  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1009  {
1010  selectDefaultAssignKernel( ~C, A, B );
1011  }
1013  //**********************************************************************************************
1014 
1015  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1030  template< typename MT3 // Type of the left-hand side target matrix
1031  , typename MT4 // Type of the left-hand side matrix operand
1032  , typename MT5 > // Type of the right-hand side matrix operand
1033  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1034  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1035  {
1036  typedef IntrinsicTrait<ElementType> IT;
1037 
1038  const size_t M( A.rows() );
1039  const size_t N( B.columns() );
1040  const size_t K( A.columns() );
1041 
1042  size_t j( 0UL );
1043 
1044  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1045  for( size_t i=0UL; i<M; ++i )
1046  {
1047  const size_t kbegin( ( IsUpper<MT4>::value )
1048  ?( ( IsLower<MT5>::value )
1049  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1050  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1051  :( IsLower<MT5>::value ? j : 0UL ) );
1052  const size_t kend( ( IsLower<MT4>::value )
1053  ?( ( IsUpper<MT5>::value )
1054  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
1055  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1056  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
1057 
1058  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1059 
1060  for( size_t k=kbegin; k<kend; ++k ) {
1061  const IntrinsicType a1( set( A(i,k) ) );
1062  xmm1 = xmm1 + a1 * B.load(k,j );
1063  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1064  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1065  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1066  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1067  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1068  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1069  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1070  }
1071 
1072  (~C).store( i, j , xmm1 );
1073  (~C).store( i, j+IT::size , xmm2 );
1074  (~C).store( i, j+IT::size*2UL, xmm3 );
1075  (~C).store( i, j+IT::size*3UL, xmm4 );
1076  (~C).store( i, j+IT::size*4UL, xmm5 );
1077  (~C).store( i, j+IT::size*5UL, xmm6 );
1078  (~C).store( i, j+IT::size*6UL, xmm7 );
1079  (~C).store( i, j+IT::size*7UL, xmm8 );
1080  }
1081  }
1082 
1083  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL )
1084  {
1085  size_t i( 0UL );
1086 
1087  for( ; (i+2UL) <= M; i+=2UL )
1088  {
1089  const size_t kbegin( ( IsUpper<MT4>::value )
1090  ?( ( IsLower<MT5>::value )
1091  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1092  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1093  :( IsLower<MT5>::value ? j : 0UL ) );
1094  const size_t kend( ( IsLower<MT4>::value )
1095  ?( ( IsUpper<MT5>::value )
1096  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
1097  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1098  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
1099 
1100  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1101 
1102  for( size_t k=kbegin; k<kend; ++k ) {
1103  const IntrinsicType a1( set( A(i ,k) ) );
1104  const IntrinsicType a2( set( A(i+1UL,k) ) );
1105  const IntrinsicType b1( B.load(k,j ) );
1106  const IntrinsicType b2( B.load(k,j+IT::size ) );
1107  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
1108  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
1109  xmm1 = xmm1 + a1 * b1;
1110  xmm2 = xmm2 + a1 * b2;
1111  xmm3 = xmm3 + a1 * b3;
1112  xmm4 = xmm4 + a1 * b4;
1113  xmm5 = xmm5 + a2 * b1;
1114  xmm6 = xmm6 + a2 * b2;
1115  xmm7 = xmm7 + a2 * b3;
1116  xmm8 = xmm8 + a2 * b4;
1117  }
1118 
1119  (~C).store( i , j , xmm1 );
1120  (~C).store( i , j+IT::size , xmm2 );
1121  (~C).store( i , j+IT::size*2UL, xmm3 );
1122  (~C).store( i , j+IT::size*3UL, xmm4 );
1123  (~C).store( i+1UL, j , xmm5 );
1124  (~C).store( i+1UL, j+IT::size , xmm6 );
1125  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
1126  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
1127  }
1128 
1129  if( i < M )
1130  {
1131  const size_t kbegin( ( IsUpper<MT4>::value )
1132  ?( ( IsLower<MT5>::value )
1133  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1134  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1135  :( IsLower<MT5>::value ? j : 0UL ) );
1136  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
1137 
1138  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1139 
1140  for( size_t k=kbegin; k<kend; ++k ) {
1141  const IntrinsicType a1( set( A(i,k) ) );
1142  xmm1 = xmm1 + a1 * B.load(k,j );
1143  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1144  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1145  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1146  }
1147 
1148  (~C).store( i, j , xmm1 );
1149  (~C).store( i, j+IT::size , xmm2 );
1150  (~C).store( i, j+IT::size*2UL, xmm3 );
1151  (~C).store( i, j+IT::size*3UL, xmm4 );
1152  }
1153  }
1154 
1155  for( ; (j+IT::size) < N; j+=IT::size*2UL )
1156  {
1157  size_t i( 0UL );
1158 
1159  for( ; (i+2UL) <= M; i+=2UL )
1160  {
1161  const size_t kbegin( ( IsUpper<MT4>::value )
1162  ?( ( IsLower<MT5>::value )
1163  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1164  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1165  :( IsLower<MT5>::value ? j : 0UL ) );
1166  const size_t kend( ( IsLower<MT4>::value )
1167  ?( ( IsUpper<MT5>::value )
1168  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
1169  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1170  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
1171 
1172  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1173 
1174  for( size_t k=kbegin; k<kend; ++k ) {
1175  const IntrinsicType a1( set( A(i ,k) ) );
1176  const IntrinsicType a2( set( A(i+1UL,k) ) );
1177  const IntrinsicType b1( B.load(k,j ) );
1178  const IntrinsicType b2( B.load(k,j+IT::size) );
1179  xmm1 = xmm1 + a1 * b1;
1180  xmm2 = xmm2 + a1 * b2;
1181  xmm3 = xmm3 + a2 * b1;
1182  xmm4 = xmm4 + a2 * b2;
1183  }
1184 
1185  (~C).store( i , j , xmm1 );
1186  (~C).store( i , j+IT::size, xmm2 );
1187  (~C).store( i+1UL, j , xmm3 );
1188  (~C).store( i+1UL, j+IT::size, xmm4 );
1189  }
1190 
1191  if( i < M )
1192  {
1193  const size_t kbegin( ( IsUpper<MT4>::value )
1194  ?( ( IsLower<MT5>::value )
1195  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1196  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1197  :( IsLower<MT5>::value ? j : 0UL ) );
1198  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
1199 
1200  IntrinsicType xmm1, xmm2;
1201 
1202  for( size_t k=kbegin; k<kend; ++k ) {
1203  const IntrinsicType a1( set( A(i,k) ) );
1204  xmm1 = xmm1 + a1 * B.load(k,j );
1205  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
1206  }
1207 
1208  (~C).store( i, j , xmm1 );
1209  (~C).store( i, j+IT::size, xmm2 );
1210  }
1211  }
1212 
1213  if( j < N )
1214  {
1215  size_t i( 0UL );
1216 
1217  for( ; (i+2UL) <= M; i+=2UL )
1218  {
1219  const size_t kbegin( ( IsUpper<MT4>::value )
1220  ?( ( IsLower<MT5>::value )
1221  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1222  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1223  :( IsLower<MT5>::value ? j : 0UL ) );
1224  const size_t kend( ( IsLower<MT4>::value )
1225  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1226  :( K ) );
1227 
1228  IntrinsicType xmm1, xmm2;
1229 
1230  for( size_t k=kbegin; k<kend; ++k ) {
1231  const IntrinsicType b1( B.load(k,j) );
1232  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1233  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1234  }
1235 
1236  (~C).store( i , j, xmm1 );
1237  (~C).store( i+1UL, j, xmm2 );
1238  }
1239 
1240  if( i < M )
1241  {
1242  const size_t kbegin( ( IsUpper<MT4>::value )
1243  ?( ( IsLower<MT5>::value )
1244  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1245  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1246  :( IsLower<MT5>::value ? j : 0UL ) );
1247 
1248  IntrinsicType xmm1;
1249 
1250  for( size_t k=kbegin; k<K; ++k ) {
1251  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
1252  }
1253 
1254  (~C).store( i, j, xmm1 );
1255  }
1256  }
1257  }
1259  //**********************************************************************************************
1260 
1261  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized kernel that overwrites the column-major target C with the product A * B.
// The rows of C are covered in panels of 8, 4, 2 and finally 1 SIMD vector (IT::size
// elements each): A is read with vector loads along its row index i, while single
// elements of B are broadcast with set(). C is only stored to, never loaded.
// The IsLower/IsUpper/IsStrictly* traits of the operand types appear only in the
// computation of the inner k range [kbegin,kend).
// NOTE(review): the accumulators are default-constructed IntrinsicType objects; this
// relies on that constructor yielding a zero vector -- not visible in this chunk, confirm.
// NOTE(review): the panel conditions only test that the first row of the last vector is
// below M, so loads/stores may touch rows >= M; presumably the operands are padded to a
// multiple of IT::size -- confirm.
//
// \param C The target left-hand side dense matrix (column-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
1276  template< typename MT3 // Type of the left-hand side target matrix
1277  , typename MT4 // Type of the left-hand side matrix operand
1278  , typename MT5 > // Type of the right-hand side matrix operand
1279  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1280  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1281  {
1282  typedef IntrinsicTrait<ElementType> IT;
1283 
1284  const size_t M( A.rows() );
1285  const size_t N( B.columns() );
1286  const size_t K( A.columns() );
1287 
      // Row index shared by all panel loops below; each loop continues where the previous stopped.
1288  size_t i( 0UL );
1289 
      // Panels of eight SIMD vectors of rows, one column j of C per iteration.
1290  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1291  for( size_t j=0UL; j<N; ++j )
1292  {
      // Lower bound of k: j (j+1 if strictly) when MT5 is flagged IsLower, i when MT4 is
      // flagged IsUpper, the larger of the two when both apply, otherwise 0.
1293  const size_t kbegin( ( IsLower<MT5>::value )
1294  ?( ( IsUpper<MT4>::value )
1295  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1296  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1297  :( IsUpper<MT4>::value ? i : 0UL ) );
      // Upper bound of k: j+1 (j if strictly) when MT5 is flagged IsUpper, min(i+IT::size*8,K)
      // when MT4 is flagged IsLower, the smallest when both apply, otherwise K.
1298  const size_t kend( ( IsUpper<MT5>::value )
1299  ?( ( IsLower<MT4>::value )
1300  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1301  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1302  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
1303 
1304  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1305 
      // Broadcast B(k,j) once and multiply it with eight consecutive vectors of column k of A.
1306  for( size_t k=kbegin; k<kend; ++k ) {
1307  const IntrinsicType b1( set( B(k,j) ) );
1308  xmm1 = xmm1 + A.load(i ,k) * b1;
1309  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1310  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1311  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1312  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
1313  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
1314  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
1315  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
1316  }
1317 
1318  (~C).store( i , j, xmm1 );
1319  (~C).store( i+IT::size , j, xmm2 );
1320  (~C).store( i+IT::size*2UL, j, xmm3 );
1321  (~C).store( i+IT::size*3UL, j, xmm4 );
1322  (~C).store( i+IT::size*4UL, j, xmm5 );
1323  (~C).store( i+IT::size*5UL, j, xmm6 );
1324  (~C).store( i+IT::size*6UL, j, xmm7 );
1325  (~C).store( i+IT::size*7UL, j, xmm8 );
1326  }
1327  }
1328 
      // Panels of four SIMD vectors of rows, two columns of C per iteration.
1329  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL )
1330  {
1331  size_t j( 0UL );
1332 
1333  for( ; (j+2UL) <= N; j+=2UL )
1334  {
1335  const size_t kbegin( ( IsLower<MT5>::value )
1336  ?( ( IsUpper<MT4>::value )
1337  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1338  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1339  :( IsUpper<MT4>::value ? i : 0UL ) );
      // Two columns (j, j+1) are handled at once, hence the bound j+2 (j+1 if strictly upper).
1340  const size_t kend( ( IsUpper<MT5>::value )
1341  ?( ( IsLower<MT4>::value )
1342  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1343  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1344  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
1345 
      // xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
1346  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1347 
1348  for( size_t k=kbegin; k<kend; ++k ) {
1349  const IntrinsicType a1( A.load(i ,k) );
1350  const IntrinsicType a2( A.load(i+IT::size ,k) );
1351  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
1352  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
1353  const IntrinsicType b1( set( B(k,j ) ) );
1354  const IntrinsicType b2( set( B(k,j+1UL) ) );
1355  xmm1 = xmm1 + a1 * b1;
1356  xmm2 = xmm2 + a2 * b1;
1357  xmm3 = xmm3 + a3 * b1;
1358  xmm4 = xmm4 + a4 * b1;
1359  xmm5 = xmm5 + a1 * b2;
1360  xmm6 = xmm6 + a2 * b2;
1361  xmm7 = xmm7 + a3 * b2;
1362  xmm8 = xmm8 + a4 * b2;
1363  }
1364 
1365  (~C).store( i , j , xmm1 );
1366  (~C).store( i+IT::size , j , xmm2 );
1367  (~C).store( i+IT::size*2UL, j , xmm3 );
1368  (~C).store( i+IT::size*3UL, j , xmm4 );
1369  (~C).store( i , j+1UL, xmm5 );
1370  (~C).store( i+IT::size , j+1UL, xmm6 );
1371  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
1372  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
1373  }
1374 
      // Odd N: the single remaining column. The upper k bound does not depend on MT5 here.
1375  if( j < N )
1376  {
1377  const size_t kbegin( ( IsLower<MT5>::value )
1378  ?( ( IsUpper<MT4>::value )
1379  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1380  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1381  :( IsUpper<MT4>::value ? i : 0UL ) );
1382  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
1383 
1384  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1385 
1386  for( size_t k=kbegin; k<kend; ++k ) {
1387  const IntrinsicType b1( set( B(k,j) ) );
1388  xmm1 = xmm1 + A.load(i ,k) * b1;
1389  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1390  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1391  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1392  }
1393 
1394  (~C).store( i , j, xmm1 );
1395  (~C).store( i+IT::size , j, xmm2 );
1396  (~C).store( i+IT::size*2UL, j, xmm3 );
1397  (~C).store( i+IT::size*3UL, j, xmm4 );
1398  }
1399  }
1400 
      // Panels of two SIMD vectors of rows, two columns of C per iteration.
1401  for( ; (i+IT::size) < M; i+=IT::size*2UL )
1402  {
1403  size_t j( 0UL );
1404 
1405  for( ; (j+2UL) <= N; j+=2UL )
1406  {
1407  const size_t kbegin( ( IsLower<MT5>::value )
1408  ?( ( IsUpper<MT4>::value )
1409  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1410  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1411  :( IsUpper<MT4>::value ? i : 0UL ) );
1412  const size_t kend( ( IsUpper<MT5>::value )
1413  ?( ( IsLower<MT4>::value )
1414  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1415  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1416  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
1417 
1418  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1419 
1420  for( size_t k=kbegin; k<kend; ++k ) {
1421  const IntrinsicType a1( A.load(i ,k) );
1422  const IntrinsicType a2( A.load(i+IT::size,k) );
1423  const IntrinsicType b1( set( B(k,j ) ) );
1424  const IntrinsicType b2( set( B(k,j+1UL) ) );
1425  xmm1 = xmm1 + a1 * b1;
1426  xmm2 = xmm2 + a2 * b1;
1427  xmm3 = xmm3 + a1 * b2;
1428  xmm4 = xmm4 + a2 * b2;
1429  }
1430 
1431  (~C).store( i , j , xmm1 );
1432  (~C).store( i+IT::size, j , xmm2 );
1433  (~C).store( i , j+1UL, xmm3 );
1434  (~C).store( i+IT::size, j+1UL, xmm4 );
1435  }
1436 
      // Odd N: the single remaining column of this two-vector panel.
1437  if( j < N )
1438  {
1439  const size_t kbegin( ( IsLower<MT5>::value )
1440  ?( ( IsUpper<MT4>::value )
1441  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1442  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1443  :( IsUpper<MT4>::value ? i : 0UL ) );
1444  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
1445 
1446  IntrinsicType xmm1, xmm2;
1447 
1448  for( size_t k=kbegin; k<kend; ++k ) {
1449  const IntrinsicType b1( set( B(k,j) ) );
1450  xmm1 = xmm1 + A.load(i ,k) * b1;
1451  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
1452  }
1453 
1454  (~C).store( i , j, xmm1 );
1455  (~C).store( i+IT::size, j, xmm2 );
1456  }
1457  }
1458 
      // Last (possibly partial) SIMD vector of rows.
1459  if( i < M )
1460  {
1461  size_t j( 0UL );
1462 
1463  for( ; (j+2UL) <= N; j+=2UL )
1464  {
1465  const size_t kbegin( ( IsLower<MT5>::value )
1466  ?( ( IsUpper<MT4>::value )
1467  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1468  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1469  :( IsUpper<MT4>::value ? i : 0UL ) );
      // Only the MT5 trait limits the upper bound in this panel.
1470  const size_t kend( ( IsUpper<MT5>::value )
1471  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1472  :( K ) );
1473 
1474  IntrinsicType xmm1, xmm2;
1475 
1476  for( size_t k=kbegin; k<kend; ++k ) {
1477  const IntrinsicType a1( A.load(i,k) );
1478  xmm1 = xmm1 + a1 * set( B(k,j ) );
1479  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1480  }
1481 
1482  (~C).store( i, j , xmm1 );
1483  (~C).store( i, j+1UL, xmm2 );
1484  }
1485 
      // Final single vector / single column corner; k always runs up to K here.
1486  if( j < N )
1487  {
1488  const size_t kbegin( ( IsLower<MT5>::value )
1489  ?( ( IsUpper<MT4>::value )
1490  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1491  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1492  :( IsUpper<MT4>::value ? i : 0UL ) );
1493 
1494  IntrinsicType xmm1;
1495 
1496  for( size_t k=kbegin; k<K; ++k ) {
1497  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
1498  }
1499 
1500  (~C).store( i, j, xmm1 );
1501  }
1502  }
1503  }
1505  //**********************************************************************************************
1506 
1507  //**Default assignment to dense matrices (large matrices)***************************************
// Non-vectorized variant of the large-matrix assignment kernel. It is enabled (via DisableIf)
// only when UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold and does no work of its own:
// the three arguments are forwarded unchanged to selectDefaultAssignKernel().
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
1521  template< typename MT3 // Type of the left-hand side target matrix
1522  , typename MT4 // Type of the left-hand side matrix operand
1523  , typename MT5 > // Type of the right-hand side matrix operand
1524  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1525  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1526  {
1527  selectDefaultAssignKernel( C, A, B );
1528  }
1530  //**********************************************************************************************
1531 
1532  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
// Blocked, vectorized kernel that assigns the product A * B to the row-major target C.
// C is processed in tiles of at most iblock (64) rows by jblock (128) columns. Each tile is
// first cleared with reset() and then updated once per k-slice of at most kblock (128)
// indices: the current values of C are loaded, the products of the slice are added and the
// result is stored back, so the contributions of successive slices accumulate in C.
// Within a tile, columns are covered in panels of 4, 2 and 1 SIMD vectors (IT::size elements
// each); B is read with vector loads along j, single elements of A are broadcast with set().
// NOTE(review): reset() presumably sets an element to zero -- its definition is not visible here.
// NOTE(review): the panel conditions only test that the first column of the last vector is
// below jend, so loads/stores may touch columns >= jend; presumably the operands are padded
// to a multiple of IT::size -- confirm.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
1547  template< typename MT3 // Type of the left-hand side target matrix
1548  , typename MT4 // Type of the left-hand side matrix operand
1549  , typename MT5 > // Type of the right-hand side matrix operand
1550  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1551  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1552  {
1553  typedef IntrinsicTrait<ElementType> IT;
1554 
1555  const size_t M( A.rows() );
1556  const size_t N( B.columns() );
1557  const size_t K( A.columns() );
1558 
      // Tile extents in rows (i), columns (j) and the contraction index (k).
1559  const size_t iblock( 64UL );
1560  const size_t jblock( 128UL );
1561  const size_t kblock( 128UL );
1562 
1563  for( size_t jj=0UL; jj<N; jj+=jblock )
1564  {
1565  const size_t jend( min( jj+jblock, N ) );
1566 
1567  for( size_t ii=0UL; ii<M; ii+=iblock )
1568  {
1569  const size_t iend( min( ii+iblock, M ) );
1570 
      // Clear the tile before the k-slices are accumulated into it.
1571  for( size_t i=ii; i<iend; ++i ) {
1572  for( size_t j=jj; j<jend; ++j ) {
1573  reset( (~C)(i,j) );
1574  }
1575  }
1576 
1577  for( size_t kk=0UL; kk<K; kk+=kblock )
1578  {
      // End of the current k-slice (before any trait-based narrowing).
1579  const size_t ktmp( min( kk+kblock, K ) );
1580 
      // Column index shared by the panel loops below.
1581  size_t j( jj );
1582 
      // Panels of four SIMD vectors of columns.
1583  for( ; (j+IT::size*3UL) < jend; j+=IT::size*4UL )
1584  {
1585  const size_t j1( j+IT::size );
1586  const size_t j2( j+IT::size*2UL );
1587  const size_t j3( j+IT::size*3UL );
1588 
1589  size_t i( ii );
1590 
      // Two rows of C per iteration.
1591  for( ; (i+2UL) <= iend; i+=2UL )
1592  {
      // k range of this slice: starts at kk, raised to i when MT4 is flagged IsUpper and to j
      // when MT5 is flagged IsLower; ends at ktmp, lowered to i+2 when MT4 is flagged IsLower
      // and to j+IT::size*4 when MT5 is flagged IsUpper.
1593  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1594  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1595  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1596  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
1597 
      // Start from the values already accumulated in C by earlier k-slices.
1598  IntrinsicType xmm1( (~C).load(i ,j ) );
1599  IntrinsicType xmm2( (~C).load(i ,j1) );
1600  IntrinsicType xmm3( (~C).load(i ,j2) );
1601  IntrinsicType xmm4( (~C).load(i ,j3) );
1602  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1603  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
1604  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
1605  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
1606 
1607  for( size_t k=kbegin; k<kend; ++k ) {
1608  const IntrinsicType a1( set( A(i ,k) ) );
1609  const IntrinsicType a2( set( A(i+1UL,k) ) );
1610  const IntrinsicType b1( B.load(k,j ) );
1611  const IntrinsicType b2( B.load(k,j1) );
1612  const IntrinsicType b3( B.load(k,j2) );
1613  const IntrinsicType b4( B.load(k,j3) );
1614  xmm1 = xmm1 + a1 * b1;
1615  xmm2 = xmm2 + a1 * b2;
1616  xmm3 = xmm3 + a1 * b3;
1617  xmm4 = xmm4 + a1 * b4;
1618  xmm5 = xmm5 + a2 * b1;
1619  xmm6 = xmm6 + a2 * b2;
1620  xmm7 = xmm7 + a2 * b3;
1621  xmm8 = xmm8 + a2 * b4;
1622  }
1623 
1624  (~C).store( i , j , xmm1 );
1625  (~C).store( i , j1, xmm2 );
1626  (~C).store( i , j2, xmm3 );
1627  (~C).store( i , j3, xmm4 );
1628  (~C).store( i+1UL, j , xmm5 );
1629  (~C).store( i+1UL, j1, xmm6 );
1630  (~C).store( i+1UL, j2, xmm7 );
1631  (~C).store( i+1UL, j3, xmm8 );
1632  }
1633 
      // Odd number of rows in the tile: the single remaining row.
1634  if( i < iend )
1635  {
1636  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1637  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1638  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1639  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
1640 
1641  IntrinsicType xmm1( (~C).load(i,j ) );
1642  IntrinsicType xmm2( (~C).load(i,j1) );
1643  IntrinsicType xmm3( (~C).load(i,j2) );
1644  IntrinsicType xmm4( (~C).load(i,j3) );
1645 
1646  for( size_t k=kbegin; k<kend; ++k ) {
1647  const IntrinsicType a1( set( A(i,k) ) );
1648  xmm1 = xmm1 + a1 * B.load(k,j );
1649  xmm2 = xmm2 + a1 * B.load(k,j1);
1650  xmm3 = xmm3 + a1 * B.load(k,j2);
1651  xmm4 = xmm4 + a1 * B.load(k,j3);
1652  }
1653 
1654  (~C).store( i, j , xmm1 );
1655  (~C).store( i, j1, xmm2 );
1656  (~C).store( i, j2, xmm3 );
1657  (~C).store( i, j3, xmm4 );
1658  }
1659  }
1660 
      // Panels of two SIMD vectors of columns; rows are taken 4, then 2, then 1 at a time.
1661  for( ; (j+IT::size) < jend; j+=IT::size*2UL )
1662  {
1663  const size_t j1( j+IT::size );
1664 
1665  size_t i( ii );
1666 
1667  for( ; (i+4UL) <= iend; i+=4UL )
1668  {
1669  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1670  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1671  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1672  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
1673 
1674  IntrinsicType xmm1( (~C).load(i ,j ) );
1675  IntrinsicType xmm2( (~C).load(i ,j1) );
1676  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1677  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1678  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
1679  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
1680  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
1681  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
1682 
1683  for( size_t k=kbegin; k<kend; ++k ) {
1684  const IntrinsicType a1( set( A(i ,k) ) );
1685  const IntrinsicType a2( set( A(i+1UL,k) ) );
1686  const IntrinsicType a3( set( A(i+2UL,k) ) );
1687  const IntrinsicType a4( set( A(i+3UL,k) ) );
1688  const IntrinsicType b1( B.load(k,j ) );
1689  const IntrinsicType b2( B.load(k,j1) );
1690  xmm1 = xmm1 + a1 * b1;
1691  xmm2 = xmm2 + a1 * b2;
1692  xmm3 = xmm3 + a2 * b1;
1693  xmm4 = xmm4 + a2 * b2;
1694  xmm5 = xmm5 + a3 * b1;
1695  xmm6 = xmm6 + a3 * b2;
1696  xmm7 = xmm7 + a4 * b1;
1697  xmm8 = xmm8 + a4 * b2;
1698  }
1699 
1700  (~C).store( i , j , xmm1 );
1701  (~C).store( i , j1, xmm2 );
1702  (~C).store( i+1UL, j , xmm3 );
1703  (~C).store( i+1UL, j1, xmm4 );
1704  (~C).store( i+2UL, j , xmm5 );
1705  (~C).store( i+2UL, j1, xmm6 );
1706  (~C).store( i+3UL, j , xmm7 );
1707  (~C).store( i+3UL, j1, xmm8 );
1708  }
1709 
1710  for( ; (i+2UL) <= iend; i+=2UL )
1711  {
1712  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1713  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1714  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1715  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
1716 
1717  IntrinsicType xmm1( (~C).load(i ,j ) );
1718  IntrinsicType xmm2( (~C).load(i ,j1) );
1719  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1720  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1721 
1722  for( size_t k=kbegin; k<kend; ++k ) {
1723  const IntrinsicType a1( set( A(i ,k) ) );
1724  const IntrinsicType a2( set( A(i+1UL,k) ) );
1725  const IntrinsicType b1( B.load(k,j ) );
1726  const IntrinsicType b2( B.load(k,j1) );
1727  xmm1 = xmm1 + a1 * b1;
1728  xmm2 = xmm2 + a1 * b2;
1729  xmm3 = xmm3 + a2 * b1;
1730  xmm4 = xmm4 + a2 * b2;
1731  }
1732 
1733  (~C).store( i , j , xmm1 );
1734  (~C).store( i , j1, xmm2 );
1735  (~C).store( i+1UL, j , xmm3 );
1736  (~C).store( i+1UL, j1, xmm4 );
1737  }
1738 
1739  if( i < iend )
1740  {
1741  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1742  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1743  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1744  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
1745 
1746  IntrinsicType xmm1( (~C).load(i,j ) );
1747  IntrinsicType xmm2( (~C).load(i,j1) );
1748 
1749  for( size_t k=kbegin; k<kend; ++k ) {
1750  const IntrinsicType a1( set( A(i,k) ) );
1751  xmm1 = xmm1 + a1 * B.load(k,j );
1752  xmm2 = xmm2 + a1 * B.load(k,j1);
1753  }
1754 
1755  (~C).store( i, j , xmm1 );
1756  (~C).store( i, j1, xmm2 );
1757  }
1758  }
1759 
      // Last (possibly partial) SIMD vector of columns, one row at a time.
1760  if( j < jend )
1761  {
1762  for( size_t i=ii; i<iend; ++i )
1763  {
1764  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1765  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1766  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1767  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
1768 
1769  IntrinsicType xmm1( (~C).load(i,j) );
1770 
1771  for( size_t k=kbegin; k<kend; ++k ) {
1772  const IntrinsicType a1( set( A(i,k) ) );
1773  xmm1 = xmm1 + a1 * B.load(k,j);
1774  }
1775 
1776  (~C).store( i, j, xmm1 );
1777  }
1778  }
1779  }
1780  }
1781  }
1782  }
1784  //**********************************************************************************************
1785 
1786  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
// Blocked, vectorized kernel that assigns the product A * B to the column-major target C.
// C is processed in tiles of at most iblock (128) rows by jblock (64) columns. Each tile is
// first cleared with reset() and then updated once per k-slice of at most kblock (128)
// indices: the current values of C are loaded, the products of the slice are added and the
// result is stored back, so the contributions of successive slices accumulate in C.
// Within a tile, rows are covered in panels of 4, 2 and 1 SIMD vectors (IT::size elements
// each); A is read with vector loads along i, single elements of B are broadcast with set().
// NOTE(review): reset() presumably sets an element to zero -- its definition is not visible here.
// NOTE(review): the panel conditions only test that the first row of the last vector is
// below iend, so loads/stores may touch rows >= iend; presumably the operands are padded
// to a multiple of IT::size -- confirm.
//
// \param C The target left-hand side dense matrix (column-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
1801  template< typename MT3 // Type of the left-hand side target matrix
1802  , typename MT4 // Type of the left-hand side matrix operand
1803  , typename MT5 > // Type of the right-hand side matrix operand
1804  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1805  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1806  {
1807  typedef IntrinsicTrait<ElementType> IT;
1808 
1809  const size_t M( A.rows() );
1810  const size_t N( B.columns() );
1811  const size_t K( A.columns() );
1812 
      // Tile extents in rows (i), columns (j) and the contraction index (k).
1813  const size_t iblock( 128UL );
1814  const size_t jblock( 64UL );
1815  const size_t kblock( 128UL );
1816 
1817  for( size_t ii=0UL; ii<M; ii+=iblock )
1818  {
1819  const size_t iend( min( ii+iblock, M ) );
1820 
1821  for( size_t jj=0UL; jj<N; jj+=jblock )
1822  {
1823  const size_t jend( min( jj+jblock, N ) );
1824 
      // Clear the tile before the k-slices are accumulated into it.
1825  for( size_t j=jj; j<jend; ++j ) {
1826  for( size_t i=ii; i<iend; ++i ) {
1827  reset( (~C)(i,j) );
1828  }
1829  }
1830 
1831  for( size_t kk=0UL; kk<K; kk+=kblock )
1832  {
      // End of the current k-slice (before any trait-based narrowing).
1833  const size_t ktmp( min( kk+kblock, K ) );
1834 
      // Row index shared by the panel loops below.
1835  size_t i( ii );
1836 
      // Panels of four SIMD vectors of rows.
1837  for( ; (i+IT::size*3UL) < iend; i+=IT::size*4UL )
1838  {
1839  const size_t i1( i+IT::size );
1840  const size_t i2( i+IT::size*2UL );
1841  const size_t i3( i+IT::size*3UL );
1842 
1843  size_t j( jj );
1844 
      // Two columns of C per iteration.
1845  for( ; (j+2UL) <= jend; j+=2UL )
1846  {
      // k range of this slice: starts at kk, raised to i when MT4 is flagged IsUpper and to j
      // when MT5 is flagged IsLower; ends at ktmp, lowered to i+IT::size*4 when MT4 is flagged
      // IsLower and to j+2 when MT5 is flagged IsUpper.
1847  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1848  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1849  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
1850  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1851 
      // Start from the values already accumulated in C by earlier k-slices.
1852  IntrinsicType xmm1( (~C).load(i ,j ) );
1853  IntrinsicType xmm2( (~C).load(i1,j ) );
1854  IntrinsicType xmm3( (~C).load(i2,j ) );
1855  IntrinsicType xmm4( (~C).load(i3,j ) );
1856  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
1857  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
1858  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
1859  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
1860 
1861  for( size_t k=kbegin; k<kend; ++k ) {
1862  const IntrinsicType a1( A.load(i ,k) );
1863  const IntrinsicType a2( A.load(i1,k) );
1864  const IntrinsicType a3( A.load(i2,k) );
1865  const IntrinsicType a4( A.load(i3,k) );
1866  const IntrinsicType b1( set( B(k,j ) ) );
1867  const IntrinsicType b2( set( B(k,j+1UL) ) );
1868  xmm1 = xmm1 + a1 * b1;
1869  xmm2 = xmm2 + a2 * b1;
1870  xmm3 = xmm3 + a3 * b1;
1871  xmm4 = xmm4 + a4 * b1;
1872  xmm5 = xmm5 + a1 * b2;
1873  xmm6 = xmm6 + a2 * b2;
1874  xmm7 = xmm7 + a3 * b2;
1875  xmm8 = xmm8 + a4 * b2;
1876  }
1877 
1878  (~C).store( i , j , xmm1 );
1879  (~C).store( i1, j , xmm2 );
1880  (~C).store( i2, j , xmm3 );
1881  (~C).store( i3, j , xmm4 );
1882  (~C).store( i , j+1UL, xmm5 );
1883  (~C).store( i1, j+1UL, xmm6 );
1884  (~C).store( i2, j+1UL, xmm7 );
1885  (~C).store( i3, j+1UL, xmm8 );
1886  }
1887 
      // Odd number of columns in the tile: the single remaining column.
1888  if( j < jend )
1889  {
1890  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1891  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1892  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
1893  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1894 
1895  IntrinsicType xmm1( (~C).load(i ,j) );
1896  IntrinsicType xmm2( (~C).load(i1,j) );
1897  IntrinsicType xmm3( (~C).load(i2,j) );
1898  IntrinsicType xmm4( (~C).load(i3,j) );
1899 
1900  for( size_t k=kbegin; k<kend; ++k ) {
1901  const IntrinsicType b1( set( B(k,j) ) );
1902  xmm1 = xmm1 + A.load(i ,k) * b1;
1903  xmm2 = xmm2 + A.load(i1,k) * b1;
1904  xmm3 = xmm3 + A.load(i2,k) * b1;
1905  xmm4 = xmm4 + A.load(i3,k) * b1;
1906  }
1907 
1908  (~C).store( i , j, xmm1 );
1909  (~C).store( i1, j, xmm2 );
1910  (~C).store( i2, j, xmm3 );
1911  (~C).store( i3, j, xmm4 );
1912  }
1913  }
1914 
      // Panels of two SIMD vectors of rows; columns are taken 4, then 2, then 1 at a time.
1915  for( ; (i+IT::size) < iend; i+=IT::size*2UL )
1916  {
1917  const size_t i1( i+IT::size );
1918 
1919  size_t j( jj );
1920 
1921  for( ; (j+4UL) <= jend; j+=4UL )
1922  {
1923  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1924  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1925  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
1926  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
1927 
1928  IntrinsicType xmm1( (~C).load(i ,j ) );
1929  IntrinsicType xmm2( (~C).load(i1,j ) );
1930  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
1931  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
1932  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
1933  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
1934  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
1935  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
1936 
1937  for( size_t k=kbegin; k<kend; ++k ) {
1938  const IntrinsicType a1( A.load(i ,k) );
1939  const IntrinsicType a2( A.load(i1,k) );
1940  const IntrinsicType b1( set( B(k,j ) ) );
1941  const IntrinsicType b2( set( B(k,j+1UL) ) );
1942  const IntrinsicType b3( set( B(k,j+2UL) ) );
1943  const IntrinsicType b4( set( B(k,j+3UL) ) );
1944  xmm1 = xmm1 + a1 * b1;
1945  xmm2 = xmm2 + a2 * b1;
1946  xmm3 = xmm3 + a1 * b2;
1947  xmm4 = xmm4 + a2 * b2;
1948  xmm5 = xmm5 + a1 * b3;
1949  xmm6 = xmm6 + a2 * b3;
1950  xmm7 = xmm7 + a1 * b4;
1951  xmm8 = xmm8 + a2 * b4;
1952  }
1953 
1954  (~C).store( i , j , xmm1 );
1955  (~C).store( i1, j , xmm2 );
1956  (~C).store( i , j+1UL, xmm3 );
1957  (~C).store( i1, j+1UL, xmm4 );
1958  (~C).store( i , j+2UL, xmm5 );
1959  (~C).store( i1, j+2UL, xmm6 );
1960  (~C).store( i , j+3UL, xmm7 );
1961  (~C).store( i1, j+3UL, xmm8 );
1962  }
1963 
1964  for( ; (j+2UL) <= jend; j+=2UL )
1965  {
1966  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1967  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1968  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
1969  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1970 
1971  IntrinsicType xmm1( (~C).load(i ,j ) );
1972  IntrinsicType xmm2( (~C).load(i1,j ) );
1973  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
1974  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
1975 
1976  for( size_t k=kbegin; k<kend; ++k ) {
1977  const IntrinsicType a1( A.load(i ,k) );
1978  const IntrinsicType a2( A.load(i1,k) );
1979  const IntrinsicType b1( set( B(k,j ) ) );
1980  const IntrinsicType b2( set( B(k,j+1UL) ) );
1981  xmm1 = xmm1 + a1 * b1;
1982  xmm2 = xmm2 + a2 * b1;
1983  xmm3 = xmm3 + a1 * b2;
1984  xmm4 = xmm4 + a2 * b2;
1985  }
1986 
1987  (~C).store( i , j , xmm1 );
1988  (~C).store( i1, j , xmm2 );
1989  (~C).store( i , j+1UL, xmm3 );
1990  (~C).store( i1, j+1UL, xmm4 );
1991  }
1992 
1993  if( j < jend )
1994  {
1995  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
1996  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
1997  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
1998  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1999 
2000  IntrinsicType xmm1( (~C).load(i ,j) );
2001  IntrinsicType xmm2( (~C).load(i1,j) );
2002 
2003  for( size_t k=kbegin; k<kend; ++k ) {
2004  const IntrinsicType b1( set( B(k,j) ) );
2005  xmm1 = xmm1 + A.load(i ,k) * b1;
2006  xmm2 = xmm2 + A.load(i1,k) * b1;
2007  }
2008 
2009  (~C).store( i , j, xmm1 );
2010  (~C).store( i1, j, xmm2 );
2011  }
2012  }
2013 
      // Last (possibly partial) SIMD vector of rows, one column at a time.
2014  if( i < iend )
2015  {
2016  for( size_t j=jj; j<jend; ++j )
2017  {
2018  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
2019  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
2020  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
2021  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2022 
2023  IntrinsicType xmm1( (~C).load(i,j) );
2024 
2025  for( size_t k=kbegin; k<kend; ++k ) {
2026  const IntrinsicType b1( set( B(k,j) ) );
2027  xmm1 = xmm1 + A.load(i,k) * b1;
2028  }
2029 
2030  (~C).store( i, j, xmm1 );
2031  }
2032  }
2033  }
2034  }
2035  }
2036  }
2038  //**********************************************************************************************
2039 
2040  //**BLAS-based assignment to dense matrices (default)*******************************************
// Variant of the BLAS assignment kernel that is enabled (via EnableIf) when
// UseDefaultKernel<MT3,MT4,MT5> holds. No BLAS routine is called here: the three arguments
// are forwarded unchanged to selectLargeAssignKernel().
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
2054  template< typename MT3 // Type of the left-hand side target matrix
2055  , typename MT4 // Type of the left-hand side matrix operand
2056  , typename MT5 > // Type of the right-hand side matrix operand
2057  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2058  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2059  {
2060  selectLargeAssignKernel( C, A, B );
2061  }
2063  //**********************************************************************************************
2064 
2065  //**BLAS-based assignment to dense matrices (single precision)**********************************
2066 #if BLAZE_BLAS_MODE
2067 
2080  template< typename MT3 // Type of the left-hand side target matrix
2081  , typename MT4 // Type of the left-hand side matrix operand
2082  , typename MT5 > // Type of the right-hand side matrix operand
2083  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2084  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2085  {
2086  if( IsTriangular<MT4>::value ) {
2087  assign( C, B );
2088  strmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
2089  }
2090  else if( IsTriangular<MT5>::value ) {
2091  assign( C, A );
2092  strmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
2093  }
2094  else {
2095  sgemm( C, A, B, 1.0F, 0.0F );
2096  }
2097  }
2099 #endif
2100  //**********************************************************************************************
2101 
2102  //**BLAS-based assignment to dense matrices (double precision)**********************************
2103 #if BLAZE_BLAS_MODE
2104 
2117  template< typename MT3 // Type of the left-hand side target matrix
2118  , typename MT4 // Type of the left-hand side matrix operand
2119  , typename MT5 > // Type of the right-hand side matrix operand
2120  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2121  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2122  {
2123  if( IsTriangular<MT4>::value ) {
2124  assign( C, B );
2125  dtrmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
2126  }
2127  else if( IsTriangular<MT5>::value ) {
2128  assign( C, A );
2129  dtrmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
2130  }
2131  else {
2132  dgemm( C, A, B, 1.0, 0.0 );
2133  }
2134  }
2136 #endif
2137  //**********************************************************************************************
2138 
2139  //**BLAS-based assignment to dense matrices (single precision complex)**************************
2140 #if BLAZE_BLAS_MODE
2141 
2154  template< typename MT3 // Type of the left-hand side target matrix
2155  , typename MT4 // Type of the left-hand side matrix operand
2156  , typename MT5 > // Type of the right-hand side matrix operand
2157  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2158  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2159  {
2160  if( IsTriangular<MT4>::value ) {
2161  assign( C, B );
2162  ctrmm( C, A, CblasLeft,
2163  ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
2164  complex<float>( 1.0F, 0.0F ) );
2165  }
2166  else if( IsTriangular<MT5>::value ) {
2167  assign( C, A );
2168  ctrmm( C, B, CblasRight,
2169  ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
2170  complex<float>( 1.0F, 0.0F ) );
2171  }
2172  else {
2173  cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 0.0F, 0.0F ) );
2174  }
2175  }
2177 #endif
2178  //**********************************************************************************************
2179 
2180  //**BLAS-based assignment to dense matrices (double precision complex)**************************
2181 #if BLAZE_BLAS_MODE
2182 
2195  template< typename MT3 // Type of the left-hand side target matrix
2196  , typename MT4 // Type of the left-hand side matrix operand
2197  , typename MT5 > // Type of the right-hand side matrix operand
2198  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2199  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2200  {
2201  if( IsTriangular<MT4>::value ) {
2202  assign( C, B );
2203  ztrmm( C, A, CblasLeft,
2204  ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
2205  complex<double>( 1.0, 0.0 ) );
2206  }
2207  else if( IsTriangular<MT5>::value ) {
2208  assign( C, A );
2209  ztrmm( C, B, CblasRight,
2210  ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
2211  complex<double>( 1.0, 0.0 ) );
2212  }
2213  else {
2214  zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 0.0, 0.0 ) );
2215  }
2216  }
2218 #endif
2219  //**********************************************************************************************
2220 
2221  //**Assignment to sparse matrices***************************************************************
2234  template< typename MT // Type of the target sparse matrix
2235  , bool SO > // Storage order of the target sparse matrix
2236  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2237  {
2239 
2240  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
2241 
2248 
2249  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2250  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2251 
2252  const TmpType tmp( serial( rhs ) );
2253  assign( ~lhs, tmp );
2254  }
2256  //**********************************************************************************************
2257 
2258  //**Addition assignment to dense matrices*******************************************************
2271  template< typename MT // Type of the target dense matrix
2272  , bool SO > // Storage order of the target dense matrix
2273  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2274  {
2276 
2277  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2278  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2279 
2280  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2281  return;
2282  }
2283 
2284  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2285  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2286 
2287  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2288  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2289  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2290  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2291  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2292  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2293 
2294  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2295  }
2297  //**********************************************************************************************
2298 
2299  //**Addition assignment to dense matrices (kernel selection)************************************
2310  template< typename MT3 // Type of the left-hand side target matrix
2311  , typename MT4 // Type of the left-hand side matrix operand
2312  , typename MT5 > // Type of the right-hand side matrix operand
2313  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2314  {
2315  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
2316  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2317  selectSmallAddAssignKernel( C, A, B );
2318  else
2319  selectBlasAddAssignKernel( C, A, B );
2320  }
2322  //**********************************************************************************************
2323 
2324  //**Default addition assignment to row-major dense matrices (general/general)*******************
2338  template< typename MT3 // Type of the left-hand side target matrix
2339  , typename MT4 // Type of the left-hand side matrix operand
2340  , typename MT5 > // Type of the right-hand side matrix operand
2341  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2342  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2343  {
2344  const size_t M( A.rows() );
2345  const size_t N( B.columns() );
2346  const size_t K( A.columns() );
2347 
2348  for( size_t i=0UL; i<M; ++i )
2349  {
2350  const size_t kbegin( ( IsUpper<MT4>::value )
2351  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2352  :( 0UL ) );
2353  const size_t kend( ( IsLower<MT4>::value )
2354  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2355  :( K ) );
2356  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2357 
2358  for( size_t k=kbegin; k<kend; ++k )
2359  {
2360  const size_t jbegin( ( IsUpper<MT5>::value )
2361  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2362  :( 0UL ) );
2363  const size_t jend( ( IsLower<MT5>::value )
2364  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
2365  :( N ) );
2366  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2367 
2368  const size_t jnum( jend - jbegin );
2369  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2370 
2371  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2372  (~C)(i,j ) += A(i,k) * B(k,j );
2373  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2374  }
2375  if( jpos < jend ) {
2376  (~C)(i,jpos) += A(i,k) * B(k,jpos);
2377  }
2378  }
2379  }
2380  }
2382  //**********************************************************************************************
2383 
2384  //**Default addition assignment to column-major dense matrices (general/general)****************
2398  template< typename MT3 // Type of the left-hand side target matrix
2399  , typename MT4 // Type of the left-hand side matrix operand
2400  , typename MT5 > // Type of the right-hand side matrix operand
2401  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2402  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2403  {
2404  const size_t M( A.rows() );
2405  const size_t N( B.columns() );
2406  const size_t K( A.columns() );
2407 
2408  for( size_t j=0UL; j<N; ++j )
2409  {
2410  const size_t kbegin( ( IsLower<MT5>::value )
2411  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2412  :( 0UL ) );
2413  const size_t kend( ( IsUpper<MT5>::value )
2414  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2415  :( K ) );
2416  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2417 
2418  for( size_t k=kbegin; k<kend; ++k )
2419  {
2420  const size_t ibegin( ( IsLower<MT4>::value )
2421  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2422  :( 0UL ) );
2423  const size_t iend( ( IsUpper<MT4>::value )
2424  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
2425  :( M ) );
2426  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2427 
2428  const size_t inum( iend - ibegin );
2429  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2430 
2431  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2432  (~C)(i ,j) += A(i ,k) * B(k,j);
2433  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2434  }
2435  if( ipos < iend ) {
2436  (~C)(ipos,j) += A(ipos,k) * B(k,j);
2437  }
2438  }
2439  }
2440  }
2442  //**********************************************************************************************
2443 
2444  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
2458  template< typename MT3 // Type of the left-hand side target matrix
2459  , typename MT4 // Type of the left-hand side matrix operand
2460  , typename MT5 > // Type of the right-hand side matrix operand
2461  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2462  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2463  {
2464  const size_t M( A.rows() );
2465  const size_t N( B.columns() );
2466 
2467  const size_t block( 16UL );
2468 
2469  for( size_t ii=0UL; ii<M; ii+=block ) {
2470  const size_t iend( min( M, ii+block ) );
2471  for( size_t jj=0UL; jj<N; jj+=block ) {
2472  const size_t jend( min( N, jj+block ) );
2473  for( size_t i=ii; i<iend; ++i )
2474  {
2475  const size_t jbegin( ( IsUpper<MT4>::value )
2476  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
2477  :( jj ) );
2478  const size_t jpos( ( IsLower<MT4>::value )
2479  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
2480  :( jend ) );
2481 
2482  for( size_t j=jbegin; j<jpos; ++j ) {
2483  (~C)(i,j) += A(i,j) * B(j,j);
2484  }
2485  }
2486  }
2487  }
2488  }
2490  //**********************************************************************************************
2491 
2492  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
2506  template< typename MT3 // Type of the left-hand side target matrix
2507  , typename MT4 // Type of the left-hand side matrix operand
2508  , typename MT5 > // Type of the right-hand side matrix operand
2509  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2510  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2511  {
2512  const size_t M( A.rows() );
2513  const size_t N( B.columns() );
2514 
2515  for( size_t j=0UL; j<N; ++j )
2516  {
2517  const size_t ibegin( ( IsLower<MT4>::value )
2518  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2519  :( 0UL ) );
2520  const size_t iend( ( IsUpper<MT4>::value )
2521  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
2522  :( M ) );
2523  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2524 
2525  const size_t inum( iend - ibegin );
2526  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2527 
2528  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2529  (~C)(i ,j) += A(i ,j) * B(j,j);
2530  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2531  }
2532  if( ipos < iend ) {
2533  (~C)(ipos,j) += A(ipos,j) * B(j,j);
2534  }
2535  }
2536  }
2538  //**********************************************************************************************
2539 
2540  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
2554  template< typename MT3 // Type of the left-hand side target matrix
2555  , typename MT4 // Type of the left-hand side matrix operand
2556  , typename MT5 > // Type of the right-hand side matrix operand
2557  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2558  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2559  {
2560  const size_t M( A.rows() );
2561  const size_t N( B.columns() );
2562 
2563  for( size_t i=0UL; i<M; ++i )
2564  {
2565  const size_t jbegin( ( IsUpper<MT5>::value )
2566  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2567  :( 0UL ) );
2568  const size_t jend( ( IsLower<MT5>::value )
2569  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
2570  :( N ) );
2571  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2572 
2573  const size_t jnum( jend - jbegin );
2574  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2575 
2576  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2577  (~C)(i,j ) += A(i,i) * B(i,j );
2578  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2579  }
2580  if( jpos < jend ) {
2581  (~C)(i,jpos) += A(i,i) * B(i,jpos);
2582  }
2583  }
2584  }
2586  //**********************************************************************************************
2587 
2588  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
2602  template< typename MT3 // Type of the left-hand side target matrix
2603  , typename MT4 // Type of the left-hand side matrix operand
2604  , typename MT5 > // Type of the right-hand side matrix operand
2605  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2606  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2607  {
2608  const size_t M( A.rows() );
2609  const size_t N( B.columns() );
2610 
2611  const size_t block( 16UL );
2612 
2613  for( size_t jj=0UL; jj<N; jj+=block ) {
2614  const size_t jend( min( N, jj+block ) );
2615  for( size_t ii=0UL; ii<M; ii+=block ) {
2616  const size_t iend( min( M, ii+block ) );
2617  for( size_t j=jj; j<jend; ++j )
2618  {
2619  const size_t ibegin( ( IsLower<MT5>::value )
2620  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
2621  :( ii ) );
2622  const size_t ipos( ( IsUpper<MT5>::value )
2623  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
2624  :( iend ) );
2625 
2626  for( size_t i=ibegin; i<ipos; ++i ) {
2627  (~C)(i,j) += A(i,i) * B(i,j);
2628  }
2629  }
2630  }
2631  }
2632  }
2634  //**********************************************************************************************
2635 
2636  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2650  template< typename MT3 // Type of the left-hand side target matrix
2651  , typename MT4 // Type of the left-hand side matrix operand
2652  , typename MT5 > // Type of the right-hand side matrix operand
2653  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2654  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2655  {
2656  for( size_t i=0UL; i<A.rows(); ++i ) {
2657  C(i,i) += A(i,i) * B(i,i);
2658  }
2659  }
2661  //**********************************************************************************************
2662 
2663  //**Default addition assignment to dense matrices (small matrices)******************************
2677  template< typename MT3 // Type of the left-hand side target matrix
2678  , typename MT4 // Type of the left-hand side matrix operand
2679  , typename MT5 > // Type of the right-hand side matrix operand
2680  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2681  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2682  {
2683  selectDefaultAddAssignKernel( C, A, B );
2684  }
2686  //**********************************************************************************************
2687 
2688  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
      // Vectorized small-matrix kernel for the addition assignment C += A * B with a row-major
      // target (DenseMatrix<MT3,false>). It is enabled via EnableIf when
      // UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
      //
      //  C: target matrix; panels of it are read with load() and written back with store().
      //  A: left-hand side operand; read element-wise, each element is passed through set()
      //     (presumably a broadcast into all lanes of an IntrinsicType -- confirm).
      //  B: right-hand side operand; read with load().
      //
      // The columns of C are processed in panels of decreasing width: IT::size*8 (one row at a
      // time), IT::size*4 and IT::size*2 (two rows at a time plus one leftover row), and finally
      // a single panel of one IntrinsicType. In every panel the k-range [kbegin,kend) is narrowed
      // according to the IsUpper/IsLower (and strict variants) traits of MT4 and MT5.
      // NOTE(review): every load/store moves a full IntrinsicType, also in the last panel where
      // fewer than IT::size columns may remain; this presumably relies on padded rows -- confirm.
2703  template< typename MT3 // Type of the left-hand side target matrix
2704  , typename MT4 // Type of the left-hand side matrix operand
2705  , typename MT5 > // Type of the right-hand side matrix operand
2706  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2707  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2708  {
2709  typedef IntrinsicTrait<ElementType> IT;
2710 
2711  const size_t M( A.rows() );
2712  const size_t N( B.columns() );
2713  const size_t K( A.columns() );
2714 
      // Column index shared by all panel loops below; each loop continues where the previous
      // one stopped.
2715  size_t j( 0UL );
2716 
      // Panels of IT::size*8 columns, one row of C per iteration (eight accumulators).
2717  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2718  for( size_t i=0UL; i<M; ++i )
2719  {
2720  const size_t kbegin( ( IsUpper<MT4>::value )
2721  ?( ( IsLower<MT5>::value )
2722  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2723  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2724  :( IsLower<MT5>::value ? j : 0UL ) );
2725  const size_t kend( ( IsLower<MT4>::value )
2726  ?( ( IsUpper<MT5>::value )
2727  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
2728  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2729  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
2730 
      // Start from the current contents of C, since this is an addition assignment.
2731  IntrinsicType xmm1( (~C).load(i,j ) );
2732  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
2733  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
2734  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
2735  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
2736  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
2737  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
2738  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
2739 
2740  for( size_t k=kbegin; k<kend; ++k ) {
2741  const IntrinsicType a1( set( A(i,k) ) );
2742  xmm1 = xmm1 + a1 * B.load(k,j );
2743  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2744  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2745  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2746  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
2747  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
2748  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
2749  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
2750  }
2751 
2752  (~C).store( i, j , xmm1 );
2753  (~C).store( i, j+IT::size , xmm2 );
2754  (~C).store( i, j+IT::size*2UL, xmm3 );
2755  (~C).store( i, j+IT::size*3UL, xmm4 );
2756  (~C).store( i, j+IT::size*4UL, xmm5 );
2757  (~C).store( i, j+IT::size*5UL, xmm6 );
2758  (~C).store( i, j+IT::size*6UL, xmm7 );
2759  (~C).store( i, j+IT::size*7UL, xmm8 );
2760  }
2761  }
2762 
      // Panels of IT::size*4 columns, two rows of C per iteration (xmm1-4: row i, xmm5-8: row i+1).
2763  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL )
2764  {
2765  size_t i( 0UL );
2766 
2767  for( ; (i+2UL) <= M; i+=2UL )
2768  {
2769  const size_t kbegin( ( IsUpper<MT4>::value )
2770  ?( ( IsLower<MT5>::value )
2771  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2772  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2773  :( IsLower<MT5>::value ? j : 0UL ) );
      // The upper k-bound uses i+1/i+2 (instead of i/i+1) because two rows are handled at once.
2774  const size_t kend( ( IsLower<MT4>::value )
2775  ?( ( IsUpper<MT5>::value )
2776  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
2777  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2778  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
2779 
2780  IntrinsicType xmm1( (~C).load(i ,j ) );
2781  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
2782  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
2783  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
2784  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
2785  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
2786  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
2787  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
2788 
2789  for( size_t k=kbegin; k<kend; ++k ) {
2790  const IntrinsicType a1( set( A(i ,k) ) );
2791  const IntrinsicType a2( set( A(i+1UL,k) ) );
2792  const IntrinsicType b1( B.load(k,j ) );
2793  const IntrinsicType b2( B.load(k,j+IT::size ) );
2794  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
2795  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
2796  xmm1 = xmm1 + a1 * b1;
2797  xmm2 = xmm2 + a1 * b2;
2798  xmm3 = xmm3 + a1 * b3;
2799  xmm4 = xmm4 + a1 * b4;
2800  xmm5 = xmm5 + a2 * b1;
2801  xmm6 = xmm6 + a2 * b2;
2802  xmm7 = xmm7 + a2 * b3;
2803  xmm8 = xmm8 + a2 * b4;
2804  }
2805 
2806  (~C).store( i , j , xmm1 );
2807  (~C).store( i , j+IT::size , xmm2 );
2808  (~C).store( i , j+IT::size*2UL, xmm3 );
2809  (~C).store( i , j+IT::size*3UL, xmm4 );
2810  (~C).store( i+1UL, j , xmm5 );
2811  (~C).store( i+1UL, j+IT::size , xmm6 );
2812  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
2813  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
2814  }
2815 
      // Leftover single row when M is odd.
2816  if( i < M )
2817  {
2818  const size_t kbegin( ( IsUpper<MT4>::value )
2819  ?( ( IsLower<MT5>::value )
2820  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2821  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2822  :( IsLower<MT5>::value ? j : 0UL ) );
2823  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
2824 
2825  IntrinsicType xmm1( (~C).load(i,j ) );
2826  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
2827  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
2828  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
2829 
2830  for( size_t k=kbegin; k<kend; ++k ) {
2831  const IntrinsicType a1( set( A(i,k) ) );
2832  xmm1 = xmm1 + a1 * B.load(k,j );
2833  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2834  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2835  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2836  }
2837 
2838  (~C).store( i, j , xmm1 );
2839  (~C).store( i, j+IT::size , xmm2 );
2840  (~C).store( i, j+IT::size*2UL, xmm3 );
2841  (~C).store( i, j+IT::size*3UL, xmm4 );
2842  }
2843  }
2844 
      // Panels of IT::size*2 columns, two rows of C per iteration (xmm1-2: row i, xmm3-4: row i+1).
2845  for( ; (j+IT::size) < N; j+=IT::size*2UL )
2846  {
2847  size_t i( 0UL );
2848 
2849  for( ; (i+2UL) <= M; i+=2UL )
2850  {
2851  const size_t kbegin( ( IsUpper<MT4>::value )
2852  ?( ( IsLower<MT5>::value )
2853  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2854  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2855  :( IsLower<MT5>::value ? j : 0UL ) );
2856  const size_t kend( ( IsLower<MT4>::value )
2857  ?( ( IsUpper<MT5>::value )
2858  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
2859  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2860  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
2861 
2862  IntrinsicType xmm1( (~C).load(i ,j ) );
2863  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
2864  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2865  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
2866 
2867  for( size_t k=kbegin; k<kend; ++k ) {
2868  const IntrinsicType a1( set( A(i ,k) ) );
2869  const IntrinsicType a2( set( A(i+1UL,k) ) );
2870  const IntrinsicType b1( B.load(k,j ) );
2871  const IntrinsicType b2( B.load(k,j+IT::size) );
2872  xmm1 = xmm1 + a1 * b1;
2873  xmm2 = xmm2 + a1 * b2;
2874  xmm3 = xmm3 + a2 * b1;
2875  xmm4 = xmm4 + a2 * b2;
2876  }
2877 
2878  (~C).store( i , j , xmm1 );
2879  (~C).store( i , j+IT::size, xmm2 );
2880  (~C).store( i+1UL, j , xmm3 );
2881  (~C).store( i+1UL, j+IT::size, xmm4 );
2882  }
2883 
      // Leftover single row when M is odd.
2884  if( i < M )
2885  {
2886  const size_t kbegin( ( IsUpper<MT4>::value )
2887  ?( ( IsLower<MT5>::value )
2888  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2889  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2890  :( IsLower<MT5>::value ? j : 0UL ) );
2891  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
2892 
2893  IntrinsicType xmm1( (~C).load(i,j ) );
2894  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
2895 
2896  for( size_t k=kbegin; k<kend; ++k ) {
2897  const IntrinsicType a1( set( A(i,k) ) );
2898  xmm1 = xmm1 + a1 * B.load(k,j );
2899  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
2900  }
2901 
2902  (~C).store( i, j , xmm1 );
2903  (~C).store( i, j+IT::size, xmm2 );
2904  }
2905  }
2906 
      // Last panel: a single IntrinsicType starting at column j, again two rows at a time.
      // Unlike the wider panels, the k-range here is not capped by the panel's column range.
2907  if( j < N )
2908  {
2909  size_t i( 0UL );
2910 
2911  for( ; (i+2UL) <= M; i+=2UL )
2912  {
2913  const size_t kbegin( ( IsUpper<MT4>::value )
2914  ?( ( IsLower<MT5>::value )
2915  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2916  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2917  :( IsLower<MT5>::value ? j : 0UL ) );
2918  const size_t kend( ( IsLower<MT4>::value )
2919  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2920  :( K ) );
2921 
2922  IntrinsicType xmm1( (~C).load(i ,j) );
2923  IntrinsicType xmm2( (~C).load(i+1UL,j) );
2924 
2925  for( size_t k=kbegin; k<kend; ++k ) {
2926  const IntrinsicType b1( B.load(k,j) );
2927  xmm1 = xmm1 + set( A(i ,k) ) * b1;
2928  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
2929  }
2930 
2931  (~C).store( i , j, xmm1 );
2932  (~C).store( i+1UL, j, xmm2 );
2933  }
2934 
      // Leftover single row when M is odd; k runs up to K without an upper restriction.
2935  if( i < M )
2936  {
2937  const size_t kbegin( ( IsUpper<MT4>::value )
2938  ?( ( IsLower<MT5>::value )
2939  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2940  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2941  :( IsLower<MT5>::value ? j : 0UL ) );
2942 
2943  IntrinsicType xmm1( (~C).load(i,j) );
2944 
2945  for( size_t k=kbegin; k<K; ++k ) {
2946  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
2947  }
2948 
2949  (~C).store( i, j, xmm1 );
2950  }
2951  }
2952  }
2954  //**********************************************************************************************
2955 
2956  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2971  template< typename MT3 // Type of the left-hand side target matrix
2972  , typename MT4 // Type of the left-hand side matrix operand
2973  , typename MT5 > // Type of the right-hand side matrix operand
2974  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2975  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2976  {
2977  typedef IntrinsicTrait<ElementType> IT;
2978 
2979  const size_t M( A.rows() );
2980  const size_t N( B.columns() );
2981  const size_t K( A.columns() );
2982 
2983  size_t i( 0UL );
2984 
2985  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2986  for( size_t j=0UL; j<N; ++j )
2987  {
2988  const size_t kbegin( ( IsLower<MT5>::value )
2989  ?( ( IsUpper<MT4>::value )
2990  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2991  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2992  :( IsUpper<MT4>::value ? i : 0UL ) );
2993  const size_t kend( ( IsUpper<MT5>::value )
2994  ?( ( IsLower<MT4>::value )
2995  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2996  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
2997  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
2998 
2999  IntrinsicType xmm1( (~C).load(i ,j) );
3000  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
3001  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
3002  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
3003  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
3004  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
3005  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
3006  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
3007 
3008  for( size_t k=kbegin; k<kend; ++k ) {
3009  const IntrinsicType b1( set( B(k,j) ) );
3010  xmm1 = xmm1 + A.load(i ,k) * b1;
3011  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3012  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3013  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3014  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3015  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3016  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3017  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
3018  }
3019 
3020  (~C).store( i , j, xmm1 );
3021  (~C).store( i+IT::size , j, xmm2 );
3022  (~C).store( i+IT::size*2UL, j, xmm3 );
3023  (~C).store( i+IT::size*3UL, j, xmm4 );
3024  (~C).store( i+IT::size*4UL, j, xmm5 );
3025  (~C).store( i+IT::size*5UL, j, xmm6 );
3026  (~C).store( i+IT::size*6UL, j, xmm7 );
3027  (~C).store( i+IT::size*7UL, j, xmm8 );
3028  }
3029  }
3030 
3031  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL )
3032  {
3033  size_t j( 0UL );
3034 
3035  for( ; (j+2UL) <= N; j+=2UL )
3036  {
3037  const size_t kbegin( ( IsLower<MT5>::value )
3038  ?( ( IsUpper<MT4>::value )
3039  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3040  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3041  :( IsUpper<MT4>::value ? i : 0UL ) );
3042  const size_t kend( ( IsUpper<MT5>::value )
3043  ?( ( IsLower<MT4>::value )
3044  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3045  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3046  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
3047 
3048  IntrinsicType xmm1( (~C).load(i ,j ) );
3049  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
3050  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
3051  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
3052  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3053  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
3054  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
3055  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
3056 
3057  for( size_t k=kbegin; k<kend; ++k ) {
3058  const IntrinsicType a1( A.load(i ,k) );
3059  const IntrinsicType a2( A.load(i+IT::size ,k) );
3060  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
3061  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
3062  const IntrinsicType b1( set( B(k,j ) ) );
3063  const IntrinsicType b2( set( B(k,j+1UL) ) );
3064  xmm1 = xmm1 + a1 * b1;
3065  xmm2 = xmm2 + a2 * b1;
3066  xmm3 = xmm3 + a3 * b1;
3067  xmm4 = xmm4 + a4 * b1;
3068  xmm5 = xmm5 + a1 * b2;
3069  xmm6 = xmm6 + a2 * b2;
3070  xmm7 = xmm7 + a3 * b2;
3071  xmm8 = xmm8 + a4 * b2;
3072  }
3073 
3074  (~C).store( i , j , xmm1 );
3075  (~C).store( i+IT::size , j , xmm2 );
3076  (~C).store( i+IT::size*2UL, j , xmm3 );
3077  (~C).store( i+IT::size*3UL, j , xmm4 );
3078  (~C).store( i , j+1UL, xmm5 );
3079  (~C).store( i+IT::size , j+1UL, xmm6 );
3080  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
3081  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
3082  }
3083 
3084  if( j < N )
3085  {
3086  const size_t kbegin( ( IsLower<MT5>::value )
3087  ?( ( IsUpper<MT4>::value )
3088  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3089  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3090  :( IsUpper<MT4>::value ? i : 0UL ) );
3091  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
3092 
3093  IntrinsicType xmm1( (~C).load(i ,j) );
3094  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
3095  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
3096  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
3097 
3098  for( size_t k=kbegin; k<kend; ++k ) {
3099  const IntrinsicType b1( set( B(k,j) ) );
3100  xmm1 = xmm1 + A.load(i ,k) * b1;
3101  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3102  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3103  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3104  }
3105 
3106  (~C).store( i , j, xmm1 );
3107  (~C).store( i+IT::size , j, xmm2 );
3108  (~C).store( i+IT::size*2UL, j, xmm3 );
3109  (~C).store( i+IT::size*3UL, j, xmm4 );
3110  }
3111  }
3112 
3113  for( ; (i+IT::size) < M; i+=IT::size*2UL )
3114  {
3115  size_t j( 0UL );
3116 
3117  for( ; (j+2UL) <= N; j+=2UL )
3118  {
3119  const size_t kbegin( ( IsLower<MT5>::value )
3120  ?( ( IsUpper<MT4>::value )
3121  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3122  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3123  :( IsUpper<MT4>::value ? i : 0UL ) );
3124  const size_t kend( ( IsUpper<MT5>::value )
3125  ?( ( IsLower<MT4>::value )
3126  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3127  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3128  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
3129 
3130  IntrinsicType xmm1( (~C).load(i ,j ) );
3131  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
3132  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3133  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
3134 
3135  for( size_t k=kbegin; k<kend; ++k ) {
3136  const IntrinsicType a1( A.load(i ,k) );
3137  const IntrinsicType a2( A.load(i+IT::size,k) );
3138  const IntrinsicType b1( set( B(k,j ) ) );
3139  const IntrinsicType b2( set( B(k,j+1UL) ) );
3140  xmm1 = xmm1 + a1 * b1;
3141  xmm2 = xmm2 + a2 * b1;
3142  xmm3 = xmm3 + a1 * b2;
3143  xmm4 = xmm4 + a2 * b2;
3144  }
3145 
3146  (~C).store( i , j , xmm1 );
3147  (~C).store( i+IT::size, j , xmm2 );
3148  (~C).store( i , j+1UL, xmm3 );
3149  (~C).store( i+IT::size, j+1UL, xmm4 );
3150  }
3151 
3152  if( j < N )
3153  {
3154  const size_t kbegin( ( IsLower<MT5>::value )
3155  ?( ( IsUpper<MT4>::value )
3156  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3157  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3158  :( IsUpper<MT4>::value ? i : 0UL ) );
3159  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
3160 
3161  IntrinsicType xmm1( (~C).load(i ,j) );
3162  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
3163 
3164  for( size_t k=kbegin; k<kend; ++k ) {
3165  const IntrinsicType b1( set( B(k,j) ) );
3166  xmm1 = xmm1 + A.load(i ,k) * b1;
3167  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3168  }
3169 
3170  (~C).store( i , j, xmm1 );
3171  (~C).store( i+IT::size, j, xmm2 );
3172  }
3173  }
3174 
3175  if( i < M )
3176  {
3177  size_t j( 0UL );
3178 
3179  for( ; (j+2UL) <= N; j+=2UL )
3180  {
3181  const size_t kbegin( ( IsLower<MT5>::value )
3182  ?( ( IsUpper<MT4>::value )
3183  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3184  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3185  :( IsUpper<MT4>::value ? i : 0UL ) );
3186  const size_t kend( ( IsUpper<MT5>::value )
3187  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3188  :( K ) );
3189 
3190  IntrinsicType xmm1( (~C).load(i,j ) );
3191  IntrinsicType xmm2( (~C).load(i,j+1UL) );
3192 
3193  for( size_t k=kbegin; k<kend; ++k ) {
3194  const IntrinsicType a1( A.load(i,k) );
3195  xmm1 = xmm1 + a1 * set( B(k,j ) );
3196  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3197  }
3198 
3199  (~C).store( i, j , xmm1 );
3200  (~C).store( i, j+1UL, xmm2 );
3201  }
3202 
3203  if( j < N )
3204  {
3205  const size_t kbegin( ( IsLower<MT5>::value )
3206  ?( ( IsUpper<MT4>::value )
3207  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3208  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3209  :( IsUpper<MT4>::value ? i : 0UL ) );
3210 
3211  IntrinsicType xmm1( (~C).load(i,j) );
3212 
3213  for( size_t k=kbegin; k<K; ++k ) {
3214  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
3215  }
3216 
3217  (~C).store( i, j, xmm1 );
3218  }
3219  }
3220  }
3222  //**********************************************************************************************
3223 
3224  //**Default addition assignment to dense matrices (large matrices)******************************
3238  template< typename MT3 // Type of the left-hand side target matrix
3239  , typename MT4 // Type of the left-hand side matrix operand
3240  , typename MT5 > // Type of the right-hand side matrix operand
3241  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3242  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3243  {
3244  selectDefaultAddAssignKernel( C, A, B );
3245  }
3247  //**********************************************************************************************
3248 
3249  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
      // Blocked, vectorized kernel that adds the product A * B to the row-major target C.
      // Every accumulator is loaded from C, updated inside the k loop and stored back,
      // so the existing contents of C are kept and the products are added on top.
      //
      // Traversal order: column blocks of C (jblock), then row blocks (iblock), then
      // blocks of the inner dimension (kblock). Inside one block, C is processed in
      // column panels that are four, two and finally one IntrinsicType vector(s) wide.
      //
      // C  Target matrix, accessed through (~C).load() and (~C).store().
      // A  Left-hand side operand, read element by element through A(i,k) and set().
      // B  Right-hand side operand, read as vectors through B.load(k,j).
      //
      // NOTE(review): a panel is started as soon as its last vector begins before jend,
      // so that vector may reach past jend. Presumably this relies on padded, suitably
      // aligned storage being guaranteed by UseVectorizedDefaultKernel -- confirm.
3264  template< typename MT3 // Type of the left-hand side target matrix
3265  , typename MT4 // Type of the left-hand side matrix operand
3266  , typename MT5 > // Type of the right-hand side matrix operand
3267  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3268  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3269  {
3270  typedef IntrinsicTrait<ElementType> IT;
3271 
3272  const size_t M( A.rows() );
3273  const size_t N( B.columns() );
3274  const size_t K( A.columns() );
3275 
      // Block sizes for the row (i), column (j) and inner (k) dimension.
3276  const size_t iblock( 64UL );
3277  const size_t jblock( 128UL );
3278  const size_t kblock( 128UL );
3279 
3280  for( size_t jj=0UL; jj<N; jj+=jblock )
3281  {
3282  const size_t jend( min( jj+jblock, N ) );
3283 
3284  for( size_t ii=0UL; ii<M; ii+=iblock )
3285  {
3286  const size_t iend( min( ii+iblock, M ) );
3287 
3288  for( size_t kk=0UL; kk<K; kk+=kblock )
3289  {
3290  const size_t ktmp( min( kk+kblock, K ) );
3291 
3292  size_t j( jj );
3293 
      // Panel 1: four vectors per row (columns j, j1, j2, j3), advancing by IT::size*4.
3294  for( ; (j+IT::size*3UL) < jend; j+=IT::size*4UL )
3295  {
3296  const size_t j1( j+IT::size );
3297  const size_t j2( j+IT::size*2UL );
3298  const size_t j3( j+IT::size*3UL );
3299 
3300  size_t i( ii );
3301 
      // Two rows at a time: 2 x 4 accumulators (xmm1..xmm4 for row i, xmm5..xmm8 for row i+1).
3302  for( ; (i+2UL) <= iend; i+=2UL )
3303  {
      // k range, always clipped to the current k block [kk,ktmp):
      //  - IsUpper<MT4> raises the lower bound to i, IsLower<MT5> raises it to j
      //  - IsLower<MT4> caps the upper bound at i+2 (one past the last row handled here)
      //  - IsUpper<MT5> caps the upper bound at the end of this column panel
      // If kbegin >= kend the k loop is skipped and C is stored back unchanged.
3304  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3305  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3306  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3307  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
3308 
3309  IntrinsicType xmm1( (~C).load(i ,j ) );
3310  IntrinsicType xmm2( (~C).load(i ,j1) );
3311  IntrinsicType xmm3( (~C).load(i ,j2) );
3312  IntrinsicType xmm4( (~C).load(i ,j3) );
3313  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
3314  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
3315  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
3316  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
3317 
      // a1/a2: the scalars A(i,k) and A(i+1,k) passed through set() (presumably a
      // broadcast to all lanes); b1..b4: the four vectors of row k of B in this panel.
3318  for( size_t k=kbegin; k<kend; ++k ) {
3319  const IntrinsicType a1( set( A(i ,k) ) );
3320  const IntrinsicType a2( set( A(i+1UL,k) ) );
3321  const IntrinsicType b1( B.load(k,j ) );
3322  const IntrinsicType b2( B.load(k,j1) );
3323  const IntrinsicType b3( B.load(k,j2) );
3324  const IntrinsicType b4( B.load(k,j3) );
3325  xmm1 = xmm1 + a1 * b1;
3326  xmm2 = xmm2 + a1 * b2;
3327  xmm3 = xmm3 + a1 * b3;
3328  xmm4 = xmm4 + a1 * b4;
3329  xmm5 = xmm5 + a2 * b1;
3330  xmm6 = xmm6 + a2 * b2;
3331  xmm7 = xmm7 + a2 * b3;
3332  xmm8 = xmm8 + a2 * b4;
3333  }
3334 
3335  (~C).store( i , j , xmm1 );
3336  (~C).store( i , j1, xmm2 );
3337  (~C).store( i , j2, xmm3 );
3338  (~C).store( i , j3, xmm4 );
3339  (~C).store( i+1UL, j , xmm5 );
3340  (~C).store( i+1UL, j1, xmm6 );
3341  (~C).store( i+1UL, j2, xmm7 );
3342  (~C).store( i+1UL, j3, xmm8 );
3343  }
3344 
      // Remaining single row of this row block (if any): 1 x 4 accumulators.
3345  if( i < iend )
3346  {
3347  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3348  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3349  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3350  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
3351 
3352  IntrinsicType xmm1( (~C).load(i,j ) );
3353  IntrinsicType xmm2( (~C).load(i,j1) );
3354  IntrinsicType xmm3( (~C).load(i,j2) );
3355  IntrinsicType xmm4( (~C).load(i,j3) );
3356 
3357  for( size_t k=kbegin; k<kend; ++k ) {
3358  const IntrinsicType a1( set( A(i,k) ) );
3359  xmm1 = xmm1 + a1 * B.load(k,j );
3360  xmm2 = xmm2 + a1 * B.load(k,j1);
3361  xmm3 = xmm3 + a1 * B.load(k,j2);
3362  xmm4 = xmm4 + a1 * B.load(k,j3);
3363  }
3364 
3365  (~C).store( i, j , xmm1 );
3366  (~C).store( i, j1, xmm2 );
3367  (~C).store( i, j2, xmm3 );
3368  (~C).store( i, j3, xmm4 );
3369  }
3370  }
3371 
      // Panel 2: two vectors per row (columns j and j1), advancing by IT::size*2.
3372  for( ; (j+IT::size) < jend; j+=IT::size*2UL )
3373  {
3374  const size_t j1( j+IT::size );
3375 
3376  size_t i( ii );
3377 
      // Four rows at a time: 4 x 2 accumulators.
3378  for( ; (i+4UL) <= iend; i+=4UL )
3379  {
3380  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3381  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3382  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3383  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
3384 
3385  IntrinsicType xmm1( (~C).load(i ,j ) );
3386  IntrinsicType xmm2( (~C).load(i ,j1) );
3387  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3388  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3389  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
3390  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
3391  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
3392  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
3393 
3394  for( size_t k=kbegin; k<kend; ++k ) {
3395  const IntrinsicType a1( set( A(i ,k) ) );
3396  const IntrinsicType a2( set( A(i+1UL,k) ) );
3397  const IntrinsicType a3( set( A(i+2UL,k) ) );
3398  const IntrinsicType a4( set( A(i+3UL,k) ) );
3399  const IntrinsicType b1( B.load(k,j ) );
3400  const IntrinsicType b2( B.load(k,j1) );
3401  xmm1 = xmm1 + a1 * b1;
3402  xmm2 = xmm2 + a1 * b2;
3403  xmm3 = xmm3 + a2 * b1;
3404  xmm4 = xmm4 + a2 * b2;
3405  xmm5 = xmm5 + a3 * b1;
3406  xmm6 = xmm6 + a3 * b2;
3407  xmm7 = xmm7 + a4 * b1;
3408  xmm8 = xmm8 + a4 * b2;
3409  }
3410 
3411  (~C).store( i , j , xmm1 );
3412  (~C).store( i , j1, xmm2 );
3413  (~C).store( i+1UL, j , xmm3 );
3414  (~C).store( i+1UL, j1, xmm4 );
3415  (~C).store( i+2UL, j , xmm5 );
3416  (~C).store( i+2UL, j1, xmm6 );
3417  (~C).store( i+3UL, j , xmm7 );
3418  (~C).store( i+3UL, j1, xmm8 );
3419  }
3420 
      // Then two rows at a time: 2 x 2 accumulators.
3421  for( ; (i+2UL) <= iend; i+=2UL )
3422  {
3423  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3424  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3425  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3426  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
3427 
3428  IntrinsicType xmm1( (~C).load(i ,j ) );
3429  IntrinsicType xmm2( (~C).load(i ,j1) );
3430  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3431  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3432 
3433  for( size_t k=kbegin; k<kend; ++k ) {
3434  const IntrinsicType a1( set( A(i ,k) ) );
3435  const IntrinsicType a2( set( A(i+1UL,k) ) );
3436  const IntrinsicType b1( B.load(k,j ) );
3437  const IntrinsicType b2( B.load(k,j1) );
3438  xmm1 = xmm1 + a1 * b1;
3439  xmm2 = xmm2 + a1 * b2;
3440  xmm3 = xmm3 + a2 * b1;
3441  xmm4 = xmm4 + a2 * b2;
3442  }
3443 
3444  (~C).store( i , j , xmm1 );
3445  (~C).store( i , j1, xmm2 );
3446  (~C).store( i+1UL, j , xmm3 );
3447  (~C).store( i+1UL, j1, xmm4 );
3448  }
3449 
      // Remaining single row of this row block (if any): 1 x 2 accumulators.
3450  if( i < iend )
3451  {
3452  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3453  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3454  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3455  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
3456 
3457  IntrinsicType xmm1( (~C).load(i,j ) );
3458  IntrinsicType xmm2( (~C).load(i,j1) );
3459 
3460  for( size_t k=kbegin; k<kend; ++k ) {
3461  const IntrinsicType a1( set( A(i,k) ) );
3462  xmm1 = xmm1 + a1 * B.load(k,j );
3463  xmm2 = xmm2 + a1 * B.load(k,j1);
3464  }
3465 
3466  (~C).store( i, j , xmm1 );
3467  (~C).store( i, j1, xmm2 );
3468  }
3469  }
3470 
      // Panel 3: one last vector column at j, handled row by row with a single accumulator.
3471  if( j < jend )
3472  {
3473  for( size_t i=ii; i<iend; ++i )
3474  {
3475  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3476  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3477  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3478  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
3479 
3480  IntrinsicType xmm1( (~C).load(i,j) );
3481 
3482  for( size_t k=kbegin; k<kend; ++k ) {
3483  const IntrinsicType a1( set( A(i,k) ) );
3484  xmm1 = xmm1 + a1 * B.load(k,j);
3485  }
3486 
3487  (~C).store( i, j, xmm1 );
3488  }
3489  }
3490  }
3491  }
3492  }
3493  }
3495  //**********************************************************************************************
3496 
3497  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
      // Blocked, vectorized kernel that adds the product A * B to the column-major target C.
      // Every accumulator is loaded from C, updated inside the k loop and stored back,
      // so the existing contents of C are kept and the products are added on top.
      //
      // Traversal order: row blocks of C (iblock), then column blocks (jblock), then
      // blocks of the inner dimension (kblock). Inside one block, C is processed in
      // row panels that are four, two and finally one IntrinsicType vector(s) tall.
      //
      // C  Target matrix, accessed through (~C).load() and (~C).store().
      // A  Left-hand side operand, read as vectors through A.load(i,k).
      // B  Right-hand side operand, read element by element through B(k,j) and set().
      //
      // NOTE(review): a panel is started as soon as its last vector begins before iend,
      // so that vector may reach past iend. Presumably this relies on padded, suitably
      // aligned storage being guaranteed by UseVectorizedDefaultKernel -- confirm.
3512  template< typename MT3 // Type of the left-hand side target matrix
3513  , typename MT4 // Type of the left-hand side matrix operand
3514  , typename MT5 > // Type of the right-hand side matrix operand
3515  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3516  selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3517  {
3518  typedef IntrinsicTrait<ElementType> IT;
3519 
3520  const size_t M( A.rows() );
3521  const size_t N( B.columns() );
3522  const size_t K( A.columns() );
3523 
      // Block sizes for the row (i), column (j) and inner (k) dimension.
3524  const size_t iblock( 128UL );
3525  const size_t jblock( 64UL );
3526  const size_t kblock( 128UL );
3527 
3528  for( size_t ii=0UL; ii<M; ii+=iblock )
3529  {
3530  const size_t iend( min( ii+iblock, M ) );
3531 
3532  for( size_t jj=0UL; jj<N; jj+=jblock )
3533  {
3534  const size_t jend( min( jj+jblock, N ) );
3535 
3536  for( size_t kk=0UL; kk<K; kk+=kblock )
3537  {
3538  const size_t ktmp( min( kk+kblock, K ) );
3539 
3540  size_t i( ii );
3541 
      // Panel 1: four vectors per column (rows i, i1, i2, i3), advancing by IT::size*4.
3542  for( ; (i+IT::size*3UL) < iend; i+=IT::size*4UL )
3543  {
3544  const size_t i1( i+IT::size );
3545  const size_t i2( i+IT::size*2UL );
3546  const size_t i3( i+IT::size*3UL );
3547 
3548  size_t j( jj );
3549 
      // Two columns at a time: 4 x 2 accumulators (xmm1..xmm4 for column j, xmm5..xmm8 for column j+1).
3550  for( ; (j+2UL) <= jend; j+=2UL )
3551  {
      // k range, always clipped to the current k block [kk,ktmp):
      //  - IsUpper<MT4> raises the lower bound to i, IsLower<MT5> raises it to j
      //  - IsLower<MT4> caps the upper bound at the end of this row panel
      //  - IsUpper<MT5> caps the upper bound at j+2 (one past the last column handled here)
      // If kbegin >= kend the k loop is skipped and C is stored back unchanged.
3552  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3553  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3554  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
3555  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3556 
3557  IntrinsicType xmm1( (~C).load(i ,j ) );
3558  IntrinsicType xmm2( (~C).load(i1,j ) );
3559  IntrinsicType xmm3( (~C).load(i2,j ) );
3560  IntrinsicType xmm4( (~C).load(i3,j ) );
3561  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3562  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
3563  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
3564  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
3565 
      // a1..a4: the four vectors of column k of A in this panel; b1/b2: the scalars
      // B(k,j) and B(k,j+1) passed through set() (presumably a broadcast to all lanes).
3566  for( size_t k=kbegin; k<kend; ++k ) {
3567  const IntrinsicType a1( A.load(i ,k) );
3568  const IntrinsicType a2( A.load(i1,k) );
3569  const IntrinsicType a3( A.load(i2,k) );
3570  const IntrinsicType a4( A.load(i3,k) );
3571  const IntrinsicType b1( set( B(k,j ) ) );
3572  const IntrinsicType b2( set( B(k,j+1UL) ) );
3573  xmm1 = xmm1 + a1 * b1;
3574  xmm2 = xmm2 + a2 * b1;
3575  xmm3 = xmm3 + a3 * b1;
3576  xmm4 = xmm4 + a4 * b1;
3577  xmm5 = xmm5 + a1 * b2;
3578  xmm6 = xmm6 + a2 * b2;
3579  xmm7 = xmm7 + a3 * b2;
3580  xmm8 = xmm8 + a4 * b2;
3581  }
3582 
3583  (~C).store( i , j , xmm1 );
3584  (~C).store( i1, j , xmm2 );
3585  (~C).store( i2, j , xmm3 );
3586  (~C).store( i3, j , xmm4 );
3587  (~C).store( i , j+1UL, xmm5 );
3588  (~C).store( i1, j+1UL, xmm6 );
3589  (~C).store( i2, j+1UL, xmm7 );
3590  (~C).store( i3, j+1UL, xmm8 );
3591  }
3592 
      // Remaining single column of this column block (if any): 4 x 1 accumulators.
3593  if( j < jend )
3594  {
3595  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3596  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3597  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
3598  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3599 
3600  IntrinsicType xmm1( (~C).load(i ,j) );
3601  IntrinsicType xmm2( (~C).load(i1,j) );
3602  IntrinsicType xmm3( (~C).load(i2,j) );
3603  IntrinsicType xmm4( (~C).load(i3,j) );
3604 
3605  for( size_t k=kbegin; k<kend; ++k ) {
3606  const IntrinsicType b1( set( B(k,j) ) );
3607  xmm1 = xmm1 + A.load(i ,k) * b1;
3608  xmm2 = xmm2 + A.load(i1,k) * b1;
3609  xmm3 = xmm3 + A.load(i2,k) * b1;
3610  xmm4 = xmm4 + A.load(i3,k) * b1;
3611  }
3612 
3613  (~C).store( i , j, xmm1 );
3614  (~C).store( i1, j, xmm2 );
3615  (~C).store( i2, j, xmm3 );
3616  (~C).store( i3, j, xmm4 );
3617  }
3618  }
3619 
      // Panel 2: two vectors per column (rows i and i1), advancing by IT::size*2.
3620  for( ; (i+IT::size) < iend; i+=IT::size*2UL )
3621  {
3622  const size_t i1( i+IT::size );
3623 
3624  size_t j( jj );
3625 
      // Four columns at a time: 2 x 4 accumulators.
3626  for( ; (j+4UL) <= jend; j+=4UL )
3627  {
3628  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3629  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3630  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
3631  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3632 
3633  IntrinsicType xmm1( (~C).load(i ,j ) );
3634  IntrinsicType xmm2( (~C).load(i1,j ) );
3635  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3636  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3637  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
3638  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
3639  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
3640  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
3641 
3642  for( size_t k=kbegin; k<kend; ++k ) {
3643  const IntrinsicType a1( A.load(i ,k) );
3644  const IntrinsicType a2( A.load(i1,k) );
3645  const IntrinsicType b1( set( B(k,j ) ) );
3646  const IntrinsicType b2( set( B(k,j+1UL) ) );
3647  const IntrinsicType b3( set( B(k,j+2UL) ) );
3648  const IntrinsicType b4( set( B(k,j+3UL) ) );
3649  xmm1 = xmm1 + a1 * b1;
3650  xmm2 = xmm2 + a2 * b1;
3651  xmm3 = xmm3 + a1 * b2;
3652  xmm4 = xmm4 + a2 * b2;
3653  xmm5 = xmm5 + a1 * b3;
3654  xmm6 = xmm6 + a2 * b3;
3655  xmm7 = xmm7 + a1 * b4;
3656  xmm8 = xmm8 + a2 * b4;
3657  }
3658 
3659  (~C).store( i , j , xmm1 );
3660  (~C).store( i1, j , xmm2 );
3661  (~C).store( i , j+1UL, xmm3 );
3662  (~C).store( i1, j+1UL, xmm4 );
3663  (~C).store( i , j+2UL, xmm5 );
3664  (~C).store( i1, j+2UL, xmm6 );
3665  (~C).store( i , j+3UL, xmm7 );
3666  (~C).store( i1, j+3UL, xmm8 );
3667  }
3668 
      // Then two columns at a time: 2 x 2 accumulators.
3669  for( ; (j+2UL) <= jend; j+=2UL )
3670  {
3671  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3672  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3673  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
3674  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3675 
3676  IntrinsicType xmm1( (~C).load(i ,j ) );
3677  IntrinsicType xmm2( (~C).load(i1,j ) );
3678  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3679  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3680 
3681  for( size_t k=kbegin; k<kend; ++k ) {
3682  const IntrinsicType a1( A.load(i ,k) );
3683  const IntrinsicType a2( A.load(i1,k) );
3684  const IntrinsicType b1( set( B(k,j ) ) );
3685  const IntrinsicType b2( set( B(k,j+1UL) ) );
3686  xmm1 = xmm1 + a1 * b1;
3687  xmm2 = xmm2 + a2 * b1;
3688  xmm3 = xmm3 + a1 * b2;
3689  xmm4 = xmm4 + a2 * b2;
3690  }
3691 
3692  (~C).store( i , j , xmm1 );
3693  (~C).store( i1, j , xmm2 );
3694  (~C).store( i , j+1UL, xmm3 );
3695  (~C).store( i1, j+1UL, xmm4 );
3696  }
3697 
      // Remaining single column of this column block (if any): 2 x 1 accumulators.
3698  if( j < jend )
3699  {
3700  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3701  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3702  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
3703  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3704 
3705  IntrinsicType xmm1( (~C).load(i ,j) );
3706  IntrinsicType xmm2( (~C).load(i1,j) );
3707 
3708  for( size_t k=kbegin; k<kend; ++k ) {
3709  const IntrinsicType b1( set( B(k,j) ) );
3710  xmm1 = xmm1 + A.load(i ,k) * b1;
3711  xmm2 = xmm2 + A.load(i1,k) * b1;
3712  }
3713 
3714  (~C).store( i , j, xmm1 );
3715  (~C).store( i1, j, xmm2 );
3716  }
3717  }
3718 
      // Panel 3: one last vector row at i, handled column by column with a single accumulator.
3719  if( i < iend )
3720  {
3721  for( size_t j=jj; j<jend; ++j )
3722  {
3723  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
3724  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
3725  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
3726  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3727 
3728  IntrinsicType xmm1( (~C).load(i,j) );
3729 
3730  for( size_t k=kbegin; k<kend; ++k ) {
3731  const IntrinsicType b1( set( B(k,j) ) );
3732  xmm1 = xmm1 + A.load(i,k) * b1;
3733  }
3734 
3735  (~C).store( i, j, xmm1 );
3736  }
3737  }
3738  }
3739  }
3740  }
3741  }
3743  //**********************************************************************************************
3744 
3745  //**BLAS-based addition assignment to dense matrices (default)**********************************
3759  template< typename MT3 // Type of the left-hand side target matrix
3760  , typename MT4 // Type of the left-hand side matrix operand
3761  , typename MT5 > // Type of the right-hand side matrix operand
3762  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
3763  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3764  {
3765  selectLargeAddAssignKernel( C, A, B );
3766  }
3768  //**********************************************************************************************
3769 
3770  //**BLAS-based addition assignment to dense matrices (single precision)*************************
#if BLAZE_BLAS_MODE

// Single precision BLAS kernel for the addition assignment of A * B to C.
//
// The path is chosen from the compile time traits of the operands:
//  - neither operand is triangular: a single sgemm() call on C, A and B with both
//    scalar arguments set to 1.0F
//  - A is triangular: B is copied into a temporary of type MT3::ResultType,
//    strmm() is applied to that temporary with A as the CblasLeft operand, and
//    the temporary is then added to C through addAssign()
//  - otherwise (B is triangular): the same scheme with a copy of A and B as the
//    CblasRight operand
// The CblasLower/CblasUpper flag follows IsLower<> of the triangular operand.
//
// C  Target matrix of the addition assignment.
// A  Left-hand side matrix operand of the multiplication.
// B  Right-hand side matrix operand of the multiplication.
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
{
   typedef typename MT3::ResultType  TmpType;

   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value )
   {
      sgemm( C, A, B, 1.0F, 1.0F );
   }
   else if( IsTriangular<MT4>::value )
   {
      TmpType product( B );
      strmm( product, A, CblasLeft, ( IsLower<MT4>::value ? CblasLower : CblasUpper ), 1.0F );
      addAssign( C, product );
   }
   else
   {
      TmpType product( A );
      strmm( product, B, CblasRight, ( IsLower<MT5>::value ? CblasLower : CblasUpper ), 1.0F );
      addAssign( C, product );
   }
}
#endif
3807  //**********************************************************************************************
3808 
3809  //**BLAS-based addition assignment to dense matrices (double precision)*************************
#if BLAZE_BLAS_MODE

// Double precision BLAS kernel for the addition assignment of A * B to C.
//
// The path is chosen from the compile time traits of the operands:
//  - neither operand is triangular: a single dgemm() call on C, A and B with both
//    scalar arguments set to 1.0
//  - A is triangular: B is copied into a temporary of type MT3::ResultType,
//    dtrmm() is applied to that temporary with A as the CblasLeft operand, and
//    the temporary is then added to C through addAssign()
//  - otherwise (B is triangular): the same scheme with a copy of A and B as the
//    CblasRight operand
// The CblasLower/CblasUpper flag follows IsLower<> of the triangular operand.
//
// C  Target matrix of the addition assignment.
// A  Left-hand side matrix operand of the multiplication.
// B  Right-hand side matrix operand of the multiplication.
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
{
   typedef typename MT3::ResultType  TmpType;

   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value )
   {
      dgemm( C, A, B, 1.0, 1.0 );
   }
   else if( IsTriangular<MT4>::value )
   {
      TmpType product( B );
      dtrmm( product, A, CblasLeft, ( IsLower<MT4>::value ? CblasLower : CblasUpper ), 1.0 );
      addAssign( C, product );
   }
   else
   {
      TmpType product( A );
      dtrmm( product, B, CblasRight, ( IsLower<MT5>::value ? CblasLower : CblasUpper ), 1.0 );
      addAssign( C, product );
   }
}
#endif
3846  //**********************************************************************************************
3847 
3848  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
#if BLAZE_BLAS_MODE

// Single precision complex BLAS kernel for the addition assignment of A * B to C.
//
// The path is chosen from the compile time traits of the operands:
//  - neither operand is triangular: a single cgemm() call on C, A and B with both
//    scalar arguments set to complex<float>( 1.0F, 0.0F )
//  - A is triangular: B is copied into a temporary of type MT3::ResultType,
//    ctrmm() is applied to that temporary with A as the CblasLeft operand, and
//    the temporary is then added to C through addAssign()
//  - otherwise (B is triangular): the same scheme with a copy of A and B as the
//    CblasRight operand
// The CblasLower/CblasUpper flag follows IsLower<> of the triangular operand.
//
// C  Target matrix of the addition assignment.
// A  Left-hand side matrix operand of the multiplication.
// B  Right-hand side matrix operand of the multiplication.
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
{
   typedef typename MT3::ResultType  TmpType;

   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value )
   {
      cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
   }
   else if( IsTriangular<MT4>::value )
   {
      TmpType product( B );
      ctrmm( product, A, CblasLeft,
             ( IsLower<MT4>::value ? CblasLower : CblasUpper ),
             complex<float>( 1.0F, 0.0F ) );
      addAssign( C, product );
   }
   else
   {
      TmpType product( A );
      ctrmm( product, B, CblasRight,
             ( IsLower<MT5>::value ? CblasLower : CblasUpper ),
             complex<float>( 1.0F, 0.0F ) );
      addAssign( C, product );
   }
}
#endif
3889  //**********************************************************************************************
3890 
3891  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
#if BLAZE_BLAS_MODE

// Double precision complex BLAS kernel for the addition assignment of A * B to C.
//
// The path is chosen from the compile time traits of the operands:
//  - neither operand is triangular: a single zgemm() call on C, A and B with both
//    scalar arguments set to complex<double>( 1.0, 0.0 )
//  - A is triangular: B is copied into a temporary of type MT3::ResultType,
//    ztrmm() is applied to that temporary with A as the CblasLeft operand, and
//    the temporary is then added to C through addAssign()
//  - otherwise (B is triangular): the same scheme with a copy of A and B as the
//    CblasRight operand
// The CblasLower/CblasUpper flag follows IsLower<> of the triangular operand.
//
// C  Target matrix of the addition assignment.
// A  Left-hand side matrix operand of the multiplication.
// B  Right-hand side matrix operand of the multiplication.
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
{
   typedef typename MT3::ResultType  TmpType;

   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value )
   {
      zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
   }
   else if( IsTriangular<MT4>::value )
   {
      TmpType product( B );
      ztrmm( product, A, CblasLeft,
             ( IsLower<MT4>::value ? CblasLower : CblasUpper ),
             complex<double>( 1.0, 0.0 ) );
      addAssign( C, product );
   }
   else
   {
      TmpType product( A );
      ztrmm( product, B, CblasRight,
             ( IsLower<MT5>::value ? CblasLower : CblasUpper ),
             complex<double>( 1.0, 0.0 ) );
      addAssign( C, product );
   }
}
#endif
3932  //**********************************************************************************************
3933 
3934  //**Addition assignment to sparse matrices******************************************************
3935  // No special implementation for the addition assignment to sparse matrices.
3936  //**********************************************************************************************
3937 
3938  //**Subtraction assignment to dense matrices****************************************************
      // Subtraction assignment of a transpose dense matrix / dense matrix multiplication
      // expression (rhs) to the dense matrix lhs.
      //
      // Steps: return immediately if the target or the inner dimension is empty,
      // evaluate both operands of the expression via serial() into objects of type
      // LT and RT (typedefs declared outside this section), check that all sizes agree,
      // and hand the work to selectSubAssignKernel().
      //
      // lhs  Target dense matrix; modified through (~lhs).
      // rhs  Multiplication expression whose operands are rhs.lhs_ and rhs.rhs_.
3951  template< typename MT // Type of the target dense matrix
3952  , bool SO > // Storage order of the target dense matrix
3953  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
3954  {
      // NOTE(review): the listing numbering jumps from 3954 to 3956 here, so one source
      // line is not visible in this view -- check the original file before editing.
3956 
3957  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3958  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3959 
      // Nothing to subtract if the target has no elements or the inner dimension is zero.
3960  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3961  return;
3962  }
3963 
3964  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3965  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3966 
      // The evaluated operands must have the sizes of the original operands and fit the target.
3967  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3968  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3969  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3970  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3971  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3972  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3973 
      // Qualified with the class name because this friend function is not a member.
3974  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3975  }
3977  //**********************************************************************************************
3978 
3979  //**Subtraction assignment to dense matrices (kernel selection)*********************************
3990  template< typename MT3 // Type of the left-hand side target matrix
3991  , typename MT4 // Type of the left-hand side matrix operand
3992  , typename MT5 > // Type of the right-hand side matrix operand
3993  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3994  {
3995  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
3996  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
3997  selectSmallSubAssignKernel( C, A, B );
3998  else
3999  selectBlasSubAssignKernel( C, A, B );
4000  }
4002  //**********************************************************************************************
4003 
4004  //**Default subtraction assignment to row-major dense matrices (general/general)****************
4018  template< typename MT3 // Type of the left-hand side target matrix
4019  , typename MT4 // Type of the left-hand side matrix operand
4020  , typename MT5 > // Type of the right-hand side matrix operand
4021  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4022  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4023  {
4024  const size_t M( A.rows() );
4025  const size_t N( B.columns() );
4026  const size_t K( A.columns() );
4027 
4028  for( size_t i=0UL; i<M; ++i )
4029  {
4030  const size_t kbegin( ( IsUpper<MT4>::value )
4031  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4032  :( 0UL ) );
4033  const size_t kend( ( IsLower<MT4>::value )
4034  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4035  :( K ) );
4036  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4037 
4038  for( size_t k=kbegin; k<kend; ++k )
4039  {
4040  const size_t jbegin( ( IsUpper<MT5>::value )
4041  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4042  :( 0UL ) );
4043  const size_t jend( ( IsLower<MT5>::value )
4044  ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
4045  :( N ) );
4046  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4047 
4048  const size_t jnum( jend - jbegin );
4049  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4050 
4051  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4052  (~C)(i,j ) -= A(i,k) * B(k,j );
4053  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
4054  }
4055  if( jpos < jend ) {
4056  (~C)(i,jpos) -= A(i,k) * B(k,jpos);
4057  }
4058  }
4059  }
4060  }
4062  //**********************************************************************************************
4063 
4064  //**Default subtraction assignment to column-major dense matrices (general/general)*************
4078  template< typename MT3 // Type of the left-hand side target matrix
4079  , typename MT4 // Type of the left-hand side matrix operand
4080  , typename MT5 > // Type of the right-hand side matrix operand
4081  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4082  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4083  {
4084  const size_t M( A.rows() );
4085  const size_t N( B.columns() );
4086  const size_t K( A.columns() );
4087 
4088  for( size_t j=0UL; j<N; ++j )
4089  {
4090  const size_t kbegin( ( IsLower<MT5>::value )
4091  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4092  :( 0UL ) );
4093  const size_t kend( ( IsUpper<MT5>::value )
4094  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4095  :( K ) );
4096  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4097 
4098  for( size_t k=kbegin; k<kend; ++k )
4099  {
4100  const size_t ibegin( ( IsLower<MT4>::value )
4101  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4102  :( 0UL ) );
4103  const size_t iend( ( IsUpper<MT4>::value )
4104  ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
4105  :( M ) );
4106  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4107 
4108  const size_t inum( iend - ibegin );
4109  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4110 
4111  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4112  (~C)(i ,j) -= A(i ,k) * B(k,j);
4113  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
4114  }
4115  if( ipos < iend ) {
4116  (~C)(ipos,j) -= A(ipos,k) * B(k,j);
4117  }
4118  }
4119  }
4120  }
4122  //**********************************************************************************************
4123 
4124  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
4138  template< typename MT3 // Type of the left-hand side target matrix
4139  , typename MT4 // Type of the left-hand side matrix operand
4140  , typename MT5 > // Type of the right-hand side matrix operand
4141  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4142  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4143  {
4144  const size_t M( A.rows() );
4145  const size_t N( B.columns() );
4146 
4147  const size_t block( 16UL );
4148 
4149  for( size_t ii=0UL; ii<M; ii+=block ) {
4150  const size_t iend( min( M, ii+block ) );
4151  for( size_t jj=0UL; jj<N; jj+=block ) {
4152  const size_t jend( min( N, jj+block ) );
4153  for( size_t i=ii; i<iend; ++i )
4154  {
4155  const size_t jbegin( ( IsUpper<MT4>::value )
4156  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
4157  :( jj ) );
4158  const size_t jpos( ( IsLower<MT4>::value )
4159  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
4160  :( jend ) );
4161 
4162  for( size_t j=jbegin; j<jpos; ++j ) {
4163  (~C)(i,j) -= A(i,j) * B(j,j);
4164  }
4165  }
4166  }
4167  }
4168  }
4170  //**********************************************************************************************
4171 
4172  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
4186  template< typename MT3 // Type of the left-hand side target matrix
4187  , typename MT4 // Type of the left-hand side matrix operand
4188  , typename MT5 > // Type of the right-hand side matrix operand
4189  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4190  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4191  {
4192  const size_t M( A.rows() );
4193  const size_t N( B.columns() );
4194 
4195  for( size_t j=0UL; j<N; ++j )
4196  {
4197  const size_t ibegin( ( IsLower<MT4>::value )
4198  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4199  :( 0UL ) );
4200  const size_t iend( ( IsUpper<MT4>::value )
4201  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
4202  :( M ) );
4203  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4204 
4205  const size_t inum( iend - ibegin );
4206  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4207 
4208  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4209  (~C)(i ,j) -= A(i ,j) * B(j,j);
4210  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
4211  }
4212  if( ipos < iend ) {
4213  (~C)(ipos,j) -= A(ipos,j) * B(j,j);
4214  }
4215  }
4216  }
4218  //**********************************************************************************************
4219 
4220  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
4234  template< typename MT3 // Type of the left-hand side target matrix
4235  , typename MT4 // Type of the left-hand side matrix operand
4236  , typename MT5 > // Type of the right-hand side matrix operand
4237  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4238  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4239  {
4240  const size_t M( A.rows() );
4241  const size_t N( B.columns() );
4242 
4243  for( size_t i=0UL; i<M; ++i )
4244  {
4245  const size_t jbegin( ( IsUpper<MT5>::value )
4246  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4247  :( 0UL ) );
4248  const size_t jend( ( IsLower<MT5>::value )
4249  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
4250  :( N ) );
4251  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4252 
4253  const size_t jnum( jend - jbegin );
4254  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4255 
4256  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4257  (~C)(i,j ) -= A(i,i) * B(i,j );
4258  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
4259  }
4260  if( jpos < jend ) {
4261  (~C)(i,jpos) -= A(i,i) * B(i,jpos);
4262  }
4263  }
4264  }
4266  //**********************************************************************************************
4267 
4268  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
4282  template< typename MT3 // Type of the left-hand side target matrix
4283  , typename MT4 // Type of the left-hand side matrix operand
4284  , typename MT5 > // Type of the right-hand side matrix operand
4285  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4286  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4287  {
4288  const size_t M( A.rows() );
4289  const size_t N( B.columns() );
4290 
4291  const size_t block( 16UL );
4292 
4293  for( size_t jj=0UL; jj<N; jj+=block ) {
4294  const size_t jend( min( N, jj+block ) );
4295  for( size_t ii=0UL; ii<M; ii+=block ) {
4296  const size_t iend( min( M, ii+block ) );
4297  for( size_t j=jj; j<jend; ++j )
4298  {
4299  const size_t ibegin( ( IsLower<MT5>::value )
4300  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
4301  :( ii ) );
4302  const size_t ipos( ( IsUpper<MT5>::value )
4303  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
4304  :( iend ) );
4305 
4306  for( size_t i=ibegin; i<ipos; ++i ) {
4307  (~C)(i,j) -= A(i,i) * B(i,j);
4308  }
4309  }
4310  }
4311  }
4312  }
4314  //**********************************************************************************************
4315 
4316  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
4330  template< typename MT3 // Type of the left-hand side target matrix
4331  , typename MT4 // Type of the left-hand side matrix operand
4332  , typename MT5 > // Type of the right-hand side matrix operand
4333  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4334  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4335  {
4336  for( size_t i=0UL; i<A.rows(); ++i ) {
4337  C(i,i) -= A(i,i) * B(i,i);
4338  }
4339  }
4341  //**********************************************************************************************
4342 
4343  //**Default subtraction assignment to dense matrices (small matrices)***************************
4357  template< typename MT3 // Type of the left-hand side target matrix
4358  , typename MT4 // Type of the left-hand side matrix operand
4359  , typename MT5 > // Type of the right-hand side matrix operand
4360  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4361  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4362  {
4363  selectDefaultSubAssignKernel( C, A, B );
4364  }
4366  //**********************************************************************************************
4367 
4368  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
4383  template< typename MT3 // Type of the left-hand side target matrix
4384  , typename MT4 // Type of the left-hand side matrix operand
4385  , typename MT5 > // Type of the right-hand side matrix operand
4386  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4387  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4388  {
4389  typedef IntrinsicTrait<ElementType> IT;
4390 
4391  const size_t M( A.rows() );
4392  const size_t N( B.columns() );
4393  const size_t K( A.columns() );
4394 
4395  size_t j( 0UL );
4396 
4397  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
4398  for( size_t i=0UL; i<M; ++i )
4399  {
4400  const size_t kbegin( ( IsUpper<MT4>::value )
4401  ?( ( IsLower<MT5>::value )
4402  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4403  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4404  :( IsLower<MT5>::value ? j : 0UL ) );
4405  const size_t kend( ( IsLower<MT4>::value )
4406  ?( ( IsUpper<MT5>::value )
4407  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
4408  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4409  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
4410 
4411  IntrinsicType xmm1( (~C).load(i,j ) );
4412  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
4413  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
4414  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
4415  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
4416  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
4417  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
4418  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
4419 
4420  for( size_t k=kbegin; k<kend; ++k ) {
4421  const IntrinsicType a1( set( A(i,k) ) );
4422  xmm1 = xmm1 - a1 * B.load(k,j );
4423  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
4424  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
4425  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
4426  xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
4427  xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
4428  xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
4429  xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
4430  }
4431 
4432  (~C).store( i, j , xmm1 );
4433  (~C).store( i, j+IT::size , xmm2 );
4434  (~C).store( i, j+IT::size*2UL, xmm3 );
4435  (~C).store( i, j+IT::size*3UL, xmm4 );
4436  (~C).store( i, j+IT::size*4UL, xmm5 );
4437  (~C).store( i, j+IT::size*5UL, xmm6 );
4438  (~C).store( i, j+IT::size*6UL, xmm7 );
4439  (~C).store( i, j+IT::size*7UL, xmm8 );
4440  }
4441  }
4442 
4443  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL )
4444  {
4445  size_t i( 0UL );
4446 
4447  for( ; (i+2UL) <= M; i+=2UL )
4448  {
4449  const size_t kbegin( ( IsUpper<MT4>::value )
4450  ?( ( IsLower<MT5>::value )
4451  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4452  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4453  :( IsLower<MT5>::value ? j : 0UL ) );
4454  const size_t kend( ( IsLower<MT4>::value )
4455  ?( ( IsUpper<MT5>::value )
4456  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
4457  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4458  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
4459 
4460  IntrinsicType xmm1( (~C).load(i ,j ) );
4461  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
4462  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
4463  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
4464  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
4465  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
4466  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
4467  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
4468 
4469  for( size_t k=kbegin; k<kend; ++k ) {
4470  const IntrinsicType a1( set( A(i ,k) ) );
4471  const IntrinsicType a2( set( A(i+1UL,k) ) );
4472  const IntrinsicType b1( B.load(k,j ) );
4473  const IntrinsicType b2( B.load(k,j+IT::size ) );
4474  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
4475  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
4476  xmm1 = xmm1 - a1 * b1;
4477  xmm2 = xmm2 - a1 * b2;
4478  xmm3 = xmm3 - a1 * b3;
4479  xmm4 = xmm4 - a1 * b4;
4480  xmm5 = xmm5 - a2 * b1;
4481  xmm6 = xmm6 - a2 * b2;
4482  xmm7 = xmm7 - a2 * b3;
4483  xmm8 = xmm8 - a2 * b4;
4484  }
4485 
4486  (~C).store( i , j , xmm1 );
4487  (~C).store( i , j+IT::size , xmm2 );
4488  (~C).store( i , j+IT::size*2UL, xmm3 );
4489  (~C).store( i , j+IT::size*3UL, xmm4 );
4490  (~C).store( i+1UL, j , xmm5 );
4491  (~C).store( i+1UL, j+IT::size , xmm6 );
4492  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
4493  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
4494  }
4495 
4496  if( i < M )
4497  {
4498  const size_t kbegin( ( IsUpper<MT4>::value )
4499  ?( ( IsLower<MT5>::value )
4500  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4501  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4502  :( IsLower<MT5>::value ? j : 0UL ) );
4503  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
4504 
4505  IntrinsicType xmm1( (~C).load(i,j ) );
4506  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
4507  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
4508  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
4509 
4510  for( size_t k=kbegin; k<kend; ++k ) {
4511  const IntrinsicType a1( set( A(i,k) ) );
4512  xmm1 = xmm1 - a1 * B.load(k,j );
4513  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
4514  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
4515  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
4516  }
4517 
4518  (~C).store( i, j , xmm1 );
4519  (~C).store( i, j+IT::size , xmm2 );
4520  (~C).store( i, j+IT::size*2UL, xmm3 );
4521  (~C).store( i, j+IT::size*3UL, xmm4 );
4522  }
4523  }
4524 
4525  for( ; (j+IT::size) < N; j+=IT::size*2UL )
4526  {
4527  size_t i( 0UL );
4528 
4529  for( ; (i+2UL) <= M; i+=2UL )
4530  {
4531  const size_t kbegin( ( IsUpper<MT4>::value )
4532  ?( ( IsLower<MT5>::value )
4533  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4534  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4535  :( IsLower<MT5>::value ? j : 0UL ) );
4536  const size_t kend( ( IsLower<MT4>::value )
4537  ?( ( IsUpper<MT5>::value )
4538  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
4539  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4540  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
4541 
4542  IntrinsicType xmm1( (~C).load(i ,j ) );
4543  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
4544  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
4545  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
4546 
4547  for( size_t k=kbegin; k<kend; ++k ) {
4548  const IntrinsicType a1( set( A(i ,k) ) );
4549  const IntrinsicType a2( set( A(i+1UL,k) ) );
4550  const IntrinsicType b1( B.load(k,j ) );
4551  const IntrinsicType b2( B.load(k,j+IT::size) );
4552  xmm1 = xmm1 - a1 * b1;
4553  xmm2 = xmm2 - a1 * b2;
4554  xmm3 = xmm3 - a2 * b1;
4555  xmm4 = xmm4 - a2 * b2;
4556  }
4557 
4558  (~C).store( i , j , xmm1 );
4559  (~C).store( i , j+IT::size, xmm2 );
4560  (~C).store( i+1UL, j , xmm3 );
4561  (~C).store( i+1UL, j+IT::size, xmm4 );
4562  }
4563 
4564  if( i < M )
4565  {
4566  const size_t kbegin( ( IsUpper<MT4>::value )
4567  ?( ( IsLower<MT5>::value )
4568  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4569  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4570  :( IsLower<MT5>::value ? j : 0UL ) );
4571  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
4572 
4573  IntrinsicType xmm1( (~C).load(i,j ) );
4574  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
4575 
4576  for( size_t k=kbegin; k<kend; ++k ) {
4577  const IntrinsicType a1( set( A(i,k) ) );
4578  xmm1 = xmm1 - a1 * B.load(k,j );
4579  xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
4580  }
4581 
4582  (~C).store( i, j , xmm1 );
4583  (~C).store( i, j+IT::size, xmm2 );
4584  }
4585  }
4586 
4587  if( j < N )
4588  {
4589  size_t i( 0UL );
4590 
4591  for( ; (i+2UL) <= M; i+=2UL )
4592  {
4593  const size_t kbegin( ( IsUpper<MT4>::value )
4594  ?( ( IsLower<MT5>::value )
4595  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4596  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4597  :( IsLower<MT5>::value ? j : 0UL ) );
4598  const size_t kend( ( IsLower<MT4>::value )
4599  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4600  :( K ) );
4601 
4602  IntrinsicType xmm1( (~C).load(i ,j) );
4603  IntrinsicType xmm2( (~C).load(i+1UL,j) );
4604 
4605  for( size_t k=kbegin; k<kend; ++k ) {
4606  const IntrinsicType b1( B.load(k,j) );
4607  xmm1 = xmm1 - set( A(i ,k) ) * b1;
4608  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
4609  }
4610 
4611  (~C).store( i , j, xmm1 );
4612  (~C).store( i+1UL, j, xmm2 );
4613  }
4614 
4615  if( i < M )
4616  {
4617  const size_t kbegin( ( IsUpper<MT4>::value )
4618  ?( ( IsLower<MT5>::value )
4619  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4620  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4621  :( IsLower<MT5>::value ? j : 0UL ) );
4622 
4623  IntrinsicType xmm1( (~C).load(i,j) );
4624 
4625  for( size_t k=kbegin; k<K; ++k ) {
4626  xmm1 = xmm1 - set( A(i,k) ) * B.load(k,j);
4627  }
4628 
4629  (~C).store( i, j, xmm1 );
4630  }
4631  }
4632  }
4634  //**********************************************************************************************
4635 
4636  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
4651  template< typename MT3 // Type of the left-hand side target matrix
4652  , typename MT4 // Type of the left-hand side matrix operand
4653  , typename MT5 > // Type of the right-hand side matrix operand
4654  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4655  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4656  {
4657  typedef IntrinsicTrait<ElementType> IT;
4658 
4659  const size_t M( A.rows() );
4660  const size_t N( B.columns() );
4661  const size_t K( A.columns() );
4662 
4663  size_t i( 0UL );
4664 
4665  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
4666  for( size_t j=0UL; j<N; ++j )
4667  {
4668  const size_t kbegin( ( IsLower<MT5>::value )
4669  ?( ( IsUpper<MT4>::value )
4670  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4671  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4672  :( IsUpper<MT4>::value ? i : 0UL ) );
4673  const size_t kend( ( IsUpper<MT5>::value )
4674  ?( ( IsLower<MT4>::value )
4675  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4676  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4677  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
4678 
4679  IntrinsicType xmm1( (~C).load(i ,j) );
4680  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
4681  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
4682  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
4683  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
4684  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
4685  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
4686  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
4687 
4688  for( size_t k=kbegin; k<kend; ++k ) {
4689  const IntrinsicType b1( set( B(k,j) ) );
4690  xmm1 = xmm1 - A.load(i ,k) * b1;
4691  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
4692  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
4693  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
4694  xmm5 = xmm5 - A.load(i+IT::size*4UL,k) * b1;
4695  xmm6 = xmm6 - A.load(i+IT::size*5UL,k) * b1;
4696  xmm7 = xmm7 - A.load(i+IT::size*6UL,k) * b1;
4697  xmm8 = xmm8 - A.load(i+IT::size*7UL,k) * b1;
4698  }
4699 
4700  (~C).store( i , j, xmm1 );
4701  (~C).store( i+IT::size , j, xmm2 );
4702  (~C).store( i+IT::size*2UL, j, xmm3 );
4703  (~C).store( i+IT::size*3UL, j, xmm4 );
4704  (~C).store( i+IT::size*4UL, j, xmm5 );
4705  (~C).store( i+IT::size*5UL, j, xmm6 );
4706  (~C).store( i+IT::size*6UL, j, xmm7 );
4707  (~C).store( i+IT::size*7UL, j, xmm8 );
4708  }
4709  }
4710 
4711  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL )
4712  {
4713  size_t j( 0UL );
4714 
4715  for( ; (j+2UL) <= N; j+=2UL )
4716  {
4717  const size_t kbegin( ( IsLower<MT5>::value )
4718  ?( ( IsUpper<MT4>::value )
4719  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4720  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4721  :( IsUpper<MT4>::value ? i : 0UL ) );
4722  const size_t kend( ( IsUpper<MT5>::value )
4723  ?( ( IsLower<MT4>::value )
4724  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4725  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4726  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
4727 
4728  IntrinsicType xmm1( (~C).load(i ,j ) );
4729  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
4730  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
4731  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
4732  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
4733  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
4734  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
4735  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
4736 
4737  for( size_t k=kbegin; k<kend; ++k ) {
4738  const IntrinsicType a1( A.load(i ,k) );
4739  const IntrinsicType a2( A.load(i+IT::size ,k) );
4740  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
4741  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
4742  const IntrinsicType b1( set( B(k,j ) ) );
4743  const IntrinsicType b2( set( B(k,j+1UL) ) );
4744  xmm1 = xmm1 - a1 * b1;
4745  xmm2 = xmm2 - a2 * b1;
4746  xmm3 = xmm3 - a3 * b1;
4747  xmm4 = xmm4 - a4 * b1;
4748  xmm5 = xmm5 - a1 * b2;
4749  xmm6 = xmm6 - a2 * b2;
4750  xmm7 = xmm7 - a3 * b2;
4751  xmm8 = xmm8 - a4 * b2;
4752  }
4753 
4754  (~C).store( i , j , xmm1 );
4755  (~C).store( i+IT::size , j , xmm2 );
4756  (~C).store( i+IT::size*2UL, j , xmm3 );
4757  (~C).store( i+IT::size*3UL, j , xmm4 );
4758  (~C).store( i , j+1UL, xmm5 );
4759  (~C).store( i+IT::size , j+1UL, xmm6 );
4760  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
4761  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
4762  }
4763 
4764  if( j < N )
4765  {
4766  const size_t kbegin( ( IsLower<MT5>::value )
4767  ?( ( IsUpper<MT4>::value )
4768  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4769  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4770  :( IsUpper<MT4>::value ? i : 0UL ) );
4771  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
4772 
4773  IntrinsicType xmm1( (~C).load(i ,j) );
4774  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
4775  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
4776  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
4777 
4778  for( size_t k=kbegin; k<kend; ++k ) {
4779  const IntrinsicType b1( set( B(k,j) ) );
4780  xmm1 = xmm1 - A.load(i ,k) * b1;
4781  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
4782  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
4783  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
4784  }
4785 
4786  (~C).store( i , j, xmm1 );
4787  (~C).store( i+IT::size , j, xmm2 );
4788  (~C).store( i+IT::size*2UL, j, xmm3 );
4789  (~C).store( i+IT::size*3UL, j, xmm4 );
4790  }
4791  }
4792 
4793  for( ; (i+IT::size) < M; i+=IT::size*2UL )
4794  {
4795  size_t j( 0UL );
4796 
4797  for( ; (j+2UL) <= N; j+=2UL )
4798  {
4799  const size_t kbegin( ( IsLower<MT5>::value )
4800  ?( ( IsUpper<MT4>::value )
4801  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4802  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4803  :( IsUpper<MT4>::value ? i : 0UL ) );
4804  const size_t kend( ( IsUpper<MT5>::value )
4805  ?( ( IsLower<MT4>::value )
4806  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4807  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4808  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
4809 
4810  IntrinsicType xmm1( (~C).load(i ,j ) );
4811  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
4812  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
4813  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
4814 
4815  for( size_t k=kbegin; k<kend; ++k ) {
4816  const IntrinsicType a1( A.load(i ,k) );
4817  const IntrinsicType a2( A.load(i+IT::size,k) );
4818  const IntrinsicType b1( set( B(k,j ) ) );
4819  const IntrinsicType b2( set( B(k,j+1UL) ) );
4820  xmm1 = xmm1 - a1 * b1;
4821  xmm2 = xmm2 - a2 * b1;
4822  xmm3 = xmm3 - a1 * b2;
4823  xmm4 = xmm4 - a2 * b2;
4824  }
4825 
4826  (~C).store( i , j , xmm1 );
4827  (~C).store( i+IT::size, j , xmm2 );
4828  (~C).store( i , j+1UL, xmm3 );
4829  (~C).store( i+IT::size, j+1UL, xmm4 );
4830  }
4831 
4832  if( j < N )
4833  {
4834  const size_t kbegin( ( IsLower<MT5>::value )
4835  ?( ( IsUpper<MT4>::value )
4836  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4837  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4838  :( IsUpper<MT4>::value ? i : 0UL ) );
4839  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
4840 
4841  IntrinsicType xmm1( (~C).load(i ,j) );
4842  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
4843 
4844  for( size_t k=kbegin; k<kend; ++k ) {
4845  const IntrinsicType b1( set( B(k,j) ) );
4846  xmm1 = xmm1 - A.load(i ,k) * b1;
4847  xmm2 = xmm2 - A.load(i+IT::size,k) * b1;
4848  }
4849 
4850  (~C).store( i , j, xmm1 );
4851  (~C).store( i+IT::size, j, xmm2 );
4852  }
4853  }
4854 
4855  if( i < M )
4856  {
4857  size_t j( 0UL );
4858 
4859  for( ; (j+2UL) <= N; j+=2UL )
4860  {
4861  const size_t kbegin( ( IsLower<MT5>::value )
4862  ?( ( IsUpper<MT4>::value )
4863  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4864  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4865  :( IsUpper<MT4>::value ? i : 0UL ) );
4866  const size_t kend( ( IsUpper<MT5>::value )
4867  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4868  :( K ) );
4869 
4870  IntrinsicType xmm1( (~C).load(i,j ) );
4871  IntrinsicType xmm2( (~C).load(i,j+1UL) );
4872 
4873  for( size_t k=kbegin; k<kend; ++k ) {
4874  const IntrinsicType a1( A.load(i,k) );
4875  xmm1 = xmm1 - a1 * set( B(k,j ) );
4876  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
4877  }
4878 
4879  (~C).store( i, j , xmm1 );
4880  (~C).store( i, j+1UL, xmm2 );
4881  }
4882 
4883  if( j < N )
4884  {
4885  const size_t kbegin( ( IsLower<MT5>::value )
4886  ?( ( IsUpper<MT4>::value )
4887  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4888  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4889  :( IsUpper<MT4>::value ? i : 0UL ) );
4890 
4891  IntrinsicType xmm1( (~C).load(i,j) );
4892 
4893  for( size_t k=kbegin; k<K; ++k ) {
4894  xmm1 = xmm1 - A.load(i,k) * set( B(k,j) );
4895  }
4896 
4897  (~C).store( i, j, xmm1 );
4898  }
4899  }
4900  }
4902  //**********************************************************************************************
4903 
4904  //**Default subtraction assignment to dense matrices (large matrices)***************************
4918  template< typename MT3 // Type of the left-hand side target matrix
4919  , typename MT4 // Type of the left-hand side matrix operand
4920  , typename MT5 > // Type of the right-hand side matrix operand
4921  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4922  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4923  {
4924  selectDefaultSubAssignKernel( C, A, B );
4925  }
4927  //**********************************************************************************************
4928 
4929  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
4944  template< typename MT3 // Type of the left-hand side target matrix
4945  , typename MT4 // Type of the left-hand side matrix operand
4946  , typename MT5 > // Type of the right-hand side matrix operand
4947  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4948  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4949  {
4950  typedef IntrinsicTrait<ElementType> IT;
4951 
4952  const size_t M( A.rows() );
4953  const size_t N( B.columns() );
4954  const size_t K( A.columns() );
4955 
4956  const size_t iblock( 64UL );
4957  const size_t jblock( 128UL );
4958  const size_t kblock( 128UL );
4959 
4960  for( size_t jj=0UL; jj<N; jj+=jblock )
4961  {
4962  const size_t jend( min( jj+jblock, N ) );
4963 
4964  for( size_t ii=0UL; ii<M; ii+=iblock )
4965  {
4966  const size_t iend( min( ii+iblock, M ) );
4967 
4968  for( size_t kk=0UL; kk<K; kk+=kblock )
4969  {
4970  const size_t ktmp( min( kk+kblock, K ) );
4971 
4972  size_t j( jj );
4973 
4974  for( ; (j+IT::size*3UL) < jend; j+=IT::size*4UL )
4975  {
4976  const size_t j1( j+IT::size );
4977  const size_t j2( j+IT::size*2UL );
4978  const size_t j3( j+IT::size*3UL );
4979 
4980  size_t i( ii );
4981 
4982  for( ; (i+2UL) <= iend; i+=2UL )
4983  {
4984  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
4985  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
4986  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
4987  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
4988 
4989  IntrinsicType xmm1( (~C).load(i ,j ) );
4990  IntrinsicType xmm2( (~C).load(i ,j1) );
4991  IntrinsicType xmm3( (~C).load(i ,j2) );
4992  IntrinsicType xmm4( (~C).load(i ,j3) );
4993  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
4994  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
4995  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
4996  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
4997 
4998  for( size_t k=kbegin; k<kend; ++k ) {
4999  const IntrinsicType a1( set( A(i ,k) ) );
5000  const IntrinsicType a2( set( A(i+1UL,k) ) );
5001  const IntrinsicType b1( B.load(k,j ) );
5002  const IntrinsicType b2( B.load(k,j1) );
5003  const IntrinsicType b3( B.load(k,j2) );
5004  const IntrinsicType b4( B.load(k,j3) );
5005  xmm1 = xmm1 - a1 * b1;
5006  xmm2 = xmm2 - a1 * b2;
5007  xmm3 = xmm3 - a1 * b3;
5008  xmm4 = xmm4 - a1 * b4;
5009  xmm5 = xmm5 - a2 * b1;
5010  xmm6 = xmm6 - a2 * b2;
5011  xmm7 = xmm7 - a2 * b3;
5012  xmm8 = xmm8 - a2 * b4;
5013  }
5014 
5015  (~C).store( i , j , xmm1 );
5016  (~C).store( i , j1, xmm2 );
5017  (~C).store( i , j2, xmm3 );
5018  (~C).store( i , j3, xmm4 );
5019  (~C).store( i+1UL, j , xmm5 );
5020  (~C).store( i+1UL, j1, xmm6 );
5021  (~C).store( i+1UL, j2, xmm7 );
5022  (~C).store( i+1UL, j3, xmm8 );
5023  }
5024 
5025  if( i < iend )
5026  {
5027  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5028  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5029  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5030  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
5031 
5032  IntrinsicType xmm1( (~C).load(i,j ) );
5033  IntrinsicType xmm2( (~C).load(i,j1) );
5034  IntrinsicType xmm3( (~C).load(i,j2) );
5035  IntrinsicType xmm4( (~C).load(i,j3) );
5036 
5037  for( size_t k=kbegin; k<kend; ++k ) {
5038  const IntrinsicType a1( set( A(i,k) ) );
5039  xmm1 = xmm1 - a1 * B.load(k,j );
5040  xmm2 = xmm2 - a1 * B.load(k,j1);
5041  xmm3 = xmm3 - a1 * B.load(k,j2);
5042  xmm4 = xmm4 - a1 * B.load(k,j3);
5043  }
5044 
5045  (~C).store( i, j , xmm1 );
5046  (~C).store( i, j1, xmm2 );
5047  (~C).store( i, j2, xmm3 );
5048  (~C).store( i, j3, xmm4 );
5049  }
5050  }
5051 
5052  for( ; (j+IT::size) < jend; j+=IT::size*2UL )
5053  {
5054  const size_t j1( j+IT::size );
5055 
5056  size_t i( ii );
5057 
5058  for( ; (i+4UL) <= iend; i+=4UL )
5059  {
5060  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5061  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5062  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5063  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
5064 
5065  IntrinsicType xmm1( (~C).load(i ,j ) );
5066  IntrinsicType xmm2( (~C).load(i ,j1) );
5067  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
5068  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
5069  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
5070  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
5071  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
5072  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
5073 
5074  for( size_t k=kbegin; k<kend; ++k ) {
5075  const IntrinsicType a1( set( A(i ,k) ) );
5076  const IntrinsicType a2( set( A(i+1UL,k) ) );
5077  const IntrinsicType a3( set( A(i+2UL,k) ) );
5078  const IntrinsicType a4( set( A(i+3UL,k) ) );
5079  const IntrinsicType b1( B.load(k,j ) );
5080  const IntrinsicType b2( B.load(k,j1) );
5081  xmm1 = xmm1 - a1 * b1;
5082  xmm2 = xmm2 - a1 * b2;
5083  xmm3 = xmm3 - a2 * b1;
5084  xmm4 = xmm4 - a2 * b2;
5085  xmm5 = xmm5 - a3 * b1;
5086  xmm6 = xmm6 - a3 * b2;
5087  xmm7 = xmm7 - a4 * b1;
5088  xmm8 = xmm8 - a4 * b2;
5089  }
5090 
5091  (~C).store( i , j , xmm1 );
5092  (~C).store( i , j1, xmm2 );
5093  (~C).store( i+1UL, j , xmm3 );
5094  (~C).store( i+1UL, j1, xmm4 );
5095  (~C).store( i+2UL, j , xmm5 );
5096  (~C).store( i+2UL, j1, xmm6 );
5097  (~C).store( i+3UL, j , xmm7 );
5098  (~C).store( i+3UL, j1, xmm8 );
5099  }
5100 
5101  for( ; (i+2UL) <= iend; i+=2UL )
5102  {
5103  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5104  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5105  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5106  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
5107 
5108  IntrinsicType xmm1( (~C).load(i ,j ) );
5109  IntrinsicType xmm2( (~C).load(i ,j1) );
5110  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
5111  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
5112 
5113  for( size_t k=kbegin; k<kend; ++k ) {
5114  const IntrinsicType a1( set( A(i ,k) ) );
5115  const IntrinsicType a2( set( A(i+1UL,k) ) );
5116  const IntrinsicType b1( B.load(k,j ) );
5117  const IntrinsicType b2( B.load(k,j1) );
5118  xmm1 = xmm1 - a1 * b1;
5119  xmm2 = xmm2 - a1 * b2;
5120  xmm3 = xmm3 - a2 * b1;
5121  xmm4 = xmm4 - a2 * b2;
5122  }
5123 
5124  (~C).store( i , j , xmm1 );
5125  (~C).store( i , j1, xmm2 );
5126  (~C).store( i+1UL, j , xmm3 );
5127  (~C).store( i+1UL, j1, xmm4 );
5128  }
5129 
5130  if( i < iend )
5131  {
5132  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5133  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5134  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5135  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
5136 
5137  IntrinsicType xmm1( (~C).load(i,j ) );
5138  IntrinsicType xmm2( (~C).load(i,j1) );
5139 
5140  for( size_t k=kbegin; k<kend; ++k ) {
5141  const IntrinsicType a1( set( A(i,k) ) );
5142  xmm1 = xmm1 - a1 * B.load(k,j );
5143  xmm2 = xmm2 - a1 * B.load(k,j1);
5144  }
5145 
5146  (~C).store( i, j , xmm1 );
5147  (~C).store( i, j1, xmm2 );
5148  }
5149  }
5150 
5151  if( j < jend )
5152  {
5153  for( size_t i=ii; i<iend; ++i )
5154  {
5155  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5156  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5157  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5158  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
5159 
5160  IntrinsicType xmm1( (~C).load(i,j) );
5161 
5162  for( size_t k=kbegin; k<kend; ++k ) {
5163  const IntrinsicType a1( set( A(i,k) ) );
5164  xmm1 = xmm1 - a1 * B.load(k,j);
5165  }
5166 
5167  (~C).store( i, j, xmm1 );
5168  }
5169  }
5170  }
5171  }
5172  }
5173  }
5175  //**********************************************************************************************
5176 
5177  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
// Vectorized default kernel for the subtraction assignment C -= A * B on large matrices with
// a column-major target (the target is taken as DenseMatrix<MT3,true>).
//
// \param C The target left-hand side dense matrix; it is read and written through (~C).
// \param A The left-hand side multiplication operand (accessed via A.load(i,k)).
// \param B The right-hand side multiplication operand (accessed element-wise via B(k,j)).
//
// The overload is enabled through UseVectorizedDefaultKernel<MT3,MT4,MT5>. The iteration space
// is tiled into blocks of 128 rows (iblock), 64 columns (jblock) and 128 inner indices (kblock).
// Inside a tile, rows are consumed in panels of 4*IT::size, then 2*IT::size, then one final
// IT::size-wide panel; columns are consumed 2 (or 4, 2, 1) at a time. For every patch of C the
// current values are loaded, the products over the k range are subtracted, and the patch is
// stored back.
// NOTE(review): IT::size is presumably the number of elements held by one IntrinsicType, so
// each load/store is assumed to cover IT::size consecutive rows of one column -- confirm.
// NOTE(review): the panel loop conditions only guarantee that each vector *starts* before iend;
// presumably the operands are padded so that a partially filled last vector is safe -- confirm.
5192  template< typename MT3 // Type of the left-hand side target matrix
5193  , typename MT4 // Type of the left-hand side matrix operand
5194  , typename MT5 > // Type of the right-hand side matrix operand
5195  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
5196  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
5197  {
5198  typedef IntrinsicTrait<ElementType> IT;
5199 
// Problem dimensions: C is M x N, the inner (summation) dimension is K.
5200  const size_t M( A.rows() );
5201  const size_t N( B.columns() );
5202  const size_t K( A.columns() );
5203 
// Fixed tile sizes for the row, column and inner dimension.
5204  const size_t iblock( 128UL );
5205  const size_t jblock( 64UL );
5206  const size_t kblock( 128UL );
5207 
5208  for( size_t ii=0UL; ii<M; ii+=iblock )
5209  {
5210  const size_t iend( min( ii+iblock, M ) );
5211 
5212  for( size_t jj=0UL; jj<N; jj+=jblock )
5213  {
5214  const size_t jend( min( jj+jblock, N ) );
5215 
5216  for( size_t kk=0UL; kk<K; kk+=kblock )
5217  {
// ktmp is the exclusive upper bound of the current k tile.
5218  const size_t ktmp( min( kk+kblock, K ) );
5219 
5220  size_t i( ii );
5221 
// Row panel of four vectors (rows i, i1, i2, i3); entered while the fourth vector still
// starts inside the tile.
5222  for( ; (i+IT::size*3UL) < iend; i+=IT::size*4UL )
5223  {
5224  const size_t i1( i+IT::size );
5225  const size_t i2( i+IT::size*2UL );
5226  const size_t i3( i+IT::size*3UL );
5227 
5228  size_t j( jj );
5229 
// Two columns at a time: eight accumulators cover a (4 vectors) x (2 columns) patch of C.
5230  for( ; (j+2UL) <= jend; j+=2UL )
5231  {
// The k range is narrowed by the structure traits: an upper A starts no earlier than i,
// a lower B no earlier than j; a lower A ends at the end of the row panel (clamped to ktmp),
// an upper B ends at j+2. Presumably this skips structurally zero elements.
5232  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5233  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5234  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
5235  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5236 
// Start from the current values of C so the products can be subtracted in place.
5237  IntrinsicType xmm1( (~C).load(i ,j ) );
5238  IntrinsicType xmm2( (~C).load(i1,j ) );
5239  IntrinsicType xmm3( (~C).load(i2,j ) );
5240  IntrinsicType xmm4( (~C).load(i3,j ) );
5241  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
5242  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
5243  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
5244  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
5245 
// Each step combines four vectors of column k of A with the two broadcast scalars
// B(k,j) and B(k,j+1).
5246  for( size_t k=kbegin; k<kend; ++k ) {
5247  const IntrinsicType a1( A.load(i ,k) );
5248  const IntrinsicType a2( A.load(i1,k) );
5249  const IntrinsicType a3( A.load(i2,k) );
5250  const IntrinsicType a4( A.load(i3,k) );
5251  const IntrinsicType b1( set( B(k,j ) ) );
5252  const IntrinsicType b2( set( B(k,j+1UL) ) );
5253  xmm1 = xmm1 - a1 * b1;
5254  xmm2 = xmm2 - a2 * b1;
5255  xmm3 = xmm3 - a3 * b1;
5256  xmm4 = xmm4 - a4 * b1;
5257  xmm5 = xmm5 - a1 * b2;
5258  xmm6 = xmm6 - a2 * b2;
5259  xmm7 = xmm7 - a3 * b2;
5260  xmm8 = xmm8 - a4 * b2;
5261  }
5262 
5263  (~C).store( i , j , xmm1 );
5264  (~C).store( i1, j , xmm2 );
5265  (~C).store( i2, j , xmm3 );
5266  (~C).store( i3, j , xmm4 );
5267  (~C).store( i , j+1UL, xmm5 );
5268  (~C).store( i1, j+1UL, xmm6 );
5269  (~C).store( i2, j+1UL, xmm7 );
5270  (~C).store( i3, j+1UL, xmm8 );
5271  }
5272 
// Remaining single column of the tile for the four-vector row panel.
5273  if( j < jend )
5274  {
5275  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5276  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5277  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
5278  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5279 
5280  IntrinsicType xmm1( (~C).load(i ,j) );
5281  IntrinsicType xmm2( (~C).load(i1,j) );
5282  IntrinsicType xmm3( (~C).load(i2,j) );
5283  IntrinsicType xmm4( (~C).load(i3,j) );
5284 
5285  for( size_t k=kbegin; k<kend; ++k ) {
5286  const IntrinsicType b1( set( B(k,j) ) );
5287  xmm1 = xmm1 - A.load(i ,k) * b1;
5288  xmm2 = xmm2 - A.load(i1,k) * b1;
5289  xmm3 = xmm3 - A.load(i2,k) * b1;
5290  xmm4 = xmm4 - A.load(i3,k) * b1;
5291  }
5292 
5293  (~C).store( i , j, xmm1 );
5294  (~C).store( i1, j, xmm2 );
5295  (~C).store( i2, j, xmm3 );
5296  (~C).store( i3, j, xmm4 );
5297  }
5298  }
5299 
// Row panel of two vectors (rows i, i1); entered while the second vector still starts
// inside the tile.
5300  for( ; (i+IT::size) < iend; i+=IT::size*2UL )
5301  {
5302  const size_t i1( i+IT::size );
5303 
5304  size_t j( jj );
5305 
// Four columns at a time: eight accumulators cover a (2 vectors) x (4 columns) patch of C.
5306  for( ; (j+4UL) <= jend; j+=4UL )
5307  {
5308  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5309  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5310  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
5311  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5312 
5313  IntrinsicType xmm1( (~C).load(i ,j ) );
5314  IntrinsicType xmm2( (~C).load(i1,j ) );
5315  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
5316  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
5317  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
5318  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
5319  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
5320  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
5321 
5322  for( size_t k=kbegin; k<kend; ++k ) {
5323  const IntrinsicType a1( A.load(i ,k) );
5324  const IntrinsicType a2( A.load(i1,k) );
5325  const IntrinsicType b1( set( B(k,j ) ) );
5326  const IntrinsicType b2( set( B(k,j+1UL) ) );
5327  const IntrinsicType b3( set( B(k,j+2UL) ) );
5328  const IntrinsicType b4( set( B(k,j+3UL) ) );
5329  xmm1 = xmm1 - a1 * b1;
5330  xmm2 = xmm2 - a2 * b1;
5331  xmm3 = xmm3 - a1 * b2;
5332  xmm4 = xmm4 - a2 * b2;
5333  xmm5 = xmm5 - a1 * b3;
5334  xmm6 = xmm6 - a2 * b3;
5335  xmm7 = xmm7 - a1 * b4;
5336  xmm8 = xmm8 - a2 * b4;
5337  }
5338 
5339  (~C).store( i , j , xmm1 );
5340  (~C).store( i1, j , xmm2 );
5341  (~C).store( i , j+1UL, xmm3 );
5342  (~C).store( i1, j+1UL, xmm4 );
5343  (~C).store( i , j+2UL, xmm5 );
5344  (~C).store( i1, j+2UL, xmm6 );
5345  (~C).store( i , j+3UL, xmm7 );
5346  (~C).store( i1, j+3UL, xmm8 );
5347  }
5348 
// Two columns at a time for what the four-column loop left over.
5349  for( ; (j+2UL) <= jend; j+=2UL )
5350  {
5351  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5352  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5353  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
5354  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5355 
5356  IntrinsicType xmm1( (~C).load(i ,j ) );
5357  IntrinsicType xmm2( (~C).load(i1,j ) );
5358  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
5359  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
5360 
5361  for( size_t k=kbegin; k<kend; ++k ) {
5362  const IntrinsicType a1( A.load(i ,k) );
5363  const IntrinsicType a2( A.load(i1,k) );
5364  const IntrinsicType b1( set( B(k,j ) ) );
5365  const IntrinsicType b2( set( B(k,j+1UL) ) );
5366  xmm1 = xmm1 - a1 * b1;
5367  xmm2 = xmm2 - a2 * b1;
5368  xmm3 = xmm3 - a1 * b2;
5369  xmm4 = xmm4 - a2 * b2;
5370  }
5371 
5372  (~C).store( i , j , xmm1 );
5373  (~C).store( i1, j , xmm2 );
5374  (~C).store( i , j+1UL, xmm3 );
5375  (~C).store( i1, j+1UL, xmm4 );
5376  }
5377 
// Remaining single column of the tile for the two-vector row panel.
5378  if( j < jend )
5379  {
5380  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5381  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5382  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
5383  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5384 
5385  IntrinsicType xmm1( (~C).load(i ,j) );
5386  IntrinsicType xmm2( (~C).load(i1,j) );
5387 
5388  for( size_t k=kbegin; k<kend; ++k ) {
5389  const IntrinsicType b1( set( B(k,j) ) );
5390  xmm1 = xmm1 - A.load(i ,k) * b1;
5391  xmm2 = xmm2 - A.load(i1,k) * b1;
5392  }
5393 
5394  (~C).store( i , j, xmm1 );
5395  (~C).store( i1, j, xmm2 );
5396  }
5397  }
5398 
// Last row panel of the tile: a single vector starting at i, processed column by column.
5399  if( i < iend )
5400  {
5401  for( size_t j=jj; j<jend; ++j )
5402  {
5403  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
5404  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
5405  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
5406  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5407 
5408  IntrinsicType xmm1( (~C).load(i,j) );
5409 
5410  for( size_t k=kbegin; k<kend; ++k ) {
5411  const IntrinsicType b1( set( B(k,j) ) );
5412  xmm1 = xmm1 - A.load(i,k) * b1;
5413  }
5414 
5415  (~C).store( i, j, xmm1 );
5416  }
5417  }
5418  }
5419  }
5420  }
5421  }
5423  //**********************************************************************************************
5424 
5425  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Default overload of the BLAS-based subtraction assignment kernel (C -= A * B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Enabled through UseDefaultKernel<MT3,MT4,MT5>. No BLAS routine is called here; the work is
// forwarded unchanged to selectLargeSubAssignKernel().
5439  template< typename MT3 // Type of the left-hand side target matrix
5440  , typename MT4 // Type of the left-hand side matrix operand
5441  , typename MT5 > // Type of the right-hand side matrix operand
5442  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
5443  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5444  {
5445  selectLargeSubAssignKernel( C, A, B );
5446  }
5448  //**********************************************************************************************
5449 
5450  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
5451 #if BLAZE_BLAS_MODE
5452 
5465  template< typename MT3 // Type of the left-hand side target matrix
5466  , typename MT4 // Type of the left-hand side matrix operand
5467  , typename MT5 > // Type of the right-hand side matrix operand
5468  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
5469  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5470  {
5471  if( IsTriangular<MT4>::value ) {
5472  typename MT3::ResultType tmp( B );
5473  strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
5474  subAssign( C, tmp );
5475  }
5476  else if( IsTriangular<MT5>::value ) {
5477  typename MT3::ResultType tmp( A );
5478  strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
5479  subAssign( C, tmp );
5480  }
5481  else {
5482  sgemm( C, A, B, -1.0F, 1.0F );
5483  }
5484  }
5486 #endif
5487  //**********************************************************************************************
5488 
5489  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
5490 #if BLAZE_BLAS_MODE
5491 
5504  template< typename MT3 // Type of the left-hand side target matrix
5505  , typename MT4 // Type of the left-hand side matrix operand
5506  , typename MT5 > // Type of the right-hand side matrix operand
5507  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
5508  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5509  {
5510  if( IsTriangular<MT4>::value ) {
5511  typename MT3::ResultType tmp( B );
5512  dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
5513  subAssign( C, tmp );
5514  }
5515  else if( IsTriangular<MT5>::value ) {
5516  typename MT3::ResultType tmp( A );
5517  dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
5518  subAssign( C, tmp );
5519  }
5520  else {
5521  dgemm( C, A, B, -1.0, 1.0 );
5522  }
5523  }
5525 #endif
5526  //**********************************************************************************************
5527 
5528  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
5529 #if BLAZE_BLAS_MODE
5530 
5543  template< typename MT3 // Type of the left-hand side target matrix
5544  , typename MT4 // Type of the left-hand side matrix operand
5545  , typename MT5 > // Type of the right-hand side matrix operand
5546  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
5547  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5548  {
5549  if( IsTriangular<MT4>::value ) {
5550  typename MT3::ResultType tmp( B );
5551  ctrmm( tmp, A, CblasLeft,
5552  ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
5553  complex<float>( 1.0F, 0.0F ) );
5554  subAssign( C, tmp );
5555  }
5556  else if( IsTriangular<MT5>::value ) {
5557  typename MT3::ResultType tmp( A );
5558  ctrmm( tmp, B, CblasRight,
5559  ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
5560  complex<float>( 1.0F, 0.0F ) );
5561  subAssign( C, tmp );
5562  }
5563  else {
5564  cgemm( C, A, B, complex<float>( -1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
5565  }
5566  }
5568 #endif
5569  //**********************************************************************************************
5570 
5571  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
5572 #if BLAZE_BLAS_MODE
5573 
5586  template< typename MT3 // Type of the left-hand side target matrix
5587  , typename MT4 // Type of the left-hand side matrix operand
5588  , typename MT5 > // Type of the right-hand side matrix operand
5589  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
5590  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5591  {
5592  if( IsTriangular<MT4>::value ) {
5593  typename MT3::ResultType tmp( B );
5594  ztrmm( tmp, A, CblasLeft,
5595  ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
5596  complex<float>( 1.0, 0.0 ) );
5597  subAssign( C, tmp );
5598  }
5599  else if( IsTriangular<MT5>::value ) {
5600  typename MT3::ResultType tmp( A );
5601  ztrmm( tmp, B, CblasRight,
5602  ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
5603  complex<float>( 1.0, 0.0 ) );
5604  subAssign( C, tmp );
5605  }
5606  else {
5607  zgemm( C, A, B, complex<double>( -1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
5608  }
5609  }
5611 #endif
5612  //**********************************************************************************************
5613 
5614  //**Subtraction assignment to sparse matrices***************************************************
5615  // No special implementation for the subtraction assignment to sparse matrices.
5616  //**********************************************************************************************
5617 
5618  //**Multiplication assignment to dense matrices*************************************************
5619  // No special implementation for the multiplication assignment to dense matrices.
5620  //**********************************************************************************************
5621 
5622  //**Multiplication assignment to sparse matrices************************************************
5623  // No special implementation for the multiplication assignment to sparse matrices.
5624  //**********************************************************************************************
5625 
5626  //**SMP assignment to dense matrices************************************************************
5642  template< typename MT // Type of the target dense matrix
5643  , bool SO > // Storage order of the target dense matrix
5644  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5645  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5646  {
5648 
5649  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5650  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5651 
5652  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
5653  return;
5654  }
5655  else if( rhs.lhs_.columns() == 0UL ) {
5656  reset( ~lhs );
5657  return;
5658  }
5659 
5660  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5661  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5662 
5663  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5664  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5665  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5666  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5667  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5668  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5669 
5670  smpAssign( ~lhs, A * B );
5671  }
5673  //**********************************************************************************************
5674 
5675  //**SMP assignment to sparse matrices***********************************************************
5691  template< typename MT // Type of the target sparse matrix
5692  , bool SO > // Storage order of the target sparse matrix
5693  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5694  smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5695  {
5697 
5698  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
5699 
5706 
5707  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5708  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5709 
5710  const TmpType tmp( rhs );
5711  smpAssign( ~lhs, tmp );
5712  }
5714  //**********************************************************************************************
5715 
5716  //**SMP addition assignment to dense matrices***************************************************
5732  template< typename MT // Type of the target dense matrix
5733  , bool SO > // Storage order of the target dense matrix
5734  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5735  smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5736  {
5738 
5739  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5740  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5741 
5742  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5743  return;
5744  }
5745 
5746  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5747  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5748 
5749  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5750  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5751  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5752  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5753  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5754  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5755 
5756  smpAddAssign( ~lhs, A * B );
5757  }
5759  //**********************************************************************************************
5760 
5761  //**SMP addition assignment to sparse matrices**************************************************
5762  // No special implementation for the SMP addition assignment to sparse matrices.
5763  //**********************************************************************************************
5764 
5765  //**SMP subtraction assignment to dense matrices************************************************
5781  template< typename MT // Type of the target dense matrix
5782  , bool SO > // Storage order of the target dense matrix
5783  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5784  smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5785  {
5787 
5788  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5789  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5790 
5791  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5792  return;
5793  }
5794 
5795  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5796  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5797 
5798  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5799  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5800  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5801  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5802  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5803  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5804 
5805  smpSubAssign( ~lhs, A * B );
5806  }
5808  //**********************************************************************************************
5809 
5810  //**SMP subtraction assignment to sparse matrices***********************************************
5811  // No special implementation for the SMP subtraction assignment to sparse matrices.
5812  //**********************************************************************************************
5813 
5814  //**SMP multiplication assignment to dense matrices*********************************************
5815  // No special implementation for the SMP multiplication assignment to dense matrices.
5816  //**********************************************************************************************
5817 
5818  //**SMP multiplication assignment to sparse matrices********************************************
5819  // No special implementation for the SMP multiplication assignment to sparse matrices.
5820  //**********************************************************************************************
5821 
5822  //**Compile time checks*************************************************************************
5830  //**********************************************************************************************
5831 };
5832 //*************************************************************************************************
5833 
5834 
5835 
5836 
5837 //=================================================================================================
5838 //
5839 // DMATSCALARMULTEXPR SPECIALIZATION
5840 //
5841 //=================================================================================================
5842 
5843 //*************************************************************************************************
5851 template< typename MT1 // Type of the left-hand side dense matrix
5852  , typename MT2 // Type of the right-hand side dense matrix
5853  , typename ST > // Type of the right-hand side scalar value
5854 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >
5855  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
5856  , private MatScalarMultExpr
5857  , private Computation
5858 {
5859  private:
5860  //**Type definitions****************************************************************************
// Private shorthands used throughout the specialization:
//   MMM      - the wrapped matrix/matrix multiplication expression
//   RES      - result type of that expression
//   RT1, RT2 - result types of the left-hand / right-hand side matrix operand
//   ET1, ET2 - element types of RT1 / RT2
//   CT1, CT2 - composite types of the left-hand / right-hand side matrix operand
5861  typedef TDMatDMatMultExpr<MT1,MT2> MMM;
5862  typedef typename MMM::ResultType RES;
5863  typedef typename MT1::ResultType RT1;
5864  typedef typename MT2::ResultType RT2;
5865  typedef typename RT1::ElementType ET1;
5866  typedef typename RT2::ElementType ET2;
5867  typedef typename MT1::CompositeType CT1;
5868  typedef typename MT2::CompositeType CT2;
5869  //**********************************************************************************************
5870 
5871  //**********************************************************************************************
// Compile-time flag for the left-hand side operand: set if MT1 is a computation
// (IsComputation) or requires an intermediate evaluation (RequiresEvaluation).
5873  enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
5874  //**********************************************************************************************
5875 
5876  //**********************************************************************************************
// Compile-time flag for the right-hand side operand: set if MT2 is a computation
// (IsComputation) or requires an intermediate evaluation (RequiresEvaluation).
5878  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
5879  //**********************************************************************************************
5880 
5881  //**********************************************************************************************
5883 
// Helper predicate: value is set if either operand has to be evaluated (evaluateLeft or
// evaluateRight). The three template parameters do not enter the visible definition.
5886  template< typename T1, typename T2, typename T3 >
5887  struct IsEvaluationRequired {
5888  enum { value = ( evaluateLeft || evaluateRight ) };
5889  };
5890  //**********************************************************************************************
5891 
5892  //**********************************************************************************************
5894 
// Helper predicate selecting the single precision BLAS kernel. value is set only if
// BLAZE_BLAS_MODE is active, T1 offers mutable and T2/T3 constant data access, neither T2 nor
// T3 is diagonal, all three types are vectorizable, all three element types are float
// (IsFloat), and the fourth type T4 is not complex.
// NOTE(review): T4 is presumably the scalar type of the scaled expression -- confirm at the
// point of use.
5897  template< typename T1, typename T2, typename T3, typename T4 >
5898  struct UseSinglePrecisionKernel {
5899  enum { value = BLAZE_BLAS_MODE &&
5900  HasMutableDataAccess<T1>::value &&
5901  HasConstDataAccess<T2>::value &&
5902  HasConstDataAccess<T3>::value &&
5903  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5904  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5905  IsFloat<typename T1::ElementType>::value &&
5906  IsFloat<typename T2::ElementType>::value &&
5907  IsFloat<typename T3::ElementType>::value &&
5908  !IsComplex<T4>::value };
5909  };
5910  //**********************************************************************************************
5911 
5912  //**********************************************************************************************
5914 
// Helper predicate selecting the double precision BLAS kernel. value is set only if
// BLAZE_BLAS_MODE is active, T1 offers mutable and T2/T3 constant data access, neither T2 nor
// T3 is diagonal, all three types are vectorizable, all three element types are double
// (IsDouble), and the fourth type T4 is not complex.
// NOTE(review): T4 is presumably the scalar type of the scaled expression -- confirm at the
// point of use.
5917  template< typename T1, typename T2, typename T3, typename T4 >
5918  struct UseDoublePrecisionKernel {
5919  enum { value = BLAZE_BLAS_MODE &&
5920  HasMutableDataAccess<T1>::value &&
5921  HasConstDataAccess<T2>::value &&
5922  HasConstDataAccess<T3>::value &&
5923  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5924  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5925  IsDouble<typename T1::ElementType>::value &&
5926  IsDouble<typename T2::ElementType>::value &&
5927  IsDouble<typename T3::ElementType>::value &&
5928  !IsComplex<T4>::value };
5929  };
5930  //**********************************************************************************************
5931 
5932  //**********************************************************************************************
5934 
// Helper predicate selecting the single precision complex BLAS kernel. value is set only if
// BLAZE_BLAS_MODE is active, T1 offers mutable and T2/T3 constant data access, neither T2 nor
// T3 is diagonal, all three types are vectorizable, and all three element types are exactly
// complex<float> (the nested Type).
5937  template< typename T1, typename T2, typename T3 >
5938  struct UseSinglePrecisionComplexKernel {
5939  typedef complex<float> Type;
5940  enum { value = BLAZE_BLAS_MODE &&
5941  HasMutableDataAccess<T1>::value &&
5942  HasConstDataAccess<T2>::value &&
5943  HasConstDataAccess<T3>::value &&
5944  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5945  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5946  IsSame<typename T1::ElementType,Type>::value &&
5947  IsSame<typename T2::ElementType,Type>::value &&
5948  IsSame<typename T3::ElementType,Type>::value };
5949  };
5950  //**********************************************************************************************
5951 
5952  //**********************************************************************************************
5954 
// Helper predicate selecting the double precision complex BLAS kernel. value is set only if
// BLAZE_BLAS_MODE is active, T1 offers mutable and T2/T3 constant data access, neither T2 nor
// T3 is diagonal, all three types are vectorizable, and all three element types are exactly
// complex<double> (the nested Type).
5957  template< typename T1, typename T2, typename T3 >
5958  struct UseDoublePrecisionComplexKernel {
5959  typedef complex<double> Type;
5960  enum { value = BLAZE_BLAS_MODE &&
5961  HasMutableDataAccess<T1>::value &&
5962  HasConstDataAccess<T2>::value &&
5963  HasConstDataAccess<T3>::value &&
5964  !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5965  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5966  IsSame<typename T1::ElementType,Type>::value &&
5967  IsSame<typename T2::ElementType,Type>::value &&
5968  IsSame<typename T3::ElementType,Type>::value };
5969  };
5970  //**********************************************************************************************
5971 
5972  //**********************************************************************************************
5974 
// Helper predicate selecting the default (non-BLAS) kernel. value is set if BLAZE_BLAS_MODE is
// inactive or if none of the four BLAS kernel predicates above applies to the given types.
5976  template< typename T1, typename T2, typename T3, typename T4 >
5977  struct UseDefaultKernel {
5978  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
5979  !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
5980  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
5981  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
5982  };
5983  //**********************************************************************************************
5984 
5985  //**********************************************************************************************
5987 
// Helper predicate selecting the vectorized default kernel. value is cleared if both operands
// are diagonal, if T2 is diagonal and the target T1 is column-major, or if T3 is diagonal and
// the target T1 is row-major. Otherwise it is set only if all three matrix types are
// vectorizable, T1's element type equals the element types of T2 and T3 as well as T4, and
// IntrinsicTrait of that element type reports addition, subtraction and multiplication.
5989  template< typename T1, typename T2, typename T3, typename T4 >
5990  struct UseVectorizedDefaultKernel {
5991  enum { value = !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
5992  !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
5993  !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
5994  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5995  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
5996  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
5997  IsSame<typename T1::ElementType,T4>::value &&
5998  IntrinsicTrait<typename T1::ElementType>::addition &&
5999  IntrinsicTrait<typename T1::ElementType>::subtraction &&
6000  IntrinsicTrait<typename T1::ElementType>::multiplication };
6001  };
6002  //**********************************************************************************************
6003 
6004  public:
6005  //**Type definitions****************************************************************************
// Public type definitions of the scaled multiplication expression:
//   This          - type of this DMatScalarMultExpr instance
//   ResultType    - MultTrait result of scaling RES by ST
//   OppositeType / TransposeType / ElementType - taken from ResultType
//   IntrinsicType - intrinsic type associated with ElementType via IntrinsicTrait
//   ReturnType    - return type for element access (const ElementType)
//   CompositeType - data type for composite expression templates (const ResultType)
6006  typedef DMatScalarMultExpr<MMM,ST,true> This;
6007  typedef typename MultTrait<RES,ST>::Type ResultType;
6008  typedef typename ResultType::OppositeType OppositeType;
6009  typedef typename ResultType::TransposeType TransposeType;
6010  typedef typename ResultType::ElementType ElementType;
6011  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType;
6012  typedef const ElementType ReturnType;
6013  typedef const ResultType CompositeType;
6014 
// Type of the left-hand side operand: the wrapped multiplication expression.
6016  typedef const TDMatDMatMultExpr<MT1,MT2> LeftOperand;
6017 
// Type of the right-hand side operand: the scalar.
6019  typedef ST RightOperand;
6020 
// Operand type for the left matrix; SelectType presumably yields const RT1 when evaluateLeft
// is set and CT1 otherwise.
6022  typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type LT;
6023 
// Operand type for the right matrix; SelectType presumably yields const RT2 when evaluateRight
// is set and CT2 otherwise.
6025  typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type RT;
6026  //**********************************************************************************************
6027 
6028  //**Compilation flags***************************************************************************
// Vectorization flag: set unless both operand types are diagonal, and only if both operand
// types are vectorizable, ET1, ET2 and ST are the same type, and IntrinsicTrait<ET1> reports
// addition and multiplication.
6030  enum { vectorizable = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
6031  MT1::vectorizable && MT2::vectorizable &&
6032  IsSame<ET1,ET2>::value &&
6033  IsSame<ET1,ST>::value &&
6034  IntrinsicTrait<ET1>::addition &&
6035  IntrinsicTrait<ET1>::multiplication };
6036 
// SMP assignment flag: requires evaluateLeft and evaluateRight to be unset and both
// MT1 and MT2 to report smpAssignable.
6038  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
6039  !evaluateRight && MT2::smpAssignable };
6040  //**********************************************************************************************
6041 
6042  //**Constructor*********************************************************************************
// Constructs the scaled product expression.
//   matrix  The left-hand side dense matrix product; stored in matrix_.
//   scalar  The right-hand side scalar; stored in scalar_.
6048  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
6049  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
6050  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
6051  {}
6052  //**********************************************************************************************
6053 
6054  //**Access operator*****************************************************************************
6061  inline ResultType operator()( size_t i, size_t j ) const {
6062  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
6063  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
6064  return matrix_(i,j) * scalar_;
6065  }
6066  //**********************************************************************************************
6067 
6068  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, as reported by the wrapped product matrix_.
6073  inline size_t rows() const {
6074  return matrix_.rows();
6075  }
6076  //**********************************************************************************************
6077 
6078  //**Columns function****************************************************************************
// Returns the number of columns of the expression, as reported by the wrapped product matrix_.
6083  inline size_t columns() const {
6084  return matrix_.columns();
6085  }
6086  //**********************************************************************************************
6087 
6088  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the wrapped dense matrix product.
6093  inline LeftOperand leftOperand() const {
6094  return matrix_;
6095  }
6096  //**********************************************************************************************
6097 
6098  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the scalar factor.
6103  inline RightOperand rightOperand() const {
6104  return scalar_;
6105  }
6106  //**********************************************************************************************
6107 
6108  //**********************************************************************************************
// Forwards the "can alias" query for the given address to the wrapped matrix product
// and returns its answer; the scalar operand is not consulted.
6114  template< typename T >
6115  inline bool canAlias( const T* alias ) const {
6116  return matrix_.canAlias( alias );
6117  }
6118  //**********************************************************************************************
6119 
6120  //**********************************************************************************************
// Forwards the "is aliased" query for the given address to the wrapped matrix product
// and returns its answer; the scalar operand is not consulted.
6126  template< typename T >
6127  inline bool isAliased( const T* alias ) const {
6128  return matrix_.isAliased( alias );
6129  }
6130  //**********************************************************************************************
6131 
6132  //**********************************************************************************************
// Returns the alignment state reported by the wrapped matrix product.
6137  inline bool isAligned() const {
6138  return matrix_.isAligned();
6139  }
6140  //**********************************************************************************************
6141 
6142  //**********************************************************************************************
6147  inline bool canSMPAssign() const {
6148  typename MMM::RightOperand B( matrix_.rightOperand() );
6149  return ( !BLAZE_BLAS_IS_PARALLEL ||
6150  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
6151  ( B.columns() > SMP_TDMATDMATMULT_THRESHOLD );
6152  }
6153  //**********************************************************************************************
6154 
6155  private:
6156  //**Member variables****************************************************************************
6157  LeftOperand matrix_; // Left-hand side dense matrix of the multiplication expression.
6158  RightOperand scalar_; // Right-hand side scalar of the multiplication expression.
6159  //**********************************************************************************************
6160 
6161  //**Assignment to dense matrices****************************************************************
6173  template< typename MT // Type of the target dense matrix
6174  , bool SO > // Storage order of the target dense matrix
6175  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6176  {
6178 
6179  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6180  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6181 
6182  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6183  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
6184 
6185  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
6186  return;
6187  }
6188  else if( left.columns() == 0UL ) {
6189  reset( ~lhs );
6190  return;
6191  }
6192 
6193  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6194  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6195 
6196  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6197  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6198  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6199  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6200  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6201  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6202 
6203  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
6204  }
6205  //**********************************************************************************************
6206 
6207  //**Assignment to dense matrices (kernel selection)*********************************************
6218  template< typename MT3 // Type of the left-hand side target matrix
6219  , typename MT4 // Type of the left-hand side matrix operand
6220  , typename MT5 // Type of the right-hand side matrix operand
6221  , typename ST2 > // Type of the scalar value
6222  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6223  {
6224  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
6225  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
6226  selectSmallAssignKernel( C, A, B, scalar );
6227  else
6228  selectBlasAssignKernel( C, A, B, scalar );
6229  }
6230  //**********************************************************************************************
6231 
6232  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (non-vectorized) assignment C = scalar * A * B for a row-major target when neither
// operand is diagonal (see the EnableIf condition).
//   C       The row-major target matrix.
//   A, B    The left-hand and right-hand side matrix operands.
//   scalar  The scaling factor.
// Each row i of C is produced in three phases: initialisation with the product for the first
// relevant k, accumulation over the remaining k, and a final scaling by 'scalar'. The
// IsUpper/IsLower/IsStrict* traits of the operand types narrow the k and j ranges.
6246  template< typename MT3 // Type of the left-hand side target matrix
6247  , typename MT4 // Type of the left-hand side matrix operand
6248  , typename MT5 // Type of the right-hand side matrix operand
6249  , typename ST2 > // Type of the scalar value
6250  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6251  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6252  {
6253  const size_t M( A.rows() );
6254  const size_t N( B.columns() );
6255  const size_t K( A.columns() );
6256 
6257  for( size_t i=0UL; i<M; ++i )
6258  {
// Range [kbegin,kend) of inner indices used for row i, restricted by the triangular
// structure of A.
6259  const size_t kbegin( ( IsUpper<MT4>::value )
6260  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6261  :( 0UL ) );
6262  const size_t kend( ( IsLower<MT4>::value )
6263  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
6264  :( K ) );
6265  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
6266 
// Strictly triangular A with an empty k range: the whole row of C is reset.
6267  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
6268  for( size_t j=0UL; j<N; ++j ) {
6269  reset( (~C)(i,j) );
6270  }
6271  continue;
6272  }
6273 
// Phase 1: initialise row i with the k == kbegin product; columns outside
// [jbegin,jend) are reset where the operand structure calls for it.
6274  {
6275  const size_t jbegin( ( IsUpper<MT5>::value )
6276  ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
6277  :( 0UL ) );
6278  const size_t jend( ( IsLower<MT5>::value )
6279  ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
6280  :( N ) );
6281  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6282 
6283  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6284  for( size_t j=0UL; j<jbegin; ++j ) {
6285  reset( (~C)(i,j) );
6286  }
6287  }
6288  else if( IsStrictlyUpper<MT5>::value ) {
6289  reset( (~C)(i,0UL) );
6290  }
6291  for( size_t j=jbegin; j<jend; ++j ) {
6292  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6293  }
6294  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6295  for( size_t j=jend; j<N; ++j ) {
6296  reset( (~C)(i,j) );
6297  }
6298  }
6299  else if( IsStrictlyLower<MT5>::value ) {
6300  reset( (~C)(i,N-1UL) );
6301  }
6302  }
6303 
// Phase 2: accumulate the products for the remaining inner indices.
6304  for( size_t k=kbegin+1UL; k<kend; ++k )
6305  {
6306  const size_t jbegin( ( IsUpper<MT5>::value )
6307  ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
6308  :( 0UL ) );
6309  const size_t jend( ( IsLower<MT5>::value )
6310  ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
6311  :( N ) );
6312  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6313 
6314  for( size_t j=jbegin; j<jend; ++j ) {
6315  (~C)(i,j) += A(i,k) * B(k,j);
6316  }
// For a lower B the element in column jend is assigned rather than accumulated
// (presumably because this is the first k contributing to that column -- confirm).
6317  if( IsLower<MT5>::value ) {
6318  (~C)(i,jend) = A(i,k) * B(k,jend);
6319  }
6320  }
6321 
// Phase 3: scale the row; the range is narrowed when both operands are upper or
// both are lower.
6322  {
6323  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6324  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
6325  :( 0UL ) );
6326  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
6327  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
6328  :( N ) );
6329  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6330 
6331  for( size_t j=jbegin; j<jend; ++j ) {
6332  (~C)(i,j) *= scalar;
6333  }
6334  }
6335  }
6336  }
6337  //**********************************************************************************************
6338 
6339  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (non-vectorized) assignment C = scalar * A * B for a column-major target when
// neither operand is diagonal (see the EnableIf condition).
//   C       The column-major target matrix.
//   A, B    The left-hand and right-hand side matrix operands.
//   scalar  The scaling factor.
// Each column j of C is produced in three phases: initialisation with the product for the
// first relevant k, accumulation over the remaining k, and a final scaling by 'scalar'. The
// IsUpper/IsLower/IsStrict* traits of the operand types narrow the k and i ranges.
6353  template< typename MT3 // Type of the left-hand side target matrix
6354  , typename MT4 // Type of the left-hand side matrix operand
6355  , typename MT5 // Type of the right-hand side matrix operand
6356  , typename ST2 > // Type of the scalar value
6357  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6358  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6359  {
6360  const size_t M( A.rows() );
6361  const size_t N( B.columns() );
6362  const size_t K( A.columns() );
6363 
6364  for( size_t j=0UL; j<N; ++j )
6365  {
// Range [kbegin,kend) of inner indices used for column j, restricted by the triangular
// structure of B.
6366  const size_t kbegin( ( IsLower<MT5>::value )
6367  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6368  :( 0UL ) );
6369  const size_t kend( ( IsUpper<MT5>::value )
6370  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
6371  :( K ) );
6372  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
6373 
// Strictly triangular B with an empty k range: the whole column of C is reset.
6374  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
6375  for( size_t i=0UL; i<M; ++i ) {
6376  reset( (~C)(i,j) );
6377  }
6378  continue;
6379  }
6380 
// Phase 1: initialise column j with the k == kbegin product; rows outside
// [ibegin,iend) are reset where the operand structure calls for it.
6381  {
6382  const size_t ibegin( ( IsLower<MT4>::value )
6383  ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
6384  :( 0UL ) );
6385  const size_t iend( ( IsUpper<MT4>::value )
6386  ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
6387  :( M ) );
6388  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6389 
6390  if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6391  for( size_t i=0UL; i<ibegin; ++i ) {
6392  reset( (~C)(i,j) );
6393  }
6394  }
6395  else if( IsStrictlyLower<MT4>::value ) {
6396  reset( (~C)(0UL,j) );
6397  }
6398  for( size_t i=ibegin; i<iend; ++i ) {
6399  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6400  }
6401  if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6402  for( size_t i=iend; i<M; ++i ) {
6403  reset( (~C)(i,j) );
6404  }
6405  }
6406  else if( IsStrictlyUpper<MT4>::value ) {
6407  reset( (~C)(M-1UL,j) );
6408  }
6409  }
6410 
// Phase 2: accumulate the products for the remaining inner indices.
6411  for( size_t k=kbegin+1UL; k<kend; ++k )
6412  {
6413  const size_t ibegin( ( IsLower<MT4>::value )
6414  ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
6415  :( 0UL ) );
6416  const size_t iend( ( IsUpper<MT4>::value )
6417  ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
6418  :( M ) );
6419  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6420 
6421  for( size_t i=ibegin; i<iend; ++i ) {
6422  (~C)(i,j) += A(i,k) * B(k,j);
6423  }
// For an upper A the element in row iend is assigned rather than accumulated
// (presumably because this is the first k contributing to that row -- confirm).
6424  if( IsUpper<MT4>::value ) {
6425  (~C)(iend,j) = A(iend,k) * B(k,j);
6426  }
6427  }
6428 
// Phase 3: scale the column; the range is narrowed when both operands are lower or
// both are upper.
6429  {
6430  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
6431  ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
6432  :( 0UL ) );
6433  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6434  ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
6435  :( M ) );
6436  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6437 
6438  for( size_t i=ibegin; i<iend; ++i ) {
6439  (~C)(i,j) *= scalar;
6440  }
6441  }
6442  }
6443  }
6444  //**********************************************************************************************
6445 
6446  //**Default assignment to row-major dense matrices (general/diagonal)***************************
6460  template< typename MT3 // Type of the left-hand side target matrix
6461  , typename MT4 // Type of the left-hand side matrix operand
6462  , typename MT5 // Type of the right-hand side matrix operand
6463  , typename ST2 > // Type of the scalar value
6464  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6465  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6466  {
6467  const size_t M( A.rows() );
6468  const size_t N( B.columns() );
6469 
6470  const size_t block( 16UL );
6471 
6472  for( size_t ii=0UL; ii<M; ii+=block ) {
6473  const size_t iend( min( M, ii+block ) );
6474  for( size_t jj=0UL; jj<N; jj+=block ) {
6475  const size_t jend( min( N, jj+block ) );
6476  for( size_t i=ii; i<iend; ++i )
6477  {
6478  const size_t jbegin( ( IsUpper<MT4>::value )
6479  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
6480  :( jj ) );
6481  const size_t jpos( ( IsLower<MT4>::value )
6482  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
6483  :( jend ) );
6484 
6485  if( IsUpper<MT4>::value ) {
6486  for( size_t j=jj; j<jbegin; ++j ) {
6487  reset( (~C)(i,j) );
6488  }
6489  }
6490  for( size_t j=jbegin; j<jpos; ++j ) {
6491  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6492  }
6493  if( IsLower<MT4>::value ) {
6494  for( size_t j=jpos; j<jend; ++j ) {
6495  reset( (~C)(i,j) );
6496  }
6497  }
6498  }
6499  }
6500  }
6501  }
6502  //**********************************************************************************************
6503 
6504  //**Default assignment to column-major dense matrices (general/diagonal)************************
6518  template< typename MT3 // Type of the left-hand side target matrix
6519  , typename MT4 // Type of the left-hand side matrix operand
6520  , typename MT5 // Type of the right-hand side matrix operand
6521  , typename ST2 > // Type of the scalar value
6522  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6523  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6524  {
6525  const size_t M( A.rows() );
6526  const size_t N( B.columns() );
6527 
6528  for( size_t j=0UL; j<N; ++j )
6529  {
6530  const size_t ibegin( ( IsLower<MT4>::value )
6531  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6532  :( 0UL ) );
6533  const size_t iend( ( IsUpper<MT4>::value )
6534  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
6535  :( M ) );
6536  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6537 
6538  if( IsLower<MT4>::value ) {
6539  for( size_t i=0UL; i<ibegin; ++i ) {
6540  reset( (~C)(i,j) );
6541  }
6542  }
6543  for( size_t i=ibegin; i<iend; ++i ) {
6544  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6545  }
6546  if( IsUpper<MT4>::value ) {
6547  for( size_t i=iend; i<M; ++i ) {
6548  reset( (~C)(i,j) );
6549  }
6550  }
6551  }
6552  }
6553  //**********************************************************************************************
6554 
6555  //**Default assignment to row-major dense matrices (diagonal/general)***************************
6569  template< typename MT3 // Type of the left-hand side target matrix
6570  , typename MT4 // Type of the left-hand side matrix operand
6571  , typename MT5 // Type of the right-hand side matrix operand
6572  , typename ST2 > // Type of the scalar value
6573  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6574  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6575  {
6576  const size_t M( A.rows() );
6577  const size_t N( B.columns() );
6578 
6579  for( size_t i=0UL; i<M; ++i )
6580  {
6581  const size_t jbegin( ( IsUpper<MT5>::value )
6582  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6583  :( 0UL ) );
6584  const size_t jend( ( IsLower<MT5>::value )
6585  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
6586  :( N ) );
6587  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6588 
6589  if( IsUpper<MT5>::value ) {
6590  for( size_t j=0UL; j<jbegin; ++j ) {
6591  reset( (~C)(i,j) );
6592  }
6593  }
6594  for( size_t j=jbegin; j<jend; ++j ) {
6595  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6596  }
6597  if( IsLower<MT5>::value ) {
6598  for( size_t j=jend; j<N; ++j ) {
6599  reset( (~C)(i,j) );
6600  }
6601  }
6602  }
6603  }
6604  //**********************************************************************************************
6605 
6606  //**Default assignment to column-major dense matrices (diagonal/general)************************
6620  template< typename MT3 // Type of the left-hand side target matrix
6621  , typename MT4 // Type of the left-hand side matrix operand
6622  , typename MT5 // Type of the right-hand side matrix operand
6623  , typename ST2 > // Type of the scalar value
6624  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6625  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6626  {
6627  const size_t M( A.rows() );
6628  const size_t N( B.columns() );
6629 
6630  const size_t block( 16UL );
6631 
6632  for( size_t jj=0UL; jj<N; jj+=block ) {
6633  const size_t jend( min( N, jj+block ) );
6634  for( size_t ii=0UL; ii<M; ii+=block ) {
6635  const size_t iend( min( M, ii+block ) );
6636  for( size_t j=jj; j<jend; ++j )
6637  {
6638  const size_t ibegin( ( IsLower<MT5>::value )
6639  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
6640  :( ii ) );
6641  const size_t ipos( ( IsUpper<MT5>::value )
6642  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
6643  :( iend ) );
6644 
6645  if( IsLower<MT5>::value ) {
6646  for( size_t i=ii; i<ibegin; ++i ) {
6647  reset( (~C)(i,j) );
6648  }
6649  }
6650  for( size_t i=ibegin; i<ipos; ++i ) {
6651  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6652  }
6653  if( IsUpper<MT5>::value ) {
6654  for( size_t i=ipos; i<iend; ++i ) {
6655  reset( (~C)(i,j) );
6656  }
6657  }
6658  }
6659  }
6660  }
6661  }
6662  //**********************************************************************************************
6663 
6664  //**Default assignment to dense matrices (diagonal/diagonal)************************************
6678  template< typename MT3 // Type of the left-hand side target matrix
6679  , typename MT4 // Type of the left-hand side matrix operand
6680  , typename MT5 // Type of the right-hand side matrix operand
6681  , typename ST2 > // Type of the scalar value
6682  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6683  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6684  {
6685  reset( C );
6686 
6687  for( size_t i=0UL; i<A.rows(); ++i ) {
6688  C(i,i) = A(i,i) * B(i,i) * scalar;
6689  }
6690  }
6691  //**********************************************************************************************
6692 
6693  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment C = scalar * A * B, non-vectorized variant.
// Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false (DisableIf); it simply
// forwards all arguments to selectDefaultAssignKernel().
6707  template< typename MT3 // Type of the left-hand side target matrix
6708  , typename MT4 // Type of the left-hand side matrix operand
6709  , typename MT5 // Type of the right-hand side matrix operand
6710  , typename ST2 > // Type of the scalar value
6711  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6712  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6713  {
6714  selectDefaultAssignKernel( C, A, B, scalar );
6715  }
6716  //**********************************************************************************************
6717 
6718  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Small-matrix assignment C = scalar * A * B, vectorized variant for row-major targets.
// Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true (EnableIf).
//   C       The row-major target matrix.
//   A, B    The left-hand and right-hand side matrix operands.
//   scalar  The scaling factor; broadcast once into 'factor' via set().
// The columns of C are processed in panels of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements. Within the 4-, 2- and 1-vector panels rows are handled in pairs with a
// single-row remainder. For each tile the products are accumulated over k in the xmm*
// variables, multiplied by 'factor' and stored. The IsUpper/IsLower/IsStrict* traits of the
// operand types narrow the k range.
// NOTE(review): the xmm* accumulators are used without explicit initialisation; this assumes
// IntrinsicType default-constructs to zero -- confirm against the IntrinsicType definition.
6733  template< typename MT3 // Type of the left-hand side target matrix
6734  , typename MT4 // Type of the left-hand side matrix operand
6735  , typename MT5 // Type of the right-hand side matrix operand
6736  , typename ST2 > // Type of the scalar value
6737  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6738  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6739  {
6740  typedef IntrinsicTrait<ElementType> IT;
6741 
6742  const size_t M( A.rows() );
6743  const size_t N( B.columns() );
6744  const size_t K( A.columns() );
6745 
6746  const IntrinsicType factor( set( scalar ) );
6747 
6748  size_t j( 0UL );
6749 
// Panels of 8 intrinsic vectors, one row at a time.
6750  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
6751  for( size_t i=0UL; i<M; ++i )
6752  {
6753  const size_t kbegin( ( IsUpper<MT4>::value )
6754  ?( ( IsLower<MT5>::value )
6755  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6756  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6757  :( IsLower<MT5>::value ? j : 0UL ) );
6758  const size_t kend( ( IsLower<MT4>::value )
6759  ?( ( IsUpper<MT5>::value )
6760  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
6761  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6762  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
6763 
6764  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6765 
6766  for( size_t k=kbegin; k<kend; ++k ) {
6767  const IntrinsicType a1( set( A(i,k) ) );
6768  xmm1 = xmm1 + a1 * B.load(k,j );
6769  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
6770  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
6771  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
6772  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
6773  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
6774  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
6775  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
6776  }
6777 
6778  (~C).store( i, j , xmm1 * factor );
6779  (~C).store( i, j+IT::size , xmm2 * factor );
6780  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
6781  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
6782  (~C).store( i, j+IT::size*4UL, xmm5 * factor );
6783  (~C).store( i, j+IT::size*5UL, xmm6 * factor );
6784  (~C).store( i, j+IT::size*6UL, xmm7 * factor );
6785  (~C).store( i, j+IT::size*7UL, xmm8 * factor );
6786  }
6787  }
6788 
// Panels of 4 intrinsic vectors; rows in pairs, then a single remaining row.
6789  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL )
6790  {
6791  size_t i( 0UL );
6792 
6793  for( ; (i+2UL) <= M; i+=2UL )
6794  {
6795  const size_t kbegin( ( IsUpper<MT4>::value )
6796  ?( ( IsLower<MT5>::value )
6797  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6798  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6799  :( IsLower<MT5>::value ? j : 0UL ) );
6800  const size_t kend( ( IsLower<MT4>::value )
6801  ?( ( IsUpper<MT5>::value )
6802  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
6803  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6804  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
6805 
6806  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6807 
6808  for( size_t k=kbegin; k<kend; ++k ) {
6809  const IntrinsicType a1( set( A(i ,k) ) );
6810  const IntrinsicType a2( set( A(i+1UL,k) ) );
6811  const IntrinsicType b1( B.load(k,j ) );
6812  const IntrinsicType b2( B.load(k,j+IT::size ) );
6813  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
6814  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
6815  xmm1 = xmm1 + a1 * b1;
6816  xmm2 = xmm2 + a1 * b2;
6817  xmm3 = xmm3 + a1 * b3;
6818  xmm4 = xmm4 + a1 * b4;
6819  xmm5 = xmm5 + a2 * b1;
6820  xmm6 = xmm6 + a2 * b2;
6821  xmm7 = xmm7 + a2 * b3;
6822  xmm8 = xmm8 + a2 * b4;
6823  }
6824 
6825  (~C).store( i , j , xmm1 * factor );
6826  (~C).store( i , j+IT::size , xmm2 * factor );
6827  (~C).store( i , j+IT::size*2UL, xmm3 * factor );
6828  (~C).store( i , j+IT::size*3UL, xmm4 * factor );
6829  (~C).store( i+1UL, j , xmm5 * factor );
6830  (~C).store( i+1UL, j+IT::size , xmm6 * factor );
6831  (~C).store( i+1UL, j+IT::size*2UL, xmm7 * factor );
6832  (~C).store( i+1UL, j+IT::size*3UL, xmm8 * factor );
6833  }
6834 
// Remaining single row of the 4-vector panel.
6835  if( i < M )
6836  {
6837  const size_t kbegin( ( IsUpper<MT4>::value )
6838  ?( ( IsLower<MT5>::value )
6839  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6840  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6841  :( IsLower<MT5>::value ? j : 0UL ) );
6842  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
6843 
6844  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6845 
6846  for( size_t k=kbegin; k<kend; ++k ) {
6847  const IntrinsicType a1( set( A(i,k) ) );
6848  xmm1 = xmm1 + a1 * B.load(k,j );
6849  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
6850  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
6851  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
6852  }
6853 
6854  (~C).store( i, j , xmm1 * factor );
6855  (~C).store( i, j+IT::size , xmm2 * factor );
6856  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
6857  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
6858  }
6859  }
6860 
// Panels of 2 intrinsic vectors; rows in pairs, then a single remaining row.
6861  for( ; (j+IT::size) < N; j+=IT::size*2UL )
6862  {
6863  size_t i( 0UL );
6864 
6865  for( ; (i+2UL) <= M; i+=2UL )
6866  {
6867  const size_t kbegin( ( IsUpper<MT4>::value )
6868  ?( ( IsLower<MT5>::value )
6869  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6870  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6871  :( IsLower<MT5>::value ? j : 0UL ) );
6872  const size_t kend( ( IsLower<MT4>::value )
6873  ?( ( IsUpper<MT5>::value )
6874  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
6875  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6876  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
6877 
6878  IntrinsicType xmm1, xmm2, xmm3, xmm4;
6879 
6880  for( size_t k=kbegin; k<kend; ++k ) {
6881  const IntrinsicType a1( set( A(i ,k) ) );
6882  const IntrinsicType a2( set( A(i+1UL,k) ) );
6883  const IntrinsicType b1( B.load(k,j ) );
6884  const IntrinsicType b2( B.load(k,j+IT::size) );
6885  xmm1 = xmm1 + a1 * b1;
6886  xmm2 = xmm2 + a1 * b2;
6887  xmm3 = xmm3 + a2 * b1;
6888  xmm4 = xmm4 + a2 * b2;
6889  }
6890 
6891  (~C).store( i , j , xmm1 * factor );
6892  (~C).store( i , j+IT::size, xmm2 * factor );
6893  (~C).store( i+1UL, j , xmm3 * factor );
6894  (~C).store( i+1UL, j+IT::size, xmm4 * factor );
6895  }
6896 
// Remaining single row of the 2-vector panel.
6897  if( i < M )
6898  {
6899  const size_t kbegin( ( IsUpper<MT4>::value )
6900  ?( ( IsLower<MT5>::value )
6901  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6902  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6903  :( IsLower<MT5>::value ? j : 0UL ) );
6904  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
6905 
6906  IntrinsicType xmm1, xmm2;
6907 
6908  for( size_t k=kbegin; k<kend; ++k ) {
6909  const IntrinsicType a1( set( A(i,k) ) );
6910  xmm1 = xmm1 + a1 * B.load(k,j );
6911  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
6912  }
6913 
6914  (~C).store( i, j , xmm1 * factor );
6915  (~C).store( i, j+IT::size, xmm2 * factor );
6916  }
6917  }
6918 
// Final panel of a single intrinsic vector starting at column j.
// NOTE(review): a full vector is loaded/stored here even if fewer than IT::size columns
// remain; presumably this relies on padded rows -- confirm.
6919  if( j < N )
6920  {
6921  size_t i( 0UL );
6922 
6923  for( ; (i+2UL) <= M; i+=2UL )
6924  {
6925  const size_t kbegin( ( IsUpper<MT4>::value )
6926  ?( ( IsLower<MT5>::value )
6927  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6928  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6929  :( IsLower<MT5>::value ? j : 0UL ) );
6930  const size_t kend( ( IsLower<MT4>::value )
6931  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6932  :( K ) );
6933 
6934  IntrinsicType xmm1, xmm2;
6935 
6936  for( size_t k=kbegin; k<kend; ++k ) {
6937  const IntrinsicType b1( B.load(k,j) );
6938  xmm1 = xmm1 + set( A(i ,k) ) * b1;
6939  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
6940  }
6941 
6942  (~C).store( i , j, xmm1 * factor );
6943  (~C).store( i+1UL, j, xmm2 * factor );
6944  }
6945 
// Remaining single row of the final panel.
6946  if( i < M )
6947  {
6948  const size_t kbegin( ( IsUpper<MT4>::value )
6949  ?( ( IsLower<MT5>::value )
6950  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6951  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6952  :( IsLower<MT5>::value ? j : 0UL ) );
6953 
6954  IntrinsicType xmm1;
6955 
6956  for( size_t k=kbegin; k<K; ++k ) {
6957  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
6958  }
6959 
6960  (~C).store( i, j, xmm1 * factor );
6961  }
6962  }
6963  }
6964  //**********************************************************************************************
6965 
6966  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized small-matrix kernel: stores scalar * (A * B) into the target C.
//
// C is taken as DenseMatrix<MT3,true>; the SIMD direction is the row index i: A is read and
// C is written with load()/store() at row offsets that are multiples of IT::size, while the
// elements of B are broadcast via set(). Rows are processed in tiers of 8, 4, 2 and finally
// 1 SIMD vector(s); the 4/2/1 tiers additionally handle two columns of B per pass plus a
// single leftover column.
//
// \param C      Target matrix; every processed position is overwritten.
// \param A      Left-hand side operand (read via load(i,k), rows(), columns()).
// \param B      Right-hand side operand (read element-wise via B(k,j), columns()).
// \param scalar Scaling factor applied once to each accumulated sum before it is stored.
//
// NOTE(review): the accumulators are default-constructed and immediately used as running
// sums, so this relies on a default-constructed IntrinsicType being zero - confirm against
// the intrinsic wrapper types.
// NOTE(review): the loop conditions allow load()/store() to start less than one full vector
// (or block of vectors) before M, which presumably relies on padded storage - confirm.
6981  template< typename MT3 // Type of the left-hand side target matrix
6982  , typename MT4 // Type of the left-hand side matrix operand
6983  , typename MT5 // Type of the right-hand side matrix operand
6984  , typename ST2 > // Type of the scalar value
6985  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6986  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6987  {
6988  typedef IntrinsicTrait<ElementType> IT;
6989 
6990  const size_t M( A.rows() );
6991  const size_t N( B.columns() );
6992  const size_t K( A.columns() );
6993 
// Scalar broadcast into a SIMD vector; applied once per stored result.
6994  const IntrinsicType factor( set( scalar ) );
6995 
// Row cursor shared by all tiers below: each tier continues where the previous one stopped.
6996  size_t i( 0UL );
6997 
// Tier 1: eight SIMD vectors of rows at a time, one column of B per pass.
6998  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
6999  for( size_t j=0UL; j<N; ++j )
7000  {
// The k range is narrowed at compile time via the structure traits of the operands:
// a lower MT5 raises the start to j (j+1 if strictly lower), an upper MT4 raises it to i;
// an upper MT5 caps the end at j+1 (j if strictly upper), a lower MT4 caps it at the end
// of the current row block.
7001  const size_t kbegin( ( IsLower<MT5>::value )
7002  ?( ( IsUpper<MT4>::value )
7003  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7004  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7005  :( IsUpper<MT4>::value ? i : 0UL ) );
7006  const size_t kend( ( IsUpper<MT5>::value )
7007  ?( ( IsLower<MT4>::value )
7008  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
7009  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
7010  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
7011 
// Running sums for the eight row vectors of column j.
7012  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7013 
7014  for( size_t k=kbegin; k<kend; ++k ) {
7015  const IntrinsicType b1( set( B(k,j) ) );
7016  xmm1 = xmm1 + A.load(i ,k) * b1;
7017  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
7018  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
7019  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
7020  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
7021  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
7022  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
7023  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
7024  }
7025 
7026  (~C).store( i , j, xmm1 * factor );
7027  (~C).store( i+IT::size , j, xmm2 * factor );
7028  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
7029  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
7030  (~C).store( i+IT::size*4UL, j, xmm5 * factor );
7031  (~C).store( i+IT::size*5UL, j, xmm6 * factor );
7032  (~C).store( i+IT::size*6UL, j, xmm7 * factor );
7033  (~C).store( i+IT::size*7UL, j, xmm8 * factor );
7034  }
7035  }
7036 
// Tier 2: four SIMD vectors of rows, two columns of B per pass (xmm1-4: column j,
// xmm5-8: column j+1).
7037  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL )
7038  {
7039  size_t j( 0UL );
7040 
7041  for( ; (j+2UL) <= N; j+=2UL )
7042  {
// Same k-range narrowing as above, widened by one at the end to cover column j+1.
7043  const size_t kbegin( ( IsLower<MT5>::value )
7044  ?( ( IsUpper<MT4>::value )
7045  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7046  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7047  :( IsUpper<MT4>::value ? i : 0UL ) );
7048  const size_t kend( ( IsUpper<MT5>::value )
7049  ?( ( IsLower<MT4>::value )
7050  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7051  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7052  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
7053 
7054  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7055 
7056  for( size_t k=kbegin; k<kend; ++k ) {
7057  const IntrinsicType a1( A.load(i ,k) );
7058  const IntrinsicType a2( A.load(i+IT::size ,k) );
7059  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
7060  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
7061  const IntrinsicType b1( set( B(k,j ) ) );
7062  const IntrinsicType b2( set( B(k,j+1UL) ) );
7063  xmm1 = xmm1 + a1 * b1;
7064  xmm2 = xmm2 + a2 * b1;
7065  xmm3 = xmm3 + a3 * b1;
7066  xmm4 = xmm4 + a4 * b1;
7067  xmm5 = xmm5 + a1 * b2;
7068  xmm6 = xmm6 + a2 * b2;
7069  xmm7 = xmm7 + a3 * b2;
7070  xmm8 = xmm8 + a4 * b2;
7071  }
7072 
7073  (~C).store( i , j , xmm1 * factor );
7074  (~C).store( i+IT::size , j , xmm2 * factor );
7075  (~C).store( i+IT::size*2UL, j , xmm3 * factor );
7076  (~C).store( i+IT::size*3UL, j , xmm4 * factor );
7077  (~C).store( i , j+1UL, xmm5 * factor );
7078  (~C).store( i+IT::size , j+1UL, xmm6 * factor );
7079  (~C).store( i+IT::size*2UL, j+1UL, xmm7 * factor );
7080  (~C).store( i+IT::size*3UL, j+1UL, xmm8 * factor );
7081  }
7082 
// Leftover single column when N is odd. The end of the k range is only capped by the
// MT4 trait here; the MT5 upper trait is not consulted.
7083  if( j < N )
7084  {
7085  const size_t kbegin( ( IsLower<MT5>::value )
7086  ?( ( IsUpper<MT4>::value )
7087  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7088  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7089  :( IsUpper<MT4>::value ? i : 0UL ) );
7090  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
7091 
7092  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7093 
7094  for( size_t k=kbegin; k<kend; ++k ) {
7095  const IntrinsicType b1( set( B(k,j) ) );
7096  xmm1 = xmm1 + A.load(i ,k) * b1;
7097  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
7098  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
7099  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
7100  }
7101 
7102  (~C).store( i , j, xmm1 * factor );
7103  (~C).store( i+IT::size , j, xmm2 * factor );
7104  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
7105  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
7106  }
7107  }
7108 
// Tier 3: two SIMD vectors of rows, two columns of B per pass (xmm1-2: column j,
// xmm3-4: column j+1).
7109  for( ; (i+IT::size) < M; i+=IT::size*2UL )
7110  {
7111  size_t j( 0UL );
7112 
7113  for( ; (j+2UL) <= N; j+=2UL )
7114  {
7115  const size_t kbegin( ( IsLower<MT5>::value )
7116  ?( ( IsUpper<MT4>::value )
7117  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7118  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7119  :( IsUpper<MT4>::value ? i : 0UL ) );
7120  const size_t kend( ( IsUpper<MT5>::value )
7121  ?( ( IsLower<MT4>::value )
7122  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7123  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7124  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
7125 
7126  IntrinsicType xmm1, xmm2, xmm3, xmm4;
7127 
7128  for( size_t k=kbegin; k<kend; ++k ) {
7129  const IntrinsicType a1( A.load(i ,k) );
7130  const IntrinsicType a2( A.load(i+IT::size,k) );
7131  const IntrinsicType b1( set( B(k,j ) ) );
7132  const IntrinsicType b2( set( B(k,j+1UL) ) );
7133  xmm1 = xmm1 + a1 * b1;
7134  xmm2 = xmm2 + a2 * b1;
7135  xmm3 = xmm3 + a1 * b2;
7136  xmm4 = xmm4 + a2 * b2;
7137  }
7138 
7139  (~C).store( i , j , xmm1 * factor );
7140  (~C).store( i+IT::size, j , xmm2 * factor );
7141  (~C).store( i , j+1UL, xmm3 * factor );
7142  (~C).store( i+IT::size, j+1UL, xmm4 * factor );
7143  }
7144 
// Leftover single column when N is odd.
7145  if( j < N )
7146  {
7147  const size_t kbegin( ( IsLower<MT5>::value )
7148  ?( ( IsUpper<MT4>::value )
7149  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7150  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7151  :( IsUpper<MT4>::value ? i : 0UL ) );
7152  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
7153 
7154  IntrinsicType xmm1, xmm2;
7155 
7156  for( size_t k=kbegin; k<kend; ++k ) {
7157  const IntrinsicType b1( set( B(k,j) ) );
7158  xmm1 = xmm1 + A.load(i ,k) * b1;
7159  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
7160  }
7161 
7162  (~C).store( i , j, xmm1 * factor );
7163  (~C).store( i+IT::size, j, xmm2 * factor );
7164  }
7165  }
7166 
// Tier 4: the last (possibly partial) SIMD vector of rows. The end of the k range is
// not capped by the MT4 trait in this tier.
7167  if( i < M )
7168  {
7169  size_t j( 0UL );
7170 
7171  for( ; (j+2UL) <= N; j+=2UL )
7172  {
7173  const size_t kbegin( ( IsLower<MT5>::value )
7174  ?( ( IsUpper<MT4>::value )
7175  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7176  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7177  :( IsUpper<MT4>::value ? i : 0UL ) );
7178  const size_t kend( ( IsUpper<MT5>::value )
7179  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7180  :( K ) );
7181 
7182  IntrinsicType xmm1, xmm2;
7183 
7184  for( size_t k=kbegin; k<kend; ++k ) {
7185  const IntrinsicType a1( A.load(i,k) );
7186  xmm1 = xmm1 + a1 * set( B(k,j ) );
7187  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
7188  }
7189 
7190  (~C).store( i, j , xmm1 * factor );
7191  (~C).store( i, j+1UL, xmm2 * factor );
7192  }
7193 
// Leftover single column when N is odd; iterates up to K without an upper cap.
7194  if( j < N )
7195  {
7196  const size_t kbegin( ( IsLower<MT5>::value )
7197  ?( ( IsUpper<MT4>::value )
7198  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7199  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7200  :( IsUpper<MT4>::value ? i : 0UL ) );
7201 
7202  IntrinsicType xmm1;
7203 
7204  for( size_t k=kbegin; k<K; ++k ) {
7205  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
7206  }
7207 
7208  (~C).store( i, j, xmm1 * factor );
7209  }
7210  }
7211  }
7212  //**********************************************************************************************
7213 
7214  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel for the case in which the vectorized default kernel is
// not applicable (overload is enabled via DisableIf on UseVectorizedDefaultKernel).
// It performs no work of its own and simply forwards all arguments to
// selectDefaultAssignKernel().
//
// \param C      Target matrix.
// \param A      Left-hand side operand.
// \param B      Right-hand side operand.
// \param scalar Scaling factor.
7228  template< typename MT3 // Type of the left-hand side target matrix
7229  , typename MT4 // Type of the left-hand side matrix operand
7230  , typename MT5 // Type of the right-hand side matrix operand
7231  , typename ST2 > // Type of the scalar value
7232  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7233  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7234  {
7235  selectDefaultAssignKernel( C, A, B, scalar );
7236  }
7237  //**********************************************************************************************
7238 
7239  //**Vectorized default assignment to row-major dense matrices (large matrices)******************
7254  template< typename MT3 // Type of the left-hand side target matrix
7255  , typename MT4 // Type of the left-hand side matrix operand
7256  , typename MT5 // Type of the right-hand side matrix operand
7257  , typename ST2 > // Type of the scalar value
7258  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7259  selectLargeAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7260  {
7261  typedef IntrinsicTrait<ElementType> IT;
7262 
7263  const size_t M( A.rows() );
7264  const size_t N( B.columns() );
7265  const size_t K( A.columns() );
7266 
7267  const size_t iblock( 64UL );
7268  const size_t jblock( 128UL );
7269  const size_t kblock( 128UL );
7270 
7271  const IntrinsicType factor( set( scalar ) );
7272 
7273  for( size_t jj=0UL; jj<N; jj+=jblock )
7274  {
7275  const size_t jend( min( jj+jblock, N ) );
7276 
7277  for( size_t ii=0UL; ii<M; ii+=iblock )
7278  {
7279  const size_t iend( min( ii+iblock, M ) );
7280 
7281  for( size_t i=ii; i<iend; ++i ) {
7282  for( size_t j=jj; j<jend; ++j ) {
7283  reset( (~C)(i,j) );
7284  }
7285  }
7286 
7287  for( size_t kk=0UL; kk<K; kk+=kblock )
7288  {
7289  const size_t ktmp( min( kk+kblock, K ) );
7290 
7291  size_t j( jj );
7292 
7293  for( ; (j+IT::size*3UL) < jend; j+=IT::size*4UL )
7294  {
7295  const size_t j1( j+IT::size );
7296  const size_t j2( j+IT::size*2UL );
7297  const size_t j3( j+IT::size*3UL );
7298 
7299  size_t i( ii );
7300 
7301  for( ; (i+2UL) <= iend; i+=2UL )
7302  {
7303  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7304  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7305  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7306  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
7307 
7308  IntrinsicType xmm1( (~C).load(i ,j ) );
7309  IntrinsicType xmm2( (~C).load(i ,j1) );
7310  IntrinsicType xmm3( (~C).load(i ,j2) );
7311  IntrinsicType xmm4( (~C).load(i ,j3) );
7312  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
7313  IntrinsicType xmm6( (~C).load(i+1UL,j1) );
7314  IntrinsicType xmm7( (~C).load(i+1UL,j2) );
7315  IntrinsicType xmm8( (~C).load(i+1UL,j3) );
7316 
7317  for( size_t k=kbegin; k<kend; ++k ) {
7318  const IntrinsicType a1( set( A(i ,k) ) );
7319  const IntrinsicType a2( set( A(i+1UL,k) ) );
7320  const IntrinsicType b1( B.load(k,j ) );
7321  const IntrinsicType b2( B.load(k,j1) );
7322  const IntrinsicType b3( B.load(k,j2) );
7323  const IntrinsicType b4( B.load(k,j3) );
7324  xmm1 = xmm1 + a1 * b1;
7325  xmm2 = xmm2 + a1 * b2;
7326  xmm3 = xmm3 + a1 * b3;
7327  xmm4 = xmm4 + a1 * b4;
7328  xmm5 = xmm5 + a2 * b1;
7329  xmm6 = xmm6 + a2 * b2;
7330  xmm7 = xmm7 + a2 * b3;
7331  xmm8 = xmm8 + a2 * b4;
7332  }
7333 
7334  (~C).store( i , j , xmm1 * factor );
7335  (~C).store( i , j1, xmm2 * factor );
7336  (~C).store( i , j2, xmm3 * factor );
7337  (~C).store( i , j3, xmm4 * factor );
7338  (~C).store( i+1UL, j , xmm5 * factor );
7339  (~C).store( i+1UL, j1, xmm6 * factor );
7340  (~C).store( i+1UL, j2, xmm7 * factor );
7341  (~C).store( i+1UL, j3, xmm8 * factor );
7342  }
7343 
7344  if( i < iend )
7345  {
7346  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7347  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7348  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7349  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
7350 
7351  IntrinsicType xmm1( (~C).load(i,j ) );
7352  IntrinsicType xmm2( (~C).load(i,j1) );
7353  IntrinsicType xmm3( (~C).load(i,j2) );
7354  IntrinsicType xmm4( (~C).load(i,j3) );
7355 
7356  for( size_t k=kbegin; k<kend; ++k ) {
7357  const IntrinsicType a1( set( A(i,k) ) );
7358  xmm1 = xmm1 + a1 * B.load(k,j );
7359  xmm2 = xmm2 + a1 * B.load(k,j1);
7360  xmm3 = xmm3 + a1 * B.load(k,j2);
7361  xmm4 = xmm4 + a1 * B.load(k,j3);
7362  }
7363 
7364  (~C).store( i, j , xmm1 * factor );
7365  (~C).store( i, j1, xmm2 * factor );
7366  (~C).store( i, j2, xmm3 * factor );
7367  (~C).store( i, j3, xmm4 * factor );
7368  }
7369  }
7370 
7371  for( ; (j+IT::size) < jend; j+=IT::size*2UL )
7372  {
7373  const size_t j1( j+IT::size );
7374 
7375  size_t i( ii );
7376 
7377  for( ; (i+4UL) <= iend; i+=4UL )
7378  {
7379  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7380  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7381  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7382  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
7383 
7384  IntrinsicType xmm1( (~C).load(i ,j ) );
7385  IntrinsicType xmm2( (~C).load(i ,j1) );
7386  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
7387  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
7388  IntrinsicType xmm5( (~C).load(i+2UL,j ) );
7389  IntrinsicType xmm6( (~C).load(i+2UL,j1) );
7390  IntrinsicType xmm7( (~C).load(i+3UL,j ) );
7391  IntrinsicType xmm8( (~C).load(i+3UL,j1) );
7392 
7393  for( size_t k=kbegin; k<kend; ++k ) {
7394  const IntrinsicType a1( set( A(i ,k) ) );
7395  const IntrinsicType a2( set( A(i+1UL,k) ) );
7396  const IntrinsicType a3( set( A(i+2UL,k) ) );
7397  const IntrinsicType a4( set( A(i+3UL,k) ) );
7398  const IntrinsicType b1( B.load(k,j ) );
7399  const IntrinsicType b2( B.load(k,j1) );
7400  xmm1 = xmm1 + a1 * b1;
7401  xmm2 = xmm2 + a1 * b2;
7402  xmm3 = xmm3 + a2 * b1;
7403  xmm4 = xmm4 + a2 * b2;
7404  xmm5 = xmm5 + a3 * b1;
7405  xmm6 = xmm6 + a3 * b2;
7406  xmm7 = xmm7 + a4 * b1;
7407  xmm8 = xmm8 + a4 * b2;
7408  }
7409 
7410  (~C).store( i , j , xmm1 * factor );
7411  (~C).store( i , j1, xmm2 * factor );
7412  (~C).store( i+1UL, j , xmm3 * factor );
7413  (~C).store( i+1UL, j1, xmm4 * factor );
7414  (~C).store( i+2UL, j , xmm5 * factor );
7415  (~C).store( i+2UL, j1, xmm6 * factor );
7416  (~C).store( i+3UL, j , xmm7 * factor );
7417  (~C).store( i+3UL, j1, xmm8 * factor );
7418  }
7419 
7420  for( ; (i+2UL) <= iend; i+=2UL )
7421  {
7422  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7423  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7424  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7425  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
7426 
7427  IntrinsicType xmm1( (~C).load(i ,j ) );
7428  IntrinsicType xmm2( (~C).load(i ,j1) );
7429  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
7430  IntrinsicType xmm4( (~C).load(i+1UL,j1) );
7431 
7432  for( size_t k=kbegin; k<kend; ++k ) {
7433  const IntrinsicType a1( set( A(i ,k) ) );
7434  const IntrinsicType a2( set( A(i+1UL,k) ) );
7435  const IntrinsicType b1( B.load(k,j ) );
7436  const IntrinsicType b2( B.load(k,j1) );
7437  xmm1 = xmm1 + a1 * b1;
7438  xmm2 = xmm2 + a1 * b2;
7439  xmm3 = xmm3 + a2 * b1;
7440  xmm4 = xmm4 + a2 * b2;
7441  }
7442 
7443  (~C).store( i , j , xmm1 * factor );
7444  (~C).store( i , j1, xmm2 * factor );
7445  (~C).store( i+1UL, j , xmm3 * factor );
7446  (~C).store( i+1UL, j1, xmm4 * factor );
7447  }
7448 
7449  if( i < iend )
7450  {
7451  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7452  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7453  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7454  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
7455 
7456  IntrinsicType xmm1( (~C).load(i,j ) );
7457  IntrinsicType xmm2( (~C).load(i,j1) );
7458 
7459  for( size_t k=kbegin; k<kend; ++k ) {
7460  const IntrinsicType a1( set( A(i,k) ) );
7461  xmm1 = xmm1 + a1 * B.load(k,j );
7462  xmm2 = xmm2 + a1 * B.load(k,j1);
7463  }
7464 
7465  (~C).store( i, j , xmm1 * factor );
7466  (~C).store( i, j1, xmm2 * factor );
7467  }
7468  }
7469 
7470  if( j < jend )
7471  {
7472  for( size_t i=ii; i<iend; ++i )
7473  {
7474  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7475  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7476  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7477  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
7478 
7479  IntrinsicType xmm1( (~C).load(i,j) );
7480 
7481  for( size_t k=kbegin; k<kend; ++k ) {
7482  const IntrinsicType a1( set( A(i,k) ) );
7483  xmm1 = xmm1 + a1 * B.load(k,j);
7484  }
7485 
7486  (~C).store( i, j, xmm1 * factor );
7487  }
7488  }
7489  }
7490  }
7491  }
7492  }
7493  //**********************************************************************************************
7494 
7495  //**Vectorized default assignment to column-major dense matrices (large matrices)***************
7510  template< typename MT3 // Type of the left-hand side target matrix
7511  , typename MT4 // Type of the left-hand side matrix operand
7512  , typename MT5 // Type of the right-hand side matrix operand
7513  , typename ST2 > // Type of the scalar value
7514  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7515  selectLargeAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7516  {
7517  typedef IntrinsicTrait<ElementType> IT;
7518 
7519  const size_t M( A.rows() );
7520  const size_t N( B.columns() );
7521  const size_t K( A.columns() );
7522 
7523  const size_t iblock( 128UL );
7524  const size_t jblock( 64UL );
7525  const size_t kblock( 128UL );
7526 
7527  const IntrinsicType factor( set( scalar ) );
7528 
7529  for( size_t ii=0UL; ii<M; ii+=iblock )
7530  {
7531  const size_t iend( min( ii+iblock, M ) );
7532 
7533  for( size_t jj=0UL; jj<N; jj+=jblock )
7534  {
7535  const size_t jend( min( jj+jblock, N ) );
7536 
7537  for( size_t j=jj; j<jend; ++j ) {
7538  for( size_t i=ii; i<iend; ++i ) {
7539  reset( (~C)(i,j) );
7540  }
7541  }
7542 
7543  for( size_t kk=0UL; kk<K; kk+=kblock )
7544  {
7545  const size_t ktmp( min( kk+kblock, K ) );
7546 
7547  size_t i( ii );
7548 
7549  for( ; (i+IT::size*3UL) < iend; i+=IT::size*4UL )
7550  {
7551  const size_t i1( i+IT::size );
7552  const size_t i2( i+IT::size*2UL );
7553  const size_t i3( i+IT::size*3UL );
7554 
7555  size_t j( jj );
7556 
7557  for( ; (j+2UL) <= jend; j+=2UL )
7558  {
7559  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7560  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7561  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
7562  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7563 
7564  IntrinsicType xmm1( (~C).load(i ,j ) );
7565  IntrinsicType xmm2( (~C).load(i1,j ) );
7566  IntrinsicType xmm3( (~C).load(i2,j ) );
7567  IntrinsicType xmm4( (~C).load(i3,j ) );
7568  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
7569  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
7570  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
7571  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
7572 
7573  for( size_t k=kbegin; k<kend; ++k ) {
7574  const IntrinsicType a1( A.load(i ,k) );
7575  const IntrinsicType a2( A.load(i1,k) );
7576  const IntrinsicType a3( A.load(i2,k) );
7577  const IntrinsicType a4( A.load(i3,k) );
7578  const IntrinsicType b1( set( B(k,j ) ) );
7579  const IntrinsicType b2( set( B(k,j+1UL) ) );
7580  xmm1 = xmm1 + a1 * b1;
7581  xmm2 = xmm2 + a2 * b1;
7582  xmm3 = xmm3 + a3 * b1;
7583  xmm4 = xmm4 + a4 * b1;
7584  xmm5 = xmm5 + a1 * b2;
7585  xmm6 = xmm6 + a2 * b2;
7586  xmm7 = xmm7 + a3 * b2;
7587  xmm8 = xmm8 + a4 * b2;
7588  }
7589 
7590  (~C).store( i , j , xmm1 * factor );
7591  (~C).store( i1, j , xmm2 * factor );
7592  (~C).store( i2, j , xmm3 * factor );
7593  (~C).store( i3, j , xmm4 * factor );
7594  (~C).store( i , j+1UL, xmm5 * factor );
7595  (~C).store( i1, j+1UL, xmm6 * factor );
7596  (~C).store( i2, j+1UL, xmm7 * factor );
7597  (~C).store( i3, j+1UL, xmm8 * factor );
7598  }
7599 
7600  if( j < jend )
7601  {
7602  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7603  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7604  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
7605  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7606 
7607  IntrinsicType xmm1( (~C).load(i ,j) );
7608  IntrinsicType xmm2( (~C).load(i1,j) );
7609  IntrinsicType xmm3( (~C).load(i2,j) );
7610  IntrinsicType xmm4( (~C).load(i3,j) );
7611 
7612  for( size_t k=kbegin; k<kend; ++k ) {
7613  const IntrinsicType b1( set( B(k,j) ) );
7614  xmm1 = xmm1 + A.load(i ,k) * b1;
7615  xmm2 = xmm2 + A.load(i1,k) * b1;
7616  xmm3 = xmm3 + A.load(i2,k) * b1;
7617  xmm4 = xmm4 + A.load(i3,k) * b1;
7618  }
7619 
7620  (~C).store( i , j, xmm1 * factor );
7621  (~C).store( i1, j, xmm2 * factor );
7622  (~C).store( i2, j, xmm3 * factor );
7623  (~C).store( i3, j, xmm4 * factor );
7624  }
7625  }
7626 
7627  for( ; (i+IT::size) < iend; i+=IT::size*2UL )
7628  {
7629  const size_t i1( i+IT::size );
7630 
7631  size_t j( jj );
7632 
7633  for( ; (j+4UL) <= jend; j+=4UL )
7634  {
7635  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7636  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7637  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
7638  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7639 
7640  IntrinsicType xmm1( (~C).load(i ,j ) );
7641  IntrinsicType xmm2( (~C).load(i1,j ) );
7642  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
7643  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
7644  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
7645  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
7646  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
7647  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
7648 
7649  for( size_t k=kbegin; k<kend; ++k ) {
7650  const IntrinsicType a1( A.load(i ,k) );
7651  const IntrinsicType a2( A.load(i1,k) );
7652  const IntrinsicType b1( set( B(k,j ) ) );
7653  const IntrinsicType b2( set( B(k,j+1UL) ) );
7654  const IntrinsicType b3( set( B(k,j+2UL) ) );
7655  const IntrinsicType b4( set( B(k,j+3UL) ) );
7656  xmm1 = xmm1 + a1 * b1;
7657  xmm2 = xmm2 + a2 * b1;
7658  xmm3 = xmm3 + a1 * b2;
7659  xmm4 = xmm4 + a2 * b2;
7660  xmm5 = xmm5 + a1 * b3;
7661  xmm6 = xmm6 + a2 * b3;
7662  xmm7 = xmm7 + a1 * b4;
7663  xmm8 = xmm8 + a2 * b4;
7664  }
7665 
7666  (~C).store( i , j , xmm1 * factor );
7667  (~C).store( i1, j , xmm2 * factor );
7668  (~C).store( i , j+1UL, xmm3 * factor );
7669  (~C).store( i1, j+1UL, xmm4 * factor );
7670  (~C).store( i , j+2UL, xmm5 * factor );
7671  (~C).store( i1, j+2UL, xmm6 * factor );
7672  (~C).store( i , j+3UL, xmm7 * factor );
7673  (~C).store( i1, j+3UL, xmm8 * factor );
7674  }
7675 
7676  for( ; (j+2UL) <= jend; j+=2UL )
7677  {
7678  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7679  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7680  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
7681  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7682 
7683  IntrinsicType xmm1( (~C).load(i ,j ) );
7684  IntrinsicType xmm2( (~C).load(i1,j ) );
7685  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
7686  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
7687 
7688  for( size_t k=kbegin; k<kend; ++k ) {
7689  const IntrinsicType a1( A.load(i ,k) );
7690  const IntrinsicType a2( A.load(i1,k) );
7691  const IntrinsicType b1( set( B(k,j ) ) );
7692  const IntrinsicType b2( set( B(k,j+1UL) ) );
7693  xmm1 = xmm1 + a1 * b1;
7694  xmm2 = xmm2 + a2 * b1;
7695  xmm3 = xmm3 + a1 * b2;
7696  xmm4 = xmm4 + a2 * b2;
7697  }
7698 
7699  (~C).store( i , j , xmm1 * factor );
7700  (~C).store( i1, j , xmm2 * factor );
7701  (~C).store( i , j+1UL, xmm3 * factor );
7702  (~C).store( i1, j+1UL, xmm4 * factor );
7703  }
7704 
7705  if( j < jend )
7706  {
7707  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7708  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7709  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
7710  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7711 
7712  IntrinsicType xmm1( (~C).load(i ,j) );
7713  IntrinsicType xmm2( (~C).load(i1,j) );
7714 
7715  for( size_t k=kbegin; k<kend; ++k ) {
7716  const IntrinsicType b1( set( B(k,j) ) );
7717  xmm1 = xmm1 + A.load(i ,k) * b1;
7718  xmm2 = xmm2 + A.load(i1,k) * b1;
7719  }
7720 
7721  (~C).store( i , j, xmm1 * factor );
7722  (~C).store( i1, j, xmm2 * factor );
7723  }
7724  }
7725 
7726  if( i < iend )
7727  {
7728  for( size_t j=jj; j<jend; ++j )
7729  {
7730  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
7731  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
7732  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
7733  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7734 
7735  IntrinsicType xmm1( (~C).load(i,j) );
7736 
7737  for( size_t k=kbegin; k<kend; ++k ) {
7738  const IntrinsicType b1( set( B(k,j) ) );
7739  xmm1 = xmm1 + A.load(i,k) * b1;
7740  }
7741 
7742  (~C).store( i, j, xmm1 * factor );
7743  }
7744  }
7745  }
7746  }
7747  }
7748  }
7749  //**********************************************************************************************
7750 
7751  //**BLAS-based assignment to dense matrices (default)*******************************************
// BLAS assignment kernel for the case selected by UseDefaultKernel (enabled via EnableIf).
// No BLAS routine is called here: all arguments are forwarded unchanged to
// selectLargeAssignKernel().
//
// \param C      Target matrix.
// \param A      Left-hand side operand.
// \param B      Right-hand side operand.
// \param scalar Scaling factor.
7765  template< typename MT3 // Type of the left-hand side target matrix
7766  , typename MT4 // Type of the left-hand side matrix operand
7767  , typename MT5 // Type of the right-hand side matrix operand
7768  , typename ST2 > // Type of the scalar value
7769  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7770  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7771  {
7772  selectLargeAssignKernel( C, A, B, scalar );
7773  }
7774  //**********************************************************************************************
7775 
7776  //**BLAS-based assignment to dense matrices (single precision)**********************************
7777 #if BLAZE_BLAS_MODE
7778 
/*!\brief BLAS-based assignment of a scaled matrix product (single precision).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// Dispatches on the compile-time structure of the operands: without a triangular operand
// sgemm() is invoked; otherwise C is seeded with the non-triangular operand and strmm() is
// invoked with the triangular one (left operand takes precedence over the right one).
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
   selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value ) {
      // General case: plain matrix-matrix product
      sgemm( C, A, B, scalar, 0.0F );
   }
   else if( IsTriangular<MT4>::value ) {
      // Triangular left operand: C starts as a copy of B
      assign( C, B );
      strmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
   }
   else {
      // Triangular right operand: C starts as a copy of A
      assign( C, A );
      strmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
   }
}
7810 #endif
7811  //**********************************************************************************************
7812 
7813  //**BLAS-based assignment to dense matrices (double precision)**********************************
7814 #if BLAZE_BLAS_MODE
7815 
/*!\brief BLAS-based assignment of a scaled matrix product (double precision).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// Dispatches on the compile-time structure of the operands: without a triangular operand
// dgemm() is invoked; otherwise C is seeded with the non-triangular operand and dtrmm() is
// invoked with the triangular one (left operand takes precedence over the right one).
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
   selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value ) {
      // General case: plain matrix-matrix product
      dgemm( C, A, B, scalar, 0.0 );
   }
   else if( IsTriangular<MT4>::value ) {
      // Triangular left operand: C starts as a copy of B
      assign( C, B );
      dtrmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
   }
   else {
      // Triangular right operand: C starts as a copy of A
      assign( C, A );
      dtrmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
   }
}
7847 #endif
7848  //**********************************************************************************************
7849 
7850  //**BLAS-based assignment to dense matrices (single precision complex)**************************
7851 #if BLAZE_BLAS_MODE
7852 
/*!\brief BLAS-based assignment of a scaled matrix product (single precision complex).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// The scalar is passed on as complex<float>( scalar, 0.0F ). Without a triangular operand
// cgemm() is invoked; otherwise C is seeded with the non-triangular operand and ctrmm() is
// invoked with the triangular one (left operand takes precedence over the right one).
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
   selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value ) {
      // General case: plain matrix-matrix product
      cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 0.0F, 0.0F ) );
   }
   else if( IsTriangular<MT4>::value ) {
      // Triangular left operand: C starts as a copy of B
      assign( C, B );
      ctrmm( C, A, CblasLeft,
             ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
             complex<float>( scalar, 0.0F ) );
   }
   else {
      // Triangular right operand: C starts as a copy of A
      assign( C, A );
      ctrmm( C, B, CblasRight,
             ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
             complex<float>( scalar, 0.0F ) );
   }
}
7888 #endif
7889  //**********************************************************************************************
7890 
7891  //**BLAS-based assignment to dense matrices (double precision complex)**************************
7892 #if BLAZE_BLAS_MODE
7893 
/*!\brief BLAS-based assignment of a scaled matrix product (double precision complex).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// The scalar is passed on as complex<double>( scalar, 0.0 ). Without a triangular operand
// zgemm() is invoked; otherwise C is seeded with the non-triangular operand and ztrmm() is
// invoked with the triangular one (left operand takes precedence over the right one).
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
   selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value ) {
      // General case: plain matrix-matrix product
      zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 0.0, 0.0 ) );
   }
   else if( IsTriangular<MT4>::value ) {
      // Triangular left operand: C starts as a copy of B
      assign( C, B );
      ztrmm( C, A, CblasLeft,
             ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
             complex<double>( scalar, 0.0 ) );
   }
   else {
      // Triangular right operand: C starts as a copy of A
      assign( C, A );
      ztrmm( C, B, CblasRight,
             ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
             complex<double>( scalar, 0.0 ) );
   }
}
7929 #endif
7930  //**********************************************************************************************
7931 
7932  //**Assignment to sparse matrices***************************************************************
/*!\brief Assignment of a scaled matrix product expression to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// The expression is first evaluated serially into a temporary whose type is chosen by the
// storage order flag SO (ResultType if SO is set, OppositeType otherwise); the temporary
// is then assigned to the target.
*/
template< typename MT  // Type of the target sparse matrix
        , bool SO >    // Storage order of the target sparse matrix
friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
{
   typedef typename SelectType< SO, ResultType, OppositeType >::Type  EvaluatedType;

   BLAZE_INTERNAL_ASSERT( (~lhs).rows()    == rhs.rows()   , "Invalid number of rows"    );
   BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );

   // Serial evaluation of the complete expression into a dense temporary
   const EvaluatedType evaluated( serial( rhs ) );

   assign( ~lhs, evaluated );
}
7965  //**********************************************************************************************
7966 
7967  //**Addition assignment to dense matrices*******************************************************
/*!\brief Addition assignment of a scaled matrix product expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// Nothing is done if the target has no rows or no columns or if the left operand has no
// columns. Otherwise both operands are evaluated serially and the work is handed to
// selectAddAssignKernel() together with the scalar of the expression.
*/
template< typename MT  // Type of the target dense matrix
        , bool SO >    // Storage order of the target dense matrix
friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
{
   BLAZE_INTERNAL_ASSERT( (~lhs).rows()    == rhs.rows()   , "Invalid number of rows"    );
   BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );

   typename MMM::LeftOperand  lhsOperand( rhs.matrix_.leftOperand()  );
   typename MMM::RightOperand rhsOperand( rhs.matrix_.rightOperand() );

   // Only a non-empty product contributes anything to the target
   if( (~lhs).rows() != 0UL && (~lhs).columns() != 0UL && lhsOperand.columns() != 0UL )
   {
      LT A( serial( lhsOperand ) );  // Evaluation of the left-hand side dense matrix operand
      RT B( serial( rhsOperand ) );  // Evaluation of the right-hand side dense matrix operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == lhsOperand.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == lhsOperand.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( B.rows()    == rhsOperand.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( B.columns() == rhsOperand.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (~lhs).rows()       , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns()    , "Invalid number of columns" );

      DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
   }
}
8007  //**********************************************************************************************
8008 
8009  //**Addition assignment to dense matrices (kernel selection)************************************
8020  template< typename MT3 // Type of the left-hand side target matrix
8021  , typename MT4 // Type of the left-hand side matrix operand
8022  , typename MT5 // Type of the right-hand side matrix operand
8023  , typename ST2 > // Type of the scalar value
8024  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8025  {
8026  if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
8027  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
8028  selectSmallAddAssignKernel( C, A, B, scalar );
8029  else
8030  selectBlasAddAssignKernel( C, A, B, scalar );
8031  }
8032  //**********************************************************************************************
8033 
8034  //**Default addition assignment to dense matrices (general/general)*****************************
8048  template< typename MT3 // Type of the left-hand side target matrix
8049  , typename MT4 // Type of the left-hand side matrix operand
8050  , typename MT5 // Type of the right-hand side matrix operand
8051  , typename ST2 > // Type of the scalar value
8052  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
8053  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8054  {
8055  const ResultType tmp( serial( A * B * scalar ) );
8056  addAssign( C, tmp );
8057  }
8058  //**********************************************************************************************
8059 
8060  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
8074  template< typename MT3 // Type of the left-hand side target matrix
8075  , typename MT4 // Type of the left-hand side matrix operand
8076  , typename MT5 // Type of the right-hand side matrix operand
8077  , typename ST2 > // Type of the scalar value
8078  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
8079  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8080  {
8081  const size_t M( A.rows() );
8082  const size_t N( B.columns() );
8083 
8084  const size_t block( 16UL );
8085 
8086  for( size_t ii=0UL; ii<M; ii+=block ) {
8087  const size_t iend( min( M, ii+block ) );
8088  for( size_t jj=0UL; jj<N; jj+=block ) {
8089  const size_t jend( min( N, jj+block ) );
8090  for( size_t i=ii; i<iend; ++i )
8091  {
8092  const size_t jbegin( ( IsUpper<MT4>::value )
8093  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
8094  :( jj ) );
8095  const size_t jpos( ( IsLower<MT4>::value )
8096  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
8097  :( jend ) );
8098 
8099  for( size_t j=jbegin; j<jpos; ++j ) {
8100  (~C)(i,j) += A(i,j) * B(j,j) * scalar;
8101  }
8102  }
8103  }
8104  }
8105  }
8106  //**********************************************************************************************
8107 
8108  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
8122  template< typename MT3 // Type of the left-hand side target matrix
8123  , typename MT4 // Type of the left-hand side matrix operand
8124  , typename MT5 // Type of the right-hand side matrix operand
8125  , typename ST2 > // Type of the scalar value
8126  static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
8127  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8128  {
8129  const size_t M( A.rows() );
8130  const size_t N( B.columns() );
8131 
8132  for( size_t j=0UL; j<N; ++j )
8133  {
8134  const size_t ibegin( ( IsLower<MT4>::value )
8135  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
8136  :( 0UL ) );
8137  const size_t iend( ( IsUpper<MT4>::value )
8138  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
8139  :( M ) );
8140  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
8141 
8142  const size_t inum( iend - ibegin );
8143  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
8144 
8145  for( size_t i=ibegin; i<ipos; i+=2UL ) {
8146  (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
8147  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
8148  }
8149  if( ipos < iend ) {
8150  (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
8151  }
8152  }
8153  }
8154  //**********************************************************************************************
8155 
8156  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
8170  template< typename MT3 // Type of the left-hand side target matrix
8171  , typename MT4 // Type of the left-hand side matrix operand
8172  , typename MT5 // Type of the right-hand side matrix operand
8173  , typename ST2 > // Type of the scalar value
8174  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
8175  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8176  {
8177  const size_t M( A.rows() );
8178  const size_t N( B.columns() );
8179 
8180  for( size_t i=0UL; i<M; ++i )
8181  {
8182  const size_t jbegin( ( IsUpper<MT5>::value )
8183  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
8184  :( 0UL ) );
8185  const size_t jend( ( IsLower<MT5>::value )
8186  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
8187  :( N ) );
8188  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
8189 
8190  const size_t jnum( jend - jbegin );
8191  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
8192 
8193  for( size_t j=jbegin; j<jpos; j+=2UL ) {
8194  (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
8195  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
8196  }
8197  if( jpos < jend ) {
8198  (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
8199  }
8200  }
8201  }
8202  //**********************************************************************************************
8203 
8204  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
8218  template< typename MT3 // Type of the left-hand side target matrix
8219  , typename MT4 // Type of the left-hand side matrix operand
8220  , typename MT5 // Type of the right-hand side matrix operand
8221  , typename ST2 > // Type of the scalar value
8222  static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
8223  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8224  {
8225  const size_t M( A.rows() );
8226  const size_t N( B.columns() );
8227 
8228  const size_t block( 16UL );
8229 
8230  for( size_t jj=0UL; jj<N; jj+=block ) {
8231  const size_t jend( min( N, jj+block ) );
8232  for( size_t ii=0UL; ii<M; ii+=block ) {
8233  const size_t iend( min( M, ii+block ) );
8234  for( size_t j=jj; j<jend; ++j )
8235  {
8236  const size_t ibegin( ( IsLower<MT5>::value )
8237  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
8238  :( ii ) );
8239  const size_t ipos( ( IsUpper<MT5>::value )
8240  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
8241  :( iend ) );
8242 
8243  for( size_t i=ibegin; i<ipos; ++i ) {
8244  (~C)(i,j) += A(i,i) * B(i,j) * scalar;
8245  }
8246  }
8247  }
8248  }
8249  }
8250  //**********************************************************************************************
8251 
8252  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
8266  template< typename MT3 // Type of the left-hand side target matrix
8267  , typename MT4 // Type of the left-hand side matrix operand
8268  , typename MT5 // Type of the right-hand side matrix operand
8269  , typename ST2 > // Type of the scalar value
8270  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
8271  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8272  {
8273  for( size_t i=0UL; i<A.rows(); ++i ) {
8274  C(i,i) += A(i,i) * B(i,i) * scalar;
8275  }
8276  }
8277  //**********************************************************************************************
8278 
8279  //**Default addition assignment to dense matrices (small matrices)******************************
8293  template< typename MT3 // Type of the left-hand side target matrix
8294  , typename MT4 // Type of the left-hand side matrix operand
8295  , typename MT5 // Type of the right-hand side matrix operand
8296  , typename ST2 > // Type of the scalar value
8297  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8298  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8299  {
8300  selectDefaultAddAssignKernel( C, A, B, scalar );
8301  }
8302  //**********************************************************************************************
8303 
8304  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
/*!\brief Vectorized default addition assignment of a scaled matrix product to a row-major
//        dense matrix, small-matrix variant (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The columns of \a C are processed in strips of 8, 4, 2 and finally 1 intrinsic vector(s),
// addressed in steps of IT::size columns. For each strip the products A(i,k)*B(k,j...) are
// accumulated over k in intrinsic registers; the accumulated vectors are multiplied by the
// broadcast scalar and added to the current content of \a C. The k-range [kbegin,kend) is
// narrowed depending on IsUpper/IsLower/IsStrictlyUpper/IsStrictlyLower of the operand types.
//
// NOTE(review): the accumulators xmm* are declared without an initializer; this presumably
// relies on IntrinsicType default-constructing to zero -- confirm.
// NOTE(review): the strip conditions only require the last vector of a strip to start before
// column N, so loads/stores may extend past column N; presumably the storage is padded to a
// multiple of IT::size -- confirm.
*/
8319  template< typename MT3 // Type of the left-hand side target matrix
8320  , typename MT4 // Type of the left-hand side matrix operand
8321  , typename MT5 // Type of the right-hand side matrix operand
8322  , typename ST2 > // Type of the scalar value
8323  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8324  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8325  {
8326  typedef IntrinsicTrait<ElementType> IT;
8327 
8328  const size_t M( A.rows() );
8329  const size_t N( B.columns() );
8330  const size_t K( A.columns() );
8331 
      // Scalar broadcast into an intrinsic vector; applied once per result vector after the k-loop
8332  const IntrinsicType factor( set( scalar ) );
8333 
      // Column index shared by all strip loops below; each loop continues where the previous one stopped
8334  size_t j( 0UL );
8335 
      // Strips of 8 vectors: one row of C at a time
8336  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
8337  for( size_t i=0UL; i<M; ++i )
8338  {
      // Lower k bound: row index i for upper A (i+1 if strictly upper), strip start j for lower B
8339  const size_t kbegin( ( IsUpper<MT4>::value )
8340  ?( ( IsLower<MT5>::value )
8341  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8342  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8343  :( IsLower<MT5>::value ? j : 0UL ) );
      // Upper k bound: i+1 for lower A (i if strictly lower), strip end for upper B, never beyond K
8344  const size_t kend( ( IsLower<MT4>::value )
8345  ?( ( IsUpper<MT5>::value )
8346  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
8347  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
8348  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
8349 
8350  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8351 
      // Accumulate A(i,k) (broadcast) times eight consecutive vectors of row k of B
8352  for( size_t k=kbegin; k<kend; ++k ) {
8353  const IntrinsicType a1( set( A(i,k) ) );
8354  xmm1 = xmm1 + a1 * B.load(k,j );
8355  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
8356  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
8357  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
8358  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
8359  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
8360  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
8361  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
8362  }
8363 
      // C(i,j...) += accumulated product * scalar
8364  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8365  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
8366  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
8367  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
8368  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
8369  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
8370  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
8371  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
8372  }
8373  }
8374 
      // Strips of 4 vectors: two rows of C at a time, then a possible single remaining row
8375  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL )
8376  {
8377  size_t i( 0UL );
8378 
8379  for( ; (i+2UL) <= M; i+=2UL )
8380  {
8381  const size_t kbegin( ( IsUpper<MT4>::value )
8382  ?( ( IsLower<MT5>::value )
8383  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8384  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8385  :( IsLower<MT5>::value ? j : 0UL ) );
      // The upper bound accounts for the second row (i+1) of the pair
8386  const size_t kend( ( IsLower<MT4>::value )
8387  ?( ( IsUpper<MT5>::value )
8388  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
8389  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8390  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
8391 
      // xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1
8392  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8393 
8394  for( size_t k=kbegin; k<kend; ++k ) {
8395  const IntrinsicType a1( set( A(i ,k) ) );
8396  const IntrinsicType a2( set( A(i+1UL,k) ) );
8397  const IntrinsicType b1( B.load(k,j ) );
8398  const IntrinsicType b2( B.load(k,j+IT::size ) );
8399  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
8400  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
8401  xmm1 = xmm1 + a1 * b1;
8402  xmm2 = xmm2 + a1 * b2;
8403  xmm3 = xmm3 + a1 * b3;
8404  xmm4 = xmm4 + a1 * b4;
8405  xmm5 = xmm5 + a2 * b1;
8406  xmm6 = xmm6 + a2 * b2;
8407  xmm7 = xmm7 + a2 * b3;
8408  xmm8 = xmm8 + a2 * b4;
8409  }
8410 
8411  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8412  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
8413  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
8414  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
8415  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8416  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
8417  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
8418  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
8419  }
8420 
      // Remaining single row (odd M); here the upper k bound is only narrowed for upper B
8421  if( i < M )
8422  {
8423  const size_t kbegin( ( IsUpper<MT4>::value )
8424  ?( ( IsLower<MT5>::value )
8425  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8426  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8427  :( IsLower<MT5>::value ? j : 0UL ) );
8428  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
8429 
8430  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8431 
8432  for( size_t k=kbegin; k<kend; ++k ) {
8433  const IntrinsicType a1( set( A(i,k) ) );
8434  xmm1 = xmm1 + a1 * B.load(k,j );
8435  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
8436  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
8437  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
8438  }
8439 
8440  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8441  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
8442  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
8443  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
8444  }
8445  }
8446 
      // Strips of 2 vectors: two rows of C at a time, then a possible single remaining row
8447  for( ; (j+IT::size) < N; j+=IT::size*2UL )
8448  {
8449  size_t i( 0UL );
8450 
8451  for( ; (i+2UL) <= M; i+=2UL )
8452  {
8453  const size_t kbegin( ( IsUpper<MT4>::value )
8454  ?( ( IsLower<MT5>::value )
8455  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8456  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8457  :( IsLower<MT5>::value ? j : 0UL ) );
8458  const size_t kend( ( IsLower<MT4>::value )
8459  ?( ( IsUpper<MT5>::value )
8460  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
8461  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8462  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
8463 
      // xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1
8464  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8465 
8466  for( size_t k=kbegin; k<kend; ++k ) {
8467  const IntrinsicType a1( set( A(i ,k) ) );
8468  const IntrinsicType a2( set( A(i+1UL,k) ) );
8469  const IntrinsicType b1( B.load(k,j ) );
8470  const IntrinsicType b2( B.load(k,j+IT::size) );
8471  xmm1 = xmm1 + a1 * b1;
8472  xmm2 = xmm2 + a1 * b2;
8473  xmm3 = xmm3 + a2 * b1;
8474  xmm4 = xmm4 + a2 * b2;
8475  }
8476 
8477  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8478  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
8479  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8480  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
8481  }
8482 
      // Remaining single row (odd M)
8483  if( i < M )
8484  {
8485  const size_t kbegin( ( IsUpper<MT4>::value )
8486  ?( ( IsLower<MT5>::value )
8487  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8488  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8489  :( IsLower<MT5>::value ? j : 0UL ) );
8490  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
8491 
8492  IntrinsicType xmm1, xmm2;
8493 
8494  for( size_t k=kbegin; k<kend; ++k ) {
8495  const IntrinsicType a1( set( A(i,k) ) );
8496  xmm1 = xmm1 + a1 * B.load(k,j );
8497  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
8498  }
8499 
8500  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8501  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
8502  }
8503  }
8504 
      // Final strip of a single vector (if any columns are left)
8505  if( j < N )
8506  {
8507  size_t i( 0UL );
8508 
8509  for( ; (i+2UL) <= M; i+=2UL )
8510  {
8511  const size_t kbegin( ( IsUpper<MT4>::value )
8512  ?( ( IsLower<MT5>::value )
8513  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8514  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8515  :( IsLower<MT5>::value ? j : 0UL ) );
      // Only the structure of A narrows the upper k bound in the last strip
8516  const size_t kend( ( IsLower<MT4>::value )
8517  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8518  :( K ) );
8519 
8520  IntrinsicType xmm1, xmm2;
8521 
8522  for( size_t k=kbegin; k<kend; ++k ) {
8523  const IntrinsicType b1( B.load(k,j) );
8524  xmm1 = xmm1 + set( A(i ,k) ) * b1;
8525  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
8526  }
8527 
8528  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8529  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
8530  }
8531 
      // Remaining single row (odd M): the k-loop runs up to K
8532  if( i < M )
8533  {
8534  const size_t kbegin( ( IsUpper<MT4>::value )
8535  ?( ( IsLower<MT5>::value )
8536  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8537  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8538  :( IsLower<MT5>::value ? j : 0UL ) );
8539 
8540  IntrinsicType xmm1;
8541 
8542  for( size_t k=kbegin; k<K; ++k ) {
8543  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
8544  }
8545 
8546  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
8547  }
8548  }
8549  }
8550  //**********************************************************************************************
8551 
8552  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
/*!\brief Vectorized default addition assignment of a scaled matrix product to a column-major
//        dense matrix, small-matrix variant (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The rows of \a C are processed in strips of 8, 4, 2 and finally 1 intrinsic vector(s),
// addressed in steps of IT::size rows. For each strip the products A(i...,k)*B(k,j) are
// accumulated over k in intrinsic registers; the accumulated vectors are multiplied by the
// broadcast scalar and added to the current content of \a C. The k-range [kbegin,kend) is
// narrowed depending on IsUpper/IsLower/IsStrictlyUpper/IsStrictlyLower of the operand types.
//
// NOTE(review): the accumulators xmm* are declared without an initializer; this presumably
// relies on IntrinsicType default-constructing to zero -- confirm.
// NOTE(review): the strip conditions only require the last vector of a strip to start before
// row M, so loads/stores may extend past row M; presumably the storage is padded to a
// multiple of IT::size -- confirm.
*/
8567  template< typename MT3 // Type of the left-hand side target matrix
8568  , typename MT4 // Type of the left-hand side matrix operand
8569  , typename MT5 // Type of the right-hand side matrix operand
8570  , typename ST2 > // Type of the scalar value
8571  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8572  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8573  {
8574  typedef IntrinsicTrait<ElementType> IT;
8575 
8576  const size_t M( A.rows() );
8577  const size_t N( B.columns() );
8578  const size_t K( A.columns() );
8579 
      // Scalar broadcast into an intrinsic vector; applied once per result vector after the k-loop
8580  const IntrinsicType factor( set( scalar ) );
8581 
      // Row index shared by all strip loops below; each loop continues where the previous one stopped
8582  size_t i( 0UL );
8583 
      // Strips of 8 vectors: one column of C at a time
8584  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
8585  for( size_t j=0UL; j<N; ++j )
8586  {
      // Lower k bound: column index j for lower B (j+1 if strictly lower), strip start i for upper A
8587  const size_t kbegin( ( IsLower<MT5>::value )
8588  ?( ( IsUpper<MT4>::value )
8589  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8590  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8591  :( IsUpper<MT4>::value ? i : 0UL ) );
      // Upper k bound: j+1 for upper B (j if strictly upper), strip end for lower A, never beyond K
8592  const size_t kend( ( IsUpper<MT5>::value )
8593  ?( ( IsLower<MT4>::value )
8594  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
8595  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
8596  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
8597 
8598  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8599 
      // Accumulate eight consecutive vectors of column k of A times B(k,j) (broadcast)
8600  for( size_t k=kbegin; k<kend; ++k ) {
8601  const IntrinsicType b1( set( B(k,j) ) );
8602  xmm1 = xmm1 + A.load(i ,k) * b1;
8603  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
8604  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
8605  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
8606  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
8607  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
8608  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
8609  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
8610  }
8611 
      // C(i...,j) += accumulated product * scalar
8612  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8613  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
8614  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
8615  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
8616  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) + xmm5 * factor );
8617  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) + xmm6 * factor );
8618  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) + xmm7 * factor );
8619  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) + xmm8 * factor );
8620  }
8621  }
8622 
      // Strips of 4 vectors: two columns of C at a time, then a possible single remaining column
8623  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL )
8624  {
8625  size_t j( 0UL );
8626 
8627  for( ; (j+2UL) <= N; j+=2UL )
8628  {
8629  const size_t kbegin( ( IsLower<MT5>::value )
8630  ?( ( IsUpper<MT4>::value )
8631  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8632  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8633  :( IsUpper<MT4>::value ? i : 0UL ) );
      // The upper bound accounts for the second column (j+1) of the pair
8634  const size_t kend( ( IsUpper<MT5>::value )
8635  ?( ( IsLower<MT4>::value )
8636  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8637  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8638  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
8639 
      // xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1
8640  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8641 
8642  for( size_t k=kbegin; k<kend; ++k ) {
8643  const IntrinsicType a1( A.load(i ,k) );
8644  const IntrinsicType a2( A.load(i+IT::size ,k) );
8645  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
8646  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
8647  const IntrinsicType b1( set( B(k,j ) ) );
8648  const IntrinsicType b2( set( B(k,j+1UL) ) );
8649  xmm1 = xmm1 + a1 * b1;
8650  xmm2 = xmm2 + a2 * b1;
8651  xmm3 = xmm3 + a3 * b1;
8652  xmm4 = xmm4 + a4 * b1;
8653  xmm5 = xmm5 + a1 * b2;
8654  xmm6 = xmm6 + a2 * b2;
8655  xmm7 = xmm7 + a3 * b2;
8656  xmm8 = xmm8 + a4 * b2;
8657  }
8658 
8659  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8660  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) + xmm2 * factor );
8661  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) + xmm3 * factor );
8662  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) + xmm4 * factor );
8663  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
8664  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) + xmm6 * factor );
8665  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) + xmm7 * factor );
8666  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) + xmm8 * factor );
8667  }
8668 
      // Remaining single column (odd N); here the upper k bound is only narrowed for lower A
8669  if( j < N )
8670  {
8671  const size_t kbegin( ( IsLower<MT5>::value )
8672  ?( ( IsUpper<MT4>::value )
8673  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8674  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8675  :( IsUpper<MT4>::value ? i : 0UL ) );
8676  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
8677 
8678  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8679 
8680  for( size_t k=kbegin; k<kend; ++k ) {
8681  const IntrinsicType b1( set( B(k,j) ) );
8682  xmm1 = xmm1 + A.load(i ,k) * b1;
8683  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
8684  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
8685  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
8686  }
8687 
8688  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8689  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
8690  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
8691  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
8692  }
8693  }
8694 
      // Strips of 2 vectors: two columns of C at a time, then a possible single remaining column
8695  for( ; (i+IT::size) < M; i+=IT::size*2UL )
8696  {
8697  size_t j( 0UL );
8698 
8699  for( ; (j+2UL) <= N; j+=2UL )
8700  {
8701  const size_t kbegin( ( IsLower<MT5>::value )
8702  ?( ( IsUpper<MT4>::value )
8703  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8704  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8705  :( IsUpper<MT4>::value ? i : 0UL ) );
8706  const size_t kend( ( IsUpper<MT5>::value )
8707  ?( ( IsLower<MT4>::value )
8708  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8709  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8710  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
8711 
      // xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1
8712  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8713 
8714  for( size_t k=kbegin; k<kend; ++k ) {
8715  const IntrinsicType a1( A.load(i ,k) );
8716  const IntrinsicType a2( A.load(i+IT::size,k) );
8717  const IntrinsicType b1( set( B(k,j ) ) );
8718  const IntrinsicType b2( set( B(k,j+1UL) ) );
8719  xmm1 = xmm1 + a1 * b1;
8720  xmm2 = xmm2 + a2 * b1;
8721  xmm3 = xmm3 + a1 * b2;
8722  xmm4 = xmm4 + a2 * b2;
8723  }
8724 
8725  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8726  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) + xmm2 * factor );
8727  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
8728  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) + xmm4 * factor );
8729  }
8730 
      // Remaining single column (odd N)
8731  if( j < N )
8732  {
8733  const size_t kbegin( ( IsLower<MT5>::value )
8734  ?( ( IsUpper<MT4>::value )
8735  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8736  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8737  :( IsUpper<MT4>::value ? i : 0UL ) );
8738  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
8739 
8740  IntrinsicType xmm1, xmm2;
8741 
8742  for( size_t k=kbegin; k<kend; ++k ) {
8743  const IntrinsicType b1( set( B(k,j) ) );
8744  xmm1 = xmm1 + A.load(i ,k) * b1;
8745  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
8746  }
8747 
8748  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8749  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) + xmm2 * factor );
8750  }
8751  }
8752 
      // Final strip of a single vector (if any rows are left)
8753  if( i < M )
8754  {
8755  size_t j( 0UL );
8756 
8757  for( ; (j+2UL) <= N; j+=2UL )
8758  {
8759  const size_t kbegin( ( IsLower<MT5>::value )
8760  ?( ( IsUpper<MT4>::value )
8761  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8762  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8763  :( IsUpper<MT4>::value ? i : 0UL ) );
      // Only the structure of B narrows the upper k bound in the last strip
8764  const size_t kend( ( IsUpper<MT5>::value )
8765  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8766  :( K ) );
8767 
8768  IntrinsicType xmm1, xmm2;
8769 
8770  for( size_t k=kbegin; k<kend; ++k ) {
8771  const IntrinsicType a1( A.load(i,k) );
8772  xmm1 = xmm1 + a1 * set( B(k,j ) );
8773  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
8774  }
8775 
8776  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8777  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
8778  }
8779 
      // Remaining single column (odd N): the k-loop runs up to K
8780  if( j < N )
8781  {
8782  const size_t kbegin( ( IsLower<MT5>::value )
8783  ?( ( IsUpper<MT4>::value )
8784  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8785  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8786  :( IsUpper<MT4>::value ? i : 0UL ) );
8787 
8788  IntrinsicType xmm1;
8789 
8790  for( size_t k=kbegin; k<K; ++k ) {
8791  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
8792  }
8793 
8794  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
8795  }
8796  }
8797  }
8798  //**********************************************************************************************
8799 
8800  //**Default addition assignment to dense matrices (large matrices)******************************
8814  template< typename MT3 // Type of the left-hand side target matrix
8815  , typename MT4 // Type of the left-hand side matrix operand
8816  , typename MT5 // Type of the right-hand side matrix operand
8817  , typename ST2 > // Type of the scalar value
8818  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8819  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8820  {
8821  selectDefaultAddAssignKernel( C, A, B, scalar );
8822  }
8823  //**********************************************************************************************
8824 
8825  //**Vectorized default addition assignment to row-major dense matrices (large matrices)*********
8840  template< typename MT3 // Type of the left-hand side target matrix
8841  , typename MT4 // Type of the left-hand side matrix operand
8842  , typename MT5 // Type of the right-hand side matrix operand
8843  , typename ST2 > // Type of the scalar value
8844  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8845  selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
8846  {
8847  typedef IntrinsicTrait<ElementType> IT;
8848 
8849  const size_t M( A.rows() );
8850  const size_t N( B.columns() );
8851  const size_t K( A.columns() );
8852 
8853  const size_t iblock( 64UL );
8854  const size_t jblock( 128UL );
8855  const size_t kblock( 128UL );
8856 
8857  const IntrinsicType factor( set( scalar ) );
8858 
8859  for( size_t jj=0UL; jj<N; jj+=jblock )
8860  {
8861  const size_t jend( min( jj+jblock, N ) );
8862 
8863  for( size_t ii=0UL; ii<M; ii+=iblock )
8864  {
8865  const size_t iend( min( ii+iblock, M ) );
8866 
8867  for( size_t kk=0UL; kk<K; kk+=kblock )
8868  {
8869  const size_t ktmp( min( kk+kblock, K ) );
8870 
8871  size_t j( jj );
8872 
8873  for( ; (j+IT::size*3UL) < jend; j+=IT::size*4UL )
8874  {
8875  const size_t j1( j+IT::size );
8876  const size_t j2( j+IT::size*2UL );
8877  const size_t j3( j+IT::size*3UL );
8878 
8879  size_t i( ii );
8880 
8881  for( ; (i+2UL) <= iend; i+=2UL )
8882  {
8883  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8884  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8885  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8886  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
8887 
8888  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8889 
8890  for( size_t k=kbegin; k<kend; ++k ) {
8891  const IntrinsicType a1( set( A(i ,k) ) );
8892  const IntrinsicType a2( set( A(i+1UL,k) ) );
8893  const IntrinsicType b1( B.load(k,j ) );
8894  const IntrinsicType b2( B.load(k,j1) );
8895  const IntrinsicType b3( B.load(k,j2) );
8896  const IntrinsicType b4( B.load(k,j3) );
8897  xmm1 = xmm1 + a1 * b1;
8898  xmm2 = xmm2 + a1 * b2;
8899  xmm3 = xmm3 + a1 * b3;
8900  xmm4 = xmm4 + a1 * b4;
8901  xmm5 = xmm5 + a2 * b1;
8902  xmm6 = xmm6 + a2 * b2;
8903  xmm7 = xmm7 + a2 * b3;
8904  xmm8 = xmm8 + a2 * b4;
8905  }
8906 
8907  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8908  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8909  (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
8910  (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
8911  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8912  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
8913  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
8914  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
8915  }
8916 
8917  if( i < iend )
8918  {
8919  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8920  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8921  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
8922  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
8923 
8924  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8925 
8926  for( size_t k=kbegin; k<kend; ++k ) {
8927  const IntrinsicType a1( set( A(i,k) ) );
8928  xmm1 = xmm1 + a1 * B.load(k,j );
8929  xmm2 = xmm2 + a1 * B.load(k,j1);
8930  xmm3 = xmm3 + a1 * B.load(k,j2);
8931  xmm4 = xmm4 + a1 * B.load(k,j3);
8932  }
8933 
8934  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8935  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
8936  (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
8937  (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
8938  }
8939  }
8940 
8941  for( ; (j+IT::size) < jend; j+=IT::size*2UL )
8942  {
8943  const size_t j1( j+IT::size );
8944 
8945  size_t i( ii );
8946 
8947  for( ; (i+4UL) <= iend; i+=4UL )
8948  {
8949  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8950  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8951  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
8952  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
8953 
8954  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8955 
8956  for( size_t k=kbegin; k<kend; ++k ) {
8957  const IntrinsicType a1( set( A(i ,k) ) );
8958  const IntrinsicType a2( set( A(i+1UL,k) ) );
8959  const IntrinsicType a3( set( A(i+2UL,k) ) );
8960  const IntrinsicType a4( set( A(i+3UL,k) ) );
8961  const IntrinsicType b1( B.load(k,j ) );
8962  const IntrinsicType b2( B.load(k,j1) );
8963  xmm1 = xmm1 + a1 * b1;
8964  xmm2 = xmm2 + a1 * b2;
8965  xmm3 = xmm3 + a2 * b1;
8966  xmm4 = xmm4 + a2 * b2;
8967  xmm5 = xmm5 + a3 * b1;
8968  xmm6 = xmm6 + a3 * b2;
8969  xmm7 = xmm7 + a4 * b1;
8970  xmm8 = xmm8 + a4 * b2;
8971  }
8972 
8973  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8974  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8975  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8976  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
8977  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
8978  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
8979  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
8980  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
8981  }
8982 
8983  for( ; (i+2UL) <= iend; i+=2UL )
8984  {
8985  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
8986  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
8987  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8988  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
8989 
8990  IntrinsicType xmm1, xmm2, xmm3, xmm4;
8991 
8992  for( size_t k=kbegin; k<kend; ++k ) {
8993  const IntrinsicType a1( set( A(i ,k) ) );
8994  const IntrinsicType a2( set( A(i+1UL,k) ) );
8995  const IntrinsicType b1( B.load(k,j ) );
8996  const IntrinsicType b2( B.load(k,j1) );
8997  xmm1 = xmm1 + a1 * b1;
8998  xmm2 = xmm2 + a1 * b2;
8999  xmm3 = xmm3 + a2 * b1;
9000  xmm4 = xmm4 + a2 * b2;
9001  }
9002 
9003  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9004  (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
9005  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9006  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
9007  }
9008 
9009  if( i < iend )
9010  {
9011  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9012  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9013  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9014  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
9015 
9016  IntrinsicType xmm1, xmm2;
9017 
9018  for( size_t k=kbegin; k<kend; ++k ) {
9019  const IntrinsicType a1( set( A(i,k) ) );
9020  xmm1 = xmm1 + a1 * B.load(k,j );
9021  xmm2 = xmm2 + a1 * B.load(k,j1);
9022  }
9023 
9024  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9025  (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
9026  }
9027  }
9028 
9029  if( j < jend )
9030  {
9031  for( size_t i=ii; i<iend; ++i )
9032  {
9033  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
9034  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
9035  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9036  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
9037 
9038  IntrinsicType xmm1;
9039 
9040  for( size_t k=kbegin; k<kend; ++k ) {
9041  const IntrinsicType a1( set( A(i,k) ) );
9042  xmm1 = xmm1 + a1 * B.load(k,j);
9043  }
9044 
9045  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
9046  }
9047  }
9048  }
9049  }
9050  }
9051  }
9052  //**********************************************************************************************
9053 
9054  //**Vectorized default addition assignment to column-major dense matrices (large matrices)******
/*!\brief Vectorized default addition assignment of a scaled matrix product to a column-major
//        dense matrix, blocked for large operands (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The iteration space is tiled into blocks of 128 x 64 x 128 (i x j x k). Within a tile the
// rows of \a A are read with packed loads (four, two or one intrinsic vector per step), the
// elements of \a B are broadcast via \a set(), and the k-range is narrowed whenever \a MT4
// or \a MT5 is lower/upper triangular.
//
// The partial products of each k-block are accumulated in separate registers, scaled by
// \a scalar and only then added to the current contents of \a C. Seeding the accumulators
// with \a C and scaling the sum afterwards would multiply the existing values of \a C by
// \a scalar once per k-block, which is not an addition assignment.
//
// NOTE(review): relies on a default-constructed \a IntrinsicType being zero, exactly as the
// row-major large kernel of this class does; the packed accesses presumably also require
// padded storage in \a A and \a C - confirm against the UseVectorizedDefaultKernel condition.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
   selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
{
   typedef IntrinsicTrait<ElementType>  IT;

   const size_t M( A.rows()    );
   const size_t N( B.columns() );
   const size_t K( A.columns() );

   const size_t iblock( 128UL );
   const size_t jblock(  64UL );
   const size_t kblock( 128UL );

   const IntrinsicType factor( set( scalar ) );

   for( size_t ii=0UL; ii<M; ii+=iblock )
   {
      const size_t iend( min( ii+iblock, M ) );

      for( size_t jj=0UL; jj<N; jj+=jblock )
      {
         const size_t jend( min( jj+jblock, N ) );

         for( size_t kk=0UL; kk<K; kk+=kblock )
         {
            const size_t ktmp( min( kk+kblock, K ) );

            size_t i( ii );

            // Row strips that are four intrinsic vectors tall
            for( ; (i+IT::size*3UL) < iend; i+=IT::size*4UL )
            {
               const size_t i1( i+IT::size     );
               const size_t i2( i+IT::size*2UL );
               const size_t i3( i+IT::size*3UL );

               size_t j( jj );

               // Two columns of C per iteration
               for( ; (j+2UL) <= jend; j+=2UL )
               {
                  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
                                            ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
                  const size_t kend  ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
                                            ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );

                  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

                  for( size_t k=kbegin; k<kend; ++k ) {
                     const IntrinsicType a1( A.load(i ,k) );
                     const IntrinsicType a2( A.load(i1,k) );
                     const IntrinsicType a3( A.load(i2,k) );
                     const IntrinsicType a4( A.load(i3,k) );
                     const IntrinsicType b1( set( B(k,j    ) ) );
                     const IntrinsicType b2( set( B(k,j+1UL) ) );
                     xmm1 = xmm1 + a1 * b1;
                     xmm2 = xmm2 + a2 * b1;
                     xmm3 = xmm3 + a3 * b1;
                     xmm4 = xmm4 + a4 * b1;
                     xmm5 = xmm5 + a1 * b2;
                     xmm6 = xmm6 + a2 * b2;
                     xmm7 = xmm7 + a3 * b2;
                     xmm8 = xmm8 + a4 * b2;
                  }

                  // Scale only the freshly computed products, then add them to C
                  (~C).store( i , j    , (~C).load(i ,j    ) + xmm1 * factor );
                  (~C).store( i1, j    , (~C).load(i1,j    ) + xmm2 * factor );
                  (~C).store( i2, j    , (~C).load(i2,j    ) + xmm3 * factor );
                  (~C).store( i3, j    , (~C).load(i3,j    ) + xmm4 * factor );
                  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
                  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
                  (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
                  (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
               }

               // Remaining single column of the tile
               if( j < jend )
               {
                  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
                                            ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
                  const size_t kend  ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
                                            ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );

                  IntrinsicType xmm1, xmm2, xmm3, xmm4;

                  for( size_t k=kbegin; k<kend; ++k ) {
                     const IntrinsicType b1( set( B(k,j) ) );
                     xmm1 = xmm1 + A.load(i ,k) * b1;
                     xmm2 = xmm2 + A.load(i1,k) * b1;
                     xmm3 = xmm3 + A.load(i2,k) * b1;
                     xmm4 = xmm4 + A.load(i3,k) * b1;
                  }

                  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
                  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
                  (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
                  (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
               }
            }

            // Row strips that are two intrinsic vectors tall
            for( ; (i+IT::size) < iend; i+=IT::size*2UL )
            {
               const size_t i1( i+IT::size );

               size_t j( jj );

               // Four columns of C per iteration
               for( ; (j+4UL) <= jend; j+=4UL )
               {
                  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
                                            ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
                  const size_t kend  ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
                                            ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );

                  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

                  for( size_t k=kbegin; k<kend; ++k ) {
                     const IntrinsicType a1( A.load(i ,k) );
                     const IntrinsicType a2( A.load(i1,k) );
                     const IntrinsicType b1( set( B(k,j    ) ) );
                     const IntrinsicType b2( set( B(k,j+1UL) ) );
                     const IntrinsicType b3( set( B(k,j+2UL) ) );
                     const IntrinsicType b4( set( B(k,j+3UL) ) );
                     xmm1 = xmm1 + a1 * b1;
                     xmm2 = xmm2 + a2 * b1;
                     xmm3 = xmm3 + a1 * b2;
                     xmm4 = xmm4 + a2 * b2;
                     xmm5 = xmm5 + a1 * b3;
                     xmm6 = xmm6 + a2 * b3;
                     xmm7 = xmm7 + a1 * b4;
                     xmm8 = xmm8 + a2 * b4;
                  }

                  (~C).store( i , j    , (~C).load(i ,j    ) + xmm1 * factor );
                  (~C).store( i1, j    , (~C).load(i1,j    ) + xmm2 * factor );
                  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
                  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
                  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
                  (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
                  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
                  (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
               }

               // Two columns of C per iteration
               for( ; (j+2UL) <= jend; j+=2UL )
               {
                  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
                                            ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
                  const size_t kend  ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
                                            ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );

                  IntrinsicType xmm1, xmm2, xmm3, xmm4;

                  for( size_t k=kbegin; k<kend; ++k ) {
                     const IntrinsicType a1( A.load(i ,k) );
                     const IntrinsicType a2( A.load(i1,k) );
                     const IntrinsicType b1( set( B(k,j    ) ) );
                     const IntrinsicType b2( set( B(k,j+1UL) ) );
                     xmm1 = xmm1 + a1 * b1;
                     xmm2 = xmm2 + a2 * b1;
                     xmm3 = xmm3 + a1 * b2;
                     xmm4 = xmm4 + a2 * b2;
                  }

                  (~C).store( i , j    , (~C).load(i ,j    ) + xmm1 * factor );
                  (~C).store( i1, j    , (~C).load(i1,j    ) + xmm2 * factor );
                  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
                  (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
               }

               // Remaining single column of the tile
               if( j < jend )
               {
                  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
                                            ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
                  const size_t kend  ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
                                            ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );

                  IntrinsicType xmm1, xmm2;

                  for( size_t k=kbegin; k<kend; ++k ) {
                     const IntrinsicType b1( set( B(k,j) ) );
                     xmm1 = xmm1 + A.load(i ,k) * b1;
                     xmm2 = xmm2 + A.load(i1,k) * b1;
                  }

                  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
                  (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
               }
            }

            // Final row strip that is a single intrinsic vector tall
            if( i < iend )
            {
               for( size_t j=jj; j<jend; ++j )
               {
                  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
                                            ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
                  const size_t kend  ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
                                            ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );

                  IntrinsicType xmm1;

                  for( size_t k=kbegin; k<kend; ++k ) {
                     const IntrinsicType b1( set( B(k,j) ) );
                     xmm1 = xmm1 + A.load(i,k) * b1;
                  }

                  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
               }
            }
         }
      }
   }
}
9302  //**********************************************************************************************
9303 
9304  //**BLAS-based addition assignment to dense matrices (default)**********************************
/*!\brief Fallback for the BLAS-based addition assignment of a scaled matrix product
//        (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Selected whenever \a UseDefaultKernel holds for the given types. No BLAS routine is
// called in this case; the work is handed to the large-matrix kernel instead.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   // Pure forwarding: same arguments, same order
   selectLargeAddAssignKernel( C, A, B, scalar );
}
9327  //**********************************************************************************************
9328 
9329  //**BLAS-based addition assignment to dense matrices (single precision)*************************
9330 #if BLAZE_BLAS_MODE
9331 
/*!\brief BLAS-based addition assignment of a scaled matrix product for single precision
//        operands (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Dispatches on the triangularity of the operands: without a triangular operand the product
// is passed to \a sgemm together with the factor 1.0F for \a C; otherwise the non-triangular
// operand is copied into a temporary, \a strmm is applied to that temporary, and the result
// is added to \a C. NOTE(review): \a strmm presumably overwrites the temporary with the
// scaled triangular product (BLAS TRMM semantics) - confirm against the BLAS wrapper header.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value )
   {
      // Neither operand is triangular: general matrix-matrix multiplication
      sgemm( C, A, B, scalar, 1.0F );
   }
   else if( IsTriangular<MT4>::value )
   {
      // Triangular left operand: multiply a copy of B from the left
      typename MT3::ResultType product( B );
      strmm( product, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
      addAssign( C, product );
   }
   else
   {
      // Only the right operand is triangular: multiply a copy of A from the right
      typename MT3::ResultType product( A );
      strmm( product, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
      addAssign( C, product );
   }
}
9365 #endif
9366  //**********************************************************************************************
9367 
9368  //**BLAS-based addition assignment to dense matrices (double precision)*************************
9369 #if BLAZE_BLAS_MODE
9370 
/*!\brief BLAS-based addition assignment of a scaled matrix product for double precision
//        operands (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Dispatches on the triangularity of the operands: without a triangular operand the product
// is passed to \a dgemm together with the factor 1.0 for \a C; otherwise the non-triangular
// operand is copied into a temporary, \a dtrmm is applied to that temporary, and the result
// is added to \a C. NOTE(review): \a dtrmm presumably overwrites the temporary with the
// scaled triangular product (BLAS TRMM semantics) - confirm against the BLAS wrapper header.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value )
   {
      // Neither operand is triangular: general matrix-matrix multiplication
      dgemm( C, A, B, scalar, 1.0 );
   }
   else if( IsTriangular<MT4>::value )
   {
      // Triangular left operand: multiply a copy of B from the left
      typename MT3::ResultType product( B );
      dtrmm( product, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
      addAssign( C, product );
   }
   else
   {
      // Only the right operand is triangular: multiply a copy of A from the right
      typename MT3::ResultType product( A );
      dtrmm( product, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
      addAssign( C, product );
   }
}
9404 #endif
9405  //**********************************************************************************************
9406 
9407  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
9408 #if BLAZE_BLAS_MODE
9409 
/*!\brief BLAS-based addition assignment of a scaled matrix product for single precision
//        complex operands (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The scalar is widened to \a complex<float> with a zero imaginary part before it is handed
// to the BLAS wrappers. Without a triangular operand \a cgemm is called with the factor
// (1,0) for \a C; otherwise the non-triangular operand is copied into a temporary, \a ctrmm
// is applied to that temporary, and the result is added to \a C. NOTE(review): \a ctrmm
// presumably overwrites the temporary with the scaled triangular product - confirm against
// the BLAS wrapper header.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value )
   {
      // Neither operand is triangular: general matrix-matrix multiplication
      cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
   }
   else if( IsTriangular<MT4>::value )
   {
      // Triangular left operand: multiply a copy of B from the left
      typename MT3::ResultType product( B );
      ctrmm( product, A, CblasLeft,
             ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
             complex<float>( scalar, 0.0F ) );
      addAssign( C, product );
   }
   else
   {
      // Only the right operand is triangular: multiply a copy of A from the right
      typename MT3::ResultType product( A );
      ctrmm( product, B, CblasRight,
             ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
             complex<float>( scalar, 0.0F ) );
      addAssign( C, product );
   }
}
9447 #endif
9448  //**********************************************************************************************
9449 
9450  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
9451 #if BLAZE_BLAS_MODE
9452 
/*!\brief BLAS-based addition assignment of a scaled matrix product for double precision
//        complex operands (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The scalar is widened to \a complex<double> with a zero imaginary part before it is handed
// to the BLAS wrappers. Without a triangular operand \a zgemm is called with the factor
// (1,0) for \a C; otherwise the non-triangular operand is copied into a temporary, \a ztrmm
// is applied to that temporary, and the result is added to \a C. NOTE(review): \a ztrmm
// presumably overwrites the temporary with the scaled triangular product - confirm against
// the BLAS wrapper header.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   if( !IsTriangular<MT4>::value && !IsTriangular<MT5>::value )
   {
      // Neither operand is triangular: general matrix-matrix multiplication
      zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
   }
   else if( IsTriangular<MT4>::value )
   {
      // Triangular left operand: multiply a copy of B from the left
      typename MT3::ResultType product( B );
      ztrmm( product, A, CblasLeft,
             ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
             complex<double>( scalar, 0.0 ) );
      addAssign( C, product );
   }
   else
   {
      // Only the right operand is triangular: multiply a copy of A from the right
      typename MT3::ResultType product( A );
      ztrmm( product, B, CblasRight,
             ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
             complex<double>( scalar, 0.0 ) );
      addAssign( C, product );
   }
}
9490 #endif
9491  //**********************************************************************************************
9492 
9493  //**Addition assignment to sparse matrices******************************************************
9494  // No special implementation for the addition assignment to sparse matrices.
9495  //**********************************************************************************************
9496 
9497  //**Subtraction assignment to dense matrices****************************************************
/*!\brief Subtraction assignment of a scaled matrix product to a dense matrix
//        (\f$ C-=s*A*B \f$).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
// \return void
//
// Returns immediately if the target has no rows or no columns, or if the left operand has
// no columns. Otherwise both operands are evaluated through \a serial() into objects of
// type \a LT and \a RT and the work is passed to the subtraction kernel selection together
// with the scalar stored in the expression.
*/
template< typename MT  // Type of the target dense matrix
        , bool SO >    // Storage order of the target dense matrix
friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
{
   BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows(), "Invalid number of rows" );
   BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );

   typename MMM::LeftOperand  left ( rhs.matrix_.leftOperand()  );
   typename MMM::RightOperand right( rhs.matrix_.rightOperand() );

   // An empty target or an empty inner dimension leaves nothing to subtract
   const bool nothingToDo( (~lhs).rows() == 0UL ||
                           (~lhs).columns() == 0UL ||
                           left.columns() == 0UL );
   if( nothingToDo )
      return;

   LT A( serial( left  ) );  // Evaluation of the left-hand side dense matrix operand
   RT B( serial( right ) );  // Evaluation of the right-hand side dense matrix operand

   // The evaluated operands must still match the expression and the target in size
   BLAZE_INTERNAL_ASSERT( A.rows() == left.rows(), "Invalid number of rows" );
   BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
   BLAZE_INTERNAL_ASSERT( B.rows() == right.rows(), "Invalid number of rows" );
   BLAZE_INTERNAL_ASSERT( B.columns() == right.columns(), "Invalid number of columns" );
   BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows(), "Invalid number of rows" );
   BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );

   DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
}
9537  //**********************************************************************************************
9538 
9539  //**Subtraction assignment to dense matrices (kernel selection)*********************************
/*!\brief Selection of the kernel for the subtraction assignment of a scaled matrix product
//        (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The small-matrix kernel is chosen if both operands are diagonal or if the number of
// elements of \a C is below \a TDMATDMATMULT_THRESHOLD; every other case is passed to the
// BLAS-based kernel.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   const bool useSmallKernel( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
                              ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) );

   if( useSmallKernel ) {
      selectSmallSubAssignKernel( C, A, B, scalar );
   }
   else {
      selectBlasSubAssignKernel( C, A, B, scalar );
   }
}
9562  //**********************************************************************************************
9563 
9564  //**Default subtraction assignment to dense matrices********************************************
/*!\brief Default subtraction assignment of a scaled matrix product where neither operand
//        is diagonal (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The scaled product is first evaluated via \a serial() into a temporary of type
// \a ResultType, which is then subtracted from \a C.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
   selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
{
   // Materialize the complete scaled product before touching the target
   const ResultType scaledProduct( serial( A * B * scalar ) );

   subAssign( C, scaledProduct );
}
9588  //**********************************************************************************************
9589 
9590  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
/*!\brief Default subtraction assignment of a scaled general/diagonal matrix product to a
//        row-major dense matrix (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand (not diagonal).
// \param B The right-hand side multiplication operand (diagonal).
// \param scalar The scaling factor.
// \return void
//
// Since only the diagonal of \a B is read, each element reduces to
// C(i,j) -= A(i,j) * B(j,j) * scalar. The target is traversed in tiles of 16 x 16 elements
// and the column range of every row is narrowed if \a MT4 is (strictly) upper or lower.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
   selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
{
   const size_t numRows( A.rows()    );
   const size_t numCols( B.columns() );
   const size_t tile   ( 16UL );

   for( size_t rowTile=0UL; rowTile<numRows; rowTile+=tile )
   {
      const size_t rowStop( min( numRows, rowTile+tile ) );

      for( size_t colTile=0UL; colTile<numCols; colTile+=tile )
      {
         const size_t colStop( min( numCols, colTile+tile ) );

         for( size_t i=rowTile; i<rowStop; ++i )
         {
            // First column of row i inside this tile that A may populate
            size_t jfirst( colTile );
            if( IsUpper<MT4>::value ) {
               jfirst = max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), colTile );
            }

            // One past the last such column
            size_t jlast( colStop );
            if( IsLower<MT4>::value ) {
               jlast = min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), colStop );
            }

            for( size_t j=jfirst; j<jlast; ++j ) {
               (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
            }
         }
      }
   }
}
9636  //**********************************************************************************************
9637 
9638  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
/*!\brief Default subtraction assignment of a scaled general/diagonal matrix product to a
//        column-major dense matrix (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand (not diagonal).
// \param B The right-hand side multiplication operand (diagonal).
// \param scalar The scaling factor.
// \return void
//
// Since only the diagonal of \a B is read, each element reduces to
// C(i,j) -= A(i,j) * B(j,j) * scalar. The target is processed column by column, two rows
// per step, and the row range of every column is narrowed if \a MT4 is (strictly) lower
// or upper.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
   selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
{
   const size_t numRows( A.rows()    );
   const size_t numCols( B.columns() );

   for( size_t j=0UL; j<numCols; ++j )
   {
      // Row range of column j that A may populate
      size_t ibegin( 0UL );
      if( IsLower<MT4>::value ) {
         ibegin = ( IsStrictlyLower<MT4>::value ? j+1UL : j );
      }

      size_t iend( numRows );
      if( IsUpper<MT4>::value ) {
         iend = ( IsStrictlyUpper<MT4>::value ? j : j+1UL );
      }

      BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );

      // Largest even number of rows, handled in pairs
      const size_t rowCount ( iend - ibegin );
      const size_t pairedEnd( ibegin + ( rowCount & size_t(-2) ) );

      size_t i( ibegin );
      while( i < pairedEnd ) {
         (~C)(i    ,j) -= A(i    ,j) * B(j,j) * scalar;
         (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
         i += 2UL;
      }

      // Odd row left over
      if( pairedEnd < iend ) {
         (~C)(pairedEnd,j) -= A(pairedEnd,j) * B(j,j) * scalar;
      }
   }
}
9684  //**********************************************************************************************
9685 
9686  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
/*!\brief Default subtraction assignment of a scaled diagonal/general matrix product to a
//        row-major dense matrix (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand (diagonal).
// \param B The right-hand side multiplication operand (not diagonal).
// \param scalar The scaling factor.
// \return void
//
// Since only the diagonal of \a A is read, each element reduces to
// C(i,j) -= A(i,i) * B(i,j) * scalar. The target is processed row by row, two columns per
// step, and the column range of every row is narrowed if \a MT5 is (strictly) upper or
// lower.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
   selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
{
   const size_t numRows( A.rows()    );
   const size_t numCols( B.columns() );

   for( size_t i=0UL; i<numRows; ++i )
   {
      // Column range of row i that B may populate
      size_t jbegin( 0UL );
      if( IsUpper<MT5>::value ) {
         jbegin = ( IsStrictlyUpper<MT5>::value ? i+1UL : i );
      }

      size_t jend( numCols );
      if( IsLower<MT5>::value ) {
         jend = ( IsStrictlyLower<MT5>::value ? i : i+1UL );
      }

      BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );

      // Largest even number of columns, handled in pairs
      const size_t colCount ( jend - jbegin );
      const size_t pairedEnd( jbegin + ( colCount & size_t(-2) ) );

      size_t j( jbegin );
      while( j < pairedEnd ) {
         (~C)(i,j    ) -= A(i,i) * B(i,j    ) * scalar;
         (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
         j += 2UL;
      }

      // Odd column left over
      if( pairedEnd < jend ) {
         (~C)(i,pairedEnd) -= A(i,i) * B(i,pairedEnd) * scalar;
      }
   }
}
9732  //**********************************************************************************************
9733 
9734  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
/*!\brief Default subtraction assignment of a scaled diagonal/general matrix product to a
//        column-major dense matrix (\f$ C-=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand (diagonal).
// \param B The right-hand side multiplication operand (not diagonal).
// \param scalar The scaling factor.
// \return void
//
// Since only the diagonal of \a A is read, each element reduces to
// C(i,j) -= A(i,i) * B(i,j) * scalar. The target is traversed in tiles of 16 x 16 elements
// and the row range of every column is narrowed if \a MT5 is (strictly) lower or upper.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
   selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
{
   const size_t numRows( A.rows()    );
   const size_t numCols( B.columns() );
   const size_t tile   ( 16UL );

   for( size_t colTile=0UL; colTile<numCols; colTile+=tile )
   {
      const size_t colStop( min( numCols, colTile+tile ) );

      for( size_t rowTile=0UL; rowTile<numRows; rowTile+=tile )
      {
         const size_t rowStop( min( numRows, rowTile+tile ) );

         for( size_t j=colTile; j<colStop; ++j )
         {
            // First row of column j inside this tile that B may populate
            size_t ifirst( rowTile );
            if( IsLower<MT5>::value ) {
               ifirst = max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), rowTile );
            }

            // One past the last such row
            size_t ilast( rowStop );
            if( IsUpper<MT5>::value ) {
               ilast = min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), rowStop );
            }

            for( size_t i=ifirst; i<ilast; ++i ) {
               (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
            }
         }
      }
   }
}
9780  //**********************************************************************************************
9781 
9782  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
9796  template< typename MT3 // Type of the left-hand side target matrix
9797  , typename MT4 // Type of the left-hand side matrix operand
9798  , typename MT5 // Type of the right-hand side matrix operand
9799  , typename ST2 > // Type of the scalar value
9800  static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
9801  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9802  {
9803  for( size_t i=0UL; i<A.rows(); ++i ) {
9804  C(i,i) -= A(i,i) * B(i,i) * scalar;
9805  }
9806  }
9807  //**********************************************************************************************
9808 
9809  //**Default subtraction assignment to dense matrices (small matrices)***************************
9823  template< typename MT3 // Type of the left-hand side target matrix
9824  , typename MT4 // Type of the left-hand side matrix operand
9825  , typename MT5 // Type of the right-hand side matrix operand
9826  , typename ST2 > // Type of the scalar value
9827  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9828  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9829  {
9830  selectDefaultSubAssignKernel( C, A, B, scalar );
9831  }
9832  //**********************************************************************************************
9833 
9834  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
9849  template< typename MT3 // Type of the left-hand side target matrix
9850  , typename MT4 // Type of the left-hand side matrix operand
9851  , typename MT5 // Type of the right-hand side matrix operand
9852  , typename ST2 > // Type of the scalar value
9853  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9854  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9855  {
9856  typedef IntrinsicTrait<ElementType> IT;
9857 
9858  const size_t M( A.rows() );
9859  const size_t N( B.columns() );
9860  const size_t K( A.columns() );
9861 
9862  const IntrinsicType factor( set( scalar ) );
9863 
9864  size_t j( 0UL );
9865 
9866  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
9867  for( size_t i=0UL; i<M; ++i )
9868  {
9869  const size_t kbegin( ( IsUpper<MT4>::value )
9870  ?( ( IsLower<MT5>::value )
9871  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9872  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9873  :( IsLower<MT5>::value ? j : 0UL ) );
9874  const size_t kend( ( IsLower<MT4>::value )
9875  ?( ( IsUpper<MT5>::value )
9876  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+IT::size*8UL, K ) )
9877  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
9878  :( IsUpper<MT5>::value ? min( j+IT::size*8UL, K ) : K ) );
9879 
9880  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9881 
9882  for( size_t k=kbegin; k<kend; ++k ) {
9883  const IntrinsicType a1( set( A(i,k) ) );
9884  xmm1 = xmm1 + a1 * B.load(k,j );
9885  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
9886  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
9887  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
9888  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
9889  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
9890  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
9891  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
9892  }
9893 
9894  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9895  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
9896  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
9897  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
9898  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
9899  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
9900  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
9901  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
9902  }
9903  }
9904 
9905  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL )
9906  {
9907  size_t i( 0UL );
9908 
9909  for( ; (i+2UL) <= M; i+=2UL )
9910  {
9911  const size_t kbegin( ( IsUpper<MT4>::value )
9912  ?( ( IsLower<MT5>::value )
9913  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9914  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9915  :( IsLower<MT5>::value ? j : 0UL ) );
9916  const size_t kend( ( IsLower<MT4>::value )
9917  ?( ( IsUpper<MT5>::value )
9918  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*4UL, K ) )
9919  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9920  :( IsUpper<MT5>::value ? min( j+IT::size*4UL, K ) : K ) );
9921 
9922  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9923 
9924  for( size_t k=kbegin; k<kend; ++k ) {
9925  const IntrinsicType a1( set( A(i ,k) ) );
9926  const IntrinsicType a2( set( A(i+1UL,k) ) );
9927  const IntrinsicType b1( B.load(k,j ) );
9928  const IntrinsicType b2( B.load(k,j+IT::size ) );
9929  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
9930  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
9931  xmm1 = xmm1 + a1 * b1;
9932  xmm2 = xmm2 + a1 * b2;
9933  xmm3 = xmm3 + a1 * b3;
9934  xmm4 = xmm4 + a1 * b4;
9935  xmm5 = xmm5 + a2 * b1;
9936  xmm6 = xmm6 + a2 * b2;
9937  xmm7 = xmm7 + a2 * b3;
9938  xmm8 = xmm8 + a2 * b4;
9939  }
9940 
9941  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9942  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
9943  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
9944  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
9945  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
9946  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
9947  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
9948  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
9949  }
9950 
9951  if( i < M )
9952  {
9953  const size_t kbegin( ( IsUpper<MT4>::value )
9954  ?( ( IsLower<MT5>::value )
9955  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9956  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9957  :( IsLower<MT5>::value ? j : 0UL ) );
9958  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, K ) ):( K ) );
9959 
9960  IntrinsicType xmm1, xmm2, xmm3, xmm4;
9961 
9962  for( size_t k=kbegin; k<kend; ++k ) {
9963  const IntrinsicType a1( set( A(i,k) ) );
9964  xmm1 = xmm1 + a1 * B.load(k,j );
9965  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
9966  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
9967  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
9968  }
9969 
9970  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9971  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
9972  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
9973  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
9974  }
9975  }
9976 
9977  for( ; (j+IT::size) < N; j+=IT::size*2UL )
9978  {
9979  size_t i( 0UL );
9980 
9981  for( ; (i+2UL) <= M; i+=2UL )
9982  {
9983  const size_t kbegin( ( IsUpper<MT4>::value )
9984  ?( ( IsLower<MT5>::value )
9985  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9986  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9987  :( IsLower<MT5>::value ? j : 0UL ) );
9988  const size_t kend( ( IsLower<MT4>::value )
9989  ?( ( IsUpper<MT5>::value )
9990  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+IT::size*2UL, K ) )
9991  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9992  :( IsUpper<MT5>::value ? min( j+IT::size*2UL, K ) : K ) );
9993 
9994  IntrinsicType xmm1, xmm2, xmm3, xmm4;
9995 
9996  for( size_t k=kbegin; k<kend; ++k ) {
9997  const IntrinsicType a1( set( A(i ,k) ) );
9998  const IntrinsicType a2( set( A(i+1UL,k) ) );
9999  const IntrinsicType b1( B.load(k,j ) );
10000  const IntrinsicType b2( B.load(k,j+IT::size) );
10001  xmm1 = xmm1 + a1 * b1;
10002  xmm2 = xmm2 + a1 * b2;
10003  xmm3 = xmm3 + a2 * b1;
10004  xmm4 = xmm4 + a2 * b2;
10005  }
10006 
10007  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10008  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
10009  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10010  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
10011  }
10012 
10013  if( i < M )
10014  {
10015  const size_t kbegin( ( IsUpper<MT4>::value )
10016  ?( ( IsLower<MT5>::value )
10017  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10018  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10019  :( IsLower<MT5>::value ? j : 0UL ) );
10020  const size_t kend( ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, K ) ):( K ) );
10021 
10022  IntrinsicType xmm1, xmm2;
10023 
10024  for( size_t k=kbegin; k<kend; ++k ) {
10025  const IntrinsicType a1( set( A(i,k) ) );
10026  xmm1 = xmm1 + a1 * B.load(k,j );
10027  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
10028  }
10029 
10030  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10031  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
10032  }
10033  }
10034 
10035  if( j < N )
10036  {
10037  size_t i( 0UL );
10038 
10039  for( ; (i+2UL) <= M; i+=2UL )
10040  {
10041  const size_t kbegin( ( IsUpper<MT4>::value )
10042  ?( ( IsLower<MT5>::value )
10043  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10044  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10045  :( IsLower<MT5>::value ? j : 0UL ) );
10046  const size_t kend( ( IsLower<MT4>::value )
10047  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
10048  :( K ) );
10049 
10050  IntrinsicType xmm1, xmm2;
10051 
10052  for( size_t k=kbegin; k<kend; ++k ) {
10053  const IntrinsicType b1( B.load(k,j) );
10054  xmm1 = xmm1 + set( A(i ,k) ) * b1;
10055  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
10056  }
10057 
10058  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10059  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
10060  }
10061 
10062  if( i < M )
10063  {
10064  const size_t kbegin( ( IsUpper<MT4>::value )
10065  ?( ( IsLower<MT5>::value )
10066  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10067  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10068  :( IsLower<MT5>::value ? j : 0UL ) );
10069 
10070  IntrinsicType xmm1;
10071 
10072  for( size_t k=kbegin; k<K; ++k ) {
10073  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
10074  }
10075 
10076  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10077  }
10078  }
10079  }
10080  //**********************************************************************************************
10081 
10082  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
       // Vectorized kernel computing C -= scalar * ( A * B ) for a column-major target matrix C.
       //
       // C : target matrix, updated in place through (~C).load() / (~C).store()
       // A : left-hand side operand, read as SIMD vectors via A.load(i,k)
       // B : right-hand side operand, read element-wise via B(k,j) and passed through set()
       // scalar : scaling factor, converted once into the SIMD value 'factor' via set()
       //
       // The overload only participates if UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf).
       //
       // The rows are traversed in panels of 8, 4, 2 and finally 1 SIMD vector(s) (IT::size elements
       // each); the wider panels are skipped as soon as fewer rows than the panel width remain. For
       // every tile the products A(:,k)*B(k,j) are summed over k in [kbegin,kend) into the xmm
       // accumulators and the tile of C is then rewritten as C - xmm * factor.
       //
       // kbegin/kend narrow the k-range depending on the IsLower/IsUpper/IsStrictly* traits of the two
       // operands -- presumably to skip elements that are structurally zero; confirm against the
       // definition of these traits.
       //
       // NOTE(review): the accumulators are only default-constructed before being used in
       // 'xmm = xmm + ...', i.e. this kernel relies on IntrinsicType's default constructor providing
       // a zero value -- confirm.
       // NOTE(review): the last row panel ('if( i < M )') still loads and stores a full SIMD vector;
       // this presumably relies on the operands being padded to a multiple of IT::size -- confirm.
10097  template< typename MT3 // Type of the left-hand side target matrix
10098  , typename MT4 // Type of the left-hand side matrix operand
10099  , typename MT5 // Type of the right-hand side matrix operand
10100  , typename ST2 > // Type of the scalar value
10101  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10102  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
10103  {
10104  typedef IntrinsicTrait<ElementType> IT;
10105 
10106  const size_t M( A.rows() );
10107  const size_t N( B.columns() );
10108  const size_t K( A.columns() );
10109 
10110  const IntrinsicType factor( set( scalar ) );
10111 
10112  size_t i( 0UL );
10113 
       // Row panels of eight SIMD vectors, one column of C per iteration of the j-loop
10114  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
10115  for( size_t j=0UL; j<N; ++j )
10116  {
10117  const size_t kbegin( ( IsLower<MT5>::value )
10118  ?( ( IsUpper<MT4>::value )
10119  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10120  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10121  :( IsUpper<MT4>::value ? i : 0UL ) );
10122  const size_t kend( ( IsUpper<MT5>::value )
10123  ?( ( IsLower<MT4>::value )
10124  ?( min( i+IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
10125  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
10126  :( IsLower<MT4>::value ? min( i+IT::size*8UL, K ) : K ) );
10127 
10128  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10129 
10130  for( size_t k=kbegin; k<kend; ++k ) {
10131  const IntrinsicType b1( set( B(k,j) ) );
10132  xmm1 = xmm1 + A.load(i ,k) * b1;
10133  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
10134  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
10135  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
10136  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
10137  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
10138  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
10139  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
10140  }
10141 
10142  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10143  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
10144  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
10145  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
10146  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) - xmm5 * factor );
10147  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) - xmm6 * factor );
10148  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) - xmm7 * factor );
10149  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) - xmm8 * factor );
10150  }
10151  }
10152 
       // Row panels of four SIMD vectors, two columns of C at a time plus a single-column remainder
10153  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL )
10154  {
10155  size_t j( 0UL );
10156 
10157  for( ; (j+2UL) <= N; j+=2UL )
10158  {
10159  const size_t kbegin( ( IsLower<MT5>::value )
10160  ?( ( IsUpper<MT4>::value )
10161  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10162  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10163  :( IsUpper<MT4>::value ? i : 0UL ) );
10164  const size_t kend( ( IsUpper<MT5>::value )
10165  ?( ( IsLower<MT4>::value )
10166  ?( min( i+IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10167  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10168  :( IsLower<MT4>::value ? min( i+IT::size*4UL, K ) : K ) );
10169 
10170  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10171 
       // xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1
10172  for( size_t k=kbegin; k<kend; ++k ) {
10173  const IntrinsicType a1( A.load(i ,k) );
10174  const IntrinsicType a2( A.load(i+IT::size ,k) );
10175  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
10176  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
10177  const IntrinsicType b1( set( B(k,j ) ) );
10178  const IntrinsicType b2( set( B(k,j+1UL) ) );
10179  xmm1 = xmm1 + a1 * b1;
10180  xmm2 = xmm2 + a2 * b1;
10181  xmm3 = xmm3 + a3 * b1;
10182  xmm4 = xmm4 + a4 * b1;
10183  xmm5 = xmm5 + a1 * b2;
10184  xmm6 = xmm6 + a2 * b2;
10185  xmm7 = xmm7 + a3 * b2;
10186  xmm8 = xmm8 + a4 * b2;
10187  }
10188 
10189  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10190  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) - xmm2 * factor );
10191  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) - xmm3 * factor );
10192  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) - xmm4 * factor );
10193  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10194  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) - xmm6 * factor );
10195  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) - xmm7 * factor );
10196  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) - xmm8 * factor );
10197  }
10198 
       // Remaining single column in case N is odd
10199  if( j < N )
10200  {
10201  const size_t kbegin( ( IsLower<MT5>::value )
10202  ?( ( IsUpper<MT4>::value )
10203  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10204  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10205  :( IsUpper<MT4>::value ? i : 0UL ) );
10206  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, K ) ):( K ) );
10207 
10208  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10209 
10210  for( size_t k=kbegin; k<kend; ++k ) {
10211  const IntrinsicType b1( set( B(k,j) ) );
10212  xmm1 = xmm1 + A.load(i ,k) * b1;
10213  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
10214  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
10215  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
10216  }
10217 
10218  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10219  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
10220  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
10221  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
10222  }
10223  }
10224 
       // Row panels of two SIMD vectors, two columns of C at a time plus a single-column remainder
10225  for( ; (i+IT::size) < M; i+=IT::size*2UL )
10226  {
10227  size_t j( 0UL );
10228 
10229  for( ; (j+2UL) <= N; j+=2UL )
10230  {
10231  const size_t kbegin( ( IsLower<MT5>::value )
10232  ?( ( IsUpper<MT4>::value )
10233  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10234  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10235  :( IsUpper<MT4>::value ? i : 0UL ) );
10236  const size_t kend( ( IsUpper<MT5>::value )
10237  ?( ( IsLower<MT4>::value )
10238  ?( min( i+IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10239  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10240  :( IsLower<MT4>::value ? min( i+IT::size*2UL, K ) : K ) );
10241 
10242  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10243 
10244  for( size_t k=kbegin; k<kend; ++k ) {
10245  const IntrinsicType a1( A.load(i ,k) );
10246  const IntrinsicType a2( A.load(i+IT::size,k) );
10247  const IntrinsicType b1( set( B(k,j ) ) );
10248  const IntrinsicType b2( set( B(k,j+1UL) ) );
10249  xmm1 = xmm1 + a1 * b1;
10250  xmm2 = xmm2 + a2 * b1;
10251  xmm3 = xmm3 + a1 * b2;
10252  xmm4 = xmm4 + a2 * b2;
10253  }
10254 
10255  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10256  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) - xmm2 * factor );
10257  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10258  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) - xmm4 * factor );
10259  }
10260 
10261  if( j < N )
10262  {
10263  const size_t kbegin( ( IsLower<MT5>::value )
10264  ?( ( IsUpper<MT4>::value )
10265  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10266  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10267  :( IsUpper<MT4>::value ? i : 0UL ) );
10268  const size_t kend( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, K ) ):( K ) );
10269 
10270  IntrinsicType xmm1, xmm2;
10271 
10272  for( size_t k=kbegin; k<kend; ++k ) {
10273  const IntrinsicType b1( set( B(k,j) ) );
10274  xmm1 = xmm1 + A.load(i ,k) * b1;
10275  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
10276  }
10277 
10278  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10279  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) - xmm2 * factor );
10280  }
10281  }
10282 
       // Last row panel: a single SIMD vector of rows starting at i
10283  if( i < M )
10284  {
10285  size_t j( 0UL );
10286 
10287  for( ; (j+2UL) <= N; j+=2UL )
10288  {
10289  const size_t kbegin( ( IsLower<MT5>::value )
10290  ?( ( IsUpper<MT4>::value )
10291  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10292  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10293  :( IsUpper<MT4>::value ? i : 0UL ) );
10294  const size_t kend( ( IsUpper<MT5>::value )
10295  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10296  :( K ) );
10297 
10298  IntrinsicType xmm1, xmm2;
10299 
10300  for( size_t k=kbegin; k<kend; ++k ) {
10301  const IntrinsicType a1( A.load(i,k) );
10302  xmm1 = xmm1 + a1 * set( B(k,j ) );
10303  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
10304  }
10305 
10306  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10307  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
10308  }
10309 
       // Final tile: one SIMD vector of rows, one column; here k always runs up to K
10310  if( j < N )
10311  {
10312  const size_t kbegin( ( IsLower<MT5>::value )
10313  ?( ( IsUpper<MT4>::value )
10314  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10315  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10316  :( IsUpper<MT4>::value ? i : 0UL ) );
10317 
10318  IntrinsicType xmm1;
10319 
10320  for( size_t k=kbegin; k<K; ++k ) {
10321  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
10322  }
10323 
10324  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10325  }
10326  }
10327  }
10328  //**********************************************************************************************
10329 
10330  //**Default subtraction assignment to dense matrices (large matrices)***************************
       // Fallback overload of the large-matrix subtraction assignment kernel.
       //
       // It is selected whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf)
       // and merely forwards all four arguments unchanged to selectDefaultSubAssignKernel().
       //
       // C : target matrix of the subtraction assignment
       // A : left-hand side multiplication operand
       // B : right-hand side multiplication operand
       // scalar : scaling factor of the product
10344  template< typename MT3 // Type of the left-hand side target matrix
10345  , typename MT4 // Type of the left-hand side matrix operand
10346  , typename MT5 // Type of the right-hand side matrix operand
10347  , typename ST2 > // Type of the scalar value
10348  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10349  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10350  {
10351  selectDefaultSubAssignKernel( C, A, B, scalar );
10352  }
10353  //**********************************************************************************************
10354 
10355  //**Vectorized default subtraction assignment to row-major dense matrices (large matrices)******
       // Vectorized, cache-blocked kernel computing C -= scalar * ( A * B ) for a row-major target C.
       //
       // C : target matrix, updated in place through (~C).load() / (~C).store()
       // A : left-hand side operand, read element-wise via A(i,k) and passed through set()
       // B : right-hand side operand, read as SIMD vectors via B.load(k,j)
       // scalar : scaling factor, converted once into the SIMD value 'factor' via set()
       //
       // The overload only participates if UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf).
       //
       // The iteration space is split into blocks of at most iblock x jblock x kblock = 64 x 128 x 128
       // (rows x columns x inner dimension). Within a block the columns are traversed in panels of
       // 4, 2 and finally 1 SIMD vector(s) and the rows in groups of 2 (respectively 4, 2) plus a
       // single-row remainder. Every tile sums A(i,k)*B(k,j:) over the k-range of the current block
       // and rewrites C as C - xmm * factor, i.e. each tile of C is loaded and stored once per kk block.
       //
       // kbegin/kend clip the k-range of the current block depending on the IsUpper/IsLower traits of
       // the operands -- presumably to skip structurally zero elements; confirm against these traits.
       //
       // NOTE(review): the accumulators are only default-constructed before being used in
       // 'xmm = xmm + ...', i.e. this kernel relies on IntrinsicType's default constructor providing
       // a zero value -- confirm.
       // NOTE(review): the last column panel ('if( j < jend )') still loads and stores a full SIMD
       // vector; this presumably relies on padded rows -- confirm.
10370  template< typename MT3 // Type of the left-hand side target matrix
10371  , typename MT4 // Type of the left-hand side matrix operand
10372  , typename MT5 // Type of the right-hand side matrix operand
10373  , typename ST2 > // Type of the scalar value
10374  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10375  selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
10376  {
10377  typedef IntrinsicTrait<ElementType> IT;
10378 
10379  const size_t M( A.rows() );
10380  const size_t N( B.columns() );
10381  const size_t K( A.columns() );
10382 
       // Block sizes for the row, column and inner dimension
10383  const size_t iblock( 64UL );
10384  const size_t jblock( 128UL );
10385  const size_t kblock( 128UL );
10386 
10387  const IntrinsicType factor( set( scalar ) );
10388 
10389  for( size_t jj=0UL; jj<N; jj+=jblock )
10390  {
10391  const size_t jend( min( jj+jblock, N ) );
10392 
10393  for( size_t ii=0UL; ii<M; ii+=iblock )
10394  {
10395  const size_t iend( min( ii+iblock, M ) );
10396 
10397  for( size_t kk=0UL; kk<K; kk+=kblock )
10398  {
10399  const size_t ktmp( min( kk+kblock, K ) );
10400 
10401  size_t j( jj );
10402 
       // Column panels of four SIMD vectors, two rows at a time plus a single-row remainder
10403  for( ; (j+IT::size*3UL) < jend; j+=IT::size*4UL )
10404  {
10405  const size_t j1( j+IT::size );
10406  const size_t j2( j+IT::size*2UL );
10407  const size_t j3( j+IT::size*3UL );
10408 
10409  size_t i( ii );
10410 
10411  for( ; (i+2UL) <= iend; i+=2UL )
10412  {
10413  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10414  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10415  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10416  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
10417 
10418  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10419 
       // xmm1-xmm4 accumulate row i, xmm5-xmm8 accumulate row i+1
10420  for( size_t k=kbegin; k<kend; ++k ) {
10421  const IntrinsicType a1( set( A(i ,k) ) );
10422  const IntrinsicType a2( set( A(i+1UL,k) ) );
10423  const IntrinsicType b1( B.load(k,j ) );
10424  const IntrinsicType b2( B.load(k,j1) );
10425  const IntrinsicType b3( B.load(k,j2) );
10426  const IntrinsicType b4( B.load(k,j3) );
10427  xmm1 = xmm1 + a1 * b1;
10428  xmm2 = xmm2 + a1 * b2;
10429  xmm3 = xmm3 + a1 * b3;
10430  xmm4 = xmm4 + a1 * b4;
10431  xmm5 = xmm5 + a2 * b1;
10432  xmm6 = xmm6 + a2 * b2;
10433  xmm7 = xmm7 + a2 * b3;
10434  xmm8 = xmm8 + a2 * b4;
10435  }
10436 
10437  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10438  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10439  (~C).store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
10440  (~C).store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
10441  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
10442  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
10443  (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
10444  (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
10445  }
10446 
10447  if( i < iend )
10448  {
10449  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10450  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10451  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10452  ( IsUpper<MT5>::value )?( min( j+IT::size*4UL, ktmp ) ):( ktmp ) ) );
10453 
10454  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10455 
10456  for( size_t k=kbegin; k<kend; ++k ) {
10457  const IntrinsicType a1( set( A(i,k) ) );
10458  xmm1 = xmm1 + a1 * B.load(k,j );
10459  xmm2 = xmm2 + a1 * B.load(k,j1);
10460  xmm3 = xmm3 + a1 * B.load(k,j2);
10461  xmm4 = xmm4 + a1 * B.load(k,j3);
10462  }
10463 
10464  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10465  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10466  (~C).store( i, j2, (~C).load(i,j2) - xmm3 * factor );
10467  (~C).store( i, j3, (~C).load(i,j3) - xmm4 * factor );
10468  }
10469  }
10470 
       // Column panels of two SIMD vectors, rows in groups of four, then two, then one
10471  for( ; (j+IT::size) < jend; j+=IT::size*2UL )
10472  {
10473  const size_t j1( j+IT::size );
10474 
10475  size_t i( ii );
10476 
10477  for( ; (i+4UL) <= iend; i+=4UL )
10478  {
10479  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10480  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10481  const size_t kend ( min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
10482  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
10483 
10484  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10485 
10486  for( size_t k=kbegin; k<kend; ++k ) {
10487  const IntrinsicType a1( set( A(i ,k) ) );
10488  const IntrinsicType a2( set( A(i+1UL,k) ) );
10489  const IntrinsicType a3( set( A(i+2UL,k) ) );
10490  const IntrinsicType a4( set( A(i+3UL,k) ) );
10491  const IntrinsicType b1( B.load(k,j ) );
10492  const IntrinsicType b2( B.load(k,j1) );
10493  xmm1 = xmm1 + a1 * b1;
10494  xmm2 = xmm2 + a1 * b2;
10495  xmm3 = xmm3 + a2 * b1;
10496  xmm4 = xmm4 + a2 * b2;
10497  xmm5 = xmm5 + a3 * b1;
10498  xmm6 = xmm6 + a3 * b2;
10499  xmm7 = xmm7 + a4 * b1;
10500  xmm8 = xmm8 + a4 * b2;
10501  }
10502 
10503  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10504  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10505  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10506  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10507  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
10508  (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
10509  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
10510  (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
10511  }
10512 
10513  for( ; (i+2UL) <= iend; i+=2UL )
10514  {
10515  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10516  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10517  const size_t kend ( min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10518  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
10519 
10520  IntrinsicType xmm1, xmm2, xmm3, xmm4;
10521 
10522  for( size_t k=kbegin; k<kend; ++k ) {
10523  const IntrinsicType a1( set( A(i ,k) ) );
10524  const IntrinsicType a2( set( A(i+1UL,k) ) );
10525  const IntrinsicType b1( B.load(k,j ) );
10526  const IntrinsicType b2( B.load(k,j1) );
10527  xmm1 = xmm1 + a1 * b1;
10528  xmm2 = xmm2 + a1 * b2;
10529  xmm3 = xmm3 + a2 * b1;
10530  xmm4 = xmm4 + a2 * b2;
10531  }
10532 
10533  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10534  (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10535  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10536  (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10537  }
10538 
10539  if( i < iend )
10540  {
10541  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10542  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10543  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10544  ( IsUpper<MT5>::value )?( min( j+IT::size*2UL, ktmp ) ):( ktmp ) ) );
10545 
10546  IntrinsicType xmm1, xmm2;
10547 
10548  for( size_t k=kbegin; k<kend; ++k ) {
10549  const IntrinsicType a1( set( A(i,k) ) );
10550  xmm1 = xmm1 + a1 * B.load(k,j );
10551  xmm2 = xmm2 + a1 * B.load(k,j1);
10552  }
10553 
10554  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10555  (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10556  }
10557  }
10558 
       // Last column panel of the block: a single SIMD vector, one row per iteration
10559  if( j < jend )
10560  {
10561  for( size_t i=ii; i<iend; ++i )
10562  {
10563  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10564  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10565  const size_t kend ( min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10566  ( IsUpper<MT5>::value )?( min( j+IT::size, ktmp ) ):( ktmp ) ) );
10567 
10568  IntrinsicType xmm1;
10569 
10570  for( size_t k=kbegin; k<kend; ++k ) {
10571  const IntrinsicType a1( set( A(i,k) ) );
10572  xmm1 = xmm1 + a1 * B.load(k,j);
10573  }
10574 
10575  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10576  }
10577  }
10578  }
10579  }
10580  }
10581  }
10582  //**********************************************************************************************
10583 
10584  //**Vectorized default subtraction assignment to column-major dense matrices (large matrices)***
10599  template< typename MT3 // Type of the left-hand side target matrix
10600  , typename MT4 // Type of the left-hand side matrix operand
10601  , typename MT5 // Type of the right-hand side matrix operand
10602  , typename ST2 > // Type of the scalar value
10603  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10604  selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
10605  {
10606  typedef IntrinsicTrait<ElementType> IT;
10607 
10608  const size_t M( A.rows() );
10609  const size_t N( B.columns() );
10610  const size_t K( A.columns() );
10611 
10612  const size_t iblock( 128UL );
10613  const size_t jblock( 64UL );
10614  const size_t kblock( 128UL );
10615 
10616  const IntrinsicType factor( set( scalar ) );
10617 
10618  for( size_t ii=0UL; ii<M; ii+=iblock )
10619  {
10620  const size_t iend( min( ii+iblock, M ) );
10621 
10622  for( size_t jj=0UL; jj<N; jj+=jblock )
10623  {
10624  const size_t jend( min( jj+jblock, N ) );
10625 
10626  for( size_t kk=0UL; kk<K; kk+=kblock )
10627  {
10628  const size_t ktmp( min( kk+kblock, K ) );
10629 
10630  size_t i( ii );
10631 
10632  for( ; (i+IT::size*3UL) < iend; i+=IT::size*4UL )
10633  {
10634  const size_t i1( i+IT::size );
10635  const size_t i2( i+IT::size*2UL );
10636  const size_t i3( i+IT::size*3UL );
10637 
10638  size_t j( jj );
10639 
10640  for( ; (j+2UL) <= jend; j+=2UL )
10641  {
10642  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10643  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10644  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
10645  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10646 
10647  IntrinsicType xmm1( (~C).load(i ,j ) );
10648  IntrinsicType xmm2( (~C).load(i1,j ) );
10649  IntrinsicType xmm3( (~C).load(i2,j ) );
10650  IntrinsicType xmm4( (~C).load(i3,j ) );
10651  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
10652  IntrinsicType xmm6( (~C).load(i1,j+1UL) );
10653  IntrinsicType xmm7( (~C).load(i2,j+1UL) );
10654  IntrinsicType xmm8( (~C).load(i3,j+1UL) );
10655 
10656  for( size_t k=kbegin; k<kend; ++k ) {
10657  const IntrinsicType a1( A.load(i ,k) );
10658  const IntrinsicType a2( A.load(i1,k) );
10659  const IntrinsicType a3( A.load(i2,k) );
10660  const IntrinsicType a4( A.load(i3,k) );
10661  const IntrinsicType b1( set( B(k,j ) ) );
10662  const IntrinsicType b2( set( B(k,j+1UL) ) );
10663  xmm1 = xmm1 - a1 * b1;
10664  xmm2 = xmm2 - a2 * b1;
10665  xmm3 = xmm3 - a3 * b1;
10666  xmm4 = xmm4 - a4 * b1;
10667  xmm5 = xmm5 - a1 * b2;
10668  xmm6 = xmm6 - a2 * b2;
10669  xmm7 = xmm7 - a3 * b2;
10670  xmm8 = xmm8 - a4 * b2;
10671  }
10672 
10673  (~C).store( i , j , xmm1 * factor );
10674  (~C).store( i1, j , xmm2 * factor );
10675  (~C).store( i2, j , xmm3 * factor );
10676  (~C).store( i3, j , xmm4 * factor );
10677  (~C).store( i , j+1UL, xmm5 * factor );
10678  (~C).store( i1, j+1UL, xmm6 * factor );
10679  (~C).store( i2, j+1UL, xmm7 * factor );
10680  (~C).store( i3, j+1UL, xmm8 * factor );
10681  }
10682 
10683  if( j < jend )
10684  {
10685  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10686  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10687  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*4UL, ktmp ) ):( ktmp ),
10688  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10689 
10690  IntrinsicType xmm1( (~C).load(i ,j) );
10691  IntrinsicType xmm2( (~C).load(i1,j) );
10692  IntrinsicType xmm3( (~C).load(i2,j) );
10693  IntrinsicType xmm4( (~C).load(i3,j) );
10694 
10695  for( size_t k=kbegin; k<kend; ++k ) {
10696  const IntrinsicType b1( set( B(k,j) ) );
10697  xmm1 = xmm1 - A.load(i ,k) * b1;
10698  xmm2 = xmm2 - A.load(i1,k) * b1;
10699  xmm3 = xmm3 - A.load(i2,k) * b1;
10700  xmm4 = xmm4 - A.load(i3,k) * b1;
10701  }
10702 
10703  (~C).store( i , j, xmm1 * factor );
10704  (~C).store( i1, j, xmm2 * factor );
10705  (~C).store( i2, j, xmm3 * factor );
10706  (~C).store( i3, j, xmm4 * factor );
10707  }
10708  }
10709 
10710  for( ; (i+IT::size) < iend; i+=IT::size*2UL )
10711  {
10712  const size_t i1( i+IT::size );
10713 
10714  size_t j( jj );
10715 
10716  for( ; (j+4UL) <= jend; j+=4UL )
10717  {
10718  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10719  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10720  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
10721  ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
10722 
10723  IntrinsicType xmm1( (~C).load(i ,j ) );
10724  IntrinsicType xmm2( (~C).load(i1,j ) );
10725  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
10726  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
10727  IntrinsicType xmm5( (~C).load(i ,j+2UL) );
10728  IntrinsicType xmm6( (~C).load(i1,j+2UL) );
10729  IntrinsicType xmm7( (~C).load(i ,j+3UL) );
10730  IntrinsicType xmm8( (~C).load(i1,j+3UL) );
10731 
10732  for( size_t k=kbegin; k<kend; ++k ) {
10733  const IntrinsicType a1( A.load(i ,k) );
10734  const IntrinsicType a2( A.load(i1,k) );
10735  const IntrinsicType b1( set( B(k,j ) ) );
10736  const IntrinsicType b2( set( B(k,j+1UL) ) );
10737  const IntrinsicType b3( set( B(k,j+2UL) ) );
10738  const IntrinsicType b4( set( B(k,j+3UL) ) );
10739  xmm1 = xmm1 - a1 * b1;
10740  xmm2 = xmm2 - a2 * b1;
10741  xmm3 = xmm3 - a1 * b2;
10742  xmm4 = xmm4 - a2 * b2;
10743  xmm5 = xmm5 - a1 * b3;
10744  xmm6 = xmm6 - a2 * b3;
10745  xmm7 = xmm7 - a1 * b4;
10746  xmm8 = xmm8 - a2 * b4;
10747  }
10748 
10749  (~C).store( i , j , xmm1 * factor );
10750  (~C).store( i1, j , xmm2 * factor );
10751  (~C).store( i , j+1UL, xmm3 * factor );
10752  (~C).store( i1, j+1UL, xmm4 * factor );
10753  (~C).store( i , j+2UL, xmm5 * factor );
10754  (~C).store( i1, j+2UL, xmm6 * factor );
10755  (~C).store( i , j+3UL, xmm7 * factor );
10756  (~C).store( i1, j+3UL, xmm8 * factor );
10757  }
10758 
10759  for( ; (j+2UL) <= jend; j+=2UL )
10760  {
10761  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10762  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10763  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
10764  ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10765 
10766  IntrinsicType xmm1( (~C).load(i ,j ) );
10767  IntrinsicType xmm2( (~C).load(i1,j ) );
10768  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
10769  IntrinsicType xmm4( (~C).load(i1,j+1UL) );
10770 
10771  for( size_t k=kbegin; k<kend; ++k ) {
10772  const IntrinsicType a1( A.load(i ,k) );
10773  const IntrinsicType a2( A.load(i1,k) );
10774  const IntrinsicType b1( set( B(k,j ) ) );
10775  const IntrinsicType b2( set( B(k,j+1UL) ) );
10776  xmm1 = xmm1 - a1 * b1;
10777  xmm2 = xmm2 - a2 * b1;
10778  xmm3 = xmm3 - a1 * b2;
10779  xmm4 = xmm4 - a2 * b2;
10780  }
10781 
10782  (~C).store( i , j , xmm1 * factor );
10783  (~C).store( i1, j , xmm2 * factor );
10784  (~C).store( i , j+1UL, xmm3 * factor );
10785  (~C).store( i1, j+1UL, xmm4 * factor );
10786  }
10787 
10788  if( j < jend )
10789  {
10790  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10791  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10792  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size*2UL, ktmp ) ):( ktmp ),
10793  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10794 
10795  IntrinsicType xmm1( (~C).load(i ,j) );
10796  IntrinsicType xmm2( (~C).load(i1,j) );
10797 
10798  for( size_t k=kbegin; k<kend; ++k ) {
10799  const IntrinsicType b1( set( B(k,j) ) );
10800  xmm1 = xmm1 - A.load(i ,k) * b1;
10801  xmm2 = xmm2 - A.load(i1,k) * b1;
10802  }
10803 
10804  (~C).store( i , j, xmm1 * factor );
10805  (~C).store( i1, j, xmm2 * factor );
10806  }
10807  }
10808 
10809  if( i < iend )
10810  {
10811  for( size_t j=jj; j<jend; ++j )
10812  {
10813  const size_t kbegin( max( ( IsUpper<MT4>::value )?( max( i, kk ) ):( kk ),
10814  ( IsLower<MT5>::value )?( max( j, kk ) ):( kk ) ) );
10815  const size_t kend ( min( ( IsLower<MT4>::value )?( min( i+IT::size, ktmp ) ):( ktmp ),
10816  ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10817 
10818  IntrinsicType xmm1( (~C).load(i,j) );
10819 
10820  for( size_t k=kbegin; k<kend; ++k ) {
10821  const IntrinsicType b1( set( B(k,j) ) );
10822  xmm1 = xmm1 - A.load(i,k) * b1;
10823  }
10824 
10825  (~C).store( i, j, xmm1 * factor );
10826  }
10827  }
10828  }
10829  }
10830  }
10831  }
10832  //**********************************************************************************************
10833 
10834  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
10848  template< typename MT3 // Type of the left-hand side target matrix
10849  , typename MT4 // Type of the left-hand side matrix operand
10850  , typename MT5 // Type of the right-hand side matrix operand
10851  , typename ST2 > // Type of the scalar value
10852  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10853  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10854  {
10855  selectLargeSubAssignKernel( C, A, B, scalar );
10856  }
10857  //**********************************************************************************************
10858 
10859  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
10860 #if BLAZE_BLAS_MODE
10861 
10874  template< typename MT3 // Type of the left-hand side target matrix
10875  , typename MT4 // Type of the left-hand side matrix operand
10876  , typename MT5 // Type of the right-hand side matrix operand
10877  , typename ST2 > // Type of the scalar value
10878  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
10879  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10880  {
10881  if( IsTriangular<MT4>::value ) {
10882  typename MT3::ResultType tmp( B );
10883  strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
10884  subAssign( C, tmp );
10885  }
10886  else if( IsTriangular<MT5>::value ) {
10887  typename MT3::ResultType tmp( A );
10888  strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
10889  subAssign( C, tmp );
10890  }
10891  else {
10892  sgemm( C, A, B, -scalar, 1.0F );
10893  }
10894  }
10895 #endif
10896  //**********************************************************************************************
10897 
10898  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
10899 #if BLAZE_BLAS_MODE
10900 
10913  template< typename MT3 // Type of the left-hand side target matrix
10914  , typename MT4 // Type of the left-hand side matrix operand
10915  , typename MT5 // Type of the right-hand side matrix operand
10916  , typename ST2 > // Type of the scalar value
10917  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
10918  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10919  {
10920  if( IsTriangular<MT4>::value ) {
10921  typename MT3::ResultType tmp( B );
10922  dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
10923  subAssign( C, tmp );
10924  }
10925  else if( IsTriangular<MT5>::value ) {
10926  typename MT3::ResultType tmp( A );
10927  dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
10928  subAssign( C, tmp );
10929  }
10930  else {
10931  dgemm( C, A, B, -scalar, 1.0 );
10932  }
10933  }
10934 #endif
10935  //**********************************************************************************************
10936 
10937  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
10938 #if BLAZE_BLAS_MODE
10939 
10952  template< typename MT3 // Type of the left-hand side target matrix
10953  , typename MT4 // Type of the left-hand side matrix operand
10954  , typename MT5 // Type of the right-hand side matrix operand
10955  , typename ST2 > // Type of the scalar value
10956  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
10957  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10958  {
10959  if( IsTriangular<MT4>::value ) {
10960  typename MT3::ResultType tmp( B );
10961  ctrmm( tmp, A, CblasLeft,
10962  ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
10963  complex<float>( scalar, 0.0F ) );
10964  subAssign( C, tmp );
10965  }
10966  else if( IsTriangular<MT5>::value ) {
10967  typename MT3::ResultType tmp( A );
10968  ctrmm( tmp, B, CblasRight,
10969  ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
10970  complex<float>( scalar, 0.0F ) );
10971  subAssign( C, tmp );
10972  }
10973  else {
10974  cgemm( C, A, B, complex<float>( -scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
10975  }
10976  }
10977 #endif
10978  //**********************************************************************************************
10979 
10980  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
10981 #if BLAZE_BLAS_MODE
10982 
10995  template< typename MT3 // Type of the left-hand side target matrix
10996  , typename MT4 // Type of the left-hand side matrix operand
10997  , typename MT5 // Type of the right-hand side matrix operand
10998  , typename ST2 > // Type of the scalar value
10999  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
11000  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11001  {
11002  if( IsTriangular<MT4>::value ) {
11003  typename MT3::ResultType tmp( B );
11004  ztrmm( tmp, A, CblasLeft,
11005  ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
11006  complex<float>( scalar, 0.0 ) );
11007  subAssign( C, tmp );
11008  }
11009  else if( IsTriangular<MT5>::value ) {
11010  typename MT3::ResultType tmp( A );
11011  ztrmm( tmp, B, CblasRight,
11012  ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
11013  complex<float>( scalar, 0.0 ) );
11014  subAssign( C, tmp );
11015  }
11016  else {
11017  zgemm( C, A, B, complex<double>( -scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
11018  }
11019  }
11020 #endif
11021  //**********************************************************************************************
11022 
11023  //**Subtraction assignment to sparse matrices***************************************************
11024  // No special implementation for the subtraction assignment to sparse matrices.
11025  //**********************************************************************************************
11026 
11027  //**Multiplication assignment to dense matrices*************************************************
11028  // No special implementation for the multiplication assignment to dense matrices.
11029  //**********************************************************************************************
11030 
11031  //**Multiplication assignment to sparse matrices************************************************
11032  // No special implementation for the multiplication assignment to sparse matrices.
11033  //**********************************************************************************************
11034 
11035  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the scaled matrix product expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// Only available when IsEvaluationRequired<MT,MT1,MT2> holds. The two operands of the wrapped
// multiplication (rhs.matrix_) are evaluated into objects of type LT and RT first; the product
// of the evaluated operands, scaled by rhs.scalar_, is then passed on to smpAssign().
11050  template< typename MT // Type of the target dense matrix
11051  , bool SO > // Storage order of the target dense matrix
11052  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11053  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11054  {
11056 
        // The target must already have the dimensions of the expression
11057  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11058  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11059 
11060  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11061  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11062 
        // An empty target needs no work; an empty inner dimension yields a reset target
11063  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
11064  return;
11065  }
11066  else if( left.columns() == 0UL ) {
11067  reset( ~lhs );
11068  return;
11069  }
11070 
11071  LT A( left ); // Evaluation of the left-hand side dense matrix operand
11072  RT B( right ); // Evaluation of the right-hand side dense matrix operand
11073 
11074  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11075  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11076  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11077  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11078  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11079  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11080 
        // Re-dispatch with the evaluated operands
11081  smpAssign( ~lhs, A * B * rhs.scalar_ );
11082  }
11083  //**********************************************************************************************
11084 
11085  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the scaled matrix product expression to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// Only available when IsEvaluationRequired<MT,MT1,MT2> holds. The expression is first evaluated
// into a temporary, which is then passed on to smpAssign().
11100  template< typename MT // Type of the target sparse matrix
11101  , bool SO > // Storage order of the target sparse matrix
11102  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11103  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11104  {
11106 
        // The temporary is either ResultType or OppositeType, selected via the storage order SO
11107  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
11108 
11115 
        // The target must already have the dimensions of the expression
11116  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11117  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11118 
11119  const TmpType tmp( rhs );
11120  smpAssign( ~lhs, tmp );
11121  }
11122  //**********************************************************************************************
11123 
11124  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the scaled matrix product expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// Only available when IsEvaluationRequired<MT,MT1,MT2> holds. The two operands of the wrapped
// multiplication (rhs.matrix_) are evaluated into objects of type LT and RT first; the product
// of the evaluated operands, scaled by rhs.scalar_, is then passed on to smpAddAssign().
11139  template< typename MT // Type of the target dense matrix
11140  , bool SO > // Storage order of the target dense matrix
11141  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11142  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11143  {
11145 
        // The target must already have the dimensions of the expression
11146  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11147  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11148 
11149  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11150  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11151 
        // Nothing is added if the target or the inner dimension of the product is empty
11152  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11153  return;
11154  }
11155 
11156  LT A( left ); // Evaluation of the left-hand side dense matrix operand
11157  RT B( right ); // Evaluation of the right-hand side dense matrix operand
11158 
11159  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11160  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11161  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11162  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11163  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11164  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11165 
        // Re-dispatch with the evaluated operands
11166  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
11167  }
11168  //**********************************************************************************************
11169 
11170  //**SMP addition assignment to sparse matrices**************************************************
11171  // No special implementation for the SMP addition assignment to sparse matrices.
11172  //**********************************************************************************************
11173 
11174  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the scaled matrix product expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
//
// Only available when IsEvaluationRequired<MT,MT1,MT2> holds. The two operands of the wrapped
// multiplication (rhs.matrix_) are evaluated into objects of type LT and RT first; the product
// of the evaluated operands, scaled by rhs.scalar_, is then passed on to smpSubAssign().
11189  template< typename MT // Type of the target dense matrix
11190  , bool SO > // Storage order of the target dense matrix
11191  friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11192  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11193  {
11195 
        // The target must already have the dimensions of the expression
11196  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11197  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11198 
11199  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11200  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11201 
        // Nothing is subtracted if the target or the inner dimension of the product is empty
11202  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11203  return;
11204  }
11205 
11206  LT A( left ); // Evaluation of the left-hand side dense matrix operand
11207  RT B( right ); // Evaluation of the right-hand side dense matrix operand
11208 
11209  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11210  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11211  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11212  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11213  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11214  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11215 
        // Re-dispatch with the evaluated operands
11216  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
11217  }
11218  //**********************************************************************************************
11219 
11220  //**SMP subtraction assignment to sparse matrices***********************************************
11221  // No special implementation for the SMP subtraction assignment to sparse matrices.
11222  //**********************************************************************************************
11223 
11224  //**SMP multiplication assignment to dense matrices*********************************************
11225  // No special implementation for the SMP multiplication assignment to dense matrices.
11226  //**********************************************************************************************
11227 
11228  //**SMP multiplication assignment to sparse matrices********************************************
11229  // No special implementation for the SMP multiplication assignment to sparse matrices.
11230  //**********************************************************************************************
11231 
11232  //**Compile time checks*************************************************************************
11240  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
11241  //**********************************************************************************************
11242 };
11244 //*************************************************************************************************
11245 
11246 
11247 
11248 
11249 //=================================================================================================
11250 //
11251 // GLOBAL BINARY ARITHMETIC OPERATORS
11252 //
11253 //=================================================================================================
11254 
11255 //*************************************************************************************************
11284 template< typename T1 // Type of the left-hand side dense matrix
11285  , typename T2 > // Type of the right-hand side dense matrix
11286 inline const TDMatDMatMultExpr<T1,T2>
11288 {
11290 
11291  if( (~lhs).columns() != (~rhs).rows() )
11292  throw std::invalid_argument( "Matrix sizes do not match" );
11293 
11294  return TDMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
11295 }
11296 //*************************************************************************************************
11297 
11298 
11299 
11300 
11301 //=================================================================================================
11302 //
11303 // ROWS SPECIALIZATIONS
11304 //
11305 //=================================================================================================
11306 
11307 //*************************************************************************************************
// Rows specialization for TDMatDMatMultExpr: the trait of the product expression is inherited
// from the left-hand side operand type MT1.
// NOTE(review): presumably Rows reports the compile-time number of rows - confirm in Rows.h.
11309 template< typename MT1, typename MT2 >
11310 struct Rows< TDMatDMatMultExpr<MT1,MT2> >
11311  : public Rows<MT1>
11312 {};
11314 //*************************************************************************************************
11315 
11316 
11317 
11318 
11319 //=================================================================================================
11320 //
11321 // COLUMNS SPECIALIZATIONS
11322 //
11323 //=================================================================================================
11324 
11325 //*************************************************************************************************
// Columns specialization for TDMatDMatMultExpr: the trait of the product expression is
// inherited from the right-hand side operand type MT2.
// NOTE(review): presumably Columns reports the compile-time number of columns - confirm in
// Columns.h.
11327 template< typename MT1, typename MT2 >
11328 struct Columns< TDMatDMatMultExpr<MT1,MT2> >
11329  : public Columns<MT2>
11330 {};
11332 //*************************************************************************************************
11333 
11334 
11335 
11336 
11337 //=================================================================================================
11338 //
11339 // ISLOWER SPECIALIZATIONS
11340 //
11341 //=================================================================================================
11342 
11343 //*************************************************************************************************
11345 template< typename MT1, typename MT2 >
11346 struct IsLower< TDMatDMatMultExpr<MT1,MT2> >
11347  : public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
11348 {};
11350 //*************************************************************************************************
11351 
11352 
11353 
11354 
11355 //=================================================================================================
11356 //
11357 // ISUNILOWER SPECIALIZATIONS
11358 //
11359 //=================================================================================================
11360 
11361 //*************************************************************************************************
11363 template< typename MT1, typename MT2 >
11364 struct IsUniLower< TDMatDMatMultExpr<MT1,MT2> >
11365  : public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
11366 {};
11368 //*************************************************************************************************
11369 
11370 
11371 
11372 
11373 //=================================================================================================
11374 //
11375 // ISSTRICTLYLOWER SPECIALIZATIONS
11376 //
11377 //=================================================================================================
11378 
11379 //*************************************************************************************************
11381 template< typename MT1, typename MT2 >
11382 struct IsStrictlyLower< TDMatDMatMultExpr<MT1,MT2> >
11383  : public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
11384  , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
11385 {};
11387 //*************************************************************************************************
11388 
11389 
11390 
11391 
11392 //=================================================================================================
11393 //
11394 // ISUPPER SPECIALIZATIONS
11395 //
11396 //=================================================================================================
11397 
11398 //*************************************************************************************************
11400 template< typename MT1, typename MT2 >
11401 struct IsUpper< TDMatDMatMultExpr<MT1,MT2> >
11402  : public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
11403 {};
11405 //*************************************************************************************************
11406 
11407 
11408 
11409 
11410 //=================================================================================================
11411 //
11412 // ISUNIUPPER SPECIALIZATIONS
11413 //
11414 //=================================================================================================
11415 
11416 //*************************************************************************************************
11418 template< typename MT1, typename MT2 >
11419 struct IsUniUpper< TDMatDMatMultExpr<MT1,MT2> >
11420  : public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
11421 {};
11423 //*************************************************************************************************
11424 
11425 
11426 
11427 
11428 //=================================================================================================
11429 //
11430 // ISSTRICTLYUPPER SPECIALIZATIONS
11431 //
11432 //=================================================================================================
11433 
11434 //*************************************************************************************************
11436 template< typename MT1, typename MT2 >
11437 struct IsStrictlyUpper< TDMatDMatMultExpr<MT1,MT2> >
11438  : public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
11439  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
11440 {};
11442 //*************************************************************************************************
11443 
11444 
11445 
11446 
11447 //=================================================================================================
11448 //
11449 // EXPRESSION TRAIT SPECIALIZATIONS
11450 //
11451 //=================================================================================================
11452 
11453 //*************************************************************************************************
11455 template< typename MT1, typename MT2, typename VT >
11456 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
11457 {
11458  public:
11459  //**********************************************************************************************
11460  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11461  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
11462  IsDenseVector<VT>::value && IsColumnVector<VT>::value
11463  , typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
11464  , INVALID_TYPE >::Type Type;
11465  //**********************************************************************************************
11466 };
11468 //*************************************************************************************************
11469 
11470 
11471 //*************************************************************************************************
11473 template< typename MT1, typename MT2, typename VT >
11474 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
11475 {
11476  public:
11477  //**********************************************************************************************
11478  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11479  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
11480  IsSparseVector<VT>::value && IsColumnVector<VT>::value
11481  , typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
11482  , INVALID_TYPE >::Type Type;
11483  //**********************************************************************************************
11484 };
11486 //*************************************************************************************************
11487 
11488 
11489 //*************************************************************************************************
11491 template< typename VT, typename MT1, typename MT2 >
11492 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
11493 {
11494  public:
11495  //**********************************************************************************************
11496  typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
11497  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11498  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
11499  , typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
11500  , INVALID_TYPE >::Type Type;
11501  //**********************************************************************************************
11502 };
11504 //*************************************************************************************************
11505 
11506 
11507 //*************************************************************************************************
11509 template< typename VT, typename MT1, typename MT2 >
11510 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
11511 {
11512  public:
11513  //**********************************************************************************************
11514  typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
11515  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11516  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
11517  , typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
11518  , INVALID_TYPE >::Type Type;
11519  //**********************************************************************************************
11520 };
11522 //*************************************************************************************************
11523 
11524 
11525 //*************************************************************************************************
11527 template< typename MT1, typename MT2, bool AF >
11528 struct SubmatrixExprTrait< TDMatDMatMultExpr<MT1,MT2>, AF >
11529 {
11530  public:
11531  //**********************************************************************************************
11532  typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
11533  , typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
11534  //**********************************************************************************************
11535 };
11537 //*************************************************************************************************
11538 
11539 
11540 //*************************************************************************************************
11542 template< typename MT1, typename MT2 >
11543 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >
11544 {
11545  public:
11546  //**********************************************************************************************
11547  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
11548  //**********************************************************************************************
11549 };
11551 //*************************************************************************************************
11552 
11553 
11554 //*************************************************************************************************
11556 template< typename MT1, typename MT2 >
11557 struct ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >
11558 {
11559  public:
11560  //**********************************************************************************************
11561  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
11562  //**********************************************************************************************
11563 };
11565 //*************************************************************************************************
11566 
11567 } // namespace blaze
11568 
11569 #endif
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1649
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:310
Constraint on the data type.
Header file for mathematical functions.
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( ).
Definition: DMatDMatMultExpr.h:8247
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
Header file for basic type definitions.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:484
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:292
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:264
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:209
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:483
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:821
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:430
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2507
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:261
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:142
Header file for the And class template.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:259
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:699
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Header file for the IsUniLower type trait.
Header file for the IsFloat type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:454
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:400
Constraint on the data type.
const size_t TDMATDMATMULT_THRESHOLD
Column-major dense matrix/row-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:159
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:125
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:291
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:410
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the Or class template.
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > > >::Type store(T *address, const sse_int16_t &value)
Aligned store of a vector of 2-byte integral values.
Definition: Store.h:80
Header file for the TDMatSVecMultExprTrait class template.
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1602
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:474
Header file for the DenseMatrix base class.
BLAZE_ALWAYS_INLINE void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:635
Header file for the Columns type trait.
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:332
const size_t SMP_TDMATDMATMULT_THRESHOLD
SMP column-major dense matrix/row-major dense matrix multiplication threshold. This threshold specifie...
Definition: Thresholds.h:880
Header file for the Not class template.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:304
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:296
Header file for the IsLower type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
Header file for BLAS level 3 functions.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2505
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:297
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, sse_int16_t >::Type load(const T *address)
Loads a vector of 2-byte integral values.
Definition: Load.h:79
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:293
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:749
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:144
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:150
BLAZE_ALWAYS_INLINE void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:742
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:442
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:139
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:420
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:307
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:298
Header file for the HasMutableDataAccess type trait.
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, sse_int16_t >::Type set(T value)
Sets all values in the vector to the given 2-byte integral value.
Definition: Set.h:73
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:301
Header file for all intrinsic functionality.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:464
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:140
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:143
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:260
Header file for the TDVecDMatMultExprTrait class template.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:141
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2502
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:347
Header file for the complex data type.
Header file for the IsUpper type trait.
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:295
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:294
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
BLAZE_ALWAYS_INLINE void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:849