All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
20 //=================================================================================================
21 
22 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
24 
25 
26 //*************************************************************************************************
27 // Includes
28 //*************************************************************************************************
29 
30 #include <stdexcept>
31 #include <boost/cast.hpp>
39 #include <blaze/math/Intrinsics.h>
40 #include <blaze/math/shims/Reset.h>
59 #include <blaze/system/BLAS.h>
61 #include <blaze/util/Assert.h>
62 #include <blaze/util/Complex.h>
68 #include <blaze/util/DisableIf.h>
69 #include <blaze/util/EnableIf.h>
70 #include <blaze/util/InvalidType.h>
72 #include <blaze/util/SelectType.h>
73 #include <blaze/util/Types.h>
79 
80 
81 namespace blaze {
82 
83 //=================================================================================================
84 //
85 // CLASS DMATTDMATMULTEXPR
86 //
87 //=================================================================================================
88 
89 //*************************************************************************************************
96 template< typename MT1 // Type of the left-hand side dense matrix
97  , typename MT2 > // Type of the right-hand side dense matrix
98 class DMatTDMatMultExpr : public DenseMatrix< DMatTDMatMultExpr<MT1,MT2>, false >
99  , private MatMatMultExpr
100  , private Computation
101 {
102  private:
103  //**Type definitions****************************************************************************
104  typedef typename MT1::ResultType RT1;
105  typedef typename MT2::ResultType RT2;
106  typedef typename MT1::CompositeType CT1;
107  typedef typename MT2::CompositeType CT2;
108  //**********************************************************************************************
109 
110  //**********************************************************************************************
112 
113 
// Compile-time switch for the single precision BLAS kernel. UseDefaultKernel reads
// UseSinglePrecisionKernel<T1,T2,T3>::value and the single precision selectBlasAssignKernel
// overload is enabled through EnableIf on this trait.
// NOTE(review): no 'value' member is visible between the braces and the embedded listing numbers
// jump from 116 to 120, so the body appears to have been lost when this listing was extracted.
// Restore it from the original header before relying on this trait.
115  template< typename T1, typename T2, typename T3 >
116  struct UseSinglePrecisionKernel {
120  };
122  //**********************************************************************************************
123 
124  //**********************************************************************************************
126 
127 
// Compile-time switch for the double precision BLAS kernel. UseDefaultKernel reads
// UseDoublePrecisionKernel<T1,T2,T3>::value and the double precision selectBlasAssignKernel
// overload is enabled through EnableIf on this trait.
// NOTE(review): no 'value' member is visible between the braces and the embedded listing numbers
// jump from 130 to 134, so the body appears to have been lost when this listing was extracted.
// Restore it from the original header before relying on this trait.
129  template< typename T1, typename T2, typename T3 >
130  struct UseDoublePrecisionKernel {
134  };
136  //**********************************************************************************************
137 
138  //**********************************************************************************************
140 
141 
144  template< typename T1, typename T2, typename T3 >
145  struct UseSinglePrecisionComplexKernel {
146  typedef complex<float> Type;
147  enum { value = IsSame<typename T1::ElementType,Type>::value &&
148  IsSame<typename T2::ElementType,Type>::value &&
149  IsSame<typename T3::ElementType,Type>::value };
150  };
152  //**********************************************************************************************
153 
154  //**********************************************************************************************
156 
157 
160  template< typename T1, typename T2, typename T3 >
161  struct UseDoublePrecisionComplexKernel {
162  typedef complex<double> Type;
163  enum { value = IsSame<typename T1::ElementType,Type>::value &&
164  IsSame<typename T2::ElementType,Type>::value &&
165  IsSame<typename T3::ElementType,Type>::value };
166  };
168  //**********************************************************************************************
169 
170  //**********************************************************************************************
172 
173 
175  template< typename T1, typename T2, typename T3 >
176  struct UseDefaultKernel {
177  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
178  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
179  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
180  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
181  };
183  //**********************************************************************************************
184 
185  //**********************************************************************************************
187 
188 
190  template< typename T1, typename T2, typename T3 >
191  struct UseVectorizedDefaultKernel {
192  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
193  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
194  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
195  IntrinsicTrait<typename T1::ElementType>::addition &&
196  IntrinsicTrait<typename T1::ElementType>::multiplication };
197  };
199  //**********************************************************************************************
200 
201  public:
202  //**Type definitions****************************************************************************
205  typedef typename ResultType::OppositeType OppositeType;
206  typedef typename ResultType::TransposeType TransposeType;
207  typedef typename ResultType::ElementType ElementType;
209  typedef const ElementType ReturnType;
210  typedef const ResultType CompositeType;
211 
213  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
214 
216  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
217 
219  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
220 
222  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
223  //**********************************************************************************************
224 
225  //**Compilation flags***************************************************************************
// Compilation flag: fixed to 0, i.e. the expression itself is flagged as not vectorizable.
enum { vectorizable = 0 };
228  //**********************************************************************************************
229 
230  //**Constructor*********************************************************************************
236  explicit inline DMatTDMatMultExpr( const MT1& lhs, const MT2& rhs )
237  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
238  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
239  {
240  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
241  }
242  //**********************************************************************************************
243 
244  //**Access operator*****************************************************************************
251  inline ReturnType operator()( size_t i, size_t j ) const {
252  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
253  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
254 
255  ElementType tmp;
256 
257  if( lhs_.columns() != 0UL ) {
258  const size_t end( ( ( lhs_.columns()-1UL ) & size_t(-2) ) + 1UL );
259  tmp = lhs_(i,0UL) * rhs_(0UL,j);
260  for( size_t k=1UL; k<end; k+=2UL ) {
261  tmp += lhs_(i,k ) * rhs_(k ,j);
262  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
263  }
264  if( end < lhs_.columns() ) {
265  tmp += lhs_(i,end) * rhs_(end,j);
266  }
267  }
268  else {
269  reset( tmp );
270  }
271 
272  return tmp;
273  }
274  //**********************************************************************************************
275 
276  //**Rows function*******************************************************************************
281  inline size_t rows() const {
282  return lhs_.rows();
283  }
284  //**********************************************************************************************
285 
286  //**Columns function****************************************************************************
291  inline size_t columns() const {
292  return rhs_.columns();
293  }
294  //**********************************************************************************************
295 
296  //**Left operand access*************************************************************************
301  inline LeftOperand leftOperand() const {
302  return lhs_;
303  }
304  //**********************************************************************************************
305 
306  //**Right operand access************************************************************************
311  inline RightOperand rightOperand() const {
312  return rhs_;
313  }
314  //**********************************************************************************************
315 
316  //**********************************************************************************************
322  template< typename T >
323  inline bool canAlias( const T* alias ) const {
324  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
325  }
326  //**********************************************************************************************
327 
328  //**********************************************************************************************
334  template< typename T >
335  inline bool isAliased( const T* alias ) const {
336  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
337  }
338  //**********************************************************************************************
339 
340  private:
341  //**Member variables****************************************************************************
344  //**********************************************************************************************
345 
346  //**Assignment to dense matrices****************************************************************
355  template< typename MT // Type of the target dense matrix
356  , bool SO > // Storage order of the target dense matrix
357  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
358  {
360 
361  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
362  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
363 
364  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
365  return;
366  }
367  else if( rhs.lhs_.columns() == 0UL ) {
368  reset( ~lhs );
369  return;
370  }
371 
372  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
373  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
374 
375  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
376  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
377  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
378  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
379  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
380  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
381 
382  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
383  DMatTDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
384  else
385  DMatTDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
386  }
388  //**********************************************************************************************
389 
390  //**Default assignment to dense matrices********************************************************
404  template< typename MT3 // Type of the left-hand side target matrix
405  , typename MT4 // Type of the left-hand side matrix operand
406  , typename MT5 > // Type of the right-hand side matrix operand
407  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
408  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
409  {
410  const size_t M( A.rows() );
411  const size_t N( B.columns() );
412  const size_t K( A.columns() );
413 
414  for( size_t i=0UL; i<M; ++i ) {
415  for( size_t j=0UL; j<N; ++j ) {
416  C(i,j) = A(i,0UL) * B(0UL,j);
417  }
418  for( size_t k=1UL; k<K; ++k ) {
419  for( size_t j=0UL; j<N; ++j ) {
420  C(i,j) += A(i,k) * B(k,j);
421  }
422  }
423  }
424  }
426  //**********************************************************************************************
427 
428  //**Vectorized default assignment to row-major dense matrices***********************************
442  template< typename MT3 // Type of the left-hand side target matrix
443  , typename MT4 // Type of the left-hand side matrix operand
444  , typename MT5 > // Type of the right-hand side matrix operand
445  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
446  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
447  {
448  typedef IntrinsicTrait<ElementType> IT;
449 
450  const size_t M( A.rows() );
451  const size_t N( B.columns() );
452  const size_t K( A.columns() );
453 
454  size_t i( 0UL );
455 
456  for( ; (i+2UL) <= M; i+=2UL ) {
457  size_t j( 0UL );
458  for( ; (j+4UL) <= N; j+=4UL ) {
459  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
460  for( size_t k=0UL; k<K; k+=IT::size ) {
461  const IntrinsicType a1( A.get(i ,k) );
462  const IntrinsicType a2( A.get(i+1UL,k) );
463  const IntrinsicType b1( B.get(k,j ) );
464  const IntrinsicType b2( B.get(k,j+1UL) );
465  const IntrinsicType b3( B.get(k,j+2UL) );
466  const IntrinsicType b4( B.get(k,j+3UL) );
467  xmm1 = xmm1 + a1 * b1;
468  xmm2 = xmm2 + a1 * b2;
469  xmm3 = xmm3 + a1 * b3;
470  xmm4 = xmm4 + a1 * b4;
471  xmm5 = xmm5 + a2 * b1;
472  xmm6 = xmm6 + a2 * b2;
473  xmm7 = xmm7 + a2 * b3;
474  xmm8 = xmm8 + a2 * b4;
475  }
476  (~C)(i ,j ) = sum( xmm1 );
477  (~C)(i ,j+1UL) = sum( xmm2 );
478  (~C)(i ,j+2UL) = sum( xmm3 );
479  (~C)(i ,j+3UL) = sum( xmm4 );
480  (~C)(i+1UL,j ) = sum( xmm5 );
481  (~C)(i+1UL,j+1UL) = sum( xmm6 );
482  (~C)(i+1UL,j+2UL) = sum( xmm7 );
483  (~C)(i+1UL,j+3UL) = sum( xmm8 );
484  }
485  for( ; (j+2UL) <= N; j+=2UL ) {
486  IntrinsicType xmm1, xmm2, xmm3, xmm4;
487  for( size_t k=0UL; k<K; k+=IT::size ) {
488  const IntrinsicType a1( A.get(i ,k) );
489  const IntrinsicType a2( A.get(i+1UL,k) );
490  const IntrinsicType b1( B.get(k,j ) );
491  const IntrinsicType b2( B.get(k,j+1UL) );
492  xmm1 = xmm1 + a1 * b1;
493  xmm2 = xmm2 + a1 * b2;
494  xmm3 = xmm3 + a2 * b1;
495  xmm4 = xmm4 + a2 * b2;
496  }
497  (~C)(i ,j ) = sum( xmm1 );
498  (~C)(i ,j+1UL) = sum( xmm2 );
499  (~C)(i+1UL,j ) = sum( xmm3 );
500  (~C)(i+1UL,j+1UL) = sum( xmm4 );
501  }
502  if( j < N ) {
503  IntrinsicType xmm1, xmm2;
504  for( size_t k=0UL; k<K; k+=IT::size ) {
505  const IntrinsicType b1( B.get(k,j) );
506  xmm1 = xmm1 + A.get(i ,k) * b1;
507  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
508  }
509  (~C)(i ,j) = sum( xmm1 );
510  (~C)(i+1UL,j) = sum( xmm2 );
511  }
512  }
513  if( i < M ) {
514  size_t j( 0UL );
515  for( ; (j+4UL) <= N; j+=4UL ) {
516  IntrinsicType xmm1, xmm2, xmm3, xmm4;
517  for( size_t k=0UL; k<K; k+=IT::size ) {
518  const IntrinsicType a1( A.get(i,k) );
519  xmm1 = xmm1 + a1 * B.get(k,j );
520  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
521  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
522  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
523  }
524  (~C)(i,j ) = sum( xmm1 );
525  (~C)(i,j+1UL) = sum( xmm2 );
526  (~C)(i,j+2UL) = sum( xmm3 );
527  (~C)(i,j+3UL) = sum( xmm4 );
528  }
529  for( ; (j+2UL) <= N; j+=2UL ) {
530  IntrinsicType xmm1, xmm2;
531  for( size_t k=0UL; k<K; k+=IT::size ) {
532  const IntrinsicType a1( A.get(i,k) );
533  xmm1 = xmm1 + a1 * B.get(k,j );
534  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
535  }
536  (~C)(i,j ) = sum( xmm1 );
537  (~C)(i,j+1UL) = sum( xmm2 );
538  }
539  if( j < N ) {
540  IntrinsicType xmm1, xmm2;
541  for( size_t k=0UL; k<K; k+=IT::size ) {
542  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
543  }
544  (~C)(i,j) = sum( xmm1 );
545  }
546  }
547  }
549  //**********************************************************************************************
550 
551  //**Vectorized default assignment to column-major dense matrices********************************
565  template< typename MT3 // Type of the left-hand side target matrix
566  , typename MT4 // Type of the left-hand side matrix operand
567  , typename MT5 > // Type of the right-hand side matrix operand
568  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
569  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
570  {
571  typedef IntrinsicTrait<ElementType> IT;
572 
573  const size_t M( A.rows() );
574  const size_t N( B.columns() );
575  const size_t K( A.columns() );
576 
577  size_t i( 0UL );
578 
579  for( ; (i+4UL) <= M; i+=4UL ) {
580  size_t j( 0UL );
581  for( ; (j+2UL) <= N; j+=2UL ) {
582  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
583  for( size_t k=0UL; k<K; k+=IT::size ) {
584  const IntrinsicType a1( A.get(i ,k) );
585  const IntrinsicType a2( A.get(i+1UL,k) );
586  const IntrinsicType a3( A.get(i+2UL,k) );
587  const IntrinsicType a4( A.get(i+3UL,k) );
588  const IntrinsicType b1( B.get(k,j ) );
589  const IntrinsicType b2( B.get(k,j+1UL) );
590  xmm1 = xmm1 + a1 * b1;
591  xmm2 = xmm2 + a1 * b2;
592  xmm3 = xmm3 + a2 * b1;
593  xmm4 = xmm4 + a2 * b2;
594  xmm5 = xmm5 + a3 * b1;
595  xmm6 = xmm6 + a3 * b2;
596  xmm7 = xmm7 + a4 * b1;
597  xmm8 = xmm8 + a4 * b2;
598  }
599  (~C)(i ,j ) = sum( xmm1 );
600  (~C)(i ,j+1UL) = sum( xmm2 );
601  (~C)(i+1UL,j ) = sum( xmm3 );
602  (~C)(i+1UL,j+1UL) = sum( xmm4 );
603  (~C)(i+2UL,j ) = sum( xmm5 );
604  (~C)(i+2UL,j+1UL) = sum( xmm6 );
605  (~C)(i+3UL,j ) = sum( xmm7 );
606  (~C)(i+3UL,j+1UL) = sum( xmm8 );
607  }
608  if( j < N ) {
609  IntrinsicType xmm1, xmm2, xmm3, xmm4;
610  for( size_t k=0UL; k<K; k+=IT::size ) {
611  const IntrinsicType b1( B.get(k,j) );
612  xmm1 = xmm1 + A.get(i ,k) * b1;
613  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
614  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
615  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
616  }
617  (~C)(i ,j) = sum( xmm1 );
618  (~C)(i+1UL,j) = sum( xmm2 );
619  (~C)(i+2UL,j) = sum( xmm3 );
620  (~C)(i+3UL,j) = sum( xmm4 );
621  }
622  }
623  for( ; (i+2UL) <= M; i+=2UL ) {
624  size_t j( 0UL );
625  for( ; (j+2UL) <= N; j+=2UL ) {
626  IntrinsicType xmm1, xmm2, xmm3, xmm4;
627  for( size_t k=0UL; k<K; k+=IT::size ) {
628  const IntrinsicType a1( A.get(i ,k) );
629  const IntrinsicType a2( A.get(i+1UL,k) );
630  const IntrinsicType b1( B.get(k,j ) );
631  const IntrinsicType b2( B.get(k,j+1UL) );
632  xmm1 = xmm1 + a1 * b1;
633  xmm2 = xmm2 + a1 * b2;
634  xmm3 = xmm3 + a2 * b1;
635  xmm4 = xmm4 + a2 * b2;
636  }
637  (~C)(i ,j ) = sum( xmm1 );
638  (~C)(i ,j+1UL) = sum( xmm2 );
639  (~C)(i+1UL,j ) = sum( xmm3 );
640  (~C)(i+1UL,j+1UL) = sum( xmm4 );
641  }
642  if( j < N ) {
643  IntrinsicType xmm1, xmm2;
644  for( size_t k=0UL; k<K; k+=IT::size ) {
645  const IntrinsicType b1( B.get(k,j) );
646  xmm1 = xmm1 + A.get(i ,k) * b1;
647  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
648  }
649  (~C)(i ,j) = sum( xmm1 );
650  (~C)(i+1UL,j) = sum( xmm2 );
651  }
652  }
653  if( i < M ) {
654  size_t j( 0UL );
655  for( ; (j+2UL) <= N; j+=2UL ) {
656  IntrinsicType xmm1, xmm2;
657  for( size_t k=0UL; k<K; k+=IT::size ) {
658  const IntrinsicType a1( A.get(i,k) );
659  xmm1 = xmm1 + a1 * B.get(k,j );
660  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
661  }
662  (~C)(i,j ) = sum( xmm1 );
663  (~C)(i,j+1UL) = sum( xmm2 );
664  }
665  if( j < N ) {
666  IntrinsicType xmm1, xmm2;
667  for( size_t k=0UL; k<K; k+=IT::size ) {
668  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
669  }
670  (~C)(i,j) = sum( xmm1 );
671  }
672  }
673  }
675  //**********************************************************************************************
676 
677  //**BLAS-based assignment to dense matrices (default)*******************************************
691  template< typename MT3 // Type of the left-hand side target matrix
692  , typename MT4 // Type of the left-hand side matrix operand
693  , typename MT5 > // Type of the right-hand side matrix operand
694  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
695  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
696  {
697  selectDefaultAssignKernel( C, A, B );
698  }
700  //**********************************************************************************************
701 
702  //**BLAS-based assignment to dense matrices (single precision)**********************************
703 #if BLAZE_BLAS_MODE
704 
717  template< typename MT3 // Type of the left-hand side target matrix
718  , typename MT4 // Type of the left-hand side matrix operand
719  , typename MT5 > // Type of the right-hand side matrix operand
720  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
721  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
722  {
723  using boost::numeric_cast;
724 
725  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
726  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
727  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
728 
729  const int M ( numeric_cast<int>( A.rows() ) );
730  const int N ( numeric_cast<int>( B.columns() ) );
731  const int K ( numeric_cast<int>( A.columns() ) );
732  const int lda( numeric_cast<int>( A.spacing() ) );
733  const int ldb( numeric_cast<int>( B.spacing() ) );
734  const int ldc( numeric_cast<int>( C.spacing() ) );
735 
736  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
737  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
738  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
739  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
740  }
742 #endif
743  //**********************************************************************************************
744 
745  //**BLAS-based assignment to dense matrices (double precision)**********************************
746 #if BLAZE_BLAS_MODE
747 
760  template< typename MT3 // Type of the left-hand side target matrix
761  , typename MT4 // Type of the left-hand side matrix operand
762  , typename MT5 > // Type of the right-hand side matrix operand
763  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
764  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
765  {
766  using boost::numeric_cast;
767 
768  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
769  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
770  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
771 
772  const int M ( numeric_cast<int>( A.rows() ) );
773  const int N ( numeric_cast<int>( B.columns() ) );
774  const int K ( numeric_cast<int>( A.columns() ) );
775  const int lda( numeric_cast<int>( A.spacing() ) );
776  const int ldb( numeric_cast<int>( B.spacing() ) );
777  const int ldc( numeric_cast<int>( C.spacing() ) );
778 
779  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
780  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
781  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
782  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
783  }
785 #endif
786  //**********************************************************************************************
787 
788  //**BLAS-based assignment to dense matrices (single precision complex)**************************
789 #if BLAZE_BLAS_MODE
790 
803  template< typename MT3 // Type of the left-hand side target matrix
804  , typename MT4 // Type of the left-hand side matrix operand
805  , typename MT5 > // Type of the right-hand side matrix operand
806  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
807  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
808  {
809  using boost::numeric_cast;
810 
811  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
812  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
813  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
814  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
815  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
816  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
817 
818  const int M ( numeric_cast<int>( A.rows() ) );
819  const int N ( numeric_cast<int>( B.columns() ) );
820  const int K ( numeric_cast<int>( A.columns() ) );
821  const int lda( numeric_cast<int>( A.spacing() ) );
822  const int ldb( numeric_cast<int>( B.spacing() ) );
823  const int ldc( numeric_cast<int>( C.spacing() ) );
824  const complex<float> alpha( 1.0F, 0.0F );
825  const complex<float> beta ( 0.0F, 0.0F );
826 
827  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
828  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
829  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
830  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
831  }
833 #endif
834  //**********************************************************************************************
835 
836  //**BLAS-based assignment to dense matrices (double precision complex)**************************
837 #if BLAZE_BLAS_MODE
838 
851  template< typename MT3 // Type of the left-hand side target matrix
852  , typename MT4 // Type of the left-hand side matrix operand
853  , typename MT5 > // Type of the right-hand side matrix operand
854  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
855  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
856  {
857  using boost::numeric_cast;
858 
859  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
860  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
861  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
862  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
863  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
864  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
865 
866  const int M ( numeric_cast<int>( A.rows() ) );
867  const int N ( numeric_cast<int>( B.columns() ) );
868  const int K ( numeric_cast<int>( A.columns() ) );
869  const int lda( numeric_cast<int>( A.spacing() ) );
870  const int ldb( numeric_cast<int>( B.spacing() ) );
871  const int ldc( numeric_cast<int>( C.spacing() ) );
872  const complex<double> alpha( 1.0, 0.0 );
873  const complex<double> beta ( 0.0, 0.0 );
874 
875  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
876  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
877  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
878  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
879  }
881 #endif
882  //**********************************************************************************************
883 
884  //**Assignment to sparse matrices***************************************************************
896  template< typename MT // Type of the target sparse matrix
897  , bool SO > // Storage order of the target sparse matrix
898  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
899  {
901 
902  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
903 
909  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
910 
911  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
912  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
913 
914  const TmpType tmp( rhs );
915  assign( ~lhs, tmp );
916  }
918  //**********************************************************************************************
919 
920  //**Addition assignment to dense matrices*******************************************************
933  template< typename MT // Type of the target dense matrix
934  , bool SO > // Storage order of the target dense matrix
935  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
936  {
938 
939  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
940  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
941 
942  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
943  return;
944  }
945 
946  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
947  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
948 
949  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
950  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
951  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
952  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
953  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
954  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
955 
956  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
957  DMatTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
958  else
959  DMatTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
960  }
962  //**********************************************************************************************
963 
964  //**Default addition assignment to dense matrices***********************************************
978  template< typename MT3 // Type of the left-hand side target matrix
979  , typename MT4 // Type of the left-hand side matrix operand
980  , typename MT5 > // Type of the right-hand side matrix operand
981  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
982  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
983  {
984  const size_t M( A.rows() );
985  const size_t N( B.columns() );
986  const size_t K( A.columns() );
987 
988  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
989  const size_t end( N & size_t(-2) );
990 
991  for( size_t i=0UL; i<M; ++i ) {
992  for( size_t k=0UL; k<K; ++k ) {
993  for( size_t j=0UL; j<end; j+=2UL ) {
994  C(i,j ) += A(i,k) * B(k,j );
995  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
996  }
997  if( end < N ) {
998  C(i,end) += A(i,k) * B(k,end);
999  }
1000  }
1001  }
1002  }
1004  //**********************************************************************************************
1005 
1006  //**Vectorized default addition assignment to row-major dense matrices**************************
1020  template< typename MT3 // Type of the left-hand side target matrix
1021  , typename MT4 // Type of the left-hand side matrix operand
1022  , typename MT5 > // Type of the right-hand side matrix operand
1023  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1024  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1025  {
1026  typedef IntrinsicTrait<ElementType> IT;
1027 
1028  const size_t M( A.rows() );
1029  const size_t N( B.columns() );
1030  const size_t K( A.columns() );
1031 
1032  size_t i( 0UL );
1033 
1034  for( ; (i+2UL) <= M; i+=2UL ) {
1035  size_t j( 0UL );
1036  for( ; (j+4UL) <= N; j+=4UL ) {
1037  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1038  for( size_t k=0UL; k<K; k+=IT::size ) {
1039  const IntrinsicType a1( A.get(i ,k) );
1040  const IntrinsicType a2( A.get(i+1UL,k) );
1041  const IntrinsicType b1( B.get(k,j ) );
1042  const IntrinsicType b2( B.get(k,j+1UL) );
1043  const IntrinsicType b3( B.get(k,j+2UL) );
1044  const IntrinsicType b4( B.get(k,j+3UL) );
1045  xmm1 = xmm1 + a1 * b1;
1046  xmm2 = xmm2 + a1 * b2;
1047  xmm3 = xmm3 + a1 * b3;
1048  xmm4 = xmm4 + a1 * b4;
1049  xmm5 = xmm5 + a2 * b1;
1050  xmm6 = xmm6 + a2 * b2;
1051  xmm7 = xmm7 + a2 * b3;
1052  xmm8 = xmm8 + a2 * b4;
1053  }
1054  (~C)(i ,j ) += sum( xmm1 );
1055  (~C)(i ,j+1UL) += sum( xmm2 );
1056  (~C)(i ,j+2UL) += sum( xmm3 );
1057  (~C)(i ,j+3UL) += sum( xmm4 );
1058  (~C)(i+1UL,j ) += sum( xmm5 );
1059  (~C)(i+1UL,j+1UL) += sum( xmm6 );
1060  (~C)(i+1UL,j+2UL) += sum( xmm7 );
1061  (~C)(i+1UL,j+3UL) += sum( xmm8 );
1062  }
1063  for( ; (j+2UL) <= N; j+=2UL ) {
1064  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1065  for( size_t k=0UL; k<K; k+=IT::size ) {
1066  const IntrinsicType a1( A.get(i ,k) );
1067  const IntrinsicType a2( A.get(i+1UL,k) );
1068  const IntrinsicType b1( B.get(k,j ) );
1069  const IntrinsicType b2( B.get(k,j+1UL) );
1070  xmm1 = xmm1 + a1 * b1;
1071  xmm2 = xmm2 + a1 * b2;
1072  xmm3 = xmm3 + a2 * b1;
1073  xmm4 = xmm4 + a2 * b2;
1074  }
1075  (~C)(i ,j ) += sum( xmm1 );
1076  (~C)(i ,j+1UL) += sum( xmm2 );
1077  (~C)(i+1UL,j ) += sum( xmm3 );
1078  (~C)(i+1UL,j+1UL) += sum( xmm4 );
1079  }
1080  if( j < N ) {
1081  IntrinsicType xmm1, xmm2;
1082  for( size_t k=0UL; k<K; k+=IT::size ) {
1083  const IntrinsicType b1( B.get(k,j) );
1084  xmm1 = xmm1 + A.get(i ,k) * b1;
1085  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1086  }
1087  (~C)(i ,j) += sum( xmm1 );
1088  (~C)(i+1UL,j) += sum( xmm2 );
1089  }
1090  }
1091  if( i < M ) {
1092  size_t j( 0UL );
1093  for( ; (j+4UL) <= N; j+=4UL ) {
1094  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1095  for( size_t k=0UL; k<K; k+=IT::size ) {
1096  const IntrinsicType a1( A.get(i,k) );
1097  xmm1 = xmm1 + a1 * B.get(k,j );
1098  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1099  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
1100  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
1101  }
1102  (~C)(i,j ) += sum( xmm1 );
1103  (~C)(i,j+1UL) += sum( xmm2 );
1104  (~C)(i,j+2UL) += sum( xmm3 );
1105  (~C)(i,j+3UL) += sum( xmm4 );
1106  }
1107  for( ; (j+2UL) <= N; j+=2UL ) {
1108  IntrinsicType xmm1, xmm2;
1109  for( size_t k=0UL; k<K; k+=IT::size ) {
1110  const IntrinsicType a1( A.get(i,k) );
1111  xmm1 = xmm1 + a1 * B.get(k,j );
1112  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1113  }
1114  (~C)(i,j ) += sum( xmm1 );
1115  (~C)(i,j+1UL) += sum( xmm2 );
1116  }
1117  if( j < N ) {
1118  IntrinsicType xmm1, xmm2;
1119  for( size_t k=0UL; k<K; k+=IT::size ) {
1120  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1121  }
1122  (~C)(i,j) += sum( xmm1 );
1123  }
1124  }
1125  }
1127  //**********************************************************************************************
1128 
1129  //**Vectorized default addition assignment to column-major dense matrices***********************
1143  template< typename MT3 // Type of the left-hand side target matrix
1144  , typename MT4 // Type of the left-hand side matrix operand
1145  , typename MT5 > // Type of the right-hand side matrix operand
1146  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1147  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1148  {
1149  typedef IntrinsicTrait<ElementType> IT;
1150 
1151  const size_t M( A.rows() );
1152  const size_t N( B.columns() );
1153  const size_t K( A.columns() );
1154 
1155  size_t i( 0UL );
1156 
1157  for( ; (i+4UL) <= M; i+=4UL ) {
1158  size_t j( 0UL );
1159  for( ; (j+2UL) <= N; j+=2UL ) {
1160  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1161  for( size_t k=0UL; k<K; k+=IT::size ) {
1162  const IntrinsicType a1( A.get(i ,k) );
1163  const IntrinsicType a2( A.get(i+1UL,k) );
1164  const IntrinsicType a3( A.get(i+2UL,k) );
1165  const IntrinsicType a4( A.get(i+3UL,k) );
1166  const IntrinsicType b1( B.get(k,j ) );
1167  const IntrinsicType b2( B.get(k,j+1UL) );
1168  xmm1 = xmm1 + a1 * b1;
1169  xmm2 = xmm2 + a1 * b2;
1170  xmm3 = xmm3 + a2 * b1;
1171  xmm4 = xmm4 + a2 * b2;
1172  xmm5 = xmm5 + a3 * b1;
1173  xmm6 = xmm6 + a3 * b2;
1174  xmm7 = xmm7 + a4 * b1;
1175  xmm8 = xmm8 + a4 * b2;
1176  }
1177  (~C)(i ,j ) += sum( xmm1 );
1178  (~C)(i ,j+1UL) += sum( xmm2 );
1179  (~C)(i+1UL,j ) += sum( xmm3 );
1180  (~C)(i+1UL,j+1UL) += sum( xmm4 );
1181  (~C)(i+2UL,j ) += sum( xmm5 );
1182  (~C)(i+2UL,j+1UL) += sum( xmm6 );
1183  (~C)(i+3UL,j ) += sum( xmm7 );
1184  (~C)(i+3UL,j+1UL) += sum( xmm8 );
1185  }
1186  if( j < N ) {
1187  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1188  for( size_t k=0UL; k<K; k+=IT::size ) {
1189  const IntrinsicType b1( B.get(k,j) );
1190  xmm1 = xmm1 + A.get(i ,k) * b1;
1191  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1192  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
1193  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
1194  }
1195  (~C)(i ,j) += sum( xmm1 );
1196  (~C)(i+1UL,j) += sum( xmm2 );
1197  (~C)(i+2UL,j) += sum( xmm3 );
1198  (~C)(i+3UL,j) += sum( xmm4 );
1199  }
1200  }
1201  for( ; (i+2UL) <= M; i+=2UL ) {
1202  size_t j( 0UL );
1203  for( ; (j+2UL) <= N; j+=2UL ) {
1204  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1205  for( size_t k=0UL; k<K; k+=IT::size ) {
1206  const IntrinsicType a1( A.get(i ,k) );
1207  const IntrinsicType a2( A.get(i+1UL,k) );
1208  const IntrinsicType b1( B.get(k,j ) );
1209  const IntrinsicType b2( B.get(k,j+1UL) );
1210  xmm1 = xmm1 + a1 * b1;
1211  xmm2 = xmm2 + a1 * b2;
1212  xmm3 = xmm3 + a2 * b1;
1213  xmm4 = xmm4 + a2 * b2;
1214  }
1215  (~C)(i ,j ) += sum( xmm1 );
1216  (~C)(i ,j+1UL) += sum( xmm2 );
1217  (~C)(i+1UL,j ) += sum( xmm3 );
1218  (~C)(i+1UL,j+1UL) += sum( xmm4 );
1219  }
1220  if( j < N ) {
1221  IntrinsicType xmm1, xmm2;
1222  for( size_t k=0UL; k<K; k+=IT::size ) {
1223  const IntrinsicType b1( B.get(k,j) );
1224  xmm1 = xmm1 + A.get(i ,k) * b1;
1225  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1226  }
1227  (~C)(i ,j) += sum( xmm1 );
1228  (~C)(i+1UL,j) += sum( xmm2 );
1229  }
1230  }
1231  if( i < M ) {
1232  size_t j( 0UL );
1233  for( ; (j+2UL) <= N; j+=2UL ) {
1234  IntrinsicType xmm1, xmm2;
1235  for( size_t k=0UL; k<K; k+=IT::size ) {
1236  const IntrinsicType a1( A.get(i,k) );
1237  xmm1 = xmm1 + a1 * B.get(k,j );
1238  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1239  }
1240  (~C)(i,j ) += sum( xmm1 );
1241  (~C)(i,j+1UL) += sum( xmm2 );
1242  }
1243  if( j < N ) {
1244  IntrinsicType xmm1, xmm2;
1245  for( size_t k=0UL; k<K; k+=IT::size ) {
1246  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1247  }
1248  (~C)(i,j) += sum( xmm1 );
1249  }
1250  }
1251  }
1253  //**********************************************************************************************
1254 
1255  //**Default addition assignment to dense matrices***********************************************
1269  template< typename MT3 // Type of the left-hand side target matrix
1270  , typename MT4 // Type of the left-hand side matrix operand
1271  , typename MT5 > // Type of the right-hand side matrix operand
1272  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1273  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1274  {
1275  selectDefaultAddAssignKernel( C, A, B );
1276  }
1278  //**********************************************************************************************
1279 
1280  //**BLAS-based addition assignment to dense matrices (single precision)*************************
1281 #if BLAZE_BLAS_MODE
1282 
1295  template< typename MT3 // Type of the left-hand side target matrix
1296  , typename MT4 // Type of the left-hand side matrix operand
1297  , typename MT5 > // Type of the right-hand side matrix operand
1298  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1299  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1300  {
1301  using boost::numeric_cast;
1302 
1303  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
1304  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
1305  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
1306 
1307  const int M ( numeric_cast<int>( A.rows() ) );
1308  const int N ( numeric_cast<int>( B.columns() ) );
1309  const int K ( numeric_cast<int>( A.columns() ) );
1310  const int lda( numeric_cast<int>( A.spacing() ) );
1311  const int ldb( numeric_cast<int>( B.spacing() ) );
1312  const int ldc( numeric_cast<int>( C.spacing() ) );
1313 
1314  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1315  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1316  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1317  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1318  }
1320 #endif
1321  //**********************************************************************************************
1322 
1323  //**BLAS-based addition assignment to dense matrices (double precision)*************************
1324 #if BLAZE_BLAS_MODE
1325 
1338  template< typename MT3 // Type of the left-hand side target matrix
1339  , typename MT4 // Type of the left-hand side matrix operand
1340  , typename MT5 > // Type of the right-hand side matrix operand
1341  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1342  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1343  {
1344  using boost::numeric_cast;
1345 
1346  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
1347  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
1348  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
1349 
1350  const int M ( numeric_cast<int>( A.rows() ) );
1351  const int N ( numeric_cast<int>( B.columns() ) );
1352  const int K ( numeric_cast<int>( A.columns() ) );
1353  const int lda( numeric_cast<int>( A.spacing() ) );
1354  const int ldb( numeric_cast<int>( B.spacing() ) );
1355  const int ldc( numeric_cast<int>( C.spacing() ) );
1356 
1357  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1358  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1359  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1360  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1361  }
1363 #endif
1364  //**********************************************************************************************
1365 
1366  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
1367 #if BLAZE_BLAS_MODE
1368 
1381  template< typename MT3 // Type of the left-hand side target matrix
1382  , typename MT4 // Type of the left-hand side matrix operand
1383  , typename MT5 > // Type of the right-hand side matrix operand
1384  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1385  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1386  {
1387  using boost::numeric_cast;
1388 
1389  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1390  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1391  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1392  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1393  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1394  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1395 
1396  const int M ( numeric_cast<int>( A.rows() ) );
1397  const int N ( numeric_cast<int>( B.columns() ) );
1398  const int K ( numeric_cast<int>( A.columns() ) );
1399  const int lda( numeric_cast<int>( A.spacing() ) );
1400  const int ldb( numeric_cast<int>( B.spacing() ) );
1401  const int ldc( numeric_cast<int>( C.spacing() ) );
1402  const complex<float> alpha( 1.0F, 0.0F );
1403  const complex<float> beta ( 1.0F, 0.0F );
1404 
1405  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1406  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1407  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1408  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1409  }
1411 #endif
1412  //**********************************************************************************************
1413 
1414  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
1415 #if BLAZE_BLAS_MODE
1416 
1429  template< typename MT3 // Type of the left-hand side target matrix
1430  , typename MT4 // Type of the left-hand side matrix operand
1431  , typename MT5 > // Type of the right-hand side matrix operand
1432  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1433  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1434  {
1435  using boost::numeric_cast;
1436 
1437  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1438  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1439  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1440  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1441  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1442  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1443 
1444  const int M ( numeric_cast<int>( A.rows() ) );
1445  const int N ( numeric_cast<int>( B.columns() ) );
1446  const int K ( numeric_cast<int>( A.columns() ) );
1447  const int lda( numeric_cast<int>( A.spacing() ) );
1448  const int ldb( numeric_cast<int>( B.spacing() ) );
1449  const int ldc( numeric_cast<int>( C.spacing() ) );
1450  const complex<double> alpha( 1.0, 0.0 );
1451  const complex<double> beta ( 1.0, 0.0 );
1452 
1453  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1454  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1455  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1456  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1457  }
1459 #endif
1460  //**********************************************************************************************
1461 
1462  //**Addition assignment to sparse matrices******************************************************
1463  // No special implementation for the addition assignment to sparse matrices.
1464  //**********************************************************************************************
1465 
1466  //**Subtraction assignment to dense matrices****************************************************
1479  template< typename MT // Type of the target dense matrix
1480  , bool SO > // Storage order of the target dense matrix
1481  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
1482  {
1484 
1485  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1486  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1487 
1488  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1489  return;
1490  }
1491 
1492  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1493  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1494 
1495  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1496  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1497  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1498  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1499  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1500  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1501 
1502  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
1503  DMatTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1504  else
1505  DMatTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
1506  }
1508  //**********************************************************************************************
1509 
1510  //**Default subtraction assignment to dense matrices********************************************
1524  template< typename MT3 // Type of the left-hand side target matrix
1525  , typename MT4 // Type of the left-hand side matrix operand
1526  , typename MT5 > // Type of the right-hand side matrix operand
1527  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1528  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1529  {
1530  const size_t M( A.rows() );
1531  const size_t N( B.columns() );
1532  const size_t K( A.columns() );
1533 
1534  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1535  const size_t end( N & size_t(-2) );
1536 
1537  for( size_t i=0UL; i<M; ++i ) {
1538  for( size_t k=0UL; k<K; ++k ) {
1539  for( size_t j=0UL; j<end; j+=2UL ) {
1540  C(i,j ) -= A(i,k) * B(k,j );
1541  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
1542  }
1543  if( end < N ) {
1544  C(i,end) -= A(i,k) * B(k,end);
1545  }
1546  }
1547  }
1548  }
1550  //**********************************************************************************************
1551 
1552  //**Vectorized default subtraction assignment to row-major dense matrices************************
1566  template< typename MT3 // Type of the left-hand side target matrix
1567  , typename MT4 // Type of the left-hand side matrix operand
1568  , typename MT5 > // Type of the right-hand side matrix operand
1569  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1570  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1571  {
1572  typedef IntrinsicTrait<ElementType> IT;
1573 
1574  const size_t M( A.rows() );
1575  const size_t N( B.columns() );
1576  const size_t K( A.columns() );
1577 
1578  size_t i( 0UL );
1579 
1580  for( ; (i+2UL) <= M; i+=2UL ) {
1581  size_t j( 0UL );
1582  for( ; (j+4UL) <= N; j+=4UL ) {
1583  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1584  for( size_t k=0UL; k<K; k+=IT::size ) {
1585  const IntrinsicType a1( A.get(i ,k) );
1586  const IntrinsicType a2( A.get(i+1UL,k) );
1587  const IntrinsicType b1( B.get(k,j ) );
1588  const IntrinsicType b2( B.get(k,j+1UL) );
1589  const IntrinsicType b3( B.get(k,j+2UL) );
1590  const IntrinsicType b4( B.get(k,j+3UL) );
1591  xmm1 = xmm1 + a1 * b1;
1592  xmm2 = xmm2 + a1 * b2;
1593  xmm3 = xmm3 + a1 * b3;
1594  xmm4 = xmm4 + a1 * b4;
1595  xmm5 = xmm5 + a2 * b1;
1596  xmm6 = xmm6 + a2 * b2;
1597  xmm7 = xmm7 + a2 * b3;
1598  xmm8 = xmm8 + a2 * b4;
1599  }
1600  (~C)(i ,j ) -= sum( xmm1 );
1601  (~C)(i ,j+1UL) -= sum( xmm2 );
1602  (~C)(i ,j+2UL) -= sum( xmm3 );
1603  (~C)(i ,j+3UL) -= sum( xmm4 );
1604  (~C)(i+1UL,j ) -= sum( xmm5 );
1605  (~C)(i+1UL,j+1UL) -= sum( xmm6 );
1606  (~C)(i+1UL,j+2UL) -= sum( xmm7 );
1607  (~C)(i+1UL,j+3UL) -= sum( xmm8 );
1608  }
1609  for( ; (j+2UL) <= N; j+=2UL ) {
1610  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1611  for( size_t k=0UL; k<K; k+=IT::size ) {
1612  const IntrinsicType a1( A.get(i ,k) );
1613  const IntrinsicType a2( A.get(i+1UL,k) );
1614  const IntrinsicType b1( B.get(k,j ) );
1615  const IntrinsicType b2( B.get(k,j+1UL) );
1616  xmm1 = xmm1 + a1 * b1;
1617  xmm2 = xmm2 + a1 * b2;
1618  xmm3 = xmm3 + a2 * b1;
1619  xmm4 = xmm4 + a2 * b2;
1620  }
1621  (~C)(i ,j ) -= sum( xmm1 );
1622  (~C)(i ,j+1UL) -= sum( xmm2 );
1623  (~C)(i+1UL,j ) -= sum( xmm3 );
1624  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
1625  }
1626  if( j < N ) {
1627  IntrinsicType xmm1, xmm2;
1628  for( size_t k=0UL; k<K; k+=IT::size ) {
1629  const IntrinsicType b1( B.get(k,j) );
1630  xmm1 = xmm1 + A.get(i ,k) * b1;
1631  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1632  }
1633  (~C)(i ,j) -= sum( xmm1 );
1634  (~C)(i+1UL,j) -= sum( xmm2 );
1635  }
1636  }
1637  if( i < M ) {
1638  size_t j( 0UL );
1639  for( ; (j+4UL) <= N; j+=4UL ) {
1640  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1641  for( size_t k=0UL; k<K; k+=IT::size ) {
1642  const IntrinsicType a1( A.get(i,k) );
1643  xmm1 = xmm1 + a1 * B.get(k,j );
1644  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1645  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
1646  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
1647  }
1648  (~C)(i,j ) -= sum( xmm1 );
1649  (~C)(i,j+1UL) -= sum( xmm2 );
1650  (~C)(i,j+2UL) -= sum( xmm3 );
1651  (~C)(i,j+3UL) -= sum( xmm4 );
1652  }
1653  for( ; (j+2UL) <= N; j+=2UL ) {
1654  IntrinsicType xmm1, xmm2;
1655  for( size_t k=0UL; k<K; k+=IT::size ) {
1656  const IntrinsicType a1( A.get(i,k) );
1657  xmm1 = xmm1 + a1 * B.get(k,j );
1658  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1659  }
1660  (~C)(i,j ) -= sum( xmm1 );
1661  (~C)(i,j+1UL) -= sum( xmm2 );
1662  }
1663  if( j < N ) {
1664  IntrinsicType xmm1, xmm2;
1665  for( size_t k=0UL; k<K; k+=IT::size ) {
1666  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1667  }
1668  (~C)(i,j) -= sum( xmm1 );
1669  }
1670  }
1671  }
1673  //**********************************************************************************************
1674 
1675  //**Vectorized default subtraction assignment to column-major dense matrices*********************
1689  template< typename MT3 // Type of the left-hand side target matrix
1690  , typename MT4 // Type of the left-hand side matrix operand
1691  , typename MT5 > // Type of the right-hand side matrix operand
1692  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1693  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1694  {
1695  typedef IntrinsicTrait<ElementType> IT;
1696 
1697  const size_t M( A.rows() );
1698  const size_t N( B.columns() );
1699  const size_t K( A.columns() );
1700 
1701  size_t i( 0UL );
1702 
1703  for( ; (i+4UL) <= M; i+=4UL ) {
1704  size_t j( 0UL );
1705  for( ; (j+2UL) <= N; j+=2UL ) {
1706  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1707  for( size_t k=0UL; k<K; k+=IT::size ) {
1708  const IntrinsicType a1( A.get(i ,k) );
1709  const IntrinsicType a2( A.get(i+1UL,k) );
1710  const IntrinsicType a3( A.get(i+2UL,k) );
1711  const IntrinsicType a4( A.get(i+3UL,k) );
1712  const IntrinsicType b1( B.get(k,j ) );
1713  const IntrinsicType b2( B.get(k,j+1UL) );
1714  xmm1 = xmm1 + a1 * b1;
1715  xmm2 = xmm2 + a1 * b2;
1716  xmm3 = xmm3 + a2 * b1;
1717  xmm4 = xmm4 + a2 * b2;
1718  xmm5 = xmm5 + a3 * b1;
1719  xmm6 = xmm6 + a3 * b2;
1720  xmm7 = xmm7 + a4 * b1;
1721  xmm8 = xmm8 + a4 * b2;
1722  }
1723  (~C)(i ,j ) -= sum( xmm1 );
1724  (~C)(i ,j+1UL) -= sum( xmm2 );
1725  (~C)(i+1UL,j ) -= sum( xmm3 );
1726  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
1727  (~C)(i+2UL,j ) -= sum( xmm5 );
1728  (~C)(i+2UL,j+1UL) -= sum( xmm6 );
1729  (~C)(i+3UL,j ) -= sum( xmm7 );
1730  (~C)(i+3UL,j+1UL) -= sum( xmm8 );
1731  }
1732  if( j < N ) {
1733  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1734  for( size_t k=0UL; k<K; k+=IT::size ) {
1735  const IntrinsicType b1( B.get(k,j) );
1736  xmm1 = xmm1 + A.get(i ,k) * b1;
1737  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1738  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
1739  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
1740  }
1741  (~C)(i ,j) -= sum( xmm1 );
1742  (~C)(i+1UL,j) -= sum( xmm2 );
1743  (~C)(i+2UL,j) -= sum( xmm3 );
1744  (~C)(i+3UL,j) -= sum( xmm4 );
1745  }
1746  }
1747  for( ; (i+2UL) <= M; i+=2UL ) {
1748  size_t j( 0UL );
1749  for( ; (j+2UL) <= N; j+=2UL ) {
1750  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1751  for( size_t k=0UL; k<K; k+=IT::size ) {
1752  const IntrinsicType a1( A.get(i ,k) );
1753  const IntrinsicType a2( A.get(i+1UL,k) );
1754  const IntrinsicType b1( B.get(k,j ) );
1755  const IntrinsicType b2( B.get(k,j+1UL) );
1756  xmm1 = xmm1 + a1 * b1;
1757  xmm2 = xmm2 + a1 * b2;
1758  xmm3 = xmm3 + a2 * b1;
1759  xmm4 = xmm4 + a2 * b2;
1760  }
1761  (~C)(i ,j ) -= sum( xmm1 );
1762  (~C)(i ,j+1UL) -= sum( xmm2 );
1763  (~C)(i+1UL,j ) -= sum( xmm3 );
1764  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
1765  }
1766  if( j < N ) {
1767  IntrinsicType xmm1, xmm2;
1768  for( size_t k=0UL; k<K; k+=IT::size ) {
1769  const IntrinsicType b1( B.get(k,j) );
1770  xmm1 = xmm1 + A.get(i ,k) * b1;
1771  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1772  }
1773  (~C)(i ,j) -= sum( xmm1 );
1774  (~C)(i+1UL,j) -= sum( xmm2 );
1775  }
1776  }
1777  if( i < M ) {
1778  size_t j( 0UL );
1779  for( ; (j+2UL) <= N; j+=2UL ) {
1780  IntrinsicType xmm1, xmm2;
1781  for( size_t k=0UL; k<K; k+=IT::size ) {
1782  const IntrinsicType a1( A.get(i,k) );
1783  xmm1 = xmm1 + a1 * B.get(k,j );
1784  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1785  }
1786  (~C)(i,j ) -= sum( xmm1 );
1787  (~C)(i,j+1UL) -= sum( xmm2 );
1788  }
1789  if( j < N ) {
1790  IntrinsicType xmm1, xmm2;
1791  for( size_t k=0UL; k<K; k+=IT::size ) {
1792  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1793  }
1794  (~C)(i,j) -= sum( xmm1 );
1795  }
1796  }
1797  }
1799  //**********************************************************************************************
1800 
1801  //**Default subtraction assignment to dense matrices********************************************
1815  template< typename MT3 // Type of the left-hand side target matrix
1816  , typename MT4 // Type of the left-hand side matrix operand
1817  , typename MT5 > // Type of the right-hand side matrix operand
1818  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1819  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1820  {
1821  selectDefaultSubAssignKernel( C, A, B );
1822  }
1824  //**********************************************************************************************
1825 
1826  //**BLAS-based subtraction assignment to dense matrices (single precision)***********************
1827 #if BLAZE_BLAS_MODE
1828 
1841  template< typename MT3 // Type of the left-hand side target matrix
1842  , typename MT4 // Type of the left-hand side matrix operand
1843  , typename MT5 > // Type of the right-hand side matrix operand
1844  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1845  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1846  {
1847  using boost::numeric_cast;
1848 
1849  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
1850  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
1851  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
1852 
1853  const int M ( numeric_cast<int>( A.rows() ) );
1854  const int N ( numeric_cast<int>( B.columns() ) );
1855  const int K ( numeric_cast<int>( A.columns() ) );
1856  const int lda( numeric_cast<int>( A.spacing() ) );
1857  const int ldb( numeric_cast<int>( B.spacing() ) );
1858  const int ldc( numeric_cast<int>( C.spacing() ) );
1859 
1860  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1861  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1862  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1863  M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1864  }
1866 #endif
1867  //**********************************************************************************************
1868 
1869  //**BLAS-based subtraction assignment to dense matrices (double precision)***********************
1870 #if BLAZE_BLAS_MODE
1871 
1884  template< typename MT3 // Type of the left-hand side target matrix
1885  , typename MT4 // Type of the left-hand side matrix operand
1886  , typename MT5 > // Type of the right-hand side matrix operand
1887  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1888  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1889  {
1890  using boost::numeric_cast;
1891 
1892  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
1893  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
1894  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
1895 
1896  const int M ( numeric_cast<int>( A.rows() ) );
1897  const int N ( numeric_cast<int>( B.columns() ) );
1898  const int K ( numeric_cast<int>( A.columns() ) );
1899  const int lda( numeric_cast<int>( A.spacing() ) );
1900  const int ldb( numeric_cast<int>( B.spacing() ) );
1901  const int ldc( numeric_cast<int>( C.spacing() ) );
1902 
1903  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1904  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1905  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1906  M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1907  }
1909 #endif
1910  //**********************************************************************************************
1911 
1912  //**BLAS-based subtraction assignment to dense matrices (single precision complex)***************
1913 #if BLAZE_BLAS_MODE
1914 
1927  template< typename MT3 // Type of the left-hand side target matrix
1928  , typename MT4 // Type of the left-hand side matrix operand
1929  , typename MT5 > // Type of the right-hand side matrix operand
1930  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1931  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1932  {
1933  using boost::numeric_cast;
1934 
1935  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1936  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1937  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1938  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1939  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1940  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1941 
1942  const int M ( numeric_cast<int>( A.rows() ) );
1943  const int N ( numeric_cast<int>( B.columns() ) );
1944  const int K ( numeric_cast<int>( A.columns() ) );
1945  const int lda( numeric_cast<int>( A.spacing() ) );
1946  const int ldb( numeric_cast<int>( B.spacing() ) );
1947  const int ldc( numeric_cast<int>( C.spacing() ) );
1948  const complex<float> alpha( -1.0F, 0.0F );
1949  const complex<float> beta ( 1.0F, 0.0F );
1950 
1951  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1952  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1953  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1954  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1955  }
1957 #endif
1958  //**********************************************************************************************
1959 
1960  //**BLAS-based subtraction assignment to dense matrices (double precision complex)***************
1961 #if BLAZE_BLAS_MODE
1962 
1975  template< typename MT3 // Type of the left-hand side target matrix
1976  , typename MT4 // Type of the left-hand side matrix operand
1977  , typename MT5 > // Type of the right-hand side matrix operand
1978  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1979  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1980  {
1981  using boost::numeric_cast;
1982 
1983  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1984  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1985  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1986  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1987  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1988  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1989 
1990  const int M ( numeric_cast<int>( A.rows() ) );
1991  const int N ( numeric_cast<int>( B.columns() ) );
1992  const int K ( numeric_cast<int>( A.columns() ) );
1993  const int lda( numeric_cast<int>( A.spacing() ) );
1994  const int ldb( numeric_cast<int>( B.spacing() ) );
1995  const int ldc( numeric_cast<int>( C.spacing() ) );
1996  const complex<double> alpha( -1.0, 0.0 );
1997  const complex<double> beta ( 1.0, 0.0 );
1998 
1999  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2000  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2001  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2002  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2003  }
2005 #endif
2006  //**********************************************************************************************
2007 
2008  //**Subtraction assignment to sparse matrices***************************************************
2009  // No special implementation for the subtraction assignment to sparse matrices.
2010  //**********************************************************************************************
2011 
2012  //**Multiplication assignment to dense matrices*************************************************
2013  // No special implementation for the multiplication assignment to dense matrices.
2014  //**********************************************************************************************
2015 
2016  //**Multiplication assignment to sparse matrices************************************************
2017  // No special implementation for the multiplication assignment to sparse matrices.
2018  //**********************************************************************************************
2019 
2020  //**Compile time checks*************************************************************************
2027  //**********************************************************************************************
2028 };
2029 //*************************************************************************************************
2030 
2031 
2032 
2033 
2034 //=================================================================================================
2035 //
2036 // DMATSCALARMULTEXPR SPECIALIZATION
2037 //
2038 //=================================================================================================
2039 
2040 //*************************************************************************************************
2048 template< typename MT1 // Type of the left-hand side dense matrix
2049  , typename MT2 // Type of the right-hand side dense matrix
2050  , typename ST > // Type of the right-hand side scalar value
2051 class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >
2052  : public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
2053  , private MatScalarMultExpr
2054  , private Computation
2055 {
2056  private:
2057  //**Type definitions****************************************************************************
// Private shorthand types used throughout this specialization.
2058  typedef DMatTDMatMultExpr<MT1,MT2> MMM; // Type of the matrix multiplication expression that is scaled
2059  typedef typename MMM::ResultType RES; // Result type of the matrix multiplication expression
2060  typedef typename MT1::ResultType RT1; // Result type of the left-hand side matrix operand
2061  typedef typename MT2::ResultType RT2; // Result type of the right-hand side matrix operand
2062  typedef typename MT1::CompositeType CT1; // Composite type of the left-hand side matrix operand
2063  typedef typename MT2::CompositeType CT2; // Composite type of the right-hand side matrix operand
2064  //**********************************************************************************************
2065 
2066  //**********************************************************************************************
2068 
// Compile-time switch for the single precision BLAS kernels (the cblas_sgemm based overloads
// are enabled through it). 'value' is nonzero only if IsFloat holds for the element types of
// T1, T2 and T3 and T4 is not a complex type according to IsComplex. In this file T1 is used
// for the target matrix, T2/T3 for the two matrix operands and T4 for the scalar type.
2071  template< typename T1, typename T2, typename T3, typename T4 >
2072  struct UseSinglePrecisionKernel {
2073  enum { value = IsFloat<typename T1::ElementType>::value &&
2074  IsFloat<typename T2::ElementType>::value &&
2075  IsFloat<typename T3::ElementType>::value &&
2076  !IsComplex<T4>::value };
2077  };
2078  //**********************************************************************************************
2079 
2080  //**********************************************************************************************
2082 
// Compile-time switch for the double precision BLAS kernels (the cblas_dgemm based overloads
// are enabled through it). 'value' is nonzero only if IsDouble holds for the element types of
// T1, T2 and T3 and T4 is not a complex type according to IsComplex.
2085  template< typename T1, typename T2, typename T3, typename T4 >
2086  struct UseDoublePrecisionKernel {
2087  enum { value = IsDouble<typename T1::ElementType>::value &&
2088  IsDouble<typename T2::ElementType>::value &&
2089  IsDouble<typename T3::ElementType>::value &&
2090  !IsComplex<T4>::value };
2091  };
2092  //**********************************************************************************************
2093 
2094  //**********************************************************************************************
2096 
// Compile-time switch for the single precision complex BLAS kernels (the cblas_cgemm based
// overloads are enabled through it). 'value' is nonzero only if the element types of T1, T2
// and T3 are all exactly complex<float>. The scalar type is not part of this check.
2099  template< typename T1, typename T2, typename T3 >
2100  struct UseSinglePrecisionComplexKernel {
2101  typedef complex<float> Type;
2102  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2103  IsSame<typename T2::ElementType,Type>::value &&
2104  IsSame<typename T3::ElementType,Type>::value };
2105  };
2106  //**********************************************************************************************
2107 
2108  //**********************************************************************************************
2110 
// Compile-time switch for the double precision complex BLAS kernels (the cblas_zgemm based
// overloads are enabled through it). 'value' is nonzero only if the element types of T1, T2
// and T3 are all exactly complex<double>. The scalar type is not part of this check.
2113  template< typename T1, typename T2, typename T3 >
2114  struct UseDoublePrecisionComplexKernel {
2115  typedef complex<double> Type;
2116  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2117  IsSame<typename T2::ElementType,Type>::value &&
2118  IsSame<typename T3::ElementType,Type>::value };
2119  };
2120  //**********************************************************************************************
2121 
2122  //**********************************************************************************************
2124 
// Compile-time switch for the non-BLAS fallback of the select*Blas*Kernel overloads. 'value' is
// nonzero if BLAZE_BLAS_MODE is zero, or if none of the four precision specific switches
// (single, double, single complex, double complex) applies to the given type combination.
2126  template< typename T1, typename T2, typename T3, typename T4 >
2127  struct UseDefaultKernel {
2128  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2129  !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2130  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2131  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
2132  };
2133  //**********************************************************************************************
2134 
2135  //**********************************************************************************************
2137 
// Compile-time switch between the plain and the vectorized default kernels. 'value' is nonzero
// only if T1, T2 and T3 all report 'vectorizable', the element types of T1, T2 and T3 and the
// scalar type T4 are all the same type, and IntrinsicTrait of that element type reports both
// 'addition' and 'multiplication'.
2139  template< typename T1, typename T2, typename T3, typename T4 >
2140  struct UseVectorizedDefaultKernel {
2141  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2142  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2143  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2144  IsSame<typename T1::ElementType,T4>::value &&
2145  IntrinsicTrait<typename T1::ElementType>::addition &&
2146  IntrinsicTrait<typename T1::ElementType>::multiplication };
2147  };
2148  //**********************************************************************************************
2149 
2150  public:
2151  //**Type definitions****************************************************************************
// Public type interface of the scaled matrix multiplication expression.
2152  typedef DMatScalarMultExpr<MMM,ST,false> This; // Type of this DMatScalarMultExpr instance
2153  typedef typename MultTrait<RES,ST>::Type ResultType; // Result type, taken from MultTrait<RES,ST>
2154  typedef typename ResultType::OppositeType OppositeType; // OppositeType of the result type
2155  typedef typename ResultType::TransposeType TransposeType; // TransposeType of the result type
2156  typedef typename ResultType::ElementType ElementType; // Element type of the result type
2157  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType; // Intrinsic type associated with ElementType
2158  typedef const ElementType ReturnType; // Return type of the element access operator
2159  typedef const ResultType CompositeType; // Type used when this expression is an operand of another expression
2160 
// Type of the left-hand side operand: the matrix multiplication expression, held by value
2162  typedef const DMatTDMatMultExpr<MT1,MT2> LeftOperand;
2163 
// Type of the right-hand side operand: the scalar
2165  typedef ST RightOperand;
2166 
// Type the left matrix operand is bound to inside the assignment functions.
// NOTE(review): presumably SelectType yields 'const RT1' if MT1 is a computation and CT1
// otherwise - confirm against SelectType.
2168  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
2169 
// Type the right matrix operand is bound to inside the assignment functions.
// NOTE(review): presumably SelectType yields 'const RT2' if MT2 is a computation and CT2
// otherwise - confirm against SelectType.
2171  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
2172  //**********************************************************************************************
2173 
2174  //**Compilation flags***************************************************************************
// Compilation switch for the expression template evaluation strategy: this expression always
// reports itself as not vectorizable.
2176  enum { vectorizable = 0 };
2177  //**********************************************************************************************
2178 
2179  //**Constructor*********************************************************************************
// Constructor for the scaled matrix multiplication expression.
//
// matrix  The matrix multiplication expression to be scaled.
// scalar  The scalar the product is multiplied with.
//
// Both arguments are copied into the members matrix_ and scalar_.
2185  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
2186  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
2187  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2188  {}
2189  //**********************************************************************************************
2190 
2191  //**Access operator*****************************************************************************
// 2D access to a single element of the scaled product.
//
// i  Row index; must be smaller than rows().
// j  Column index; must be smaller than columns().
//
// Returns element (i,j) of the wrapped multiplication expression multiplied by the scalar.
// The index range is only verified through BLAZE_INTERNAL_ASSERT.
2198  inline ReturnType operator()( size_t i, size_t j ) const {
2199  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
2200  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
2201  return matrix_(i,j) * scalar_;
2202  }
2203  //**********************************************************************************************
2204 
2205  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, which is the number of rows of the wrapped
// multiplication expression.
2210  inline size_t rows() const {
2211  return matrix_.rows();
2212  }
2213  //**********************************************************************************************
2214 
2215  //**Columns function****************************************************************************
// Returns the number of columns of the expression, which is the number of columns of the
// wrapped multiplication expression.
2220  inline size_t columns() const {
2221  return matrix_.columns();
2222  }
2223  //**********************************************************************************************
2224 
2225  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the wrapped matrix multiplication expression.
// LeftOperand is a non-reference type, so the caller receives a copy.
2230  inline LeftOperand leftOperand() const {
2231  return matrix_;
2232  }
2233  //**********************************************************************************************
2234 
2235  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the scalar the product is multiplied with.
2240  inline RightOperand rightOperand() const {
2241  return scalar_;
2242  }
2243  //**********************************************************************************************
2244 
2245  //**********************************************************************************************
// Returns whether the expression can alias with the given address.
//
// alias  The address to be checked.
//
// The query is forwarded unchanged to the wrapped multiplication expression; the scalar is not
// taken into account.
2251  template< typename T >
2252  inline bool canAlias( const T* alias ) const {
2253  return matrix_.canAlias( alias );
2254  }
2255  //**********************************************************************************************
2256 
2257  //**********************************************************************************************
// Returns whether the expression is aliased with the given address.
//
// alias  The address to be checked.
//
// The query is forwarded unchanged to the wrapped multiplication expression; the scalar is not
// taken into account.
2263  template< typename T >
2264  inline bool isAliased( const T* alias ) const {
2265  return matrix_.isAliased( alias );
2266  }
2267  //**********************************************************************************************
2268 
2269  private:
2270  //**Member variables****************************************************************************
2271  LeftOperand matrix_; // The wrapped matrix multiplication expression (left-hand side operand)
2272  RightOperand scalar_; // The scalar the product is multiplied with (right-hand side operand)
2273  //**********************************************************************************************
2274 
2275  //**Assignment to dense matrices****************************************************************
2284  template< typename MT3 // Type of the target dense matrix
2285  , bool SO > // Storage order of the target dense matrix
2286  friend inline void assign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2287  {
2289 
2290  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2291  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2292 
2293  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2294  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2295 
2296  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
2297  return;
2298  }
2299  else if( left.columns() == 0UL ) {
2300  reset( ~lhs );
2301  return;
2302  }
2303 
2304  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2305  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2306 
2307  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2308  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2309  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2310  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2311  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2312  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2313 
2314  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
2315  DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2316  else
2317  DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
2318  }
2319  //**********************************************************************************************
2320 
2321  //**Default assignment to dense matrices********************************************************
2335  template< typename MT3 // Type of the left-hand side target matrix
2336  , typename MT4 // Type of the left-hand side matrix operand
2337  , typename MT5 // Type of the right-hand side matrix operand
2338  , typename ST2 > // Type of the scalar value
2339  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2340  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2341  {
2342  for( size_t i=0UL; i<A.rows(); ++i ) {
2343  for( size_t k=0UL; k<B.columns(); ++k ) {
2344  C(i,k) = A(i,0UL) * B(0UL,k);
2345  }
2346  for( size_t j=1UL; j<A.columns(); ++j ) {
2347  for( size_t k=0UL; k<B.columns(); ++k ) {
2348  C(i,k) += A(i,j) * B(j,k);
2349  }
2350  }
2351  for( size_t k=0UL; k<B.columns(); ++k ) {
2352  C(i,k) *= scalar;
2353  }
2354  }
2355  }
2356  //**********************************************************************************************
2357 
2358  //**Vectorized default assignment to row-major dense matrices***********************************
2372  template< typename MT3 // Type of the left-hand side target matrix
2373  , typename MT4 // Type of the left-hand side matrix operand
2374  , typename MT5 // Type of the right-hand side matrix operand
2375  , typename ST2 > // Type of the scalar value
2376  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2377  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2378  {
2379  typedef IntrinsicTrait<ElementType> IT;
2380 
2381  const size_t M( A.rows() );
2382  const size_t N( B.columns() );
2383  const size_t K( A.columns() );
2384 
2385  size_t i( 0UL );
2386 
2387  for( ; (i+2UL) <= M; i+=2UL ) {
2388  size_t j( 0UL );
2389  for( ; (j+4UL) <= N; j+=4UL ) {
2390  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2391  for( size_t k=0UL; k<K; k+=IT::size ) {
2392  const IntrinsicType a1( A.get(i ,k) );
2393  const IntrinsicType a2( A.get(i+1UL,k) );
2394  const IntrinsicType b1( B.get(k,j ) );
2395  const IntrinsicType b2( B.get(k,j+1UL) );
2396  const IntrinsicType b3( B.get(k,j+2UL) );
2397  const IntrinsicType b4( B.get(k,j+3UL) );
2398  xmm1 = xmm1 + a1 * b1;
2399  xmm2 = xmm2 + a1 * b2;
2400  xmm3 = xmm3 + a1 * b3;
2401  xmm4 = xmm4 + a1 * b4;
2402  xmm5 = xmm5 + a2 * b1;
2403  xmm6 = xmm6 + a2 * b2;
2404  xmm7 = xmm7 + a2 * b3;
2405  xmm8 = xmm8 + a2 * b4;
2406  }
2407  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2408  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2409  (~C)(i ,j+2UL) = sum( xmm3 ) * scalar;
2410  (~C)(i ,j+3UL) = sum( xmm4 ) * scalar;
2411  (~C)(i+1UL,j ) = sum( xmm5 ) * scalar;
2412  (~C)(i+1UL,j+1UL) = sum( xmm6 ) * scalar;
2413  (~C)(i+1UL,j+2UL) = sum( xmm7 ) * scalar;
2414  (~C)(i+1UL,j+3UL) = sum( xmm8 ) * scalar;
2415  }
2416  for( ; (j+2UL) <= N; j+=2UL ) {
2417  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2418  for( size_t k=0UL; k<K; k+=IT::size ) {
2419  const IntrinsicType a1( A.get(i ,k) );
2420  const IntrinsicType a2( A.get(i+1UL,k) );
2421  const IntrinsicType b1( B.get(k,j ) );
2422  const IntrinsicType b2( B.get(k,j+1UL) );
2423  xmm1 = xmm1 + a1 * b1;
2424  xmm2 = xmm2 + a1 * b2;
2425  xmm3 = xmm3 + a2 * b1;
2426  xmm4 = xmm4 + a2 * b2;
2427  }
2428  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2429  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2430  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
2431  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
2432  }
2433  if( j < N ) {
2434  IntrinsicType xmm1, xmm2;
2435  for( size_t k=0UL; k<K; k+=IT::size ) {
2436  const IntrinsicType b1( B.get(k,j) );
2437  xmm1 = xmm1 + A.get(i ,k) * b1;
2438  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2439  }
2440  (~C)(i ,j) = sum( xmm1 ) * scalar;
2441  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
2442  }
2443  }
2444  if( i < M ) {
2445  size_t j( 0UL );
2446  for( ; (j+4UL) <= N; j+=4UL ) {
2447  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2448  for( size_t k=0UL; k<K; k+=IT::size ) {
2449  const IntrinsicType a1( A.get(i,k) );
2450  xmm1 = xmm1 + a1 * B.get(k,j );
2451  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2452  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
2453  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
2454  }
2455  (~C)(i,j ) = sum( xmm1 ) * scalar;
2456  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
2457  (~C)(i,j+2UL) = sum( xmm3 ) * scalar;
2458  (~C)(i,j+3UL) = sum( xmm4 ) * scalar;
2459  }
2460  for( ; (j+2UL) <= N; j+=2UL ) {
2461  IntrinsicType xmm1, xmm2;
2462  for( size_t k=0UL; k<K; k+=IT::size ) {
2463  const IntrinsicType a1( A.get(i,k) );
2464  xmm1 = xmm1 + a1 * B.get(k,j );
2465  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2466  }
2467  (~C)(i,j ) = sum( xmm1 ) * scalar;
2468  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
2469  }
2470  if( j < N ) {
2471  IntrinsicType xmm1, xmm2;
2472  for( size_t k=0UL; k<K; k+=IT::size ) {
2473  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
2474  }
2475  (~C)(i,j) = sum( xmm1 ) * scalar;
2476  }
2477  }
2478  }
2479  //**********************************************************************************************
2480 
2481  //**Vectorized default assignment to column-major dense matrices********************************
2495  template< typename MT3 // Type of the left-hand side target matrix
2496  , typename MT4 // Type of the left-hand side matrix operand
2497  , typename MT5 // Type of the right-hand side matrix operand
2498  , typename ST2 > // Type of the scalar value
2499  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2500  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
2501  {
2502  typedef IntrinsicTrait<ElementType> IT;
2503 
2504  const size_t M( A.rows() );
2505  const size_t N( B.columns() );
2506  const size_t K( A.columns() );
2507 
2508  size_t i( 0UL );
2509 
2510  for( ; (i+4UL) <= M; i+=4UL ) {
2511  size_t j( 0UL );
2512  for( ; (j+2UL) <= N; j+=2UL ) {
2513  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2514  for( size_t k=0UL; k<K; k+=IT::size ) {
2515  const IntrinsicType a1( A.get(i ,k) );
2516  const IntrinsicType a2( A.get(i+1UL,k) );
2517  const IntrinsicType a3( A.get(i+2UL,k) );
2518  const IntrinsicType a4( A.get(i+3UL,k) );
2519  const IntrinsicType b1( B.get(k,j ) );
2520  const IntrinsicType b2( B.get(k,j+1UL) );
2521  xmm1 = xmm1 + a1 * b1;
2522  xmm2 = xmm2 + a1 * b2;
2523  xmm3 = xmm3 + a2 * b1;
2524  xmm4 = xmm4 + a2 * b2;
2525  xmm5 = xmm5 + a3 * b1;
2526  xmm6 = xmm6 + a3 * b2;
2527  xmm7 = xmm7 + a4 * b1;
2528  xmm8 = xmm8 + a4 * b2;
2529  }
2530  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2531  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2532  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
2533  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
2534  (~C)(i+2UL,j ) = sum( xmm5 ) * scalar;
2535  (~C)(i+2UL,j+1UL) = sum( xmm6 ) * scalar;
2536  (~C)(i+3UL,j ) = sum( xmm7 ) * scalar;
2537  (~C)(i+3UL,j+1UL) = sum( xmm8 ) * scalar;
2538  }
2539  if( j < N ) {
2540  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2541  for( size_t k=0UL; k<K; k+=IT::size ) {
2542  const IntrinsicType b1( B.get(k,j) );
2543  xmm1 = xmm1 + A.get(i ,k) * b1;
2544  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2545  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
2546  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
2547  }
2548  (~C)(i ,j) = sum( xmm1 ) * scalar;
2549  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
2550  (~C)(i+2UL,j) = sum( xmm3 ) * scalar;
2551  (~C)(i+3UL,j) = sum( xmm4 ) * scalar;
2552  }
2553  }
2554  for( ; (i+2UL) <= M; i+=2UL ) {
2555  size_t j( 0UL );
2556  for( ; (j+2UL) <= N; j+=2UL ) {
2557  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2558  for( size_t k=0UL; k<K; k+=IT::size ) {
2559  const IntrinsicType a1( A.get(i ,k) );
2560  const IntrinsicType a2( A.get(i+1UL,k) );
2561  const IntrinsicType b1( B.get(k,j ) );
2562  const IntrinsicType b2( B.get(k,j+1UL) );
2563  xmm1 = xmm1 + a1 * b1;
2564  xmm2 = xmm2 + a1 * b2;
2565  xmm3 = xmm3 + a2 * b1;
2566  xmm4 = xmm4 + a2 * b2;
2567  }
2568  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2569  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2570  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
2571  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
2572  }
2573  if( j < N ) {
2574  IntrinsicType xmm1, xmm2;
2575  for( size_t k=0UL; k<K; k+=IT::size ) {
2576  const IntrinsicType b1( B.get(k,j) );
2577  xmm1 = xmm1 + A.get(i ,k) * b1;
2578  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2579  }
2580  (~C)(i ,j) = sum( xmm1 ) * scalar;
2581  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
2582  }
2583  }
2584  if( i < M ) {
2585  size_t j( 0UL );
2586  for( ; (j+2UL) <= N; j+=2UL ) {
2587  IntrinsicType xmm1, xmm2;
2588  for( size_t k=0UL; k<K; k+=IT::size ) {
2589  const IntrinsicType a1( A.get(i,k) );
2590  xmm1 = xmm1 + a1 * B.get(k,j );
2591  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2592  }
2593  (~C)(i,j ) = sum( xmm1 ) * scalar;
2594  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
2595  }
2596  if( j < N ) {
2597  IntrinsicType xmm1, xmm2;
2598  for( size_t k=0UL; k<K; k+=IT::size ) {
2599  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
2600  }
2601  (~C)(i,j) = sum( xmm1 ) * scalar;
2602  }
2603  }
2604  }
2605  //**********************************************************************************************
2606 
2607  //**BLAS-based assignment to dense matrices (default)*******************************************
// Fallback of the BLAS kernel selection for the assignment C = s * A * B.
//
// C       The target left-hand side dense matrix.
// A       The left-hand side multiplication operand.
// B       The right-hand side multiplication operand.
// scalar  The scaling factor.
//
// Enabled if UseDefaultKernel applies, i.e. if BLAS mode is off or no precision specific BLAS
// kernel matches the operand types. It simply forwards to selectDefaultAssignKernel().
2621  template< typename MT3 // Type of the left-hand side target matrix
2622  , typename MT4 // Type of the left-hand side matrix operand
2623  , typename MT5 // Type of the right-hand side matrix operand
2624  , typename ST2 > // Type of the scalar value
2625  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2626  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2627  {
2628  selectDefaultAssignKernel( C, A, B, scalar );
2629  }
2630  //**********************************************************************************************
2631 
2632  //**BLAS-based assignment to dense matrices (single precision)**********************************
2633 #if BLAZE_BLAS_MODE
2634 
2647  template< typename MT3 // Type of the left-hand side target matrix
2648  , typename MT4 // Type of the left-hand side matrix operand
2649  , typename MT5 // Type of the right-hand side matrix operand
2650  , typename ST2 > // Type of the scalar value
2651  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2652  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2653  {
2654  using boost::numeric_cast;
2655 
2656  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
2657  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
2658  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
2659 
2660  const int M ( numeric_cast<int>( A.rows() ) );
2661  const int N ( numeric_cast<int>( B.columns() ) );
2662  const int K ( numeric_cast<int>( A.columns() ) );
2663  const int lda( numeric_cast<int>( A.spacing() ) );
2664  const int ldb( numeric_cast<int>( B.spacing() ) );
2665  const int ldc( numeric_cast<int>( C.spacing() ) );
2666 
2667  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2668  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2669  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2670  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
2671  }
2672 #endif
2673  //**********************************************************************************************
2674 
2675  //**BLAS-based assignment to dense matrices (double precision)**********************************
2676 #if BLAZE_BLAS_MODE
2677 
2690  template< typename MT3 // Type of the left-hand side target matrix
2691  , typename MT4 // Type of the left-hand side matrix operand
2692  , typename MT5 // Type of the right-hand side matrix operand
2693  , typename ST2 > // Type of the scalar value
2694  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2695  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2696  {
2697  using boost::numeric_cast;
2698 
2699  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
2700  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
2701  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
2702 
2703  const int M ( numeric_cast<int>( A.rows() ) );
2704  const int N ( numeric_cast<int>( B.columns() ) );
2705  const int K ( numeric_cast<int>( A.columns() ) );
2706  const int lda( numeric_cast<int>( A.spacing() ) );
2707  const int ldb( numeric_cast<int>( B.spacing() ) );
2708  const int ldc( numeric_cast<int>( C.spacing() ) );
2709 
2710  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2711  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2712  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2713  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
2714  }
2715 #endif
2716  //**********************************************************************************************
2717 
2718  //**BLAS-based assignment to dense matrices (single precision complex)**************************
2719 #if BLAZE_BLAS_MODE
2720 
2733  template< typename MT3 // Type of the left-hand side target matrix
2734  , typename MT4 // Type of the left-hand side matrix operand
2735  , typename MT5 // Type of the right-hand side matrix operand
2736  , typename ST2 > // Type of the scalar value
2737  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2738  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2739  {
2740  using boost::numeric_cast;
2741 
2742  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2743  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2744  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2745  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
2746  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
2747  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
2748 
2749  const int M ( numeric_cast<int>( A.rows() ) );
2750  const int N ( numeric_cast<int>( B.columns() ) );
2751  const int K ( numeric_cast<int>( A.columns() ) );
2752  const int lda( numeric_cast<int>( A.spacing() ) );
2753  const int ldb( numeric_cast<int>( B.spacing() ) );
2754  const int ldc( numeric_cast<int>( C.spacing() ) );
2755  const complex<float> alpha( scalar );
2756  const complex<float> beta ( 0.0F, 0.0F );
2757 
2758  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2759  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2760  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2761  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2762  }
2763 #endif
2764  //**********************************************************************************************
2765 
2766  //**BLAS-based assignment to dense matrices (double precision complex)**************************
2767 #if BLAZE_BLAS_MODE
2768 
2781  template< typename MT3 // Type of the left-hand side target matrix
2782  , typename MT4 // Type of the left-hand side matrix operand
2783  , typename MT5 // Type of the right-hand side matrix operand
2784  , typename ST2 > // Type of the scalar value
2785  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2786  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2787  {
2788  using boost::numeric_cast;
2789 
2790  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2791  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2792  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2793  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
2794  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
2795  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
2796 
2797  const int M ( numeric_cast<int>( A.rows() ) );
2798  const int N ( numeric_cast<int>( B.columns() ) );
2799  const int K ( numeric_cast<int>( A.columns() ) );
2800  const int lda( numeric_cast<int>( A.spacing() ) );
2801  const int ldb( numeric_cast<int>( B.spacing() ) );
2802  const int ldc( numeric_cast<int>( C.spacing() ) );
2803  const complex<double> alpha( scalar );
2804  const complex<double> beta ( 0.0, 0.0 );
2805 
2806  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2807  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2808  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2809  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2810  }
2811 #endif
2812  //**********************************************************************************************
2813 
2814  //**Assignment to sparse matrices***************************************************************
// Assignment of a scaled dense matrix multiplication to a sparse matrix.
//
// lhs  The target left-hand side sparse matrix.
// rhs  The right-hand side scaled multiplication expression to be assigned.
//
// The expression is first evaluated into a dense temporary of type TmpType, which is then
// assigned to the sparse target through the assign() overload for that type.
2826  template< typename MT // Type of the target sparse matrix
2827  , bool SO > // Storage order of the target sparse matrix
2828  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
2829  {
2831 
// NOTE(review): presumably SelectType yields OppositeType for SO == true and ResultType
// otherwise, i.e. a temporary whose storage order matches the target - confirm.
2832  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
2833 
2839  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
2840 
2841  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2842  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2843 
// Evaluate the complete expression once, then hand the plain matrix to the sparse assignment
2844  const TmpType tmp( rhs );
2845  assign( ~lhs, tmp );
2846  }
2847  //**********************************************************************************************
2848 
2849  //**Addition assignment to dense matrices*******************************************************
2861  template< typename MT3 // Type of the target dense matrix
2862  , bool SO > // Storage order of the target dense matrix
2863  friend inline void addAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2864  {
2866 
2867  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2868  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2869 
2870  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2871  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2872 
2873  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
2874  return;
2875  }
2876 
2877  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2878  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2879 
2880  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2881  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2882  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2883  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2884  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2885  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2886 
2887  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
2888  DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2889  else
2890  DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2891  }
2892  //**********************************************************************************************
2893 
2894  //**Default addition assignment to dense matrices***********************************************
2908  template< typename MT3 // Type of the left-hand side target matrix
2909  , typename MT4 // Type of the left-hand side matrix operand
2910  , typename MT5 // Type of the right-hand side matrix operand
2911  , typename ST2 > // Type of the scalar value
2912  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2913  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2914  {
2915  const ResultType tmp( A * B * scalar );
2916  addAssign( C, tmp );
2917  }
2918  //**********************************************************************************************
2919 
2920  //**Vectorized default addition assignment to row-major dense matrices**************************
2934  template< typename MT3 // Type of the left-hand side target matrix
2935  , typename MT4 // Type of the left-hand side matrix operand
2936  , typename MT5 // Type of the right-hand side matrix operand
2937  , typename ST2 > // Type of the scalar value
2938  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2939  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2940  {
2941  typedef IntrinsicTrait<ElementType> IT;
2942 
2943  const size_t M( A.rows() );
2944  const size_t N( B.columns() );
2945  const size_t K( A.columns() );
2946 
2947  size_t i( 0UL );
2948 
2949  for( ; (i+2UL) <= M; i+=2UL ) {
2950  size_t j( 0UL );
2951  for( ; (j+4UL) <= N; j+=4UL ) {
2952  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2953  for( size_t k=0UL; k<K; k+=IT::size ) {
2954  const IntrinsicType a1( A.get(i ,k) );
2955  const IntrinsicType a2( A.get(i+1UL,k) );
2956  const IntrinsicType b1( B.get(k,j ) );
2957  const IntrinsicType b2( B.get(k,j+1UL) );
2958  const IntrinsicType b3( B.get(k,j+2UL) );
2959  const IntrinsicType b4( B.get(k,j+3UL) );
2960  xmm1 = xmm1 + a1 * b1;
2961  xmm2 = xmm2 + a1 * b2;
2962  xmm3 = xmm3 + a1 * b3;
2963  xmm4 = xmm4 + a1 * b4;
2964  xmm5 = xmm5 + a2 * b1;
2965  xmm6 = xmm6 + a2 * b2;
2966  xmm7 = xmm7 + a2 * b3;
2967  xmm8 = xmm8 + a2 * b4;
2968  }
2969  (~C)(i ,j ) += sum( xmm1 ) * scalar;
2970  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
2971  (~C)(i ,j+2UL) += sum( xmm3 ) * scalar;
2972  (~C)(i ,j+3UL) += sum( xmm4 ) * scalar;
2973  (~C)(i+1UL,j ) += sum( xmm5 ) * scalar;
2974  (~C)(i+1UL,j+1UL) += sum( xmm6 ) * scalar;
2975  (~C)(i+1UL,j+2UL) += sum( xmm7 ) * scalar;
2976  (~C)(i+1UL,j+3UL) += sum( xmm8 ) * scalar;
2977  }
2978  for( ; (j+2UL) <= N; j+=2UL ) {
2979  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2980  for( size_t k=0UL; k<K; k+=IT::size ) {
2981  const IntrinsicType a1( A.get(i ,k) );
2982  const IntrinsicType a2( A.get(i+1UL,k) );
2983  const IntrinsicType b1( B.get(k,j ) );
2984  const IntrinsicType b2( B.get(k,j+1UL) );
2985  xmm1 = xmm1 + a1 * b1;
2986  xmm2 = xmm2 + a1 * b2;
2987  xmm3 = xmm3 + a2 * b1;
2988  xmm4 = xmm4 + a2 * b2;
2989  }
2990  (~C)(i ,j ) += sum( xmm1 ) * scalar;
2991  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
2992  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
2993  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
2994  }
2995  if( j < N ) {
2996  IntrinsicType xmm1, xmm2;
2997  for( size_t k=0UL; k<K; k+=IT::size ) {
2998  const IntrinsicType b1( B.get(k,j) );
2999  xmm1 = xmm1 + A.get(i ,k) * b1;
3000  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3001  }
3002  (~C)(i ,j) += sum( xmm1 ) * scalar;
3003  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
3004  }
3005  }
3006  if( i < M ) {
3007  size_t j( 0UL );
3008  for( ; (j+4UL) <= N; j+=4UL ) {
3009  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3010  for( size_t k=0UL; k<K; k+=IT::size ) {
3011  const IntrinsicType a1( A.get(i,k) );
3012  xmm1 = xmm1 + a1 * B.get(k,j );
3013  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3014  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
3015  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
3016  }
3017  (~C)(i,j ) += sum( xmm1 ) * scalar;
3018  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
3019  (~C)(i,j+2UL) += sum( xmm3 ) * scalar;
3020  (~C)(i,j+3UL) += sum( xmm4 ) * scalar;
3021  }
3022  for( ; (j+2UL) <= N; j+=2UL ) {
3023  IntrinsicType xmm1, xmm2;
3024  for( size_t k=0UL; k<K; k+=IT::size ) {
3025  const IntrinsicType a1( A.get(i,k) );
3026  xmm1 = xmm1 + a1 * B.get(k,j );
3027  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3028  }
3029  (~C)(i,j ) += sum( xmm1 ) * scalar;
3030  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
3031  }
3032  if( j < N ) {
3033  IntrinsicType xmm1, xmm2;
3034  for( size_t k=0UL; k<K; k+=IT::size ) {
3035  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3036  }
3037  (~C)(i,j) += sum( xmm1 ) * scalar;
3038  }
3039  }
3040  }
3041  //**********************************************************************************************
3042 
3043  //**Vectorized default addition assignment to column-major dense matrices***********************
3057  template< typename MT3 // Type of the left-hand side target matrix
3058  , typename MT4 // Type of the left-hand side matrix operand
3059  , typename MT5 // Type of the right-hand side matrix operand
3060  , typename ST2 > // Type of the scalar value
3061  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3062  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3063  {
3064  typedef IntrinsicTrait<ElementType> IT;
3065 
3066  const size_t M( A.rows() );
3067  const size_t N( B.columns() );
3068  const size_t K( A.columns() );
3069 
3070  size_t i( 0UL );
3071 
3072  for( ; (i+4UL) <= M; i+=4UL ) {
3073  size_t j( 0UL );
3074  for( ; (j+2UL) <= N; j+=2UL ) {
3075  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3076  for( size_t k=0UL; k<K; k+=IT::size ) {
3077  const IntrinsicType a1( A.get(i ,k) );
3078  const IntrinsicType a2( A.get(i+1UL,k) );
3079  const IntrinsicType a3( A.get(i+2UL,k) );
3080  const IntrinsicType a4( A.get(i+3UL,k) );
3081  const IntrinsicType b1( B.get(k,j ) );
3082  const IntrinsicType b2( B.get(k,j+1UL) );
3083  xmm1 = xmm1 + a1 * b1;
3084  xmm2 = xmm2 + a1 * b2;
3085  xmm3 = xmm3 + a2 * b1;
3086  xmm4 = xmm4 + a2 * b2;
3087  xmm5 = xmm5 + a3 * b1;
3088  xmm6 = xmm6 + a3 * b2;
3089  xmm7 = xmm7 + a4 * b1;
3090  xmm8 = xmm8 + a4 * b2;
3091  }
3092  (~C)(i ,j ) += sum( xmm1 ) * scalar;
3093  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
3094  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
3095  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
3096  (~C)(i+2UL,j ) += sum( xmm5 ) * scalar;
3097  (~C)(i+2UL,j+1UL) += sum( xmm6 ) * scalar;
3098  (~C)(i+3UL,j ) += sum( xmm7 ) * scalar;
3099  (~C)(i+3UL,j+1UL) += sum( xmm8 ) * scalar;
3100  }
3101  if( j < N ) {
3102  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3103  for( size_t k=0UL; k<K; k+=IT::size ) {
3104  const IntrinsicType b1( B.get(k,j) );
3105  xmm1 = xmm1 + A.get(i ,k) * b1;
3106  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3107  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
3108  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
3109  }
3110  (~C)(i ,j) += sum( xmm1 ) * scalar;
3111  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
3112  (~C)(i+2UL,j) += sum( xmm3 ) * scalar;
3113  (~C)(i+3UL,j) += sum( xmm4 ) * scalar;
3114  }
3115  }
3116  for( ; (i+2UL) <= M; i+=2UL ) {
3117  size_t j( 0UL );
3118  for( ; (j+2UL) <= N; j+=2UL ) {
3119  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3120  for( size_t k=0UL; k<K; k+=IT::size ) {
3121  const IntrinsicType a1( A.get(i ,k) );
3122  const IntrinsicType a2( A.get(i+1UL,k) );
3123  const IntrinsicType b1( B.get(k,j ) );
3124  const IntrinsicType b2( B.get(k,j+1UL) );
3125  xmm1 = xmm1 + a1 * b1;
3126  xmm2 = xmm2 + a1 * b2;
3127  xmm3 = xmm3 + a2 * b1;
3128  xmm4 = xmm4 + a2 * b2;
3129  }
3130  (~C)(i ,j ) += sum( xmm1 ) * scalar;
3131  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
3132  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
3133  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
3134  }
3135  if( j < N ) {
3136  IntrinsicType xmm1, xmm2;
3137  for( size_t k=0UL; k<K; k+=IT::size ) {
3138  const IntrinsicType b1( B.get(k,j) );
3139  xmm1 = xmm1 + A.get(i ,k) * b1;
3140  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3141  }
3142  (~C)(i ,j) += sum( xmm1 ) * scalar;
3143  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
3144  }
3145  }
3146  if( i < M ) {
3147  size_t j( 0UL );
3148  for( ; (j+2UL) <= N; j+=2UL ) {
3149  IntrinsicType xmm1, xmm2;
3150  for( size_t k=0UL; k<K; k+=IT::size ) {
3151  const IntrinsicType a1( A.get(i,k) );
3152  xmm1 = xmm1 + a1 * B.get(k,j );
3153  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3154  }
3155  (~C)(i,j ) += sum( xmm1 ) * scalar;
3156  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
3157  }
3158  if( j < N ) {
3159  IntrinsicType xmm1, xmm2;
3160  for( size_t k=0UL; k<K; k+=IT::size ) {
3161  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3162  }
3163  (~C)(i,j) += sum( xmm1 ) * scalar;
3164  }
3165  }
3166  }
3167  //**********************************************************************************************
3168 
3169  //**BLAS-based addition assignment to dense matrices (default)**********************************
3183  template< typename MT3 // Type of the left-hand side target matrix
3184  , typename MT4 // Type of the left-hand side matrix operand
3185  , typename MT5 // Type of the right-hand side matrix operand
3186  , typename ST2 > // Type of the scalar value
3187  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3188  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3189  {
3190  selectDefaultAddAssignKernel( C, A, B, scalar );
3191  }
3192  //**********************************************************************************************
3193 
3194  //**BLAS-based addition assignment to dense matrices (single precision)*************************
3195 #if BLAZE_BLAS_MODE
3196 
3209  template< typename MT3 // Type of the left-hand side target matrix
3210  , typename MT4 // Type of the left-hand side matrix operand
3211  , typename MT5 // Type of the right-hand side matrix operand
3212  , typename ST2 > // Type of the scalar value
3213  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3214  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3215  {
3216  using boost::numeric_cast;
3217 
3218  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
3219  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
3220  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
3221 
3222  const int M ( numeric_cast<int>( A.rows() ) );
3223  const int N ( numeric_cast<int>( B.columns() ) );
3224  const int K ( numeric_cast<int>( A.columns() ) );
3225  const int lda( numeric_cast<int>( A.spacing() ) );
3226  const int ldb( numeric_cast<int>( B.spacing() ) );
3227  const int ldc( numeric_cast<int>( C.spacing() ) );
3228 
3229  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3230  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3231  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3232  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3233  }
3234 #endif
3235  //**********************************************************************************************
3236 
3237  //**BLAS-based addition assignment to dense matrices (double precision)*************************
3238 #if BLAZE_BLAS_MODE
3239 
3252  template< typename MT3 // Type of the left-hand side target matrix
3253  , typename MT4 // Type of the left-hand side matrix operand
3254  , typename MT5 // Type of the right-hand side matrix operand
3255  , typename ST2 > // Type of the scalar value
3256  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3257  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3258  {
3259  using boost::numeric_cast;
3260 
3261  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
3262  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
3263  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
3264 
3265  const int M ( numeric_cast<int>( A.rows() ) );
3266  const int N ( numeric_cast<int>( B.columns() ) );
3267  const int K ( numeric_cast<int>( A.columns() ) );
3268  const int lda( numeric_cast<int>( A.spacing() ) );
3269  const int ldb( numeric_cast<int>( B.spacing() ) );
3270  const int ldc( numeric_cast<int>( C.spacing() ) );
3271 
3272  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3273  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3274  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3275  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3276  }
3277 #endif
3278  //**********************************************************************************************
3279 
3280  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
3281 #if BLAZE_BLAS_MODE
3282 
3295  template< typename MT3 // Type of the left-hand side target matrix
3296  , typename MT4 // Type of the left-hand side matrix operand
3297  , typename MT5 // Type of the right-hand side matrix operand
3298  , typename ST2 > // Type of the scalar value
3299  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3300  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3301  {
3302  using boost::numeric_cast;
3303 
3304  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3305  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3306  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3307  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3308  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3309  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3310 
3311  const int M ( numeric_cast<int>( A.rows() ) );
3312  const int N ( numeric_cast<int>( B.columns() ) );
3313  const int K ( numeric_cast<int>( A.columns() ) );
3314  const int lda( numeric_cast<int>( A.spacing() ) );
3315  const int ldb( numeric_cast<int>( B.spacing() ) );
3316  const int ldc( numeric_cast<int>( C.spacing() ) );
3317  const complex<float> alpha( scalar );
3318  const complex<float> beta ( 1.0F, 0.0F );
3319 
3320  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3321  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3322  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3323  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3324  }
3325 #endif
3326  //**********************************************************************************************
3327 
3328  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
3329 #if BLAZE_BLAS_MODE
3330 
3343  template< typename MT3 // Type of the left-hand side target matrix
3344  , typename MT4 // Type of the left-hand side matrix operand
3345  , typename MT5 // Type of the right-hand side matrix operand
3346  , typename ST2 > // Type of the scalar value
3347  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3348  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3349  {
3350  using boost::numeric_cast;
3351 
3352  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3353  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3354  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3355  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3356  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3357  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3358 
3359  const int M ( numeric_cast<int>( A.rows() ) );
3360  const int N ( numeric_cast<int>( B.columns() ) );
3361  const int K ( numeric_cast<int>( A.columns() ) );
3362  const int lda( numeric_cast<int>( A.spacing() ) );
3363  const int ldb( numeric_cast<int>( B.spacing() ) );
3364  const int ldc( numeric_cast<int>( C.spacing() ) );
3365  const complex<double> alpha( scalar );
3366  const complex<double> beta ( 1.0, 0.0 );
3367 
3368  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3369  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3370  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3371  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3372  }
3373 #endif
3374  //**********************************************************************************************
3375 
3376  //**Addition assignment to sparse matrices******************************************************
3377  // No special implementation for the addition assignment to sparse matrices.
3378  //**********************************************************************************************
3379 
3380  //**Subtraction assignment to dense matrices****************************************************
3392  template< typename MT3 // Type of the target dense matrix
3393  , bool SO > // Storage order of the target dense matrix
3394  friend inline void subAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3395  {
3397 
3398  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3399  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3400 
3401  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3402  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3403 
3404  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3405  return;
3406  }
3407 
3408  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3409  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3410 
3411  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3412  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3413  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3414  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3415  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3416  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3417 
3418  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
3419  DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3420  else
3421  DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3422  }
3423  //**********************************************************************************************
3424 
3425  //**Default subtraction assignment to dense matrices********************************************
3439  template< typename MT3 // Type of the left-hand side target matrix
3440  , typename MT4 // Type of the left-hand side matrix operand
3441  , typename MT5 // Type of the right-hand side matrix operand
3442  , typename ST2 > // Type of the scalar value
3443  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3444  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3445  {
3446  const ResultType tmp( A * B * scalar );
3447  subAssign( C, tmp );
3448  }
3449  //**********************************************************************************************
3450 
3451  //**Vectorized default subtraction assignment to row-major dense matrices***********************
3465  template< typename MT3 // Type of the left-hand side target matrix
3466  , typename MT4 // Type of the left-hand side matrix operand
3467  , typename MT5 // Type of the right-hand side matrix operand
3468  , typename ST2 > // Type of the scalar value
3469  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3470  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3471  {
3472  typedef IntrinsicTrait<ElementType> IT;
3473 
3474  const size_t M( A.rows() );
3475  const size_t N( B.columns() );
3476  const size_t K( A.columns() );
3477 
3478  size_t i( 0UL );
3479 
3480  for( ; (i+2UL) <= M; i+=2UL ) {
3481  size_t j( 0UL );
3482  for( ; (j+4UL) <= N; j+=4UL ) {
3483  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3484  for( size_t k=0UL; k<K; k+=IT::size ) {
3485  const IntrinsicType a1( A.get(i ,k) );
3486  const IntrinsicType a2( A.get(i+1UL,k) );
3487  const IntrinsicType b1( B.get(k,j ) );
3488  const IntrinsicType b2( B.get(k,j+1UL) );
3489  const IntrinsicType b3( B.get(k,j+2UL) );
3490  const IntrinsicType b4( B.get(k,j+3UL) );
3491  xmm1 = xmm1 + a1 * b1;
3492  xmm2 = xmm2 + a1 * b2;
3493  xmm3 = xmm3 + a1 * b3;
3494  xmm4 = xmm4 + a1 * b4;
3495  xmm5 = xmm5 + a2 * b1;
3496  xmm6 = xmm6 + a2 * b2;
3497  xmm7 = xmm7 + a2 * b3;
3498  xmm8 = xmm8 + a2 * b4;
3499  }
3500  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3501  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3502  (~C)(i ,j+2UL) -= sum( xmm3 ) * scalar;
3503  (~C)(i ,j+3UL) -= sum( xmm4 ) * scalar;
3504  (~C)(i+1UL,j ) -= sum( xmm5 ) * scalar;
3505  (~C)(i+1UL,j+1UL) -= sum( xmm6 ) * scalar;
3506  (~C)(i+1UL,j+2UL) -= sum( xmm7 ) * scalar;
3507  (~C)(i+1UL,j+3UL) -= sum( xmm8 ) * scalar;
3508  }
3509  for( ; (j+2UL) <= N; j+=2UL ) {
3510  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3511  for( size_t k=0UL; k<K; k+=IT::size ) {
3512  const IntrinsicType a1( A.get(i ,k) );
3513  const IntrinsicType a2( A.get(i+1UL,k) );
3514  const IntrinsicType b1( B.get(k,j ) );
3515  const IntrinsicType b2( B.get(k,j+1UL) );
3516  xmm1 = xmm1 + a1 * b1;
3517  xmm2 = xmm2 + a1 * b2;
3518  xmm3 = xmm3 + a2 * b1;
3519  xmm4 = xmm4 + a2 * b2;
3520  }
3521  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3522  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3523  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
3524  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
3525  }
3526  if( j < N ) {
3527  IntrinsicType xmm1, xmm2;
3528  for( size_t k=0UL; k<K; k+=IT::size ) {
3529  const IntrinsicType b1( B.get(k,j) );
3530  xmm1 = xmm1 + A.get(i ,k) * b1;
3531  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3532  }
3533  (~C)(i ,j) -= sum( xmm1 ) * scalar;
3534  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
3535  }
3536  }
3537  if( i < M ) {
3538  size_t j( 0UL );
3539  for( ; (j+4UL) <= N; j+=4UL ) {
3540  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3541  for( size_t k=0UL; k<K; k+=IT::size ) {
3542  const IntrinsicType a1( A.get(i,k) );
3543  xmm1 = xmm1 + a1 * B.get(k,j );
3544  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3545  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
3546  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
3547  }
3548  (~C)(i,j ) -= sum( xmm1 ) * scalar;
3549  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
3550  (~C)(i,j+2UL) -= sum( xmm3 ) * scalar;
3551  (~C)(i,j+3UL) -= sum( xmm4 ) * scalar;
3552  }
3553  for( ; (j+2UL) <= N; j+=2UL ) {
3554  IntrinsicType xmm1, xmm2;
3555  for( size_t k=0UL; k<K; k+=IT::size ) {
3556  const IntrinsicType a1( A.get(i,k) );
3557  xmm1 = xmm1 + a1 * B.get(k,j );
3558  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3559  }
3560  (~C)(i,j ) -= sum( xmm1 ) * scalar;
3561  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
3562  }
3563  if( j < N ) {
3564  IntrinsicType xmm1, xmm2;
3565  for( size_t k=0UL; k<K; k+=IT::size ) {
3566  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3567  }
3568  (~C)(i,j) -= sum( xmm1 ) * scalar;
3569  }
3570  }
3571  }
3572  //**********************************************************************************************
3573 
3574  //**Vectorized default subtraction assignment to column-major dense matrices********************
3588  template< typename MT3 // Type of the left-hand side target matrix
3589  , typename MT4 // Type of the left-hand side matrix operand
3590  , typename MT5 // Type of the right-hand side matrix operand
3591  , typename ST2 > // Type of the scalar value
3592  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3593  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3594  {
3595  typedef IntrinsicTrait<ElementType> IT;
3596 
3597  const size_t M( A.rows() );
3598  const size_t N( B.columns() );
3599  const size_t K( A.columns() );
3600 
3601  size_t i( 0UL );
3602 
3603  for( ; (i+4UL) <= M; i+=4UL ) {
3604  size_t j( 0UL );
3605  for( ; (j+2UL) <= N; j+=2UL ) {
3606  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3607  for( size_t k=0UL; k<K; k+=IT::size ) {
3608  const IntrinsicType a1( A.get(i ,k) );
3609  const IntrinsicType a2( A.get(i+1UL,k) );
3610  const IntrinsicType a3( A.get(i+2UL,k) );
3611  const IntrinsicType a4( A.get(i+3UL,k) );
3612  const IntrinsicType b1( B.get(k,j ) );
3613  const IntrinsicType b2( B.get(k,j+1UL) );
3614  xmm1 = xmm1 + a1 * b1;
3615  xmm2 = xmm2 + a1 * b2;
3616  xmm3 = xmm3 + a2 * b1;
3617  xmm4 = xmm4 + a2 * b2;
3618  xmm5 = xmm5 + a3 * b1;
3619  xmm6 = xmm6 + a3 * b2;
3620  xmm7 = xmm7 + a4 * b1;
3621  xmm8 = xmm8 + a4 * b2;
3622  }
3623  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3624  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3625  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
3626  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
3627  (~C)(i+2UL,j ) -= sum( xmm5 ) * scalar;
3628  (~C)(i+2UL,j+1UL) -= sum( xmm6 ) * scalar;
3629  (~C)(i+3UL,j ) -= sum( xmm7 ) * scalar;
3630  (~C)(i+3UL,j+1UL) -= sum( xmm8 ) * scalar;
3631  }
3632  if( j < N ) {
3633  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3634  for( size_t k=0UL; k<K; k+=IT::size ) {
3635  const IntrinsicType b1( B.get(k,j) );
3636  xmm1 = xmm1 + A.get(i ,k) * b1;
3637  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3638  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
3639  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
3640  }
3641  (~C)(i ,j) -= sum( xmm1 ) * scalar;
3642  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
3643  (~C)(i+2UL,j) -= sum( xmm3 ) * scalar;
3644  (~C)(i+3UL,j) -= sum( xmm4 ) * scalar;
3645  }
3646  }
3647  for( ; (i+2UL) <= M; i+=2UL ) {
3648  size_t j( 0UL );
3649  for( ; (j+2UL) <= N; j+=2UL ) {
3650  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3651  for( size_t k=0UL; k<K; k+=IT::size ) {
3652  const IntrinsicType a1( A.get(i ,k) );
3653  const IntrinsicType a2( A.get(i+1UL,k) );
3654  const IntrinsicType b1( B.get(k,j ) );
3655  const IntrinsicType b2( B.get(k,j+1UL) );
3656  xmm1 = xmm1 + a1 * b1;
3657  xmm2 = xmm2 + a1 * b2;
3658  xmm3 = xmm3 + a2 * b1;
3659  xmm4 = xmm4 + a2 * b2;
3660  }
3661  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3662  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3663  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
3664  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
3665  }
3666  if( j < N ) {
3667  IntrinsicType xmm1, xmm2;
3668  for( size_t k=0UL; k<K; k+=IT::size ) {
3669  const IntrinsicType b1( B.get(k,j) );
3670  xmm1 = xmm1 + A.get(i ,k) * b1;
3671  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3672  }
3673  (~C)(i ,j) -= sum( xmm1 ) * scalar;
3674  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
3675  }
3676  }
3677  if( i < M ) {
3678  size_t j( 0UL );
3679  for( ; (j+2UL) <= N; j+=2UL ) {
3680  IntrinsicType xmm1, xmm2;
3681  for( size_t k=0UL; k<K; k+=IT::size ) {
3682  const IntrinsicType a1( A.get(i,k) );
3683  xmm1 = xmm1 + a1 * B.get(k,j );
3684  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3685  }
3686  (~C)(i,j ) -= sum( xmm1 ) * scalar;
3687  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
3688  }
3689  if( j < N ) {
3690  IntrinsicType xmm1, xmm2;
3691  for( size_t k=0UL; k<K; k+=IT::size ) {
3692  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3693  }
3694  (~C)(i,j) -= sum( xmm1 ) * scalar;
3695  }
3696  }
3697  }
3698  //**********************************************************************************************
3699 
3700  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
3714  template< typename MT3 // Type of the left-hand side target matrix
3715  , typename MT4 // Type of the left-hand side matrix operand
3716  , typename MT5 // Type of the right-hand side matrix operand
3717  , typename ST2 > // Type of the scalar value
3718  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3719  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3720  {
3721  selectDefaultSubAssignKernel( C, A, B, scalar );
3722  }
3723  //**********************************************************************************************
3724 
3725  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
3726 #if BLAZE_BLAS_MODE
3727 
3740  template< typename MT3 // Type of the left-hand side target matrix
3741  , typename MT4 // Type of the left-hand side matrix operand
3742  , typename MT5 // Type of the right-hand side matrix operand
3743  , typename ST2 > // Type of the scalar value
3744  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3745  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3746  {
3747  using boost::numeric_cast;
3748 
3749  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
3750  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
3751  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
3752 
3753  const int M ( numeric_cast<int>( A.rows() ) );
3754  const int N ( numeric_cast<int>( B.columns() ) );
3755  const int K ( numeric_cast<int>( A.columns() ) );
3756  const int lda( numeric_cast<int>( A.spacing() ) );
3757  const int ldb( numeric_cast<int>( B.spacing() ) );
3758  const int ldc( numeric_cast<int>( C.spacing() ) );
3759 
3760  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3761  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3762  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3763  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3764  }
3765 #endif
3766  //**********************************************************************************************
3767 
3768  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
3769 #if BLAZE_BLAS_MODE
3770 
3783  template< typename MT3 // Type of the left-hand side target matrix
3784  , typename MT4 // Type of the left-hand side matrix operand
3785  , typename MT5 // Type of the right-hand side matrix operand
3786  , typename ST2 > // Type of the scalar value
3787  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3788  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3789  {
3790  using boost::numeric_cast;
3791 
3792  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
3793  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
3794  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
3795 
3796  const int M ( numeric_cast<int>( A.rows() ) );
3797  const int N ( numeric_cast<int>( B.columns() ) );
3798  const int K ( numeric_cast<int>( A.columns() ) );
3799  const int lda( numeric_cast<int>( A.spacing() ) );
3800  const int ldb( numeric_cast<int>( B.spacing() ) );
3801  const int ldc( numeric_cast<int>( C.spacing() ) );
3802 
3803  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3804  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3805  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3806  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3807  }
3808 #endif
3809  //**********************************************************************************************
3810 
3811  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
3812 #if BLAZE_BLAS_MODE
3813 
3826  template< typename MT3 // Type of the left-hand side target matrix
3827  , typename MT4 // Type of the left-hand side matrix operand
3828  , typename MT5 // Type of the right-hand side matrix operand
3829  , typename ST2 > // Type of the scalar value
3830  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3831  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3832  {
3833  using boost::numeric_cast;
3834 
3835  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3836  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3837  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3838  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3839  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3840  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3841 
3842  const int M ( numeric_cast<int>( A.rows() ) );
3843  const int N ( numeric_cast<int>( B.columns() ) );
3844  const int K ( numeric_cast<int>( A.columns() ) );
3845  const int lda( numeric_cast<int>( A.spacing() ) );
3846  const int ldb( numeric_cast<int>( B.spacing() ) );
3847  const int ldc( numeric_cast<int>( C.spacing() ) );
3848  const complex<float> alpha( -scalar );
3849  const complex<float> beta ( 1.0F, 0.0F );
3850 
3851  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3852  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3853  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3854  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3855  }
3856 #endif
3857  //**********************************************************************************************
3858 
3859  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
3860 #if BLAZE_BLAS_MODE
3861 
3874  template< typename MT3 // Type of the left-hand side target matrix
3875  , typename MT4 // Type of the left-hand side matrix operand
3876  , typename MT5 // Type of the right-hand side matrix operand
3877  , typename ST2 > // Type of the scalar value
3878  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3879  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3880  {
3881  using boost::numeric_cast;
3882 
3883  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3884  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3885  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3886  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3887  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3888  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3889 
3890  const int M ( numeric_cast<int>( A.rows() ) );
3891  const int N ( numeric_cast<int>( B.columns() ) );
3892  const int K ( numeric_cast<int>( A.columns() ) );
3893  const int lda( numeric_cast<int>( A.spacing() ) );
3894  const int ldb( numeric_cast<int>( B.spacing() ) );
3895  const int ldc( numeric_cast<int>( C.spacing() ) );
3896  const complex<double> alpha( -scalar );
3897  const complex<double> beta ( 1.0, 0.0 );
3898 
3899  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3900  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3901  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3902  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3903  }
3904 #endif
3905  //**********************************************************************************************
3906 
3907  //**Subtraction assignment to sparse matrices***************************************************
3908  // No special implementation for the subtraction assignment to sparse matrices.
3909  //**********************************************************************************************
3910 
3911  //**Multiplication assignment to dense matrices*************************************************
3912  // No special implementation for the multiplication assignment to dense matrices.
3913  //**********************************************************************************************
3914 
3915  //**Multiplication assignment to sparse matrices************************************************
3916  // No special implementation for the multiplication assignment to sparse matrices.
3917  //**********************************************************************************************
3918 
3919  //**Compile time checks*************************************************************************
3928  //**********************************************************************************************
3929 };
3931 //*************************************************************************************************
3932 
3933 
3934 
3935 
3936 //=================================================================================================
3937 //
3938 // GLOBAL BINARY ARITHMETIC OPERATORS
3939 //
3940 //=================================================================================================
3941 
3942 //*************************************************************************************************
3971 template< typename T1 // Type of the left-hand side dense matrix
3972  , typename T2 > // Type of the right-hand side dense matrix
3973 inline const DMatTDMatMultExpr<T1,T2>
3975 {
3977 
3978  if( (~lhs).columns() != (~rhs).rows() )
3979  throw std::invalid_argument( "Matrix sizes do not match" );
3980 
3981  return DMatTDMatMultExpr<T1,T2>( ~lhs, ~rhs );
3982 }
3983 //*************************************************************************************************
3984 
3985 
3986 
3987 
3988 //=================================================================================================
3989 //
3990 // EXPRESSION TRAIT SPECIALIZATIONS
3991 //
3992 //=================================================================================================
3993 
3994 //*************************************************************************************************
3996 template< typename MT1, typename MT2, typename VT >
3997 struct DMatDVecMultExprTrait< DMatTDMatMultExpr<MT1,MT2>, VT >
3998 {
3999  public:
4000  //**********************************************************************************************
4001  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4002  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
4003  IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
4004  , typename DMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
4005  , INVALID_TYPE >::Type Type;
4006  //**********************************************************************************************
4007 };
4009 //*************************************************************************************************
4010 
4011 
4012 //*************************************************************************************************
4014 template< typename MT1, typename MT2, typename VT >
4015 struct DMatSVecMultExprTrait< DMatTDMatMultExpr<MT1,MT2>, VT >
4016 {
4017  public:
4018  //**********************************************************************************************
4019  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4020  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
4021  IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
4022  , typename DMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
4023  , INVALID_TYPE >::Type Type;
4024  //**********************************************************************************************
4025 };
4027 //*************************************************************************************************
4028 
4029 
4030 //*************************************************************************************************
4032 template< typename VT, typename MT1, typename MT2 >
4033 struct TDVecDMatMultExprTrait< VT, DMatTDMatMultExpr<MT1,MT2> >
4034 {
4035  public:
4036  //**********************************************************************************************
4037  typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4038  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4039  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4040  , typename TDVecTDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4041  , INVALID_TYPE >::Type Type;
4042  //**********************************************************************************************
4043 };
4045 //*************************************************************************************************
4046 
4047 
4048 //*************************************************************************************************
4050 template< typename VT, typename MT1, typename MT2 >
4051 struct TSVecDMatMultExprTrait< VT, DMatTDMatMultExpr<MT1,MT2> >
4052 {
4053  public:
4054  //**********************************************************************************************
4055  typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4056  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4057  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4058  , typename TDVecTDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4059  , INVALID_TYPE >::Type Type;
4060  //**********************************************************************************************
4061 };
4063 //*************************************************************************************************
4064 
4065 
4066 //*************************************************************************************************
4068 template< typename MT1, typename MT2 >
4069 struct RowExprTrait< DMatTDMatMultExpr<MT1,MT2> >
4070 {
4071  public:
4072  //**********************************************************************************************
4073  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
4074  //**********************************************************************************************
4075 };
4077 //*************************************************************************************************
4078 
4079 
4080 //*************************************************************************************************
4082 template< typename MT1, typename MT2 >
4083 struct ColumnExprTrait< DMatTDMatMultExpr<MT1,MT2> >
4084 {
4085  public:
4086  //**********************************************************************************************
4087  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
4088  //**********************************************************************************************
4089 };
4091 //*************************************************************************************************
4092 
4093 } // namespace blaze
4094 
4095 #endif