All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
20 //=================================================================================================
21 
22 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
24 
25 
26 //*************************************************************************************************
27 // Includes
28 //*************************************************************************************************
29 
30 #include <stdexcept>
31 #include <boost/cast.hpp>
38 #include <blaze/math/Intrinsics.h>
39 #include <blaze/math/shims/Reset.h>
56 #include <blaze/system/BLAS.h>
58 #include <blaze/util/Assert.h>
59 #include <blaze/util/Complex.h>
64 #include <blaze/util/DisableIf.h>
65 #include <blaze/util/EnableIf.h>
66 #include <blaze/util/InvalidType.h>
67 #include <blaze/util/SelectType.h>
68 #include <blaze/util/Types.h>
74 
75 
76 namespace blaze {
77 
78 //=================================================================================================
79 //
80 // CLASS DMATTDMATMULTEXPR
81 //
82 //=================================================================================================
83 
84 //*************************************************************************************************
91 template< typename MT1 // Type of the left-hand side dense matrix
92  , typename MT2 > // Type of the right-hand side dense matrix
93 class DMatTDMatMultExpr : public DenseMatrix< DMatTDMatMultExpr<MT1,MT2>, false >
94  , private Expression
95  , private Computation
96 {
97  private:
98  //**Type definitions****************************************************************************
// Result type of the left-hand side dense matrix expression (MT1::ResultType).
99  typedef typename MT1::ResultType RT1;
// Result type of the right-hand side dense matrix expression (MT2::ResultType).
100  typedef typename MT2::ResultType RT2;
// Composite type of the left-hand side dense matrix expression (MT1::CompositeType).
101  typedef typename MT1::CompositeType CT1;
// Composite type of the right-hand side dense matrix expression (MT2::CompositeType).
102  typedef typename MT2::CompositeType CT2;
103  //**********************************************************************************************
104 
105  //**********************************************************************************************
107 
108 
// Compile-time predicate over the target type T1 and the operand types T2 and T3. Its 'value'
// member is consulted by UseDefaultKernel and it is the EnableIf condition of the
// cblas_sgemm-based selectBlasAssignKernel() overload.
// NOTE(review): the body of this struct (listing lines 112-114) is missing from this extract;
// presumably it tests all three element types for 'float' -- confirm against the full header.
110  template< typename T1, typename T2, typename T3 >
111  struct UseSinglePrecisionKernel {
115  };
117  //**********************************************************************************************
118 
119  //**********************************************************************************************
121 
122 
// Compile-time predicate over the target type T1 and the operand types T2 and T3. Its 'value'
// member is consulted by UseDefaultKernel and it is the EnableIf condition of the
// cblas_dgemm-based selectBlasAssignKernel() overload.
// NOTE(review): the body of this struct (listing lines 126-128) is missing from this extract;
// presumably it tests all three element types for 'double' -- confirm against the full header.
124  template< typename T1, typename T2, typename T3 >
125  struct UseDoublePrecisionKernel {
129  };
131  //**********************************************************************************************
132 
133  //**********************************************************************************************
135 
136 
139  template< typename T1, typename T2, typename T3 >
140  struct UseSinglePrecisionComplexKernel {
141  typedef complex<float> Type;
142  enum { value = IsSame<typename T1::ElementType,Type>::value &&
143  IsSame<typename T2::ElementType,Type>::value &&
144  IsSame<typename T3::ElementType,Type>::value };
145  };
147  //**********************************************************************************************
148 
149  //**********************************************************************************************
151 
152 
155  template< typename T1, typename T2, typename T3 >
156  struct UseDoublePrecisionComplexKernel {
157  typedef complex<double> Type;
158  enum { value = IsSame<typename T1::ElementType,Type>::value &&
159  IsSame<typename T2::ElementType,Type>::value &&
160  IsSame<typename T3::ElementType,Type>::value };
161  };
163  //**********************************************************************************************
164 
165  //**********************************************************************************************
167 
168 
170  template< typename T1, typename T2, typename T3 >
171  struct UseDefaultKernel {
172  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
173  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
174  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
175  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
176  };
178  //**********************************************************************************************
179 
180  //**********************************************************************************************
182 
183 
185  template< typename T1, typename T2, typename T3 >
186  struct UseVectorizedDefaultKernel {
187  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
188  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
189  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
190  IntrinsicTrait<typename T1::ElementType>::addition &&
191  IntrinsicTrait<typename T1::ElementType>::multiplication };
192  };
194  //**********************************************************************************************
195 
196  public:
197  //**Type definitions****************************************************************************
// NOTE(review): 'ResultType' is used below but its declaration (listing line 199) is not part
// of this extract.
// Result type with the opposite storage order, as published by ResultType.
200  typedef typename ResultType::OppositeType OppositeType;
// Transpose type of the result, as published by ResultType.
201  typedef typename ResultType::TransposeType TransposeType;
// Element type of the result, as published by ResultType.
202  typedef typename ResultType::ElementType ElementType;
// Return type of operator(): elements are returned by (const) value.
204  typedef const ElementType ReturnType;
// Type used when this expression is itself an operand of another expression.
205  typedef const ResultType CompositeType;
206 
// Storage type of the left operand: held by value if MT1 is an expression, else by const reference.
208  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
209 
// Storage type of the right operand: held by value if MT2 is an expression, else by const reference.
211  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
212 
// Type the left operand is converted to inside the assign kernels: RT1 if MT1 is a computation, else CT1.
214  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
215 
// Type the right operand is converted to inside the assign kernels: RT2 if MT2 is a computation, else CT2.
217  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
218  //**********************************************************************************************
219 
220  //**Compilation flags***************************************************************************
// The multiplication expression itself never reports as vectorizable.
222  enum { vectorizable = 0 };
223 
// Non-zero if at least one of the two operands is not a computation; isAliased() only
// forwards the alias query to such operands.
225  enum { canAlias = !IsComputation<MT1>::value || !IsComputation<MT2>::value };
226  //**********************************************************************************************
227 
228  //**Constructor*********************************************************************************
234  explicit inline DMatTDMatMultExpr( const MT1& lhs, const MT2& rhs )
235  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
236  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
237  {
238  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
239  }
240  //**********************************************************************************************
241 
242  //**Access operator*****************************************************************************
249  inline ReturnType operator()( size_t i, size_t j ) const {
250  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
251  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
252 
253  ElementType tmp;
254 
255  if( lhs_.columns() != 0UL ) {
256  const size_t end( ( ( lhs_.columns()-1UL ) & size_t(-2) ) + 1UL );
257  tmp = lhs_(i,0UL) * rhs_(0UL,j);
258  for( size_t k=1UL; k<end; k+=2UL ) {
259  tmp += lhs_(i,k ) * rhs_(k ,j);
260  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
261  }
262  if( end < lhs_.columns() ) {
263  tmp += lhs_(i,end) * rhs_(end,j);
264  }
265  }
266  else {
267  reset( tmp );
268  }
269 
270  return tmp;
271  }
272  //**********************************************************************************************
273 
274  //**Rows function*******************************************************************************
279  inline size_t rows() const {
280  return lhs_.rows();
281  }
282  //**********************************************************************************************
283 
284  //**Columns function****************************************************************************
289  inline size_t columns() const {
290  return rhs_.columns();
291  }
292  //**********************************************************************************************
293 
294  //**Left operand access*************************************************************************
299  inline LeftOperand leftOperand() const {
300  return lhs_;
301  }
302  //**********************************************************************************************
303 
304  //**Right operand access************************************************************************
309  inline RightOperand rightOperand() const {
310  return rhs_;
311  }
312  //**********************************************************************************************
313 
314  //**********************************************************************************************
320  template< typename T >
321  inline bool isAliased( const T* alias ) const {
322  return ( !IsComputation<MT1>::value && lhs_.isAliased( alias ) ) ||
323  ( !IsComputation<MT2>::value && rhs_.isAliased( alias ) );
324  }
325  //**********************************************************************************************
326 
327  private:
328  //**Member variables****************************************************************************
331  //**********************************************************************************************
332 
333  //**Assignment to dense matrices****************************************************************
342  template< typename MT // Type of the target dense matrix
343  , bool SO > // Storage order of the target dense matrix
344  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
345  {
346  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
347  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
348 
349  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
350  return;
351  }
352  else if( rhs.lhs_.columns() == 0UL ) {
353  reset( ~lhs );
354  return;
355  }
356 
357  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
358  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
359 
360  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
361  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
362  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
363  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
364  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
365  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
366 
367  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
368  DMatTDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
369  else
370  DMatTDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
371  }
373  //**********************************************************************************************
374 
375  //**Default assignment to dense matrices********************************************************
389  template< typename MT3 // Type of the left-hand side target matrix
390  , typename MT4 // Type of the left-hand side matrix operand
391  , typename MT5 > // Type of the right-hand side matrix operand
392  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
393  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
394  {
395  const size_t M( A.rows() );
396  const size_t N( B.columns() );
397  const size_t K( A.columns() );
398 
399  for( size_t i=0UL; i<M; ++i ) {
400  for( size_t j=0UL; j<N; ++j ) {
401  C(i,j) = A(i,0UL) * B(0UL,j);
402  }
403  for( size_t k=1UL; k<K; ++k ) {
404  for( size_t j=0UL; j<N; ++j ) {
405  C(i,j) += A(i,k) * B(k,j);
406  }
407  }
408  }
409  }
411  //**********************************************************************************************
412 
413  //**Vectorized default assignment to row-major dense matrices***********************************
427  template< typename MT3 // Type of the left-hand side target matrix
428  , typename MT4 // Type of the left-hand side matrix operand
429  , typename MT5 > // Type of the right-hand side matrix operand
430  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
431  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
432  {
433  typedef IntrinsicTrait<ElementType> IT;
434 
435  const size_t M( A.rows() );
436  const size_t N( B.columns() );
437  const size_t K( A.columns() );
438 
439  size_t i( 0UL );
440 
441  for( ; (i+2UL) <= M; i+=2UL ) {
442  size_t j( 0UL );
443  for( ; (j+4UL) <= N; j+=4UL ) {
444  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
445  for( size_t k=0UL; k<K; k+=IT::size ) {
446  const IntrinsicType a1( A.get(i ,k) );
447  const IntrinsicType a2( A.get(i+1UL,k) );
448  const IntrinsicType b1( B.get(k,j ) );
449  const IntrinsicType b2( B.get(k,j+1UL) );
450  const IntrinsicType b3( B.get(k,j+2UL) );
451  const IntrinsicType b4( B.get(k,j+3UL) );
452  xmm1 = xmm1 + a1 * b1;
453  xmm2 = xmm2 + a1 * b2;
454  xmm3 = xmm3 + a1 * b3;
455  xmm4 = xmm4 + a1 * b4;
456  xmm5 = xmm5 + a2 * b1;
457  xmm6 = xmm6 + a2 * b2;
458  xmm7 = xmm7 + a2 * b3;
459  xmm8 = xmm8 + a2 * b4;
460  }
461  (~C)(i ,j ) = sum( xmm1 );
462  (~C)(i ,j+1UL) = sum( xmm2 );
463  (~C)(i ,j+2UL) = sum( xmm3 );
464  (~C)(i ,j+3UL) = sum( xmm4 );
465  (~C)(i+1UL,j ) = sum( xmm5 );
466  (~C)(i+1UL,j+1UL) = sum( xmm6 );
467  (~C)(i+1UL,j+2UL) = sum( xmm7 );
468  (~C)(i+1UL,j+3UL) = sum( xmm8 );
469  }
470  for( ; (j+2UL) <= N; j+=2UL ) {
471  IntrinsicType xmm1, xmm2, xmm3, xmm4;
472  for( size_t k=0UL; k<K; k+=IT::size ) {
473  const IntrinsicType a1( A.get(i ,k) );
474  const IntrinsicType a2( A.get(i+1UL,k) );
475  const IntrinsicType b1( B.get(k,j ) );
476  const IntrinsicType b2( B.get(k,j+1UL) );
477  xmm1 = xmm1 + a1 * b1;
478  xmm2 = xmm2 + a1 * b2;
479  xmm3 = xmm3 + a2 * b1;
480  xmm4 = xmm4 + a2 * b2;
481  }
482  (~C)(i ,j ) = sum( xmm1 );
483  (~C)(i ,j+1UL) = sum( xmm2 );
484  (~C)(i+1UL,j ) = sum( xmm3 );
485  (~C)(i+1UL,j+1UL) = sum( xmm4 );
486  }
487  if( j < N ) {
488  IntrinsicType xmm1, xmm2;
489  for( size_t k=0UL; k<K; k+=IT::size ) {
490  const IntrinsicType b1( B.get(k,j) );
491  xmm1 = xmm1 + A.get(i ,k) * b1;
492  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
493  }
494  (~C)(i ,j) = sum( xmm1 );
495  (~C)(i+1UL,j) = sum( xmm2 );
496  }
497  }
498  if( i < M ) {
499  size_t j( 0UL );
500  for( ; (j+4UL) <= N; j+=4UL ) {
501  IntrinsicType xmm1, xmm2, xmm3, xmm4;
502  for( size_t k=0UL; k<K; k+=IT::size ) {
503  const IntrinsicType a1( A.get(i,k) );
504  xmm1 = xmm1 + a1 * B.get(k,j );
505  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
506  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
507  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
508  }
509  (~C)(i,j ) = sum( xmm1 );
510  (~C)(i,j+1UL) = sum( xmm2 );
511  (~C)(i,j+2UL) = sum( xmm3 );
512  (~C)(i,j+3UL) = sum( xmm4 );
513  }
514  for( ; (j+2UL) <= N; j+=2UL ) {
515  IntrinsicType xmm1, xmm2;
516  for( size_t k=0UL; k<K; k+=IT::size ) {
517  const IntrinsicType a1( A.get(i,k) );
518  xmm1 = xmm1 + a1 * B.get(k,j );
519  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
520  }
521  (~C)(i,j ) = sum( xmm1 );
522  (~C)(i,j+1UL) = sum( xmm2 );
523  }
524  if( j < N ) {
525  IntrinsicType xmm1, xmm2;
526  for( size_t k=0UL; k<K; k+=IT::size ) {
527  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
528  }
529  (~C)(i,j) = sum( xmm1 );
530  }
531  }
532  }
534  //**********************************************************************************************
535 
536  //**Vectorized default assignment to column-major dense matrices********************************
550  template< typename MT3 // Type of the left-hand side target matrix
551  , typename MT4 // Type of the left-hand side matrix operand
552  , typename MT5 > // Type of the right-hand side matrix operand
553  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
554  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
555  {
556  typedef IntrinsicTrait<ElementType> IT;
557 
558  const size_t M( A.rows() );
559  const size_t N( B.columns() );
560  const size_t K( A.columns() );
561 
562  size_t i( 0UL );
563 
564  for( ; (i+4UL) <= M; i+=4UL ) {
565  size_t j( 0UL );
566  for( ; (j+2UL) <= N; j+=2UL ) {
567  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
568  for( size_t k=0UL; k<K; k+=IT::size ) {
569  const IntrinsicType a1( A.get(i ,k) );
570  const IntrinsicType a2( A.get(i+1UL,k) );
571  const IntrinsicType a3( A.get(i+2UL,k) );
572  const IntrinsicType a4( A.get(i+3UL,k) );
573  const IntrinsicType b1( B.get(k,j ) );
574  const IntrinsicType b2( B.get(k,j+1UL) );
575  xmm1 = xmm1 + a1 * b1;
576  xmm2 = xmm2 + a1 * b2;
577  xmm3 = xmm3 + a2 * b1;
578  xmm4 = xmm4 + a2 * b2;
579  xmm5 = xmm5 + a3 * b1;
580  xmm6 = xmm6 + a3 * b2;
581  xmm7 = xmm7 + a4 * b1;
582  xmm8 = xmm8 + a4 * b2;
583  }
584  (~C)(i ,j ) = sum( xmm1 );
585  (~C)(i ,j+1UL) = sum( xmm2 );
586  (~C)(i+1UL,j ) = sum( xmm3 );
587  (~C)(i+1UL,j+1UL) = sum( xmm4 );
588  (~C)(i+2UL,j ) = sum( xmm5 );
589  (~C)(i+2UL,j+1UL) = sum( xmm6 );
590  (~C)(i+3UL,j ) = sum( xmm7 );
591  (~C)(i+3UL,j+1UL) = sum( xmm8 );
592  }
593  if( j < N ) {
594  IntrinsicType xmm1, xmm2, xmm3, xmm4;
595  for( size_t k=0UL; k<K; k+=IT::size ) {
596  const IntrinsicType b1( B.get(k,j) );
597  xmm1 = xmm1 + A.get(i ,k) * b1;
598  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
599  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
600  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
601  }
602  (~C)(i ,j) = sum( xmm1 );
603  (~C)(i+1UL,j) = sum( xmm2 );
604  (~C)(i+2UL,j) = sum( xmm3 );
605  (~C)(i+3UL,j) = sum( xmm4 );
606  }
607  }
608  for( ; (i+2UL) <= M; i+=2UL ) {
609  size_t j( 0UL );
610  for( ; (j+2UL) <= N; j+=2UL ) {
611  IntrinsicType xmm1, xmm2, xmm3, xmm4;
612  for( size_t k=0UL; k<K; k+=IT::size ) {
613  const IntrinsicType a1( A.get(i ,k) );
614  const IntrinsicType a2( A.get(i+1UL,k) );
615  const IntrinsicType b1( B.get(k,j ) );
616  const IntrinsicType b2( B.get(k,j+1UL) );
617  xmm1 = xmm1 + a1 * b1;
618  xmm2 = xmm2 + a1 * b2;
619  xmm3 = xmm3 + a2 * b1;
620  xmm4 = xmm4 + a2 * b2;
621  }
622  (~C)(i ,j ) = sum( xmm1 );
623  (~C)(i ,j+1UL) = sum( xmm2 );
624  (~C)(i+1UL,j ) = sum( xmm3 );
625  (~C)(i+1UL,j+1UL) = sum( xmm4 );
626  }
627  if( j < N ) {
628  IntrinsicType xmm1, xmm2;
629  for( size_t k=0UL; k<K; k+=IT::size ) {
630  const IntrinsicType b1( B.get(k,j) );
631  xmm1 = xmm1 + A.get(i ,k) * b1;
632  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
633  }
634  (~C)(i ,j) = sum( xmm1 );
635  (~C)(i+1UL,j) = sum( xmm2 );
636  }
637  }
638  if( i < M ) {
639  size_t j( 0UL );
640  for( ; (j+2UL) <= N; j+=2UL ) {
641  IntrinsicType xmm1, xmm2;
642  for( size_t k=0UL; k<K; k+=IT::size ) {
643  const IntrinsicType a1( A.get(i,k) );
644  xmm1 = xmm1 + a1 * B.get(k,j );
645  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
646  }
647  (~C)(i,j ) = sum( xmm1 );
648  (~C)(i,j+1UL) = sum( xmm2 );
649  }
650  if( j < N ) {
651  IntrinsicType xmm1, xmm2;
652  for( size_t k=0UL; k<K; k+=IT::size ) {
653  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
654  }
655  (~C)(i,j) = sum( xmm1 );
656  }
657  }
658  }
660  //**********************************************************************************************
661 
662  //**Default assignment to dense matrices********************************************************
676  template< typename MT3 // Type of the left-hand side target matrix
677  , typename MT4 // Type of the left-hand side matrix operand
678  , typename MT5 > // Type of the right-hand side matrix operand
679  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
680  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
681  {
682  selectDefaultAssignKernel( C, A, B );
683  }
685  //**********************************************************************************************
686 
687  //**BLAS-based assignment to dense matrices (single precision)**********************************
688 #if BLAZE_BLAS_MODE
689 
702  template< typename MT3 // Type of the left-hand side target matrix
703  , typename MT4 // Type of the left-hand side matrix operand
704  , typename MT5 > // Type of the right-hand side matrix operand
705  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
706  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
707  {
708  using boost::numeric_cast;
709 
710  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
711  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
712  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
713 
714  const int M ( numeric_cast<int>( A.rows() ) );
715  const int N ( numeric_cast<int>( B.columns() ) );
716  const int K ( numeric_cast<int>( A.columns() ) );
717  const int lda( numeric_cast<int>( A.spacing() ) );
718  const int ldb( numeric_cast<int>( B.spacing() ) );
719  const int ldc( numeric_cast<int>( C.spacing() ) );
720 
721  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
722  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
723  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
724  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
725  }
727 #endif
728  //**********************************************************************************************
729 
730  //**BLAS-based assignment to dense matrices (double precision)**********************************
731 #if BLAZE_BLAS_MODE
732 
745  template< typename MT3 // Type of the left-hand side target matrix
746  , typename MT4 // Type of the left-hand side matrix operand
747  , typename MT5 > // Type of the right-hand side matrix operand
748  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
749  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
750  {
751  using boost::numeric_cast;
752 
753  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
754  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
755  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
756 
757  const int M ( numeric_cast<int>( A.rows() ) );
758  const int N ( numeric_cast<int>( B.columns() ) );
759  const int K ( numeric_cast<int>( A.columns() ) );
760  const int lda( numeric_cast<int>( A.spacing() ) );
761  const int ldb( numeric_cast<int>( B.spacing() ) );
762  const int ldc( numeric_cast<int>( C.spacing() ) );
763 
764  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
765  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
766  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
767  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
768  }
770 #endif
771  //**********************************************************************************************
772 
773  //**BLAS-based assignment to dense matrices (single precision complex)**************************
774 #if BLAZE_BLAS_MODE
775 
788  template< typename MT3 // Type of the left-hand side target matrix
789  , typename MT4 // Type of the left-hand side matrix operand
790  , typename MT5 > // Type of the right-hand side matrix operand
791  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
792  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
793  {
794  using boost::numeric_cast;
795 
796  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
797  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
798  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
799  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
800  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
801  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
802 
803  const int M ( numeric_cast<int>( A.rows() ) );
804  const int N ( numeric_cast<int>( B.columns() ) );
805  const int K ( numeric_cast<int>( A.columns() ) );
806  const int lda( numeric_cast<int>( A.spacing() ) );
807  const int ldb( numeric_cast<int>( B.spacing() ) );
808  const int ldc( numeric_cast<int>( C.spacing() ) );
809  const complex<float> alpha( 1.0F, 0.0F );
810  const complex<float> beta ( 0.0F, 0.0F );
811 
812  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
813  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
814  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
815  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
816  }
818 #endif
819  //**********************************************************************************************
820 
821  //**BLAS-based assignment to dense matrices (double precision complex)**************************
822 #if BLAZE_BLAS_MODE
823 
836  template< typename MT3 // Type of the left-hand side target matrix
837  , typename MT4 // Type of the left-hand side matrix operand
838  , typename MT5 > // Type of the right-hand side matrix operand
839  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
840  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
841  {
842  using boost::numeric_cast;
843 
844  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
845  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
846  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
847  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
848  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
849  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
850 
851  const int M ( numeric_cast<int>( A.rows() ) );
852  const int N ( numeric_cast<int>( B.columns() ) );
853  const int K ( numeric_cast<int>( A.columns() ) );
854  const int lda( numeric_cast<int>( A.spacing() ) );
855  const int ldb( numeric_cast<int>( B.spacing() ) );
856  const int ldc( numeric_cast<int>( C.spacing() ) );
857  const complex<double> alpha( 1.0, 0.0 );
858  const complex<double> beta ( 0.0, 0.0 );
859 
860  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
861  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
862  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
863  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
864  }
866 #endif
867  //**********************************************************************************************
868 
869  //**Assignment to sparse matrices***************************************************************
881  template< typename MT // Type of the target sparse matrix
882  , bool SO > // Storage order of the target sparse matrix
883  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
884  {
885  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
886 
892  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
893 
894  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
895  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
896 
897  const TmpType tmp( rhs );
898  assign( ~lhs, tmp );
899  }
901  //**********************************************************************************************
902 
903  //**Addition assignment to dense matrices*******************************************************
916  template< typename MT // Type of the target dense matrix
917  , bool SO > // Storage order of the target dense matrix
918  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
919  {
920  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
921  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
922 
923  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
924  return;
925  }
926 
927  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
928  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
929 
930  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
931  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
932  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
933  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
934  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
935  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
936 
937  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
938  DMatTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
939  else
940  DMatTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
941  }
943  //**********************************************************************************************
944 
945  //**Default addition assignment to dense matrices***********************************************
959  template< typename MT3 // Type of the left-hand side target matrix
960  , typename MT4 // Type of the left-hand side matrix operand
961  , typename MT5 > // Type of the right-hand side matrix operand
962  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
963  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
964  {
965  const size_t M( A.rows() );
966  const size_t N( B.columns() );
967  const size_t K( A.columns() );
968 
969  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
970  const size_t end( N & size_t(-2) );
971 
972  for( size_t i=0UL; i<M; ++i ) {
973  for( size_t k=0UL; k<K; ++k ) {
974  for( size_t j=0UL; j<end; j+=2UL ) {
975  C(i,j ) += A(i,k) * B(k,j );
976  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
977  }
978  if( end < N ) {
979  C(i,end) += A(i,k) * B(k,end);
980  }
981  }
982  }
983  }
985  //**********************************************************************************************
986 
987  //**Vectorized default addition assignment to row-major dense matrices**************************
1001  template< typename MT3 // Type of the left-hand side target matrix
1002  , typename MT4 // Type of the left-hand side matrix operand
1003  , typename MT5 > // Type of the right-hand side matrix operand
1004  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1005  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1006  {
1007  typedef IntrinsicTrait<ElementType> IT;
1008 
1009  const size_t M( A.rows() );
1010  const size_t N( B.columns() );
1011  const size_t K( A.columns() );
1012 
1013  size_t i( 0UL );
1014 
1015  for( ; (i+2UL) <= M; i+=2UL ) {
1016  size_t j( 0UL );
1017  for( ; (j+4UL) <= N; j+=4UL ) {
1018  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1019  for( size_t k=0UL; k<K; k+=IT::size ) {
1020  const IntrinsicType a1( A.get(i ,k) );
1021  const IntrinsicType a2( A.get(i+1UL,k) );
1022  const IntrinsicType b1( B.get(k,j ) );
1023  const IntrinsicType b2( B.get(k,j+1UL) );
1024  const IntrinsicType b3( B.get(k,j+2UL) );
1025  const IntrinsicType b4( B.get(k,j+3UL) );
1026  xmm1 = xmm1 + a1 * b1;
1027  xmm2 = xmm2 + a1 * b2;
1028  xmm3 = xmm3 + a1 * b3;
1029  xmm4 = xmm4 + a1 * b4;
1030  xmm5 = xmm5 + a2 * b1;
1031  xmm6 = xmm6 + a2 * b2;
1032  xmm7 = xmm7 + a2 * b3;
1033  xmm8 = xmm8 + a2 * b4;
1034  }
1035  (~C)(i ,j ) += sum( xmm1 );
1036  (~C)(i ,j+1UL) += sum( xmm2 );
1037  (~C)(i ,j+2UL) += sum( xmm3 );
1038  (~C)(i ,j+3UL) += sum( xmm4 );
1039  (~C)(i+1UL,j ) += sum( xmm5 );
1040  (~C)(i+1UL,j+1UL) += sum( xmm6 );
1041  (~C)(i+1UL,j+2UL) += sum( xmm7 );
1042  (~C)(i+1UL,j+3UL) += sum( xmm8 );
1043  }
1044  for( ; (j+2UL) <= N; j+=2UL ) {
1045  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1046  for( size_t k=0UL; k<K; k+=IT::size ) {
1047  const IntrinsicType a1( A.get(i ,k) );
1048  const IntrinsicType a2( A.get(i+1UL,k) );
1049  const IntrinsicType b1( B.get(k,j ) );
1050  const IntrinsicType b2( B.get(k,j+1UL) );
1051  xmm1 = xmm1 + a1 * b1;
1052  xmm2 = xmm2 + a1 * b2;
1053  xmm3 = xmm3 + a2 * b1;
1054  xmm4 = xmm4 + a2 * b2;
1055  }
1056  (~C)(i ,j ) += sum( xmm1 );
1057  (~C)(i ,j+1UL) += sum( xmm2 );
1058  (~C)(i+1UL,j ) += sum( xmm3 );
1059  (~C)(i+1UL,j+1UL) += sum( xmm4 );
1060  }
1061  if( j < N ) {
1062  IntrinsicType xmm1, xmm2;
1063  for( size_t k=0UL; k<K; k+=IT::size ) {
1064  const IntrinsicType b1( B.get(k,j) );
1065  xmm1 = xmm1 + A.get(i ,k) * b1;
1066  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1067  }
1068  (~C)(i ,j) += sum( xmm1 );
1069  (~C)(i+1UL,j) += sum( xmm2 );
1070  }
1071  }
1072  if( i < M ) {
1073  size_t j( 0UL );
1074  for( ; (j+4UL) <= N; j+=4UL ) {
1075  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1076  for( size_t k=0UL; k<K; k+=IT::size ) {
1077  const IntrinsicType a1( A.get(i,k) );
1078  xmm1 = xmm1 + a1 * B.get(k,j );
1079  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1080  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
1081  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
1082  }
1083  (~C)(i,j ) += sum( xmm1 );
1084  (~C)(i,j+1UL) += sum( xmm2 );
1085  (~C)(i,j+2UL) += sum( xmm3 );
1086  (~C)(i,j+3UL) += sum( xmm4 );
1087  }
1088  for( ; (j+2UL) <= N; j+=2UL ) {
1089  IntrinsicType xmm1, xmm2;
1090  for( size_t k=0UL; k<K; k+=IT::size ) {
1091  const IntrinsicType a1( A.get(i,k) );
1092  xmm1 = xmm1 + a1 * B.get(k,j );
1093  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1094  }
1095  (~C)(i,j ) += sum( xmm1 );
1096  (~C)(i,j+1UL) += sum( xmm2 );
1097  }
1098  if( j < N ) {
1099  IntrinsicType xmm1, xmm2;
1100  for( size_t k=0UL; k<K; k+=IT::size ) {
1101  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1102  }
1103  (~C)(i,j) += sum( xmm1 );
1104  }
1105  }
1106  }
1108  //**********************************************************************************************
1109 
1110  //**Vectorized default addition assignment to column-major dense matrices***********************
1124  template< typename MT3 // Type of the left-hand side target matrix
1125  , typename MT4 // Type of the left-hand side matrix operand
1126  , typename MT5 > // Type of the right-hand side matrix operand
1127  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1128  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1129  {
1130  typedef IntrinsicTrait<ElementType> IT;
1131 
1132  const size_t M( A.rows() );
1133  const size_t N( B.columns() );
1134  const size_t K( A.columns() );
1135 
1136  size_t i( 0UL );
1137 
1138  for( ; (i+4UL) <= M; i+=4UL ) {
1139  size_t j( 0UL );
1140  for( ; (j+2UL) <= N; j+=2UL ) {
1141  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1142  for( size_t k=0UL; k<K; k+=IT::size ) {
1143  const IntrinsicType a1( A.get(i ,k) );
1144  const IntrinsicType a2( A.get(i+1UL,k) );
1145  const IntrinsicType a3( A.get(i+2UL,k) );
1146  const IntrinsicType a4( A.get(i+3UL,k) );
1147  const IntrinsicType b1( B.get(k,j ) );
1148  const IntrinsicType b2( B.get(k,j+1UL) );
1149  xmm1 = xmm1 + a1 * b1;
1150  xmm2 = xmm2 + a1 * b2;
1151  xmm3 = xmm3 + a2 * b1;
1152  xmm4 = xmm4 + a2 * b2;
1153  xmm5 = xmm5 + a3 * b1;
1154  xmm6 = xmm6 + a3 * b2;
1155  xmm7 = xmm7 + a4 * b1;
1156  xmm8 = xmm8 + a4 * b2;
1157  }
1158  (~C)(i ,j ) += sum( xmm1 );
1159  (~C)(i ,j+1UL) += sum( xmm2 );
1160  (~C)(i+1UL,j ) += sum( xmm3 );
1161  (~C)(i+1UL,j+1UL) += sum( xmm4 );
1162  (~C)(i+2UL,j ) += sum( xmm5 );
1163  (~C)(i+2UL,j+1UL) += sum( xmm6 );
1164  (~C)(i+3UL,j ) += sum( xmm7 );
1165  (~C)(i+3UL,j+1UL) += sum( xmm8 );
1166  }
1167  if( j < N ) {
1168  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1169  for( size_t k=0UL; k<K; k+=IT::size ) {
1170  const IntrinsicType b1( B.get(k,j) );
1171  xmm1 = xmm1 + A.get(i ,k) * b1;
1172  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1173  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
1174  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
1175  }
1176  (~C)(i ,j) += sum( xmm1 );
1177  (~C)(i+1UL,j) += sum( xmm2 );
1178  (~C)(i+2UL,j) += sum( xmm3 );
1179  (~C)(i+3UL,j) += sum( xmm4 );
1180  }
1181  }
1182  for( ; (i+2UL) <= M; i+=2UL ) {
1183  size_t j( 0UL );
1184  for( ; (j+2UL) <= N; j+=2UL ) {
1185  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1186  for( size_t k=0UL; k<K; k+=IT::size ) {
1187  const IntrinsicType a1( A.get(i ,k) );
1188  const IntrinsicType a2( A.get(i+1UL,k) );
1189  const IntrinsicType b1( B.get(k,j ) );
1190  const IntrinsicType b2( B.get(k,j+1UL) );
1191  xmm1 = xmm1 + a1 * b1;
1192  xmm2 = xmm2 + a1 * b2;
1193  xmm3 = xmm3 + a2 * b1;
1194  xmm4 = xmm4 + a2 * b2;
1195  }
1196  (~C)(i ,j ) += sum( xmm1 );
1197  (~C)(i ,j+1UL) += sum( xmm2 );
1198  (~C)(i+1UL,j ) += sum( xmm3 );
1199  (~C)(i+1UL,j+1UL) += sum( xmm4 );
1200  }
1201  if( j < N ) {
1202  IntrinsicType xmm1, xmm2;
1203  for( size_t k=0UL; k<K; k+=IT::size ) {
1204  const IntrinsicType b1( B.get(k,j) );
1205  xmm1 = xmm1 + A.get(i ,k) * b1;
1206  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1207  }
1208  (~C)(i ,j) += sum( xmm1 );
1209  (~C)(i+1UL,j) += sum( xmm2 );
1210  }
1211  }
1212  if( i < M ) {
1213  size_t j( 0UL );
1214  for( ; (j+2UL) <= N; j+=2UL ) {
1215  IntrinsicType xmm1, xmm2;
1216  for( size_t k=0UL; k<K; k+=IT::size ) {
1217  const IntrinsicType a1( A.get(i,k) );
1218  xmm1 = xmm1 + a1 * B.get(k,j );
1219  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1220  }
1221  (~C)(i,j ) += sum( xmm1 );
1222  (~C)(i,j+1UL) += sum( xmm2 );
1223  }
1224  if( j < N ) {
1225  IntrinsicType xmm1, xmm2;
1226  for( size_t k=0UL; k<K; k+=IT::size ) {
1227  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1228  }
1229  (~C)(i,j) += sum( xmm1 );
1230  }
1231  }
1232  }
1234  //**********************************************************************************************
1235 
1236  //**Default addition assignment to dense matrices***********************************************
1250  template< typename MT3 // Type of the left-hand side target matrix
1251  , typename MT4 // Type of the left-hand side matrix operand
1252  , typename MT5 > // Type of the right-hand side matrix operand
1253  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1254  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1255  {
1256  selectDefaultAddAssignKernel( C, A, B );
1257  }
1259  //**********************************************************************************************
1260 
1261  //**BLAS-based addition assignment to dense matrices (single precision)*************************
1262 #if BLAZE_BLAS_MODE
1263 
1276  template< typename MT3 // Type of the left-hand side target matrix
1277  , typename MT4 // Type of the left-hand side matrix operand
1278  , typename MT5 > // Type of the right-hand side matrix operand
1279  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1280  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1281  {
1282  using boost::numeric_cast;
1283 
1284  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
1285  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
1286  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
1287 
1288  const int M ( numeric_cast<int>( A.rows() ) );
1289  const int N ( numeric_cast<int>( B.columns() ) );
1290  const int K ( numeric_cast<int>( A.columns() ) );
1291  const int lda( numeric_cast<int>( A.spacing() ) );
1292  const int ldb( numeric_cast<int>( B.spacing() ) );
1293  const int ldc( numeric_cast<int>( C.spacing() ) );
1294 
1295  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1296  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1297  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1298  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1299  }
1301 #endif
1302  //**********************************************************************************************
1303 
1304  //**BLAS-based addition assignment to dense matrices (double precision)*************************
1305 #if BLAZE_BLAS_MODE
1306 
1319  template< typename MT3 // Type of the left-hand side target matrix
1320  , typename MT4 // Type of the left-hand side matrix operand
1321  , typename MT5 > // Type of the right-hand side matrix operand
1322  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1323  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1324  {
1325  using boost::numeric_cast;
1326 
1327  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
1328  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
1329  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
1330 
1331  const int M ( numeric_cast<int>( A.rows() ) );
1332  const int N ( numeric_cast<int>( B.columns() ) );
1333  const int K ( numeric_cast<int>( A.columns() ) );
1334  const int lda( numeric_cast<int>( A.spacing() ) );
1335  const int ldb( numeric_cast<int>( B.spacing() ) );
1336  const int ldc( numeric_cast<int>( C.spacing() ) );
1337 
1338  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1339  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1340  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1341  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1342  }
1344 #endif
1345  //**********************************************************************************************
1346 
1347  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
1348 #if BLAZE_BLAS_MODE
1349 
1362  template< typename MT3 // Type of the left-hand side target matrix
1363  , typename MT4 // Type of the left-hand side matrix operand
1364  , typename MT5 > // Type of the right-hand side matrix operand
1365  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1366  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1367  {
1368  using boost::numeric_cast;
1369 
1370  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1371  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1372  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1373  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1374  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1375  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1376 
1377  const int M ( numeric_cast<int>( A.rows() ) );
1378  const int N ( numeric_cast<int>( B.columns() ) );
1379  const int K ( numeric_cast<int>( A.columns() ) );
1380  const int lda( numeric_cast<int>( A.spacing() ) );
1381  const int ldb( numeric_cast<int>( B.spacing() ) );
1382  const int ldc( numeric_cast<int>( C.spacing() ) );
1383  const complex<float> alpha( 1.0F, 0.0F );
1384  const complex<float> beta ( 1.0F, 0.0F );
1385 
1386  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1387  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1388  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1389  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1390  }
1392 #endif
1393  //**********************************************************************************************
1394 
1395  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
1396 #if BLAZE_BLAS_MODE
1397 
1410  template< typename MT3 // Type of the left-hand side target matrix
1411  , typename MT4 // Type of the left-hand side matrix operand
1412  , typename MT5 > // Type of the right-hand side matrix operand
1413  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1414  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1415  {
1416  using boost::numeric_cast;
1417 
1418  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1419  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1420  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1421  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1422  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1423  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1424 
1425  const int M ( numeric_cast<int>( A.rows() ) );
1426  const int N ( numeric_cast<int>( B.columns() ) );
1427  const int K ( numeric_cast<int>( A.columns() ) );
1428  const int lda( numeric_cast<int>( A.spacing() ) );
1429  const int ldb( numeric_cast<int>( B.spacing() ) );
1430  const int ldc( numeric_cast<int>( C.spacing() ) );
1431  const complex<double> alpha( 1.0, 0.0 );
1432  const complex<double> beta ( 1.0, 0.0 );
1433 
1434  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1435  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1436  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1437  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1438  }
1440 #endif
1441  //**********************************************************************************************
1442 
1443  //**Addition assignment to sparse matrices******************************************************
1444  // No special implementation for the addition assignment to sparse matrices.
1445  //**********************************************************************************************
1446 
1447  //**Subtraction assignment to dense matrices****************************************************
1460  template< typename MT // Type of the target dense matrix
1461  , bool SO > // Storage order of the target dense matrix
1462  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
1463  {
1464  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1465  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1466 
1467  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1468  return;
1469  }
1470 
1471  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1472  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1473 
1474  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1475  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1476  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1477  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1478  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1479  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1480 
1481  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
1482  DMatTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1483  else
1484  DMatTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
1485  }
1487  //**********************************************************************************************
1488 
1489  //**Default subtraction assignment to dense matrices********************************************
1503  template< typename MT3 // Type of the left-hand side target matrix
1504  , typename MT4 // Type of the left-hand side matrix operand
1505  , typename MT5 > // Type of the right-hand side matrix operand
1506  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1507  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1508  {
1509  const size_t M( A.rows() );
1510  const size_t N( B.columns() );
1511  const size_t K( A.columns() );
1512 
1513  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1514  const size_t end( N & size_t(-2) );
1515 
1516  for( size_t i=0UL; i<M; ++i ) {
1517  for( size_t k=0UL; k<K; ++k ) {
1518  for( size_t j=0UL; j<end; j+=2UL ) {
1519  C(i,j ) -= A(i,k) * B(k,j );
1520  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
1521  }
1522  if( end < N ) {
1523  C(i,end) -= A(i,k) * B(k,end);
1524  }
1525  }
1526  }
1527  }
1529  //**********************************************************************************************
1530 
1531 //**Vectorized default subtraction assignment to row-major dense matrices************************
1545  template< typename MT3 // Type of the left-hand side target matrix
1546  , typename MT4 // Type of the left-hand side matrix operand
1547  , typename MT5 > // Type of the right-hand side matrix operand
1548  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1549  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1550  {
1551  typedef IntrinsicTrait<ElementType> IT;
1552 
1553  const size_t M( A.rows() );
1554  const size_t N( B.columns() );
1555  const size_t K( A.columns() );
1556 
1557  size_t i( 0UL );
1558 
1559  for( ; (i+2UL) <= M; i+=2UL ) {
1560  size_t j( 0UL );
1561  for( ; (j+4UL) <= N; j+=4UL ) {
1562  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1563  for( size_t k=0UL; k<K; k+=IT::size ) {
1564  const IntrinsicType a1( A.get(i ,k) );
1565  const IntrinsicType a2( A.get(i+1UL,k) );
1566  const IntrinsicType b1( B.get(k,j ) );
1567  const IntrinsicType b2( B.get(k,j+1UL) );
1568  const IntrinsicType b3( B.get(k,j+2UL) );
1569  const IntrinsicType b4( B.get(k,j+3UL) );
1570  xmm1 = xmm1 + a1 * b1;
1571  xmm2 = xmm2 + a1 * b2;
1572  xmm3 = xmm3 + a1 * b3;
1573  xmm4 = xmm4 + a1 * b4;
1574  xmm5 = xmm5 + a2 * b1;
1575  xmm6 = xmm6 + a2 * b2;
1576  xmm7 = xmm7 + a2 * b3;
1577  xmm8 = xmm8 + a2 * b4;
1578  }
1579  (~C)(i ,j ) -= sum( xmm1 );
1580  (~C)(i ,j+1UL) -= sum( xmm2 );
1581  (~C)(i ,j+2UL) -= sum( xmm3 );
1582  (~C)(i ,j+3UL) -= sum( xmm4 );
1583  (~C)(i+1UL,j ) -= sum( xmm5 );
1584  (~C)(i+1UL,j+1UL) -= sum( xmm6 );
1585  (~C)(i+1UL,j+2UL) -= sum( xmm7 );
1586  (~C)(i+1UL,j+3UL) -= sum( xmm8 );
1587  }
1588  for( ; (j+2UL) <= N; j+=2UL ) {
1589  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1590  for( size_t k=0UL; k<K; k+=IT::size ) {
1591  const IntrinsicType a1( A.get(i ,k) );
1592  const IntrinsicType a2( A.get(i+1UL,k) );
1593  const IntrinsicType b1( B.get(k,j ) );
1594  const IntrinsicType b2( B.get(k,j+1UL) );
1595  xmm1 = xmm1 + a1 * b1;
1596  xmm2 = xmm2 + a1 * b2;
1597  xmm3 = xmm3 + a2 * b1;
1598  xmm4 = xmm4 + a2 * b2;
1599  }
1600  (~C)(i ,j ) -= sum( xmm1 );
1601  (~C)(i ,j+1UL) -= sum( xmm2 );
1602  (~C)(i+1UL,j ) -= sum( xmm3 );
1603  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
1604  }
1605  if( j < N ) {
1606  IntrinsicType xmm1, xmm2;
1607  for( size_t k=0UL; k<K; k+=IT::size ) {
1608  const IntrinsicType b1( B.get(k,j) );
1609  xmm1 = xmm1 + A.get(i ,k) * b1;
1610  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1611  }
1612  (~C)(i ,j) -= sum( xmm1 );
1613  (~C)(i+1UL,j) -= sum( xmm2 );
1614  }
1615  }
1616  if( i < M ) {
1617  size_t j( 0UL );
1618  for( ; (j+4UL) <= N; j+=4UL ) {
1619  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1620  for( size_t k=0UL; k<K; k+=IT::size ) {
1621  const IntrinsicType a1( A.get(i,k) );
1622  xmm1 = xmm1 + a1 * B.get(k,j );
1623  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1624  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
1625  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
1626  }
1627  (~C)(i,j ) -= sum( xmm1 );
1628  (~C)(i,j+1UL) -= sum( xmm2 );
1629  (~C)(i,j+2UL) -= sum( xmm3 );
1630  (~C)(i,j+3UL) -= sum( xmm4 );
1631  }
1632  for( ; (j+2UL) <= N; j+=2UL ) {
1633  IntrinsicType xmm1, xmm2;
1634  for( size_t k=0UL; k<K; k+=IT::size ) {
1635  const IntrinsicType a1( A.get(i,k) );
1636  xmm1 = xmm1 + a1 * B.get(k,j );
1637  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1638  }
1639  (~C)(i,j ) -= sum( xmm1 );
1640  (~C)(i,j+1UL) -= sum( xmm2 );
1641  }
1642  if( j < N ) {
1643  IntrinsicType xmm1, xmm2;
1644  for( size_t k=0UL; k<K; k+=IT::size ) {
1645  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1646  }
1647  (~C)(i,j) -= sum( xmm1 );
1648  }
1649  }
1650  }
1652  //**********************************************************************************************
1653 
1654 //**Vectorized default subtraction assignment to column-major dense matrices*********************
1668  template< typename MT3 // Type of the left-hand side target matrix
1669  , typename MT4 // Type of the left-hand side matrix operand
1670  , typename MT5 > // Type of the right-hand side matrix operand
1671  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1672  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1673  {
1674  typedef IntrinsicTrait<ElementType> IT;
1675 
1676  const size_t M( A.rows() );
1677  const size_t N( B.columns() );
1678  const size_t K( A.columns() );
1679 
1680  size_t i( 0UL );
1681 
1682  for( ; (i+4UL) <= M; i+=4UL ) {
1683  size_t j( 0UL );
1684  for( ; (j+2UL) <= N; j+=2UL ) {
1685  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1686  for( size_t k=0UL; k<K; k+=IT::size ) {
1687  const IntrinsicType a1( A.get(i ,k) );
1688  const IntrinsicType a2( A.get(i+1UL,k) );
1689  const IntrinsicType a3( A.get(i+2UL,k) );
1690  const IntrinsicType a4( A.get(i+3UL,k) );
1691  const IntrinsicType b1( B.get(k,j ) );
1692  const IntrinsicType b2( B.get(k,j+1UL) );
1693  xmm1 = xmm1 + a1 * b1;
1694  xmm2 = xmm2 + a1 * b2;
1695  xmm3 = xmm3 + a2 * b1;
1696  xmm4 = xmm4 + a2 * b2;
1697  xmm5 = xmm5 + a3 * b1;
1698  xmm6 = xmm6 + a3 * b2;
1699  xmm7 = xmm7 + a4 * b1;
1700  xmm8 = xmm8 + a4 * b2;
1701  }
1702  (~C)(i ,j ) -= sum( xmm1 );
1703  (~C)(i ,j+1UL) -= sum( xmm2 );
1704  (~C)(i+1UL,j ) -= sum( xmm3 );
1705  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
1706  (~C)(i+2UL,j ) -= sum( xmm5 );
1707  (~C)(i+2UL,j+1UL) -= sum( xmm6 );
1708  (~C)(i+3UL,j ) -= sum( xmm7 );
1709  (~C)(i+3UL,j+1UL) -= sum( xmm8 );
1710  }
1711  if( j < N ) {
1712  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1713  for( size_t k=0UL; k<K; k+=IT::size ) {
1714  const IntrinsicType b1( B.get(k,j) );
1715  xmm1 = xmm1 + A.get(i ,k) * b1;
1716  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1717  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
1718  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
1719  }
1720  (~C)(i ,j) -= sum( xmm1 );
1721  (~C)(i+1UL,j) -= sum( xmm2 );
1722  (~C)(i+2UL,j) -= sum( xmm3 );
1723  (~C)(i+3UL,j) -= sum( xmm4 );
1724  }
1725  }
1726  for( ; (i+2UL) <= M; i+=2UL ) {
1727  size_t j( 0UL );
1728  for( ; (j+2UL) <= N; j+=2UL ) {
1729  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1730  for( size_t k=0UL; k<K; k+=IT::size ) {
1731  const IntrinsicType a1( A.get(i ,k) );
1732  const IntrinsicType a2( A.get(i+1UL,k) );
1733  const IntrinsicType b1( B.get(k,j ) );
1734  const IntrinsicType b2( B.get(k,j+1UL) );
1735  xmm1 = xmm1 + a1 * b1;
1736  xmm2 = xmm2 + a1 * b2;
1737  xmm3 = xmm3 + a2 * b1;
1738  xmm4 = xmm4 + a2 * b2;
1739  }
1740  (~C)(i ,j ) -= sum( xmm1 );
1741  (~C)(i ,j+1UL) -= sum( xmm2 );
1742  (~C)(i+1UL,j ) -= sum( xmm3 );
1743  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
1744  }
1745  if( j < N ) {
1746  IntrinsicType xmm1, xmm2;
1747  for( size_t k=0UL; k<K; k+=IT::size ) {
1748  const IntrinsicType b1( B.get(k,j) );
1749  xmm1 = xmm1 + A.get(i ,k) * b1;
1750  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1751  }
1752  (~C)(i ,j) -= sum( xmm1 );
1753  (~C)(i+1UL,j) -= sum( xmm2 );
1754  }
1755  }
1756  if( i < M ) {
1757  size_t j( 0UL );
1758  for( ; (j+2UL) <= N; j+=2UL ) {
1759  IntrinsicType xmm1, xmm2;
1760  for( size_t k=0UL; k<K; k+=IT::size ) {
1761  const IntrinsicType a1( A.get(i,k) );
1762  xmm1 = xmm1 + a1 * B.get(k,j );
1763  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1764  }
1765  (~C)(i,j ) -= sum( xmm1 );
1766  (~C)(i,j+1UL) -= sum( xmm2 );
1767  }
1768  if( j < N ) {
1769  IntrinsicType xmm1, xmm2;
1770  for( size_t k=0UL; k<K; k+=IT::size ) {
1771  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1772  }
1773  (~C)(i,j) -= sum( xmm1 );
1774  }
1775  }
1776  }
1778  //**********************************************************************************************
1779 
1780  //**Default subtraction assignment to dense matrices********************************************
1794  template< typename MT3 // Type of the left-hand side target matrix
1795  , typename MT4 // Type of the left-hand side matrix operand
1796  , typename MT5 > // Type of the right-hand side matrix operand
1797  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1798  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1799  {
1800  selectDefaultSubAssignKernel( C, A, B );
1801  }
1803  //**********************************************************************************************
1804 
1805 //**BLAS-based subtraction assignment to dense matrices (single precision)***********************
1806 #if BLAZE_BLAS_MODE
1807 
1820  template< typename MT3 // Type of the left-hand side target matrix
1821  , typename MT4 // Type of the left-hand side matrix operand
1822  , typename MT5 > // Type of the right-hand side matrix operand
1823  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1824  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1825  {
1826  using boost::numeric_cast;
1827 
1828  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
1829  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
1830  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
1831 
1832  const int M ( numeric_cast<int>( A.rows() ) );
1833  const int N ( numeric_cast<int>( B.columns() ) );
1834  const int K ( numeric_cast<int>( A.columns() ) );
1835  const int lda( numeric_cast<int>( A.spacing() ) );
1836  const int ldb( numeric_cast<int>( B.spacing() ) );
1837  const int ldc( numeric_cast<int>( C.spacing() ) );
1838 
1839  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1840  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1841  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1842  M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1843  }
1845 #endif
1846  //**********************************************************************************************
1847 
1848 //**BLAS-based subtraction assignment to dense matrices (double precision)***********************
1849 #if BLAZE_BLAS_MODE
1850 
1863  template< typename MT3 // Type of the left-hand side target matrix
1864  , typename MT4 // Type of the left-hand side matrix operand
1865  , typename MT5 > // Type of the right-hand side matrix operand
1866  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1867  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1868  {
1869  using boost::numeric_cast;
1870 
1871  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
1872  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
1873  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
1874 
1875  const int M ( numeric_cast<int>( A.rows() ) );
1876  const int N ( numeric_cast<int>( B.columns() ) );
1877  const int K ( numeric_cast<int>( A.columns() ) );
1878  const int lda( numeric_cast<int>( A.spacing() ) );
1879  const int ldb( numeric_cast<int>( B.spacing() ) );
1880  const int ldc( numeric_cast<int>( C.spacing() ) );
1881 
1882  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1883  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1884  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1885  M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1886  }
1888 #endif
1889  //**********************************************************************************************
1890 
1891 //**BLAS-based subtraction assignment to dense matrices (single precision complex)***************
1892 #if BLAZE_BLAS_MODE
1893 
1906  template< typename MT3 // Type of the left-hand side target matrix
1907  , typename MT4 // Type of the left-hand side matrix operand
1908  , typename MT5 > // Type of the right-hand side matrix operand
1909  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1910  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1911  {
1912  using boost::numeric_cast;
1913 
1914  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1915  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1916  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1917  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1918  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1919  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1920 
1921  const int M ( numeric_cast<int>( A.rows() ) );
1922  const int N ( numeric_cast<int>( B.columns() ) );
1923  const int K ( numeric_cast<int>( A.columns() ) );
1924  const int lda( numeric_cast<int>( A.spacing() ) );
1925  const int ldb( numeric_cast<int>( B.spacing() ) );
1926  const int ldc( numeric_cast<int>( C.spacing() ) );
1927  const complex<float> alpha( -1.0F, 0.0F );
1928  const complex<float> beta ( 1.0F, 0.0F );
1929 
1930  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1931  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1932  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1933  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1934  }
1936 #endif
1937  //**********************************************************************************************
1938 
1939 //**BLAS-based subtraction assignment to dense matrices (double precision complex)***************
1940 #if BLAZE_BLAS_MODE
1941 
1954  template< typename MT3 // Type of the left-hand side target matrix
1955  , typename MT4 // Type of the left-hand side matrix operand
1956  , typename MT5 > // Type of the right-hand side matrix operand
1957  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1958  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1959  {
1960  using boost::numeric_cast;
1961 
1962  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1963  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1964  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1965  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1966  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1967  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1968 
1969  const int M ( numeric_cast<int>( A.rows() ) );
1970  const int N ( numeric_cast<int>( B.columns() ) );
1971  const int K ( numeric_cast<int>( A.columns() ) );
1972  const int lda( numeric_cast<int>( A.spacing() ) );
1973  const int ldb( numeric_cast<int>( B.spacing() ) );
1974  const int ldc( numeric_cast<int>( C.spacing() ) );
1975  const complex<double> alpha( -1.0, 0.0 );
1976  const complex<double> beta ( 1.0, 0.0 );
1977 
1978  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1979  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1980  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1981  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1982  }
1984 #endif
1985  //**********************************************************************************************
1986 
1987  //**Subtraction assignment to sparse matrices***************************************************
1988  // No special implementation for the subtraction assignment to sparse matrices.
1989  //**********************************************************************************************
1990 
1991  //**Multiplication assignment to dense matrices*************************************************
1992  // No special implementation for the multiplication assignment to dense matrices.
1993  //**********************************************************************************************
1994 
1995  //**Multiplication assignment to sparse matrices************************************************
1996  // No special implementation for the multiplication assignment to sparse matrices.
1997  //**********************************************************************************************
1998 
1999  //**Compile time checks*************************************************************************
2006  //**********************************************************************************************
2007 };
2008 //*************************************************************************************************
2009 
2010 
2011 
2012 
2013 //=================================================================================================
2014 //
2015 // DMATSCALARMULTEXPR SPECIALIZATION
2016 //
2017 //=================================================================================================
2018 
2019 //*************************************************************************************************
2027 template< typename MT1 // Type of the left-hand side dense matrix
2028  , typename MT2 // Type of the right-hand side dense matrix
2029  , typename ST > // Type of the right-hand side scalar value
2030 class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >
2031  : public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
2032  , private Expression
2033  , private Computation
2034 {
2035  private:
2036  //**Type definitions****************************************************************************
2037  typedef DMatTDMatMultExpr<MT1,MT2> MMM;  //!< Type of the matrix/matrix multiplication expression that is scaled.
2038  typedef typename MMM::ResultType RES;  //!< ResultType exported by the multiplication expression.
2039  typedef typename MT1::ResultType RT1;  //!< ResultType exported by the left-hand side matrix type MT1.
2040  typedef typename MT2::ResultType RT2;  //!< ResultType exported by the right-hand side matrix type MT2.
2041  typedef typename MT1::CompositeType CT1;  //!< CompositeType exported by the left-hand side matrix type MT1.
2042  typedef typename MT2::CompositeType CT2;  //!< CompositeType exported by the right-hand side matrix type MT2.
2043  //**********************************************************************************************
2044 
2045  //**********************************************************************************************
2047 
2050  template< typename T1, typename T2, typename T3, typename T4 >
2051  struct UseSinglePrecisionKernel {
2052  enum { value = IsFloat<typename T1::ElementType>::value &&
2053  IsFloat<typename T2::ElementType>::value &&
2054  IsFloat<typename T3::ElementType>::value &&
2055  !IsComplex<T4>::value };
2056  };
2057  //**********************************************************************************************
2058 
2059  //**********************************************************************************************
2061 
2064  template< typename T1, typename T2, typename T3, typename T4 >
2065  struct UseDoublePrecisionKernel {
2066  enum { value = IsDouble<typename T1::ElementType>::value &&
2067  IsDouble<typename T2::ElementType>::value &&
2068  IsDouble<typename T3::ElementType>::value &&
2069  !IsComplex<T4>::value };
2070  };
2071  //**********************************************************************************************
2072 
2073  //**********************************************************************************************
2075 
2078  template< typename T1, typename T2, typename T3 >
2079  struct UseSinglePrecisionComplexKernel {
2080  typedef complex<float> Type;
2081  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2082  IsSame<typename T2::ElementType,Type>::value &&
2083  IsSame<typename T3::ElementType,Type>::value };
2084  };
2085  //**********************************************************************************************
2086 
2087  //**********************************************************************************************
2089 
2092  template< typename T1, typename T2, typename T3 >
2093  struct UseDoublePrecisionComplexKernel {
2094  typedef complex<double> Type;
2095  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2096  IsSame<typename T2::ElementType,Type>::value &&
2097  IsSame<typename T3::ElementType,Type>::value };
2098  };
2099  //**********************************************************************************************
2100 
2101  //**********************************************************************************************
2103 
2105  template< typename T1, typename T2, typename T3, typename T4 >
2106  struct UseDefaultKernel {
2107  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2108  !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2109  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2110  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
2111  };
2112  //**********************************************************************************************
2113 
2114  //**********************************************************************************************
2116 
2118  template< typename T1, typename T2, typename T3, typename T4 >
2119  struct UseVectorizedDefaultKernel {
2120  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2121  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2122  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2123  IsSame<typename T1::ElementType,T4>::value &&
2124  IntrinsicTrait<typename T1::ElementType>::addition &&
2125  IntrinsicTrait<typename T1::ElementType>::multiplication };
2126  };
2127  //**********************************************************************************************
2128 
2129  public:
2130  //**Type definitions****************************************************************************
2131  typedef DMatScalarMultExpr<MMM,ST,false> This;  //!< Type of this DMatScalarMultExpr specialization.
2132  typedef typename MultTrait<RES,ST>::Type ResultType;  //!< Result type, taken from MultTrait for RES and the scalar type ST.
2133  typedef typename ResultType::OppositeType OppositeType;  //!< OppositeType exported by ResultType.
2134  typedef typename ResultType::TransposeType TransposeType;  //!< TransposeType exported by ResultType.
2135  typedef typename ResultType::ElementType ElementType;  //!< Element type of ResultType.
2136  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType;  //!< Intrinsic type associated with ElementType; used for the accumulators of the vectorized kernels.
2137  typedef const ElementType ReturnType;  //!< Return type of the access operator.
2138  typedef const ResultType CompositeType;  //!< Composite type exported by this expression.
2139
2141  typedef const DMatTDMatMultExpr<MT1,MT2> LeftOperand;  //!< Type of the stored multiplication expression (see matrix_ and leftOperand()).
2142
2144  typedef typename SelectType< IsNumeric<ElementType>::value, ElementType, ST >::Type RightOperand;  //!< Type of the stored scalar; presumably ElementType when it is numeric and ST otherwise (assumes SelectType yields its first type for a true condition - confirm).
2145
2147  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;  //!< Type used to evaluate the left-hand side operand in the assign functions (selected between const RT1 and CT1 via IsComputation<MT1>).
2148
2150  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;  //!< Type used to evaluate the right-hand side operand in the assign functions (selected between const RT2 and CT2 via IsComputation<MT2>).
2151  //**********************************************************************************************
2152 
2153  //**Compilation flags***************************************************************************
2155  enum { vectorizable = 0 };  //!< Compilation flag for intrinsic optimization; this expression itself is flagged as not vectorizable.
2156
2158  enum { canAlias = CanAlias<MMM>::value };  //!< Compilation flag for aliasing; mirrors CanAlias for the wrapped multiplication expression.
2159  //**********************************************************************************************
2160 
2161  //**Constructor*********************************************************************************
2167  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
2168  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
2169  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2170  {}
2171  //**********************************************************************************************
2172 
2173  //**Access operator*****************************************************************************
2180  inline ReturnType operator()( size_t i, size_t j ) const {
2181  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
2182  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
2183  return matrix_(i,j) * scalar_;
2184  }
2185  //**********************************************************************************************
2186 
2187  //**Rows function*******************************************************************************
2192  inline size_t rows() const {
2193  return matrix_.rows();
2194  }
2195  //**********************************************************************************************
2196 
2197  //**Columns function****************************************************************************
2202  inline size_t columns() const {
2203  return matrix_.columns();
2204  }
2205  //**********************************************************************************************
2206 
2207  //**Left operand access*************************************************************************
2212  inline LeftOperand leftOperand() const {
2213  return matrix_;
2214  }
2215  //**********************************************************************************************
2216 
2217  //**Right operand access************************************************************************
2222  inline RightOperand rightOperand() const {
2223  return scalar_;
2224  }
2225  //**********************************************************************************************
2226 
2227  //**********************************************************************************************
2233  template< typename T >
2234  inline bool isAliased( const T* alias ) const {
2235  return CanAlias<MMM>::value && matrix_.isAliased( alias );
2236  }
2237  //**********************************************************************************************
2238 
2239  private:
2240  //**Member variables****************************************************************************
2241  LeftOperand matrix_;  //!< Dense matrix multiplication expression that is scaled (set from the constructor's matrix argument).
2242  RightOperand scalar_;  //!< Scalar factor of the expression (set from the constructor's scalar argument).
2243  //**********************************************************************************************
2244 
2245  //**Assignment to dense matrices****************************************************************
2254  template< typename MT3 // Type of the target dense matrix
2255  , bool SO > // Storage order of the target dense matrix
2256  friend inline void assign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2257  {
2258  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2259  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2260 
2261  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2262  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2263 
2264  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
2265  return;
2266  }
2267  else if( left.columns() == 0UL ) {
2268  reset( ~lhs );
2269  return;
2270  }
2271 
2272  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2273  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2274 
2275  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2276  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2277  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2278  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2279  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2280  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2281 
2282  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
2283  DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2284  else
2285  DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
2286  }
2287  //**********************************************************************************************
2288 
2289  //**Default assignment to dense matrices********************************************************
2303  template< typename MT3 // Type of the left-hand side target matrix
2304  , typename MT4 // Type of the left-hand side matrix operand
2305  , typename MT5 // Type of the right-hand side matrix operand
2306  , typename ST2 > // Type of the scalar value
2307  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2308  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2309  {
2310  for( size_t i=0UL; i<A.rows(); ++i ) {
2311  for( size_t k=0UL; k<B.columns(); ++k ) {
2312  C(i,k) = A(i,0UL) * B(0UL,k);
2313  }
2314  for( size_t j=1UL; j<A.columns(); ++j ) {
2315  for( size_t k=0UL; k<B.columns(); ++k ) {
2316  C(i,k) += A(i,j) * B(j,k);
2317  }
2318  }
2319  for( size_t k=0UL; k<B.columns(); ++k ) {
2320  C(i,k) *= scalar;
2321  }
2322  }
2323  }
2324  //**********************************************************************************************
2325 
2326  //**Vectorized default assignment to row-major dense matrices***********************************
2340  template< typename MT3 // Type of the left-hand side target matrix
2341  , typename MT4 // Type of the left-hand side matrix operand
2342  , typename MT5 // Type of the right-hand side matrix operand
2343  , typename ST2 > // Type of the scalar value
2344  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2345  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2346  {
2347  typedef IntrinsicTrait<ElementType> IT;
2348 
2349  const size_t M( A.rows() );
2350  const size_t N( B.columns() );
2351  const size_t K( A.columns() );
2352 
2353  size_t i( 0UL );
2354 
2355  for( ; (i+2UL) <= M; i+=2UL ) {
2356  size_t j( 0UL );
2357  for( ; (j+4UL) <= N; j+=4UL ) {
2358  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2359  for( size_t k=0UL; k<K; k+=IT::size ) {
2360  const IntrinsicType a1( A.get(i ,k) );
2361  const IntrinsicType a2( A.get(i+1UL,k) );
2362  const IntrinsicType b1( B.get(k,j ) );
2363  const IntrinsicType b2( B.get(k,j+1UL) );
2364  const IntrinsicType b3( B.get(k,j+2UL) );
2365  const IntrinsicType b4( B.get(k,j+3UL) );
2366  xmm1 = xmm1 + a1 * b1;
2367  xmm2 = xmm2 + a1 * b2;
2368  xmm3 = xmm3 + a1 * b3;
2369  xmm4 = xmm4 + a1 * b4;
2370  xmm5 = xmm5 + a2 * b1;
2371  xmm6 = xmm6 + a2 * b2;
2372  xmm7 = xmm7 + a2 * b3;
2373  xmm8 = xmm8 + a2 * b4;
2374  }
2375  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2376  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2377  (~C)(i ,j+2UL) = sum( xmm3 ) * scalar;
2378  (~C)(i ,j+3UL) = sum( xmm4 ) * scalar;
2379  (~C)(i+1UL,j ) = sum( xmm5 ) * scalar;
2380  (~C)(i+1UL,j+1UL) = sum( xmm6 ) * scalar;
2381  (~C)(i+1UL,j+2UL) = sum( xmm7 ) * scalar;
2382  (~C)(i+1UL,j+3UL) = sum( xmm8 ) * scalar;
2383  }
2384  for( ; (j+2UL) <= N; j+=2UL ) {
2385  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2386  for( size_t k=0UL; k<K; k+=IT::size ) {
2387  const IntrinsicType a1( A.get(i ,k) );
2388  const IntrinsicType a2( A.get(i+1UL,k) );
2389  const IntrinsicType b1( B.get(k,j ) );
2390  const IntrinsicType b2( B.get(k,j+1UL) );
2391  xmm1 = xmm1 + a1 * b1;
2392  xmm2 = xmm2 + a1 * b2;
2393  xmm3 = xmm3 + a2 * b1;
2394  xmm4 = xmm4 + a2 * b2;
2395  }
2396  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2397  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2398  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
2399  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
2400  }
2401  if( j < N ) {
2402  IntrinsicType xmm1, xmm2;
2403  for( size_t k=0UL; k<K; k+=IT::size ) {
2404  const IntrinsicType b1( B.get(k,j) );
2405  xmm1 = xmm1 + A.get(i ,k) * b1;
2406  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2407  }
2408  (~C)(i ,j) = sum( xmm1 ) * scalar;
2409  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
2410  }
2411  }
2412  if( i < M ) {
2413  size_t j( 0UL );
2414  for( ; (j+4UL) <= N; j+=4UL ) {
2415  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2416  for( size_t k=0UL; k<K; k+=IT::size ) {
2417  const IntrinsicType a1( A.get(i,k) );
2418  xmm1 = xmm1 + a1 * B.get(k,j );
2419  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2420  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
2421  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
2422  }
2423  (~C)(i,j ) = sum( xmm1 ) * scalar;
2424  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
2425  (~C)(i,j+2UL) = sum( xmm3 ) * scalar;
2426  (~C)(i,j+3UL) = sum( xmm4 ) * scalar;
2427  }
2428  for( ; (j+2UL) <= N; j+=2UL ) {
2429  IntrinsicType xmm1, xmm2;
2430  for( size_t k=0UL; k<K; k+=IT::size ) {
2431  const IntrinsicType a1( A.get(i,k) );
2432  xmm1 = xmm1 + a1 * B.get(k,j );
2433  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2434  }
2435  (~C)(i,j ) = sum( xmm1 ) * scalar;
2436  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
2437  }
2438  if( j < N ) {
2439  IntrinsicType xmm1, xmm2;
2440  for( size_t k=0UL; k<K; k+=IT::size ) {
2441  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
2442  }
2443  (~C)(i,j) = sum( xmm1 ) * scalar;
2444  }
2445  }
2446  }
2447  //**********************************************************************************************
2448 
2449  //**Vectorized default assignment to column-major dense matrices********************************
2463  template< typename MT3 // Type of the left-hand side target matrix
2464  , typename MT4 // Type of the left-hand side matrix operand
2465  , typename MT5 // Type of the right-hand side matrix operand
2466  , typename ST2 > // Type of the scalar value
2467  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2468  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
2469  {
2470  typedef IntrinsicTrait<ElementType> IT;
2471 
2472  const size_t M( A.rows() );
2473  const size_t N( B.columns() );
2474  const size_t K( A.columns() );
2475 
2476  size_t i( 0UL );
2477 
2478  for( ; (i+4UL) <= M; i+=4UL ) {
2479  size_t j( 0UL );
2480  for( ; (j+2UL) <= N; j+=2UL ) {
2481  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2482  for( size_t k=0UL; k<K; k+=IT::size ) {
2483  const IntrinsicType a1( A.get(i ,k) );
2484  const IntrinsicType a2( A.get(i+1UL,k) );
2485  const IntrinsicType a3( A.get(i+2UL,k) );
2486  const IntrinsicType a4( A.get(i+3UL,k) );
2487  const IntrinsicType b1( B.get(k,j ) );
2488  const IntrinsicType b2( B.get(k,j+1UL) );
2489  xmm1 = xmm1 + a1 * b1;
2490  xmm2 = xmm2 + a1 * b2;
2491  xmm3 = xmm3 + a2 * b1;
2492  xmm4 = xmm4 + a2 * b2;
2493  xmm5 = xmm5 + a3 * b1;
2494  xmm6 = xmm6 + a3 * b2;
2495  xmm7 = xmm7 + a4 * b1;
2496  xmm8 = xmm8 + a4 * b2;
2497  }
2498  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2499  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2500  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
2501  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
2502  (~C)(i+2UL,j ) = sum( xmm5 ) * scalar;
2503  (~C)(i+2UL,j+1UL) = sum( xmm6 ) * scalar;
2504  (~C)(i+3UL,j ) = sum( xmm7 ) * scalar;
2505  (~C)(i+3UL,j+1UL) = sum( xmm8 ) * scalar;
2506  }
2507  if( j < N ) {
2508  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2509  for( size_t k=0UL; k<K; k+=IT::size ) {
2510  const IntrinsicType b1( B.get(k,j) );
2511  xmm1 = xmm1 + A.get(i ,k) * b1;
2512  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2513  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
2514  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
2515  }
2516  (~C)(i ,j) = sum( xmm1 ) * scalar;
2517  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
2518  (~C)(i+2UL,j) = sum( xmm3 ) * scalar;
2519  (~C)(i+3UL,j) = sum( xmm4 ) * scalar;
2520  }
2521  }
2522  for( ; (i+2UL) <= M; i+=2UL ) {
2523  size_t j( 0UL );
2524  for( ; (j+2UL) <= N; j+=2UL ) {
2525  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2526  for( size_t k=0UL; k<K; k+=IT::size ) {
2527  const IntrinsicType a1( A.get(i ,k) );
2528  const IntrinsicType a2( A.get(i+1UL,k) );
2529  const IntrinsicType b1( B.get(k,j ) );
2530  const IntrinsicType b2( B.get(k,j+1UL) );
2531  xmm1 = xmm1 + a1 * b1;
2532  xmm2 = xmm2 + a1 * b2;
2533  xmm3 = xmm3 + a2 * b1;
2534  xmm4 = xmm4 + a2 * b2;
2535  }
2536  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2537  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2538  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
2539  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
2540  }
2541  if( j < N ) {
2542  IntrinsicType xmm1, xmm2;
2543  for( size_t k=0UL; k<K; k+=IT::size ) {
2544  const IntrinsicType b1( B.get(k,j) );
2545  xmm1 = xmm1 + A.get(i ,k) * b1;
2546  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2547  }
2548  (~C)(i ,j) = sum( xmm1 ) * scalar;
2549  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
2550  }
2551  }
2552  if( i < M ) {
2553  size_t j( 0UL );
2554  for( ; (j+2UL) <= N; j+=2UL ) {
2555  IntrinsicType xmm1, xmm2;
2556  for( size_t k=0UL; k<K; k+=IT::size ) {
2557  const IntrinsicType a1( A.get(i,k) );
2558  xmm1 = xmm1 + a1 * B.get(k,j );
2559  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2560  }
2561  (~C)(i,j ) = sum( xmm1 ) * scalar;
2562  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
2563  }
2564  if( j < N ) {
2565  IntrinsicType xmm1, xmm2;
2566  for( size_t k=0UL; k<K; k+=IT::size ) {
2567  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
2568  }
2569  (~C)(i,j) = sum( xmm1 ) * scalar;
2570  }
2571  }
2572  }
2573  //**********************************************************************************************
2574 
2575  //**BLAS-based assignment to dense matrices (default)*******************************************
2589  template< typename MT3 // Type of the left-hand side target matrix
2590  , typename MT4 // Type of the left-hand side matrix operand
2591  , typename MT5 // Type of the right-hand side matrix operand
2592  , typename ST2 > // Type of the scalar value
2593  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2594  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2595  {
2596  selectDefaultAssignKernel( C, A, B, scalar );
2597  }
2598  //**********************************************************************************************
2599 
2600  //**BLAS-based assignment to dense matrices (single precision)**********************************
2601 #if BLAZE_BLAS_MODE
2602 
2615  template< typename MT3 // Type of the left-hand side target matrix
2616  , typename MT4 // Type of the left-hand side matrix operand
2617  , typename MT5 // Type of the right-hand side matrix operand
2618  , typename ST2 > // Type of the scalar value
2619  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2620  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2621  {
2622  using boost::numeric_cast;
2623 
2624  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
2625  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
2626  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
2627 
2628  const int M ( numeric_cast<int>( A.rows() ) );
2629  const int N ( numeric_cast<int>( B.columns() ) );
2630  const int K ( numeric_cast<int>( A.columns() ) );
2631  const int lda( numeric_cast<int>( A.spacing() ) );
2632  const int ldb( numeric_cast<int>( B.spacing() ) );
2633  const int ldc( numeric_cast<int>( C.spacing() ) );
2634 
2635  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2636  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2637  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2638  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
2639  }
2640 #endif
2641  //**********************************************************************************************
2642 
2643  //**BLAS-based assignment to dense matrices (double precision)**********************************
2644 #if BLAZE_BLAS_MODE
2645 
2658  template< typename MT3 // Type of the left-hand side target matrix
2659  , typename MT4 // Type of the left-hand side matrix operand
2660  , typename MT5 // Type of the right-hand side matrix operand
2661  , typename ST2 > // Type of the scalar value
2662  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2663  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2664  {
2665  using boost::numeric_cast;
2666 
2667  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
2668  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
2669  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
2670 
2671  const int M ( numeric_cast<int>( A.rows() ) );
2672  const int N ( numeric_cast<int>( B.columns() ) );
2673  const int K ( numeric_cast<int>( A.columns() ) );
2674  const int lda( numeric_cast<int>( A.spacing() ) );
2675  const int ldb( numeric_cast<int>( B.spacing() ) );
2676  const int ldc( numeric_cast<int>( C.spacing() ) );
2677 
2678  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2679  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2680  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2681  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
2682  }
2683 #endif
2684  //**********************************************************************************************
2685 
2686  //**BLAS-based assignment to dense matrices (single precision complex)**************************
2687 #if BLAZE_BLAS_MODE
2688 
2701  template< typename MT3 // Type of the left-hand side target matrix
2702  , typename MT4 // Type of the left-hand side matrix operand
2703  , typename MT5 // Type of the right-hand side matrix operand
2704  , typename ST2 > // Type of the scalar value
2705  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2706  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2707  {
2708  using boost::numeric_cast;
2709 
2710  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2711  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2712  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2714  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
2715  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
2716  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
2717 
2718  const int M ( numeric_cast<int>( A.rows() ) );
2719  const int N ( numeric_cast<int>( B.columns() ) );
2720  const int K ( numeric_cast<int>( A.columns() ) );
2721  const int lda( numeric_cast<int>( A.spacing() ) );
2722  const int ldb( numeric_cast<int>( B.spacing() ) );
2723  const int ldc( numeric_cast<int>( C.spacing() ) );
2724  const complex<float> alpha( scalar );
2725  const complex<float> beta ( 0.0F, 0.0F );
2726 
2727  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2728  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2729  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2730  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2731  }
2732 #endif
2733  //**********************************************************************************************
2734 
2735  //**BLAS-based assignment to dense matrices (double precision complex)**************************
2736 #if BLAZE_BLAS_MODE
2737 
2750  template< typename MT3 // Type of the left-hand side target matrix
2751  , typename MT4 // Type of the left-hand side matrix operand
2752  , typename MT5 // Type of the right-hand side matrix operand
2753  , typename ST2 > // Type of the scalar value
2754  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2755  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2756  {
2757  using boost::numeric_cast;
2758 
2759  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2760  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2761  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2763  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
2764  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
2765  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
2766 
2767  const int M ( numeric_cast<int>( A.rows() ) );
2768  const int N ( numeric_cast<int>( B.columns() ) );
2769  const int K ( numeric_cast<int>( A.columns() ) );
2770  const int lda( numeric_cast<int>( A.spacing() ) );
2771  const int ldb( numeric_cast<int>( B.spacing() ) );
2772  const int ldc( numeric_cast<int>( C.spacing() ) );
2773  const complex<double> alpha( scalar );
2774  const complex<double> beta ( 0.0, 0.0 );
2775 
2776  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2777  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2778  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2779  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2780  }
2781 #endif
2782  //**********************************************************************************************
2783 
2784  //**Assignment to sparse matrices***************************************************************
2796  template< typename MT // Type of the target sparse matrix
2797  , bool SO > // Storage order of the target sparse matrix
2798  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
2799  {
2800  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
2801 
2807  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
2808 
2809  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2810  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2811 
2812  const TmpType tmp( rhs );
2813  assign( ~lhs, tmp );
2814  }
2815  //**********************************************************************************************
2816 
2817  //**Addition assignment to dense matrices*******************************************************
2829  template< typename MT3 // Type of the target dense matrix
2830  , bool SO > // Storage order of the target dense matrix
2831  friend inline void addAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2832  {
2833  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2834  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2835 
2836  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2837  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2838 
2839  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
2840  return;
2841  }
2842 
2843  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2844  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2845 
2846  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2847  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2848  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2849  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2850  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2851  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2852 
2853  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
2854  DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2855  else
2856  DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2857  }
2858  //**********************************************************************************************
2859 
2860  //**Default addition assignment to dense matrices***********************************************
// Default (non-vectorized) addition assignment kernel: C += A * B * scalar.
//
// Participates in overload resolution only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// does not hold (DisableIf). The scaled product is first evaluated into a temporary of type
// ResultType, which is then added to C via addAssign().
//
// C      : target matrix that is accumulated into.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor applied to the product.
2874  template< typename MT3 // Type of the left-hand side target matrix
2875  , typename MT4 // Type of the left-hand side matrix operand
2876  , typename MT5 // Type of the right-hand side matrix operand
2877  , typename ST2 > // Type of the scalar value
2878  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2879  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2880  {
2881  const ResultType tmp( A * B * scalar );
2882  addAssign( C, tmp );
2883  }
2884  //**********************************************************************************************
2885 
2886  //**Vectorized default addition assignment to row-major dense matrices**************************
2900  template< typename MT3 // Type of the left-hand side target matrix
2901  , typename MT4 // Type of the left-hand side matrix operand
2902  , typename MT5 // Type of the right-hand side matrix operand
2903  , typename ST2 > // Type of the scalar value
2904  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2905  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2906  {
2907  typedef IntrinsicTrait<ElementType> IT;
2908 
2909  const size_t M( A.rows() );
2910  const size_t N( B.columns() );
2911  const size_t K( A.columns() );
2912 
2913  size_t i( 0UL );
2914 
2915  for( ; (i+2UL) <= M; i+=2UL ) {
2916  size_t j( 0UL );
2917  for( ; (j+4UL) <= N; j+=4UL ) {
2918  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2919  for( size_t k=0UL; k<K; k+=IT::size ) {
2920  const IntrinsicType a1( A.get(i ,k) );
2921  const IntrinsicType a2( A.get(i+1UL,k) );
2922  const IntrinsicType b1( B.get(k,j ) );
2923  const IntrinsicType b2( B.get(k,j+1UL) );
2924  const IntrinsicType b3( B.get(k,j+2UL) );
2925  const IntrinsicType b4( B.get(k,j+3UL) );
2926  xmm1 = xmm1 + a1 * b1;
2927  xmm2 = xmm2 + a1 * b2;
2928  xmm3 = xmm3 + a1 * b3;
2929  xmm4 = xmm4 + a1 * b4;
2930  xmm5 = xmm5 + a2 * b1;
2931  xmm6 = xmm6 + a2 * b2;
2932  xmm7 = xmm7 + a2 * b3;
2933  xmm8 = xmm8 + a2 * b4;
2934  }
2935  (~C)(i ,j ) += sum( xmm1 ) * scalar;
2936  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
2937  (~C)(i ,j+2UL) += sum( xmm3 ) * scalar;
2938  (~C)(i ,j+3UL) += sum( xmm4 ) * scalar;
2939  (~C)(i+1UL,j ) += sum( xmm5 ) * scalar;
2940  (~C)(i+1UL,j+1UL) += sum( xmm6 ) * scalar;
2941  (~C)(i+1UL,j+2UL) += sum( xmm7 ) * scalar;
2942  (~C)(i+1UL,j+3UL) += sum( xmm8 ) * scalar;
2943  }
2944  for( ; (j+2UL) <= N; j+=2UL ) {
2945  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2946  for( size_t k=0UL; k<K; k+=IT::size ) {
2947  const IntrinsicType a1( A.get(i ,k) );
2948  const IntrinsicType a2( A.get(i+1UL,k) );
2949  const IntrinsicType b1( B.get(k,j ) );
2950  const IntrinsicType b2( B.get(k,j+1UL) );
2951  xmm1 = xmm1 + a1 * b1;
2952  xmm2 = xmm2 + a1 * b2;
2953  xmm3 = xmm3 + a2 * b1;
2954  xmm4 = xmm4 + a2 * b2;
2955  }
2956  (~C)(i ,j ) += sum( xmm1 ) * scalar;
2957  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
2958  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
2959  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
2960  }
2961  if( j < N ) {
2962  IntrinsicType xmm1, xmm2;
2963  for( size_t k=0UL; k<K; k+=IT::size ) {
2964  const IntrinsicType b1( B.get(k,j) );
2965  xmm1 = xmm1 + A.get(i ,k) * b1;
2966  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2967  }
2968  (~C)(i ,j) += sum( xmm1 ) * scalar;
2969  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
2970  }
2971  }
2972  if( i < M ) {
2973  size_t j( 0UL );
2974  for( ; (j+4UL) <= N; j+=4UL ) {
2975  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2976  for( size_t k=0UL; k<K; k+=IT::size ) {
2977  const IntrinsicType a1( A.get(i,k) );
2978  xmm1 = xmm1 + a1 * B.get(k,j );
2979  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2980  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
2981  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
2982  }
2983  (~C)(i,j ) += sum( xmm1 ) * scalar;
2984  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
2985  (~C)(i,j+2UL) += sum( xmm3 ) * scalar;
2986  (~C)(i,j+3UL) += sum( xmm4 ) * scalar;
2987  }
2988  for( ; (j+2UL) <= N; j+=2UL ) {
2989  IntrinsicType xmm1, xmm2;
2990  for( size_t k=0UL; k<K; k+=IT::size ) {
2991  const IntrinsicType a1( A.get(i,k) );
2992  xmm1 = xmm1 + a1 * B.get(k,j );
2993  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2994  }
2995  (~C)(i,j ) += sum( xmm1 ) * scalar;
2996  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
2997  }
2998  if( j < N ) {
2999  IntrinsicType xmm1, xmm2;
3000  for( size_t k=0UL; k<K; k+=IT::size ) {
3001  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3002  }
3003  (~C)(i,j) += sum( xmm1 ) * scalar;
3004  }
3005  }
3006  }
3007  //**********************************************************************************************
3008 
3009  //**Vectorized default addition assignment to column-major dense matrices***********************
3023  template< typename MT3 // Type of the left-hand side target matrix
3024  , typename MT4 // Type of the left-hand side matrix operand
3025  , typename MT5 // Type of the right-hand side matrix operand
3026  , typename ST2 > // Type of the scalar value
3027  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3028  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3029  {
3030  typedef IntrinsicTrait<ElementType> IT;
3031 
3032  const size_t M( A.rows() );
3033  const size_t N( B.columns() );
3034  const size_t K( A.columns() );
3035 
3036  size_t i( 0UL );
3037 
3038  for( ; (i+4UL) <= M; i+=4UL ) {
3039  size_t j( 0UL );
3040  for( ; (j+2UL) <= N; j+=2UL ) {
3041  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3042  for( size_t k=0UL; k<K; k+=IT::size ) {
3043  const IntrinsicType a1( A.get(i ,k) );
3044  const IntrinsicType a2( A.get(i+1UL,k) );
3045  const IntrinsicType a3( A.get(i+2UL,k) );
3046  const IntrinsicType a4( A.get(i+3UL,k) );
3047  const IntrinsicType b1( B.get(k,j ) );
3048  const IntrinsicType b2( B.get(k,j+1UL) );
3049  xmm1 = xmm1 + a1 * b1;
3050  xmm2 = xmm2 + a1 * b2;
3051  xmm3 = xmm3 + a2 * b1;
3052  xmm4 = xmm4 + a2 * b2;
3053  xmm5 = xmm5 + a3 * b1;
3054  xmm6 = xmm6 + a3 * b2;
3055  xmm7 = xmm7 + a4 * b1;
3056  xmm8 = xmm8 + a4 * b2;
3057  }
3058  (~C)(i ,j ) += sum( xmm1 ) * scalar;
3059  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
3060  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
3061  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
3062  (~C)(i+2UL,j ) += sum( xmm5 ) * scalar;
3063  (~C)(i+2UL,j+1UL) += sum( xmm6 ) * scalar;
3064  (~C)(i+3UL,j ) += sum( xmm7 ) * scalar;
3065  (~C)(i+3UL,j+1UL) += sum( xmm8 ) * scalar;
3066  }
3067  if( j < N ) {
3068  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3069  for( size_t k=0UL; k<K; k+=IT::size ) {
3070  const IntrinsicType b1( B.get(k,j) );
3071  xmm1 = xmm1 + A.get(i ,k) * b1;
3072  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3073  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
3074  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
3075  }
3076  (~C)(i ,j) += sum( xmm1 ) * scalar;
3077  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
3078  (~C)(i+2UL,j) += sum( xmm3 ) * scalar;
3079  (~C)(i+3UL,j) += sum( xmm4 ) * scalar;
3080  }
3081  }
3082  for( ; (i+2UL) <= M; i+=2UL ) {
3083  size_t j( 0UL );
3084  for( ; (j+2UL) <= N; j+=2UL ) {
3085  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3086  for( size_t k=0UL; k<K; k+=IT::size ) {
3087  const IntrinsicType a1( A.get(i ,k) );
3088  const IntrinsicType a2( A.get(i+1UL,k) );
3089  const IntrinsicType b1( B.get(k,j ) );
3090  const IntrinsicType b2( B.get(k,j+1UL) );
3091  xmm1 = xmm1 + a1 * b1;
3092  xmm2 = xmm2 + a1 * b2;
3093  xmm3 = xmm3 + a2 * b1;
3094  xmm4 = xmm4 + a2 * b2;
3095  }
3096  (~C)(i ,j ) += sum( xmm1 ) * scalar;
3097  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
3098  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
3099  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
3100  }
3101  if( j < N ) {
3102  IntrinsicType xmm1, xmm2;
3103  for( size_t k=0UL; k<K; k+=IT::size ) {
3104  const IntrinsicType b1( B.get(k,j) );
3105  xmm1 = xmm1 + A.get(i ,k) * b1;
3106  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3107  }
3108  (~C)(i ,j) += sum( xmm1 ) * scalar;
3109  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
3110  }
3111  }
3112  if( i < M ) {
3113  size_t j( 0UL );
3114  for( ; (j+2UL) <= N; j+=2UL ) {
3115  IntrinsicType xmm1, xmm2;
3116  for( size_t k=0UL; k<K; k+=IT::size ) {
3117  const IntrinsicType a1( A.get(i,k) );
3118  xmm1 = xmm1 + a1 * B.get(k,j );
3119  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3120  }
3121  (~C)(i,j ) += sum( xmm1 ) * scalar;
3122  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
3123  }
3124  if( j < N ) {
3125  IntrinsicType xmm1, xmm2;
3126  for( size_t k=0UL; k<K; k+=IT::size ) {
3127  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3128  }
3129  (~C)(i,j) += sum( xmm1 ) * scalar;
3130  }
3131  }
3132  }
3133  //**********************************************************************************************
3134 
3135  //**BLAS-based addition assignment to dense matrices (default)**********************************
// Fallback of the BLAS kernel selection for the addition assignment: C += A * B * scalar.
//
// Participates in overload resolution only when UseDefaultKernel<MT3,MT4,MT5,ST2> holds
// (EnableIf). No BLAS routine is called here; the arguments are forwarded unchanged to
// selectDefaultAddAssignKernel().
//
// C      : target matrix that is accumulated into.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor applied to the product.
3149  template< typename MT3 // Type of the left-hand side target matrix
3150  , typename MT4 // Type of the left-hand side matrix operand
3151  , typename MT5 // Type of the right-hand side matrix operand
3152  , typename ST2 > // Type of the scalar value
3153  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3154  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3155  {
3156  selectDefaultAddAssignKernel( C, A, B, scalar );
3157  }
3158  //**********************************************************************************************
3159 
3160  //**BLAS-based addition assignment to dense matrices (single precision)*************************
3161 #if BLAZE_BLAS_MODE
3162 
// BLAS-based addition assignment for single precision matrices: C += scalar * A * B via
// cblas_sgemm.
//
// Participates in overload resolution only when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>
// holds (EnableIf); only compiled when BLAZE_BLAS_MODE is enabled.
//
// C      : target matrix; passed to BLAS through data() and spacing().
// A, B   : left-hand and right-hand side matrix operands; passed through data() and spacing().
// scalar : passed as the gemm alpha factor; beta is fixed to 1.0F so that the product is
//          accumulated into the existing content of C.
//
// The matrix dimensions and leading dimensions are converted to int with
// boost::numeric_cast before being handed to the CBLAS interface.
3175  template< typename MT3 // Type of the left-hand side target matrix
3176  , typename MT4 // Type of the left-hand side matrix operand
3177  , typename MT5 // Type of the right-hand side matrix operand
3178  , typename ST2 > // Type of the scalar value
3179  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3180  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3181  {
3182  using boost::numeric_cast;
3183 
3184  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
3185  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
3186  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
3187 
3188  const int M ( numeric_cast<int>( A.rows() ) );
3189  const int N ( numeric_cast<int>( B.columns() ) );
3190  const int K ( numeric_cast<int>( A.columns() ) );
3191  const int lda( numeric_cast<int>( A.spacing() ) );
3192  const int ldb( numeric_cast<int>( B.spacing() ) );
3193  const int ldc( numeric_cast<int>( C.spacing() ) );
3194 
      // Row-major target: row-major call with A untransposed and B transposed.
      // Otherwise: column-major call with A transposed and B untransposed.
3195  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3196  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3197  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3198  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3199  }
3200 #endif
3201  //**********************************************************************************************
3202 
3203  //**BLAS-based addition assignment to dense matrices (double precision)*************************
3204 #if BLAZE_BLAS_MODE
3205 
// BLAS-based addition assignment for double precision matrices: C += scalar * A * B via
// cblas_dgemm.
//
// Participates in overload resolution only when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>
// holds (EnableIf); only compiled when BLAZE_BLAS_MODE is enabled.
//
// C      : target matrix; passed to BLAS through data() and spacing().
// A, B   : left-hand and right-hand side matrix operands; passed through data() and spacing().
// scalar : passed as the gemm alpha factor; beta is fixed to 1.0 so that the product is
//          accumulated into the existing content of C.
//
// The matrix dimensions and leading dimensions are converted to int with
// boost::numeric_cast before being handed to the CBLAS interface.
3218  template< typename MT3 // Type of the left-hand side target matrix
3219  , typename MT4 // Type of the left-hand side matrix operand
3220  , typename MT5 // Type of the right-hand side matrix operand
3221  , typename ST2 > // Type of the scalar value
3222  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3223  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3224  {
3225  using boost::numeric_cast;
3226 
3227  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
3228  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
3229  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
3230 
3231  const int M ( numeric_cast<int>( A.rows() ) );
3232  const int N ( numeric_cast<int>( B.columns() ) );
3233  const int K ( numeric_cast<int>( A.columns() ) );
3234  const int lda( numeric_cast<int>( A.spacing() ) );
3235  const int ldb( numeric_cast<int>( B.spacing() ) );
3236  const int ldc( numeric_cast<int>( C.spacing() ) );
3237 
      // Row-major target: row-major call with A untransposed and B transposed.
      // Otherwise: column-major call with A transposed and B untransposed.
3238  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3239  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3240  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3241  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3242  }
3243 #endif
3244  //**********************************************************************************************
3245 
3246  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
3247 #if BLAZE_BLAS_MODE
3248 
// BLAS-based addition assignment for single precision complex matrices:
// C += scalar * A * B via cblas_cgemm.
//
// Participates in overload resolution only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5>
// holds (EnableIf); only compiled when BLAZE_BLAS_MODE is enabled.
//
// C      : target matrix; passed to BLAS through data() and spacing().
// A, B   : left-hand and right-hand side matrix operands; passed through data() and spacing().
// scalar : converted to complex<float> and passed by address as the gemm alpha factor; beta
//          is fixed to (1,0) so that the product is accumulated into the existing content of C.
//
// The matrix dimensions and leading dimensions are converted to int with
// boost::numeric_cast before being handed to the CBLAS interface.
3261  template< typename MT3 // Type of the left-hand side target matrix
3262  , typename MT4 // Type of the left-hand side matrix operand
3263  , typename MT5 // Type of the right-hand side matrix operand
3264  , typename ST2 > // Type of the scalar value
3265  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3266  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3267  {
3268  using boost::numeric_cast;
3269 
3270  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3271  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3272  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3274  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3275  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3276  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3277 
3278  const int M ( numeric_cast<int>( A.rows() ) );
3279  const int N ( numeric_cast<int>( B.columns() ) );
3280  const int K ( numeric_cast<int>( A.columns() ) );
3281  const int lda( numeric_cast<int>( A.spacing() ) );
3282  const int ldb( numeric_cast<int>( B.spacing() ) );
3283  const int ldc( numeric_cast<int>( C.spacing() ) );
3284  const complex<float> alpha( scalar );
3285  const complex<float> beta ( 1.0F, 0.0F );
3286 
      // Row-major target: row-major call with A untransposed and B transposed.
      // Otherwise: column-major call with A transposed and B untransposed.
3287  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3288  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3289  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3290  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3291  }
3292 #endif
3293  //**********************************************************************************************
3294 
3295  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
3296 #if BLAZE_BLAS_MODE
3297 
// BLAS-based addition assignment for double precision complex matrices:
// C += scalar * A * B via cblas_zgemm.
//
// Participates in overload resolution only when UseDoublePrecisionComplexKernel<MT3,MT4,MT5>
// holds (EnableIf); only compiled when BLAZE_BLAS_MODE is enabled.
//
// C      : target matrix; passed to BLAS through data() and spacing().
// A, B   : left-hand and right-hand side matrix operands; passed through data() and spacing().
// scalar : converted to complex<double> and passed by address as the gemm alpha factor; beta
//          is fixed to (1,0) so that the product is accumulated into the existing content of C.
//
// The matrix dimensions and leading dimensions are converted to int with
// boost::numeric_cast before being handed to the CBLAS interface.
3310  template< typename MT3 // Type of the left-hand side target matrix
3311  , typename MT4 // Type of the left-hand side matrix operand
3312  , typename MT5 // Type of the right-hand side matrix operand
3313  , typename ST2 > // Type of the scalar value
3314  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3315  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3316  {
3317  using boost::numeric_cast;
3318 
3319  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3320  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3321  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3323  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3324  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3325  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3326 
3327  const int M ( numeric_cast<int>( A.rows() ) );
3328  const int N ( numeric_cast<int>( B.columns() ) );
3329  const int K ( numeric_cast<int>( A.columns() ) );
3330  const int lda( numeric_cast<int>( A.spacing() ) );
3331  const int ldb( numeric_cast<int>( B.spacing() ) );
3332  const int ldc( numeric_cast<int>( C.spacing() ) );
3333  const complex<double> alpha( scalar );
3334  const complex<double> beta ( 1.0, 0.0 );
3335 
      // Row-major target: row-major call with A untransposed and B transposed.
      // Otherwise: column-major call with A transposed and B untransposed.
3336  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3337  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3338  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3339  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3340  }
3341 #endif
3342  //**********************************************************************************************
3343 
3344  //**Addition assignment to sparse matrices******************************************************
3345  // No special implementation for the addition assignment to sparse matrices.
3346  //**********************************************************************************************
3347 
3348  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a scaled dense matrix product to a dense matrix
// (lhs -= A * B * scalar).
//
// lhs : target dense matrix of either storage order (SO).
// rhs : scaled multiplication expression; its operands are read through rhs.matrix_ and its
//       scalar through rhs.scalar_.
//
// The function returns without touching lhs when the target has no rows or no columns, or
// when the left operand has no columns. Otherwise both operands are evaluated into A and B
// and the work is dispatched by target size: below DMATTDMATMULT_THRESHOLD elements the
// default kernel is used, otherwise the BLAS kernel selection.
3360  template< typename MT3 // Type of the target dense matrix
3361  , bool SO > // Storage order of the target dense matrix
3362  friend inline void subAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3363  {
3364  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3365  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3366 
3367  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3368  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3369 
      // Empty target or empty inner dimension: there is nothing to subtract.
3370  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3371  return;
3372  }
3373 
3374  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3375  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3376 
3377  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3378  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3379  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3380  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3381  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3382  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3383 
      // Kernel choice depends only on the number of elements of the target matrix.
3384  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
3385  DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3386  else
3387  DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3388  }
3389  //**********************************************************************************************
3390 
3391  //**Default subtraction assignment to dense matrices********************************************
// Default (non-vectorized) subtraction assignment kernel: C -= A * B * scalar.
//
// Participates in overload resolution only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// does not hold (DisableIf). The scaled product is first evaluated into a temporary of type
// ResultType, which is then subtracted from C via subAssign().
//
// C      : target matrix that is subtracted from.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor applied to the product.
3405  template< typename MT3 // Type of the left-hand side target matrix
3406  , typename MT4 // Type of the left-hand side matrix operand
3407  , typename MT5 // Type of the right-hand side matrix operand
3408  , typename ST2 > // Type of the scalar value
3409  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3410  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3411  {
3412  const ResultType tmp( A * B * scalar );
3413  subAssign( C, tmp );
3414  }
3415  //**********************************************************************************************
3416 
3417  //**Vectorized default subtraction assignment to row-major dense matrices***********************
3431  template< typename MT3 // Type of the left-hand side target matrix
3432  , typename MT4 // Type of the left-hand side matrix operand
3433  , typename MT5 // Type of the right-hand side matrix operand
3434  , typename ST2 > // Type of the scalar value
3435  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3436  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3437  {
3438  typedef IntrinsicTrait<ElementType> IT;
3439 
3440  const size_t M( A.rows() );
3441  const size_t N( B.columns() );
3442  const size_t K( A.columns() );
3443 
3444  size_t i( 0UL );
3445 
3446  for( ; (i+2UL) <= M; i+=2UL ) {
3447  size_t j( 0UL );
3448  for( ; (j+4UL) <= N; j+=4UL ) {
3449  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3450  for( size_t k=0UL; k<K; k+=IT::size ) {
3451  const IntrinsicType a1( A.get(i ,k) );
3452  const IntrinsicType a2( A.get(i+1UL,k) );
3453  const IntrinsicType b1( B.get(k,j ) );
3454  const IntrinsicType b2( B.get(k,j+1UL) );
3455  const IntrinsicType b3( B.get(k,j+2UL) );
3456  const IntrinsicType b4( B.get(k,j+3UL) );
3457  xmm1 = xmm1 + a1 * b1;
3458  xmm2 = xmm2 + a1 * b2;
3459  xmm3 = xmm3 + a1 * b3;
3460  xmm4 = xmm4 + a1 * b4;
3461  xmm5 = xmm5 + a2 * b1;
3462  xmm6 = xmm6 + a2 * b2;
3463  xmm7 = xmm7 + a2 * b3;
3464  xmm8 = xmm8 + a2 * b4;
3465  }
3466  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3467  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3468  (~C)(i ,j+2UL) -= sum( xmm3 ) * scalar;
3469  (~C)(i ,j+3UL) -= sum( xmm4 ) * scalar;
3470  (~C)(i+1UL,j ) -= sum( xmm5 ) * scalar;
3471  (~C)(i+1UL,j+1UL) -= sum( xmm6 ) * scalar;
3472  (~C)(i+1UL,j+2UL) -= sum( xmm7 ) * scalar;
3473  (~C)(i+1UL,j+3UL) -= sum( xmm8 ) * scalar;
3474  }
3475  for( ; (j+2UL) <= N; j+=2UL ) {
3476  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3477  for( size_t k=0UL; k<K; k+=IT::size ) {
3478  const IntrinsicType a1( A.get(i ,k) );
3479  const IntrinsicType a2( A.get(i+1UL,k) );
3480  const IntrinsicType b1( B.get(k,j ) );
3481  const IntrinsicType b2( B.get(k,j+1UL) );
3482  xmm1 = xmm1 + a1 * b1;
3483  xmm2 = xmm2 + a1 * b2;
3484  xmm3 = xmm3 + a2 * b1;
3485  xmm4 = xmm4 + a2 * b2;
3486  }
3487  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3488  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3489  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
3490  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
3491  }
3492  if( j < N ) {
3493  IntrinsicType xmm1, xmm2;
3494  for( size_t k=0UL; k<K; k+=IT::size ) {
3495  const IntrinsicType b1( B.get(k,j) );
3496  xmm1 = xmm1 + A.get(i ,k) * b1;
3497  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3498  }
3499  (~C)(i ,j) -= sum( xmm1 ) * scalar;
3500  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
3501  }
3502  }
3503  if( i < M ) {
3504  size_t j( 0UL );
3505  for( ; (j+4UL) <= N; j+=4UL ) {
3506  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3507  for( size_t k=0UL; k<K; k+=IT::size ) {
3508  const IntrinsicType a1( A.get(i,k) );
3509  xmm1 = xmm1 + a1 * B.get(k,j );
3510  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3511  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
3512  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
3513  }
3514  (~C)(i,j ) -= sum( xmm1 ) * scalar;
3515  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
3516  (~C)(i,j+2UL) -= sum( xmm3 ) * scalar;
3517  (~C)(i,j+3UL) -= sum( xmm4 ) * scalar;
3518  }
3519  for( ; (j+2UL) <= N; j+=2UL ) {
3520  IntrinsicType xmm1, xmm2;
3521  for( size_t k=0UL; k<K; k+=IT::size ) {
3522  const IntrinsicType a1( A.get(i,k) );
3523  xmm1 = xmm1 + a1 * B.get(k,j );
3524  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3525  }
3526  (~C)(i,j ) -= sum( xmm1 ) * scalar;
3527  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
3528  }
3529  if( j < N ) {
3530  IntrinsicType xmm1, xmm2;
3531  for( size_t k=0UL; k<K; k+=IT::size ) {
3532  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3533  }
3534  (~C)(i,j) -= sum( xmm1 ) * scalar;
3535  }
3536  }
3537  }
3538  //**********************************************************************************************
3539 
3540  //**Vectorized default subtraction assignment to column-major dense matrices********************
3554  template< typename MT3 // Type of the left-hand side target matrix
3555  , typename MT4 // Type of the left-hand side matrix operand
3556  , typename MT5 // Type of the right-hand side matrix operand
3557  , typename ST2 > // Type of the scalar value
3558  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3559  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3560  {
3561  typedef IntrinsicTrait<ElementType> IT;
3562 
3563  const size_t M( A.rows() );
3564  const size_t N( B.columns() );
3565  const size_t K( A.columns() );
3566 
3567  size_t i( 0UL );
3568 
3569  for( ; (i+4UL) <= M; i+=4UL ) {
3570  size_t j( 0UL );
3571  for( ; (j+2UL) <= N; j+=2UL ) {
3572  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3573  for( size_t k=0UL; k<K; k+=IT::size ) {
3574  const IntrinsicType a1( A.get(i ,k) );
3575  const IntrinsicType a2( A.get(i+1UL,k) );
3576  const IntrinsicType a3( A.get(i+2UL,k) );
3577  const IntrinsicType a4( A.get(i+3UL,k) );
3578  const IntrinsicType b1( B.get(k,j ) );
3579  const IntrinsicType b2( B.get(k,j+1UL) );
3580  xmm1 = xmm1 + a1 * b1;
3581  xmm2 = xmm2 + a1 * b2;
3582  xmm3 = xmm3 + a2 * b1;
3583  xmm4 = xmm4 + a2 * b2;
3584  xmm5 = xmm5 + a3 * b1;
3585  xmm6 = xmm6 + a3 * b2;
3586  xmm7 = xmm7 + a4 * b1;
3587  xmm8 = xmm8 + a4 * b2;
3588  }
3589  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3590  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3591  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
3592  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
3593  (~C)(i+2UL,j ) -= sum( xmm5 ) * scalar;
3594  (~C)(i+2UL,j+1UL) -= sum( xmm6 ) * scalar;
3595  (~C)(i+3UL,j ) -= sum( xmm7 ) * scalar;
3596  (~C)(i+3UL,j+1UL) -= sum( xmm8 ) * scalar;
3597  }
3598  if( j < N ) {
3599  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3600  for( size_t k=0UL; k<K; k+=IT::size ) {
3601  const IntrinsicType b1( B.get(k,j) );
3602  xmm1 = xmm1 + A.get(i ,k) * b1;
3603  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3604  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
3605  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
3606  }
3607  (~C)(i ,j) -= sum( xmm1 ) * scalar;
3608  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
3609  (~C)(i+2UL,j) -= sum( xmm3 ) * scalar;
3610  (~C)(i+3UL,j) -= sum( xmm4 ) * scalar;
3611  }
3612  }
3613  for( ; (i+2UL) <= M; i+=2UL ) {
3614  size_t j( 0UL );
3615  for( ; (j+2UL) <= N; j+=2UL ) {
3616  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3617  for( size_t k=0UL; k<K; k+=IT::size ) {
3618  const IntrinsicType a1( A.get(i ,k) );
3619  const IntrinsicType a2( A.get(i+1UL,k) );
3620  const IntrinsicType b1( B.get(k,j ) );
3621  const IntrinsicType b2( B.get(k,j+1UL) );
3622  xmm1 = xmm1 + a1 * b1;
3623  xmm2 = xmm2 + a1 * b2;
3624  xmm3 = xmm3 + a2 * b1;
3625  xmm4 = xmm4 + a2 * b2;
3626  }
3627  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3628  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3629  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
3630  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
3631  }
3632  if( j < N ) {
3633  IntrinsicType xmm1, xmm2;
3634  for( size_t k=0UL; k<K; k+=IT::size ) {
3635  const IntrinsicType b1( B.get(k,j) );
3636  xmm1 = xmm1 + A.get(i ,k) * b1;
3637  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3638  }
3639  (~C)(i ,j) -= sum( xmm1 ) * scalar;
3640  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
3641  }
3642  }
3643  if( i < M ) {
3644  size_t j( 0UL );
3645  for( ; (j+2UL) <= N; j+=2UL ) {
3646  IntrinsicType xmm1, xmm2;
3647  for( size_t k=0UL; k<K; k+=IT::size ) {
3648  const IntrinsicType a1( A.get(i,k) );
3649  xmm1 = xmm1 + a1 * B.get(k,j );
3650  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3651  }
3652  (~C)(i,j ) -= sum( xmm1 ) * scalar;
3653  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
3654  }
3655  if( j < N ) {
3656  IntrinsicType xmm1, xmm2;
3657  for( size_t k=0UL; k<K; k+=IT::size ) {
3658  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3659  }
3660  (~C)(i,j) -= sum( xmm1 ) * scalar;
3661  }
3662  }
3663  }
3664  //**********************************************************************************************
3665 
3666  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback of the BLAS kernel selection for the subtraction assignment: C -= A * B * scalar.
//
// Participates in overload resolution only when UseDefaultKernel<MT3,MT4,MT5,ST2> holds
// (EnableIf). No BLAS routine is called here; the arguments are forwarded unchanged to
// selectDefaultSubAssignKernel().
//
// C      : target matrix that is subtracted from.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor applied to the product.
3680  template< typename MT3 // Type of the left-hand side target matrix
3681  , typename MT4 // Type of the left-hand side matrix operand
3682  , typename MT5 // Type of the right-hand side matrix operand
3683  , typename ST2 > // Type of the scalar value
3684  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3685  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3686  {
3687  selectDefaultSubAssignKernel( C, A, B, scalar );
3688  }
3689  //**********************************************************************************************
3690 
3691  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
3692 #if BLAZE_BLAS_MODE
3693 
// BLAS-based subtraction assignment for single precision matrices: C -= scalar * A * B via
// cblas_sgemm.
//
// Participates in overload resolution only when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>
// holds (EnableIf); only compiled when BLAZE_BLAS_MODE is enabled.
//
// C      : target matrix; passed to BLAS through data() and spacing().
// A, B   : left-hand and right-hand side matrix operands; passed through data() and spacing().
// scalar : negated and passed as the gemm alpha factor; beta is fixed to 1.0F so that the
//          negated product is accumulated into the existing content of C.
//
// The matrix dimensions and leading dimensions are converted to int with
// boost::numeric_cast before being handed to the CBLAS interface.
3706  template< typename MT3 // Type of the left-hand side target matrix
3707  , typename MT4 // Type of the left-hand side matrix operand
3708  , typename MT5 // Type of the right-hand side matrix operand
3709  , typename ST2 > // Type of the scalar value
3710  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3711  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3712  {
3713  using boost::numeric_cast;
3714 
3715  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
3716  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
3717  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
3718 
3719  const int M ( numeric_cast<int>( A.rows() ) );
3720  const int N ( numeric_cast<int>( B.columns() ) );
3721  const int K ( numeric_cast<int>( A.columns() ) );
3722  const int lda( numeric_cast<int>( A.spacing() ) );
3723  const int ldb( numeric_cast<int>( B.spacing() ) );
3724  const int ldc( numeric_cast<int>( C.spacing() ) );
3725 
      // Row-major target: row-major call with A untransposed and B transposed.
      // Otherwise: column-major call with A transposed and B untransposed.
3726  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3727  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3728  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3729  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3730  }
3731 #endif
3732  //**********************************************************************************************
3733 
3734  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
3735 #if BLAZE_BLAS_MODE
3736 
// BLAS-based subtraction assignment for double precision matrices: C -= scalar * A * B via
// cblas_dgemm.
//
// Participates in overload resolution only when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>
// holds (EnableIf); only compiled when BLAZE_BLAS_MODE is enabled.
//
// C      : target matrix; passed to BLAS through data() and spacing().
// A, B   : left-hand and right-hand side matrix operands; passed through data() and spacing().
// scalar : negated and passed as the gemm alpha factor; beta is fixed to 1.0 so that the
//          negated product is accumulated into the existing content of C.
//
// The matrix dimensions and leading dimensions are converted to int with
// boost::numeric_cast before being handed to the CBLAS interface.
3749  template< typename MT3 // Type of the left-hand side target matrix
3750  , typename MT4 // Type of the left-hand side matrix operand
3751  , typename MT5 // Type of the right-hand side matrix operand
3752  , typename ST2 > // Type of the scalar value
3753  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3754  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3755  {
3756  using boost::numeric_cast;
3757 
3758  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
3759  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
3760  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
3761 
3762  const int M ( numeric_cast<int>( A.rows() ) );
3763  const int N ( numeric_cast<int>( B.columns() ) );
3764  const int K ( numeric_cast<int>( A.columns() ) );
3765  const int lda( numeric_cast<int>( A.spacing() ) );
3766  const int ldb( numeric_cast<int>( B.spacing() ) );
3767  const int ldc( numeric_cast<int>( C.spacing() ) );
3768 
      // Row-major target: row-major call with A untransposed and B transposed.
      // Otherwise: column-major call with A transposed and B untransposed.
3769  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3770  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3771  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3772  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3773  }
3774 #endif
3775  //**********************************************************************************************
3776 
3777  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
3778 #if BLAZE_BLAS_MODE
3779 
// BLAS-based subtraction assignment for single precision complex matrices:
// C -= scalar * A * B via cblas_cgemm.
//
// Participates in overload resolution only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5>
// holds (EnableIf); only compiled when BLAZE_BLAS_MODE is enabled.
//
// C      : target matrix; passed to BLAS through data() and spacing().
// A, B   : left-hand and right-hand side matrix operands; passed through data() and spacing().
// scalar : negated, converted to complex<float> and passed by address as the gemm alpha
//          factor; beta is fixed to (1,0) so that the negated product is accumulated into the
//          existing content of C.
//
// The matrix dimensions and leading dimensions are converted to int with
// boost::numeric_cast before being handed to the CBLAS interface.
3792  template< typename MT3 // Type of the left-hand side target matrix
3793  , typename MT4 // Type of the left-hand side matrix operand
3794  , typename MT5 // Type of the right-hand side matrix operand
3795  , typename ST2 > // Type of the scalar value
3796  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3797  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3798  {
3799  using boost::numeric_cast;
3800 
3801  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3802  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3803  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3805  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3806  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3807  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3808 
3809  const int M ( numeric_cast<int>( A.rows() ) );
3810  const int N ( numeric_cast<int>( B.columns() ) );
3811  const int K ( numeric_cast<int>( A.columns() ) );
3812  const int lda( numeric_cast<int>( A.spacing() ) );
3813  const int ldb( numeric_cast<int>( B.spacing() ) );
3814  const int ldc( numeric_cast<int>( C.spacing() ) );
3815  const complex<float> alpha( -scalar );
3816  const complex<float> beta ( 1.0F, 0.0F );
3817 
      // Row-major target: row-major call with A untransposed and B transposed.
      // Otherwise: column-major call with A transposed and B untransposed.
3818  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3819  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3820  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3821  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3822  }
3823 #endif
3824  //**********************************************************************************************
3825 
3826  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
3827 #if BLAZE_BLAS_MODE
3828 
3841  template< typename MT3 // Type of the left-hand side target matrix
3842  , typename MT4 // Type of the left-hand side matrix operand
3843  , typename MT5 // Type of the right-hand side matrix operand
3844  , typename ST2 > // Type of the scalar value
3845  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3846  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3847  {
3848  using boost::numeric_cast;
3849 
3850  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3851  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3852  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3854  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3855  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3856  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3857 
3858  const int M ( numeric_cast<int>( A.rows() ) );
3859  const int N ( numeric_cast<int>( B.columns() ) );
3860  const int K ( numeric_cast<int>( A.columns() ) );
3861  const int lda( numeric_cast<int>( A.spacing() ) );
3862  const int ldb( numeric_cast<int>( B.spacing() ) );
3863  const int ldc( numeric_cast<int>( C.spacing() ) );
3864  const complex<double> alpha( -scalar );
3865  const complex<double> beta ( 1.0, 0.0 );
3866 
3867  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3868  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3869  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3870  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3871  }
3872 #endif
3873  //**********************************************************************************************
3874 
3875  //**Subtraction assignment to sparse matrices***************************************************
3876  // No special implementation for the subtraction assignment to sparse matrices.
3877  //**********************************************************************************************
3878 
3879  //**Multiplication assignment to dense matrices*************************************************
3880  // No special implementation for the multiplication assignment to dense matrices.
3881  //**********************************************************************************************
3882 
3883  //**Multiplication assignment to sparse matrices************************************************
3884  // No special implementation for the multiplication assignment to sparse matrices.
3885  //**********************************************************************************************
3886 
3887  //**Compile time checks*************************************************************************
3895  //**********************************************************************************************
3896 };
3898 //*************************************************************************************************
3899 
3900 
3901 
3902 
3903 //=================================================================================================
3904 //
3905 // GLOBAL BINARY ARITHMETIC OPERATORS
3906 //
3907 //=================================================================================================
3908 
3909 //*************************************************************************************************
3938 template< typename T1 // Type of the left-hand side dense matrix
3939  , typename T2 > // Type of the right-hand side dense matrix
3940 inline const DMatTDMatMultExpr<T1,T2>
3942 {
3943  if( (~lhs).columns() != (~rhs).rows() )
3944  throw std::invalid_argument( "Matrix sizes do not match" );
3945 
3946  return DMatTDMatMultExpr<T1,T2>( ~lhs, ~rhs );
3947 }
3948 //*************************************************************************************************
3949 
3950 
3951 
3952 
3953 //=================================================================================================
3954 //
3955 // EXPRESSION TRAIT SPECIALIZATIONS
3956 //
3957 //=================================================================================================
3958 
3959 //*************************************************************************************************
3961 template< typename MT1, typename MT2, typename VT >
3962 struct DMatDVecMultExprTrait< DMatTDMatMultExpr<MT1,MT2>, VT >
3963 {
3964  public:
3965  //**********************************************************************************************
3966  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3967  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3968  IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
3969  , typename DMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
3970  , INVALID_TYPE >::Type Type;
3971  //**********************************************************************************************
3972 };
3974 //*************************************************************************************************
3975 
3976 
3977 //*************************************************************************************************
3979 template< typename MT1, typename MT2, typename VT >
3980 struct DMatSVecMultExprTrait< DMatTDMatMultExpr<MT1,MT2>, VT >
3981 {
3982  public:
3983  //**********************************************************************************************
3984  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
3985  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
3986  IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
3987  , typename DMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
3988  , INVALID_TYPE >::Type Type;
3989  //**********************************************************************************************
3990 };
3992 //*************************************************************************************************
3993 
3994 
3995 //*************************************************************************************************
3997 template< typename VT, typename MT1, typename MT2 >
3998 struct TDVecDMatMultExprTrait< VT, DMatTDMatMultExpr<MT1,MT2> >
3999 {
4000  public:
4001  //**********************************************************************************************
4002  typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4003  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4004  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4005  , typename TDVecTDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4006  , INVALID_TYPE >::Type Type;
4007  //**********************************************************************************************
4008 };
4010 //*************************************************************************************************
4011 
4012 
4013 //*************************************************************************************************
4015 template< typename VT, typename MT1, typename MT2 >
4016 struct TSVecDMatMultExprTrait< VT, DMatTDMatMultExpr<MT1,MT2> >
4017 {
4018  public:
4019  //**********************************************************************************************
4020  typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4021  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4022  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4023  , typename TDVecTDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4024  , INVALID_TYPE >::Type Type;
4025  //**********************************************************************************************
4026 };
4028 //*************************************************************************************************
4029 
4030 } // namespace blaze
4031 
4032 #endif