All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
20 //=================================================================================================
21 
22 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
24 
25 
26 //*************************************************************************************************
27 // Includes
28 //*************************************************************************************************
29 
30 #include <stdexcept>
31 #include <boost/cast.hpp>
38 #include <blaze/math/Intrinsics.h>
39 #include <blaze/math/shims/Reset.h>
58 #include <blaze/system/BLAS.h>
60 #include <blaze/util/Assert.h>
61 #include <blaze/util/Complex.h>
66 #include <blaze/util/DisableIf.h>
67 #include <blaze/util/EnableIf.h>
68 #include <blaze/util/InvalidType.h>
70 #include <blaze/util/SelectType.h>
71 #include <blaze/util/Types.h>
77 
78 
79 namespace blaze {
80 
81 //=================================================================================================
82 //
83 // CLASS DMATTDMATMULTEXPR
84 //
85 //=================================================================================================
86 
87 //*************************************************************************************************
94 template< typename MT1 // Type of the left-hand side dense matrix
95  , typename MT2 > // Type of the right-hand side dense matrix
96 class DMatTDMatMultExpr : public DenseMatrix< DMatTDMatMultExpr<MT1,MT2>, false >
97  , private Expression
98  , private Computation
99 {
100  private:
101  //**Type definitions****************************************************************************
// Local shorthands for nested types of the two operand types MT1 and MT2.
102  typedef typename MT1::ResultType RT1;  // MT1's nested ResultType
103  typedef typename MT2::ResultType RT2;  // MT2's nested ResultType
104  typedef typename MT1::CompositeType CT1;  // MT1's nested CompositeType
105  typedef typename MT2::CompositeType CT2;  // MT2's nested CompositeType
106  //**********************************************************************************************
107 
108  //**********************************************************************************************
110 
111 
113  template< typename T1, typename T2, typename T3 >
114  struct UseSinglePrecisionKernel {
118  };
120  //**********************************************************************************************
121 
122  //**********************************************************************************************
124 
125 
127  template< typename T1, typename T2, typename T3 >
128  struct UseDoublePrecisionKernel {
132  };
134  //**********************************************************************************************
135 
136  //**********************************************************************************************
138 
139 
142  template< typename T1, typename T2, typename T3 >
143  struct UseSinglePrecisionComplexKernel {
144  typedef complex<float> Type;
145  enum { value = IsSame<typename T1::ElementType,Type>::value &&
146  IsSame<typename T2::ElementType,Type>::value &&
147  IsSame<typename T3::ElementType,Type>::value };
148  };
150  //**********************************************************************************************
151 
152  //**********************************************************************************************
154 
155 
158  template< typename T1, typename T2, typename T3 >
159  struct UseDoublePrecisionComplexKernel {
160  typedef complex<double> Type;
161  enum { value = IsSame<typename T1::ElementType,Type>::value &&
162  IsSame<typename T2::ElementType,Type>::value &&
163  IsSame<typename T3::ElementType,Type>::value };
164  };
166  //**********************************************************************************************
167 
168  //**********************************************************************************************
170 
171 
173  template< typename T1, typename T2, typename T3 >
174  struct UseDefaultKernel {
175  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
176  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
177  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
178  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
179  };
181  //**********************************************************************************************
182 
183  //**********************************************************************************************
185 
186 
188  template< typename T1, typename T2, typename T3 >
189  struct UseVectorizedDefaultKernel {
190  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
191  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
192  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
193  IntrinsicTrait<typename T1::ElementType>::addition &&
194  IntrinsicTrait<typename T1::ElementType>::multiplication };
195  };
197  //**********************************************************************************************
198 
199  public:
200  //**Type definitions****************************************************************************
// Public type aliases of the expression.
// NOTE(review): ResultType is used here but its declaration is not visible in this listing;
// confirm that it is declared ahead of these typedefs.
203  typedef typename ResultType::OppositeType OppositeType;  // Nested OppositeType of ResultType
204  typedef typename ResultType::TransposeType TransposeType;  // Nested TransposeType of ResultType
205  typedef typename ResultType::ElementType ElementType;  // Nested ElementType of ResultType
207  typedef const ElementType ReturnType;  // Type returned by operator(): a const element value
208  typedef const ResultType CompositeType;  // Const ResultType
209 
// Type of the stored left-hand side operand: const MT1 or const MT1&, chosen via
// IsExpression<MT1> (presumably SelectType yields the first type for a true condition).
211  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
212 
// Type of the stored right-hand side operand, chosen the same way for MT2.
214  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
215 
// Type used for the local left-hand side operand A in the assignment functions:
// const RT1 or CT1, chosen via IsComputation<MT1>.
217  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
218 
// Type used for the local right-hand side operand B in the assignment functions:
// const RT2 or CT2, chosen via IsComputation<MT2>.
220  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
221  //**********************************************************************************************
222 
223  //**Compilation flags***************************************************************************
// Compilation flag: the multiplication expression itself reports vectorizable as 0.
225  enum { vectorizable = 0 };
226  //**********************************************************************************************
227 
228  //**Constructor*********************************************************************************
234  explicit inline DMatTDMatMultExpr( const MT1& lhs, const MT2& rhs )
235  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
236  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
237  {
238  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
239  }
240  //**********************************************************************************************
241 
242  //**Access operator*****************************************************************************
249  inline ReturnType operator()( size_t i, size_t j ) const {
250  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
251  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
252 
253  ElementType tmp;
254 
255  if( lhs_.columns() != 0UL ) {
256  const size_t end( ( ( lhs_.columns()-1UL ) & size_t(-2) ) + 1UL );
257  tmp = lhs_(i,0UL) * rhs_(0UL,j);
258  for( size_t k=1UL; k<end; k+=2UL ) {
259  tmp += lhs_(i,k ) * rhs_(k ,j);
260  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
261  }
262  if( end < lhs_.columns() ) {
263  tmp += lhs_(i,end) * rhs_(end,j);
264  }
265  }
266  else {
267  reset( tmp );
268  }
269 
270  return tmp;
271  }
272  //**********************************************************************************************
273 
274  //**Rows function*******************************************************************************
279  inline size_t rows() const {
280  return lhs_.rows();
281  }
282  //**********************************************************************************************
283 
284  //**Columns function****************************************************************************
289  inline size_t columns() const {
290  return rhs_.columns();
291  }
292  //**********************************************************************************************
293 
294  //**Left operand access*************************************************************************
299  inline LeftOperand leftOperand() const {
300  return lhs_;
301  }
302  //**********************************************************************************************
303 
304  //**Right operand access************************************************************************
309  inline RightOperand rightOperand() const {
310  return rhs_;
311  }
312  //**********************************************************************************************
313 
314  //**********************************************************************************************
320  template< typename T >
321  inline bool canAlias( const T* alias ) const {
322  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
323  }
324  //**********************************************************************************************
325 
326  //**********************************************************************************************
332  template< typename T >
333  inline bool isAliased( const T* alias ) const {
334  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
335  }
336  //**********************************************************************************************
337 
338  private:
339  //**Member variables****************************************************************************
342  //**********************************************************************************************
343 
344  //**Assignment to dense matrices****************************************************************
353  template< typename MT // Type of the target dense matrix
354  , bool SO > // Storage order of the target dense matrix
355  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
356  {
358 
359  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
360  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
361 
362  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
363  return;
364  }
365  else if( rhs.lhs_.columns() == 0UL ) {
366  reset( ~lhs );
367  return;
368  }
369 
370  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
371  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
372 
373  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
374  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
375  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
376  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
377  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
378  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
379 
380  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
381  DMatTDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
382  else
383  DMatTDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
384  }
386  //**********************************************************************************************
387 
388  //**Default assignment to dense matrices********************************************************
402  template< typename MT3 // Type of the left-hand side target matrix
403  , typename MT4 // Type of the left-hand side matrix operand
404  , typename MT5 > // Type of the right-hand side matrix operand
405  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
406  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
407  {
408  const size_t M( A.rows() );
409  const size_t N( B.columns() );
410  const size_t K( A.columns() );
411 
412  for( size_t i=0UL; i<M; ++i ) {
413  for( size_t j=0UL; j<N; ++j ) {
414  C(i,j) = A(i,0UL) * B(0UL,j);
415  }
416  for( size_t k=1UL; k<K; ++k ) {
417  for( size_t j=0UL; j<N; ++j ) {
418  C(i,j) += A(i,k) * B(k,j);
419  }
420  }
421  }
422  }
424  //**********************************************************************************************
425 
426  //**Vectorized default assignment to row-major dense matrices***********************************
440  template< typename MT3 // Type of the left-hand side target matrix
441  , typename MT4 // Type of the left-hand side matrix operand
442  , typename MT5 > // Type of the right-hand side matrix operand
443  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
444  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
445  {
446  typedef IntrinsicTrait<ElementType> IT;
447 
448  const size_t M( A.rows() );
449  const size_t N( B.columns() );
450  const size_t K( A.columns() );
451 
452  size_t i( 0UL );
453 
454  for( ; (i+2UL) <= M; i+=2UL ) {
455  size_t j( 0UL );
456  for( ; (j+4UL) <= N; j+=4UL ) {
457  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
458  for( size_t k=0UL; k<K; k+=IT::size ) {
459  const IntrinsicType a1( A.get(i ,k) );
460  const IntrinsicType a2( A.get(i+1UL,k) );
461  const IntrinsicType b1( B.get(k,j ) );
462  const IntrinsicType b2( B.get(k,j+1UL) );
463  const IntrinsicType b3( B.get(k,j+2UL) );
464  const IntrinsicType b4( B.get(k,j+3UL) );
465  xmm1 = xmm1 + a1 * b1;
466  xmm2 = xmm2 + a1 * b2;
467  xmm3 = xmm3 + a1 * b3;
468  xmm4 = xmm4 + a1 * b4;
469  xmm5 = xmm5 + a2 * b1;
470  xmm6 = xmm6 + a2 * b2;
471  xmm7 = xmm7 + a2 * b3;
472  xmm8 = xmm8 + a2 * b4;
473  }
474  (~C)(i ,j ) = sum( xmm1 );
475  (~C)(i ,j+1UL) = sum( xmm2 );
476  (~C)(i ,j+2UL) = sum( xmm3 );
477  (~C)(i ,j+3UL) = sum( xmm4 );
478  (~C)(i+1UL,j ) = sum( xmm5 );
479  (~C)(i+1UL,j+1UL) = sum( xmm6 );
480  (~C)(i+1UL,j+2UL) = sum( xmm7 );
481  (~C)(i+1UL,j+3UL) = sum( xmm8 );
482  }
483  for( ; (j+2UL) <= N; j+=2UL ) {
484  IntrinsicType xmm1, xmm2, xmm3, xmm4;
485  for( size_t k=0UL; k<K; k+=IT::size ) {
486  const IntrinsicType a1( A.get(i ,k) );
487  const IntrinsicType a2( A.get(i+1UL,k) );
488  const IntrinsicType b1( B.get(k,j ) );
489  const IntrinsicType b2( B.get(k,j+1UL) );
490  xmm1 = xmm1 + a1 * b1;
491  xmm2 = xmm2 + a1 * b2;
492  xmm3 = xmm3 + a2 * b1;
493  xmm4 = xmm4 + a2 * b2;
494  }
495  (~C)(i ,j ) = sum( xmm1 );
496  (~C)(i ,j+1UL) = sum( xmm2 );
497  (~C)(i+1UL,j ) = sum( xmm3 );
498  (~C)(i+1UL,j+1UL) = sum( xmm4 );
499  }
500  if( j < N ) {
501  IntrinsicType xmm1, xmm2;
502  for( size_t k=0UL; k<K; k+=IT::size ) {
503  const IntrinsicType b1( B.get(k,j) );
504  xmm1 = xmm1 + A.get(i ,k) * b1;
505  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
506  }
507  (~C)(i ,j) = sum( xmm1 );
508  (~C)(i+1UL,j) = sum( xmm2 );
509  }
510  }
511  if( i < M ) {
512  size_t j( 0UL );
513  for( ; (j+4UL) <= N; j+=4UL ) {
514  IntrinsicType xmm1, xmm2, xmm3, xmm4;
515  for( size_t k=0UL; k<K; k+=IT::size ) {
516  const IntrinsicType a1( A.get(i,k) );
517  xmm1 = xmm1 + a1 * B.get(k,j );
518  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
519  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
520  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
521  }
522  (~C)(i,j ) = sum( xmm1 );
523  (~C)(i,j+1UL) = sum( xmm2 );
524  (~C)(i,j+2UL) = sum( xmm3 );
525  (~C)(i,j+3UL) = sum( xmm4 );
526  }
527  for( ; (j+2UL) <= N; j+=2UL ) {
528  IntrinsicType xmm1, xmm2;
529  for( size_t k=0UL; k<K; k+=IT::size ) {
530  const IntrinsicType a1( A.get(i,k) );
531  xmm1 = xmm1 + a1 * B.get(k,j );
532  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
533  }
534  (~C)(i,j ) = sum( xmm1 );
535  (~C)(i,j+1UL) = sum( xmm2 );
536  }
537  if( j < N ) {
538  IntrinsicType xmm1, xmm2;
539  for( size_t k=0UL; k<K; k+=IT::size ) {
540  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
541  }
542  (~C)(i,j) = sum( xmm1 );
543  }
544  }
545  }
547  //**********************************************************************************************
548 
549  //**Vectorized default assignment to column-major dense matrices********************************
563  template< typename MT3 // Type of the left-hand side target matrix
564  , typename MT4 // Type of the left-hand side matrix operand
565  , typename MT5 > // Type of the right-hand side matrix operand
566  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
567  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
568  {
569  typedef IntrinsicTrait<ElementType> IT;
570 
571  const size_t M( A.rows() );
572  const size_t N( B.columns() );
573  const size_t K( A.columns() );
574 
575  size_t i( 0UL );
576 
577  for( ; (i+4UL) <= M; i+=4UL ) {
578  size_t j( 0UL );
579  for( ; (j+2UL) <= N; j+=2UL ) {
580  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
581  for( size_t k=0UL; k<K; k+=IT::size ) {
582  const IntrinsicType a1( A.get(i ,k) );
583  const IntrinsicType a2( A.get(i+1UL,k) );
584  const IntrinsicType a3( A.get(i+2UL,k) );
585  const IntrinsicType a4( A.get(i+3UL,k) );
586  const IntrinsicType b1( B.get(k,j ) );
587  const IntrinsicType b2( B.get(k,j+1UL) );
588  xmm1 = xmm1 + a1 * b1;
589  xmm2 = xmm2 + a1 * b2;
590  xmm3 = xmm3 + a2 * b1;
591  xmm4 = xmm4 + a2 * b2;
592  xmm5 = xmm5 + a3 * b1;
593  xmm6 = xmm6 + a3 * b2;
594  xmm7 = xmm7 + a4 * b1;
595  xmm8 = xmm8 + a4 * b2;
596  }
597  (~C)(i ,j ) = sum( xmm1 );
598  (~C)(i ,j+1UL) = sum( xmm2 );
599  (~C)(i+1UL,j ) = sum( xmm3 );
600  (~C)(i+1UL,j+1UL) = sum( xmm4 );
601  (~C)(i+2UL,j ) = sum( xmm5 );
602  (~C)(i+2UL,j+1UL) = sum( xmm6 );
603  (~C)(i+3UL,j ) = sum( xmm7 );
604  (~C)(i+3UL,j+1UL) = sum( xmm8 );
605  }
606  if( j < N ) {
607  IntrinsicType xmm1, xmm2, xmm3, xmm4;
608  for( size_t k=0UL; k<K; k+=IT::size ) {
609  const IntrinsicType b1( B.get(k,j) );
610  xmm1 = xmm1 + A.get(i ,k) * b1;
611  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
612  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
613  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
614  }
615  (~C)(i ,j) = sum( xmm1 );
616  (~C)(i+1UL,j) = sum( xmm2 );
617  (~C)(i+2UL,j) = sum( xmm3 );
618  (~C)(i+3UL,j) = sum( xmm4 );
619  }
620  }
621  for( ; (i+2UL) <= M; i+=2UL ) {
622  size_t j( 0UL );
623  for( ; (j+2UL) <= N; j+=2UL ) {
624  IntrinsicType xmm1, xmm2, xmm3, xmm4;
625  for( size_t k=0UL; k<K; k+=IT::size ) {
626  const IntrinsicType a1( A.get(i ,k) );
627  const IntrinsicType a2( A.get(i+1UL,k) );
628  const IntrinsicType b1( B.get(k,j ) );
629  const IntrinsicType b2( B.get(k,j+1UL) );
630  xmm1 = xmm1 + a1 * b1;
631  xmm2 = xmm2 + a1 * b2;
632  xmm3 = xmm3 + a2 * b1;
633  xmm4 = xmm4 + a2 * b2;
634  }
635  (~C)(i ,j ) = sum( xmm1 );
636  (~C)(i ,j+1UL) = sum( xmm2 );
637  (~C)(i+1UL,j ) = sum( xmm3 );
638  (~C)(i+1UL,j+1UL) = sum( xmm4 );
639  }
640  if( j < N ) {
641  IntrinsicType xmm1, xmm2;
642  for( size_t k=0UL; k<K; k+=IT::size ) {
643  const IntrinsicType b1( B.get(k,j) );
644  xmm1 = xmm1 + A.get(i ,k) * b1;
645  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
646  }
647  (~C)(i ,j) = sum( xmm1 );
648  (~C)(i+1UL,j) = sum( xmm2 );
649  }
650  }
651  if( i < M ) {
652  size_t j( 0UL );
653  for( ; (j+2UL) <= N; j+=2UL ) {
654  IntrinsicType xmm1, xmm2;
655  for( size_t k=0UL; k<K; k+=IT::size ) {
656  const IntrinsicType a1( A.get(i,k) );
657  xmm1 = xmm1 + a1 * B.get(k,j );
658  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
659  }
660  (~C)(i,j ) = sum( xmm1 );
661  (~C)(i,j+1UL) = sum( xmm2 );
662  }
663  if( j < N ) {
664  IntrinsicType xmm1, xmm2;
665  for( size_t k=0UL; k<K; k+=IT::size ) {
666  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
667  }
668  (~C)(i,j) = sum( xmm1 );
669  }
670  }
671  }
673  //**********************************************************************************************
674 
675  //**Default assignment to dense matrices********************************************************
689  template< typename MT3 // Type of the left-hand side target matrix
690  , typename MT4 // Type of the left-hand side matrix operand
691  , typename MT5 > // Type of the right-hand side matrix operand
692  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
693  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
694  {
695  selectDefaultAssignKernel( C, A, B );
696  }
698  //**********************************************************************************************
699 
700  //**BLAS-based assignment to dense matrices (single precision)**********************************
701 #if BLAZE_BLAS_MODE
702 
715  template< typename MT3 // Type of the left-hand side target matrix
716  , typename MT4 // Type of the left-hand side matrix operand
717  , typename MT5 > // Type of the right-hand side matrix operand
718  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
719  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
720  {
721  using boost::numeric_cast;
722 
723  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
724  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
725  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
726 
727  const int M ( numeric_cast<int>( A.rows() ) );
728  const int N ( numeric_cast<int>( B.columns() ) );
729  const int K ( numeric_cast<int>( A.columns() ) );
730  const int lda( numeric_cast<int>( A.spacing() ) );
731  const int ldb( numeric_cast<int>( B.spacing() ) );
732  const int ldc( numeric_cast<int>( C.spacing() ) );
733 
734  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
735  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
736  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
737  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
738  }
740 #endif
741  //**********************************************************************************************
742 
743  //**BLAS-based assignment to dense matrices (double precision)**********************************
744 #if BLAZE_BLAS_MODE
745 
758  template< typename MT3 // Type of the left-hand side target matrix
759  , typename MT4 // Type of the left-hand side matrix operand
760  , typename MT5 > // Type of the right-hand side matrix operand
761  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
762  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
763  {
764  using boost::numeric_cast;
765 
766  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
767  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
768  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
769 
770  const int M ( numeric_cast<int>( A.rows() ) );
771  const int N ( numeric_cast<int>( B.columns() ) );
772  const int K ( numeric_cast<int>( A.columns() ) );
773  const int lda( numeric_cast<int>( A.spacing() ) );
774  const int ldb( numeric_cast<int>( B.spacing() ) );
775  const int ldc( numeric_cast<int>( C.spacing() ) );
776 
777  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
778  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
779  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
780  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
781  }
783 #endif
784  //**********************************************************************************************
785 
786  //**BLAS-based assignment to dense matrices (single precision complex)**************************
787 #if BLAZE_BLAS_MODE
788 
801  template< typename MT3 // Type of the left-hand side target matrix
802  , typename MT4 // Type of the left-hand side matrix operand
803  , typename MT5 > // Type of the right-hand side matrix operand
804  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
805  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
806  {
807  using boost::numeric_cast;
808 
809  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
810  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
811  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
812  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
813  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
814  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
815 
816  const int M ( numeric_cast<int>( A.rows() ) );
817  const int N ( numeric_cast<int>( B.columns() ) );
818  const int K ( numeric_cast<int>( A.columns() ) );
819  const int lda( numeric_cast<int>( A.spacing() ) );
820  const int ldb( numeric_cast<int>( B.spacing() ) );
821  const int ldc( numeric_cast<int>( C.spacing() ) );
822  const complex<float> alpha( 1.0F, 0.0F );
823  const complex<float> beta ( 0.0F, 0.0F );
824 
825  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
826  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
827  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
828  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
829  }
831 #endif
832  //**********************************************************************************************
833 
834  //**BLAS-based assignment to dense matrices (double precision complex)**************************
835 #if BLAZE_BLAS_MODE
836 
849  template< typename MT3 // Type of the left-hand side target matrix
850  , typename MT4 // Type of the left-hand side matrix operand
851  , typename MT5 > // Type of the right-hand side matrix operand
852  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
853  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
854  {
855  using boost::numeric_cast;
856 
857  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
858  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
859  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
860  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
861  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
862  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
863 
864  const int M ( numeric_cast<int>( A.rows() ) );
865  const int N ( numeric_cast<int>( B.columns() ) );
866  const int K ( numeric_cast<int>( A.columns() ) );
867  const int lda( numeric_cast<int>( A.spacing() ) );
868  const int ldb( numeric_cast<int>( B.spacing() ) );
869  const int ldc( numeric_cast<int>( C.spacing() ) );
870  const complex<double> alpha( 1.0, 0.0 );
871  const complex<double> beta ( 0.0, 0.0 );
872 
873  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
874  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
875  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
876  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
877  }
879 #endif
880  //**********************************************************************************************
881 
882  //**Assignment to sparse matrices***************************************************************
894  template< typename MT // Type of the target sparse matrix
895  , bool SO > // Storage order of the target sparse matrix
896  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
897  {
899 
900  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
901 
907  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
908 
909  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
910  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
911 
912  const TmpType tmp( rhs );
913  assign( ~lhs, tmp );
914  }
916  //**********************************************************************************************
917 
918  //**Addition assignment to dense matrices*******************************************************
931  template< typename MT // Type of the target dense matrix
932  , bool SO > // Storage order of the target dense matrix
933  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
934  {
936 
937  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
938  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
939 
940  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
941  return;
942  }
943 
944  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
945  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
946 
947  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
948  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
949  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
950  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
951  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
952  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
953 
954  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
955  DMatTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
956  else
957  DMatTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
958  }
960  //**********************************************************************************************
961 
962  //**Default addition assignment to dense matrices***********************************************
976  template< typename MT3 // Type of the left-hand side target matrix
977  , typename MT4 // Type of the left-hand side matrix operand
978  , typename MT5 > // Type of the right-hand side matrix operand
979  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
980  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
981  {
982  const size_t M( A.rows() );
983  const size_t N( B.columns() );
984  const size_t K( A.columns() );
985 
986  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
987  const size_t end( N & size_t(-2) );
988 
989  for( size_t i=0UL; i<M; ++i ) {
990  for( size_t k=0UL; k<K; ++k ) {
991  for( size_t j=0UL; j<end; j+=2UL ) {
992  C(i,j ) += A(i,k) * B(k,j );
993  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
994  }
995  if( end < N ) {
996  C(i,end) += A(i,k) * B(k,end);
997  }
998  }
999  }
1000  }
1002  //**********************************************************************************************
1003 
1004  //**Vectorized default addition assignment to row-major dense matrices**************************
1018  template< typename MT3 // Type of the left-hand side target matrix
1019  , typename MT4 // Type of the left-hand side matrix operand
1020  , typename MT5 > // Type of the right-hand side matrix operand
1021  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1022  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1023  {
1024  typedef IntrinsicTrait<ElementType> IT;
1025 
1026  const size_t M( A.rows() );
1027  const size_t N( B.columns() );
1028  const size_t K( A.columns() );
1029 
1030  size_t i( 0UL );
1031 
1032  for( ; (i+2UL) <= M; i+=2UL ) {
1033  size_t j( 0UL );
1034  for( ; (j+4UL) <= N; j+=4UL ) {
1035  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1036  for( size_t k=0UL; k<K; k+=IT::size ) {
1037  const IntrinsicType a1( A.get(i ,k) );
1038  const IntrinsicType a2( A.get(i+1UL,k) );
1039  const IntrinsicType b1( B.get(k,j ) );
1040  const IntrinsicType b2( B.get(k,j+1UL) );
1041  const IntrinsicType b3( B.get(k,j+2UL) );
1042  const IntrinsicType b4( B.get(k,j+3UL) );
1043  xmm1 = xmm1 + a1 * b1;
1044  xmm2 = xmm2 + a1 * b2;
1045  xmm3 = xmm3 + a1 * b3;
1046  xmm4 = xmm4 + a1 * b4;
1047  xmm5 = xmm5 + a2 * b1;
1048  xmm6 = xmm6 + a2 * b2;
1049  xmm7 = xmm7 + a2 * b3;
1050  xmm8 = xmm8 + a2 * b4;
1051  }
1052  (~C)(i ,j ) += sum( xmm1 );
1053  (~C)(i ,j+1UL) += sum( xmm2 );
1054  (~C)(i ,j+2UL) += sum( xmm3 );
1055  (~C)(i ,j+3UL) += sum( xmm4 );
1056  (~C)(i+1UL,j ) += sum( xmm5 );
1057  (~C)(i+1UL,j+1UL) += sum( xmm6 );
1058  (~C)(i+1UL,j+2UL) += sum( xmm7 );
1059  (~C)(i+1UL,j+3UL) += sum( xmm8 );
1060  }
1061  for( ; (j+2UL) <= N; j+=2UL ) {
1062  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1063  for( size_t k=0UL; k<K; k+=IT::size ) {
1064  const IntrinsicType a1( A.get(i ,k) );
1065  const IntrinsicType a2( A.get(i+1UL,k) );
1066  const IntrinsicType b1( B.get(k,j ) );
1067  const IntrinsicType b2( B.get(k,j+1UL) );
1068  xmm1 = xmm1 + a1 * b1;
1069  xmm2 = xmm2 + a1 * b2;
1070  xmm3 = xmm3 + a2 * b1;
1071  xmm4 = xmm4 + a2 * b2;
1072  }
1073  (~C)(i ,j ) += sum( xmm1 );
1074  (~C)(i ,j+1UL) += sum( xmm2 );
1075  (~C)(i+1UL,j ) += sum( xmm3 );
1076  (~C)(i+1UL,j+1UL) += sum( xmm4 );
1077  }
1078  if( j < N ) {
1079  IntrinsicType xmm1, xmm2;
1080  for( size_t k=0UL; k<K; k+=IT::size ) {
1081  const IntrinsicType b1( B.get(k,j) );
1082  xmm1 = xmm1 + A.get(i ,k) * b1;
1083  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1084  }
1085  (~C)(i ,j) += sum( xmm1 );
1086  (~C)(i+1UL,j) += sum( xmm2 );
1087  }
1088  }
1089  if( i < M ) {
1090  size_t j( 0UL );
1091  for( ; (j+4UL) <= N; j+=4UL ) {
1092  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1093  for( size_t k=0UL; k<K; k+=IT::size ) {
1094  const IntrinsicType a1( A.get(i,k) );
1095  xmm1 = xmm1 + a1 * B.get(k,j );
1096  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1097  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
1098  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
1099  }
1100  (~C)(i,j ) += sum( xmm1 );
1101  (~C)(i,j+1UL) += sum( xmm2 );
1102  (~C)(i,j+2UL) += sum( xmm3 );
1103  (~C)(i,j+3UL) += sum( xmm4 );
1104  }
1105  for( ; (j+2UL) <= N; j+=2UL ) {
1106  IntrinsicType xmm1, xmm2;
1107  for( size_t k=0UL; k<K; k+=IT::size ) {
1108  const IntrinsicType a1( A.get(i,k) );
1109  xmm1 = xmm1 + a1 * B.get(k,j );
1110  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1111  }
1112  (~C)(i,j ) += sum( xmm1 );
1113  (~C)(i,j+1UL) += sum( xmm2 );
1114  }
1115  if( j < N ) {
1116  IntrinsicType xmm1, xmm2;
1117  for( size_t k=0UL; k<K; k+=IT::size ) {
1118  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1119  }
1120  (~C)(i,j) += sum( xmm1 );
1121  }
1122  }
1123  }
1125  //**********************************************************************************************
1126 
1127  //**Vectorized default addition assignment to column-major dense matrices***********************
1141  template< typename MT3 // Type of the left-hand side target matrix
1142  , typename MT4 // Type of the left-hand side matrix operand
1143  , typename MT5 > // Type of the right-hand side matrix operand
1144  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1145  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1146  {
1147  typedef IntrinsicTrait<ElementType> IT;
1148 
1149  const size_t M( A.rows() );
1150  const size_t N( B.columns() );
1151  const size_t K( A.columns() );
1152 
1153  size_t i( 0UL );
1154 
1155  for( ; (i+4UL) <= M; i+=4UL ) {
1156  size_t j( 0UL );
1157  for( ; (j+2UL) <= N; j+=2UL ) {
1158  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1159  for( size_t k=0UL; k<K; k+=IT::size ) {
1160  const IntrinsicType a1( A.get(i ,k) );
1161  const IntrinsicType a2( A.get(i+1UL,k) );
1162  const IntrinsicType a3( A.get(i+2UL,k) );
1163  const IntrinsicType a4( A.get(i+3UL,k) );
1164  const IntrinsicType b1( B.get(k,j ) );
1165  const IntrinsicType b2( B.get(k,j+1UL) );
1166  xmm1 = xmm1 + a1 * b1;
1167  xmm2 = xmm2 + a1 * b2;
1168  xmm3 = xmm3 + a2 * b1;
1169  xmm4 = xmm4 + a2 * b2;
1170  xmm5 = xmm5 + a3 * b1;
1171  xmm6 = xmm6 + a3 * b2;
1172  xmm7 = xmm7 + a4 * b1;
1173  xmm8 = xmm8 + a4 * b2;
1174  }
1175  (~C)(i ,j ) += sum( xmm1 );
1176  (~C)(i ,j+1UL) += sum( xmm2 );
1177  (~C)(i+1UL,j ) += sum( xmm3 );
1178  (~C)(i+1UL,j+1UL) += sum( xmm4 );
1179  (~C)(i+2UL,j ) += sum( xmm5 );
1180  (~C)(i+2UL,j+1UL) += sum( xmm6 );
1181  (~C)(i+3UL,j ) += sum( xmm7 );
1182  (~C)(i+3UL,j+1UL) += sum( xmm8 );
1183  }
1184  if( j < N ) {
1185  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1186  for( size_t k=0UL; k<K; k+=IT::size ) {
1187  const IntrinsicType b1( B.get(k,j) );
1188  xmm1 = xmm1 + A.get(i ,k) * b1;
1189  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1190  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
1191  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
1192  }
1193  (~C)(i ,j) += sum( xmm1 );
1194  (~C)(i+1UL,j) += sum( xmm2 );
1195  (~C)(i+2UL,j) += sum( xmm3 );
1196  (~C)(i+3UL,j) += sum( xmm4 );
1197  }
1198  }
1199  for( ; (i+2UL) <= M; i+=2UL ) {
1200  size_t j( 0UL );
1201  for( ; (j+2UL) <= N; j+=2UL ) {
1202  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1203  for( size_t k=0UL; k<K; k+=IT::size ) {
1204  const IntrinsicType a1( A.get(i ,k) );
1205  const IntrinsicType a2( A.get(i+1UL,k) );
1206  const IntrinsicType b1( B.get(k,j ) );
1207  const IntrinsicType b2( B.get(k,j+1UL) );
1208  xmm1 = xmm1 + a1 * b1;
1209  xmm2 = xmm2 + a1 * b2;
1210  xmm3 = xmm3 + a2 * b1;
1211  xmm4 = xmm4 + a2 * b2;
1212  }
1213  (~C)(i ,j ) += sum( xmm1 );
1214  (~C)(i ,j+1UL) += sum( xmm2 );
1215  (~C)(i+1UL,j ) += sum( xmm3 );
1216  (~C)(i+1UL,j+1UL) += sum( xmm4 );
1217  }
1218  if( j < N ) {
1219  IntrinsicType xmm1, xmm2;
1220  for( size_t k=0UL; k<K; k+=IT::size ) {
1221  const IntrinsicType b1( B.get(k,j) );
1222  xmm1 = xmm1 + A.get(i ,k) * b1;
1223  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1224  }
1225  (~C)(i ,j) += sum( xmm1 );
1226  (~C)(i+1UL,j) += sum( xmm2 );
1227  }
1228  }
1229  if( i < M ) {
1230  size_t j( 0UL );
1231  for( ; (j+2UL) <= N; j+=2UL ) {
1232  IntrinsicType xmm1, xmm2;
1233  for( size_t k=0UL; k<K; k+=IT::size ) {
1234  const IntrinsicType a1( A.get(i,k) );
1235  xmm1 = xmm1 + a1 * B.get(k,j );
1236  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1237  }
1238  (~C)(i,j ) += sum( xmm1 );
1239  (~C)(i,j+1UL) += sum( xmm2 );
1240  }
1241  if( j < N ) {
1242  IntrinsicType xmm1, xmm2;
1243  for( size_t k=0UL; k<K; k+=IT::size ) {
1244  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1245  }
1246  (~C)(i,j) += sum( xmm1 );
1247  }
1248  }
1249  }
1251  //**********************************************************************************************
1252 
1253  //**Default addition assignment to dense matrices***********************************************
1267  template< typename MT3 // Type of the left-hand side target matrix
1268  , typename MT4 // Type of the left-hand side matrix operand
1269  , typename MT5 > // Type of the right-hand side matrix operand
1270  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1271  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1272  {
1273  selectDefaultAddAssignKernel( C, A, B );
1274  }
1276  //**********************************************************************************************
1277 
1278  //**BLAS-based addition assignment to dense matrices (single precision)*************************
1279 #if BLAZE_BLAS_MODE
1280 
1293  template< typename MT3 // Type of the left-hand side target matrix
1294  , typename MT4 // Type of the left-hand side matrix operand
1295  , typename MT5 > // Type of the right-hand side matrix operand
1296  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1297  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1298  {
1299  using boost::numeric_cast;
1300 
1301  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
1302  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
1303  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
1304 
1305  const int M ( numeric_cast<int>( A.rows() ) );
1306  const int N ( numeric_cast<int>( B.columns() ) );
1307  const int K ( numeric_cast<int>( A.columns() ) );
1308  const int lda( numeric_cast<int>( A.spacing() ) );
1309  const int ldb( numeric_cast<int>( B.spacing() ) );
1310  const int ldc( numeric_cast<int>( C.spacing() ) );
1311 
1312  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1313  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1314  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1315  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1316  }
1318 #endif
1319  //**********************************************************************************************
1320 
1321  //**BLAS-based addition assignment to dense matrices (double precision)*************************
1322 #if BLAZE_BLAS_MODE
1323 
1336  template< typename MT3 // Type of the left-hand side target matrix
1337  , typename MT4 // Type of the left-hand side matrix operand
1338  , typename MT5 > // Type of the right-hand side matrix operand
1339  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1340  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1341  {
1342  using boost::numeric_cast;
1343 
1344  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
1345  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
1346  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
1347 
1348  const int M ( numeric_cast<int>( A.rows() ) );
1349  const int N ( numeric_cast<int>( B.columns() ) );
1350  const int K ( numeric_cast<int>( A.columns() ) );
1351  const int lda( numeric_cast<int>( A.spacing() ) );
1352  const int ldb( numeric_cast<int>( B.spacing() ) );
1353  const int ldc( numeric_cast<int>( C.spacing() ) );
1354 
1355  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1356  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1357  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1358  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1359  }
1361 #endif
1362  //**********************************************************************************************
1363 
1364  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
1365 #if BLAZE_BLAS_MODE
1366 
1379  template< typename MT3 // Type of the left-hand side target matrix
1380  , typename MT4 // Type of the left-hand side matrix operand
1381  , typename MT5 > // Type of the right-hand side matrix operand
1382  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1383  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1384  {
1385  using boost::numeric_cast;
1386 
1387  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1388  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1389  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1390  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1391  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1392  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1393 
1394  const int M ( numeric_cast<int>( A.rows() ) );
1395  const int N ( numeric_cast<int>( B.columns() ) );
1396  const int K ( numeric_cast<int>( A.columns() ) );
1397  const int lda( numeric_cast<int>( A.spacing() ) );
1398  const int ldb( numeric_cast<int>( B.spacing() ) );
1399  const int ldc( numeric_cast<int>( C.spacing() ) );
1400  const complex<float> alpha( 1.0F, 0.0F );
1401  const complex<float> beta ( 1.0F, 0.0F );
1402 
1403  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1404  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1405  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1406  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1407  }
1409 #endif
1410  //**********************************************************************************************
1411 
1412  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
1413 #if BLAZE_BLAS_MODE
1414 
1427  template< typename MT3 // Type of the left-hand side target matrix
1428  , typename MT4 // Type of the left-hand side matrix operand
1429  , typename MT5 > // Type of the right-hand side matrix operand
1430  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1431  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1432  {
1433  using boost::numeric_cast;
1434 
1435  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1436  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1437  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1438  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1439  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1440  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1441 
1442  const int M ( numeric_cast<int>( A.rows() ) );
1443  const int N ( numeric_cast<int>( B.columns() ) );
1444  const int K ( numeric_cast<int>( A.columns() ) );
1445  const int lda( numeric_cast<int>( A.spacing() ) );
1446  const int ldb( numeric_cast<int>( B.spacing() ) );
1447  const int ldc( numeric_cast<int>( C.spacing() ) );
1448  const complex<double> alpha( 1.0, 0.0 );
1449  const complex<double> beta ( 1.0, 0.0 );
1450 
1451  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1452  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1453  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1454  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1455  }
1457 #endif
1458  //**********************************************************************************************
1459 
1460  //**Addition assignment to sparse matrices******************************************************
1461  // No special implementation for the addition assignment to sparse matrices.
1462  //**********************************************************************************************
1463 
1464  //**Subtraction assignment to dense matrices****************************************************
1477  template< typename MT // Type of the target dense matrix
1478  , bool SO > // Storage order of the target dense matrix
1479  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
1480  {
1482 
1483  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1484  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1485 
1486  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1487  return;
1488  }
1489 
1490  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1491  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1492 
1493  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1494  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1495  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1496  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1497  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1498  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1499 
1500  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
1501  DMatTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1502  else
1503  DMatTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
1504  }
1506  //**********************************************************************************************
1507 
1508  //**Default subtraction assignment to dense matrices********************************************
1522  template< typename MT3 // Type of the left-hand side target matrix
1523  , typename MT4 // Type of the left-hand side matrix operand
1524  , typename MT5 > // Type of the right-hand side matrix operand
1525  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1526  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1527  {
1528  const size_t M( A.rows() );
1529  const size_t N( B.columns() );
1530  const size_t K( A.columns() );
1531 
1532  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1533  const size_t end( N & size_t(-2) );
1534 
1535  for( size_t i=0UL; i<M; ++i ) {
1536  for( size_t k=0UL; k<K; ++k ) {
1537  for( size_t j=0UL; j<end; j+=2UL ) {
1538  C(i,j ) -= A(i,k) * B(k,j );
1539  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
1540  }
1541  if( end < N ) {
1542  C(i,end) -= A(i,k) * B(k,end);
1543  }
1544  }
1545  }
1546  }
1548  //**********************************************************************************************
1549 
1550  //**Vectorized default subtraction assignment to row-major dense matrices***********************
1564  template< typename MT3 // Type of the left-hand side target matrix
1565  , typename MT4 // Type of the left-hand side matrix operand
1566  , typename MT5 > // Type of the right-hand side matrix operand
1567  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1568  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1569  {
1570  typedef IntrinsicTrait<ElementType> IT;
1571 
1572  const size_t M( A.rows() );
1573  const size_t N( B.columns() );
1574  const size_t K( A.columns() );
1575 
1576  size_t i( 0UL );
1577 
1578  for( ; (i+2UL) <= M; i+=2UL ) {
1579  size_t j( 0UL );
1580  for( ; (j+4UL) <= N; j+=4UL ) {
1581  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1582  for( size_t k=0UL; k<K; k+=IT::size ) {
1583  const IntrinsicType a1( A.get(i ,k) );
1584  const IntrinsicType a2( A.get(i+1UL,k) );
1585  const IntrinsicType b1( B.get(k,j ) );
1586  const IntrinsicType b2( B.get(k,j+1UL) );
1587  const IntrinsicType b3( B.get(k,j+2UL) );
1588  const IntrinsicType b4( B.get(k,j+3UL) );
1589  xmm1 = xmm1 + a1 * b1;
1590  xmm2 = xmm2 + a1 * b2;
1591  xmm3 = xmm3 + a1 * b3;
1592  xmm4 = xmm4 + a1 * b4;
1593  xmm5 = xmm5 + a2 * b1;
1594  xmm6 = xmm6 + a2 * b2;
1595  xmm7 = xmm7 + a2 * b3;
1596  xmm8 = xmm8 + a2 * b4;
1597  }
1598  (~C)(i ,j ) -= sum( xmm1 );
1599  (~C)(i ,j+1UL) -= sum( xmm2 );
1600  (~C)(i ,j+2UL) -= sum( xmm3 );
1601  (~C)(i ,j+3UL) -= sum( xmm4 );
1602  (~C)(i+1UL,j ) -= sum( xmm5 );
1603  (~C)(i+1UL,j+1UL) -= sum( xmm6 );
1604  (~C)(i+1UL,j+2UL) -= sum( xmm7 );
1605  (~C)(i+1UL,j+3UL) -= sum( xmm8 );
1606  }
1607  for( ; (j+2UL) <= N; j+=2UL ) {
1608  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1609  for( size_t k=0UL; k<K; k+=IT::size ) {
1610  const IntrinsicType a1( A.get(i ,k) );
1611  const IntrinsicType a2( A.get(i+1UL,k) );
1612  const IntrinsicType b1( B.get(k,j ) );
1613  const IntrinsicType b2( B.get(k,j+1UL) );
1614  xmm1 = xmm1 + a1 * b1;
1615  xmm2 = xmm2 + a1 * b2;
1616  xmm3 = xmm3 + a2 * b1;
1617  xmm4 = xmm4 + a2 * b2;
1618  }
1619  (~C)(i ,j ) -= sum( xmm1 );
1620  (~C)(i ,j+1UL) -= sum( xmm2 );
1621  (~C)(i+1UL,j ) -= sum( xmm3 );
1622  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
1623  }
1624  if( j < N ) {
1625  IntrinsicType xmm1, xmm2;
1626  for( size_t k=0UL; k<K; k+=IT::size ) {
1627  const IntrinsicType b1( B.get(k,j) );
1628  xmm1 = xmm1 + A.get(i ,k) * b1;
1629  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1630  }
1631  (~C)(i ,j) -= sum( xmm1 );
1632  (~C)(i+1UL,j) -= sum( xmm2 );
1633  }
1634  }
1635  if( i < M ) {
1636  size_t j( 0UL );
1637  for( ; (j+4UL) <= N; j+=4UL ) {
1638  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1639  for( size_t k=0UL; k<K; k+=IT::size ) {
1640  const IntrinsicType a1( A.get(i,k) );
1641  xmm1 = xmm1 + a1 * B.get(k,j );
1642  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1643  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
1644  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
1645  }
1646  (~C)(i,j ) -= sum( xmm1 );
1647  (~C)(i,j+1UL) -= sum( xmm2 );
1648  (~C)(i,j+2UL) -= sum( xmm3 );
1649  (~C)(i,j+3UL) -= sum( xmm4 );
1650  }
1651  for( ; (j+2UL) <= N; j+=2UL ) {
1652  IntrinsicType xmm1, xmm2;
1653  for( size_t k=0UL; k<K; k+=IT::size ) {
1654  const IntrinsicType a1( A.get(i,k) );
1655  xmm1 = xmm1 + a1 * B.get(k,j );
1656  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1657  }
1658  (~C)(i,j ) -= sum( xmm1 );
1659  (~C)(i,j+1UL) -= sum( xmm2 );
1660  }
1661  if( j < N ) {
1662  IntrinsicType xmm1, xmm2;
1663  for( size_t k=0UL; k<K; k+=IT::size ) {
1664  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1665  }
1666  (~C)(i,j) -= sum( xmm1 );
1667  }
1668  }
1669  }
1671  //**********************************************************************************************
1672 
1673  //**Vectorized default subtraction assignment to column-major dense matrices********************
1687  template< typename MT3 // Type of the left-hand side target matrix
1688  , typename MT4 // Type of the left-hand side matrix operand
1689  , typename MT5 > // Type of the right-hand side matrix operand
1690  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1691  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1692  {
1693  typedef IntrinsicTrait<ElementType> IT;
1694 
1695  const size_t M( A.rows() );
1696  const size_t N( B.columns() );
1697  const size_t K( A.columns() );
1698 
1699  size_t i( 0UL );
1700 
1701  for( ; (i+4UL) <= M; i+=4UL ) {
1702  size_t j( 0UL );
1703  for( ; (j+2UL) <= N; j+=2UL ) {
1704  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1705  for( size_t k=0UL; k<K; k+=IT::size ) {
1706  const IntrinsicType a1( A.get(i ,k) );
1707  const IntrinsicType a2( A.get(i+1UL,k) );
1708  const IntrinsicType a3( A.get(i+2UL,k) );
1709  const IntrinsicType a4( A.get(i+3UL,k) );
1710  const IntrinsicType b1( B.get(k,j ) );
1711  const IntrinsicType b2( B.get(k,j+1UL) );
1712  xmm1 = xmm1 + a1 * b1;
1713  xmm2 = xmm2 + a1 * b2;
1714  xmm3 = xmm3 + a2 * b1;
1715  xmm4 = xmm4 + a2 * b2;
1716  xmm5 = xmm5 + a3 * b1;
1717  xmm6 = xmm6 + a3 * b2;
1718  xmm7 = xmm7 + a4 * b1;
1719  xmm8 = xmm8 + a4 * b2;
1720  }
1721  (~C)(i ,j ) -= sum( xmm1 );
1722  (~C)(i ,j+1UL) -= sum( xmm2 );
1723  (~C)(i+1UL,j ) -= sum( xmm3 );
1724  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
1725  (~C)(i+2UL,j ) -= sum( xmm5 );
1726  (~C)(i+2UL,j+1UL) -= sum( xmm6 );
1727  (~C)(i+3UL,j ) -= sum( xmm7 );
1728  (~C)(i+3UL,j+1UL) -= sum( xmm8 );
1729  }
1730  if( j < N ) {
1731  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1732  for( size_t k=0UL; k<K; k+=IT::size ) {
1733  const IntrinsicType b1( B.get(k,j) );
1734  xmm1 = xmm1 + A.get(i ,k) * b1;
1735  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1736  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
1737  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
1738  }
1739  (~C)(i ,j) -= sum( xmm1 );
1740  (~C)(i+1UL,j) -= sum( xmm2 );
1741  (~C)(i+2UL,j) -= sum( xmm3 );
1742  (~C)(i+3UL,j) -= sum( xmm4 );
1743  }
1744  }
1745  for( ; (i+2UL) <= M; i+=2UL ) {
1746  size_t j( 0UL );
1747  for( ; (j+2UL) <= N; j+=2UL ) {
1748  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1749  for( size_t k=0UL; k<K; k+=IT::size ) {
1750  const IntrinsicType a1( A.get(i ,k) );
1751  const IntrinsicType a2( A.get(i+1UL,k) );
1752  const IntrinsicType b1( B.get(k,j ) );
1753  const IntrinsicType b2( B.get(k,j+1UL) );
1754  xmm1 = xmm1 + a1 * b1;
1755  xmm2 = xmm2 + a1 * b2;
1756  xmm3 = xmm3 + a2 * b1;
1757  xmm4 = xmm4 + a2 * b2;
1758  }
1759  (~C)(i ,j ) -= sum( xmm1 );
1760  (~C)(i ,j+1UL) -= sum( xmm2 );
1761  (~C)(i+1UL,j ) -= sum( xmm3 );
1762  (~C)(i+1UL,j+1UL) -= sum( xmm4 );
1763  }
1764  if( j < N ) {
1765  IntrinsicType xmm1, xmm2;
1766  for( size_t k=0UL; k<K; k+=IT::size ) {
1767  const IntrinsicType b1( B.get(k,j) );
1768  xmm1 = xmm1 + A.get(i ,k) * b1;
1769  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
1770  }
1771  (~C)(i ,j) -= sum( xmm1 );
1772  (~C)(i+1UL,j) -= sum( xmm2 );
1773  }
1774  }
1775  if( i < M ) {
1776  size_t j( 0UL );
1777  for( ; (j+2UL) <= N; j+=2UL ) {
1778  IntrinsicType xmm1, xmm2;
1779  for( size_t k=0UL; k<K; k+=IT::size ) {
1780  const IntrinsicType a1( A.get(i,k) );
1781  xmm1 = xmm1 + a1 * B.get(k,j );
1782  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
1783  }
1784  (~C)(i,j ) -= sum( xmm1 );
1785  (~C)(i,j+1UL) -= sum( xmm2 );
1786  }
1787  if( j < N ) {
1788  IntrinsicType xmm1, xmm2;
1789  for( size_t k=0UL; k<K; k+=IT::size ) {
1790  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
1791  }
1792  (~C)(i,j) -= sum( xmm1 );
1793  }
1794  }
1795  }
1797  //**********************************************************************************************
1798 
1799  //**Default subtraction assignment to dense matrices********************************************
1813  template< typename MT3 // Type of the left-hand side target matrix
1814  , typename MT4 // Type of the left-hand side matrix operand
1815  , typename MT5 > // Type of the right-hand side matrix operand
1816  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1817  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1818  {
1819  selectDefaultSubAssignKernel( C, A, B );
1820  }
1822  //**********************************************************************************************
1823 
1824  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
1825 #if BLAZE_BLAS_MODE
1826 
1839  template< typename MT3 // Type of the left-hand side target matrix
1840  , typename MT4 // Type of the left-hand side matrix operand
1841  , typename MT5 > // Type of the right-hand side matrix operand
1842  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1843  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1844  {
1845  using boost::numeric_cast;
1846 
1847  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
1848  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
1849  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
1850 
1851  const int M ( numeric_cast<int>( A.rows() ) );
1852  const int N ( numeric_cast<int>( B.columns() ) );
1853  const int K ( numeric_cast<int>( A.columns() ) );
1854  const int lda( numeric_cast<int>( A.spacing() ) );
1855  const int ldb( numeric_cast<int>( B.spacing() ) );
1856  const int ldc( numeric_cast<int>( C.spacing() ) );
1857 
1858  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1859  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1860  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1861  M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1862  }
1864 #endif
1865  //**********************************************************************************************
1866 
1867  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
1868 #if BLAZE_BLAS_MODE
1869 
1882  template< typename MT3 // Type of the left-hand side target matrix
1883  , typename MT4 // Type of the left-hand side matrix operand
1884  , typename MT5 > // Type of the right-hand side matrix operand
1885  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1886  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1887  {
1888  using boost::numeric_cast;
1889 
1890  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
1891  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
1892  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
1893 
1894  const int M ( numeric_cast<int>( A.rows() ) );
1895  const int N ( numeric_cast<int>( B.columns() ) );
1896  const int K ( numeric_cast<int>( A.columns() ) );
1897  const int lda( numeric_cast<int>( A.spacing() ) );
1898  const int ldb( numeric_cast<int>( B.spacing() ) );
1899  const int ldc( numeric_cast<int>( C.spacing() ) );
1900 
1901  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1902  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1903  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1904  M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1905  }
1907 #endif
1908  //**********************************************************************************************
1909 
1910  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
1911 #if BLAZE_BLAS_MODE
1912 
1925  template< typename MT3 // Type of the left-hand side target matrix
1926  , typename MT4 // Type of the left-hand side matrix operand
1927  , typename MT5 > // Type of the right-hand side matrix operand
1928  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1929  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1930  {
1931  using boost::numeric_cast;
1932 
1933  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1934  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1935  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1936  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1937  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1938  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1939 
1940  const int M ( numeric_cast<int>( A.rows() ) );
1941  const int N ( numeric_cast<int>( B.columns() ) );
1942  const int K ( numeric_cast<int>( A.columns() ) );
1943  const int lda( numeric_cast<int>( A.spacing() ) );
1944  const int ldb( numeric_cast<int>( B.spacing() ) );
1945  const int ldc( numeric_cast<int>( C.spacing() ) );
1946  const complex<float> alpha( -1.0F, 0.0F );
1947  const complex<float> beta ( 1.0F, 0.0F );
1948 
1949  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1950  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1951  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1952  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1953  }
1955 #endif
1956  //**********************************************************************************************
1957 
1958  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
1959 #if BLAZE_BLAS_MODE
1960 
/*!\brief BLAS-based subtraction assignment of a dense matrix-transpose dense matrix multiplication
//        for double precision complex matrices (\f$ C-=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// The product is evaluated by \c cblas_zgemm with alpha = (-1,0) and beta = (1,0), i.e. it is
// subtracted from the current contents of \a C. The compile time constraints require all three
// element types to be complex types with a \c double value type.
*/
1973  template< typename MT3 // Type of the left-hand side target matrix
1974  , typename MT4 // Type of the left-hand side matrix operand
1975  , typename MT5 > // Type of the right-hand side matrix operand
1976  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1977  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1978  {
1979  using boost::numeric_cast;
1980 
1981  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1982  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1983  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1984  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1985  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1986  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1987 
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs the size_t -> int
// conversion for each of them.
1988  const int M ( numeric_cast<int>( A.rows() ) );
1989  const int N ( numeric_cast<int>( B.columns() ) );
1990  const int K ( numeric_cast<int>( A.columns() ) );
1991  const int lda( numeric_cast<int>( A.spacing() ) );
1992  const int ldb( numeric_cast<int>( B.spacing() ) );
1993  const int ldc( numeric_cast<int>( C.spacing() ) );
// alpha = -1 and beta = 1 turn the gemm call into C = C - A*B.
1994  const complex<double> alpha( -1.0, 0.0 );
1995  const complex<double> beta ( 1.0, 0.0 );
1996 
// The storage order of the target selects the CBLAS layout. For a row-major target A is passed
// untransposed and B transposed; for a column-major target the two flags are swapped.
1997  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1998  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1999  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2000  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2001  }
2003 #endif
2004  //**********************************************************************************************
2005 
2006  //**Subtraction assignment to sparse matrices***************************************************
2007  // No special implementation for the subtraction assignment to sparse matrices.
2008  //**********************************************************************************************
2009 
2010  //**Multiplication assignment to dense matrices*************************************************
2011  // No special implementation for the multiplication assignment to dense matrices.
2012  //**********************************************************************************************
2013 
2014  //**Multiplication assignment to sparse matrices************************************************
2015  // No special implementation for the multiplication assignment to sparse matrices.
2016  //**********************************************************************************************
2017 
2018  //**Compile time checks*************************************************************************
2025  //**********************************************************************************************
2026 };
2027 //*************************************************************************************************
2028 
2029 
2030 
2031 
2032 //=================================================================================================
2033 //
2034 // DMATSCALARMULTEXPR SPECIALIZATION
2035 //
2036 //=================================================================================================
2037 
2038 //*************************************************************************************************
2046 template< typename MT1 // Type of the left-hand side dense matrix
2047  , typename MT2 // Type of the right-hand side dense matrix
2048  , typename ST > // Type of the right-hand side scalar value
2049 class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >
2050  : public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
2051  , private Expression
2052  , private Computation
2053 {
2054  private:
2055  //**Type definitions****************************************************************************
2056  typedef DMatTDMatMultExpr<MT1,MT2> MMM;  //!< Type of the dense matrix/transpose dense matrix multiplication expression.
2057  typedef typename MMM::ResultType RES;  //!< Result type of the multiplication expression.
2058  typedef typename MT1::ResultType RT1;  //!< Result type of the left-hand side dense matrix expression.
2059  typedef typename MT2::ResultType RT2;  //!< Result type of the right-hand side dense matrix expression.
2060  typedef typename MT1::CompositeType CT1;  //!< Composite type of the left-hand side dense matrix expression.
2061  typedef typename MT2::CompositeType CT2;  //!< Composite type of the right-hand side dense matrix expression.
2062  //**********************************************************************************************
2063 
2064  //**********************************************************************************************
2066 
/*!\brief Compile time predicate for the selection of the single precision BLAS kernels.
//
// \a value is nonzero if the IsFloat trait holds for the element types of \a T1, \a T2 and \a T3
// and the scalar type \a T4 is not a complex type.
*/
2069  template< typename T1, typename T2, typename T3, typename T4 >
2070  struct UseSinglePrecisionKernel {
2071  enum { value = IsFloat<typename T1::ElementType>::value &&
2072  IsFloat<typename T2::ElementType>::value &&
2073  IsFloat<typename T3::ElementType>::value &&
2074  !IsComplex<T4>::value };
2075  };
2076  //**********************************************************************************************
2077 
2078  //**********************************************************************************************
2080 
/*!\brief Compile time predicate for the selection of the double precision BLAS kernels.
//
// \a value is nonzero if the IsDouble trait holds for the element types of \a T1, \a T2 and \a T3
// and the scalar type \a T4 is not a complex type.
*/
2083  template< typename T1, typename T2, typename T3, typename T4 >
2084  struct UseDoublePrecisionKernel {
2085  enum { value = IsDouble<typename T1::ElementType>::value &&
2086  IsDouble<typename T2::ElementType>::value &&
2087  IsDouble<typename T3::ElementType>::value &&
2088  !IsComplex<T4>::value };
2089  };
2090  //**********************************************************************************************
2091 
2092  //**********************************************************************************************
2094 
/*!\brief Compile time predicate for the selection of the single precision complex BLAS kernels.
//
// \a value is nonzero if the element types of \a T1, \a T2 and \a T3 are all exactly
// \c complex<float>. The scalar type is not part of this predicate.
*/
2097  template< typename T1, typename T2, typename T3 >
2098  struct UseSinglePrecisionComplexKernel {
2099  typedef complex<float> Type;
2100  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2101  IsSame<typename T2::ElementType,Type>::value &&
2102  IsSame<typename T3::ElementType,Type>::value };
2103  };
2104  //**********************************************************************************************
2105 
2106  //**********************************************************************************************
2108 
/*!\brief Compile time predicate for the selection of the double precision complex BLAS kernels.
//
// \a value is nonzero if the element types of \a T1, \a T2 and \a T3 are all exactly
// \c complex<double>. The scalar type is not part of this predicate.
*/
2111  template< typename T1, typename T2, typename T3 >
2112  struct UseDoublePrecisionComplexKernel {
2113  typedef complex<double> Type;
2114  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2115  IsSame<typename T2::ElementType,Type>::value &&
2116  IsSame<typename T3::ElementType,Type>::value };
2117  };
2118  //**********************************************************************************************
2119 
2120  //**********************************************************************************************
2122 
/*!\brief Compile time predicate for the selection of the non-BLAS fallback kernel.
//
// \a value is nonzero if BLAS mode is disabled or if none of the four BLAS kernel predicates
// (single/double precision, real/complex) applies to the given types.
*/
2124  template< typename T1, typename T2, typename T3, typename T4 >
2125  struct UseDefaultKernel {
2126  enum { value = !BLAZE_BLAS_MODE || !( UseSinglePrecisionKernel<T1,T2,T3,T4>::value ||
2127  UseDoublePrecisionKernel<T1,T2,T3,T4>::value ||
2128  UseSinglePrecisionComplexKernel<T1,T2,T3>::value ||
2129  UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
2130  };
2131  //**********************************************************************************************
2132 
2133  //**********************************************************************************************
2135 
/*!\brief Compile time predicate for the selection of the vectorized default kernels.
//
// \a value is nonzero if all three matrix types are flagged as vectorizable, the three element
// types and the scalar type \a T4 are the same type, and IntrinsicTrait reports both addition
// and multiplication for that element type.
*/
2137  template< typename T1, typename T2, typename T3, typename T4 >
2138  struct UseVectorizedDefaultKernel {
2139  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2140  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2141  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2142  IsSame<typename T1::ElementType,T4>::value &&
2143  IntrinsicTrait<typename T1::ElementType>::addition &&
2144  IntrinsicTrait<typename T1::ElementType>::multiplication };
2145  };
2146  //**********************************************************************************************
2147 
2148  public:
2149  //**Type definitions****************************************************************************
2150  typedef DMatScalarMultExpr<MMM,ST,false> This;  //!< Type of this DMatScalarMultExpr instance.
2151  typedef typename MultTrait<RES,ST>::Type ResultType;  //!< Result type for expression template evaluations.
2152  typedef typename ResultType::OppositeType OppositeType;  //!< OppositeType as defined by the result type.
2153  typedef typename ResultType::TransposeType TransposeType;  //!< TransposeType as defined by the result type.
2154  typedef typename ResultType::ElementType ElementType;  //!< Element type of the result type.
2155  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType;  //!< Intrinsic type associated with the element type.
2156  typedef const ElementType ReturnType;  //!< Return type of the access operator (element by value).
2157  typedef const ResultType CompositeType;  //!< Type used when this expression is an operand of another expression.
2158 
2160  typedef const DMatTDMatMultExpr<MT1,MT2> LeftOperand;  //!< Type of the wrapped multiplication expression (stored by value).
2161 
2163  typedef typename SelectType< IsNumeric<ElementType>::value, ElementType, ST >::Type RightOperand;  //!< Type of the stored scalar (ElementType or ST, selected on IsNumeric<ElementType>).
2164 
2166  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;  //!< Type used to evaluate the left-hand side matrix operand (const RT1 or CT1, selected on IsComputation<MT1>).
2167 
2169  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;  //!< Type used to evaluate the right-hand side matrix operand (const RT2 or CT2, selected on IsComputation<MT2>).
2170  //**********************************************************************************************
2171 
2172  //**Compilation flags***************************************************************************
2174  enum { vectorizable = 0 };  //!< Compilation flag: this expression is marked as not vectorizable.
2175  //**********************************************************************************************
2176 
2177  //**Constructor*********************************************************************************
/*!\brief Constructor for the DMatScalarMultExpr specialization.
//
// \param matrix The left-hand side dense matrix multiplication expression.
// \param scalar The right-hand side scalar of the multiplication expression.
//
// Both arguments are stored in the expression object (see the LeftOperand and RightOperand types).
*/
2183  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
2184  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
2185  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2186  {}
2187  //**********************************************************************************************
2188 
2189  //**Access operator*****************************************************************************
/*!\brief 2D-access to the matrix elements.
//
// \param i Access index for the row. Must be smaller than the number of rows.
// \param j Access index for the column. Must be smaller than the number of columns.
// \return The element (i,j) of the wrapped multiplication expression multiplied by the scalar.
//
// The index bounds are only checked via internal assertions.
*/
2196  inline ReturnType operator()( size_t i, size_t j ) const {
2197  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
2198  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
2199  return matrix_(i,j) * scalar_;
2200  }
2201  //**********************************************************************************************
2202 
2203  //**Rows function*******************************************************************************
/*!\brief Returns the current number of rows of the matrix.
//
// \return The number of rows reported by the wrapped multiplication expression.
*/
2208  inline size_t rows() const {
2209  return matrix_.rows();
2210  }
2211  //**********************************************************************************************
2212 
2213  //**Columns function****************************************************************************
/*!\brief Returns the current number of columns of the matrix.
//
// \return The number of columns reported by the wrapped multiplication expression.
*/
2218  inline size_t columns() const {
2219  return matrix_.columns();
2220  }
2221  //**********************************************************************************************
2222 
2223  //**Left operand access*************************************************************************
/*!\brief Returns the left-hand side operand of the scaling, i.e. the multiplication expression.
//
// \return The stored dense matrix multiplication expression (returned as LeftOperand).
*/
2228  inline LeftOperand leftOperand() const {
2229  return matrix_;
2230  }
2231  //**********************************************************************************************
2232 
2233  //**Right operand access************************************************************************
/*!\brief Returns the right-hand side operand of the scaling, i.e. the scalar value.
//
// \return The stored scalar (returned as RightOperand).
*/
2238  inline RightOperand rightOperand() const {
2239  return scalar_;
2240  }
2241  //**********************************************************************************************
2242 
2243  //**********************************************************************************************
/*!\brief Returns whether the expression can alias with the given address \a alias.
//
// \param alias The alias to be checked.
// \return The result of the canAlias() query on the wrapped multiplication expression.
*/
2249  template< typename T >
2250  inline bool canAlias( const T* alias ) const {
2251  return matrix_.canAlias( alias );
2252  }
2253  //**********************************************************************************************
2254 
2255  //**********************************************************************************************
/*!\brief Returns whether the expression is aliased with the given address \a alias.
//
// \param alias The alias to be checked.
// \return The result of the isAliased() query on the wrapped multiplication expression.
*/
2261  template< typename T >
2262  inline bool isAliased( const T* alias ) const {
2263  return matrix_.isAliased( alias );
2264  }
2265  //**********************************************************************************************
2266 
2267  private:
2268  //**Member variables****************************************************************************
2269  LeftOperand matrix_;  //!< Left-hand side dense matrix multiplication expression.
2270  RightOperand scalar_;  //!< Right-hand side scalar of the multiplication expression.
2271  //**********************************************************************************************
2272 
2273  //**Assignment to dense matrices****************************************************************
/*!\brief Assignment of a scaled dense matrix-transpose dense matrix multiplication to a dense
//        matrix (\f$ C=s*A*B \f$).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// An empty target is left untouched, an empty inner dimension resets the target. Otherwise both
// operands are evaluated and the work is forwarded to the default kernel for targets below
// DMATTDMATMULT_THRESHOLD elements and to the BLAS kernel selection for all larger targets.
*/
2282  template< typename MT3 // Type of the target dense matrix
2283  , bool SO > // Storage order of the target dense matrix
2284  friend inline void assign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2285  {
2287 
2288  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2289  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2290 
2291  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2292  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2293 
// Guard: nothing to compute for an empty target
2294  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL )
2295  return;
2296 
// Guard: without an inner dimension the target is only reset
2297  if( left.columns() == 0UL ) {
2298  reset( ~lhs );
2299  return;
2300  }
2301 
2302  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2303  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2304 
2305  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2306  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2307  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2308  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2309  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2310  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2311 
// Targets at or above the threshold go through the BLAS kernel selection
2312  if( (~lhs).rows() * (~lhs).columns() >= DMATTDMATMULT_THRESHOLD ) {
2313  DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
2314  }
2315  else {
2316  DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2317  }
2318  }
2317  //**********************************************************************************************
2318 
2319  //**Default assignment to dense matrices********************************************************
/*!\brief Default (non-vectorized) assignment of a scaled dense matrix-transpose dense matrix
//        multiplication (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Each row of \a C is initialized with the first product term, accumulated over the remaining
// inner indices and finally scaled. The first term reads column 0 of \a A and row 0 of \a B, so
// the inner dimension must not be empty.
*/
2333  template< typename MT3 // Type of the left-hand side target matrix
2334  , typename MT4 // Type of the left-hand side matrix operand
2335  , typename MT5 // Type of the right-hand side matrix operand
2336  , typename ST2 > // Type of the scalar value
2337  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2338  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2339  {
2340  for( size_t row=0UL; row<A.rows(); ++row )
2341  {
// Initialization with the first term of the inner product
2342  for( size_t col=0UL; col<B.columns(); ++col ) {
2343  C(row,col) = A(row,0UL) * B(0UL,col);
2344  }
// Accumulation of the remaining terms
2345  for( size_t inner=1UL; inner<A.columns(); ++inner ) {
2346  for( size_t col=0UL; col<B.columns(); ++col ) {
2347  C(row,col) += A(row,inner) * B(inner,col);
2348  }
2349  }
// Scaling of the finished row
2350  for( size_t col=0UL; col<B.columns(); ++col ) {
2351  C(row,col) *= scalar;
2352  }
2353  }
2354  }
2354  //**********************************************************************************************
2355 
2356  //**Vectorized default assignment to row-major dense matrices***********************************
2370  template< typename MT3 // Type of the left-hand side target matrix
2371  , typename MT4 // Type of the left-hand side matrix operand
2372  , typename MT5 // Type of the right-hand side matrix operand
2373  , typename ST2 > // Type of the scalar value
2374  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2375  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2376  {
2377  typedef IntrinsicTrait<ElementType> IT;
2378 
2379  const size_t M( A.rows() );
2380  const size_t N( B.columns() );
2381  const size_t K( A.columns() );
2382 
2383  size_t i( 0UL );
2384 
2385  for( ; (i+2UL) <= M; i+=2UL ) {
2386  size_t j( 0UL );
2387  for( ; (j+4UL) <= N; j+=4UL ) {
2388  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2389  for( size_t k=0UL; k<K; k+=IT::size ) {
2390  const IntrinsicType a1( A.get(i ,k) );
2391  const IntrinsicType a2( A.get(i+1UL,k) );
2392  const IntrinsicType b1( B.get(k,j ) );
2393  const IntrinsicType b2( B.get(k,j+1UL) );
2394  const IntrinsicType b3( B.get(k,j+2UL) );
2395  const IntrinsicType b4( B.get(k,j+3UL) );
2396  xmm1 = xmm1 + a1 * b1;
2397  xmm2 = xmm2 + a1 * b2;
2398  xmm3 = xmm3 + a1 * b3;
2399  xmm4 = xmm4 + a1 * b4;
2400  xmm5 = xmm5 + a2 * b1;
2401  xmm6 = xmm6 + a2 * b2;
2402  xmm7 = xmm7 + a2 * b3;
2403  xmm8 = xmm8 + a2 * b4;
2404  }
2405  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2406  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2407  (~C)(i ,j+2UL) = sum( xmm3 ) * scalar;
2408  (~C)(i ,j+3UL) = sum( xmm4 ) * scalar;
2409  (~C)(i+1UL,j ) = sum( xmm5 ) * scalar;
2410  (~C)(i+1UL,j+1UL) = sum( xmm6 ) * scalar;
2411  (~C)(i+1UL,j+2UL) = sum( xmm7 ) * scalar;
2412  (~C)(i+1UL,j+3UL) = sum( xmm8 ) * scalar;
2413  }
2414  for( ; (j+2UL) <= N; j+=2UL ) {
2415  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2416  for( size_t k=0UL; k<K; k+=IT::size ) {
2417  const IntrinsicType a1( A.get(i ,k) );
2418  const IntrinsicType a2( A.get(i+1UL,k) );
2419  const IntrinsicType b1( B.get(k,j ) );
2420  const IntrinsicType b2( B.get(k,j+1UL) );
2421  xmm1 = xmm1 + a1 * b1;
2422  xmm2 = xmm2 + a1 * b2;
2423  xmm3 = xmm3 + a2 * b1;
2424  xmm4 = xmm4 + a2 * b2;
2425  }
2426  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2427  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2428  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
2429  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
2430  }
2431  if( j < N ) {
2432  IntrinsicType xmm1, xmm2;
2433  for( size_t k=0UL; k<K; k+=IT::size ) {
2434  const IntrinsicType b1( B.get(k,j) );
2435  xmm1 = xmm1 + A.get(i ,k) * b1;
2436  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2437  }
2438  (~C)(i ,j) = sum( xmm1 ) * scalar;
2439  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
2440  }
2441  }
2442  if( i < M ) {
2443  size_t j( 0UL );
2444  for( ; (j+4UL) <= N; j+=4UL ) {
2445  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2446  for( size_t k=0UL; k<K; k+=IT::size ) {
2447  const IntrinsicType a1( A.get(i,k) );
2448  xmm1 = xmm1 + a1 * B.get(k,j );
2449  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2450  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
2451  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
2452  }
2453  (~C)(i,j ) = sum( xmm1 ) * scalar;
2454  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
2455  (~C)(i,j+2UL) = sum( xmm3 ) * scalar;
2456  (~C)(i,j+3UL) = sum( xmm4 ) * scalar;
2457  }
2458  for( ; (j+2UL) <= N; j+=2UL ) {
2459  IntrinsicType xmm1, xmm2;
2460  for( size_t k=0UL; k<K; k+=IT::size ) {
2461  const IntrinsicType a1( A.get(i,k) );
2462  xmm1 = xmm1 + a1 * B.get(k,j );
2463  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2464  }
2465  (~C)(i,j ) = sum( xmm1 ) * scalar;
2466  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
2467  }
2468  if( j < N ) {
2469  IntrinsicType xmm1, xmm2;
2470  for( size_t k=0UL; k<K; k+=IT::size ) {
2471  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
2472  }
2473  (~C)(i,j) = sum( xmm1 ) * scalar;
2474  }
2475  }
2476  }
2477  //**********************************************************************************************
2478 
2479  //**Vectorized default assignment to column-major dense matrices********************************
2493  template< typename MT3 // Type of the left-hand side target matrix
2494  , typename MT4 // Type of the left-hand side matrix operand
2495  , typename MT5 // Type of the right-hand side matrix operand
2496  , typename ST2 > // Type of the scalar value
2497  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2498  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
2499  {
2500  typedef IntrinsicTrait<ElementType> IT;
2501 
2502  const size_t M( A.rows() );
2503  const size_t N( B.columns() );
2504  const size_t K( A.columns() );
2505 
2506  size_t i( 0UL );
2507 
2508  for( ; (i+4UL) <= M; i+=4UL ) {
2509  size_t j( 0UL );
2510  for( ; (j+2UL) <= N; j+=2UL ) {
2511  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2512  for( size_t k=0UL; k<K; k+=IT::size ) {
2513  const IntrinsicType a1( A.get(i ,k) );
2514  const IntrinsicType a2( A.get(i+1UL,k) );
2515  const IntrinsicType a3( A.get(i+2UL,k) );
2516  const IntrinsicType a4( A.get(i+3UL,k) );
2517  const IntrinsicType b1( B.get(k,j ) );
2518  const IntrinsicType b2( B.get(k,j+1UL) );
2519  xmm1 = xmm1 + a1 * b1;
2520  xmm2 = xmm2 + a1 * b2;
2521  xmm3 = xmm3 + a2 * b1;
2522  xmm4 = xmm4 + a2 * b2;
2523  xmm5 = xmm5 + a3 * b1;
2524  xmm6 = xmm6 + a3 * b2;
2525  xmm7 = xmm7 + a4 * b1;
2526  xmm8 = xmm8 + a4 * b2;
2527  }
2528  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2529  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2530  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
2531  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
2532  (~C)(i+2UL,j ) = sum( xmm5 ) * scalar;
2533  (~C)(i+2UL,j+1UL) = sum( xmm6 ) * scalar;
2534  (~C)(i+3UL,j ) = sum( xmm7 ) * scalar;
2535  (~C)(i+3UL,j+1UL) = sum( xmm8 ) * scalar;
2536  }
2537  if( j < N ) {
2538  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2539  for( size_t k=0UL; k<K; k+=IT::size ) {
2540  const IntrinsicType b1( B.get(k,j) );
2541  xmm1 = xmm1 + A.get(i ,k) * b1;
2542  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2543  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
2544  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
2545  }
2546  (~C)(i ,j) = sum( xmm1 ) * scalar;
2547  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
2548  (~C)(i+2UL,j) = sum( xmm3 ) * scalar;
2549  (~C)(i+3UL,j) = sum( xmm4 ) * scalar;
2550  }
2551  }
2552  for( ; (i+2UL) <= M; i+=2UL ) {
2553  size_t j( 0UL );
2554  for( ; (j+2UL) <= N; j+=2UL ) {
2555  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2556  for( size_t k=0UL; k<K; k+=IT::size ) {
2557  const IntrinsicType a1( A.get(i ,k) );
2558  const IntrinsicType a2( A.get(i+1UL,k) );
2559  const IntrinsicType b1( B.get(k,j ) );
2560  const IntrinsicType b2( B.get(k,j+1UL) );
2561  xmm1 = xmm1 + a1 * b1;
2562  xmm2 = xmm2 + a1 * b2;
2563  xmm3 = xmm3 + a2 * b1;
2564  xmm4 = xmm4 + a2 * b2;
2565  }
2566  (~C)(i ,j ) = sum( xmm1 ) * scalar;
2567  (~C)(i ,j+1UL) = sum( xmm2 ) * scalar;
2568  (~C)(i+1UL,j ) = sum( xmm3 ) * scalar;
2569  (~C)(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
2570  }
2571  if( j < N ) {
2572  IntrinsicType xmm1, xmm2;
2573  for( size_t k=0UL; k<K; k+=IT::size ) {
2574  const IntrinsicType b1( B.get(k,j) );
2575  xmm1 = xmm1 + A.get(i ,k) * b1;
2576  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
2577  }
2578  (~C)(i ,j) = sum( xmm1 ) * scalar;
2579  (~C)(i+1UL,j) = sum( xmm2 ) * scalar;
2580  }
2581  }
2582  if( i < M ) {
2583  size_t j( 0UL );
2584  for( ; (j+2UL) <= N; j+=2UL ) {
2585  IntrinsicType xmm1, xmm2;
2586  for( size_t k=0UL; k<K; k+=IT::size ) {
2587  const IntrinsicType a1( A.get(i,k) );
2588  xmm1 = xmm1 + a1 * B.get(k,j );
2589  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
2590  }
2591  (~C)(i,j ) = sum( xmm1 ) * scalar;
2592  (~C)(i,j+1UL) = sum( xmm2 ) * scalar;
2593  }
2594  if( j < N ) {
2595  IntrinsicType xmm1, xmm2;
2596  for( size_t k=0UL; k<K; k+=IT::size ) {
2597  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
2598  }
2599  (~C)(i,j) = sum( xmm1 ) * scalar;
2600  }
2601  }
2602  }
2603  //**********************************************************************************************
2604 
2605  //**BLAS-based assignment to dense matrices (default)*******************************************
/*!\brief Fallback of the BLAS kernel selection for the assignment (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Enabled when UseDefaultKernel applies, i.e. when BLAS mode is off or none of the BLAS kernel
// predicates matches. It forwards all arguments to selectDefaultAssignKernel().
*/
2619  template< typename MT3 // Type of the left-hand side target matrix
2620  , typename MT4 // Type of the left-hand side matrix operand
2621  , typename MT5 // Type of the right-hand side matrix operand
2622  , typename ST2 > // Type of the scalar value
2623  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2624  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2625  {
2626  selectDefaultAssignKernel( C, A, B, scalar );
2627  }
2628  //**********************************************************************************************
2629 
2630  //**BLAS-based assignment to dense matrices (single precision)**********************************
2631 #if BLAZE_BLAS_MODE
2632 
/*!\brief BLAS-based assignment of a scaled dense matrix-transpose dense matrix multiplication
//        for single precision matrices (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor, passed to BLAS as alpha.
// \return void
//
// The product is evaluated by \c cblas_sgemm with alpha = \a scalar and beta = 0.0F.
*/
2645  template< typename MT3 // Type of the left-hand side target matrix
2646  , typename MT4 // Type of the left-hand side matrix operand
2647  , typename MT5 // Type of the right-hand side matrix operand
2648  , typename ST2 > // Type of the scalar value
2649  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2650  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2651  {
2652  using boost::numeric_cast;
2653 
2654  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
2655  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
2656  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
2657 
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs the size_t -> int
// conversion for each of them.
2658  const int M ( numeric_cast<int>( A.rows() ) );
2659  const int N ( numeric_cast<int>( B.columns() ) );
2660  const int K ( numeric_cast<int>( A.columns() ) );
2661  const int lda( numeric_cast<int>( A.spacing() ) );
2662  const int ldb( numeric_cast<int>( B.spacing() ) );
2663  const int ldc( numeric_cast<int>( C.spacing() ) );
2664 
// The storage order of the target selects the CBLAS layout. For a row-major target A is passed
// untransposed and B transposed; for a column-major target the two flags are swapped.
2665  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2666  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2667  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2668  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
2669  }
2670 #endif
2671  //**********************************************************************************************
2672 
2673  //**BLAS-based assignment to dense matrices (double precision)**********************************
2674 #if BLAZE_BLAS_MODE
2675 
/*!\brief BLAS-based assignment of a scaled dense matrix-transpose dense matrix multiplication
//        for double precision matrices (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor, passed to BLAS as alpha.
// \return void
//
// The product is evaluated by \c cblas_dgemm with alpha = \a scalar and beta = 0.0.
*/
2688  template< typename MT3 // Type of the left-hand side target matrix
2689  , typename MT4 // Type of the left-hand side matrix operand
2690  , typename MT5 // Type of the right-hand side matrix operand
2691  , typename ST2 > // Type of the scalar value
2692  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2693  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2694  {
2695  using boost::numeric_cast;
2696 
2697  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
2698  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
2699  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
2700 
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs the size_t -> int
// conversion for each of them.
2701  const int M ( numeric_cast<int>( A.rows() ) );
2702  const int N ( numeric_cast<int>( B.columns() ) );
2703  const int K ( numeric_cast<int>( A.columns() ) );
2704  const int lda( numeric_cast<int>( A.spacing() ) );
2705  const int ldb( numeric_cast<int>( B.spacing() ) );
2706  const int ldc( numeric_cast<int>( C.spacing() ) );
2707 
// The storage order of the target selects the CBLAS layout. For a row-major target A is passed
// untransposed and B transposed; for a column-major target the two flags are swapped.
2708  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2709  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2710  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2711  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
2712  }
2713 #endif
2714  //**********************************************************************************************
2715 
2716  //**BLAS-based assignment to dense matrices (single precision complex)**************************
2717 #if BLAZE_BLAS_MODE
2718 
/*!\brief BLAS-based assignment of a scaled dense matrix-transpose dense matrix multiplication
//        for single precision complex matrices (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor; converted to complex<float> and passed to BLAS as alpha.
// \return void
//
// The product is evaluated by \c cblas_cgemm with alpha = \a scalar and beta = (0,0).
*/
2731  template< typename MT3 // Type of the left-hand side target matrix
2732  , typename MT4 // Type of the left-hand side matrix operand
2733  , typename MT5 // Type of the right-hand side matrix operand
2734  , typename ST2 > // Type of the scalar value
2735  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2736  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2737  {
2738  using boost::numeric_cast;
2739 
2740  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2741  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2742  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2744  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
2745  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
2746  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
2747 
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs the size_t -> int
// conversion for each of them.
2748  const int M ( numeric_cast<int>( A.rows() ) );
2749  const int N ( numeric_cast<int>( B.columns() ) );
2750  const int K ( numeric_cast<int>( A.columns() ) );
2751  const int lda( numeric_cast<int>( A.spacing() ) );
2752  const int ldb( numeric_cast<int>( B.spacing() ) );
2753  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm interface receives alpha and beta by address, hence the named constants.
2754  const complex<float> alpha( scalar );
2755  const complex<float> beta ( 0.0F, 0.0F );
2756 
// The storage order of the target selects the CBLAS layout. For a row-major target A is passed
// untransposed and B transposed; for a column-major target the two flags are swapped.
2757  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2758  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2759  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2760  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2761  }
2762 #endif
2763  //**********************************************************************************************
2764 
2765  //**BLAS-based assignment to dense matrices (double precision complex)**************************
2766 #if BLAZE_BLAS_MODE
2767 
/*!\brief BLAS-based assignment of a scaled dense matrix-transpose dense matrix multiplication
//        for double precision complex matrices (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor; converted to complex<double> and passed to BLAS as alpha.
// \return void
//
// The product is evaluated by \c cblas_zgemm with alpha = \a scalar and beta = (0,0).
*/
2780  template< typename MT3 // Type of the left-hand side target matrix
2781  , typename MT4 // Type of the left-hand side matrix operand
2782  , typename MT5 // Type of the right-hand side matrix operand
2783  , typename ST2 > // Type of the scalar value
2784  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2785  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2786  {
2787  using boost::numeric_cast;
2788 
2789  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2790  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2791  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2793  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
2794  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
2795  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
2796 
// CBLAS takes int dimensions and leading dimensions; numeric_cast performs the size_t -> int
// conversion for each of them.
2797  const int M ( numeric_cast<int>( A.rows() ) );
2798  const int N ( numeric_cast<int>( B.columns() ) );
2799  const int K ( numeric_cast<int>( A.columns() ) );
2800  const int lda( numeric_cast<int>( A.spacing() ) );
2801  const int ldb( numeric_cast<int>( B.spacing() ) );
2802  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm interface receives alpha and beta by address, hence the named constants.
2803  const complex<double> alpha( scalar );
2804  const complex<double> beta ( 0.0, 0.0 );
2805 
// The storage order of the target selects the CBLAS layout. For a row-major target A is passed
// untransposed and B transposed; for a column-major target the two flags are swapped.
2806  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2807  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2808  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2809  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2810  }
2811 #endif
2812  //**********************************************************************************************
2813 
2814  //**Assignment to sparse matrices***************************************************************
/*!\brief Assignment of a scaled dense matrix-transpose dense matrix multiplication to a sparse
//        matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// The expression is first evaluated into a temporary of type TmpType, which is then assigned to
// the sparse target. TmpType is OppositeType or ResultType, selected on the storage order \a SO
// of the target.
*/
2826  template< typename MT // Type of the target sparse matrix
2827  , bool SO > // Storage order of the target sparse matrix
2828  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
2829  {
2831 
2832  typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
2833 
2839  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
2840 
2841  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2842  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2843 
// Evaluate the whole expression once, then assign the evaluated temporary
2844  const TmpType tmp( rhs );
2845  assign( ~lhs, tmp );
2846  }
2847  //**********************************************************************************************
2848 
2849  //**Addition assignment to dense matrices*******************************************************
/*!\brief Addition assignment of a scaled dense matrix-transpose dense matrix multiplication to a
//        dense matrix (\f$ C+=s*A*B \f$).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
// \return void
//
// Nothing is done if the target or the inner dimension is empty. Otherwise both operands are
// evaluated and the work is forwarded to the default kernel for targets below
// DMATTDMATMULT_THRESHOLD elements and to the BLAS kernel selection for all larger targets.
*/
2861  template< typename MT3 // Type of the target dense matrix
2862  , bool SO > // Storage order of the target dense matrix
2863  friend inline void addAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2864  {
2866 
2867  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2868  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2869 
2870  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2871  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2872 
// Guard: an empty target or an empty inner dimension contributes nothing
2873  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL )
2874  return;
2876 
2877  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2878  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2879 
2880  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2881  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2882  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2883  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2884  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2885  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2886 
// Targets at or above the threshold go through the BLAS kernel selection
2887  if( (~lhs).rows() * (~lhs).columns() >= DMATTDMATMULT_THRESHOLD ) {
2888  DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2889  }
2890  else {
2891  DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
2892  }
2893  }
2892  //**********************************************************************************************
2893 
2894  //**Default addition assignment to dense matrices***********************************************
// Non-vectorized default kernel for the addition assignment of a scaled product.
//
// C      : target matrix the scaled product is added to.
// A, B   : left and right operand of the product.
// scalar : scaling factor of the product.
//
// This overload takes part in overload resolution only if
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf). The
// scaled product is first evaluated into a ResultType object, which is then
// handed to addAssign().
2908  template< typename MT3 // Type of the left-hand side target matrix
2909  , typename MT4 // Type of the left-hand side matrix operand
2910  , typename MT5 // Type of the right-hand side matrix operand
2911  , typename ST2 > // Type of the scalar value
2912  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2913  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2914  {
2915  const ResultType scaledProduct( A * B * scalar );
2916  addAssign( C, scaledProduct );
2917  }
2918  //**********************************************************************************************
2919 
2920  //**Vectorized default addition assignment to row-major dense matrices**************************
2934  template< typename MT3 // Type of the left-hand side target matrix
2935  , typename MT4 // Type of the left-hand side matrix operand
2936  , typename MT5 // Type of the right-hand side matrix operand
2937  , typename ST2 > // Type of the scalar value
2938  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2939  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2940  {
2941  typedef IntrinsicTrait<ElementType> IT;
2942 
2943  const size_t M( A.rows() );
2944  const size_t N( B.columns() );
2945  const size_t K( A.columns() );
2946 
2947  size_t i( 0UL );
2948 
2949  for( ; (i+2UL) <= M; i+=2UL ) {
2950  size_t j( 0UL );
2951  for( ; (j+4UL) <= N; j+=4UL ) {
2952  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2953  for( size_t k=0UL; k<K; k+=IT::size ) {
2954  const IntrinsicType a1( A.get(i ,k) );
2955  const IntrinsicType a2( A.get(i+1UL,k) );
2956  const IntrinsicType b1( B.get(k,j ) );
2957  const IntrinsicType b2( B.get(k,j+1UL) );
2958  const IntrinsicType b3( B.get(k,j+2UL) );
2959  const IntrinsicType b4( B.get(k,j+3UL) );
2960  xmm1 = xmm1 + a1 * b1;
2961  xmm2 = xmm2 + a1 * b2;
2962  xmm3 = xmm3 + a1 * b3;
2963  xmm4 = xmm4 + a1 * b4;
2964  xmm5 = xmm5 + a2 * b1;
2965  xmm6 = xmm6 + a2 * b2;
2966  xmm7 = xmm7 + a2 * b3;
2967  xmm8 = xmm8 + a2 * b4;
2968  }
2969  (~C)(i ,j ) += sum( xmm1 ) * scalar;
2970  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
2971  (~C)(i ,j+2UL) += sum( xmm3 ) * scalar;
2972  (~C)(i ,j+3UL) += sum( xmm4 ) * scalar;
2973  (~C)(i+1UL,j ) += sum( xmm5 ) * scalar;
2974  (~C)(i+1UL,j+1UL) += sum( xmm6 ) * scalar;
2975  (~C)(i+1UL,j+2UL) += sum( xmm7 ) * scalar;
2976  (~C)(i+1UL,j+3UL) += sum( xmm8 ) * scalar;
2977  }
2978  for( ; (j+2UL) <= N; j+=2UL ) {
2979  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2980  for( size_t k=0UL; k<K; k+=IT::size ) {
2981  const IntrinsicType a1( A.get(i ,k) );
2982  const IntrinsicType a2( A.get(i+1UL,k) );
2983  const IntrinsicType b1( B.get(k,j ) );
2984  const IntrinsicType b2( B.get(k,j+1UL) );
2985  xmm1 = xmm1 + a1 * b1;
2986  xmm2 = xmm2 + a1 * b2;
2987  xmm3 = xmm3 + a2 * b1;
2988  xmm4 = xmm4 + a2 * b2;
2989  }
2990  (~C)(i ,j ) += sum( xmm1 ) * scalar;
2991  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
2992  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
2993  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
2994  }
2995  if( j < N ) {
2996  IntrinsicType xmm1, xmm2;
2997  for( size_t k=0UL; k<K; k+=IT::size ) {
2998  const IntrinsicType b1( B.get(k,j) );
2999  xmm1 = xmm1 + A.get(i ,k) * b1;
3000  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3001  }
3002  (~C)(i ,j) += sum( xmm1 ) * scalar;
3003  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
3004  }
3005  }
3006  if( i < M ) {
3007  size_t j( 0UL );
3008  for( ; (j+4UL) <= N; j+=4UL ) {
3009  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3010  for( size_t k=0UL; k<K; k+=IT::size ) {
3011  const IntrinsicType a1( A.get(i,k) );
3012  xmm1 = xmm1 + a1 * B.get(k,j );
3013  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3014  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
3015  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
3016  }
3017  (~C)(i,j ) += sum( xmm1 ) * scalar;
3018  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
3019  (~C)(i,j+2UL) += sum( xmm3 ) * scalar;
3020  (~C)(i,j+3UL) += sum( xmm4 ) * scalar;
3021  }
3022  for( ; (j+2UL) <= N; j+=2UL ) {
3023  IntrinsicType xmm1, xmm2;
3024  for( size_t k=0UL; k<K; k+=IT::size ) {
3025  const IntrinsicType a1( A.get(i,k) );
3026  xmm1 = xmm1 + a1 * B.get(k,j );
3027  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3028  }
3029  (~C)(i,j ) += sum( xmm1 ) * scalar;
3030  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
3031  }
3032  if( j < N ) {
3033  IntrinsicType xmm1, xmm2;
3034  for( size_t k=0UL; k<K; k+=IT::size ) {
3035  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3036  }
3037  (~C)(i,j) += sum( xmm1 ) * scalar;
3038  }
3039  }
3040  }
3041  //**********************************************************************************************
3042 
3043  //**Vectorized default addition assignment to column-major dense matrices***********************
3057  template< typename MT3 // Type of the left-hand side target matrix
3058  , typename MT4 // Type of the left-hand side matrix operand
3059  , typename MT5 // Type of the right-hand side matrix operand
3060  , typename ST2 > // Type of the scalar value
3061  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3062  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3063  {
3064  typedef IntrinsicTrait<ElementType> IT;
3065 
3066  const size_t M( A.rows() );
3067  const size_t N( B.columns() );
3068  const size_t K( A.columns() );
3069 
3070  size_t i( 0UL );
3071 
3072  for( ; (i+4UL) <= M; i+=4UL ) {
3073  size_t j( 0UL );
3074  for( ; (j+2UL) <= N; j+=2UL ) {
3075  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3076  for( size_t k=0UL; k<K; k+=IT::size ) {
3077  const IntrinsicType a1( A.get(i ,k) );
3078  const IntrinsicType a2( A.get(i+1UL,k) );
3079  const IntrinsicType a3( A.get(i+2UL,k) );
3080  const IntrinsicType a4( A.get(i+3UL,k) );
3081  const IntrinsicType b1( B.get(k,j ) );
3082  const IntrinsicType b2( B.get(k,j+1UL) );
3083  xmm1 = xmm1 + a1 * b1;
3084  xmm2 = xmm2 + a1 * b2;
3085  xmm3 = xmm3 + a2 * b1;
3086  xmm4 = xmm4 + a2 * b2;
3087  xmm5 = xmm5 + a3 * b1;
3088  xmm6 = xmm6 + a3 * b2;
3089  xmm7 = xmm7 + a4 * b1;
3090  xmm8 = xmm8 + a4 * b2;
3091  }
3092  (~C)(i ,j ) += sum( xmm1 ) * scalar;
3093  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
3094  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
3095  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
3096  (~C)(i+2UL,j ) += sum( xmm5 ) * scalar;
3097  (~C)(i+2UL,j+1UL) += sum( xmm6 ) * scalar;
3098  (~C)(i+3UL,j ) += sum( xmm7 ) * scalar;
3099  (~C)(i+3UL,j+1UL) += sum( xmm8 ) * scalar;
3100  }
3101  if( j < N ) {
3102  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3103  for( size_t k=0UL; k<K; k+=IT::size ) {
3104  const IntrinsicType b1( B.get(k,j) );
3105  xmm1 = xmm1 + A.get(i ,k) * b1;
3106  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3107  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
3108  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
3109  }
3110  (~C)(i ,j) += sum( xmm1 ) * scalar;
3111  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
3112  (~C)(i+2UL,j) += sum( xmm3 ) * scalar;
3113  (~C)(i+3UL,j) += sum( xmm4 ) * scalar;
3114  }
3115  }
3116  for( ; (i+2UL) <= M; i+=2UL ) {
3117  size_t j( 0UL );
3118  for( ; (j+2UL) <= N; j+=2UL ) {
3119  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3120  for( size_t k=0UL; k<K; k+=IT::size ) {
3121  const IntrinsicType a1( A.get(i ,k) );
3122  const IntrinsicType a2( A.get(i+1UL,k) );
3123  const IntrinsicType b1( B.get(k,j ) );
3124  const IntrinsicType b2( B.get(k,j+1UL) );
3125  xmm1 = xmm1 + a1 * b1;
3126  xmm2 = xmm2 + a1 * b2;
3127  xmm3 = xmm3 + a2 * b1;
3128  xmm4 = xmm4 + a2 * b2;
3129  }
3130  (~C)(i ,j ) += sum( xmm1 ) * scalar;
3131  (~C)(i ,j+1UL) += sum( xmm2 ) * scalar;
3132  (~C)(i+1UL,j ) += sum( xmm3 ) * scalar;
3133  (~C)(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
3134  }
3135  if( j < N ) {
3136  IntrinsicType xmm1, xmm2;
3137  for( size_t k=0UL; k<K; k+=IT::size ) {
3138  const IntrinsicType b1( B.get(k,j) );
3139  xmm1 = xmm1 + A.get(i ,k) * b1;
3140  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3141  }
3142  (~C)(i ,j) += sum( xmm1 ) * scalar;
3143  (~C)(i+1UL,j) += sum( xmm2 ) * scalar;
3144  }
3145  }
3146  if( i < M ) {
3147  size_t j( 0UL );
3148  for( ; (j+2UL) <= N; j+=2UL ) {
3149  IntrinsicType xmm1, xmm2;
3150  for( size_t k=0UL; k<K; k+=IT::size ) {
3151  const IntrinsicType a1( A.get(i,k) );
3152  xmm1 = xmm1 + a1 * B.get(k,j );
3153  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3154  }
3155  (~C)(i,j ) += sum( xmm1 ) * scalar;
3156  (~C)(i,j+1UL) += sum( xmm2 ) * scalar;
3157  }
3158  if( j < N ) {
3159  IntrinsicType xmm1, xmm2;
3160  for( size_t k=0UL; k<K; k+=IT::size ) {
3161  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3162  }
3163  (~C)(i,j) += sum( xmm1 ) * scalar;
3164  }
3165  }
3166  }
3167  //**********************************************************************************************
3168 
3169  //**BLAS-based addition assignment to dense matrices (default)**********************************
// Fallback of the BLAS kernel selection for the addition assignment.
//
// C      : target matrix the scaled product is added to.
// A, B   : left and right operand of the product.
// scalar : scaling factor of the product.
//
// Enabled if UseDefaultKernel<MT3,MT4,MT5,ST2> holds; in that case no BLAS
// routine is called and the work is forwarded unchanged to the default kernel
// selection.
3183  template< typename MT3 // Type of the left-hand side target matrix
3184  , typename MT4 // Type of the left-hand side matrix operand
3185  , typename MT5 // Type of the right-hand side matrix operand
3186  , typename ST2 > // Type of the scalar value
3187  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3188  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3189  {
3190  DMatScalarMultExpr::selectDefaultAddAssignKernel( C, A, B, scalar );
3191  }
3192  //**********************************************************************************************
3193 
3194  //**BLAS-based addition assignment to dense matrices (single precision)*************************
3195 #if BLAZE_BLAS_MODE
3196 
// BLAS kernel for C += ( A * B ) * scalar on single precision matrices.
//
// C      : target dense matrix, updated in place by the BLAS call.
// A, B   : left and right operand of the product.
// scalar : scaling factor, passed to cblas_sgemm as alpha.
//
// beta is fixed to 1.0F, so the previous content of C is kept and the scaled
// product is added to it. For a row-major C the row-major layout is used with
// B flagged as transposed; otherwise the column-major layout is used with A
// flagged as transposed. All sizes and spacings are converted to int through
// boost::numeric_cast.
3209  template< typename MT3 // Type of the left-hand side target matrix
3210  , typename MT4 // Type of the left-hand side matrix operand
3211  , typename MT5 // Type of the right-hand side matrix operand
3212  , typename ST2 > // Type of the scalar value
3213  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3214  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3215  {
3218  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
3219  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
3220  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
3221 
3222  const int numRows  = boost::numeric_cast<int>( A.rows() );
3223  const int numCols  = boost::numeric_cast<int>( B.columns() );
3224  const int innerDim = boost::numeric_cast<int>( A.columns() );
3225  const int strideA  = boost::numeric_cast<int>( A.spacing() );
3226  const int strideB  = boost::numeric_cast<int>( B.spacing() );
3227  const int strideC  = boost::numeric_cast<int>( C.spacing() );
3228 
3229  cblas_sgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
3230  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
3231  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
3232  numRows, numCols, innerDim, scalar, A.data(), strideA, B.data(), strideB, 1.0F, C.data(), strideC );
3233  }
3234 #endif
3235  //**********************************************************************************************
3236 
3237  //**BLAS-based addition assignment to dense matrices (double precision)*************************
3238 #if BLAZE_BLAS_MODE
3239 
// BLAS kernel for C += ( A * B ) * scalar on double precision matrices.
//
// C      : target dense matrix, updated in place by the BLAS call.
// A, B   : left and right operand of the product.
// scalar : scaling factor, passed to cblas_dgemm as alpha.
//
// beta is fixed to 1.0, so the previous content of C is kept and the scaled
// product is added to it. For a row-major C the row-major layout is used with
// B flagged as transposed; otherwise the column-major layout is used with A
// flagged as transposed. All sizes and spacings are converted to int through
// boost::numeric_cast.
3252  template< typename MT3 // Type of the left-hand side target matrix
3253  , typename MT4 // Type of the left-hand side matrix operand
3254  , typename MT5 // Type of the right-hand side matrix operand
3255  , typename ST2 > // Type of the scalar value
3256  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3257  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3258  {
3261  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
3262  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
3263  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
3264 
3265  const int numRows  = boost::numeric_cast<int>( A.rows() );
3266  const int numCols  = boost::numeric_cast<int>( B.columns() );
3267  const int innerDim = boost::numeric_cast<int>( A.columns() );
3268  const int strideA  = boost::numeric_cast<int>( A.spacing() );
3269  const int strideB  = boost::numeric_cast<int>( B.spacing() );
3270  const int strideC  = boost::numeric_cast<int>( C.spacing() );
3271 
3272  cblas_dgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
3273  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
3274  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
3275  numRows, numCols, innerDim, scalar, A.data(), strideA, B.data(), strideB, 1.0, C.data(), strideC );
3276  }
3277 #endif
3278  //**********************************************************************************************
3279 
3280  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
3281 #if BLAZE_BLAS_MODE
3282 
// BLAS kernel for C += ( A * B ) * scalar on single precision complex matrices.
//
// C      : target dense matrix, updated in place by the BLAS call.
// A, B   : left and right operand of the product.
// scalar : scaling factor; converted to complex<float> and passed as alpha.
//
// beta is the complex value (1,0), so the previous content of C is kept and the
// scaled product is added to it. alpha and beta are handed to cblas_cgemm by
// address. For a row-major C the row-major layout is used with B flagged as
// transposed; otherwise the column-major layout is used with A flagged as
// transposed. All sizes and spacings are converted to int through
// boost::numeric_cast.
3295  template< typename MT3 // Type of the left-hand side target matrix
3296  , typename MT4 // Type of the left-hand side matrix operand
3297  , typename MT5 // Type of the right-hand side matrix operand
3298  , typename ST2 > // Type of the scalar value
3299  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3300  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3301  {
3304  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3305  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3306  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3308  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3309  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3310  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3311 
3312  const int numRows  = boost::numeric_cast<int>( A.rows() );
3313  const int numCols  = boost::numeric_cast<int>( B.columns() );
3314  const int innerDim = boost::numeric_cast<int>( A.columns() );
3315  const int strideA  = boost::numeric_cast<int>( A.spacing() );
3316  const int strideB  = boost::numeric_cast<int>( B.spacing() );
3317  const int strideC  = boost::numeric_cast<int>( C.spacing() );
3318  const complex<float> productFactor( scalar );
3319  const complex<float> targetFactor ( 1.0F, 0.0F );
3320 
3321  cblas_cgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
3322  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
3323  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
3324  numRows, numCols, innerDim, &productFactor, A.data(), strideA, B.data(), strideB, &targetFactor, C.data(), strideC );
3325  }
3326 #endif
3327  //**********************************************************************************************
3328 
3329  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
3330 #if BLAZE_BLAS_MODE
3331 
// BLAS kernel for C += ( A * B ) * scalar on double precision complex matrices.
//
// C      : target dense matrix, updated in place by the BLAS call.
// A, B   : left and right operand of the product.
// scalar : scaling factor; converted to complex<double> and passed as alpha.
//
// beta is the complex value (1,0), so the previous content of C is kept and the
// scaled product is added to it. alpha and beta are handed to cblas_zgemm by
// address. For a row-major C the row-major layout is used with B flagged as
// transposed; otherwise the column-major layout is used with A flagged as
// transposed. All sizes and spacings are converted to int through
// boost::numeric_cast.
3344  template< typename MT3 // Type of the left-hand side target matrix
3345  , typename MT4 // Type of the left-hand side matrix operand
3346  , typename MT5 // Type of the right-hand side matrix operand
3347  , typename ST2 > // Type of the scalar value
3348  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3349  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3350  {
3353  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3354  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3355  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3357  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3358  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3359  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3360 
3361  const int numRows  = boost::numeric_cast<int>( A.rows() );
3362  const int numCols  = boost::numeric_cast<int>( B.columns() );
3363  const int innerDim = boost::numeric_cast<int>( A.columns() );
3364  const int strideA  = boost::numeric_cast<int>( A.spacing() );
3365  const int strideB  = boost::numeric_cast<int>( B.spacing() );
3366  const int strideC  = boost::numeric_cast<int>( C.spacing() );
3367  const complex<double> productFactor( scalar );
3368  const complex<double> targetFactor ( 1.0, 0.0 );
3369 
3370  cblas_zgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
3371  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
3372  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
3373  numRows, numCols, innerDim, &productFactor, A.data(), strideA, B.data(), strideB, &targetFactor, C.data(), strideC );
3374  }
3375 #endif
3376  //**********************************************************************************************
3377 
3378  //**Addition assignment to sparse matrices******************************************************
3379  // No special implementation for the addition assignment to sparse matrices.
3380  //**********************************************************************************************
3381 
3382  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a scaled dense matrix product to a dense matrix.
//
// lhs: target dense matrix of either storage order; its size has to match the
//      size of rhs (checked by the internal assertions below).
// rhs: scaled multiplication expression whose value is subtracted from lhs.
//
// The two operands of the wrapped product (rhs.matrix_) are evaluated into
// objects of type LT and RT. The update itself is delegated: targets with fewer
// than DMATTDMATMULT_THRESHOLD elements use the default kernel selection, all
// other targets use the BLAS kernel selection. Both receive rhs.scalar_.
3394  template< typename MT3 // Type of the target dense matrix
3395  , bool SO > // Storage order of the target dense matrix
3396  friend inline void subAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3397  {
3399 
3400  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3401  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3402 
3403  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3404  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3405 
// Nothing to subtract if the target is empty or the inner dimension is zero.
3406  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3407  return;
3408  }
3409 
3410  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3411  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3412 
3413  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3414  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3415  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3416  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3417  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3418  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3419 
// Kernel dispatch based on the number of elements of the target matrix.
3420  if( (~lhs).rows() * (~lhs).columns() < DMATTDMATMULT_THRESHOLD )
3421  DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3422  else
3423  DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3424  }
3425  //**********************************************************************************************
3426 
3427  //**Default subtraction assignment to dense matrices********************************************
// Non-vectorized default kernel for the subtraction assignment of a scaled
// product.
//
// C      : target matrix the scaled product is subtracted from.
// A, B   : left and right operand of the product.
// scalar : scaling factor of the product.
//
// This overload takes part in overload resolution only if
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf). The
// scaled product is first evaluated into a ResultType object, which is then
// handed to subAssign().
3441  template< typename MT3 // Type of the left-hand side target matrix
3442  , typename MT4 // Type of the left-hand side matrix operand
3443  , typename MT5 // Type of the right-hand side matrix operand
3444  , typename ST2 > // Type of the scalar value
3445  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3446  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3447  {
3448  const ResultType scaledProduct( A * B * scalar );
3449  subAssign( C, scaledProduct );
3450  }
3451  //**********************************************************************************************
3452 
3453  //**Vectorized default subtraction assignment to row-major dense matrices***********************
3467  template< typename MT3 // Type of the left-hand side target matrix
3468  , typename MT4 // Type of the left-hand side matrix operand
3469  , typename MT5 // Type of the right-hand side matrix operand
3470  , typename ST2 > // Type of the scalar value
3471  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3472  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3473  {
3474  typedef IntrinsicTrait<ElementType> IT;
3475 
3476  const size_t M( A.rows() );
3477  const size_t N( B.columns() );
3478  const size_t K( A.columns() );
3479 
3480  size_t i( 0UL );
3481 
3482  for( ; (i+2UL) <= M; i+=2UL ) {
3483  size_t j( 0UL );
3484  for( ; (j+4UL) <= N; j+=4UL ) {
3485  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3486  for( size_t k=0UL; k<K; k+=IT::size ) {
3487  const IntrinsicType a1( A.get(i ,k) );
3488  const IntrinsicType a2( A.get(i+1UL,k) );
3489  const IntrinsicType b1( B.get(k,j ) );
3490  const IntrinsicType b2( B.get(k,j+1UL) );
3491  const IntrinsicType b3( B.get(k,j+2UL) );
3492  const IntrinsicType b4( B.get(k,j+3UL) );
3493  xmm1 = xmm1 + a1 * b1;
3494  xmm2 = xmm2 + a1 * b2;
3495  xmm3 = xmm3 + a1 * b3;
3496  xmm4 = xmm4 + a1 * b4;
3497  xmm5 = xmm5 + a2 * b1;
3498  xmm6 = xmm6 + a2 * b2;
3499  xmm7 = xmm7 + a2 * b3;
3500  xmm8 = xmm8 + a2 * b4;
3501  }
3502  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3503  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3504  (~C)(i ,j+2UL) -= sum( xmm3 ) * scalar;
3505  (~C)(i ,j+3UL) -= sum( xmm4 ) * scalar;
3506  (~C)(i+1UL,j ) -= sum( xmm5 ) * scalar;
3507  (~C)(i+1UL,j+1UL) -= sum( xmm6 ) * scalar;
3508  (~C)(i+1UL,j+2UL) -= sum( xmm7 ) * scalar;
3509  (~C)(i+1UL,j+3UL) -= sum( xmm8 ) * scalar;
3510  }
3511  for( ; (j+2UL) <= N; j+=2UL ) {
3512  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3513  for( size_t k=0UL; k<K; k+=IT::size ) {
3514  const IntrinsicType a1( A.get(i ,k) );
3515  const IntrinsicType a2( A.get(i+1UL,k) );
3516  const IntrinsicType b1( B.get(k,j ) );
3517  const IntrinsicType b2( B.get(k,j+1UL) );
3518  xmm1 = xmm1 + a1 * b1;
3519  xmm2 = xmm2 + a1 * b2;
3520  xmm3 = xmm3 + a2 * b1;
3521  xmm4 = xmm4 + a2 * b2;
3522  }
3523  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3524  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3525  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
3526  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
3527  }
3528  if( j < N ) {
3529  IntrinsicType xmm1, xmm2;
3530  for( size_t k=0UL; k<K; k+=IT::size ) {
3531  const IntrinsicType b1( B.get(k,j) );
3532  xmm1 = xmm1 + A.get(i ,k) * b1;
3533  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3534  }
3535  (~C)(i ,j) -= sum( xmm1 ) * scalar;
3536  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
3537  }
3538  }
3539  if( i < M ) {
3540  size_t j( 0UL );
3541  for( ; (j+4UL) <= N; j+=4UL ) {
3542  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3543  for( size_t k=0UL; k<K; k+=IT::size ) {
3544  const IntrinsicType a1( A.get(i,k) );
3545  xmm1 = xmm1 + a1 * B.get(k,j );
3546  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3547  xmm3 = xmm3 + a1 * B.get(k,j+2UL);
3548  xmm4 = xmm4 + a1 * B.get(k,j+3UL);
3549  }
3550  (~C)(i,j ) -= sum( xmm1 ) * scalar;
3551  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
3552  (~C)(i,j+2UL) -= sum( xmm3 ) * scalar;
3553  (~C)(i,j+3UL) -= sum( xmm4 ) * scalar;
3554  }
3555  for( ; (j+2UL) <= N; j+=2UL ) {
3556  IntrinsicType xmm1, xmm2;
3557  for( size_t k=0UL; k<K; k+=IT::size ) {
3558  const IntrinsicType a1( A.get(i,k) );
3559  xmm1 = xmm1 + a1 * B.get(k,j );
3560  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3561  }
3562  (~C)(i,j ) -= sum( xmm1 ) * scalar;
3563  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
3564  }
3565  if( j < N ) {
3566  IntrinsicType xmm1, xmm2;
3567  for( size_t k=0UL; k<K; k+=IT::size ) {
3568  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3569  }
3570  (~C)(i,j) -= sum( xmm1 ) * scalar;
3571  }
3572  }
3573  }
3574  //**********************************************************************************************
3575 
3576  //**Vectorized default subtraction assignment to column-major dense matrices********************
3590  template< typename MT3 // Type of the left-hand side target matrix
3591  , typename MT4 // Type of the left-hand side matrix operand
3592  , typename MT5 // Type of the right-hand side matrix operand
3593  , typename ST2 > // Type of the scalar value
3594  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3595  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3596  {
3597  typedef IntrinsicTrait<ElementType> IT;
3598 
3599  const size_t M( A.rows() );
3600  const size_t N( B.columns() );
3601  const size_t K( A.columns() );
3602 
3603  size_t i( 0UL );
3604 
3605  for( ; (i+4UL) <= M; i+=4UL ) {
3606  size_t j( 0UL );
3607  for( ; (j+2UL) <= N; j+=2UL ) {
3608  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3609  for( size_t k=0UL; k<K; k+=IT::size ) {
3610  const IntrinsicType a1( A.get(i ,k) );
3611  const IntrinsicType a2( A.get(i+1UL,k) );
3612  const IntrinsicType a3( A.get(i+2UL,k) );
3613  const IntrinsicType a4( A.get(i+3UL,k) );
3614  const IntrinsicType b1( B.get(k,j ) );
3615  const IntrinsicType b2( B.get(k,j+1UL) );
3616  xmm1 = xmm1 + a1 * b1;
3617  xmm2 = xmm2 + a1 * b2;
3618  xmm3 = xmm3 + a2 * b1;
3619  xmm4 = xmm4 + a2 * b2;
3620  xmm5 = xmm5 + a3 * b1;
3621  xmm6 = xmm6 + a3 * b2;
3622  xmm7 = xmm7 + a4 * b1;
3623  xmm8 = xmm8 + a4 * b2;
3624  }
3625  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3626  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3627  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
3628  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
3629  (~C)(i+2UL,j ) -= sum( xmm5 ) * scalar;
3630  (~C)(i+2UL,j+1UL) -= sum( xmm6 ) * scalar;
3631  (~C)(i+3UL,j ) -= sum( xmm7 ) * scalar;
3632  (~C)(i+3UL,j+1UL) -= sum( xmm8 ) * scalar;
3633  }
3634  if( j < N ) {
3635  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3636  for( size_t k=0UL; k<K; k+=IT::size ) {
3637  const IntrinsicType b1( B.get(k,j) );
3638  xmm1 = xmm1 + A.get(i ,k) * b1;
3639  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3640  xmm3 = xmm3 + A.get(i+2UL,k) * b1;
3641  xmm4 = xmm4 + A.get(i+3UL,k) * b1;
3642  }
3643  (~C)(i ,j) -= sum( xmm1 ) * scalar;
3644  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
3645  (~C)(i+2UL,j) -= sum( xmm3 ) * scalar;
3646  (~C)(i+3UL,j) -= sum( xmm4 ) * scalar;
3647  }
3648  }
3649  for( ; (i+2UL) <= M; i+=2UL ) {
3650  size_t j( 0UL );
3651  for( ; (j+2UL) <= N; j+=2UL ) {
3652  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3653  for( size_t k=0UL; k<K; k+=IT::size ) {
3654  const IntrinsicType a1( A.get(i ,k) );
3655  const IntrinsicType a2( A.get(i+1UL,k) );
3656  const IntrinsicType b1( B.get(k,j ) );
3657  const IntrinsicType b2( B.get(k,j+1UL) );
3658  xmm1 = xmm1 + a1 * b1;
3659  xmm2 = xmm2 + a1 * b2;
3660  xmm3 = xmm3 + a2 * b1;
3661  xmm4 = xmm4 + a2 * b2;
3662  }
3663  (~C)(i ,j ) -= sum( xmm1 ) * scalar;
3664  (~C)(i ,j+1UL) -= sum( xmm2 ) * scalar;
3665  (~C)(i+1UL,j ) -= sum( xmm3 ) * scalar;
3666  (~C)(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
3667  }
3668  if( j < N ) {
3669  IntrinsicType xmm1, xmm2;
3670  for( size_t k=0UL; k<K; k+=IT::size ) {
3671  const IntrinsicType b1( B.get(k,j) );
3672  xmm1 = xmm1 + A.get(i ,k) * b1;
3673  xmm2 = xmm2 + A.get(i+1UL,k) * b1;
3674  }
3675  (~C)(i ,j) -= sum( xmm1 ) * scalar;
3676  (~C)(i+1UL,j) -= sum( xmm2 ) * scalar;
3677  }
3678  }
3679  if( i < M ) {
3680  size_t j( 0UL );
3681  for( ; (j+2UL) <= N; j+=2UL ) {
3682  IntrinsicType xmm1, xmm2;
3683  for( size_t k=0UL; k<K; k+=IT::size ) {
3684  const IntrinsicType a1( A.get(i,k) );
3685  xmm1 = xmm1 + a1 * B.get(k,j );
3686  xmm2 = xmm2 + a1 * B.get(k,j+1UL);
3687  }
3688  (~C)(i,j ) -= sum( xmm1 ) * scalar;
3689  (~C)(i,j+1UL) -= sum( xmm2 ) * scalar;
3690  }
3691  if( j < N ) {
3692  IntrinsicType xmm1, xmm2;
3693  for( size_t k=0UL; k<K; k+=IT::size ) {
3694  xmm1 = xmm1 + A.get(i,k) * B.get(k,j);
3695  }
3696  (~C)(i,j) -= sum( xmm1 ) * scalar;
3697  }
3698  }
3699  }
3700  //**********************************************************************************************
3701 
3702  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback of the BLAS kernel selection for the subtraction assignment.
//
// C      : target matrix the scaled product is subtracted from.
// A, B   : left and right operand of the product.
// scalar : scaling factor of the product.
//
// Enabled if UseDefaultKernel<MT3,MT4,MT5,ST2> holds; in that case no BLAS
// routine is called and the work is forwarded unchanged to the default kernel
// selection.
3716  template< typename MT3 // Type of the left-hand side target matrix
3717  , typename MT4 // Type of the left-hand side matrix operand
3718  , typename MT5 // Type of the right-hand side matrix operand
3719  , typename ST2 > // Type of the scalar value
3720  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3721  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3722  {
3723  DMatScalarMultExpr::selectDefaultSubAssignKernel( C, A, B, scalar );
3724  }
3725  //**********************************************************************************************
3726 
3727  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
3728 #if BLAZE_BLAS_MODE
3729 
// BLAS kernel for C -= ( A * B ) * scalar on single precision matrices.
//
// C      : target dense matrix, updated in place by the BLAS call.
// A, B   : left and right operand of the product.
// scalar : scaling factor; its negation is passed to cblas_sgemm as alpha.
//
// beta is fixed to 1.0F, so the previous content of C is kept and the negated
// scaled product is added to it. For a row-major C the row-major layout is used
// with B flagged as transposed; otherwise the column-major layout is used with
// A flagged as transposed. All sizes and spacings are converted to int through
// boost::numeric_cast.
3742  template< typename MT3 // Type of the left-hand side target matrix
3743  , typename MT4 // Type of the left-hand side matrix operand
3744  , typename MT5 // Type of the right-hand side matrix operand
3745  , typename ST2 > // Type of the scalar value
3746  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3747  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3748  {
3751  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
3752  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
3753  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
3754 
3755  const int numRows  = boost::numeric_cast<int>( A.rows() );
3756  const int numCols  = boost::numeric_cast<int>( B.columns() );
3757  const int innerDim = boost::numeric_cast<int>( A.columns() );
3758  const int strideA  = boost::numeric_cast<int>( A.spacing() );
3759  const int strideB  = boost::numeric_cast<int>( B.spacing() );
3760  const int strideC  = boost::numeric_cast<int>( C.spacing() );
3761 
3762  cblas_sgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
3763  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
3764  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
3765  numRows, numCols, innerDim, -scalar, A.data(), strideA, B.data(), strideB, 1.0F, C.data(), strideC );
3766  }
3767 #endif
3768  //**********************************************************************************************
3769 
3770  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
3771 #if BLAZE_BLAS_MODE
3772 
// BLAS kernel for C -= ( A * B ) * scalar on double precision matrices.
//
// C      : target dense matrix, updated in place by the BLAS call.
// A, B   : left and right operand of the product.
// scalar : scaling factor; its negation is passed to cblas_dgemm as alpha.
//
// beta is fixed to 1.0, so the previous content of C is kept and the negated
// scaled product is added to it. For a row-major C the row-major layout is used
// with B flagged as transposed; otherwise the column-major layout is used with
// A flagged as transposed. All sizes and spacings are converted to int through
// boost::numeric_cast.
3785  template< typename MT3 // Type of the left-hand side target matrix
3786  , typename MT4 // Type of the left-hand side matrix operand
3787  , typename MT5 // Type of the right-hand side matrix operand
3788  , typename ST2 > // Type of the scalar value
3789  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3790  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3791  {
3794  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
3795  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
3796  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
3797 
3798  const int numRows  = boost::numeric_cast<int>( A.rows() );
3799  const int numCols  = boost::numeric_cast<int>( B.columns() );
3800  const int innerDim = boost::numeric_cast<int>( A.columns() );
3801  const int strideA  = boost::numeric_cast<int>( A.spacing() );
3802  const int strideB  = boost::numeric_cast<int>( B.spacing() );
3803  const int strideC  = boost::numeric_cast<int>( C.spacing() );
3804 
3805  cblas_dgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
3806  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
3807  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
3808  numRows, numCols, innerDim, -scalar, A.data(), strideA, B.data(), strideB, 1.0, C.data(), strideC );
3809  }
3810 #endif
3811  //**********************************************************************************************
3812 
3813  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
3814 #if BLAZE_BLAS_MODE
3815 
/*!\brief BLAS-based subtraction assignment kernel for single precision complex operands
//        (C -= scalar*A*B).
//
// \param C The target matrix, updated in place through C.data().
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor; its negation is converted to complex<float> and used as alpha.
//
// This overload is selected through EnableIf when UseSinglePrecisionComplexKernel<MT3,MT4,MT5>
// holds (the scalar type ST2 is not part of that condition). The work is delegated to
// cblas_cgemm with alpha = -scalar and beta = (1,0).
*/
3828  template< typename MT3 // Type of the left-hand side target matrix
3829  , typename MT4 // Type of the left-hand side matrix operand
3830  , typename MT5 // Type of the right-hand side matrix operand
3831  , typename ST2 > // Type of the scalar value
3832  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3833  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3834  {
3835  using boost::numeric_cast;
3836 
// Constraint macros: complex element types whose value_type is float, for all three matrices.
3837  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3838  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3839  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3841  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3842  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3843  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3844 
// CBLAS takes int arguments: the sizes and spacings are converted via boost::numeric_cast
// (a range-checked conversion according to the Boost documentation).
3845  const int M ( numeric_cast<int>( A.rows() ) );
3846  const int N ( numeric_cast<int>( B.columns() ) );
3847  const int K ( numeric_cast<int>( A.columns() ) );
3848  const int lda( numeric_cast<int>( A.spacing() ) );
3849  const int ldb( numeric_cast<int>( B.spacing() ) );
3850  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm interface receives alpha and beta by address, hence the named locals.
3851  const complex<float> alpha( -scalar );
3852  const complex<float> beta ( 1.0F, 0.0F );
3853 
// The storage order of the target type MT3 picks the CBLAS order and the transposition flags:
// row-major target -> (CblasRowMajor, NoTrans A, Trans B), otherwise (CblasColMajor, Trans A,
// NoTrans B).
// NOTE(review): this presumably relies on A being stored row-major and B column-major --
// confirm against the enclosing class.
3854  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3855  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3856  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3857  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3858  }
3859 #endif
3860  //**********************************************************************************************
3861 
3862  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
3863 #if BLAZE_BLAS_MODE
3864 
/*!\brief BLAS-based subtraction assignment kernel for double precision complex operands
//        (C -= scalar*A*B).
//
// \param C The target matrix, updated in place through C.data().
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scaling factor; its negation is converted to complex<double> and used as alpha.
//
// This overload is selected through EnableIf when UseDoublePrecisionComplexKernel<MT3,MT4,MT5>
// holds (the scalar type ST2 is not part of that condition). The work is delegated to
// cblas_zgemm with alpha = -scalar and beta = (1,0).
*/
3877  template< typename MT3 // Type of the left-hand side target matrix
3878  , typename MT4 // Type of the left-hand side matrix operand
3879  , typename MT5 // Type of the right-hand side matrix operand
3880  , typename ST2 > // Type of the scalar value
3881  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3882  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3883  {
3884  using boost::numeric_cast;
3885 
// Constraint macros: complex element types whose value_type is double, for all three matrices.
3886  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3887  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3888  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3890  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3891  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3892  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3893 
// CBLAS takes int arguments: the sizes and spacings are converted via boost::numeric_cast
// (a range-checked conversion according to the Boost documentation).
3894  const int M ( numeric_cast<int>( A.rows() ) );
3895  const int N ( numeric_cast<int>( B.columns() ) );
3896  const int K ( numeric_cast<int>( A.columns() ) );
3897  const int lda( numeric_cast<int>( A.spacing() ) );
3898  const int ldb( numeric_cast<int>( B.spacing() ) );
3899  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm interface receives alpha and beta by address, hence the named locals.
3900  const complex<double> alpha( -scalar );
3901  const complex<double> beta ( 1.0, 0.0 );
3902 
// The storage order of the target type MT3 picks the CBLAS order and the transposition flags:
// row-major target -> (CblasRowMajor, NoTrans A, Trans B), otherwise (CblasColMajor, Trans A,
// NoTrans B).
// NOTE(review): this presumably relies on A being stored row-major and B column-major --
// confirm against the enclosing class.
3903  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3904  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3905  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3906  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3907  }
3908 #endif
3909  //**********************************************************************************************
3910 
3911  //**Subtraction assignment to sparse matrices***************************************************
3912  // No special implementation for the subtraction assignment to sparse matrices.
3913  //**********************************************************************************************
3914 
3915  //**Multiplication assignment to dense matrices*************************************************
3916  // No special implementation for the multiplication assignment to dense matrices.
3917  //**********************************************************************************************
3918 
3919  //**Multiplication assignment to sparse matrices************************************************
3920  // No special implementation for the multiplication assignment to sparse matrices.
3921  //**********************************************************************************************
3922 
3923  //**Compile time checks*************************************************************************
3931  //**********************************************************************************************
3932 };
3934 //*************************************************************************************************
3935 
3936 
3937 
3938 
3939 //=================================================================================================
3940 //
3941 // GLOBAL BINARY ARITHMETIC OPERATORS
3942 //
3943 //=================================================================================================
3944 
3945 //*************************************************************************************************
/*!\brief Global binary operator that creates a DMatTDMatMultExpr from two dense matrix operands.
//
// \param lhs The left-hand side dense matrix.
// \param rhs The right-hand side dense matrix.
// \return A DMatTDMatMultExpr<T1,T2> constructed from ~lhs and ~rhs.
// \exception std::invalid_argument Matrix sizes do not match.
//
// The exception is thrown when the number of columns of the left operand differs from the
// number of rows of the right operand.
//
// NOTE(review): the function declarator (listing line 3977) is absent from this listing, so
// the operator name and the exact parameter types of lhs/rhs are not visible here. It is
// presumably operator*( const DenseMatrix<T1,false>& lhs, const DenseMatrix<T2,true>& rhs ) --
// restore it from the upstream header before compiling.
*/
3974 template< typename T1 // Type of the left-hand side dense matrix
3975  , typename T2 > // Type of the right-hand side dense matrix
3976 inline const DMatTDMatMultExpr<T1,T2>
3978 {
3980 
// Inner dimensions must agree for the product to be defined.
3981  if( (~lhs).columns() != (~rhs).rows() )
3982  throw std::invalid_argument( "Matrix sizes do not match" );
3983 
// NOTE(review): operator~ is presumably the Blaze conversion to the concrete matrix type -- confirm.
3984  return DMatTDMatMultExpr<T1,T2>( ~lhs, ~rhs );
3985 }
3986 //*************************************************************************************************
3987 
3988 
3989 
3990 
3991 //=================================================================================================
3992 //
3993 // GLOBAL OPERATORS
3994 //
3995 //=================================================================================================
3996 
3997 //*************************************************************************************************
/*!\brief Creates a view on a specific row of the given matrix/matrix multiplication expression.
//
// \param dm The DMatTDMatMultExpr to take the row from.
// \param index The index of the row.
// \return An expression of type RowExprTrait< DMatTDMatMultExpr<MT1,MT2> >::Type.
//
// The row is not taken from the evaluated product: the row view is applied to the left operand
// only and the result is multiplied with the complete right operand.
*/
4010 template< typename MT1 // Type of the left-hand side dense matrix
4011  , typename MT2 > // Type of the right-hand side dense matrix
4012 inline typename RowExprTrait< DMatTDMatMultExpr<MT1,MT2> >::Type
4013  row( const DMatTDMatMultExpr<MT1,MT2>& dm, size_t index )
4014 {
4016 
// row(A*B, i) is expressed as row(A, i) * B.
4017  return row( dm.leftOperand(), index ) * dm.rightOperand();
4018 }
4020 //*************************************************************************************************
4021 
4022 
4023 //*************************************************************************************************
/*!\brief Creates a view on a specific column of the given matrix/matrix multiplication expression.
//
// \param dm The DMatTDMatMultExpr to take the column from.
// \param index The index of the column.
// \return An expression of type ColumnExprTrait< DMatTDMatMultExpr<MT1,MT2> >::Type.
//
// The column is not taken from the evaluated product: the complete left operand is multiplied
// with the column view applied to the right operand only.
*/
4036 template< typename MT1 // Type of the left-hand side dense matrix
4037  , typename MT2 > // Type of the right-hand side dense matrix
4038 inline typename ColumnExprTrait< DMatTDMatMultExpr<MT1,MT2> >::Type
4039  column( const DMatTDMatMultExpr<MT1,MT2>& dm, size_t index )
4040 {
4042 
// column(A*B, j) is expressed as A * column(B, j).
4043  return dm.leftOperand() * column( dm.rightOperand(), index );
4044 }
4046 //*************************************************************************************************
4047 
4048 
4049 
4050 
4051 //=================================================================================================
4052 //
4053 // EXPRESSION TRAIT SPECIALIZATIONS
4054 //
4055 //=================================================================================================
4056 
4057 //*************************************************************************************************
// Specialization of DMatDVecMultExprTrait for a DMatTDMatMultExpr left operand and a vector
// type VT, i.e. the result type of (A*B)*v.
//
// If MT1 is a row-major dense matrix, MT2 a column-major dense matrix and VT a non-transpose
// dense vector, the nested Type is the result type of A*(B*v): the inner type comes from
// TDMatDVecMultExprTrait<MT2,VT>, the outer one from DMatDVecMultExprTrait<MT1,...>.
// Otherwise Type is INVALID_TYPE.
4059 template< typename MT1, typename MT2, typename VT >
4060 struct DMatDVecMultExprTrait< DMatTDMatMultExpr<MT1,MT2>, VT >
4061 {
4062  public:
4063  //**********************************************************************************************
4064  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4065  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
4066  IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
4067  , typename DMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
4068  , INVALID_TYPE >::Type Type;
4069  //**********************************************************************************************
4070 };
4072 //*************************************************************************************************
4073 
4074 
4075 //*************************************************************************************************
// Specialization of DMatSVecMultExprTrait for a DMatTDMatMultExpr left operand and a vector
// type VT, i.e. the result type of (A*B)*v for a sparse vector v.
//
// If MT1 is a row-major dense matrix, MT2 a column-major dense matrix and VT a non-transpose
// sparse vector, the nested Type is the result type of A*(B*v): the inner type comes from
// TDMatSVecMultExprTrait<MT2,VT>, the outer one from DMatDVecMultExprTrait<MT1,...>.
// Otherwise Type is INVALID_TYPE.
// NOTE(review): the outer trait is the dense-vector variant, presumably because B*v yields a
// dense vector -- confirm.
4077 template< typename MT1, typename MT2, typename VT >
4078 struct DMatSVecMultExprTrait< DMatTDMatMultExpr<MT1,MT2>, VT >
4079 {
4080  public:
4081  //**********************************************************************************************
4082  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4083  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
4084  IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
4085  , typename DMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
4086  , INVALID_TYPE >::Type Type;
4087  //**********************************************************************************************
4088 };
4090 //*************************************************************************************************
4091 
4092 
4093 //*************************************************************************************************
// Specialization of TDVecDMatMultExprTrait for a vector type VT and a DMatTDMatMultExpr right
// operand, i.e. the result type of v*(A*B).
//
// If VT is a transpose dense vector, MT1 a row-major dense matrix and MT2 a column-major dense
// matrix, the nested Type is the result type of (v*A)*B: the inner type comes from
// TDVecDMatMultExprTrait<VT,MT1>, the outer one from TDVecTDMatMultExprTrait<...,MT2>.
// Otherwise Type is INVALID_TYPE.
4095 template< typename VT, typename MT1, typename MT2 >
4096 struct TDVecDMatMultExprTrait< VT, DMatTDMatMultExpr<MT1,MT2> >
4097 {
4098  public:
4099  //**********************************************************************************************
4100  typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4101  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4102  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4103  , typename TDVecTDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4104  , INVALID_TYPE >::Type Type;
4105  //**********************************************************************************************
4106 };
4108 //*************************************************************************************************
4109 
4110 
4111 //*************************************************************************************************
// Specialization of TSVecDMatMultExprTrait for a vector type VT and a DMatTDMatMultExpr right
// operand, i.e. the result type of v*(A*B) for a sparse vector v.
//
// If VT is a transpose sparse vector, MT1 a row-major dense matrix and MT2 a column-major dense
// matrix, the nested Type is the result type of (v*A)*B: the inner type comes from
// TSVecDMatMultExprTrait<VT,MT1>, the outer one from TDVecTDMatMultExprTrait<...,MT2>.
// Otherwise Type is INVALID_TYPE.
// NOTE(review): the outer trait is the dense-vector variant, presumably because v*A yields a
// dense vector -- confirm.
4113 template< typename VT, typename MT1, typename MT2 >
4114 struct TSVecDMatMultExprTrait< VT, DMatTDMatMultExpr<MT1,MT2> >
4115 {
4116  public:
4117  //**********************************************************************************************
4118  typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4119  IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
4120  IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
4121  , typename TDVecTDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4122  , INVALID_TYPE >::Type Type;
4123  //**********************************************************************************************
4124 };
4126 //*************************************************************************************************
4127 
4128 
4129 //*************************************************************************************************
// Specialization of RowExprTrait for DMatTDMatMultExpr: the type of a row of A*B is defined as
// the multiplication result type (MultExprTrait) of a row of the const left operand type MT1
// and the right operand type MT2. This is the return type of the row() overload taking a
// DMatTDMatMultExpr.
4131 template< typename MT1, typename MT2 >
4132 struct RowExprTrait< DMatTDMatMultExpr<MT1,MT2> >
4133 {
4134  public:
4135  //**********************************************************************************************
4136  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
4137  //**********************************************************************************************
4138 };
4140 //*************************************************************************************************
4141 
4142 
4143 //*************************************************************************************************
// Specialization of ColumnExprTrait for DMatTDMatMultExpr: the type of a column of A*B is
// defined as the multiplication result type (MultExprTrait) of the left operand type MT1 and a
// column of the const right operand type MT2. This is the return type of the column() overload
// taking a DMatTDMatMultExpr.
4145 template< typename MT1, typename MT2 >
4146 struct ColumnExprTrait< DMatTDMatMultExpr<MT1,MT2> >
4147 {
4148  public:
4149  //**********************************************************************************************
4150  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
4151  //**********************************************************************************************
4152 };
4154 //*************************************************************************************************
4155 
4156 } // namespace blaze
4157 
4158 #endif