All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
20 //=================================================================================================
21 
22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
24 
25 
26 //*************************************************************************************************
27 // Includes
28 //*************************************************************************************************
29 
30 #include <stdexcept>
31 #include <boost/cast.hpp>
38 #include <blaze/math/Intrinsics.h>
39 #include <blaze/math/shims/Reset.h>
59 #include <blaze/system/BLAS.h>
61 #include <blaze/util/Assert.h>
62 #include <blaze/util/Complex.h>
67 #include <blaze/util/DisableIf.h>
68 #include <blaze/util/EnableIf.h>
69 #include <blaze/util/InvalidType.h>
71 #include <blaze/util/SelectType.h>
72 #include <blaze/util/Types.h>
78 
79 
80 namespace blaze {
81 
82 //=================================================================================================
83 //
84 // CLASS TDMATDMATMULTEXPR
85 //
86 //=================================================================================================
87 
88 //*************************************************************************************************
95 template< typename MT1 // Type of the left-hand side dense matrix
96  , typename MT2 > // Type of the right-hand side dense matrix
97 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
98  , private Expression
99  , private Computation
100 {
101  private:
102  //**Type definitions****************************************************************************
103  typedef typename MT1::ResultType RT1;
104  typedef typename MT2::ResultType RT2;
105  typedef typename MT1::CompositeType CT1;
106  typedef typename MT2::CompositeType CT2;
107  //**********************************************************************************************
108 
109  //**********************************************************************************************
111 
112 
// Compile-time switch that enables the single precision BLAS kernel (the selectBlasAssignKernel
// overload calling cblas_sgemm is enabled through this struct).
// NOTE(review): the struct body (listing lines 116-118) is absent from this listing, although
// UseDefaultKernel reads a nested 'value' from this struct. Presumably the definition was elided
// when the listing was generated -- confirm against the original header.
114  template< typename T1, typename T2, typename T3 >
115  struct UseSinglePrecisionKernel {
119  };
121  //**********************************************************************************************
122 
123  //**********************************************************************************************
125 
126 
// Compile-time switch that enables the double precision BLAS kernel (the selectBlasAssignKernel
// overload calling cblas_dgemm is enabled through this struct).
// NOTE(review): the struct body (listing lines 130-132) is absent from this listing, although
// UseDefaultKernel reads a nested 'value' from this struct. Presumably the definition was elided
// when the listing was generated -- confirm against the original header.
128  template< typename T1, typename T2, typename T3 >
129  struct UseDoublePrecisionKernel {
133  };
135  //**********************************************************************************************
136 
137  //**********************************************************************************************
139 
140 
143  template< typename T1, typename T2, typename T3 >
144  struct UseSinglePrecisionComplexKernel {
145  typedef complex<float> Type;
146  enum { value = IsSame<typename T1::ElementType,Type>::value &&
147  IsSame<typename T2::ElementType,Type>::value &&
148  IsSame<typename T3::ElementType,Type>::value };
149  };
151  //**********************************************************************************************
152 
153  //**********************************************************************************************
155 
156 
159  template< typename T1, typename T2, typename T3 >
160  struct UseDoublePrecisionComplexKernel {
161  typedef complex<double> Type;
162  enum { value = IsSame<typename T1::ElementType,Type>::value &&
163  IsSame<typename T2::ElementType,Type>::value &&
164  IsSame<typename T3::ElementType,Type>::value };
165  };
167  //**********************************************************************************************
168 
169  //**********************************************************************************************
171 
172 
174  template< typename T1, typename T2, typename T3 >
175  struct UseDefaultKernel {
176  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
177  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
178  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
179  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
180  };
182  //**********************************************************************************************
183 
184  //**********************************************************************************************
186 
187 
189  template< typename T1, typename T2, typename T3 >
190  struct UseVectorizedDefaultKernel {
191  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
192  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
193  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
194  IntrinsicTrait<typename T1::ElementType>::addition &&
195  IntrinsicTrait<typename T1::ElementType>::multiplication };
196  };
198  //**********************************************************************************************
199 
200  public:
201  //**Type definitions****************************************************************************
204  typedef typename ResultType::OppositeType OppositeType;
205  typedef typename ResultType::TransposeType TransposeType;
206  typedef typename ResultType::ElementType ElementType;
208  typedef const ElementType ReturnType;
209  typedef const ResultType CompositeType;
210 
212  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
213 
215  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
216 
218  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
219 
221  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
222  //**********************************************************************************************
223 
224  //**Compilation flags***************************************************************************
226  enum { vectorizable = 0 };
227  //**********************************************************************************************
228 
229  //**Constructor*********************************************************************************
235  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs )
236  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
237  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
238  {
239  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
240  }
241  //**********************************************************************************************
242 
243  //**Access operator*****************************************************************************
250  inline ReturnType operator()( size_t i, size_t j ) const {
251  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
252  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
253 
254  ElementType tmp;
255 
256  if( lhs_.columns() != 0UL ) {
257  const size_t end( ( ( lhs_.columns()-1UL ) & size_t(-2) ) + 1UL );
258  tmp = lhs_(i,0UL) * rhs_(0UL,j);
259  for( size_t k=1UL; k<end; k+=2UL ) {
260  tmp += lhs_(i,k ) * rhs_(k ,j);
261  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
262  }
263  if( end < lhs_.columns() ) {
264  tmp += lhs_(i,end) * rhs_(end,j);
265  }
266  }
267  else {
268  reset( tmp );
269  }
270 
271  return tmp;
272  }
273  //**********************************************************************************************
274 
275  //**Rows function*******************************************************************************
280  inline size_t rows() const {
281  return lhs_.rows();
282  }
283  //**********************************************************************************************
284 
285  //**Columns function****************************************************************************
290  inline size_t columns() const {
291  return rhs_.columns();
292  }
293  //**********************************************************************************************
294 
295  //**Left operand access*************************************************************************
300  inline LeftOperand leftOperand() const {
301  return lhs_;
302  }
303  //**********************************************************************************************
304 
305  //**Right operand access************************************************************************
310  inline RightOperand rightOperand() const {
311  return rhs_;
312  }
313  //**********************************************************************************************
314 
315  //**********************************************************************************************
321  template< typename T >
322  inline bool canAlias( const T* alias ) const {
323  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
324  }
325  //**********************************************************************************************
326 
327  //**********************************************************************************************
333  template< typename T >
334  inline bool isAliased( const T* alias ) const {
335  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
336  }
337  //**********************************************************************************************
338 
339  private:
340  //**Member variables****************************************************************************
343  //**********************************************************************************************
344 
345  //**Assignment to dense matrices****************************************************************
354  template< typename MT // Type of the target dense matrix
355  , bool SO > // Storage order of the target dense matrix
356  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
357  {
359 
360  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
361  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
362 
363  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
364  return;
365  }
366  else if( rhs.lhs_.columns() == 0UL ) {
367  reset( ~lhs );
368  return;
369  }
370 
371  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
372  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
373 
374  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
375  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
376  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
377  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
378  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
379  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
380 
381  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
382  TDMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
383  else
384  TDMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
385  }
387  //**********************************************************************************************
388 
389  //**Default assignment to dense matrices********************************************************
403  template< typename MT3 // Type of the left-hand side target matrix
404  , typename MT4 // Type of the left-hand side matrix operand
405  , typename MT5 > // Type of the right-hand side matrix operand
406  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
407  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
408  {
409  const size_t M( A.rows() );
410  const size_t N( B.columns() );
411  const size_t K( A.columns() );
412 
413  for( size_t i=0UL; i<M; ++i ) {
414  for( size_t j=0UL; j<N; ++j ) {
415  C(i,j) = A(i,0UL) * B(0UL,j);
416  }
417  for( size_t k=1UL; k<K; ++k ) {
418  for( size_t j=0UL; j<N; ++j ) {
419  C(i,j) += A(i,k) * B(k,j);
420  }
421  }
422  }
423  }
425  //**********************************************************************************************
426 
427  //**Vectorized default assignment to row-major dense matrices***********************************
// Vectorized default assignment kernel for a row-major target: computes C = A * B with
// intrinsic operations. Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5>::value is true.
//   C: target matrix; results are written with store() to the address of (~C)(i,j)
//   A: left-hand side operand; read element-wise as A(i,k) and wrapped with set()
//   B: right-hand side operand; read as intrinsic vectors with B.get(k,j)
// The column index j advances in steps of 8, 4 and 2 intrinsic vectors, followed by one final
// single-vector step. Within the 4/2/1-vector steps the rows are processed in pairs, with a
// separate branch for a leftover single row.
// NOTE(review): the xmm accumulators are used without explicit initialization; presumably the
// default constructor of IntrinsicType yields zero -- confirm.
441  template< typename MT3 // Type of the left-hand side target matrix
442  , typename MT4 // Type of the left-hand side matrix operand
443  , typename MT5 > // Type of the right-hand side matrix operand
444  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
445  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
446  {
447  typedef IntrinsicTrait<ElementType> IT;
448 
449  const size_t M( A.rows() );
// N is taken from B.spacing(), not B.columns().
450  const size_t N( B.spacing() );
451  const size_t K( A.columns() );
452 
453  size_t j( 0UL );
454 
// Column blocks of 8 intrinsic vectors, one row of C per iteration.
455  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
456  for( size_t i=0UL; i<M; ++i ) {
457  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
458  for( size_t k=0UL; k<K; ++k ) {
459  const IntrinsicType a1( set( A(i,k) ) );
460  xmm1 = xmm1 + a1 * B.get(k,j );
461  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
462  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
463  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
464  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
465  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
466  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
467  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
468  }
469  store( &(~C)(i,j ), xmm1 );
470  store( &(~C)(i,j+IT::size ), xmm2 );
471  store( &(~C)(i,j+IT::size*2UL), xmm3 );
472  store( &(~C)(i,j+IT::size*3UL), xmm4 );
473  store( &(~C)(i,j+IT::size*4UL), xmm5 );
474  store( &(~C)(i,j+IT::size*5UL), xmm6 );
475  store( &(~C)(i,j+IT::size*6UL), xmm7 );
476  store( &(~C)(i,j+IT::size*7UL), xmm8 );
477  }
478  }
// Column blocks of 4 intrinsic vectors; two rows of C per iteration (xmm1-4: row i, xmm5-8: row i+1).
479  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
480  size_t i( 0UL );
481  for( ; (i+2UL) <= M; i+=2UL ) {
482  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
483  for( size_t k=0UL; k<K; ++k ) {
484  const IntrinsicType a1( set( A(i ,k) ) );
485  const IntrinsicType a2( set( A(i+1UL,k) ) );
486  const IntrinsicType b1( B.get(k,j ) );
487  const IntrinsicType b2( B.get(k,j+IT::size ) );
488  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
489  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
490  xmm1 = xmm1 + a1 * b1;
491  xmm2 = xmm2 + a1 * b2;
492  xmm3 = xmm3 + a1 * b3;
493  xmm4 = xmm4 + a1 * b4;
494  xmm5 = xmm5 + a2 * b1;
495  xmm6 = xmm6 + a2 * b2;
496  xmm7 = xmm7 + a2 * b3;
497  xmm8 = xmm8 + a2 * b4;
498  }
499  store( &(~C)(i ,j ), xmm1 );
500  store( &(~C)(i ,j+IT::size ), xmm2 );
501  store( &(~C)(i ,j+IT::size*2UL), xmm3 );
502  store( &(~C)(i ,j+IT::size*3UL), xmm4 );
503  store( &(~C)(i+1UL,j ), xmm5 );
504  store( &(~C)(i+1UL,j+IT::size ), xmm6 );
505  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
506  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
507  }
// Leftover single row when M is odd.
508  if( i < M ) {
509  IntrinsicType xmm1, xmm2, xmm3, xmm4;
510  for( size_t k=0UL; k<K; ++k ) {
511  const IntrinsicType a1( set( A(i,k) ) );
512  xmm1 = xmm1 + a1 * B.get(k,j );
513  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
514  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
515  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
516  }
517  store( &(~C)(i,j ), xmm1 );
518  store( &(~C)(i,j+IT::size ), xmm2 );
519  store( &(~C)(i,j+IT::size*2UL), xmm3 );
520  store( &(~C)(i,j+IT::size*3UL), xmm4 );
521  }
522  }
// Column blocks of 2 intrinsic vectors; two rows of C per iteration (xmm1-2: row i, xmm3-4: row i+1).
523  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
524  size_t i( 0UL );
525  for( ; (i+2UL) <= M; i+=2UL ) {
526  IntrinsicType xmm1, xmm2, xmm3, xmm4;
527  for( size_t k=0UL; k<K; ++k ) {
528  const IntrinsicType a1( set( A(i ,k) ) );
529  const IntrinsicType a2( set( A(i+1UL,k) ) );
530  const IntrinsicType b1( B.get(k,j ) );
531  const IntrinsicType b2( B.get(k,j+IT::size) );
532  xmm1 = xmm1 + a1 * b1;
533  xmm2 = xmm2 + a1 * b2;
534  xmm3 = xmm3 + a2 * b1;
535  xmm4 = xmm4 + a2 * b2;
536  }
537  store( &(~C)(i ,j ), xmm1 );
538  store( &(~C)(i ,j+IT::size), xmm2 );
539  store( &(~C)(i+1UL,j ), xmm3 );
540  store( &(~C)(i+1UL,j+IT::size), xmm4 );
541  }
// Leftover single row when M is odd.
542  if( i < M ) {
543  IntrinsicType xmm1, xmm2;
544  for( size_t k=0UL; k<K; ++k ) {
545  const IntrinsicType a1( set( A(i,k) ) );
546  xmm1 = xmm1 + a1 * B.get(k,j );
547  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
548  }
549  store( &(~C)(i,j ), xmm1 );
550  store( &(~C)(i,j+IT::size), xmm2 );
551  }
552  }
// Final step: a single intrinsic vector at column j.
// NOTE(review): this branch is not a loop, so it covers at most one more vector; presumably
// B.spacing() is a multiple of IT::size so nothing is left afterwards -- confirm.
553  if( j < N ) {
554  size_t i( 0UL );
555  for( ; (i+2UL) <= M; i+=2UL ) {
556  IntrinsicType xmm1, xmm2;
557  for( size_t k=0UL; k<K; ++k ) {
558  const IntrinsicType b1( B.get(k,j) );
559  xmm1 = xmm1 + set( A(i ,k) ) * b1;
560  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
561  }
562  store( &(~C)(i ,j), xmm1 );
563  store( &(~C)(i+1UL,j), xmm2 );
564  }
// Leftover single row when M is odd.
565  if( i < M ) {
566  IntrinsicType xmm1;
567  for( size_t k=0UL; k<K; ++k ) {
568  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
569  }
570  store( &(~C)(i,j), xmm1 );
571  }
572  }
573  }
575  //**********************************************************************************************
576 
577  //**Vectorized default assignment to column-major dense matrices********************************
// Vectorized default assignment kernel for a column-major target: computes C = A * B with
// intrinsic operations. Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5>::value is true.
//   C: target matrix; results are written with store() to the address of (~C)(i,j)
//   A: left-hand side operand; read as intrinsic vectors with A.get(i,k)
//   B: right-hand side operand; read element-wise as B(k,j) and wrapped with set()
// The row index i advances in steps of 8, 4 and 2 intrinsic vectors, followed by one final
// single-vector step. Within the 4/2/1-vector steps the columns are processed in pairs, with a
// separate branch for a leftover single column.
// NOTE(review): the xmm accumulators are used without explicit initialization; presumably the
// default constructor of IntrinsicType yields zero -- confirm.
591  template< typename MT3 // Type of the left-hand side target matrix
592  , typename MT4 // Type of the left-hand side matrix operand
593  , typename MT5 > // Type of the right-hand side matrix operand
594  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
595  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
596  {
597  typedef IntrinsicTrait<ElementType> IT;
598 
// M is taken from A.spacing(), not A.rows().
599  const size_t M( A.spacing() );
600  const size_t N( B.columns() );
601  const size_t K( A.columns() );
602 
603  size_t i( 0UL );
604 
// Row blocks of 8 intrinsic vectors, one column of C per iteration.
605  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
606  for( size_t j=0UL; j<N; ++j ) {
607  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
608  for( size_t k=0UL; k<K; ++k ) {
609  const IntrinsicType b1( set( B(k,j) ) );
610  xmm1 = xmm1 + A.get(i ,k) * b1;
611  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
612  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
613  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
614  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
615  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
616  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
617  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
618  }
619  store( &(~C)(i ,j), xmm1 );
620  store( &(~C)(i+IT::size ,j), xmm2 );
621  store( &(~C)(i+IT::size*2UL,j), xmm3 );
622  store( &(~C)(i+IT::size*3UL,j), xmm4 );
623  store( &(~C)(i+IT::size*4UL,j), xmm5 );
624  store( &(~C)(i+IT::size*5UL,j), xmm6 );
625  store( &(~C)(i+IT::size*6UL,j), xmm7 );
626  store( &(~C)(i+IT::size*7UL,j), xmm8 );
627  }
628  }
// Row blocks of 4 intrinsic vectors; two columns of C per iteration (xmm1-4: column j, xmm5-8: column j+1).
629  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
630  size_t j( 0UL );
631  for( ; (j+2UL) <= N; j+=2UL ) {
632  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
633  for( size_t k=0UL; k<K; ++k ) {
634  const IntrinsicType a1( A.get(i ,k) );
635  const IntrinsicType a2( A.get(i+IT::size ,k) );
636  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
637  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
638  const IntrinsicType b1( set( B(k,j ) ) );
639  const IntrinsicType b2( set( B(k,j+1UL) ) );
640  xmm1 = xmm1 + a1 * b1;
641  xmm2 = xmm2 + a2 * b1;
642  xmm3 = xmm3 + a3 * b1;
643  xmm4 = xmm4 + a4 * b1;
644  xmm5 = xmm5 + a1 * b2;
645  xmm6 = xmm6 + a2 * b2;
646  xmm7 = xmm7 + a3 * b2;
647  xmm8 = xmm8 + a4 * b2;
648  }
649  store( &(~C)(i ,j ), xmm1 );
650  store( &(~C)(i+IT::size ,j ), xmm2 );
651  store( &(~C)(i+IT::size*2UL,j ), xmm3 );
652  store( &(~C)(i+IT::size*3UL,j ), xmm4 );
653  store( &(~C)(i ,j+1UL), xmm5 );
654  store( &(~C)(i+IT::size ,j+1UL), xmm6 );
655  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
656  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
657  }
// Leftover single column when N is odd.
658  if( j < N ) {
659  IntrinsicType xmm1, xmm2, xmm3, xmm4;
660  for( size_t k=0UL; k<K; ++k ) {
661  const IntrinsicType b1( set( B(k,j) ) );
662  xmm1 = xmm1 + A.get(i ,k) * b1;
663  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
664  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
665  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
666  }
667  store( &(~C)(i ,j), xmm1 );
668  store( &(~C)(i+IT::size ,j), xmm2 );
669  store( &(~C)(i+IT::size*2UL,j), xmm3 );
670  store( &(~C)(i+IT::size*3UL,j), xmm4 );
671  }
672  }
// Row blocks of 2 intrinsic vectors; two columns of C per iteration (xmm1-2: column j, xmm3-4: column j+1).
673  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
674  size_t j( 0UL );
675  for( ; (j+2UL) <= N; j+=2UL ) {
676  IntrinsicType xmm1, xmm2, xmm3, xmm4;
677  for( size_t k=0UL; k<K; ++k ) {
678  const IntrinsicType a1( A.get(i ,k) );
679  const IntrinsicType a2( A.get(i+IT::size,k) );
680  const IntrinsicType b1( set( B(k,j ) ) );
681  const IntrinsicType b2( set( B(k,j+1UL) ) );
682  xmm1 = xmm1 + a1 * b1;
683  xmm2 = xmm2 + a2 * b1;
684  xmm3 = xmm3 + a1 * b2;
685  xmm4 = xmm4 + a2 * b2;
686  }
687  store( &(~C)(i ,j ), xmm1 );
688  store( &(~C)(i+IT::size,j ), xmm2 );
689  store( &(~C)(i ,j+1UL), xmm3 );
690  store( &(~C)(i+IT::size,j+1UL), xmm4 );
691  }
// Leftover single column when N is odd.
692  if( j < N ) {
693  IntrinsicType xmm1, xmm2;
694  for( size_t k=0UL; k<K; ++k ) {
695  const IntrinsicType b1( set( B(k,j) ) );
696  xmm1 = xmm1 + A.get(i ,k) * b1;
697  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
698  }
699  store( &(~C)(i ,j), xmm1 );
700  store( &(~C)(i+IT::size,j), xmm2 );
701  }
702  }
// Final step: a single intrinsic vector at row i.
// NOTE(review): this branch is not a loop, so it covers at most one more vector; presumably
// A.spacing() is a multiple of IT::size so nothing is left afterwards -- confirm.
703  if( i < M ) {
704  size_t j( 0UL );
705  for( ; (j+2UL) <= N; j+=2UL ) {
706  IntrinsicType xmm1, xmm2;
707  for( size_t k=0UL; k<K; ++k ) {
708  const IntrinsicType a1( A.get(i,k) );
709  xmm1 = xmm1 + a1 * set( B(k,j ) );
710  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
711  }
712  store( &(~C)(i,j ), xmm1 );
713  store( &(~C)(i,j+1UL), xmm2 );
714  }
// Leftover single column when N is odd.
715  if( j < N ) {
716  IntrinsicType xmm1;
717  for( size_t k=0UL; k<K; ++k ) {
718  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
719  }
720  store( &(~C)(i,j), xmm1 );
721  }
722  }
723  }
725  //**********************************************************************************************
726 
727  //**BLAS-based assignment to dense matrices (default)*******************************************
741  template< typename MT3 // Type of the left-hand side target matrix
742  , typename MT4 // Type of the left-hand side matrix operand
743  , typename MT5 > // Type of the right-hand side matrix operand
744  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
745  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
746  {
747  selectDefaultAssignKernel( C, A, B );
748  }
750  //**********************************************************************************************
751 
752  //**BLAS-based assignment to dense matrices (single precision)**********************************
753 #if BLAZE_BLAS_MODE
754 
767  template< typename MT3 // Type of the left-hand side target matrix
768  , typename MT4 // Type of the left-hand side matrix operand
769  , typename MT5 > // Type of the right-hand side matrix operand
770  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
771  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
772  {
773  using boost::numeric_cast;
774 
775  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
776  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
777  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
778 
779  const int M ( numeric_cast<int>( A.rows() ) );
780  const int N ( numeric_cast<int>( B.columns() ) );
781  const int K ( numeric_cast<int>( A.columns() ) );
782  const int lda( numeric_cast<int>( A.spacing() ) );
783  const int ldb( numeric_cast<int>( B.spacing() ) );
784  const int ldc( numeric_cast<int>( C.spacing() ) );
785 
786  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
787  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
788  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
789  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
790  }
792 #endif
793  //**********************************************************************************************
794 
795  //**BLAS-based assignment to dense matrices (double precision)**********************************
796 #if BLAZE_BLAS_MODE
797 
810  template< typename MT3 // Type of the left-hand side target matrix
811  , typename MT4 // Type of the left-hand side matrix operand
812  , typename MT5 > // Type of the right-hand side matrix operand
813  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
814  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
815  {
816  using boost::numeric_cast;
817 
818  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
819  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
820  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
821 
822  const int M ( numeric_cast<int>( A.rows() ) );
823  const int N ( numeric_cast<int>( B.columns() ) );
824  const int K ( numeric_cast<int>( A.columns() ) );
825  const int lda( numeric_cast<int>( A.spacing() ) );
826  const int ldb( numeric_cast<int>( B.spacing() ) );
827  const int ldc( numeric_cast<int>( C.spacing() ) );
828 
829  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
830  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
831  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
832  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
833  }
835 #endif
836  //**********************************************************************************************
837 
838  //**BLAS-based assignment to dense matrices (single precision complex)**************************
839 #if BLAZE_BLAS_MODE
840 
853  template< typename MT3 // Type of the left-hand side target matrix
854  , typename MT4 // Type of the left-hand side matrix operand
855  , typename MT5 > // Type of the right-hand side matrix operand
856  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
857  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
858  {
859  using boost::numeric_cast;
860 
861  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
862  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
863  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
864  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
865  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
866  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
867 
868  const int M ( numeric_cast<int>( A.rows() ) );
869  const int N ( numeric_cast<int>( B.columns() ) );
870  const int K ( numeric_cast<int>( A.columns() ) );
871  const int lda( numeric_cast<int>( A.spacing() ) );
872  const int ldb( numeric_cast<int>( B.spacing() ) );
873  const int ldc( numeric_cast<int>( C.spacing() ) );
874  const complex<float> alpha( 1.0F, 0.0F );
875  const complex<float> beta ( 0.0F, 0.0F );
876 
877  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
878  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
879  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
880  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
881  }
883 #endif
884  //**********************************************************************************************
885 
886  //**BLAS-based assignment to dense matrices (double precision complex)**************************
887 #if BLAZE_BLAS_MODE
888 
901  template< typename MT3 // Type of the left-hand side target matrix
902  , typename MT4 // Type of the left-hand side matrix operand
903  , typename MT5 > // Type of the right-hand side matrix operand
904  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
905  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
906  {
907  using boost::numeric_cast;
908 
909  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
910  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
911  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
912  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
913  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
914  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
915 
916  const int M ( numeric_cast<int>( A.rows() ) );
917  const int N ( numeric_cast<int>( B.columns() ) );
918  const int K ( numeric_cast<int>( A.columns() ) );
919  const int lda( numeric_cast<int>( A.spacing() ) );
920  const int ldb( numeric_cast<int>( B.spacing() ) );
921  const int ldc( numeric_cast<int>( C.spacing() ) );
922  const complex<double> alpha( 1.0, 0.0 );
923  const complex<double> beta ( 0.0, 0.0 );
924 
925  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
926  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
927  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
928  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
929  }
931 #endif
932  //**********************************************************************************************
933 
934  //**Assignment to sparse matrices***************************************************************
946  template< typename MT // Type of the target sparse matrix
947  , bool SO > // Storage order of the target sparse matrix
948  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
949  {
951 
952  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
953 
959  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
960 
961  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
962  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
963 
964  const TmpType tmp( rhs );
965  assign( ~lhs, tmp );
966  }
968  //**********************************************************************************************
969 
970  //**Addition assignment to dense matrices*******************************************************
983  template< typename MT // Type of the target dense matrix
984  , bool SO > // Storage order of the target dense matrix
985  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
986  {
988 
989  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
990  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
991 
992  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
993  return;
994  }
995 
996  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
997  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
998 
999  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1000  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1001  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1002  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1003  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1004  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1005 
1006  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
1007  TDMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
1008  else
1009  TDMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
1010  }
1012  //**********************************************************************************************
1013 
1014  //**Default addition assignment to dense matrices***********************************************
1028  template< typename MT3 // Type of the left-hand side target matrix
1029  , typename MT4 // Type of the left-hand side matrix operand
1030  , typename MT5 > // Type of the right-hand side matrix operand
1031  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1032  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1033  {
1034  const size_t M( A.rows() );
1035  const size_t N( B.columns() );
1036  const size_t K( A.columns() );
1037 
1038  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1039  const size_t end( N & size_t(-2) );
1040 
1041  for( size_t i=0UL; i<M; ++i ) {
1042  for( size_t k=0UL; k<K; ++k ) {
1043  for( size_t j=0UL; j<end; j+=2UL ) {
1044  C(i,j ) += A(i,k) * B(k,j );
1045  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1046  }
1047  if( end < N ) {
1048  C(i,end) += A(i,k) * B(k,end);
1049  }
1050  }
1051  }
1052  }
1054  //**********************************************************************************************
1055 
1056  //**Vectorized default addition assignment to row-major dense matrices**************************
// Vectorized default kernel for the addition assignment C += A * B to a row-major target.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// The columns of C are traversed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements. For each block the current values of C are loaded into accumulators,
// the products of set( A(i,k) ) and the vectors B.get(k,...) are added for all k, and the
// accumulators are stored back to C.
1070  template< typename MT3 // Type of the left-hand side target matrix
1071  , typename MT4 // Type of the left-hand side matrix operand
1072  , typename MT5 > // Type of the right-hand side matrix operand
1073  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1074  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1075  {
1076  typedef IntrinsicTrait<ElementType> IT;
1077 
1078  const size_t M( A.rows() );
// NOTE(review): N is taken from B.spacing() instead of B.columns(); presumably the spacing
// includes padding so that only whole intrinsic vectors are processed - confirm.
1079  const size_t N( B.spacing() );
1080  const size_t K( A.columns() );
1081 
1082  size_t j( 0UL );
1083 
// Column blocks of eight intrinsic vectors, one row of C per iteration
1084  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1085  for( size_t i=0UL; i<M; ++i ) {
1086  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1087  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1088  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1089  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1090  IntrinsicType xmm5( load( &(~C)(i,j+IT::size*4UL) ) );
1091  IntrinsicType xmm6( load( &(~C)(i,j+IT::size*5UL) ) );
1092  IntrinsicType xmm7( load( &(~C)(i,j+IT::size*6UL) ) );
1093  IntrinsicType xmm8( load( &(~C)(i,j+IT::size*7UL) ) );
1094  for( size_t k=0UL; k<K; ++k ) {
1095  const IntrinsicType a1( set( A(i,k) ) );
1096  xmm1 = xmm1 + a1 * B.get(k,j );
1097  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1098  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1099  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1100  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
1101  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
1102  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
1103  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
1104  }
1105  store( &(~C)(i,j ), xmm1 );
1106  store( &(~C)(i,j+IT::size ), xmm2 );
1107  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1108  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1109  store( &(~C)(i,j+IT::size*4UL), xmm5 );
1110  store( &(~C)(i,j+IT::size*5UL), xmm6 );
1111  store( &(~C)(i,j+IT::size*6UL), xmm7 );
1112  store( &(~C)(i,j+IT::size*7UL), xmm8 );
1113  }
1114  }
// Column blocks of four intrinsic vectors, two rows of C per iteration
1115  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1116  size_t i( 0UL );
1117  for( ; (i+2UL) <= M; i+=2UL ) {
1118  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1119  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size ) ) );
1120  IntrinsicType xmm3( load( &(~C)(i ,j+IT::size*2UL) ) );
1121  IntrinsicType xmm4( load( &(~C)(i ,j+IT::size*3UL) ) );
1122  IntrinsicType xmm5( load( &(~C)(i+1UL,j ) ) );
1123  IntrinsicType xmm6( load( &(~C)(i+1UL,j+IT::size ) ) );
1124  IntrinsicType xmm7( load( &(~C)(i+1UL,j+IT::size*2UL) ) );
1125  IntrinsicType xmm8( load( &(~C)(i+1UL,j+IT::size*3UL) ) );
1126  for( size_t k=0UL; k<K; ++k ) {
1127  const IntrinsicType a1( set( A(i ,k) ) );
1128  const IntrinsicType a2( set( A(i+1UL,k) ) );
1129  const IntrinsicType b1( B.get(k,j ) );
1130  const IntrinsicType b2( B.get(k,j+IT::size ) );
1131  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
1132  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
1133  xmm1 = xmm1 + a1 * b1;
1134  xmm2 = xmm2 + a1 * b2;
1135  xmm3 = xmm3 + a1 * b3;
1136  xmm4 = xmm4 + a1 * b4;
1137  xmm5 = xmm5 + a2 * b1;
1138  xmm6 = xmm6 + a2 * b2;
1139  xmm7 = xmm7 + a2 * b3;
1140  xmm8 = xmm8 + a2 * b4;
1141  }
1142  store( &(~C)(i ,j ), xmm1 );
1143  store( &(~C)(i ,j+IT::size ), xmm2 );
1144  store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1145  store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1146  store( &(~C)(i+1UL,j ), xmm5 );
1147  store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1148  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1149  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1150  }
// Remaining single row in case M is odd
1151  if( i < M ) {
1152  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1153  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1154  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1155  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1156  for( size_t k=0UL; k<K; ++k ) {
1157  const IntrinsicType a1( set( A(i,k) ) );
1158  xmm1 = xmm1 + a1 * B.get(k,j );
1159  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1160  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1161  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1162  }
1163  store( &(~C)(i,j ), xmm1 );
1164  store( &(~C)(i,j+IT::size ), xmm2 );
1165  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1166  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1167  }
1168  }
// Column blocks of two intrinsic vectors, two rows of C per iteration
1169  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1170  size_t i( 0UL );
1171  for( ; (i+2UL) <= M; i+=2UL ) {
1172  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1173  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size) ) );
1174  IntrinsicType xmm3( load( &(~C)(i+1UL,j ) ) );
1175  IntrinsicType xmm4( load( &(~C)(i+1UL,j+IT::size) ) );
1176  for( size_t k=0UL; k<K; ++k ) {
1177  const IntrinsicType a1( set( A(i ,k) ) );
1178  const IntrinsicType a2( set( A(i+1UL,k) ) );
1179  const IntrinsicType b1( B.get(k,j ) );
1180  const IntrinsicType b2( B.get(k,j+IT::size) );
1181  xmm1 = xmm1 + a1 * b1;
1182  xmm2 = xmm2 + a1 * b2;
1183  xmm3 = xmm3 + a2 * b1;
1184  xmm4 = xmm4 + a2 * b2;
1185  }
1186  store( &(~C)(i ,j ), xmm1 );
1187  store( &(~C)(i ,j+IT::size), xmm2 );
1188  store( &(~C)(i+1UL,j ), xmm3 );
1189  store( &(~C)(i+1UL,j+IT::size), xmm4 );
1190  }
// Remaining single row in case M is odd
1191  if( i < M ) {
1192  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1193  IntrinsicType xmm2( load( &(~C)(i,j+IT::size) ) );
1194  for( size_t k=0UL; k<K; ++k ) {
1195  const IntrinsicType a1( set( A(i,k) ) );
1196  xmm1 = xmm1 + a1 * B.get(k,j );
1197  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
1198  }
1199  store( &(~C)(i,j ), xmm1 );
1200  store( &(~C)(i,j+IT::size), xmm2 );
1201  }
1202  }
// Final intrinsic vector of columns. NOTE(review): this is a single pass, not a loop, so it
// assumes that at most one vector (IT::size elements) is left at this point - confirm that
// the spacing is always a multiple of IT::size.
1203  if( j < N ) {
1204  size_t i( 0UL );
1205  for( ; (i+2UL) <= M; i+=2UL ) {
1206  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1207  IntrinsicType xmm2( load( &(~C)(i+1UL,j) ) );
1208  for( size_t k=0UL; k<K; ++k ) {
1209  const IntrinsicType b1( B.get(k,j) );
1210  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1211  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1212  }
1213  store( &(~C)(i ,j), xmm1 );
1214  store( &(~C)(i+1UL,j), xmm2 );
1215  }
// Remaining single row in case M is odd
1216  if( i < M ) {
1217  IntrinsicType xmm1( load( &(~C)(i,j) ) );
1218  for( size_t k=0UL; k<K; ++k ) {
1219  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
1220  }
1221  store( &(~C)(i,j), xmm1 );
1222  }
1223  }
1224  }
1226  //**********************************************************************************************
1227 
1228  //**Vectorized default addition assignment to column-major dense matrices***********************
// Vectorized default kernel for the addition assignment C += A * B to a column-major target.
//
// \param C The target left-hand side dense matrix (column-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// The rows of C are traversed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements. For each block the current values of C are loaded into accumulators,
// the products of the vectors A.get(...,k) and set( B(k,j) ) are added for all k, and the
// accumulators are stored back to C.
1242  template< typename MT3 // Type of the left-hand side target matrix
1243  , typename MT4 // Type of the left-hand side matrix operand
1244  , typename MT5 > // Type of the right-hand side matrix operand
1245  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1246  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1247  {
1248  typedef IntrinsicTrait<ElementType> IT;
1249 
// NOTE(review): M is taken from A.spacing() instead of A.rows(); presumably the spacing
// includes padding so that only whole intrinsic vectors are processed - confirm.
1250  const size_t M( A.spacing() );
1251  const size_t N( B.columns() );
1252  const size_t K( A.columns() );
1253 
1254  size_t i( 0UL );
1255 
// Row blocks of eight intrinsic vectors, one column of C per iteration
1256  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1257  for( size_t j=0UL; j<N; ++j ) {
1258  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1259  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1260  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1261  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1262  IntrinsicType xmm5( load( &(~C)(i+IT::size*4UL,j) ) );
1263  IntrinsicType xmm6( load( &(~C)(i+IT::size*5UL,j) ) );
1264  IntrinsicType xmm7( load( &(~C)(i+IT::size*6UL,j) ) );
1265  IntrinsicType xmm8( load( &(~C)(i+IT::size*7UL,j) ) );
1266  for( size_t k=0UL; k<K; ++k ) {
1267  const IntrinsicType b1( set( B(k,j) ) );
1268  xmm1 = xmm1 + A.get(i ,k) * b1;
1269  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1270  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1271  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1272  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
1273  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
1274  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
1275  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
1276  }
1277  store( &(~C)(i ,j), xmm1 );
1278  store( &(~C)(i+IT::size ,j), xmm2 );
1279  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1280  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1281  store( &(~C)(i+IT::size*4UL,j), xmm5 );
1282  store( &(~C)(i+IT::size*5UL,j), xmm6 );
1283  store( &(~C)(i+IT::size*6UL,j), xmm7 );
1284  store( &(~C)(i+IT::size*7UL,j), xmm8 );
1285  }
1286  }
// Row blocks of four intrinsic vectors, two columns of C per iteration
1287  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1288  size_t j( 0UL );
1289  for( ; (j+2UL) <= N; j+=2UL ) {
1290  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1291  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j ) ) );
1292  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j ) ) );
1293  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j ) ) );
1294  IntrinsicType xmm5( load( &(~C)(i ,j+1UL) ) );
1295  IntrinsicType xmm6( load( &(~C)(i+IT::size ,j+1UL) ) );
1296  IntrinsicType xmm7( load( &(~C)(i+IT::size*2UL,j+1UL) ) );
1297  IntrinsicType xmm8( load( &(~C)(i+IT::size*3UL,j+1UL) ) );
1298  for( size_t k=0UL; k<K; ++k ) {
1299  const IntrinsicType a1( A.get(i ,k) );
1300  const IntrinsicType a2( A.get(i+IT::size ,k) );
1301  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
1302  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
1303  const IntrinsicType b1( set( B(k,j ) ) );
1304  const IntrinsicType b2( set( B(k,j+1UL) ) );
1305  xmm1 = xmm1 + a1 * b1;
1306  xmm2 = xmm2 + a2 * b1;
1307  xmm3 = xmm3 + a3 * b1;
1308  xmm4 = xmm4 + a4 * b1;
1309  xmm5 = xmm5 + a1 * b2;
1310  xmm6 = xmm6 + a2 * b2;
1311  xmm7 = xmm7 + a3 * b2;
1312  xmm8 = xmm8 + a4 * b2;
1313  }
1314  store( &(~C)(i ,j ), xmm1 );
1315  store( &(~C)(i+IT::size ,j ), xmm2 );
1316  store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1317  store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1318  store( &(~C)(i ,j+1UL), xmm5 );
1319  store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1320  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1321  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1322  }
// Remaining single column in case N is odd
1323  if( j < N ) {
1324  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1325  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1326  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1327  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1328  for( size_t k=0UL; k<K; ++k ) {
1329  const IntrinsicType b1( set( B(k,j) ) );
1330  xmm1 = xmm1 + A.get(i ,k) * b1;
1331  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1332  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1333  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1334  }
1335  store( &(~C)(i ,j), xmm1 );
1336  store( &(~C)(i+IT::size ,j), xmm2 );
1337  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1338  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1339  }
1340  }
// Row blocks of two intrinsic vectors, two columns of C per iteration
1341  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1342  size_t j( 0UL );
1343  for( ; (j+2UL) <= N; j+=2UL ) {
1344  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1345  IntrinsicType xmm2( load( &(~C)(i+IT::size,j ) ) );
1346  IntrinsicType xmm3( load( &(~C)(i ,j+1UL) ) );
1347  IntrinsicType xmm4( load( &(~C)(i+IT::size,j+1UL) ) );
1348  for( size_t k=0UL; k<K; ++k ) {
1349  const IntrinsicType a1( A.get(i ,k) );
1350  const IntrinsicType a2( A.get(i+IT::size,k) );
1351  const IntrinsicType b1( set( B(k,j ) ) );
1352  const IntrinsicType b2( set( B(k,j+1UL) ) );
1353  xmm1 = xmm1 + a1 * b1;
1354  xmm2 = xmm2 + a2 * b1;
1355  xmm3 = xmm3 + a1 * b2;
1356  xmm4 = xmm4 + a2 * b2;
1357  }
1358  store( &(~C)(i ,j ), xmm1 );
1359  store( &(~C)(i+IT::size,j ), xmm2 );
1360  store( &(~C)(i ,j+1UL), xmm3 );
1361  store( &(~C)(i+IT::size,j+1UL), xmm4 );
1362  }
// Remaining single column in case N is odd
1363  if( j < N ) {
1364  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1365  IntrinsicType xmm2( load( &(~C)(i+IT::size,j) ) );
1366  for( size_t k=0UL; k<K; ++k ) {
1367  const IntrinsicType b1( set( B(k,j) ) );
1368  xmm1 = xmm1 + A.get(i ,k) * b1;
1369  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
1370  }
1371  store( &(~C)(i ,j), xmm1 );
1372  store( &(~C)(i+IT::size,j), xmm2 );
1373  }
1374  }
// Final intrinsic vector of rows. NOTE(review): this is a single pass, not a loop, so it
// assumes that at most one vector (IT::size elements) is left at this point - confirm that
// the spacing is always a multiple of IT::size.
1375  if( i < M ) {
1376  size_t j( 0UL );
1377  for( ; (j+2UL) <= N; j+=2UL ) {
1378  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1379  IntrinsicType xmm2( load( &(~C)(i,j+1UL) ) );
1380  for( size_t k=0UL; k<K; ++k ) {
1381  const IntrinsicType a1( A.get(i,k) );
1382  xmm1 = xmm1 + a1 * set( B(k,j ) );
1383  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1384  }
1385  store( &(~C)(i,j ), xmm1 );
1386  store( &(~C)(i,j+1UL), xmm2 );
1387  }
// Remaining single column in case N is odd
1388  if( j < N ) {
1389  IntrinsicType xmm1( load( &(~C)(i,j) ) );
1390  for( size_t k=0UL; k<K; ++k ) {
1391  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
1392  }
1393  store( &(~C)(i,j), xmm1 );
1394  }
1395  }
1396  }
1398  //**********************************************************************************************
1399 
1400  //**BLAS-based addition assignment to dense matrices (default)**********************************
1414  template< typename MT3 // Type of the left-hand side target matrix
1415  , typename MT4 // Type of the left-hand side matrix operand
1416  , typename MT5 > // Type of the right-hand side matrix operand
1417  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1418  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1419  {
1420  selectDefaultAddAssignKernel( C, A, B );
1421  }
1423  //**********************************************************************************************
1424 
1425  //**BLAS-based addition assignment to dense matrices (single precision)*************************
1426 #if BLAZE_BLAS_MODE
1427 
1440  template< typename MT3 // Type of the left-hand side target matrix
1441  , typename MT4 // Type of the left-hand side matrix operand
1442  , typename MT5 > // Type of the right-hand side matrix operand
1443  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1444  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1445  {
1446  using boost::numeric_cast;
1447 
1448  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
1449  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
1450  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
1451 
1452  const int M ( numeric_cast<int>( A.rows() ) );
1453  const int N ( numeric_cast<int>( B.columns() ) );
1454  const int K ( numeric_cast<int>( A.columns() ) );
1455  const int lda( numeric_cast<int>( A.spacing() ) );
1456  const int ldb( numeric_cast<int>( B.spacing() ) );
1457  const int ldc( numeric_cast<int>( C.spacing() ) );
1458 
1459  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1460  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1461  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1462  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1463  }
1465 #endif
1466  //**********************************************************************************************
1467 
1468  //**BLAS-based addition assignment to dense matrices (double precision)*************************
1469 #if BLAZE_BLAS_MODE
1470 
1483  template< typename MT3 // Type of the left-hand side target matrix
1484  , typename MT4 // Type of the left-hand side matrix operand
1485  , typename MT5 > // Type of the right-hand side matrix operand
1486  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1487  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1488  {
1489  using boost::numeric_cast;
1490 
1491  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
1492  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
1493  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
1494 
1495  const int M ( numeric_cast<int>( A.rows() ) );
1496  const int N ( numeric_cast<int>( B.columns() ) );
1497  const int K ( numeric_cast<int>( A.columns() ) );
1498  const int lda( numeric_cast<int>( A.spacing() ) );
1499  const int ldb( numeric_cast<int>( B.spacing() ) );
1500  const int ldc( numeric_cast<int>( C.spacing() ) );
1501 
1502  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1503  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1504  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1505  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1506  }
1508 #endif
1509  //**********************************************************************************************
1510 
1511  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
1512 #if BLAZE_BLAS_MODE
1513 
1526  template< typename MT3 // Type of the left-hand side target matrix
1527  , typename MT4 // Type of the left-hand side matrix operand
1528  , typename MT5 > // Type of the right-hand side matrix operand
1529  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1530  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1531  {
1532  using boost::numeric_cast;
1533 
1534  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1535  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1536  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1537  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1538  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1539  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1540 
1541  const int M ( numeric_cast<int>( A.rows() ) );
1542  const int N ( numeric_cast<int>( B.columns() ) );
1543  const int K ( numeric_cast<int>( A.columns() ) );
1544  const int lda( numeric_cast<int>( A.spacing() ) );
1545  const int ldb( numeric_cast<int>( B.spacing() ) );
1546  const int ldc( numeric_cast<int>( C.spacing() ) );
1547  const complex<float> alpha( 1.0F, 0.0F );
1548  const complex<float> beta ( 1.0F, 0.0F );
1549 
1550  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1551  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1552  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1553  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1554  }
1556 #endif
1557  //**********************************************************************************************
1558 
1559  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
1560 #if BLAZE_BLAS_MODE
1561 
1574  template< typename MT3 // Type of the left-hand side target matrix
1575  , typename MT4 // Type of the left-hand side matrix operand
1576  , typename MT5 > // Type of the right-hand side matrix operand
1577  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1578  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1579  {
1580  using boost::numeric_cast;
1581 
1582  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1583  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1584  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1585  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1586  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1587  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1588 
1589  const int M ( numeric_cast<int>( A.rows() ) );
1590  const int N ( numeric_cast<int>( B.columns() ) );
1591  const int K ( numeric_cast<int>( A.columns() ) );
1592  const int lda( numeric_cast<int>( A.spacing() ) );
1593  const int ldb( numeric_cast<int>( B.spacing() ) );
1594  const int ldc( numeric_cast<int>( C.spacing() ) );
1595  const complex<double> alpha( 1.0, 0.0 );
1596  const complex<double> beta ( 1.0, 0.0 );
1597 
1598  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1599  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1600  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1601  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1602  }
1604 #endif
1605  //**********************************************************************************************
1606 
1607  //**Addition assignment to sparse matrices******************************************************
1608  // No special implementation for the addition assignment to sparse matrices.
1609  //**********************************************************************************************
1610 
1611  //**Subtraction assignment to dense matrices****************************************************
1624  template< typename MT // Type of the target dense matrix
1625  , bool SO > // Storage order of the target dense matrix
1626  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1627  {
1629 
1630  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1631  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1632 
1633  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1634  return;
1635  }
1636 
1637  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1638  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1639 
1640  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1641  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1642  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1643  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1644  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1645  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1646 
1647  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
1648  TDMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1649  else
1650  TDMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
1651  }
1653  //**********************************************************************************************
1654 
1655  //**Default subtraction assignment to dense matrices********************************************
1669  template< typename MT3 // Type of the left-hand side target matrix
1670  , typename MT4 // Type of the left-hand side matrix operand
1671  , typename MT5 > // Type of the right-hand side matrix operand
1672  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1673  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1674  {
1675  const size_t M( A.rows() );
1676  const size_t N( B.columns() );
1677  const size_t K( A.columns() );
1678 
1679  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1680  const size_t end( N & size_t(-2) );
1681 
1682  for( size_t i=0UL; i<M; ++i ) {
1683  for( size_t k=0UL; k<K; ++k ) {
1684  for( size_t j=0UL; j<end; j+=2UL ) {
1685  C(i,j ) -= A(i,k) * B(k,j );
1686  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
1687  }
1688  if( end < N ) {
1689  C(i,end) -= A(i,k) * B(k,end);
1690  }
1691  }
1692  }
1693  }
1695  //**********************************************************************************************
1696 
1697  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default kernel for the subtraction assignment C -= A * B to a row-major target.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// The columns of C are traversed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements. For each block the current values of C are loaded into accumulators,
// the products of set( A(i,k) ) and the vectors B.get(k,...) are subtracted for all k, and
// the accumulators are stored back to C.
1711  template< typename MT3 // Type of the left-hand side target matrix
1712  , typename MT4 // Type of the left-hand side matrix operand
1713  , typename MT5 > // Type of the right-hand side matrix operand
1714  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1715  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1716  {
1717  typedef IntrinsicTrait<ElementType> IT;
1718 
1719  const size_t M( A.rows() );
// NOTE(review): N is taken from B.spacing() instead of B.columns(); presumably the spacing
// includes padding so that only whole intrinsic vectors are processed - confirm.
1720  const size_t N( B.spacing() );
1721  const size_t K( A.columns() );
1722 
1723  size_t j( 0UL );
1724 
// Column blocks of eight intrinsic vectors, one row of C per iteration
1725  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1726  for( size_t i=0UL; i<M; ++i ) {
1727  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1728  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1729  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1730  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1731  IntrinsicType xmm5( load( &(~C)(i,j+IT::size*4UL) ) );
1732  IntrinsicType xmm6( load( &(~C)(i,j+IT::size*5UL) ) );
1733  IntrinsicType xmm7( load( &(~C)(i,j+IT::size*6UL) ) );
1734  IntrinsicType xmm8( load( &(~C)(i,j+IT::size*7UL) ) );
1735  for( size_t k=0UL; k<K; ++k ) {
1736  const IntrinsicType a1( set( A(i,k) ) );
1737  xmm1 = xmm1 - a1 * B.get(k,j );
1738  xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1739  xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1740  xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1741  xmm5 = xmm5 - a1 * B.get(k,j+IT::size*4UL);
1742  xmm6 = xmm6 - a1 * B.get(k,j+IT::size*5UL);
1743  xmm7 = xmm7 - a1 * B.get(k,j+IT::size*6UL);
1744  xmm8 = xmm8 - a1 * B.get(k,j+IT::size*7UL);
1745  }
1746  store( &(~C)(i,j ), xmm1 );
1747  store( &(~C)(i,j+IT::size ), xmm2 );
1748  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1749  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1750  store( &(~C)(i,j+IT::size*4UL), xmm5 );
1751  store( &(~C)(i,j+IT::size*5UL), xmm6 );
1752  store( &(~C)(i,j+IT::size*6UL), xmm7 );
1753  store( &(~C)(i,j+IT::size*7UL), xmm8 );
1754  }
1755  }
// Column blocks of four intrinsic vectors, two rows of C per iteration
1756  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1757  size_t i( 0UL );
1758  for( ; (i+2UL) <= M; i+=2UL ) {
1759  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1760  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size ) ) );
1761  IntrinsicType xmm3( load( &(~C)(i ,j+IT::size*2UL) ) );
1762  IntrinsicType xmm4( load( &(~C)(i ,j+IT::size*3UL) ) );
1763  IntrinsicType xmm5( load( &(~C)(i+1UL,j ) ) );
1764  IntrinsicType xmm6( load( &(~C)(i+1UL,j+IT::size ) ) );
1765  IntrinsicType xmm7( load( &(~C)(i+1UL,j+IT::size*2UL) ) );
1766  IntrinsicType xmm8( load( &(~C)(i+1UL,j+IT::size*3UL) ) );
1767  for( size_t k=0UL; k<K; ++k ) {
1768  const IntrinsicType a1( set( A(i ,k) ) );
1769  const IntrinsicType a2( set( A(i+1UL,k) ) );
1770  const IntrinsicType b1( B.get(k,j ) );
1771  const IntrinsicType b2( B.get(k,j+IT::size ) );
1772  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
1773  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
1774  xmm1 = xmm1 - a1 * b1;
1775  xmm2 = xmm2 - a1 * b2;
1776  xmm3 = xmm3 - a1 * b3;
1777  xmm4 = xmm4 - a1 * b4;
1778  xmm5 = xmm5 - a2 * b1;
1779  xmm6 = xmm6 - a2 * b2;
1780  xmm7 = xmm7 - a2 * b3;
1781  xmm8 = xmm8 - a2 * b4;
1782  }
1783  store( &(~C)(i ,j ), xmm1 );
1784  store( &(~C)(i ,j+IT::size ), xmm2 );
1785  store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1786  store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1787  store( &(~C)(i+1UL,j ), xmm5 );
1788  store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1789  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1790  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1791  }
// Remaining single row in case M is odd
1792  if( i < M ) {
1793  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1794  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1795  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1796  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1797  for( size_t k=0UL; k<K; ++k ) {
1798  const IntrinsicType a1( set( A(i,k) ) );
1799  xmm1 = xmm1 - a1 * B.get(k,j );
1800  xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1801  xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1802  xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1803  }
1804  store( &(~C)(i,j ), xmm1 );
1805  store( &(~C)(i,j+IT::size ), xmm2 );
1806  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1807  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1808  }
1809  }
// Column blocks of two intrinsic vectors, two rows of C per iteration
1810  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1811  size_t i( 0UL );
1812  for( ; (i+2UL) <= M; i+=2UL ) {
1813  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1814  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size) ) );
1815  IntrinsicType xmm3( load( &(~C)(i+1UL,j ) ) );
1816  IntrinsicType xmm4( load( &(~C)(i+1UL,j+IT::size) ) );
1817  for( size_t k=0UL; k<K; ++k ) {
1818  const IntrinsicType a1( set( A(i ,k) ) );
1819  const IntrinsicType a2( set( A(i+1UL,k) ) );
1820  const IntrinsicType b1( B.get(k,j ) );
1821  const IntrinsicType b2( B.get(k,j+IT::size) );
1822  xmm1 = xmm1 - a1 * b1;
1823  xmm2 = xmm2 - a1 * b2;
1824  xmm3 = xmm3 - a2 * b1;
1825  xmm4 = xmm4 - a2 * b2;
1826  }
1827  store( &(~C)(i ,j ), xmm1 );
1828  store( &(~C)(i ,j+IT::size), xmm2 );
1829  store( &(~C)(i+1UL,j ), xmm3 );
1830  store( &(~C)(i+1UL,j+IT::size), xmm4 );
1831  }
// Remaining single row in case M is odd
1832  if( i < M ) {
1833  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1834  IntrinsicType xmm2( load( &(~C)(i,j+IT::size) ) );
1835  for( size_t k=0UL; k<K; ++k ) {
1836  const IntrinsicType a1( set( A(i,k) ) );
1837  xmm1 = xmm1 - a1 * B.get(k,j );
1838  xmm2 = xmm2 - a1 * B.get(k,j+IT::size);
1839  }
1840  store( &(~C)(i,j ), xmm1 );
1841  store( &(~C)(i,j+IT::size), xmm2 );
1842  }
1843  }
// Final intrinsic vector of columns. NOTE(review): this is a single pass, not a loop, so it
// assumes that at most one vector (IT::size elements) is left at this point - confirm that
// the spacing is always a multiple of IT::size.
1844  if( j < N ) {
1845  size_t i( 0UL );
1846  for( ; (i+2UL) <= M; i+=2UL ) {
1847  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1848  IntrinsicType xmm2( load( &(~C)(i+1UL,j) ) );
1849  for( size_t k=0UL; k<K; ++k ) {
1850  const IntrinsicType b1( B.get(k,j) );
1851  xmm1 = xmm1 - set( A(i ,k) ) * b1;
1852  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
1853  }
1854  store( &(~C)(i ,j), xmm1 );
1855  store( &(~C)(i+1UL,j), xmm2 );
1856  }
// Remaining single row in case M is odd
1857  if( i < M ) {
1858  IntrinsicType xmm1( load( &(~C)(i,j) ) );
1859  for( size_t k=0UL; k<K; ++k ) {
1860  xmm1 = xmm1 - set( A(i,k) ) * B.get(k,j);
1861  }
1862  store( &(~C)(i,j), xmm1 );
1863  }
1864  }
1865  }
1867  //**********************************************************************************************
1868 
1869  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default kernel for the subtraction assignment C -= A * B into a column-major
// target (C is taken as DenseMatrix<MT3,true>). The overload only participates when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is satisfied.
//
// The row index i advances in steps of whole intrinsic packs (IT::size elements): first
// eight packs per step, then four, then two, and finally a single pack. Every accumulator
// is loaded from C, reduced by A.get(...) * set( B(k,j) ) for all k, and stored back to C.
1883  template< typename MT3 // Type of the left-hand side target matrix
1884  , typename MT4 // Type of the left-hand side matrix operand
1885  , typename MT5 > // Type of the right-hand side matrix operand
1886  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1887  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1888  {
1889  typedef IntrinsicTrait<ElementType> IT;
1890 
// NOTE(review): the row bound is A.spacing(), not A.rows() - presumably the padded column
// length, so that only complete intrinsic packs are processed. Confirm against the
// operand's storage layout.
1891  const size_t M( A.spacing() );
1892  const size_t N( B.columns() );
1893  const size_t K( A.columns() );
1894 
1895  size_t i( 0UL );
1896 
// Eight packs of rows at a time, one column of C per inner iteration.
1897  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1898  for( size_t j=0UL; j<N; ++j ) {
1899  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1900  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1901  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1902  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1903  IntrinsicType xmm5( load( &(~C)(i+IT::size*4UL,j) ) );
1904  IntrinsicType xmm6( load( &(~C)(i+IT::size*5UL,j) ) );
1905  IntrinsicType xmm7( load( &(~C)(i+IT::size*6UL,j) ) );
1906  IntrinsicType xmm8( load( &(~C)(i+IT::size*7UL,j) ) );
1907  for( size_t k=0UL; k<K; ++k ) {
// B(k,j) is broadcast once and shared by all eight row packs.
1908  const IntrinsicType b1( set( B(k,j) ) );
1909  xmm1 = xmm1 - A.get(i ,k) * b1;
1910  xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1911  xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1912  xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1913  xmm5 = xmm5 - A.get(i+IT::size*4UL,k) * b1;
1914  xmm6 = xmm6 - A.get(i+IT::size*5UL,k) * b1;
1915  xmm7 = xmm7 - A.get(i+IT::size*6UL,k) * b1;
1916  xmm8 = xmm8 - A.get(i+IT::size*7UL,k) * b1;
1917  }
1918  store( &(~C)(i ,j), xmm1 );
1919  store( &(~C)(i+IT::size ,j), xmm2 );
1920  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1921  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1922  store( &(~C)(i+IT::size*4UL,j), xmm5 );
1923  store( &(~C)(i+IT::size*5UL,j), xmm6 );
1924  store( &(~C)(i+IT::size*6UL,j), xmm7 );
1925  store( &(~C)(i+IT::size*7UL,j), xmm8 );
1926  }
1927  }
// Four packs of rows at a time; columns are handled in pairs (xmm1-4 for column j,
// xmm5-8 for column j+1) so that each loaded pack of A is used twice.
1928  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1929  size_t j( 0UL );
1930  for( ; (j+2UL) <= N; j+=2UL ) {
1931  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1932  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j ) ) );
1933  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j ) ) );
1934  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j ) ) );
1935  IntrinsicType xmm5( load( &(~C)(i ,j+1UL) ) );
1936  IntrinsicType xmm6( load( &(~C)(i+IT::size ,j+1UL) ) );
1937  IntrinsicType xmm7( load( &(~C)(i+IT::size*2UL,j+1UL) ) );
1938  IntrinsicType xmm8( load( &(~C)(i+IT::size*3UL,j+1UL) ) );
1939  for( size_t k=0UL; k<K; ++k ) {
1940  const IntrinsicType a1( A.get(i ,k) );
1941  const IntrinsicType a2( A.get(i+IT::size ,k) );
1942  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
1943  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
1944  const IntrinsicType b1( set( B(k,j ) ) );
1945  const IntrinsicType b2( set( B(k,j+1UL) ) );
1946  xmm1 = xmm1 - a1 * b1;
1947  xmm2 = xmm2 - a2 * b1;
1948  xmm3 = xmm3 - a3 * b1;
1949  xmm4 = xmm4 - a4 * b1;
1950  xmm5 = xmm5 - a1 * b2;
1951  xmm6 = xmm6 - a2 * b2;
1952  xmm7 = xmm7 - a3 * b2;
1953  xmm8 = xmm8 - a4 * b2;
1954  }
1955  store( &(~C)(i ,j ), xmm1 );
1956  store( &(~C)(i+IT::size ,j ), xmm2 );
1957  store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1958  store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1959  store( &(~C)(i ,j+1UL), xmm5 );
1960  store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1961  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1962  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1963  }
// Remaining single column when N is odd.
1964  if( j < N ) {
1965  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1966  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1967  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1968  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1969  for( size_t k=0UL; k<K; ++k ) {
1970  const IntrinsicType b1( set( B(k,j) ) );
1971  xmm1 = xmm1 - A.get(i ,k) * b1;
1972  xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1973  xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1974  xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1975  }
1976  store( &(~C)(i ,j), xmm1 );
1977  store( &(~C)(i+IT::size ,j), xmm2 );
1978  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1979  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1980  }
1981  }
// Two packs of rows at a time, again with columns processed in pairs.
1982  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1983  size_t j( 0UL );
1984  for( ; (j+2UL) <= N; j+=2UL ) {
1985  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1986  IntrinsicType xmm2( load( &(~C)(i+IT::size,j ) ) );
1987  IntrinsicType xmm3( load( &(~C)(i ,j+1UL) ) );
1988  IntrinsicType xmm4( load( &(~C)(i+IT::size,j+1UL) ) );
1989  for( size_t k=0UL; k<K; ++k ) {
1990  const IntrinsicType a1( A.get(i ,k) );
1991  const IntrinsicType a2( A.get(i+IT::size,k) );
1992  const IntrinsicType b1( set( B(k,j ) ) );
1993  const IntrinsicType b2( set( B(k,j+1UL) ) );
1994  xmm1 = xmm1 - a1 * b1;
1995  xmm2 = xmm2 - a2 * b1;
1996  xmm3 = xmm3 - a1 * b2;
1997  xmm4 = xmm4 - a2 * b2;
1998  }
1999  store( &(~C)(i ,j ), xmm1 );
2000  store( &(~C)(i+IT::size,j ), xmm2 );
2001  store( &(~C)(i ,j+1UL), xmm3 );
2002  store( &(~C)(i+IT::size,j+1UL), xmm4 );
2003  }
// Remaining single column when N is odd.
2004  if( j < N ) {
2005  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
2006  IntrinsicType xmm2( load( &(~C)(i+IT::size,j) ) );
2007  for( size_t k=0UL; k<K; ++k ) {
2008  const IntrinsicType b1( set( B(k,j) ) );
2009  xmm1 = xmm1 - A.get(i ,k) * b1;
2010  xmm2 = xmm2 - A.get(i+IT::size,k) * b1;
2011  }
2012  store( &(~C)(i ,j), xmm1 );
2013  store( &(~C)(i+IT::size,j), xmm2 );
2014  }
2015  }
// Final step: exactly one pack starting at row i is processed.
// NOTE(review): this assumes the rows left over here fit into a single pack, i.e. that M is
// a multiple of IT::size - confirm that spacing() guarantees this.
2016  if( i < M ) {
2017  size_t j( 0UL );
2018  for( ; (j+2UL) <= N; j+=2UL ) {
2019  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
2020  IntrinsicType xmm2( load( &(~C)(i,j+1UL) ) );
2021  for( size_t k=0UL; k<K; ++k ) {
2022  const IntrinsicType a1( A.get(i,k) );
2023  xmm1 = xmm1 - a1 * set( B(k,j ) );
2024  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
2025  }
2026  store( &(~C)(i,j ), xmm1 );
2027  store( &(~C)(i,j+1UL), xmm2 );
2028  }
2029  if( j < N ) {
2030  IntrinsicType xmm1( load( &(~C)(i,j) ) );
2031  for( size_t k=0UL; k<K; ++k ) {
2032  xmm1 = xmm1 - A.get(i,k) * set( B(k,j) );
2033  }
2034  store( &(~C)(i,j), xmm1 );
2035  }
2036  }
2037  }
2039  //**********************************************************************************************
2040 
2041  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
2055  template< typename MT3 // Type of the left-hand side target matrix
2056  , typename MT4 // Type of the left-hand side matrix operand
2057  , typename MT5 > // Type of the right-hand side matrix operand
2058  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2059  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2060  {
2061  selectDefaultSubAssignKernel( C, A, B );
2062  }
2064  //**********************************************************************************************
2065 
2066  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
2067 #if BLAZE_BLAS_MODE
2068 
2081  template< typename MT3 // Type of the left-hand side target matrix
2082  , typename MT4 // Type of the left-hand side matrix operand
2083  , typename MT5 > // Type of the right-hand side matrix operand
2084  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2085  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2086  {
2087  using boost::numeric_cast;
2088 
2089  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
2090  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
2091  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
2092 
2093  const int M ( numeric_cast<int>( A.rows() ) );
2094  const int N ( numeric_cast<int>( B.columns() ) );
2095  const int K ( numeric_cast<int>( A.columns() ) );
2096  const int lda( numeric_cast<int>( A.spacing() ) );
2097  const int ldb( numeric_cast<int>( B.spacing() ) );
2098  const int ldc( numeric_cast<int>( C.spacing() ) );
2099 
2100  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2101  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2102  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2103  M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
2104  }
2106 #endif
2107  //**********************************************************************************************
2108 
2109  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
2110 #if BLAZE_BLAS_MODE
2111 
2124  template< typename MT3 // Type of the left-hand side target matrix
2125  , typename MT4 // Type of the left-hand side matrix operand
2126  , typename MT5 > // Type of the right-hand side matrix operand
2127  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2128  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2129  {
2130  using boost::numeric_cast;
2131 
2132  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
2133  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
2134  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
2135 
2136  const int M ( numeric_cast<int>( A.rows() ) );
2137  const int N ( numeric_cast<int>( B.columns() ) );
2138  const int K ( numeric_cast<int>( A.columns() ) );
2139  const int lda( numeric_cast<int>( A.spacing() ) );
2140  const int ldb( numeric_cast<int>( B.spacing() ) );
2141  const int ldc( numeric_cast<int>( C.spacing() ) );
2142 
2143  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2144  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2145  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2146  M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
2147  }
2149 #endif
2150  //**********************************************************************************************
2151 
2152  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
2153 #if BLAZE_BLAS_MODE
2154 
2167  template< typename MT3 // Type of the left-hand side target matrix
2168  , typename MT4 // Type of the left-hand side matrix operand
2169  , typename MT5 > // Type of the right-hand side matrix operand
2170  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2171  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2172  {
2173  using boost::numeric_cast;
2174 
2175  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2176  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2177  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2178  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
2179  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
2180  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
2181 
2182  const int M ( numeric_cast<int>( A.rows() ) );
2183  const int N ( numeric_cast<int>( B.columns() ) );
2184  const int K ( numeric_cast<int>( A.columns() ) );
2185  const int lda( numeric_cast<int>( A.spacing() ) );
2186  const int ldb( numeric_cast<int>( B.spacing() ) );
2187  const int ldc( numeric_cast<int>( C.spacing() ) );
2188  const complex<float> alpha( -1.0F, 0.0F );
2189  const complex<float> beta ( 1.0F, 0.0F );
2190 
2191  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2192  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2193  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2194  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2195  }
2197 #endif
2198  //**********************************************************************************************
2199 
2200  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
2201 #if BLAZE_BLAS_MODE
2202 
2215  template< typename MT3 // Type of the left-hand side target matrix
2216  , typename MT4 // Type of the left-hand side matrix operand
2217  , typename MT5 > // Type of the right-hand side matrix operand
2218  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2219  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2220  {
2221  using boost::numeric_cast;
2222 
2223  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2224  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2225  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2226  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
2227  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
2228  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
2229 
2230  const int M ( numeric_cast<int>( A.rows() ) );
2231  const int N ( numeric_cast<int>( B.columns() ) );
2232  const int K ( numeric_cast<int>( A.columns() ) );
2233  const int lda( numeric_cast<int>( A.spacing() ) );
2234  const int ldb( numeric_cast<int>( B.spacing() ) );
2235  const int ldc( numeric_cast<int>( C.spacing() ) );
2236  const complex<double> alpha( -1.0, 0.0 );
2237  const complex<double> beta ( 1.0, 0.0 );
2238 
2239  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2240  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2241  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2242  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2243  }
2245 #endif
2246  //**********************************************************************************************
2247 
2248  //**Subtraction assignment to sparse matrices***************************************************
2249  // No special implementation for the subtraction assignment to sparse matrices.
2250  //**********************************************************************************************
2251 
2252  //**Multiplication assignment to dense matrices*************************************************
2253  // No special implementation for the multiplication assignment to dense matrices.
2254  //**********************************************************************************************
2255 
2256  //**Multiplication assignment to sparse matrices************************************************
2257  // No special implementation for the multiplication assignment to sparse matrices.
2258  //**********************************************************************************************
2259 
2260  //**Compile time checks*************************************************************************
2267  //**********************************************************************************************
2268 };
2269 //*************************************************************************************************
2270 
2271 
2272 
2273 
2274 //=================================================================================================
2275 //
2276 // DMATSCALARMULTEXPR SPECIALIZATION
2277 //
2278 //=================================================================================================
2279 
2280 //*************************************************************************************************
2288 template< typename MT1 // Type of the left-hand side dense matrix
2289  , typename MT2 // Type of the right-hand side dense matrix
2290  , typename ST > // Type of the right-hand side scalar value
2291 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >
2292  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2293  , private Expression
2294  , private Computation
2295 {
2296  private:
2297  //**Type definitions****************************************************************************
2298  typedef TDMatDMatMultExpr<MT1,MT2> MMM;
2299  typedef typename MMM::ResultType RES;
2300  typedef typename MT1::ResultType RT1;
2301  typedef typename MT2::ResultType RT2;
2302  typedef typename MT1::CompositeType CT1;
2303  typedef typename MT2::CompositeType CT2;
2304  //**********************************************************************************************
2305 
2306  //**********************************************************************************************
2308 
2311  template< typename T1, typename T2, typename T3, typename T4 >
2312  struct UseSinglePrecisionKernel {
2313  enum { value = IsFloat<typename T1::ElementType>::value &&
2314  IsFloat<typename T2::ElementType>::value &&
2315  IsFloat<typename T3::ElementType>::value &&
2316  !IsComplex<T4>::value };
2317  };
2318  //**********************************************************************************************
2319 
2320  //**********************************************************************************************
2322 
2325  template< typename T1, typename T2, typename T3, typename T4 >
2326  struct UseDoublePrecisionKernel {
2327  enum { value = IsDouble<typename T1::ElementType>::value &&
2328  IsDouble<typename T2::ElementType>::value &&
2329  IsDouble<typename T3::ElementType>::value &&
2330  !IsComplex<T4>::value };
2331  };
2332  //**********************************************************************************************
2333 
2334  //**********************************************************************************************
2336 
2339  template< typename T1, typename T2, typename T3 >
2340  struct UseSinglePrecisionComplexKernel {
2341  typedef complex<float> Type;
2342  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2343  IsSame<typename T2::ElementType,Type>::value &&
2344  IsSame<typename T3::ElementType,Type>::value };
2345  };
2346  //**********************************************************************************************
2347 
2348  //**********************************************************************************************
2350 
2353  template< typename T1, typename T2, typename T3 >
2354  struct UseDoublePrecisionComplexKernel {
2355  typedef complex<double> Type;
2356  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2357  IsSame<typename T2::ElementType,Type>::value &&
2358  IsSame<typename T3::ElementType,Type>::value };
2359  };
2360  //**********************************************************************************************
2361 
2362  //**********************************************************************************************
2364 
2366  template< typename T1, typename T2, typename T3, typename T4 >
2367  struct UseDefaultKernel {
2368  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2369  !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2370  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2371  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
2372  };
2373  //**********************************************************************************************
2374 
2375  //**********************************************************************************************
2377 
2379  template< typename T1, typename T2, typename T3, typename T4 >
2380  struct UseVectorizedDefaultKernel {
2381  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2382  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2383  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2384  IsSame<typename T1::ElementType,T4>::value &&
2385  IntrinsicTrait<typename T1::ElementType>::addition &&
2386  IntrinsicTrait<typename T1::ElementType>::multiplication };
2387  };
2388  //**********************************************************************************************
2389 
2390  public:
2391  //**Type definitions****************************************************************************
// Type of this DMatScalarMultExpr specialization.
2392  typedef DMatScalarMultExpr<MMM,ST,true> This;
// Result type of the scaled product, obtained from MultTrait on the product's result type
// and the scalar type.
2393  typedef typename MultTrait<RES,ST>::Type ResultType;
// Nested types taken from the result type.
2394  typedef typename ResultType::OppositeType OppositeType;
2395  typedef typename ResultType::TransposeType TransposeType;
2396  typedef typename ResultType::ElementType ElementType;
// Intrinsic (pack) type associated with the element type.
2397  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType;
// Constant element type.
2398  typedef const ElementType ReturnType;
// Constant result type.
2399  typedef const ResultType CompositeType;
2400 
// Type of the wrapped matrix product; the matrix_ member is declared with this type.
2402  typedef const TDMatDMatMultExpr<MT1,MT2> LeftOperand;
2403 
// Type of the stored scalar: chosen between ElementType and ST depending on
// IsNumeric<ElementType> (presumably ElementType when the condition holds - confirm
// against SelectType).
2405  typedef typename SelectType< IsNumeric<ElementType>::value, ElementType, ST >::Type RightOperand;
2406 
// Type used to evaluate the left-hand side operand of the product: chosen between
// const RT1 and CT1 depending on IsComputation<MT1>.
2408  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
2409 
// Type used to evaluate the right-hand side operand of the product: chosen between
// const RT2 and CT2 depending on IsComputation<MT2>.
2411  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
2412  //**********************************************************************************************
2413 
2414  //**Compilation flags***************************************************************************
// Compilation flag: this expression reports itself as not vectorizable (0). Traits such as
// UseVectorizedDefaultKernel read the flag of the same name from their matrix type arguments.
2416  enum { vectorizable = 0 };
2417  //**********************************************************************************************
2418 
2419  //**Constructor*********************************************************************************
2425  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
2426  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
2427  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2428  {}
2429  //**********************************************************************************************
2430 
2431  //**Access operator*****************************************************************************
2438  inline ResultType operator()( size_t i, size_t j ) const {
2439  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
2440  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
2441  return matrix_(i,j) * scalar_;
2442  }
2443  //**********************************************************************************************
2444 
2445  //**Rows function*******************************************************************************
2450  inline size_t rows() const {
2451  return matrix_.rows();
2452  }
2453  //**********************************************************************************************
2454 
2455  //**Columns function****************************************************************************
2460  inline size_t columns() const {
2461  return matrix_.columns();
2462  }
2463  //**********************************************************************************************
2464 
2465  //**Left operand access*************************************************************************
2470  inline LeftOperand leftOperand() const {
2471  return matrix_;
2472  }
2473  //**********************************************************************************************
2474 
2475  //**Right operand access************************************************************************
2480  inline RightOperand rightOperand() const {
2481  return scalar_;
2482  }
2483  //**********************************************************************************************
2484 
2485  //**********************************************************************************************
2491  template< typename T >
2492  inline bool canAlias( const T* alias ) const {
2493  return matrix_.canAlias( alias );
2494  }
2495  //**********************************************************************************************
2496 
2497  //**********************************************************************************************
2503  template< typename T >
2504  inline bool isAliased( const T* alias ) const {
2505  return matrix_.isAliased( alias );
2506  }
2507  //**********************************************************************************************
2508 
2509  private:
2510  //**Member variables****************************************************************************
// The wrapped matrix multiplication expression (left-hand side of the scaling).
2511  LeftOperand matrix_;
// The scalar the product is multiplied with (right-hand side of the scaling).
2512  RightOperand scalar_;
2513  //**********************************************************************************************
2514 
2515  //**Assignment to dense matrices****************************************************************
2524  template< typename MT3 // Type of the target dense matrix
2525  , bool SO > // Storage order of the target dense matrix
2526  friend inline void assign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2527  {
2529 
2530  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2531  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2532 
2533  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2534  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2535 
2536  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
2537  return;
2538  }
2539  else if( left.columns() == 0UL ) {
2540  reset( ~lhs );
2541  return;
2542  }
2543 
2544  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2545  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2546 
2547  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2548  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2549  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2550  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2551  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2552  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2553 
2554  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
2555  DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2556  else
2557  DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
2558  }
2559  //**********************************************************************************************
2560 
2561  //**Default assignment to dense matrices********************************************************
2575  template< typename MT3 // Type of the left-hand side target matrix
2576  , typename MT4 // Type of the left-hand side matrix operand
2577  , typename MT5 // Type of the right-hand side matrix operand
2578  , typename ST2 > // Type of the scalar value
2579  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2580  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2581  {
2582  for( size_t i=0UL; i<A.rows(); ++i ) {
2583  for( size_t k=0UL; k<B.columns(); ++k ) {
2584  C(i,k) = A(i,0UL) * B(0UL,k);
2585  }
2586  for( size_t j=1UL; j<A.columns(); ++j ) {
2587  for( size_t k=0UL; k<B.columns(); ++k ) {
2588  C(i,k) += A(i,j) * B(j,k);
2589  }
2590  }
2591  for( size_t k=0UL; k<B.columns(); ++k ) {
2592  C(i,k) *= scalar;
2593  }
2594  }
2595  }
2596  //**********************************************************************************************
2597 
2598  //**Vectorized default assignment to row-major dense matrices***********************************
// Vectorized default kernel for the assignment C = scalar * A * B into a row-major target
// (C is taken as DenseMatrix<MT3,false>). The overload only participates when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
//
// The column index j advances in steps of whole intrinsic packs (IT::size elements): first
// eight packs per step, then four, then two, and finally a single pack. Every accumulator
// sums set( A(i,k) ) * B.get(k,...) over all k; the sum is multiplied by the broadcast
// scalar and stored to C.
//
// NOTE(review): the accumulators are declared without an initializer and immediately used
// on the right-hand side of "xmm = xmm + ...". This relies on IntrinsicType's default
// constructor producing a zero value - confirm.
2612  template< typename MT3 // Type of the left-hand side target matrix
2613  , typename MT4 // Type of the left-hand side matrix operand
2614  , typename MT5 // Type of the right-hand side matrix operand
2615  , typename ST2 > // Type of the scalar value
2616  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2617  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2618  {
2619  typedef IntrinsicTrait<ElementType> IT;
2620 
// NOTE(review): the column bound is B.spacing(), not B.columns() - presumably the padded
// row length, so that only complete intrinsic packs are processed. Confirm against the
// operand's storage layout.
2621  const size_t M( A.rows() );
2622  const size_t N( B.spacing() );
2623  const size_t K( A.columns() );
2624 
// The scalar is broadcast once and applied to every accumulator right before the store.
2625  const IntrinsicType factor( set( scalar ) );
2626 
2627  size_t j( 0UL );
2628 
// Eight packs of columns at a time, one row of C per inner iteration.
2629  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2630  for( size_t i=0UL; i<M; ++i ) {
2631  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2632  for( size_t k=0UL; k<K; ++k ) {
2633  const IntrinsicType a1( set( A(i,k) ) );
2634  xmm1 = xmm1 + a1 * B.get(k,j );
2635  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2636  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2637  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2638  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2639  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2640  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2641  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
2642  }
2643  store( &(~C)(i,j ), xmm1 * factor );
2644  store( &(~C)(i,j+IT::size ), xmm2 * factor );
2645  store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2646  store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2647  store( &(~C)(i,j+IT::size*4UL), xmm5 * factor );
2648  store( &(~C)(i,j+IT::size*5UL), xmm6 * factor );
2649  store( &(~C)(i,j+IT::size*6UL), xmm7 * factor );
2650  store( &(~C)(i,j+IT::size*7UL), xmm8 * factor );
2651  }
2652  }
// Four packs of columns at a time; rows are handled in pairs (xmm1-4 for row i,
// xmm5-8 for row i+1) so that each loaded pack of B is used twice.
2653  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2654  size_t i( 0UL );
2655  for( ; (i+2UL) <= M; i+=2UL ) {
2656  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2657  for( size_t k=0UL; k<K; ++k ) {
2658  const IntrinsicType a1( set( A(i ,k) ) );
2659  const IntrinsicType a2( set( A(i+1UL,k) ) );
2660  const IntrinsicType b1( B.get(k,j ) );
2661  const IntrinsicType b2( B.get(k,j+IT::size ) );
2662  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
2663  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
2664  xmm1 = xmm1 + a1 * b1;
2665  xmm2 = xmm2 + a1 * b2;
2666  xmm3 = xmm3 + a1 * b3;
2667  xmm4 = xmm4 + a1 * b4;
2668  xmm5 = xmm5 + a2 * b1;
2669  xmm6 = xmm6 + a2 * b2;
2670  xmm7 = xmm7 + a2 * b3;
2671  xmm8 = xmm8 + a2 * b4;
2672  }
2673  store( &(~C)(i ,j ), xmm1 * factor );
2674  store( &(~C)(i ,j+IT::size ), xmm2 * factor );
2675  store( &(~C)(i ,j+IT::size*2UL), xmm3 * factor );
2676  store( &(~C)(i ,j+IT::size*3UL), xmm4 * factor );
2677  store( &(~C)(i+1UL,j ), xmm5 * factor );
2678  store( &(~C)(i+1UL,j+IT::size ), xmm6 * factor );
2679  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 * factor );
2680  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 * factor );
2681  }
// Remaining single row when M is odd.
2682  if( i < M ) {
2683  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2684  for( size_t k=0UL; k<K; ++k ) {
2685  const IntrinsicType a1( set( A(i,k) ) );
2686  xmm1 = xmm1 + a1 * B.get(k,j );
2687  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2688  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2689  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2690  }
2691  store( &(~C)(i,j ), xmm1 * factor );
2692  store( &(~C)(i,j+IT::size ), xmm2 * factor );
2693  store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2694  store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2695  }
2696  }
// Two packs of columns at a time, again with rows processed in pairs.
2697  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2698  size_t i( 0UL );
2699  for( ; (i+2UL) <= M; i+=2UL ) {
2700  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2701  for( size_t k=0UL; k<K; ++k ) {
2702  const IntrinsicType a1( set( A(i ,k) ) );
2703  const IntrinsicType a2( set( A(i+1UL,k) ) );
2704  const IntrinsicType b1( B.get(k,j ) );
2705  const IntrinsicType b2( B.get(k,j+IT::size) );
2706  xmm1 = xmm1 + a1 * b1;
2707  xmm2 = xmm2 + a1 * b2;
2708  xmm3 = xmm3 + a2 * b1;
2709  xmm4 = xmm4 + a2 * b2;
2710  }
2711  store( &(~C)(i ,j ), xmm1 * factor );
2712  store( &(~C)(i ,j+IT::size), xmm2 * factor );
2713  store( &(~C)(i+1UL,j ), xmm3 * factor );
2714  store( &(~C)(i+1UL,j+IT::size), xmm4 * factor );
2715  }
// Remaining single row when M is odd.
2716  if( i < M ) {
2717  IntrinsicType xmm1, xmm2;
2718  for( size_t k=0UL; k<K; ++k ) {
2719  const IntrinsicType a1( set( A(i,k) ) );
2720  xmm1 = xmm1 + a1 * B.get(k,j );
2721  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2722  }
2723  store( &(~C)(i,j ), xmm1 * factor );
2724  store( &(~C)(i,j+IT::size), xmm2 * factor );
2725  }
2726  }
// Final step: exactly one pack starting at column j is processed.
// NOTE(review): this assumes the columns left over here fit into a single pack, i.e. that N
// is a multiple of IT::size - confirm that spacing() guarantees this.
2727  if( j < N ) {
2728  size_t i( 0UL );
2729  for( ; (i+2UL) <= M; i+=2UL ) {
2730  IntrinsicType xmm1, xmm2;
2731  for( size_t k=0UL; k<K; ++k ) {
2732  const IntrinsicType b1( B.get(k,j) );
2733  xmm1 = xmm1 + set( A(i ,k) ) * b1;
2734  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
2735  }
2736  store( &(~C)(i ,j), xmm1 * factor );
2737  store( &(~C)(i+1UL,j), xmm2 * factor );
2738  }
2739  if( i < M ) {
2740  IntrinsicType xmm1;
2741  for( size_t k=0UL; k<K; ++k ) {
2742  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
2743  }
2744  store( &(~C)(i,j), xmm1 * factor );
2745  }
2746  }
2747  }
2748  //**********************************************************************************************
2749 
2750  //**Vectorized default assignment to column-major dense matrices********************************
/*!\brief Vectorized default assignment kernel for a column-major target (C = (A*B)*scalar).
//
// \param C The column-major target dense matrix; it is only written (via store()).
// \param A The left-hand side multiplication operand; read as packed values via A.get(i,k).
// \param B The right-hand side multiplication operand; read element-wise via B(k,j) and broadcast with set().
// \param scalar The scaling factor; broadcast once and multiplied onto every accumulated result.
// \return void
//
// The row index i is advanced in steps of 8, 4, 2 and finally 1 times IT::size. For every
// tile the products are accumulated over k in IntrinsicType variables, scaled by the
// broadcast factor and stored to C. The target is never loaded in this kernel.
// NOTE(review): the accumulators are default-constructed IntrinsicType objects; this
// presumably zero-initializes them -- confirm against the IntrinsicType definition.
// NOTE(review): M is taken from A.spacing() rather than A.rows(); presumably this relies on
// padded storage in both A and C -- confirm.
*/
2764  template< typename MT3 // Type of the left-hand side target matrix
2765  , typename MT4 // Type of the left-hand side matrix operand
2766  , typename MT5 // Type of the right-hand side matrix operand
2767  , typename ST2 > // Type of the scalar value
2768  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2769  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
2770  {
2771  typedef IntrinsicTrait<ElementType> IT;
2772 
2773  const size_t M( A.spacing() );
2774  const size_t N( B.columns() );
2775  const size_t K( A.columns() );
2776 
      // Broadcast of the scalar; applied once per accumulator right before the store
2777  const IntrinsicType factor( set( scalar ) );
2778 
      // Row position; shared by all tiers below so each tier continues where the previous stopped
2779  size_t i( 0UL );
2780 
      // Tier 1: eight packed row chunks per step, one column of C at a time
2781  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2782  for( size_t j=0UL; j<N; ++j ) {
2783  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2784  for( size_t k=0UL; k<K; ++k ) {
2785  const IntrinsicType b1( set( B(k,j) ) );
2786  xmm1 = xmm1 + A.get(i ,k) * b1;
2787  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2788  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2789  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2790  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2791  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2792  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2793  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
2794  }
2795  store( &(~C)(i ,j), xmm1 * factor );
2796  store( &(~C)(i+IT::size ,j), xmm2 * factor );
2797  store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2798  store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2799  store( &(~C)(i+IT::size*4UL,j), xmm5 * factor );
2800  store( &(~C)(i+IT::size*5UL,j), xmm6 * factor );
2801  store( &(~C)(i+IT::size*6UL,j), xmm7 * factor );
2802  store( &(~C)(i+IT::size*7UL,j), xmm8 * factor );
2803  }
2804  }
      // Tier 2: four packed row chunks per step, columns of C processed in pairs
2805  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2806  size_t j( 0UL );
2807  for( ; (j+2UL) <= N; j+=2UL ) {
         // xmm1-xmm4 belong to column j, xmm5-xmm8 to column j+1
2808  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2809  for( size_t k=0UL; k<K; ++k ) {
2810  const IntrinsicType a1( A.get(i ,k) );
2811  const IntrinsicType a2( A.get(i+IT::size ,k) );
2812  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
2813  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
2814  const IntrinsicType b1( set( B(k,j ) ) );
2815  const IntrinsicType b2( set( B(k,j+1UL) ) );
2816  xmm1 = xmm1 + a1 * b1;
2817  xmm2 = xmm2 + a2 * b1;
2818  xmm3 = xmm3 + a3 * b1;
2819  xmm4 = xmm4 + a4 * b1;
2820  xmm5 = xmm5 + a1 * b2;
2821  xmm6 = xmm6 + a2 * b2;
2822  xmm7 = xmm7 + a3 * b2;
2823  xmm8 = xmm8 + a4 * b2;
2824  }
2825  store( &(~C)(i ,j ), xmm1 * factor );
2826  store( &(~C)(i+IT::size ,j ), xmm2 * factor );
2827  store( &(~C)(i+IT::size*2UL,j ), xmm3 * factor );
2828  store( &(~C)(i+IT::size*3UL,j ), xmm4 * factor );
2829  store( &(~C)(i ,j+1UL), xmm5 * factor );
2830  store( &(~C)(i+IT::size ,j+1UL), xmm6 * factor );
2831  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 * factor );
2832  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 * factor );
2833  }
      // A single column is left over when N is odd
2834  if( j < N ) {
2835  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2836  for( size_t k=0UL; k<K; ++k ) {
2837  const IntrinsicType b1( set( B(k,j) ) );
2838  xmm1 = xmm1 + A.get(i ,k) * b1;
2839  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2840  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2841  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2842  }
2843  store( &(~C)(i ,j), xmm1 * factor );
2844  store( &(~C)(i+IT::size ,j), xmm2 * factor );
2845  store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2846  store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2847  }
2848  }
      // Tier 3: two packed row chunks per step, columns of C processed in pairs
2849  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2850  size_t j( 0UL );
2851  for( ; (j+2UL) <= N; j+=2UL ) {
         // xmm1/xmm2 belong to column j, xmm3/xmm4 to column j+1
2852  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2853  for( size_t k=0UL; k<K; ++k ) {
2854  const IntrinsicType a1( A.get(i ,k) );
2855  const IntrinsicType a2( A.get(i+IT::size,k) );
2856  const IntrinsicType b1( set( B(k,j ) ) );
2857  const IntrinsicType b2( set( B(k,j+1UL) ) );
2858  xmm1 = xmm1 + a1 * b1;
2859  xmm2 = xmm2 + a2 * b1;
2860  xmm3 = xmm3 + a1 * b2;
2861  xmm4 = xmm4 + a2 * b2;
2862  }
2863  store( &(~C)(i ,j ), xmm1 * factor );
2864  store( &(~C)(i+IT::size,j ), xmm2 * factor );
2865  store( &(~C)(i ,j+1UL), xmm3 * factor );
2866  store( &(~C)(i+IT::size,j+1UL), xmm4 * factor );
2867  }
      // A single column is left over when N is odd
2868  if( j < N ) {
2869  IntrinsicType xmm1, xmm2;
2870  for( size_t k=0UL; k<K; ++k ) {
2871  const IntrinsicType b1( set( B(k,j) ) );
2872  xmm1 = xmm1 + A.get(i ,k) * b1;
2873  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2874  }
2875  store( &(~C)(i ,j), xmm1 * factor );
2876  store( &(~C)(i+IT::size,j), xmm2 * factor );
2877  }
2878  }
      // Final tier: fewer than 2*IT::size rows remain; exactly one packed chunk at row i is processed
      // NOTE(review): presumably M is a multiple of IT::size so this chunk is complete -- confirm
2879  if( i < M ) {
2880  size_t j( 0UL );
2881  for( ; (j+2UL) <= N; j+=2UL ) {
2882  IntrinsicType xmm1, xmm2;
2883  for( size_t k=0UL; k<K; ++k ) {
2884  const IntrinsicType a1( A.get(i,k) );
2885  xmm1 = xmm1 + a1 * set( B(k,j ) );
2886  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
2887  }
2888  store( &(~C)(i,j ), xmm1 * factor );
2889  store( &(~C)(i,j+1UL), xmm2 * factor );
2890  }
      // A single column is left over when N is odd
2891  if( j < N ) {
2892  IntrinsicType xmm1;
2893  for( size_t k=0UL; k<K; ++k ) {
2894  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
2895  }
2896  store( &(~C)(i,j), xmm1 * factor );
2897  }
2898  }
2899  }
2900  //**********************************************************************************************
2901 
2902  //**BLAS-based assignment to dense matrices (default)*******************************************
2916  template< typename MT3 // Type of the left-hand side target matrix
2917  , typename MT4 // Type of the left-hand side matrix operand
2918  , typename MT5 // Type of the right-hand side matrix operand
2919  , typename ST2 > // Type of the scalar value
2920  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2921  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2922  {
2923  selectDefaultAssignKernel( C, A, B, scalar );
2924  }
2925  //**********************************************************************************************
2926 
2927  //**BLAS-based assignment to dense matrices (single precision)**********************************
2928 #if BLAZE_BLAS_MODE
2929 
2942  template< typename MT3 // Type of the left-hand side target matrix
2943  , typename MT4 // Type of the left-hand side matrix operand
2944  , typename MT5 // Type of the right-hand side matrix operand
2945  , typename ST2 > // Type of the scalar value
2946  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2947  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2948  {
2949  using boost::numeric_cast;
2950 
2951  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
2952  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
2953  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
2954 
2955  const int M ( numeric_cast<int>( A.rows() ) );
2956  const int N ( numeric_cast<int>( B.columns() ) );
2957  const int K ( numeric_cast<int>( A.columns() ) );
2958  const int lda( numeric_cast<int>( A.spacing() ) );
2959  const int ldb( numeric_cast<int>( B.spacing() ) );
2960  const int ldc( numeric_cast<int>( C.spacing() ) );
2961 
2962  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2963  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2964  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2965  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
2966  }
2967 #endif
2968  //**********************************************************************************************
2969 
2970  //**BLAS-based assignment to dense matrices (double precision)**********************************
2971 #if BLAZE_BLAS_MODE
2972 
2985  template< typename MT3 // Type of the left-hand side target matrix
2986  , typename MT4 // Type of the left-hand side matrix operand
2987  , typename MT5 // Type of the right-hand side matrix operand
2988  , typename ST2 > // Type of the scalar value
2989  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2990  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2991  {
2992  using boost::numeric_cast;
2993 
2994  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
2995  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
2996  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
2997 
2998  const int M ( numeric_cast<int>( A.rows() ) );
2999  const int N ( numeric_cast<int>( B.columns() ) );
3000  const int K ( numeric_cast<int>( A.columns() ) );
3001  const int lda( numeric_cast<int>( A.spacing() ) );
3002  const int ldb( numeric_cast<int>( B.spacing() ) );
3003  const int ldc( numeric_cast<int>( C.spacing() ) );
3004 
3005  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3006  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3007  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3008  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
3009  }
3010 #endif
3011  //**********************************************************************************************
3012 
3013  //**BLAS-based assignment to dense matrices (single precision complex)**************************
3014 #if BLAZE_BLAS_MODE
3015 
3028  template< typename MT3 // Type of the left-hand side target matrix
3029  , typename MT4 // Type of the left-hand side matrix operand
3030  , typename MT5 // Type of the right-hand side matrix operand
3031  , typename ST2 > // Type of the scalar value
3032  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3033  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3034  {
3035  using boost::numeric_cast;
3036 
3037  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3038  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3039  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3041  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3042  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3043  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3044 
3045  const int M ( numeric_cast<int>( A.rows() ) );
3046  const int N ( numeric_cast<int>( B.columns() ) );
3047  const int K ( numeric_cast<int>( A.columns() ) );
3048  const int lda( numeric_cast<int>( A.spacing() ) );
3049  const int ldb( numeric_cast<int>( B.spacing() ) );
3050  const int ldc( numeric_cast<int>( C.spacing() ) );
3051  const complex<float> alpha( scalar );
3052  const complex<float> beta ( 0.0F, 0.0F );
3053 
3054  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3055  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3056  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3057  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3058  }
3059 #endif
3060  //**********************************************************************************************
3061 
3062  //**BLAS-based assignment to dense matrices (double precision complex)**************************
3063 #if BLAZE_BLAS_MODE
3064 
3077  template< typename MT3 // Type of the left-hand side target matrix
3078  , typename MT4 // Type of the left-hand side matrix operand
3079  , typename MT5 // Type of the right-hand side matrix operand
3080  , typename ST2 > // Type of the scalar value
3081  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3082  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3083  {
3084  using boost::numeric_cast;
3085 
3086  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3087  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3088  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3090  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3091  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3092  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3093 
3094  const int M ( numeric_cast<int>( A.rows() ) );
3095  const int N ( numeric_cast<int>( B.columns() ) );
3096  const int K ( numeric_cast<int>( A.columns() ) );
3097  const int lda( numeric_cast<int>( A.spacing() ) );
3098  const int ldb( numeric_cast<int>( B.spacing() ) );
3099  const int ldc( numeric_cast<int>( C.spacing() ) );
3100  const complex<double> alpha( scalar );
3101  const complex<double> beta ( 0.0, 0.0 );
3102 
3103  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3104  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3105  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3106  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3107  }
3108 #endif
3109  //**********************************************************************************************
3110 
3111  //**Assignment to sparse matrices***************************************************************
3123  template< typename MT // Type of the target sparse matrix
3124  , bool SO > // Storage order of the target sparse matrix
3125  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
3126  {
3128 
3129  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3130 
3136  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
3137 
3138  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3139  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3140 
3141  const TmpType tmp( rhs );
3142  assign( ~lhs, tmp );
3143  }
3144  //**********************************************************************************************
3145 
3146  //**Addition assignment to dense matrices*******************************************************
3158  template< typename MT3 // Type of the target dense matrix
3159  , bool SO > // Storage order of the target dense matrix
3160  friend inline void addAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3161  {
3163 
3164  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3165  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3166 
3167  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3168  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3169 
3170  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3171  return;
3172  }
3173 
3174  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3175  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3176 
3177  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3178  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3179  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3180  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3181  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3182  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3183 
3184  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
3185  DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3186  else
3187  DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3188  }
3189  //**********************************************************************************************
3190 
3191  //**Default addition assignment to dense matrices***********************************************
3205  template< typename MT3 // Type of the left-hand side target matrix
3206  , typename MT4 // Type of the left-hand side matrix operand
3207  , typename MT5 // Type of the right-hand side matrix operand
3208  , typename ST2 > // Type of the scalar value
3209  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3210  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3211  {
3212  const ResultType tmp( A * B * scalar );
3213  addAssign( C, tmp );
3214  }
3215  //**********************************************************************************************
3216 
3217  //**Vectorized default addition assignment to row-major dense matrices**************************
/*!\brief Vectorized default addition assignment kernel for a row-major target (C += (A*B)*scalar).
//
// \param C The row-major target dense matrix; every updated position is read with load() and written back with store().
// \param A The left-hand side multiplication operand; read element-wise via A(i,k) and broadcast with set().
// \param B The right-hand side multiplication operand; read as packed values via B.get(k,j).
// \param scalar The scaling factor; broadcast once and multiplied onto every accumulated result.
// \return void
//
// The column index j is advanced in steps of 8, 4, 2 and finally 1 times IT::size. For every
// tile the products are accumulated over k in IntrinsicType variables, scaled by the
// broadcast factor and added to the values currently held in C.
// NOTE(review): the accumulators are default-constructed IntrinsicType objects; this
// presumably zero-initializes them -- confirm against the IntrinsicType definition.
// NOTE(review): N is taken from B.spacing() rather than B.columns(); presumably this relies
// on padded storage in both B and C -- confirm.
*/
3231  template< typename MT3 // Type of the left-hand side target matrix
3232  , typename MT4 // Type of the left-hand side matrix operand
3233  , typename MT5 // Type of the right-hand side matrix operand
3234  , typename ST2 > // Type of the scalar value
3235  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3236  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3237  {
3238  typedef IntrinsicTrait<ElementType> IT;
3239 
3240  const size_t M( A.rows() );
3241  const size_t N( B.spacing() );
3242  const size_t K( A.columns() );
3243 
      // Broadcast of the scalar; applied once per accumulator right before the update of C
3244  const IntrinsicType factor( set( scalar ) );
3245 
      // Column position; shared by all tiers below so each tier continues where the previous stopped
3246  size_t j( 0UL );
3247 
      // Tier 1: eight packed column chunks per step, one row of C at a time
3248  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3249  for( size_t i=0UL; i<M; ++i ) {
3250  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3251  for( size_t k=0UL; k<K; ++k ) {
3252  const IntrinsicType a1( set( A(i,k) ) );
3253  xmm1 = xmm1 + a1 * B.get(k,j );
3254  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3255  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3256  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3257  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3258  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3259  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3260  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3261  }
3262  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3263  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3264  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3265  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
3266  store( &(~C)(i,j+IT::size*4UL), load( &(~C)(i,j+IT::size*4UL) ) + xmm5 * factor );
3267  store( &(~C)(i,j+IT::size*5UL), load( &(~C)(i,j+IT::size*5UL) ) + xmm6 * factor );
3268  store( &(~C)(i,j+IT::size*6UL), load( &(~C)(i,j+IT::size*6UL) ) + xmm7 * factor );
3269  store( &(~C)(i,j+IT::size*7UL), load( &(~C)(i,j+IT::size*7UL) ) + xmm8 * factor );
3270  }
3271  }
      // Tier 2: four packed column chunks per step, rows of C processed in pairs
3272  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3273  size_t i( 0UL );
3274  for( ; (i+2UL) <= M; i+=2UL ) {
         // xmm1-xmm4 belong to row i, xmm5-xmm8 to row i+1
3275  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3276  for( size_t k=0UL; k<K; ++k ) {
3277  const IntrinsicType a1( set( A(i ,k) ) );
3278  const IntrinsicType a2( set( A(i+1UL,k) ) );
3279  const IntrinsicType b1( B.get(k,j ) );
3280  const IntrinsicType b2( B.get(k,j+IT::size ) );
3281  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
3282  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
3283  xmm1 = xmm1 + a1 * b1;
3284  xmm2 = xmm2 + a1 * b2;
3285  xmm3 = xmm3 + a1 * b3;
3286  xmm4 = xmm4 + a1 * b4;
3287  xmm5 = xmm5 + a2 * b1;
3288  xmm6 = xmm6 + a2 * b2;
3289  xmm7 = xmm7 + a2 * b3;
3290  xmm8 = xmm8 + a2 * b4;
3291  }
3292  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3293  store( &(~C)(i ,j+IT::size ), load( &(~C)(i ,j+IT::size ) ) + xmm2 * factor );
3294  store( &(~C)(i ,j+IT::size*2UL), load( &(~C)(i ,j+IT::size*2UL) ) + xmm3 * factor );
3295  store( &(~C)(i ,j+IT::size*3UL), load( &(~C)(i ,j+IT::size*3UL) ) + xmm4 * factor );
3296  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) + xmm5 * factor );
3297  store( &(~C)(i+1UL,j+IT::size ), load( &(~C)(i+1UL,j+IT::size ) ) + xmm6 * factor );
3298  store( &(~C)(i+1UL,j+IT::size*2UL), load( &(~C)(i+1UL,j+IT::size*2UL) ) + xmm7 * factor );
3299  store( &(~C)(i+1UL,j+IT::size*3UL), load( &(~C)(i+1UL,j+IT::size*3UL) ) + xmm8 * factor );
3300  }
      // A single row is left over when M is odd
3301  if( i < M ) {
3302  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3303  for( size_t k=0UL; k<K; ++k ) {
3304  const IntrinsicType a1( set( A(i,k) ) );
3305  xmm1 = xmm1 + a1 * B.get(k,j );
3306  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3307  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3308  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3309  }
3310  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3311  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3312  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3313  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
3314  }
3315  }
      // Tier 3: two packed column chunks per step, rows of C processed in pairs
3316  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3317  size_t i( 0UL );
3318  for( ; (i+2UL) <= M; i+=2UL ) {
         // xmm1/xmm2 belong to row i, xmm3/xmm4 to row i+1
3319  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3320  for( size_t k=0UL; k<K; ++k ) {
3321  const IntrinsicType a1( set( A(i ,k) ) );
3322  const IntrinsicType a2( set( A(i+1UL,k) ) );
3323  const IntrinsicType b1( B.get(k,j ) );
3324  const IntrinsicType b2( B.get(k,j+IT::size) );
3325  xmm1 = xmm1 + a1 * b1;
3326  xmm2 = xmm2 + a1 * b2;
3327  xmm3 = xmm3 + a2 * b1;
3328  xmm4 = xmm4 + a2 * b2;
3329  }
3330  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3331  store( &(~C)(i ,j+IT::size), load( &(~C)(i ,j+IT::size) ) + xmm2 * factor );
3332  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) + xmm3 * factor );
3333  store( &(~C)(i+1UL,j+IT::size), load( &(~C)(i+1UL,j+IT::size) ) + xmm4 * factor );
3334  }
      // A single row is left over when M is odd
3335  if( i < M ) {
3336  IntrinsicType xmm1, xmm2;
3337  for( size_t k=0UL; k<K; ++k ) {
3338  const IntrinsicType a1( set( A(i,k) ) );
3339  xmm1 = xmm1 + a1 * B.get(k,j );
3340  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3341  }
3342  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3343  store( &(~C)(i,j+IT::size), load( &(~C)(i,j+IT::size) ) + xmm2 * factor );
3344  }
3345  }
      // Final tier: fewer than 2*IT::size columns remain; exactly one packed chunk at column j is processed
      // NOTE(review): presumably N is a multiple of IT::size so this chunk is complete -- confirm
3346  if( j < N ) {
3347  size_t i( 0UL );
3348  for( ; (i+2UL) <= M; i+=2UL ) {
3349  IntrinsicType xmm1, xmm2;
3350  for( size_t k=0UL; k<K; ++k ) {
3351  const IntrinsicType b1( B.get(k,j) );
3352  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3353  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3354  }
3355  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3356  store( &(~C)(i+1UL,j), load( &(~C)(i+1UL,j) ) + xmm2 * factor );
3357  }
      // A single row is left over when M is odd
3358  if( i < M ) {
3359  IntrinsicType xmm1;
3360  for( size_t k=0UL; k<K; ++k ) {
3361  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
3362  }
3363  store( &(~C)(i,j), load( &(~C)(i,j) ) + xmm1 * factor );
3364  }
3365  }
3366  }
3367  //**********************************************************************************************
3368 
3369  //**Vectorized default addition assignment to column-major dense matrices***********************
/*!\brief Vectorized default addition assignment kernel for a column-major target (C += (A*B)*scalar).
//
// \param C The column-major target dense matrix; every updated position is read with load() and written back with store().
// \param A The left-hand side multiplication operand; read as packed values via A.get(i,k).
// \param B The right-hand side multiplication operand; read element-wise via B(k,j) and broadcast with set().
// \param scalar The scaling factor; broadcast once and multiplied onto every accumulated result.
// \return void
//
// The row index i is advanced in steps of 8, 4, 2 and finally 1 times IT::size. For every
// tile the products are accumulated over k in IntrinsicType variables, scaled by the
// broadcast factor and added to the values currently held in C.
// NOTE(review): the accumulators are default-constructed IntrinsicType objects; this
// presumably zero-initializes them -- confirm against the IntrinsicType definition.
// NOTE(review): M is taken from A.spacing() rather than A.rows(); presumably this relies on
// padded storage in both A and C -- confirm.
*/
3383  template< typename MT3 // Type of the left-hand side target matrix
3384  , typename MT4 // Type of the left-hand side matrix operand
3385  , typename MT5 // Type of the right-hand side matrix operand
3386  , typename ST2 > // Type of the scalar value
3387  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3388  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3389  {
3390  typedef IntrinsicTrait<ElementType> IT;
3391 
3392  const size_t M( A.spacing() );
3393  const size_t N( B.columns() );
3394  const size_t K( A.columns() );
3395 
      // Broadcast of the scalar; applied once per accumulator right before the update of C
3396  const IntrinsicType factor( set( scalar ) );
3397 
      // Row position; shared by all tiers below so each tier continues where the previous stopped
3398  size_t i( 0UL );
3399 
      // Tier 1: eight packed row chunks per step, one column of C at a time
3400  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3401  for( size_t j=0UL; j<N; ++j ) {
3402  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3403  for( size_t k=0UL; k<K; ++k ) {
3404  const IntrinsicType b1( set( B(k,j) ) );
3405  xmm1 = xmm1 + A.get(i ,k) * b1;
3406  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3407  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3408  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3409  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3410  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3411  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3412  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
3413  }
3414  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3415  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3416  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3417  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
3418  store( &(~C)(i+IT::size*4UL,j), load( &(~C)(i+IT::size*4UL,j) ) + xmm5 * factor );
3419  store( &(~C)(i+IT::size*5UL,j), load( &(~C)(i+IT::size*5UL,j) ) + xmm6 * factor );
3420  store( &(~C)(i+IT::size*6UL,j), load( &(~C)(i+IT::size*6UL,j) ) + xmm7 * factor );
3421  store( &(~C)(i+IT::size*7UL,j), load( &(~C)(i+IT::size*7UL,j) ) + xmm8 * factor );
3422  }
3423  }
      // Tier 2: four packed row chunks per step, columns of C processed in pairs
3424  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3425  size_t j( 0UL );
3426  for( ; (j+2UL) <= N; j+=2UL ) {
         // xmm1-xmm4 belong to column j, xmm5-xmm8 to column j+1
3427  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3428  for( size_t k=0UL; k<K; ++k ) {
3429  const IntrinsicType a1( A.get(i ,k) );
3430  const IntrinsicType a2( A.get(i+IT::size ,k) );
3431  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
3432  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
3433  const IntrinsicType b1( set( B(k,j ) ) );
3434  const IntrinsicType b2( set( B(k,j+1UL) ) );
3435  xmm1 = xmm1 + a1 * b1;
3436  xmm2 = xmm2 + a2 * b1;
3437  xmm3 = xmm3 + a3 * b1;
3438  xmm4 = xmm4 + a4 * b1;
3439  xmm5 = xmm5 + a1 * b2;
3440  xmm6 = xmm6 + a2 * b2;
3441  xmm7 = xmm7 + a3 * b2;
3442  xmm8 = xmm8 + a4 * b2;
3443  }
3444  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3445  store( &(~C)(i+IT::size ,j ), load( &(~C)(i+IT::size ,j ) ) + xmm2 * factor );
3446  store( &(~C)(i+IT::size*2UL,j ), load( &(~C)(i+IT::size*2UL,j ) ) + xmm3 * factor );
3447  store( &(~C)(i+IT::size*3UL,j ), load( &(~C)(i+IT::size*3UL,j ) ) + xmm4 * factor );
3448  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) + xmm5 * factor );
3449  store( &(~C)(i+IT::size ,j+1UL), load( &(~C)(i+IT::size ,j+1UL) ) + xmm6 * factor );
3450  store( &(~C)(i+IT::size*2UL,j+1UL), load( &(~C)(i+IT::size*2UL,j+1UL) ) + xmm7 * factor );
3451  store( &(~C)(i+IT::size*3UL,j+1UL), load( &(~C)(i+IT::size*3UL,j+1UL) ) + xmm8 * factor );
3452  }
      // A single column is left over when N is odd
3453  if( j < N ) {
3454  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3455  for( size_t k=0UL; k<K; ++k ) {
3456  const IntrinsicType b1( set( B(k,j) ) );
3457  xmm1 = xmm1 + A.get(i ,k) * b1;
3458  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3459  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3460  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3461  }
3462  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3463  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3464  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3465  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
3466  }
3467  }
      // Tier 3: two packed row chunks per step, columns of C processed in pairs
3468  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
3469  size_t j( 0UL );
3470  for( ; (j+2UL) <= N; j+=2UL ) {
         // xmm1/xmm2 belong to column j, xmm3/xmm4 to column j+1
3471  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3472  for( size_t k=0UL; k<K; ++k ) {
3473  const IntrinsicType a1( A.get(i ,k) );
3474  const IntrinsicType a2( A.get(i+IT::size,k) );
3475  const IntrinsicType b1( set( B(k,j ) ) );
3476  const IntrinsicType b2( set( B(k,j+1UL) ) );
3477  xmm1 = xmm1 + a1 * b1;
3478  xmm2 = xmm2 + a2 * b1;
3479  xmm3 = xmm3 + a1 * b2;
3480  xmm4 = xmm4 + a2 * b2;
3481  }
3482  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3483  store( &(~C)(i+IT::size,j ), load( &(~C)(i+IT::size,j ) ) + xmm2 * factor );
3484  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) + xmm3 * factor );
3485  store( &(~C)(i+IT::size,j+1UL), load( &(~C)(i+IT::size,j+1UL) ) + xmm4 * factor );
3486  }
      // A single column is left over when N is odd
3487  if( j < N ) {
3488  IntrinsicType xmm1, xmm2;
3489  for( size_t k=0UL; k<K; ++k ) {
3490  const IntrinsicType b1( set( B(k,j) ) );
3491  xmm1 = xmm1 + A.get(i ,k) * b1;
3492  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
3493  }
3494  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3495  store( &(~C)(i+IT::size,j), load( &(~C)(i+IT::size,j) ) + xmm2 * factor );
3496  }
3497  }
      // Final tier: fewer than 2*IT::size rows remain; exactly one packed chunk at row i is processed
      // NOTE(review): presumably M is a multiple of IT::size so this chunk is complete -- confirm
3498  if( i < M ) {
3499  size_t j( 0UL );
3500  for( ; (j+2UL) <= N; j+=2UL ) {
3501  IntrinsicType xmm1, xmm2;
3502  for( size_t k=0UL; k<K; ++k ) {
3503  const IntrinsicType a1( A.get(i,k) );
3504  xmm1 = xmm1 + a1 * set( B(k,j ) );
3505  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3506  }
3507  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3508  store( &(~C)(i,j+1UL), load( &(~C)(i,j+1UL) ) + xmm2 * factor );
3509  }
      // A single column is left over when N is odd
3510  if( j < N ) {
3511  IntrinsicType xmm1;
3512  for( size_t k=0UL; k<K; ++k ) {
3513  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
3514  }
3515  store( &(~C)(i,j), load( &(~C)(i,j) ) + xmm1 * factor );
3516  }
3517  }
3518  }
3519  //**********************************************************************************************
3520 
3521  //**BLAS-based addition assignment to dense matrices (default)**********************************
3535  template< typename MT3 // Type of the left-hand side target matrix
3536  , typename MT4 // Type of the left-hand side matrix operand
3537  , typename MT5 // Type of the right-hand side matrix operand
3538  , typename ST2 > // Type of the scalar value
3539  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3540  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3541  {
3542  selectDefaultAddAssignKernel( C, A, B, scalar );
3543  }
3544  //**********************************************************************************************
3545 
3546  //**BLAS-based addition assignment to dense matrices (single precision)*************************
3547 #if BLAZE_BLAS_MODE
3548 
3561  template< typename MT3 // Type of the left-hand side target matrix
3562  , typename MT4 // Type of the left-hand side matrix operand
3563  , typename MT5 // Type of the right-hand side matrix operand
3564  , typename ST2 > // Type of the scalar value
3565  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3566  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3567  {
3568  using boost::numeric_cast;
3569 
3570  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
3571  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
3572  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
3573 
3574  const int M ( numeric_cast<int>( A.rows() ) );
3575  const int N ( numeric_cast<int>( B.columns() ) );
3576  const int K ( numeric_cast<int>( A.columns() ) );
3577  const int lda( numeric_cast<int>( A.spacing() ) );
3578  const int ldb( numeric_cast<int>( B.spacing() ) );
3579  const int ldc( numeric_cast<int>( C.spacing() ) );
3580 
3581  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3582  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3583  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3584  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3585  }
3586 #endif
3587  //**********************************************************************************************
3588 
3589  //**BLAS-based addition assignment to dense matrices (double precision)*************************
3590 #if BLAZE_BLAS_MODE
3591 
// BLAS-based addition assignment kernel for double precision operands.
// Enabled through EnableIf when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds; the
// constraints at the top of the body require 'double' elements for C, A and B.
//   C      : target matrix that is updated in place
//   A, B   : left-hand side and right-hand side matrix operands
//   scalar : scaling factor, handed to cblas_dgemm as 'alpha'
// The call passes beta = 1.0, i.e. the scaled product is accumulated onto the current
// content of C (standard GEMM semantics: C = alpha*op(A)*op(B) + beta*C).
3604  template< typename MT3 // Type of the left-hand side target matrix
3605  , typename MT4 // Type of the left-hand side matrix operand
3606  , typename MT5 // Type of the right-hand side matrix operand
3607  , typename ST2 > // Type of the scalar value
3608  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3609  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3610  {
3611  using boost::numeric_cast;
3612 
3613  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
3614  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
3615  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
3616 
      // Sizes and leading dimensions (spacing()) are converted to 'int' for the CBLAS
      // interface via boost::numeric_cast.
3617  const int M ( numeric_cast<int>( A.rows() ) );
3618  const int N ( numeric_cast<int>( B.columns() ) );
3619  const int K ( numeric_cast<int>( A.columns() ) );
3620  const int lda( numeric_cast<int>( A.spacing() ) );
3621  const int ldb( numeric_cast<int>( B.spacing() ) );
3622  const int ldc( numeric_cast<int>( C.spacing() ) );
3623 
      // Order and transposition flags all depend on the storage order of the target C:
      // row-major target -> (RowMajor, Trans, NoTrans), otherwise (ColMajor, NoTrans, Trans).
3624  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3625  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3626  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3627  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3628  }
3629 #endif
3630  //**********************************************************************************************
3631 
3632  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
3633 #if BLAZE_BLAS_MODE
3634 
// BLAS-based addition assignment kernel for single precision complex operands.
// Enabled through EnableIf when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds (the
// scalar type ST2 is not part of the selection condition). The constraints require complex
// element types whose value_type is 'float' for C, A and B.
//   C      : target matrix that is updated in place
//   A, B   : left-hand side and right-hand side matrix operands
//   scalar : scaling factor, converted to complex<float> and passed as 'alpha'
// NOTE(review): listing line 3659 is not shown in this rendering -- check the original header.
3647  template< typename MT3 // Type of the left-hand side target matrix
3648  , typename MT4 // Type of the left-hand side matrix operand
3649  , typename MT5 // Type of the right-hand side matrix operand
3650  , typename ST2 > // Type of the scalar value
3651  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3652  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3653  {
3654  using boost::numeric_cast;
3655 
3656  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3657  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3658  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3660  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3661  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3662  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3663 
3664  const int M ( numeric_cast<int>( A.rows() ) );
3665  const int N ( numeric_cast<int>( B.columns() ) );
3666  const int K ( numeric_cast<int>( A.columns() ) );
3667  const int lda( numeric_cast<int>( A.spacing() ) );
3668  const int ldb( numeric_cast<int>( B.spacing() ) );
3669  const int ldc( numeric_cast<int>( C.spacing() ) );
      // The complex GEMM routine takes alpha and beta by address, hence the named locals.
      // beta = (1,0) accumulates the scaled product onto the current content of C.
3670  const complex<float> alpha( scalar );
3671  const complex<float> beta ( 1.0F, 0.0F );
3672 
3673  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3674  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3675  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3676  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3677  }
3678 #endif
3679  //**********************************************************************************************
3680 
3681  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
3682 #if BLAZE_BLAS_MODE
3683 
// BLAS-based addition assignment kernel for double precision complex operands.
// Enabled through EnableIf when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds (the
// scalar type ST2 is not part of the selection condition). The constraints require complex
// element types whose value_type is 'double' for C, A and B.
//   C      : target matrix that is updated in place
//   A, B   : left-hand side and right-hand side matrix operands
//   scalar : scaling factor, converted to complex<double> and passed as 'alpha'
// NOTE(review): listing line 3708 is not shown in this rendering -- check the original header.
3696  template< typename MT3 // Type of the left-hand side target matrix
3697  , typename MT4 // Type of the left-hand side matrix operand
3698  , typename MT5 // Type of the right-hand side matrix operand
3699  , typename ST2 > // Type of the scalar value
3700  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3701  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3702  {
3703  using boost::numeric_cast;
3704 
3705  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3706  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3707  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3709  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3710  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3711  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3712 
3713  const int M ( numeric_cast<int>( A.rows() ) );
3714  const int N ( numeric_cast<int>( B.columns() ) );
3715  const int K ( numeric_cast<int>( A.columns() ) );
3716  const int lda( numeric_cast<int>( A.spacing() ) );
3717  const int ldb( numeric_cast<int>( B.spacing() ) );
3718  const int ldc( numeric_cast<int>( C.spacing() ) );
      // The complex GEMM routine takes alpha and beta by address, hence the named locals.
      // beta = (1,0) accumulates the scaled product onto the current content of C.
3719  const complex<double> alpha( scalar );
3720  const complex<double> beta ( 1.0, 0.0 );
3721 
3722  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3723  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3724  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3725  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3726  }
3727 #endif
3728  //**********************************************************************************************
3729 
3730  //**Addition assignment to sparse matrices******************************************************
3731  // No special implementation for the addition assignment to sparse matrices.
3732  //**********************************************************************************************
3733 
3734  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a scaled matrix product to a dense matrix (lhs -= rhs).
//   lhs : target dense matrix of arbitrary storage order SO
//   rhs : scaled multiplication expression; its product operands are read from
//         rhs.matrix_ and its scaling factor from rhs.scalar_
// Returns without touching lhs when the target has no rows or columns, or when the inner
// dimension (left.columns()) is zero. Otherwise both operands are evaluated into objects of
// type LT/RT and the work is dispatched by target size: fewer than TDMATDMATMULT_THRESHOLD
// elements go to the default kernel, everything else to the BLAS kernel selection.
// NOTE(review): listing line 3750 (directly after the opening brace) is not shown in this
// rendering -- check the original header for the hidden statement.
3746  template< typename MT3 // Type of the target dense matrix
3747  , bool SO > // Storage order of the target dense matrix
3748  friend inline void subAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3749  {
3751 
3752  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3753  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3754 
3755  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3756  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3757 
      // Nothing to subtract for an empty target or an empty inner dimension.
3758  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3759  return;
3760  }
3761 
3762  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3763  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3764 
3765  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3766  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3767  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3768  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3769  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3770  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3771 
      // Size-based dispatch: small targets use the default kernel, larger ones the BLAS path.
3772  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
3773  DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3774  else
3775  DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3776  }
3777  //**********************************************************************************************
3778 
3779  //**Default subtraction assignment to dense matrices********************************************
// Default (non-vectorized) kernel for the subtraction assignment of a scaled matrix product.
// Chosen through DisableIf whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold.
//   C      : target matrix
//   A, B   : left-hand side and right-hand side matrix operands
//   scalar : scaling factor of the product
// The scaled product is first evaluated into a temporary of type ResultType, which is then
// subtracted from C via subAssign().
3793  template< typename MT3 // Type of the left-hand side target matrix
3794  , typename MT4 // Type of the left-hand side matrix operand
3795  , typename MT5 // Type of the right-hand side matrix operand
3796  , typename ST2 > // Type of the scalar value
3797  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3798  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3799  {
3800  const ResultType tmp( A * B * scalar );
3801  subAssign( C, tmp );
3802  }
3803  //**********************************************************************************************
3804 
3805  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default kernel for the subtraction assignment C -= (A * B) * scalar with a
// row-major target (DenseMatrix<MT3,false>). Enabled through EnableIf when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//   C      : row-major target matrix, accessed through (~C)(i,j)
//   A      : left-hand side operand, read element-wise and broadcast via set()
//   B      : right-hand side operand, read as intrinsic vectors via B.get(k,j)
//   scalar : scaling factor, broadcast once into 'factor'
// The columns are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements. In the 4-, 2- and 1-vector blocks the rows are handled in pairs with a
// single-row remainder. For each block the products are accumulated over k, multiplied by
// 'factor' and subtracted from the loaded values of C before being stored back.
// NOTE(review): N is taken from B.spacing() rather than B.columns(), so the column loop
// presumably also covers padding elements -- confirm against the operands' storage layout.
// NOTE(review): the xmm accumulators are only default-constructed; this assumes that
// IntrinsicType default-initializes to zero -- confirm.
3819  template< typename MT3 // Type of the left-hand side target matrix
3820  , typename MT4 // Type of the left-hand side matrix operand
3821  , typename MT5 // Type of the right-hand side matrix operand
3822  , typename ST2 > // Type of the scalar value
3823  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3824  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3825  {
3826  typedef IntrinsicTrait<ElementType> IT;
3827 
3828  const size_t M( A.rows() );
3829  const size_t N( B.spacing() );
3830  const size_t K( A.columns() );
3831 
3832  const IntrinsicType factor( set( scalar ) );
3833 
3834  size_t j( 0UL );
3835 
      // Column blocks of 8 intrinsic vectors, one row at a time.
3836  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3837  for( size_t i=0UL; i<M; ++i ) {
3838  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3839  for( size_t k=0UL; k<K; ++k ) {
3840  const IntrinsicType a1( set( A(i,k) ) );
3841  xmm1 = xmm1 + a1 * B.get(k,j );
3842  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3843  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3844  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3845  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3846  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3847  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3848  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3849  }
3850  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
3851  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3852  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3853  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3854  store( &(~C)(i,j+IT::size*4UL), load( &(~C)(i,j+IT::size*4UL) ) - xmm5 * factor );
3855  store( &(~C)(i,j+IT::size*5UL), load( &(~C)(i,j+IT::size*5UL) ) - xmm6 * factor );
3856  store( &(~C)(i,j+IT::size*6UL), load( &(~C)(i,j+IT::size*6UL) ) - xmm7 * factor );
3857  store( &(~C)(i,j+IT::size*7UL), load( &(~C)(i,j+IT::size*7UL) ) - xmm8 * factor );
3858  }
3859  }
      // Column blocks of 4 intrinsic vectors, two rows at a time (plus a single-row remainder).
3860  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3861  size_t i( 0UL );
3862  for( ; (i+2UL) <= M; i+=2UL ) {
3863  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3864  for( size_t k=0UL; k<K; ++k ) {
3865  const IntrinsicType a1( set( A(i ,k) ) );
3866  const IntrinsicType a2( set( A(i+1UL,k) ) );
3867  const IntrinsicType b1( B.get(k,j ) );
3868  const IntrinsicType b2( B.get(k,j+IT::size ) );
3869  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
3870  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
3871  xmm1 = xmm1 + a1 * b1;
3872  xmm2 = xmm2 + a1 * b2;
3873  xmm3 = xmm3 + a1 * b3;
3874  xmm4 = xmm4 + a1 * b4;
3875  xmm5 = xmm5 + a2 * b1;
3876  xmm6 = xmm6 + a2 * b2;
3877  xmm7 = xmm7 + a2 * b3;
3878  xmm8 = xmm8 + a2 * b4;
3879  }
3880  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
3881  store( &(~C)(i ,j+IT::size ), load( &(~C)(i ,j+IT::size ) ) - xmm2 * factor );
3882  store( &(~C)(i ,j+IT::size*2UL), load( &(~C)(i ,j+IT::size*2UL) ) - xmm3 * factor );
3883  store( &(~C)(i ,j+IT::size*3UL), load( &(~C)(i ,j+IT::size*3UL) ) - xmm4 * factor );
3884  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) - xmm5 * factor );
3885  store( &(~C)(i+1UL,j+IT::size ), load( &(~C)(i+1UL,j+IT::size ) ) - xmm6 * factor );
3886  store( &(~C)(i+1UL,j+IT::size*2UL), load( &(~C)(i+1UL,j+IT::size*2UL) ) - xmm7 * factor );
3887  store( &(~C)(i+1UL,j+IT::size*3UL), load( &(~C)(i+1UL,j+IT::size*3UL) ) - xmm8 * factor );
3888  }
3889  if( i < M ) {
3890  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3891  for( size_t k=0UL; k<K; ++k ) {
3892  const IntrinsicType a1( set( A(i,k) ) );
3893  xmm1 = xmm1 + a1 * B.get(k,j );
3894  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3895  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3896  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3897  }
3898  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
3899  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3900  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3901  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3902  }
3903  }
      // Column blocks of 2 intrinsic vectors, two rows at a time (plus a single-row remainder).
3904  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3905  size_t i( 0UL );
3906  for( ; (i+2UL) <= M; i+=2UL ) {
3907  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3908  for( size_t k=0UL; k<K; ++k ) {
3909  const IntrinsicType a1( set( A(i ,k) ) );
3910  const IntrinsicType a2( set( A(i+1UL,k) ) );
3911  const IntrinsicType b1( B.get(k,j ) );
3912  const IntrinsicType b2( B.get(k,j+IT::size) );
3913  xmm1 = xmm1 + a1 * b1;
3914  xmm2 = xmm2 + a1 * b2;
3915  xmm3 = xmm3 + a2 * b1;
3916  xmm4 = xmm4 + a2 * b2;
3917  }
3918  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
3919  store( &(~C)(i ,j+IT::size), load( &(~C)(i ,j+IT::size) ) - xmm2 * factor );
3920  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) - xmm3 * factor );
3921  store( &(~C)(i+1UL,j+IT::size), load( &(~C)(i+1UL,j+IT::size) ) - xmm4 * factor );
3922  }
3923  if( i < M ) {
3924  IntrinsicType xmm1, xmm2;
3925  for( size_t k=0UL; k<K; ++k ) {
3926  const IntrinsicType a1( set( A(i,k) ) );
3927  xmm1 = xmm1 + a1 * B.get(k,j );
3928  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3929  }
3930  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
3931  store( &(~C)(i,j+IT::size), load( &(~C)(i,j+IT::size) ) - xmm2 * factor );
3932  }
3933  }
      // Remaining single intrinsic vector of columns, again with paired rows and a remainder.
3934  if( j < N ) {
3935  size_t i( 0UL );
3936  for( ; (i+2UL) <= M; i+=2UL ) {
3937  IntrinsicType xmm1, xmm2;
3938  for( size_t k=0UL; k<K; ++k ) {
3939  const IntrinsicType b1( B.get(k,j) );
3940  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3941  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3942  }
3943  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
3944  store( &(~C)(i+1UL,j), load( &(~C)(i+1UL,j) ) - xmm2 * factor );
3945  }
3946  if( i < M ) {
3947  IntrinsicType xmm1;
3948  for( size_t k=0UL; k<K; ++k ) {
3949  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
3950  }
3951  store( &(~C)(i,j), load( &(~C)(i,j) ) - xmm1 * factor );
3952  }
3953  }
3954  }
3955  //**********************************************************************************************
3956 
3957  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default kernel for the subtraction assignment C -= (A * B) * scalar with a
// column-major target (DenseMatrix<MT3,true>). Enabled through EnableIf when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//   C      : column-major target matrix, accessed through (~C)(i,j)
//   A      : left-hand side operand, read as intrinsic vectors via A.get(i,k)
//   B      : right-hand side operand, read element-wise and broadcast via set()
//   scalar : scaling factor, broadcast once into 'factor'
// The rows are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of IT::size
// elements. In the 4-, 2- and 1-vector blocks the columns are handled in pairs with a
// single-column remainder. For each block the products are accumulated over k, multiplied
// by 'factor' and subtracted from the loaded values of C before being stored back.
// NOTE(review): M is taken from A.spacing() rather than A.rows(), so the row loop presumably
// also covers padding elements -- confirm against the operands' storage layout.
// NOTE(review): the xmm accumulators are only default-constructed; this assumes that
// IntrinsicType default-initializes to zero -- confirm.
3971  template< typename MT3 // Type of the left-hand side target matrix
3972  , typename MT4 // Type of the left-hand side matrix operand
3973  , typename MT5 // Type of the right-hand side matrix operand
3974  , typename ST2 > // Type of the scalar value
3975  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3976  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3977  {
3978  typedef IntrinsicTrait<ElementType> IT;
3979 
3980  const size_t M( A.spacing() );
3981  const size_t N( B.columns() );
3982  const size_t K( A.columns() );
3983 
3984  const IntrinsicType factor( set( scalar ) );
3985 
3986  size_t i( 0UL );
3987 
      // Row blocks of 8 intrinsic vectors, one column at a time.
3988  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3989  for( size_t j=0UL; j<N; ++j ) {
3990  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3991  for( size_t k=0UL; k<K; ++k ) {
3992  const IntrinsicType b1( set( B(k,j) ) );
3993  xmm1 = xmm1 + A.get(i ,k) * b1;
3994  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3995  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3996  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3997  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3998  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3999  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
4000  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
4001  }
4002  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
4003  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4004  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4005  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
4006  store( &(~C)(i+IT::size*4UL,j), load( &(~C)(i+IT::size*4UL,j) ) - xmm5 * factor );
4007  store( &(~C)(i+IT::size*5UL,j), load( &(~C)(i+IT::size*5UL,j) ) - xmm6 * factor );
4008  store( &(~C)(i+IT::size*6UL,j), load( &(~C)(i+IT::size*6UL,j) ) - xmm7 * factor );
4009  store( &(~C)(i+IT::size*7UL,j), load( &(~C)(i+IT::size*7UL,j) ) - xmm8 * factor );
4010  }
4011  }
      // Row blocks of 4 intrinsic vectors, two columns at a time (plus a single-column remainder).
4012  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
4013  size_t j( 0UL );
4014  for( ; (j+2UL) <= N; j+=2UL ) {
4015  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4016  for( size_t k=0UL; k<K; ++k ) {
4017  const IntrinsicType a1( A.get(i ,k) );
4018  const IntrinsicType a2( A.get(i+IT::size ,k) );
4019  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
4020  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
4021  const IntrinsicType b1( set( B(k,j ) ) );
4022  const IntrinsicType b2( set( B(k,j+1UL) ) );
4023  xmm1 = xmm1 + a1 * b1;
4024  xmm2 = xmm2 + a2 * b1;
4025  xmm3 = xmm3 + a3 * b1;
4026  xmm4 = xmm4 + a4 * b1;
4027  xmm5 = xmm5 + a1 * b2;
4028  xmm6 = xmm6 + a2 * b2;
4029  xmm7 = xmm7 + a3 * b2;
4030  xmm8 = xmm8 + a4 * b2;
4031  }
4032  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
4033  store( &(~C)(i+IT::size ,j ), load( &(~C)(i+IT::size ,j ) ) - xmm2 * factor );
4034  store( &(~C)(i+IT::size*2UL,j ), load( &(~C)(i+IT::size*2UL,j ) ) - xmm3 * factor );
4035  store( &(~C)(i+IT::size*3UL,j ), load( &(~C)(i+IT::size*3UL,j ) ) - xmm4 * factor );
4036  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) - xmm5 * factor );
4037  store( &(~C)(i+IT::size ,j+1UL), load( &(~C)(i+IT::size ,j+1UL) ) - xmm6 * factor );
4038  store( &(~C)(i+IT::size*2UL,j+1UL), load( &(~C)(i+IT::size*2UL,j+1UL) ) - xmm7 * factor );
4039  store( &(~C)(i+IT::size*3UL,j+1UL), load( &(~C)(i+IT::size*3UL,j+1UL) ) - xmm8 * factor );
4040  }
4041  if( j < N ) {
4042  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4043  for( size_t k=0UL; k<K; ++k ) {
4044  const IntrinsicType b1( set( B(k,j) ) );
4045  xmm1 = xmm1 + A.get(i ,k) * b1;
4046  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
4047  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
4048  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
4049  }
4050  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
4051  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4052  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4053  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
4054  }
4055  }
      // Row blocks of 2 intrinsic vectors, two columns at a time (plus a single-column remainder).
4056  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
4057  size_t j( 0UL );
4058  for( ; (j+2UL) <= N; j+=2UL ) {
4059  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4060  for( size_t k=0UL; k<K; ++k ) {
4061  const IntrinsicType a1( A.get(i ,k) );
4062  const IntrinsicType a2( A.get(i+IT::size,k) );
4063  const IntrinsicType b1( set( B(k,j ) ) );
4064  const IntrinsicType b2( set( B(k,j+1UL) ) );
4065  xmm1 = xmm1 + a1 * b1;
4066  xmm2 = xmm2 + a2 * b1;
4067  xmm3 = xmm3 + a1 * b2;
4068  xmm4 = xmm4 + a2 * b2;
4069  }
4070  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
4071  store( &(~C)(i+IT::size,j ), load( &(~C)(i+IT::size,j ) ) - xmm2 * factor );
4072  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) - xmm3 * factor );
4073  store( &(~C)(i+IT::size,j+1UL), load( &(~C)(i+IT::size,j+1UL) ) - xmm4 * factor );
4074  }
4075  if( j < N ) {
4076  IntrinsicType xmm1, xmm2;
4077  for( size_t k=0UL; k<K; ++k ) {
4078  const IntrinsicType b1( set( B(k,j) ) );
4079  xmm1 = xmm1 + A.get(i ,k) * b1;
4080  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
4081  }
4082  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
4083  store( &(~C)(i+IT::size,j), load( &(~C)(i+IT::size,j) ) - xmm2 * factor );
4084  }
4085  }
      // Remaining single intrinsic vector of rows, again with paired columns and a remainder.
4086  if( i < M ) {
4087  size_t j( 0UL );
4088  for( ; (j+2UL) <= N; j+=2UL ) {
4089  IntrinsicType xmm1, xmm2;
4090  for( size_t k=0UL; k<K; ++k ) {
4091  const IntrinsicType a1( A.get(i,k) );
4092  xmm1 = xmm1 + a1 * set( B(k,j ) );
4093  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
4094  }
4095  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
4096  store( &(~C)(i,j+1UL), load( &(~C)(i,j+1UL) ) - xmm2 * factor );
4097  }
4098  if( j < N ) {
4099  IntrinsicType xmm1;
4100  for( size_t k=0UL; k<K; ++k ) {
4101  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
4102  }
4103  store( &(~C)(i,j), load( &(~C)(i,j) ) - xmm1 * factor );
4104  }
4105  }
4106  }
4107  //**********************************************************************************************
4108 
4109  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback of the BLAS kernel selection for the subtraction assignment.
// Enabled through EnableIf when UseDefaultKernel<MT3,MT4,MT5,ST2> holds; it simply forwards
// all arguments to selectDefaultSubAssignKernel(), so no BLAS routine is involved.
//   C      : target matrix
//   A, B   : left-hand side and right-hand side matrix operands
//   scalar : scaling factor of the product
4123  template< typename MT3 // Type of the left-hand side target matrix
4124  , typename MT4 // Type of the left-hand side matrix operand
4125  , typename MT5 // Type of the right-hand side matrix operand
4126  , typename ST2 > // Type of the scalar value
4127  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4128  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4129  {
4130  selectDefaultSubAssignKernel( C, A, B, scalar );
4131  }
4132  //**********************************************************************************************
4133 
4134  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
4135 #if BLAZE_BLAS_MODE
4136 
// BLAS-based subtraction assignment kernel for single precision operands.
// Enabled through EnableIf when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds; the
// constraints at the top of the body require 'float' elements for C, A and B.
//   C      : target matrix that is updated in place
//   A, B   : left-hand side and right-hand side matrix operands
//   scalar : scaling factor; its negation is handed to cblas_sgemm as 'alpha'
// With alpha = -scalar and beta = 1.0F the GEMM call subtracts the scaled product from the
// current content of C (standard GEMM semantics: C = alpha*op(A)*op(B) + beta*C).
4149  template< typename MT3 // Type of the left-hand side target matrix
4150  , typename MT4 // Type of the left-hand side matrix operand
4151  , typename MT5 // Type of the right-hand side matrix operand
4152  , typename ST2 > // Type of the scalar value
4153  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4154  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4155  {
4156  using boost::numeric_cast;
4157 
4158  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
4159  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
4160  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
4161 
      // Sizes and leading dimensions (spacing()) are converted to 'int' for the CBLAS
      // interface via boost::numeric_cast.
4162  const int M ( numeric_cast<int>( A.rows() ) );
4163  const int N ( numeric_cast<int>( B.columns() ) );
4164  const int K ( numeric_cast<int>( A.columns() ) );
4165  const int lda( numeric_cast<int>( A.spacing() ) );
4166  const int ldb( numeric_cast<int>( B.spacing() ) );
4167  const int ldc( numeric_cast<int>( C.spacing() ) );
4168 
      // Order and transposition flags all depend on the storage order of the target C:
      // row-major target -> (RowMajor, Trans, NoTrans), otherwise (ColMajor, NoTrans, Trans).
4169  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4170  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4171  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4172  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
4173  }
4174 #endif
4175  //**********************************************************************************************
4176 
4177  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
4178 #if BLAZE_BLAS_MODE
4179 
// BLAS-based subtraction assignment kernel for double precision operands.
// Enabled through EnableIf when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds; the
// constraints at the top of the body require 'double' elements for C, A and B.
//   C      : target matrix that is updated in place
//   A, B   : left-hand side and right-hand side matrix operands
//   scalar : scaling factor; its negation is handed to cblas_dgemm as 'alpha'
// With alpha = -scalar and beta = 1.0 the GEMM call subtracts the scaled product from the
// current content of C (standard GEMM semantics: C = alpha*op(A)*op(B) + beta*C).
4192  template< typename MT3 // Type of the left-hand side target matrix
4193  , typename MT4 // Type of the left-hand side matrix operand
4194  , typename MT5 // Type of the right-hand side matrix operand
4195  , typename ST2 > // Type of the scalar value
4196  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4197  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4198  {
4199  using boost::numeric_cast;
4200 
4201  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
4202  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
4203  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
4204 
      // Sizes and leading dimensions (spacing()) are converted to 'int' for the CBLAS
      // interface via boost::numeric_cast.
4205  const int M ( numeric_cast<int>( A.rows() ) );
4206  const int N ( numeric_cast<int>( B.columns() ) );
4207  const int K ( numeric_cast<int>( A.columns() ) );
4208  const int lda( numeric_cast<int>( A.spacing() ) );
4209  const int ldb( numeric_cast<int>( B.spacing() ) );
4210  const int ldc( numeric_cast<int>( C.spacing() ) );
4211 
      // Order and transposition flags all depend on the storage order of the target C:
      // row-major target -> (RowMajor, Trans, NoTrans), otherwise (ColMajor, NoTrans, Trans).
4212  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4213  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4214  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4215  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
4216  }
4217 #endif
4218  //**********************************************************************************************
4219 
4220  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
4221 #if BLAZE_BLAS_MODE
4222 
// BLAS-based subtraction assignment kernel for single precision complex operands.
// Enabled through EnableIf when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds (the
// scalar type ST2 is not part of the selection condition). The constraints require complex
// element types whose value_type is 'float' for C, A and B.
//   C      : target matrix that is updated in place
//   A, B   : left-hand side and right-hand side matrix operands
//   scalar : scaling factor; its negation is converted to complex<float> and used as 'alpha'
// NOTE(review): listing line 4247 is not shown in this rendering -- check the original header.
4235  template< typename MT3 // Type of the left-hand side target matrix
4236  , typename MT4 // Type of the left-hand side matrix operand
4237  , typename MT5 // Type of the right-hand side matrix operand
4238  , typename ST2 > // Type of the scalar value
4239  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4240  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4241  {
4242  using boost::numeric_cast;
4243 
4244  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
4245  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
4246  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
4248  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
4249  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
4250  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
4251 
4252  const int M ( numeric_cast<int>( A.rows() ) );
4253  const int N ( numeric_cast<int>( B.columns() ) );
4254  const int K ( numeric_cast<int>( A.columns() ) );
4255  const int lda( numeric_cast<int>( A.spacing() ) );
4256  const int ldb( numeric_cast<int>( B.spacing() ) );
4257  const int ldc( numeric_cast<int>( C.spacing() ) );
      // alpha = -scalar turns the accumulating GEMM call into a subtraction; beta = (1,0)
      // keeps the current content of C. Both are passed by address to the complex routine.
4258  const complex<float> alpha( -scalar );
4259  const complex<float> beta ( 1.0F, 0.0F );
4260 
4261  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4262  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4263  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4264  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4265  }
4266 #endif
4267  //**********************************************************************************************
4268 
4269  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
4270 #if BLAZE_BLAS_MODE
4271 
// BLAS-based subtraction assignment kernel for double precision complex operands.
// Enabled through EnableIf when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds (the
// scalar type ST2 is not part of the selection condition). The constraints require complex
// element types whose value_type is 'double' for C, A and B.
//   C      : target matrix that is updated in place
//   A, B   : left-hand side and right-hand side matrix operands
//   scalar : scaling factor; its negation is converted to complex<double> and used as 'alpha'
// NOTE(review): listing line 4296 is not shown in this rendering -- check the original header.
4284  template< typename MT3 // Type of the left-hand side target matrix
4285  , typename MT4 // Type of the left-hand side matrix operand
4286  , typename MT5 // Type of the right-hand side matrix operand
4287  , typename ST2 > // Type of the scalar value
4288  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4289  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4290  {
4291  using boost::numeric_cast;
4292 
4293  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
4294  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
4295  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
4297  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
4298  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
4299  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
4300 
4301  const int M ( numeric_cast<int>( A.rows() ) );
4302  const int N ( numeric_cast<int>( B.columns() ) );
4303  const int K ( numeric_cast<int>( A.columns() ) );
4304  const int lda( numeric_cast<int>( A.spacing() ) );
4305  const int ldb( numeric_cast<int>( B.spacing() ) );
4306  const int ldc( numeric_cast<int>( C.spacing() ) );
      // alpha = -scalar turns the accumulating GEMM call into a subtraction; beta = (1,0)
      // keeps the current content of C. Both are passed by address to the complex routine.
4307  const complex<double> alpha( -scalar );
4308  const complex<double> beta ( 1.0, 0.0 );
4309 
4310  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4311  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4312  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4313  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4314  }
4315 #endif
4316  //**********************************************************************************************
4317 
4318  //**Subtraction assignment to sparse matrices***************************************************
4319  // No special implementation for the subtraction assignment to sparse matrices.
4320  //**********************************************************************************************
4321 
4322  //**Multiplication assignment to dense matrices*************************************************
4323  // No special implementation for the multiplication assignment to dense matrices.
4324  //**********************************************************************************************
4325 
4326  //**Multiplication assignment to sparse matrices************************************************
4327  // No special implementation for the multiplication assignment to sparse matrices.
4328  //**********************************************************************************************
4329 
4330  //**Compile time checks*************************************************************************
4338  //**********************************************************************************************
4339 };
4341 //*************************************************************************************************
4342 
4343 
4344 
4345 
4346 //=================================================================================================
4347 //
4348 // GLOBAL BINARY ARITHMETIC OPERATORS
4349 //
4350 //=================================================================================================
4351 
4352 //*************************************************************************************************
// Global multiplication operator that builds a TDMatDMatMultExpr<T1,T2> from the two
// dense matrix operands (~lhs, ~rhs).
// Throws std::invalid_argument("Matrix sizes do not match") when the number of columns of
// the left-hand side operand differs from the number of rows of the right-hand side operand.
// NOTE(review): this listing omits line 4384, which should carry the function name and the
// parameter list (presumably an operator* taking 'lhs' and 'rhs'), and line 4386 -- restore
// both from the original header before using this text as source.
4381 template< typename T1 // Type of the left-hand side dense matrix
4382  , typename T2 > // Type of the right-hand side dense matrix
4383 inline const TDMatDMatMultExpr<T1,T2>
4385 {
4387 
4388  if( (~lhs).columns() != (~rhs).rows() )
4389  throw std::invalid_argument( "Matrix sizes do not match" );
4390 
4391  return TDMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
4392 }
4393 //*************************************************************************************************
4394 
4395 
4396 
4397 
4398 //=================================================================================================
4399 //
4400 // GLOBAL OPERATORS
4401 //
4402 //=================================================================================================
4403 
4404 //*************************************************************************************************
// Creates a view on a single row of the given matrix/matrix multiplication expression.
//   dm    : the multiplication expression
//   index : index of the requested row
// Instead of evaluating the whole product, the row is taken from the left operand and
// multiplied with the right operand: row( A*B, i ) == row( A, i ) * B.
// NOTE(review): listing line 4422 (directly after the opening brace) is not shown here.
4417 template< typename MT1 // Type of the left-hand side dense matrix
4418  , typename MT2 > // Type of the right-hand side dense matrix
4419 inline typename RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >::Type
4420  row( const TDMatDMatMultExpr<MT1,MT2>& dm, size_t index )
4421 {
4423 
4424  return row( dm.leftOperand(), index ) * dm.rightOperand();
4425 }
4427 //*************************************************************************************************
4428 
4429 
4430 //*************************************************************************************************
// Creates a view on a single column of the given matrix/matrix multiplication expression.
//   dm    : the multiplication expression
//   index : index of the requested column
// Instead of evaluating the whole product, the column is taken from the right operand and
// multiplied with the left operand: column( A*B, j ) == A * column( B, j ).
// NOTE(review): listing line 4448 (directly after the opening brace) is not shown here.
4443 template< typename MT1 // Type of the left-hand side dense matrix
4444  , typename MT2 > // Type of the right-hand side dense matrix
4445 inline typename ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >::Type
4446  column( const TDMatDMatMultExpr<MT1,MT2>& dm, size_t index )
4447 {
4449 
4450  return dm.leftOperand() * column( dm.rightOperand(), index );
4451 }
4453 //*************************************************************************************************
4454 
4455 
4456 
4457 
4458 //=================================================================================================
4459 //
4460 // EXPRESSION TRAIT SPECIALIZATIONS
4461 //
4462 //=================================================================================================
4463 
4464 //*************************************************************************************************
/*!\brief Specialization of TDMatDVecMultExprTrait for a TDMatDMatMultExpr left-hand side operand.
//
// Computes the result type of (MT1 * MT2) * VT. The nested \a Type is chosen via SelectType:
//  - Condition: MT1 is a dense column-major matrix, MT2 is a dense row-major matrix, and VT is a
//    dense, non-transpose vector.
//  - First alternative: the type of MT1 * ( MT2 * VT ), i.e. the inner matrix/vector product
//    (DMatDVecMultExprTrait) is computed first and then multiplied with MT1
//    (TDMatDVecMultExprTrait).
//  - Second alternative: INVALID_TYPE.
//
// NOTE(review): assumes SelectType yields the first alternative when the condition is true --
// confirm against blaze/util/SelectType.h.
*/
4466 template< typename MT1, typename MT2, typename VT >
4467 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4468 {
4469  public:
4470  //**********************************************************************************************
4471  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4472  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4473  IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
4474  , typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4475  , INVALID_TYPE >::Type Type;
4476  //**********************************************************************************************
4477 };
4479 //*************************************************************************************************
4480 
4481 
4482 //*************************************************************************************************
/*!\brief Specialization of TDMatSVecMultExprTrait for a TDMatDMatMultExpr left-hand side operand.
//
// Computes the result type of (MT1 * MT2) * VT for a sparse vector VT. The nested \a Type is
// chosen via SelectType:
//  - Condition: MT1 is a dense column-major matrix, MT2 is a dense row-major matrix, and VT is a
//    sparse, non-transpose vector.
//  - First alternative: the type of MT1 * ( MT2 * VT ). The inner product uses
//    DMatSVecMultExprTrait; its result is then combined with MT1 through TDMatDVecMultExprTrait
//    (the dense-vector trait, not the sparse-vector one).
//  - Second alternative: INVALID_TYPE.
//
// NOTE(review): assumes SelectType yields the first alternative when the condition is true --
// confirm against blaze/util/SelectType.h.
*/
4484 template< typename MT1, typename MT2, typename VT >
4485 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4486 {
4487  public:
4488  //**********************************************************************************************
4489  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4490  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4491  IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
4492  , typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4493  , INVALID_TYPE >::Type Type;
4494  //**********************************************************************************************
4495 };
4497 //*************************************************************************************************
4498 
4499 
4500 //*************************************************************************************************
/*!\brief Specialization of TDVecTDMatMultExprTrait for a TDMatDMatMultExpr right-hand side operand.
//
// Computes the result type of VT * (MT1 * MT2). The nested \a Type is chosen via SelectType:
//  - Condition: VT is a dense transpose vector, MT1 is a dense column-major matrix, and MT2 is a
//    dense row-major matrix.
//  - First alternative: the type of ( VT * MT1 ) * MT2, i.e. the vector/matrix product with MT1
//    (TDVecTDMatMultExprTrait) is computed first and then multiplied with MT2
//    (TDVecDMatMultExprTrait).
//  - Second alternative: INVALID_TYPE.
//
// NOTE(review): assumes SelectType yields the first alternative when the condition is true --
// confirm against blaze/util/SelectType.h.
*/
4502 template< typename VT, typename MT1, typename MT2 >
4503 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4504 {
4505  public:
4506  //**********************************************************************************************
4507  typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4508  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4509  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4510  , typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4511  , INVALID_TYPE >::Type Type;
4512  //**********************************************************************************************
4513 };
4515 //*************************************************************************************************
4516 
4517 
4518 //*************************************************************************************************
/*!\brief Specialization of TSVecTDMatMultExprTrait for a TDMatDMatMultExpr right-hand side operand.
//
// Computes the result type of VT * (MT1 * MT2) for a sparse vector VT. The nested \a Type is
// chosen via SelectType:
//  - Condition: VT is a sparse transpose vector, MT1 is a dense column-major matrix, and MT2 is
//    a dense row-major matrix.
//  - First alternative: the type of ( VT * MT1 ) * MT2. The inner product uses
//    TSVecTDMatMultExprTrait; its result is then combined with MT2 through TDVecDMatMultExprTrait
//    (the dense-vector trait, not the sparse-vector one).
//  - Second alternative: INVALID_TYPE.
//
// NOTE(review): assumes SelectType yields the first alternative when the condition is true --
// confirm against blaze/util/SelectType.h.
*/
4520 template< typename VT, typename MT1, typename MT2 >
4521 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4522 {
4523  public:
4524  //**********************************************************************************************
4525  typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4526  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4527  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4528  , typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4529  , INVALID_TYPE >::Type Type;
4530  //**********************************************************************************************
4531 };
4533 //*************************************************************************************************
4534 
4535 
4536 //*************************************************************************************************
/*!\brief Specialization of RowExprTrait for TDMatDMatMultExpr.
//
// The nested \a Type is the multiplication type (MultExprTrait) of the row type of the
// const-qualified left operand type (RowExprTrait<const MT1>::Type) and the right operand
// type MT2. This has the same shape as the expression returned by the row() overload for
// TDMatDMatMultExpr, which uses this trait as its return type.
*/
4538 template< typename MT1, typename MT2 >
4539 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >
4540 {
4541  public:
4542  //**********************************************************************************************
4543  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
4544  //**********************************************************************************************
4545 };
4547 //*************************************************************************************************
4548 
4549 
4550 //*************************************************************************************************
/*!\brief Specialization of ColumnExprTrait for TDMatDMatMultExpr.
//
// The nested \a Type is the multiplication type (MultExprTrait) of the left operand type MT1
// and the column type of the const-qualified right operand type
// (ColumnExprTrait<const MT2>::Type). This has the same shape as the expression returned by
// the column() overload for TDMatDMatMultExpr, which uses this trait as its return type.
*/
4552 template< typename MT1, typename MT2 >
4553 struct ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >
4554 {
4555  public:
4556  //**********************************************************************************************
4557  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
4558  //**********************************************************************************************
4559 };
4561 //*************************************************************************************************
4562 
4563 } // namespace blaze
4564 
4565 #endif