All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
20 //=================================================================================================
21 
22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
24 
25 
26 //*************************************************************************************************
27 // Includes
28 //*************************************************************************************************
29 
30 #include <stdexcept>
31 #include <boost/cast.hpp>
38 #include <blaze/math/Intrinsics.h>
39 #include <blaze/math/shims/Reset.h>
57 #include <blaze/system/BLAS.h>
59 #include <blaze/util/Assert.h>
60 #include <blaze/util/Complex.h>
65 #include <blaze/util/DisableIf.h>
66 #include <blaze/util/EnableIf.h>
67 #include <blaze/util/InvalidType.h>
68 #include <blaze/util/SelectType.h>
69 #include <blaze/util/Types.h>
75 
76 
77 namespace blaze {
78 
79 //=================================================================================================
80 //
81 // CLASS TDMATDMATMULTEXPR
82 //
83 //=================================================================================================
84 
85 //*************************************************************************************************
92 template< typename MT1 // Type of the left-hand side dense matrix
93  , typename MT2 > // Type of the right-hand side dense matrix
94 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
95  , private Expression
96  , private Computation
97 {
98  private:
99  //**Type definitions****************************************************************************
 // Shorthand aliases for the nested types published by the two operand matrix types.
100  typedef typename MT1::ResultType RT1;  // ResultType nested in the left-hand side operand type
101  typedef typename MT2::ResultType RT2;  // ResultType nested in the right-hand side operand type
102  typedef typename MT1::CompositeType CT1;  // CompositeType nested in the left-hand side operand type
103  typedef typename MT2::CompositeType CT2;  // CompositeType nested in the right-hand side operand type
104  //**********************************************************************************************
105 
106  //**********************************************************************************************
108 
109 
111  template< typename T1, typename T2, typename T3 >
112  struct UseSinglePrecisionKernel {
116  };
118  //**********************************************************************************************
119 
120  //**********************************************************************************************
122 
123 
125  template< typename T1, typename T2, typename T3 >
126  struct UseDoublePrecisionKernel {
130  };
132  //**********************************************************************************************
133 
134  //**********************************************************************************************
136 
137 
140  template< typename T1, typename T2, typename T3 >
141  struct UseSinglePrecisionComplexKernel {
142  typedef complex<float> Type;
143  enum { value = IsSame<typename T1::ElementType,Type>::value &&
144  IsSame<typename T2::ElementType,Type>::value &&
145  IsSame<typename T3::ElementType,Type>::value };
146  };
148  //**********************************************************************************************
149 
150  //**********************************************************************************************
152 
153 
156  template< typename T1, typename T2, typename T3 >
157  struct UseDoublePrecisionComplexKernel {
158  typedef complex<double> Type;
159  enum { value = IsSame<typename T1::ElementType,Type>::value &&
160  IsSame<typename T2::ElementType,Type>::value &&
161  IsSame<typename T3::ElementType,Type>::value };
162  };
164  //**********************************************************************************************
165 
166  //**********************************************************************************************
168 
169 
171  template< typename T1, typename T2, typename T3 >
172  struct UseDefaultKernel {
173  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
174  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
175  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
176  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
177  };
179  //**********************************************************************************************
180 
181  //**********************************************************************************************
183 
184 
186  template< typename T1, typename T2, typename T3 >
187  struct UseVectorizedDefaultKernel {
188  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
189  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
190  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
191  IntrinsicTrait<typename T1::ElementType>::addition &&
192  IntrinsicTrait<typename T1::ElementType>::multiplication };
193  };
195  //**********************************************************************************************
196 
197  public:
198  //**Type definitions****************************************************************************
 // Types derived from ResultType (its declaration is not part of this listing).
201  typedef typename ResultType::OppositeType OppositeType;
202  typedef typename ResultType::TransposeType TransposeType;
203  typedef typename ResultType::ElementType ElementType;
 // operator() hands out elements by const value.
205  typedef const ElementType ReturnType;
 // The composite type of this expression is a const ResultType.
206  typedef const ResultType CompositeType;
207 
 // Storage type of the left operand: 'const MT1' or 'const MT1&', chosen by IsExpression<MT1>
 // (presumably by value for expressions and by reference otherwise; confirm SelectType's argument order).
209  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
210 
 // Storage type of the right operand, chosen the same way via IsExpression<MT2>.
212  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
213 
 // Type used for the local object 'A' in the assignment functions: 'const RT1' or 'CT1',
 // chosen by IsComputation<MT1>.
215  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
216 
 // Type used for the local object 'B' in the assignment functions: 'const RT2' or 'CT2',
 // chosen by IsComputation<MT2>.
218  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
219  //**********************************************************************************************
220 
221  //**Compilation flags***************************************************************************
 // The multiplication expression itself is flagged as not vectorizable.
223  enum { vectorizable = 0 };
224 
 // Aliasing flag: set unless both operand types are computations (see isAliased(), which only
 // queries operands that are not computations).
226  enum { canAlias = !IsComputation<MT1>::value || !IsComputation<MT2>::value };
227  //**********************************************************************************************
228 
229  //**Constructor*********************************************************************************
235  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs )
236  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
237  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
238  {
239  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
240  }
241  //**********************************************************************************************
242 
243  //**Access operator*****************************************************************************
250  inline ReturnType operator()( size_t i, size_t j ) const {
251  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
252  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
253 
254  ElementType tmp;
255 
256  if( lhs_.columns() != 0UL ) {
257  const size_t end( ( ( lhs_.columns()-1UL ) & size_t(-2) ) + 1UL );
258  tmp = lhs_(i,0UL) * rhs_(0UL,j);
259  for( size_t k=1UL; k<end; k+=2UL ) {
260  tmp += lhs_(i,k ) * rhs_(k ,j);
261  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
262  }
263  if( end < lhs_.columns() ) {
264  tmp += lhs_(i,end) * rhs_(end,j);
265  }
266  }
267  else {
268  reset( tmp );
269  }
270 
271  return tmp;
272  }
273  //**********************************************************************************************
274 
275  //**Rows function*******************************************************************************
280  inline size_t rows() const {
281  return lhs_.rows();
282  }
283  //**********************************************************************************************
284 
285  //**Columns function****************************************************************************
290  inline size_t columns() const {
291  return rhs_.columns();
292  }
293  //**********************************************************************************************
294 
295  //**Left operand access*************************************************************************
300  inline LeftOperand leftOperand() const {
301  return lhs_;
302  }
303  //**********************************************************************************************
304 
305  //**Right operand access************************************************************************
310  inline RightOperand rightOperand() const {
311  return rhs_;
312  }
313  //**********************************************************************************************
314 
315  //**********************************************************************************************
321  template< typename T >
322  inline bool isAliased( const T* alias ) const {
323  return ( !IsComputation<MT1>::value && lhs_.isAliased( alias ) ) ||
324  ( !IsComputation<MT2>::value && rhs_.isAliased( alias ) );
325  }
326  //**********************************************************************************************
327 
328  private:
329  //**Member variables****************************************************************************
332  //**********************************************************************************************
333 
334  //**Assignment to dense matrices****************************************************************
343  template< typename MT // Type of the target dense matrix
344  , bool SO > // Storage order of the target dense matrix
345  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
346  {
347  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
348  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
349 
350  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
351  return;
352  }
353  else if( rhs.lhs_.columns() == 0UL ) {
354  reset( ~lhs );
355  return;
356  }
357 
358  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
359  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
360 
361  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
362  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
363  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
364  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
365  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
366  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
367 
368  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
369  TDMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
370  else
371  TDMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
372  }
374  //**********************************************************************************************
375 
376  //**Default assignment to dense matrices********************************************************
390  template< typename MT3 // Type of the left-hand side target matrix
391  , typename MT4 // Type of the left-hand side matrix operand
392  , typename MT5 > // Type of the right-hand side matrix operand
393  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
394  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
395  {
396  const size_t M( A.rows() );
397  const size_t N( B.columns() );
398  const size_t K( A.columns() );
399 
400  for( size_t i=0UL; i<M; ++i ) {
401  for( size_t j=0UL; j<N; ++j ) {
402  C(i,j) = A(i,0UL) * B(0UL,j);
403  }
404  for( size_t k=1UL; k<K; ++k ) {
405  for( size_t j=0UL; j<N; ++j ) {
406  C(i,j) += A(i,k) * B(k,j);
407  }
408  }
409  }
410  }
412  //**********************************************************************************************
413 
414  //**Vectorized default assignment to row-major dense matrices***********************************
428  template< typename MT3 // Type of the left-hand side target matrix
429  , typename MT4 // Type of the left-hand side matrix operand
430  , typename MT5 > // Type of the right-hand side matrix operand
431  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
432  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
433  {
434  typedef IntrinsicTrait<ElementType> IT;
435 
436  const size_t M( A.rows() );
437  const size_t N( B.spacing() );
438  const size_t K( A.columns() );
439 
440  size_t j( 0UL );
441 
442  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
443  for( size_t i=0UL; i<M; ++i ) {
444  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
445  for( size_t k=0UL; k<K; ++k ) {
446  const IntrinsicType a1( set( A(i,k) ) );
447  xmm1 = xmm1 + a1 * B.get(k,j );
448  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
449  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
450  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
451  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
452  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
453  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
454  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
455  }
456  store( &(~C)(i,j ), xmm1 );
457  store( &(~C)(i,j+IT::size ), xmm2 );
458  store( &(~C)(i,j+IT::size*2UL), xmm3 );
459  store( &(~C)(i,j+IT::size*3UL), xmm4 );
460  store( &(~C)(i,j+IT::size*4UL), xmm5 );
461  store( &(~C)(i,j+IT::size*5UL), xmm6 );
462  store( &(~C)(i,j+IT::size*6UL), xmm7 );
463  store( &(~C)(i,j+IT::size*7UL), xmm8 );
464  }
465  }
466  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
467  size_t i( 0UL );
468  for( ; (i+2UL) <= M; i+=2UL ) {
469  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
470  for( size_t k=0UL; k<K; ++k ) {
471  const IntrinsicType a1( set( A(i ,k) ) );
472  const IntrinsicType a2( set( A(i+1UL,k) ) );
473  const IntrinsicType b1( B.get(k,j ) );
474  const IntrinsicType b2( B.get(k,j+IT::size ) );
475  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
476  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
477  xmm1 = xmm1 + a1 * b1;
478  xmm2 = xmm2 + a1 * b2;
479  xmm3 = xmm3 + a1 * b3;
480  xmm4 = xmm4 + a1 * b4;
481  xmm5 = xmm5 + a2 * b1;
482  xmm6 = xmm6 + a2 * b2;
483  xmm7 = xmm7 + a2 * b3;
484  xmm8 = xmm8 + a2 * b4;
485  }
486  store( &(~C)(i ,j ), xmm1 );
487  store( &(~C)(i ,j+IT::size ), xmm2 );
488  store( &(~C)(i ,j+IT::size*2UL), xmm3 );
489  store( &(~C)(i ,j+IT::size*3UL), xmm4 );
490  store( &(~C)(i+1UL,j ), xmm5 );
491  store( &(~C)(i+1UL,j+IT::size ), xmm6 );
492  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
493  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
494  }
495  if( i < M ) {
496  IntrinsicType xmm1, xmm2, xmm3, xmm4;
497  for( size_t k=0UL; k<K; ++k ) {
498  const IntrinsicType a1( set( A(i,k) ) );
499  xmm1 = xmm1 + a1 * B.get(k,j );
500  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
501  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
502  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
503  }
504  store( &(~C)(i,j ), xmm1 );
505  store( &(~C)(i,j+IT::size ), xmm2 );
506  store( &(~C)(i,j+IT::size*2UL), xmm3 );
507  store( &(~C)(i,j+IT::size*3UL), xmm4 );
508  }
509  }
510  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
511  size_t i( 0UL );
512  for( ; (i+2UL) <= M; i+=2UL ) {
513  IntrinsicType xmm1, xmm2, xmm3, xmm4;
514  for( size_t k=0UL; k<K; ++k ) {
515  const IntrinsicType a1( set( A(i ,k) ) );
516  const IntrinsicType a2( set( A(i+1UL,k) ) );
517  const IntrinsicType b1( B.get(k,j ) );
518  const IntrinsicType b2( B.get(k,j+IT::size) );
519  xmm1 = xmm1 + a1 * b1;
520  xmm2 = xmm2 + a1 * b2;
521  xmm3 = xmm3 + a2 * b1;
522  xmm4 = xmm4 + a2 * b2;
523  }
524  store( &(~C)(i ,j ), xmm1 );
525  store( &(~C)(i ,j+IT::size), xmm2 );
526  store( &(~C)(i+1UL,j ), xmm3 );
527  store( &(~C)(i+1UL,j+IT::size), xmm4 );
528  }
529  if( i < M ) {
530  IntrinsicType xmm1, xmm2;
531  for( size_t k=0UL; k<K; ++k ) {
532  const IntrinsicType a1( set( A(i,k) ) );
533  xmm1 = xmm1 + a1 * B.get(k,j );
534  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
535  }
536  store( &(~C)(i,j ), xmm1 );
537  store( &(~C)(i,j+IT::size), xmm2 );
538  }
539  }
540  if( j < N ) {
541  size_t i( 0UL );
542  for( ; (i+2UL) <= M; i+=2UL ) {
543  IntrinsicType xmm1, xmm2;
544  for( size_t k=0UL; k<K; ++k ) {
545  const IntrinsicType b1( B.get(k,j) );
546  xmm1 = xmm1 + set( A(i ,k) ) * b1;
547  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
548  }
549  store( &(~C)(i ,j), xmm1 );
550  store( &(~C)(i+1UL,j), xmm2 );
551  }
552  if( i < M ) {
553  IntrinsicType xmm1;
554  for( size_t k=0UL; k<K; ++k ) {
555  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
556  }
557  store( &(~C)(i,j), xmm1 );
558  }
559  }
560  }
562  //**********************************************************************************************
563 
564  //**Vectorized default assignment to column-major dense matrices********************************
578  template< typename MT3 // Type of the left-hand side target matrix
579  , typename MT4 // Type of the left-hand side matrix operand
580  , typename MT5 > // Type of the right-hand side matrix operand
581  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
582  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
583  {
584  typedef IntrinsicTrait<ElementType> IT;
585 
586  const size_t M( A.spacing() );
587  const size_t N( B.columns() );
588  const size_t K( A.columns() );
589 
590  size_t i( 0UL );
591 
592  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
593  for( size_t j=0UL; j<N; ++j ) {
594  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
595  for( size_t k=0UL; k<K; ++k ) {
596  const IntrinsicType b1( set( B(k,j) ) );
597  xmm1 = xmm1 + A.get(i ,k) * b1;
598  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
599  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
600  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
601  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
602  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
603  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
604  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
605  }
606  store( &(~C)(i ,j), xmm1 );
607  store( &(~C)(i+IT::size ,j), xmm2 );
608  store( &(~C)(i+IT::size*2UL,j), xmm3 );
609  store( &(~C)(i+IT::size*3UL,j), xmm4 );
610  store( &(~C)(i+IT::size*4UL,j), xmm5 );
611  store( &(~C)(i+IT::size*5UL,j), xmm6 );
612  store( &(~C)(i+IT::size*6UL,j), xmm7 );
613  store( &(~C)(i+IT::size*7UL,j), xmm8 );
614  }
615  }
616  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
617  size_t j( 0UL );
618  for( ; (j+2UL) <= N; j+=2UL ) {
619  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
620  for( size_t k=0UL; k<K; ++k ) {
621  const IntrinsicType a1( A.get(i ,k) );
622  const IntrinsicType a2( A.get(i+IT::size ,k) );
623  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
624  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
625  const IntrinsicType b1( set( B(k,j ) ) );
626  const IntrinsicType b2( set( B(k,j+1UL) ) );
627  xmm1 = xmm1 + a1 * b1;
628  xmm2 = xmm2 + a2 * b1;
629  xmm3 = xmm3 + a3 * b1;
630  xmm4 = xmm4 + a4 * b1;
631  xmm5 = xmm5 + a1 * b2;
632  xmm6 = xmm6 + a2 * b2;
633  xmm7 = xmm7 + a3 * b2;
634  xmm8 = xmm8 + a4 * b2;
635  }
636  store( &(~C)(i ,j ), xmm1 );
637  store( &(~C)(i+IT::size ,j ), xmm2 );
638  store( &(~C)(i+IT::size*2UL,j ), xmm3 );
639  store( &(~C)(i+IT::size*3UL,j ), xmm4 );
640  store( &(~C)(i ,j+1UL), xmm5 );
641  store( &(~C)(i+IT::size ,j+1UL), xmm6 );
642  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
643  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
644  }
645  if( j < N ) {
646  IntrinsicType xmm1, xmm2, xmm3, xmm4;
647  for( size_t k=0UL; k<K; ++k ) {
648  const IntrinsicType b1( set( B(k,j) ) );
649  xmm1 = xmm1 + A.get(i ,k) * b1;
650  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
651  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
652  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
653  }
654  store( &(~C)(i ,j), xmm1 );
655  store( &(~C)(i+IT::size ,j), xmm2 );
656  store( &(~C)(i+IT::size*2UL,j), xmm3 );
657  store( &(~C)(i+IT::size*3UL,j), xmm4 );
658  }
659  }
660  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
661  size_t j( 0UL );
662  for( ; (j+2UL) <= N; j+=2UL ) {
663  IntrinsicType xmm1, xmm2, xmm3, xmm4;
664  for( size_t k=0UL; k<K; ++k ) {
665  const IntrinsicType a1( A.get(i ,k) );
666  const IntrinsicType a2( A.get(i+IT::size,k) );
667  const IntrinsicType b1( set( B(k,j ) ) );
668  const IntrinsicType b2( set( B(k,j+1UL) ) );
669  xmm1 = xmm1 + a1 * b1;
670  xmm2 = xmm2 + a2 * b1;
671  xmm3 = xmm3 + a1 * b2;
672  xmm4 = xmm4 + a2 * b2;
673  }
674  store( &(~C)(i ,j ), xmm1 );
675  store( &(~C)(i+IT::size,j ), xmm2 );
676  store( &(~C)(i ,j+1UL), xmm3 );
677  store( &(~C)(i+IT::size,j+1UL), xmm4 );
678  }
679  if( j < N ) {
680  IntrinsicType xmm1, xmm2;
681  for( size_t k=0UL; k<K; ++k ) {
682  const IntrinsicType b1( set( B(k,j) ) );
683  xmm1 = xmm1 + A.get(i ,k) * b1;
684  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
685  }
686  store( &(~C)(i ,j), xmm1 );
687  store( &(~C)(i+IT::size,j), xmm2 );
688  }
689  }
690  if( i < M ) {
691  size_t j( 0UL );
692  for( ; (j+2UL) <= N; j+=2UL ) {
693  IntrinsicType xmm1, xmm2;
694  for( size_t k=0UL; k<K; ++k ) {
695  const IntrinsicType a1( A.get(i,k) );
696  xmm1 = xmm1 + a1 * set( B(k,j ) );
697  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
698  }
699  store( &(~C)(i,j ), xmm1 );
700  store( &(~C)(i,j+1UL), xmm2 );
701  }
702  if( j < N ) {
703  IntrinsicType xmm1;
704  for( size_t k=0UL; k<K; ++k ) {
705  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
706  }
707  store( &(~C)(i,j), xmm1 );
708  }
709  }
710  }
712  //**********************************************************************************************
713 
714  //**BLAS-based assignment to dense matrices (default)*******************************************
728  template< typename MT3 // Type of the left-hand side target matrix
729  , typename MT4 // Type of the left-hand side matrix operand
730  , typename MT5 > // Type of the right-hand side matrix operand
731  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
732  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
733  {
734  selectDefaultAssignKernel( C, A, B );
735  }
737  //**********************************************************************************************
738 
739  //**BLAS-based assignment to dense matrices (single precision)**********************************
740 #if BLAZE_BLAS_MODE
741 
754  template< typename MT3 // Type of the left-hand side target matrix
755  , typename MT4 // Type of the left-hand side matrix operand
756  , typename MT5 > // Type of the right-hand side matrix operand
757  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
758  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
759  {
760  using boost::numeric_cast;
761 
762  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
763  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
764  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
765 
766  const int M ( numeric_cast<int>( A.rows() ) );
767  const int N ( numeric_cast<int>( B.columns() ) );
768  const int K ( numeric_cast<int>( A.columns() ) );
769  const int lda( numeric_cast<int>( A.spacing() ) );
770  const int ldb( numeric_cast<int>( B.spacing() ) );
771  const int ldc( numeric_cast<int>( C.spacing() ) );
772 
773  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
774  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
775  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
776  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
777  }
779 #endif
780  //**********************************************************************************************
781 
782  //**BLAS-based assignment to dense matrices (double precision)**********************************
783 #if BLAZE_BLAS_MODE
784 
797  template< typename MT3 // Type of the left-hand side target matrix
798  , typename MT4 // Type of the left-hand side matrix operand
799  , typename MT5 > // Type of the right-hand side matrix operand
800  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
801  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
802  {
803  using boost::numeric_cast;
804 
805  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
806  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
807  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
808 
809  const int M ( numeric_cast<int>( A.rows() ) );
810  const int N ( numeric_cast<int>( B.columns() ) );
811  const int K ( numeric_cast<int>( A.columns() ) );
812  const int lda( numeric_cast<int>( A.spacing() ) );
813  const int ldb( numeric_cast<int>( B.spacing() ) );
814  const int ldc( numeric_cast<int>( C.spacing() ) );
815 
816  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
817  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
818  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
819  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
820  }
822 #endif
823  //**********************************************************************************************
824 
825  //**BLAS-based assignment to dense matrices (single precision complex)**************************
826 #if BLAZE_BLAS_MODE
827 
840  template< typename MT3 // Type of the left-hand side target matrix
841  , typename MT4 // Type of the left-hand side matrix operand
842  , typename MT5 > // Type of the right-hand side matrix operand
843  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
844  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
845  {
846  using boost::numeric_cast;
847 
848  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
849  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
850  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
851  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
852  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
853  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
854 
855  const int M ( numeric_cast<int>( A.rows() ) );
856  const int N ( numeric_cast<int>( B.columns() ) );
857  const int K ( numeric_cast<int>( A.columns() ) );
858  const int lda( numeric_cast<int>( A.spacing() ) );
859  const int ldb( numeric_cast<int>( B.spacing() ) );
860  const int ldc( numeric_cast<int>( C.spacing() ) );
861  const complex<float> alpha( 1.0F, 0.0F );
862  const complex<float> beta ( 0.0F, 0.0F );
863 
864  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
865  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
866  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
867  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
868  }
870 #endif
871  //**********************************************************************************************
872 
873  //**BLAS-based assignment to dense matrices (double precision complex)**************************
874 #if BLAZE_BLAS_MODE
875 
888  template< typename MT3 // Type of the left-hand side target matrix
889  , typename MT4 // Type of the left-hand side matrix operand
890  , typename MT5 > // Type of the right-hand side matrix operand
891  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
892  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
893  {
894  using boost::numeric_cast;
895 
896  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
897  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
898  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
899  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
900  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
901  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
902 
903  const int M ( numeric_cast<int>( A.rows() ) );
904  const int N ( numeric_cast<int>( B.columns() ) );
905  const int K ( numeric_cast<int>( A.columns() ) );
906  const int lda( numeric_cast<int>( A.spacing() ) );
907  const int ldb( numeric_cast<int>( B.spacing() ) );
908  const int ldc( numeric_cast<int>( C.spacing() ) );
909  const complex<double> alpha( 1.0, 0.0 );
910  const complex<double> beta ( 0.0, 0.0 );
911 
912  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
913  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
914  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
915  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
916  }
918 #endif
919  //**********************************************************************************************
920 
921  //**Assignment to sparse matrices***************************************************************
933  template< typename MT // Type of the target sparse matrix
934  , bool SO > // Storage order of the target sparse matrix
935  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
936  {
937  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
938 
944  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
945 
946  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
947  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
948 
949  const TmpType tmp( rhs );
950  assign( ~lhs, tmp );
951  }
953  //**********************************************************************************************
954 
955  //**Addition assignment to dense matrices*******************************************************
968  template< typename MT // Type of the target dense matrix
969  , bool SO > // Storage order of the target dense matrix
970  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
971  {
972  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
973  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
974 
975  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
976  return;
977  }
978 
979  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
980  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
981 
982  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
983  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
984  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
985  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
986  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
987  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
988 
989  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
990  TDMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
991  else
992  TDMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
993  }
995  //**********************************************************************************************
996 
997  //**Default addition assignment to dense matrices***********************************************
1011  template< typename MT3 // Type of the left-hand side target matrix
1012  , typename MT4 // Type of the left-hand side matrix operand
1013  , typename MT5 > // Type of the right-hand side matrix operand
1014  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1015  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1016  {
1017  const size_t M( A.rows() );
1018  const size_t N( B.columns() );
1019  const size_t K( A.columns() );
1020 
1021  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1022  const size_t end( N & size_t(-2) );
1023 
1024  for( size_t i=0UL; i<M; ++i ) {
1025  for( size_t k=0UL; k<K; ++k ) {
1026  for( size_t j=0UL; j<end; j+=2UL ) {
1027  C(i,j ) += A(i,k) * B(k,j );
1028  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1029  }
1030  if( end < N ) {
1031  C(i,end) += A(i,k) * B(k,end);
1032  }
1033  }
1034  }
1035  }
1037  //**********************************************************************************************
1038 
1039  //**Vectorized default addition assignment to row-major dense matrices**************************
// Vectorized default kernel for the addition assignment C += A * B to a row-major target
// (DenseMatrix<MT3,false>).
//
// C  The target matrix; blocks of it are read with load(), accumulated, and written back
//    with store().
// A  The left-hand side operand; read element-wise as A(i,k) and passed through set().
// B  The right-hand side operand; read through B.get(k,j), whose result is used as an
//    IntrinsicType.
//
// This overload is enabled only via UseVectorizedDefaultKernel<MT3,MT4,MT5>. The column index j
// advances in steps of 8, 4 and 2 times IT::size, followed by one final step for whatever is
// left. In the 4/2/final stages two rows of C are handled per iteration, with a single-row tail.
// NOTE(review): the column bound N is taken from B.spacing(), not B.columns(); this presumably
// relies on the rows of B and C being padded to a multiple of IT::size - confirm against the
// matrix storage classes.
// NOTE(review): set() presumably broadcasts a scalar into every element of an IntrinsicType -
// confirm against the intrinsics wrappers.
1053  template< typename MT3 // Type of the left-hand side target matrix
1054  , typename MT4 // Type of the left-hand side matrix operand
1055  , typename MT5 > // Type of the right-hand side matrix operand
1056  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1057  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1058  {
1059  typedef IntrinsicTrait<ElementType> IT;
1060 
1061  const size_t M( A.rows() );
1062  const size_t N( B.spacing() );
1063  const size_t K( A.columns() );
1064 
1065  size_t j( 0UL );
1066 
// Stage 1: column steps of 8*IT::size; one row of C per iteration, eight accumulators.
1067  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1068  for( size_t i=0UL; i<M; ++i ) {
1069  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1070  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1071  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1072  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1073  IntrinsicType xmm5( load( &(~C)(i,j+IT::size*4UL) ) );
1074  IntrinsicType xmm6( load( &(~C)(i,j+IT::size*5UL) ) );
1075  IntrinsicType xmm7( load( &(~C)(i,j+IT::size*6UL) ) );
1076  IntrinsicType xmm8( load( &(~C)(i,j+IT::size*7UL) ) );
1077  for( size_t k=0UL; k<K; ++k ) {
1078  const IntrinsicType a1( set( A(i,k) ) );
1079  xmm1 = xmm1 + a1 * B.get(k,j );
1080  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1081  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1082  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1083  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
1084  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
1085  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
1086  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
1087  }
1088  store( &(~C)(i,j ), xmm1 );
1089  store( &(~C)(i,j+IT::size ), xmm2 );
1090  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1091  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1092  store( &(~C)(i,j+IT::size*4UL), xmm5 );
1093  store( &(~C)(i,j+IT::size*5UL), xmm6 );
1094  store( &(~C)(i,j+IT::size*6UL), xmm7 );
1095  store( &(~C)(i,j+IT::size*7UL), xmm8 );
1096  }
1097  }
// Stage 2: column steps of 4*IT::size; two rows of C per iteration so that each value read
// from B is used for both rows.
1098  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1099  size_t i( 0UL );
1100  for( ; (i+2UL) <= M; i+=2UL ) {
1101  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1102  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size ) ) );
1103  IntrinsicType xmm3( load( &(~C)(i ,j+IT::size*2UL) ) );
1104  IntrinsicType xmm4( load( &(~C)(i ,j+IT::size*3UL) ) );
1105  IntrinsicType xmm5( load( &(~C)(i+1UL,j ) ) );
1106  IntrinsicType xmm6( load( &(~C)(i+1UL,j+IT::size ) ) );
1107  IntrinsicType xmm7( load( &(~C)(i+1UL,j+IT::size*2UL) ) );
1108  IntrinsicType xmm8( load( &(~C)(i+1UL,j+IT::size*3UL) ) );
1109  for( size_t k=0UL; k<K; ++k ) {
1110  const IntrinsicType a1( set( A(i ,k) ) );
1111  const IntrinsicType a2( set( A(i+1UL,k) ) );
1112  const IntrinsicType b1( B.get(k,j ) );
1113  const IntrinsicType b2( B.get(k,j+IT::size ) );
1114  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
1115  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
1116  xmm1 = xmm1 + a1 * b1;
1117  xmm2 = xmm2 + a1 * b2;
1118  xmm3 = xmm3 + a1 * b3;
1119  xmm4 = xmm4 + a1 * b4;
1120  xmm5 = xmm5 + a2 * b1;
1121  xmm6 = xmm6 + a2 * b2;
1122  xmm7 = xmm7 + a2 * b3;
1123  xmm8 = xmm8 + a2 * b4;
1124  }
1125  store( &(~C)(i ,j ), xmm1 );
1126  store( &(~C)(i ,j+IT::size ), xmm2 );
1127  store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1128  store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1129  store( &(~C)(i+1UL,j ), xmm5 );
1130  store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1131  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1132  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1133  }
// Last row when M is odd (the two-row loop above stops at the largest even i).
1134  if( i < M ) {
1135  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1136  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1137  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1138  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1139  for( size_t k=0UL; k<K; ++k ) {
1140  const IntrinsicType a1( set( A(i,k) ) );
1141  xmm1 = xmm1 + a1 * B.get(k,j );
1142  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1143  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1144  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1145  }
1146  store( &(~C)(i,j ), xmm1 );
1147  store( &(~C)(i,j+IT::size ), xmm2 );
1148  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1149  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1150  }
1151  }
// Stage 3: column steps of 2*IT::size; two rows per iteration plus a single-row tail.
1152  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1153  size_t i( 0UL );
1154  for( ; (i+2UL) <= M; i+=2UL ) {
1155  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1156  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size) ) );
1157  IntrinsicType xmm3( load( &(~C)(i+1UL,j ) ) );
1158  IntrinsicType xmm4( load( &(~C)(i+1UL,j+IT::size) ) );
1159  for( size_t k=0UL; k<K; ++k ) {
1160  const IntrinsicType a1( set( A(i ,k) ) );
1161  const IntrinsicType a2( set( A(i+1UL,k) ) );
1162  const IntrinsicType b1( B.get(k,j ) );
1163  const IntrinsicType b2( B.get(k,j+IT::size) );
1164  xmm1 = xmm1 + a1 * b1;
1165  xmm2 = xmm2 + a1 * b2;
1166  xmm3 = xmm3 + a2 * b1;
1167  xmm4 = xmm4 + a2 * b2;
1168  }
1169  store( &(~C)(i ,j ), xmm1 );
1170  store( &(~C)(i ,j+IT::size), xmm2 );
1171  store( &(~C)(i+1UL,j ), xmm3 );
1172  store( &(~C)(i+1UL,j+IT::size), xmm4 );
1173  }
1174  if( i < M ) {
1175  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1176  IntrinsicType xmm2( load( &(~C)(i,j+IT::size) ) );
1177  for( size_t k=0UL; k<K; ++k ) {
1178  const IntrinsicType a1( set( A(i,k) ) );
1179  xmm1 = xmm1 + a1 * B.get(k,j );
1180  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
1181  }
1182  store( &(~C)(i,j ), xmm1 );
1183  store( &(~C)(i,j+IT::size), xmm2 );
1184  }
1185  }
// Stage 4: whatever is left (fewer than 2*IT::size columns) is handled as one IntrinsicType
// starting at column j, again two rows at a time with a single-row tail.
1186  if( j < N ) {
1187  size_t i( 0UL );
1188  for( ; (i+2UL) <= M; i+=2UL ) {
1189  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1190  IntrinsicType xmm2( load( &(~C)(i+1UL,j) ) );
1191  for( size_t k=0UL; k<K; ++k ) {
1192  const IntrinsicType b1( B.get(k,j) );
1193  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1194  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1195  }
1196  store( &(~C)(i ,j), xmm1 );
1197  store( &(~C)(i+1UL,j), xmm2 );
1198  }
1199  if( i < M ) {
1200  IntrinsicType xmm1( load( &(~C)(i,j) ) );
1201  for( size_t k=0UL; k<K; ++k ) {
1202  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
1203  }
1204  store( &(~C)(i,j), xmm1 );
1205  }
1206  }
1207  }
1209  //**********************************************************************************************
1210 
1211  //**Vectorized default addition assignment to column-major dense matrices***********************
// Vectorized default kernel for the addition assignment C += A * B to a column-major target
// (DenseMatrix<MT3,true>).
//
// C  The target matrix; blocks of it are read with load(), accumulated, and written back
//    with store().
// A  The left-hand side operand; read through A.get(i,k), whose result is used as an
//    IntrinsicType.
// B  The right-hand side operand; read element-wise as B(k,j) and passed through set().
//
// This overload is enabled only via UseVectorizedDefaultKernel<MT3,MT4,MT5>. It mirrors the
// row-major kernel with the roles of rows and columns exchanged: the row index i advances in
// steps of 8, 4 and 2 times IT::size, followed by one final step for whatever is left. In the
// 4/2/final stages two columns of C are handled per iteration, with a single-column tail.
// NOTE(review): the row bound M is taken from A.spacing(), not A.rows(); this presumably relies
// on the columns of A and C being padded to a multiple of IT::size - confirm against the matrix
// storage classes.
// NOTE(review): set() presumably broadcasts a scalar into every element of an IntrinsicType -
// confirm against the intrinsics wrappers.
1225  template< typename MT3 // Type of the left-hand side target matrix
1226  , typename MT4 // Type of the left-hand side matrix operand
1227  , typename MT5 > // Type of the right-hand side matrix operand
1228  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1229  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1230  {
1231  typedef IntrinsicTrait<ElementType> IT;
1232 
1233  const size_t M( A.spacing() );
1234  const size_t N( B.columns() );
1235  const size_t K( A.columns() );
1236 
1237  size_t i( 0UL );
1238 
// Stage 1: row steps of 8*IT::size; one column of C per iteration, eight accumulators.
1239  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1240  for( size_t j=0UL; j<N; ++j ) {
1241  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1242  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1243  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1244  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1245  IntrinsicType xmm5( load( &(~C)(i+IT::size*4UL,j) ) );
1246  IntrinsicType xmm6( load( &(~C)(i+IT::size*5UL,j) ) );
1247  IntrinsicType xmm7( load( &(~C)(i+IT::size*6UL,j) ) );
1248  IntrinsicType xmm8( load( &(~C)(i+IT::size*7UL,j) ) );
1249  for( size_t k=0UL; k<K; ++k ) {
1250  const IntrinsicType b1( set( B(k,j) ) );
1251  xmm1 = xmm1 + A.get(i ,k) * b1;
1252  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1253  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1254  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1255  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
1256  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
1257  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
1258  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
1259  }
1260  store( &(~C)(i ,j), xmm1 );
1261  store( &(~C)(i+IT::size ,j), xmm2 );
1262  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1263  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1264  store( &(~C)(i+IT::size*4UL,j), xmm5 );
1265  store( &(~C)(i+IT::size*5UL,j), xmm6 );
1266  store( &(~C)(i+IT::size*6UL,j), xmm7 );
1267  store( &(~C)(i+IT::size*7UL,j), xmm8 );
1268  }
1269  }
// Stage 2: row steps of 4*IT::size; two columns of C per iteration so that each value read
// from A is used for both columns.
1270  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1271  size_t j( 0UL );
1272  for( ; (j+2UL) <= N; j+=2UL ) {
1273  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1274  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j ) ) );
1275  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j ) ) );
1276  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j ) ) );
1277  IntrinsicType xmm5( load( &(~C)(i ,j+1UL) ) );
1278  IntrinsicType xmm6( load( &(~C)(i+IT::size ,j+1UL) ) );
1279  IntrinsicType xmm7( load( &(~C)(i+IT::size*2UL,j+1UL) ) );
1280  IntrinsicType xmm8( load( &(~C)(i+IT::size*3UL,j+1UL) ) );
1281  for( size_t k=0UL; k<K; ++k ) {
1282  const IntrinsicType a1( A.get(i ,k) );
1283  const IntrinsicType a2( A.get(i+IT::size ,k) );
1284  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
1285  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
1286  const IntrinsicType b1( set( B(k,j ) ) );
1287  const IntrinsicType b2( set( B(k,j+1UL) ) );
1288  xmm1 = xmm1 + a1 * b1;
1289  xmm2 = xmm2 + a2 * b1;
1290  xmm3 = xmm3 + a3 * b1;
1291  xmm4 = xmm4 + a4 * b1;
1292  xmm5 = xmm5 + a1 * b2;
1293  xmm6 = xmm6 + a2 * b2;
1294  xmm7 = xmm7 + a3 * b2;
1295  xmm8 = xmm8 + a4 * b2;
1296  }
1297  store( &(~C)(i ,j ), xmm1 );
1298  store( &(~C)(i+IT::size ,j ), xmm2 );
1299  store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1300  store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1301  store( &(~C)(i ,j+1UL), xmm5 );
1302  store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1303  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1304  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1305  }
// Last column when N is odd (the two-column loop above stops at the largest even j).
1306  if( j < N ) {
1307  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1308  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1309  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1310  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1311  for( size_t k=0UL; k<K; ++k ) {
1312  const IntrinsicType b1( set( B(k,j) ) );
1313  xmm1 = xmm1 + A.get(i ,k) * b1;
1314  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1315  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1316  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1317  }
1318  store( &(~C)(i ,j), xmm1 );
1319  store( &(~C)(i+IT::size ,j), xmm2 );
1320  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1321  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1322  }
1323  }
// Stage 3: row steps of 2*IT::size; two columns per iteration plus a single-column tail.
1324  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1325  size_t j( 0UL );
1326  for( ; (j+2UL) <= N; j+=2UL ) {
1327  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1328  IntrinsicType xmm2( load( &(~C)(i+IT::size,j ) ) );
1329  IntrinsicType xmm3( load( &(~C)(i ,j+1UL) ) );
1330  IntrinsicType xmm4( load( &(~C)(i+IT::size,j+1UL) ) );
1331  for( size_t k=0UL; k<K; ++k ) {
1332  const IntrinsicType a1( A.get(i ,k) );
1333  const IntrinsicType a2( A.get(i+IT::size,k) );
1334  const IntrinsicType b1( set( B(k,j ) ) );
1335  const IntrinsicType b2( set( B(k,j+1UL) ) );
1336  xmm1 = xmm1 + a1 * b1;
1337  xmm2 = xmm2 + a2 * b1;
1338  xmm3 = xmm3 + a1 * b2;
1339  xmm4 = xmm4 + a2 * b2;
1340  }
1341  store( &(~C)(i ,j ), xmm1 );
1342  store( &(~C)(i+IT::size,j ), xmm2 );
1343  store( &(~C)(i ,j+1UL), xmm3 );
1344  store( &(~C)(i+IT::size,j+1UL), xmm4 );
1345  }
1346  if( j < N ) {
1347  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1348  IntrinsicType xmm2( load( &(~C)(i+IT::size,j) ) );
1349  for( size_t k=0UL; k<K; ++k ) {
1350  const IntrinsicType b1( set( B(k,j) ) );
1351  xmm1 = xmm1 + A.get(i ,k) * b1;
1352  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
1353  }
1354  store( &(~C)(i ,j), xmm1 );
1355  store( &(~C)(i+IT::size,j), xmm2 );
1356  }
1357  }
// Stage 4: whatever is left (fewer than 2*IT::size rows) is handled as one IntrinsicType
// starting at row i, again two columns at a time with a single-column tail.
1358  if( i < M ) {
1359  size_t j( 0UL );
1360  for( ; (j+2UL) <= N; j+=2UL ) {
1361  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1362  IntrinsicType xmm2( load( &(~C)(i,j+1UL) ) );
1363  for( size_t k=0UL; k<K; ++k ) {
1364  const IntrinsicType a1( A.get(i,k) );
1365  xmm1 = xmm1 + a1 * set( B(k,j ) );
1366  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1367  }
1368  store( &(~C)(i,j ), xmm1 );
1369  store( &(~C)(i,j+1UL), xmm2 );
1370  }
1371  if( j < N ) {
1372  IntrinsicType xmm1( load( &(~C)(i,j) ) );
1373  for( size_t k=0UL; k<K; ++k ) {
1374  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
1375  }
1376  store( &(~C)(i,j), xmm1 );
1377  }
1378  }
1379  }
1381  //**********************************************************************************************
1382 
1383  //**BLAS-based addition assignment to dense matrices (default)**********************************
1397  template< typename MT3 // Type of the left-hand side target matrix
1398  , typename MT4 // Type of the left-hand side matrix operand
1399  , typename MT5 > // Type of the right-hand side matrix operand
1400  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1401  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1402  {
1403  selectDefaultAddAssignKernel( C, A, B );
1404  }
1406  //**********************************************************************************************
1407 
1408  //**BLAS-based addition assignment to dense matrices (single precision)*************************
1409 #if BLAZE_BLAS_MODE
1410 
1423  template< typename MT3 // Type of the left-hand side target matrix
1424  , typename MT4 // Type of the left-hand side matrix operand
1425  , typename MT5 > // Type of the right-hand side matrix operand
1426  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1427  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1428  {
1429  using boost::numeric_cast;
1430 
1431  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
1432  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
1433  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
1434 
1435  const int M ( numeric_cast<int>( A.rows() ) );
1436  const int N ( numeric_cast<int>( B.columns() ) );
1437  const int K ( numeric_cast<int>( A.columns() ) );
1438  const int lda( numeric_cast<int>( A.spacing() ) );
1439  const int ldb( numeric_cast<int>( B.spacing() ) );
1440  const int ldc( numeric_cast<int>( C.spacing() ) );
1441 
1442  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1443  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1444  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1445  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1446  }
1448 #endif
1449  //**********************************************************************************************
1450 
1451  //**BLAS-based addition assignment to dense matrices (double precision)*************************
1452 #if BLAZE_BLAS_MODE
1453 
1466  template< typename MT3 // Type of the left-hand side target matrix
1467  , typename MT4 // Type of the left-hand side matrix operand
1468  , typename MT5 > // Type of the right-hand side matrix operand
1469  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1470  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1471  {
1472  using boost::numeric_cast;
1473 
1474  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
1475  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
1476  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
1477 
1478  const int M ( numeric_cast<int>( A.rows() ) );
1479  const int N ( numeric_cast<int>( B.columns() ) );
1480  const int K ( numeric_cast<int>( A.columns() ) );
1481  const int lda( numeric_cast<int>( A.spacing() ) );
1482  const int ldb( numeric_cast<int>( B.spacing() ) );
1483  const int ldc( numeric_cast<int>( C.spacing() ) );
1484 
1485  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1486  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1487  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1488  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1489  }
1491 #endif
1492  //**********************************************************************************************
1493 
1494  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
1495 #if BLAZE_BLAS_MODE
1496 
1509  template< typename MT3 // Type of the left-hand side target matrix
1510  , typename MT4 // Type of the left-hand side matrix operand
1511  , typename MT5 > // Type of the right-hand side matrix operand
1512  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1513  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1514  {
1515  using boost::numeric_cast;
1516 
1517  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1518  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1519  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1520  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1521  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1522  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1523 
1524  const int M ( numeric_cast<int>( A.rows() ) );
1525  const int N ( numeric_cast<int>( B.columns() ) );
1526  const int K ( numeric_cast<int>( A.columns() ) );
1527  const int lda( numeric_cast<int>( A.spacing() ) );
1528  const int ldb( numeric_cast<int>( B.spacing() ) );
1529  const int ldc( numeric_cast<int>( C.spacing() ) );
1530  const complex<float> alpha( 1.0F, 0.0F );
1531  const complex<float> beta ( 1.0F, 0.0F );
1532 
1533  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1534  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1535  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1536  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1537  }
1539 #endif
1540  //**********************************************************************************************
1541 
1542  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
1543 #if BLAZE_BLAS_MODE
1544 
1557  template< typename MT3 // Type of the left-hand side target matrix
1558  , typename MT4 // Type of the left-hand side matrix operand
1559  , typename MT5 > // Type of the right-hand side matrix operand
1560  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1561  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1562  {
1563  using boost::numeric_cast;
1564 
1565  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
1566  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
1567  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
1568  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1569  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1570  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1571 
1572  const int M ( numeric_cast<int>( A.rows() ) );
1573  const int N ( numeric_cast<int>( B.columns() ) );
1574  const int K ( numeric_cast<int>( A.columns() ) );
1575  const int lda( numeric_cast<int>( A.spacing() ) );
1576  const int ldb( numeric_cast<int>( B.spacing() ) );
1577  const int ldc( numeric_cast<int>( C.spacing() ) );
1578  const complex<double> alpha( 1.0, 0.0 );
1579  const complex<double> beta ( 1.0, 0.0 );
1580 
1581  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1582  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1583  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1584  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1585  }
1587 #endif
1588  //**********************************************************************************************
1589 
1590  //**Addition assignment to sparse matrices******************************************************
1591  // No special implementation for the addition assignment to sparse matrices.
1592  //**********************************************************************************************
1593 
1594  //**Subtraction assignment to dense matrices****************************************************
1607  template< typename MT // Type of the target dense matrix
1608  , bool SO > // Storage order of the target dense matrix
1609  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1610  {
1611  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1612  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1613 
1614  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1615  return;
1616  }
1617 
1618  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1619  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1620 
1621  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1622  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1623  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1624  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1625  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1626  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1627 
1628  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
1629  TDMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1630  else
1631  TDMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
1632  }
1634  //**********************************************************************************************
1635 
1636  //**Default subtraction assignment to dense matrices********************************************
1650  template< typename MT3 // Type of the left-hand side target matrix
1651  , typename MT4 // Type of the left-hand side matrix operand
1652  , typename MT5 > // Type of the right-hand side matrix operand
1653  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1654  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1655  {
1656  const size_t M( A.rows() );
1657  const size_t N( B.columns() );
1658  const size_t K( A.columns() );
1659 
1660  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1661  const size_t end( N & size_t(-2) );
1662 
1663  for( size_t i=0UL; i<M; ++i ) {
1664  for( size_t k=0UL; k<K; ++k ) {
1665  for( size_t j=0UL; j<end; j+=2UL ) {
1666  C(i,j ) -= A(i,k) * B(k,j );
1667  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
1668  }
1669  if( end < N ) {
1670  C(i,end) -= A(i,k) * B(k,end);
1671  }
1672  }
1673  }
1674  }
1676  //**********************************************************************************************
1677 
1678  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default kernel for the subtraction assignment C -= A * B to a row-major target
// (DenseMatrix<MT3,false>).
//
// C  The target matrix; blocks of it are read with load(), reduced by the products, and
//    written back with store().
// A  The left-hand side operand; read element-wise as A(i,k) and passed through set().
// B  The right-hand side operand; read through B.get(k,j), whose result is used as an
//    IntrinsicType.
//
// This overload is enabled only via UseVectorizedDefaultKernel<MT3,MT4,MT5>. The column index j
// advances in steps of 8, 4 and 2 times IT::size, followed by one final step for whatever is
// left. In the 4/2/final stages two rows of C are handled per iteration, with a single-row tail.
// NOTE(review): the column bound N is taken from B.spacing(), not B.columns(); this presumably
// relies on the rows of B and C being padded to a multiple of IT::size - confirm against the
// matrix storage classes.
// NOTE(review): set() presumably broadcasts a scalar into every element of an IntrinsicType -
// confirm against the intrinsics wrappers.
1692  template< typename MT3 // Type of the left-hand side target matrix
1693  , typename MT4 // Type of the left-hand side matrix operand
1694  , typename MT5 > // Type of the right-hand side matrix operand
1695  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1696  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1697  {
1698  typedef IntrinsicTrait<ElementType> IT;
1699 
1700  const size_t M( A.rows() );
1701  const size_t N( B.spacing() );
1702  const size_t K( A.columns() );
1703 
1704  size_t j( 0UL );
1705 
// Stage 1: column steps of 8*IT::size; one row of C per iteration, eight accumulators.
1706  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1707  for( size_t i=0UL; i<M; ++i ) {
1708  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1709  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1710  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1711  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1712  IntrinsicType xmm5( load( &(~C)(i,j+IT::size*4UL) ) );
1713  IntrinsicType xmm6( load( &(~C)(i,j+IT::size*5UL) ) );
1714  IntrinsicType xmm7( load( &(~C)(i,j+IT::size*6UL) ) );
1715  IntrinsicType xmm8( load( &(~C)(i,j+IT::size*7UL) ) );
1716  for( size_t k=0UL; k<K; ++k ) {
1717  const IntrinsicType a1( set( A(i,k) ) );
1718  xmm1 = xmm1 - a1 * B.get(k,j );
1719  xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1720  xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1721  xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1722  xmm5 = xmm5 - a1 * B.get(k,j+IT::size*4UL);
1723  xmm6 = xmm6 - a1 * B.get(k,j+IT::size*5UL);
1724  xmm7 = xmm7 - a1 * B.get(k,j+IT::size*6UL);
1725  xmm8 = xmm8 - a1 * B.get(k,j+IT::size*7UL);
1726  }
1727  store( &(~C)(i,j ), xmm1 );
1728  store( &(~C)(i,j+IT::size ), xmm2 );
1729  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1730  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1731  store( &(~C)(i,j+IT::size*4UL), xmm5 );
1732  store( &(~C)(i,j+IT::size*5UL), xmm6 );
1733  store( &(~C)(i,j+IT::size*6UL), xmm7 );
1734  store( &(~C)(i,j+IT::size*7UL), xmm8 );
1735  }
1736  }
// Stage 2: column steps of 4*IT::size; two rows of C per iteration so that each value read
// from B is used for both rows.
1737  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1738  size_t i( 0UL );
1739  for( ; (i+2UL) <= M; i+=2UL ) {
1740  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1741  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size ) ) );
1742  IntrinsicType xmm3( load( &(~C)(i ,j+IT::size*2UL) ) );
1743  IntrinsicType xmm4( load( &(~C)(i ,j+IT::size*3UL) ) );
1744  IntrinsicType xmm5( load( &(~C)(i+1UL,j ) ) );
1745  IntrinsicType xmm6( load( &(~C)(i+1UL,j+IT::size ) ) );
1746  IntrinsicType xmm7( load( &(~C)(i+1UL,j+IT::size*2UL) ) );
1747  IntrinsicType xmm8( load( &(~C)(i+1UL,j+IT::size*3UL) ) );
1748  for( size_t k=0UL; k<K; ++k ) {
1749  const IntrinsicType a1( set( A(i ,k) ) );
1750  const IntrinsicType a2( set( A(i+1UL,k) ) );
1751  const IntrinsicType b1( B.get(k,j ) );
1752  const IntrinsicType b2( B.get(k,j+IT::size ) );
1753  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
1754  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
1755  xmm1 = xmm1 - a1 * b1;
1756  xmm2 = xmm2 - a1 * b2;
1757  xmm3 = xmm3 - a1 * b3;
1758  xmm4 = xmm4 - a1 * b4;
1759  xmm5 = xmm5 - a2 * b1;
1760  xmm6 = xmm6 - a2 * b2;
1761  xmm7 = xmm7 - a2 * b3;
1762  xmm8 = xmm8 - a2 * b4;
1763  }
1764  store( &(~C)(i ,j ), xmm1 );
1765  store( &(~C)(i ,j+IT::size ), xmm2 );
1766  store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1767  store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1768  store( &(~C)(i+1UL,j ), xmm5 );
1769  store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1770  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1771  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1772  }
// Last row when M is odd (the two-row loop above stops at the largest even i).
1773  if( i < M ) {
1774  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1775  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1776  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1777  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1778  for( size_t k=0UL; k<K; ++k ) {
1779  const IntrinsicType a1( set( A(i,k) ) );
1780  xmm1 = xmm1 - a1 * B.get(k,j );
1781  xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1782  xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1783  xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1784  }
1785  store( &(~C)(i,j ), xmm1 );
1786  store( &(~C)(i,j+IT::size ), xmm2 );
1787  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1788  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1789  }
1790  }
// Stage 3: column steps of 2*IT::size; two rows per iteration plus a single-row tail.
1791  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1792  size_t i( 0UL );
1793  for( ; (i+2UL) <= M; i+=2UL ) {
1794  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1795  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size) ) );
1796  IntrinsicType xmm3( load( &(~C)(i+1UL,j ) ) );
1797  IntrinsicType xmm4( load( &(~C)(i+1UL,j+IT::size) ) );
1798  for( size_t k=0UL; k<K; ++k ) {
1799  const IntrinsicType a1( set( A(i ,k) ) );
1800  const IntrinsicType a2( set( A(i+1UL,k) ) );
1801  const IntrinsicType b1( B.get(k,j ) );
1802  const IntrinsicType b2( B.get(k,j+IT::size) );
1803  xmm1 = xmm1 - a1 * b1;
1804  xmm2 = xmm2 - a1 * b2;
1805  xmm3 = xmm3 - a2 * b1;
1806  xmm4 = xmm4 - a2 * b2;
1807  }
1808  store( &(~C)(i ,j ), xmm1 );
1809  store( &(~C)(i ,j+IT::size), xmm2 );
1810  store( &(~C)(i+1UL,j ), xmm3 );
1811  store( &(~C)(i+1UL,j+IT::size), xmm4 );
1812  }
1813  if( i < M ) {
1814  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1815  IntrinsicType xmm2( load( &(~C)(i,j+IT::size) ) );
1816  for( size_t k=0UL; k<K; ++k ) {
1817  const IntrinsicType a1( set( A(i,k) ) );
1818  xmm1 = xmm1 - a1 * B.get(k,j );
1819  xmm2 = xmm2 - a1 * B.get(k,j+IT::size);
1820  }
1821  store( &(~C)(i,j ), xmm1 );
1822  store( &(~C)(i,j+IT::size), xmm2 );
1823  }
1824  }
// Stage 4: whatever is left (fewer than 2*IT::size columns) is handled as one IntrinsicType
// starting at column j, again two rows at a time with a single-row tail.
1825  if( j < N ) {
1826  size_t i( 0UL );
1827  for( ; (i+2UL) <= M; i+=2UL ) {
1828  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1829  IntrinsicType xmm2( load( &(~C)(i+1UL,j) ) );
1830  for( size_t k=0UL; k<K; ++k ) {
1831  const IntrinsicType b1( B.get(k,j) );
1832  xmm1 = xmm1 - set( A(i ,k) ) * b1;
1833  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
1834  }
1835  store( &(~C)(i ,j), xmm1 );
1836  store( &(~C)(i+1UL,j), xmm2 );
1837  }
1838  if( i < M ) {
1839  IntrinsicType xmm1( load( &(~C)(i,j) ) );
1840  for( size_t k=0UL; k<K; ++k ) {
1841  xmm1 = xmm1 - set( A(i,k) ) * B.get(k,j);
1842  }
1843  store( &(~C)(i,j), xmm1 );
1844  }
1845  }
1846  }
1848  //**********************************************************************************************
1849 
1850  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default kernel for the subtraction assignment C -= A * B where the target C is a
// column-major dense matrix (DenseMatrix<MT3,true>). The overload is enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
//   C  Target matrix; its current values are loaded, reduced by the product, and stored back.
//   A  Left-hand side operand, read as whole intrinsic vectors along the row index via A.get(i,k).
//   B  Right-hand side operand, read element-wise via B(k,j) and passed through set().
//
// The row index i advances in multiples of IT::size (presumably the number of elements per
// intrinsic vector -- TODO confirm against IntrinsicTrait). The row range is processed in
// progressively smaller blocks: 8 vectors, then 4 vectors, then 2 vectors, then one final vector.
1864  template< typename MT3 // Type of the left-hand side target matrix
1865  , typename MT4 // Type of the left-hand side matrix operand
1866  , typename MT5 > // Type of the right-hand side matrix operand
1867  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1868  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1869  {
1870  typedef IntrinsicTrait<ElementType> IT;
1871 
      // NOTE(review): M is taken from A.spacing(), not A.rows() -- presumably the padded column
      // length, so that every block below can load/store full vectors. Confirm with the matrix type.
1872  const size_t M( A.spacing() );
1873  const size_t N( B.columns() );
1874  const size_t K( A.columns() );
1875 
1876  size_t i( 0UL );
1877 
      // Blocks of 8 vectors of rows, one column of C at a time.
1878  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1879  for( size_t j=0UL; j<N; ++j ) {
1880  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1881  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1882  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1883  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1884  IntrinsicType xmm5( load( &(~C)(i+IT::size*4UL,j) ) );
1885  IntrinsicType xmm6( load( &(~C)(i+IT::size*5UL,j) ) );
1886  IntrinsicType xmm7( load( &(~C)(i+IT::size*6UL,j) ) );
1887  IntrinsicType xmm8( load( &(~C)(i+IT::size*7UL,j) ) );
1888  for( size_t k=0UL; k<K; ++k ) {
1889  const IntrinsicType b1( set( B(k,j) ) );
1890  xmm1 = xmm1 - A.get(i ,k) * b1;
1891  xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1892  xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1893  xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1894  xmm5 = xmm5 - A.get(i+IT::size*4UL,k) * b1;
1895  xmm6 = xmm6 - A.get(i+IT::size*5UL,k) * b1;
1896  xmm7 = xmm7 - A.get(i+IT::size*6UL,k) * b1;
1897  xmm8 = xmm8 - A.get(i+IT::size*7UL,k) * b1;
1898  }
1899  store( &(~C)(i ,j), xmm1 );
1900  store( &(~C)(i+IT::size ,j), xmm2 );
1901  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1902  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1903  store( &(~C)(i+IT::size*4UL,j), xmm5 );
1904  store( &(~C)(i+IT::size*5UL,j), xmm6 );
1905  store( &(~C)(i+IT::size*6UL,j), xmm7 );
1906  store( &(~C)(i+IT::size*7UL,j), xmm8 );
1907  }
1908  }
      // Blocks of 4 vectors of rows, two columns of C at a time (xmm1-4: column j, xmm5-8: column j+1).
1909  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1910  size_t j( 0UL );
1911  for( ; (j+2UL) <= N; j+=2UL ) {
1912  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1913  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j ) ) );
1914  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j ) ) );
1915  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j ) ) );
1916  IntrinsicType xmm5( load( &(~C)(i ,j+1UL) ) );
1917  IntrinsicType xmm6( load( &(~C)(i+IT::size ,j+1UL) ) );
1918  IntrinsicType xmm7( load( &(~C)(i+IT::size*2UL,j+1UL) ) );
1919  IntrinsicType xmm8( load( &(~C)(i+IT::size*3UL,j+1UL) ) );
1920  for( size_t k=0UL; k<K; ++k ) {
1921  const IntrinsicType a1( A.get(i ,k) );
1922  const IntrinsicType a2( A.get(i+IT::size ,k) );
1923  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
1924  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
1925  const IntrinsicType b1( set( B(k,j ) ) );
1926  const IntrinsicType b2( set( B(k,j+1UL) ) );
1927  xmm1 = xmm1 - a1 * b1;
1928  xmm2 = xmm2 - a2 * b1;
1929  xmm3 = xmm3 - a3 * b1;
1930  xmm4 = xmm4 - a4 * b1;
1931  xmm5 = xmm5 - a1 * b2;
1932  xmm6 = xmm6 - a2 * b2;
1933  xmm7 = xmm7 - a3 * b2;
1934  xmm8 = xmm8 - a4 * b2;
1935  }
1936  store( &(~C)(i ,j ), xmm1 );
1937  store( &(~C)(i+IT::size ,j ), xmm2 );
1938  store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1939  store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1940  store( &(~C)(i ,j+1UL), xmm5 );
1941  store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1942  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1943  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1944  }
      // Odd number of columns: handle the single leftover column for this row block.
1945  if( j < N ) {
1946  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1947  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1948  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1949  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1950  for( size_t k=0UL; k<K; ++k ) {
1951  const IntrinsicType b1( set( B(k,j) ) );
1952  xmm1 = xmm1 - A.get(i ,k) * b1;
1953  xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1954  xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1955  xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1956  }
1957  store( &(~C)(i ,j), xmm1 );
1958  store( &(~C)(i+IT::size ,j), xmm2 );
1959  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1960  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1961  }
1962  }
      // Blocks of 2 vectors of rows, two columns of C at a time.
1963  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1964  size_t j( 0UL );
1965  for( ; (j+2UL) <= N; j+=2UL ) {
1966  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1967  IntrinsicType xmm2( load( &(~C)(i+IT::size,j ) ) );
1968  IntrinsicType xmm3( load( &(~C)(i ,j+1UL) ) );
1969  IntrinsicType xmm4( load( &(~C)(i+IT::size,j+1UL) ) );
1970  for( size_t k=0UL; k<K; ++k ) {
1971  const IntrinsicType a1( A.get(i ,k) );
1972  const IntrinsicType a2( A.get(i+IT::size,k) );
1973  const IntrinsicType b1( set( B(k,j ) ) );
1974  const IntrinsicType b2( set( B(k,j+1UL) ) );
1975  xmm1 = xmm1 - a1 * b1;
1976  xmm2 = xmm2 - a2 * b1;
1977  xmm3 = xmm3 - a1 * b2;
1978  xmm4 = xmm4 - a2 * b2;
1979  }
1980  store( &(~C)(i ,j ), xmm1 );
1981  store( &(~C)(i+IT::size,j ), xmm2 );
1982  store( &(~C)(i ,j+1UL), xmm3 );
1983  store( &(~C)(i+IT::size,j+1UL), xmm4 );
1984  }
      // Single leftover column for this row block.
1985  if( j < N ) {
1986  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1987  IntrinsicType xmm2( load( &(~C)(i+IT::size,j) ) );
1988  for( size_t k=0UL; k<K; ++k ) {
1989  const IntrinsicType b1( set( B(k,j) ) );
1990  xmm1 = xmm1 - A.get(i ,k) * b1;
1991  xmm2 = xmm2 - A.get(i+IT::size,k) * b1;
1992  }
1993  store( &(~C)(i ,j), xmm1 );
1994  store( &(~C)(i+IT::size,j), xmm2 );
1995  }
1996  }
      // Remaining rows: exactly one more vector is processed starting at row i.
      // NOTE(review): this covers all remaining rows only if M - i <= IT::size here, i.e. if M is
      // a multiple of IT::size -- presumably guaranteed by spacing(); verify.
1997  if( i < M ) {
1998  size_t j( 0UL );
1999  for( ; (j+2UL) <= N; j+=2UL ) {
2000  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
2001  IntrinsicType xmm2( load( &(~C)(i,j+1UL) ) );
2002  for( size_t k=0UL; k<K; ++k ) {
2003  const IntrinsicType a1( A.get(i,k) );
2004  xmm1 = xmm1 - a1 * set( B(k,j ) );
2005  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
2006  }
2007  store( &(~C)(i,j ), xmm1 );
2008  store( &(~C)(i,j+1UL), xmm2 );
2009  }
2010  if( j < N ) {
2011  IntrinsicType xmm1( load( &(~C)(i,j) ) );
2012  for( size_t k=0UL; k<K; ++k ) {
2013  xmm1 = xmm1 - A.get(i,k) * set( B(k,j) );
2014  }
2015  store( &(~C)(i,j), xmm1 );
2016  }
2017  }
2018  }
2020  //**********************************************************************************************
2021 
2022  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback overload of the BLAS subtraction assignment kernel. It is enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards to selectDefaultSubAssignKernel(),
// so no BLAS routine is called on this path.
//
//   C  Target matrix of the subtraction assignment.
//   A  Left-hand side matrix operand.
//   B  Right-hand side matrix operand.
2036  template< typename MT3 // Type of the left-hand side target matrix
2037  , typename MT4 // Type of the left-hand side matrix operand
2038  , typename MT5 > // Type of the right-hand side matrix operand
2039  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2040  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2041  {
2042  selectDefaultSubAssignKernel( C, A, B );
2043  }
2045  //**********************************************************************************************
2046 
2047  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
2048 #if BLAZE_BLAS_MODE
2049 
// Single precision BLAS kernel for the subtraction assignment: calls cblas_sgemm with
// alpha = -1.0F and beta = 1.0F, i.e. the scaled product is accumulated onto the existing C.
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5> holds; only compiled if BLAZE_BLAS_MODE is set.
//
//   C  Target matrix (updated in place through C.data()).
//   A  Left-hand side matrix operand.
//   B  Right-hand side matrix operand.
2062  template< typename MT3 // Type of the left-hand side target matrix
2063  , typename MT4 // Type of the left-hand side matrix operand
2064  , typename MT5 > // Type of the right-hand side matrix operand
2065  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2066  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2067  {
2068  using boost::numeric_cast;
2069 
2070  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
2071  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
2072  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
2073 
      // CBLAS takes int dimensions; numeric_cast converts the size_t values with a range check.
2074  const int M ( numeric_cast<int>( A.rows() ) );
2075  const int N ( numeric_cast<int>( B.columns() ) );
2076  const int K ( numeric_cast<int>( A.columns() ) );
2077  const int lda( numeric_cast<int>( A.spacing() ) );
2078  const int ldb( numeric_cast<int>( B.spacing() ) );
2079  const int ldc( numeric_cast<int>( C.spacing() ) );
2080 
      // The storage order passed to BLAS follows the target C. For a row-major target A is flagged
      // as transposed, for a column-major target B is -- presumably because A is stored
      // column-major and B row-major in this expression (TODO confirm).
2081  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2082  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2083  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2084  M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
2085  }
2087 #endif
2088  //**********************************************************************************************
2089 
2090  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
2091 #if BLAZE_BLAS_MODE
2092 
// Double precision BLAS kernel for the subtraction assignment: calls cblas_dgemm with
// alpha = -1.0 and beta = 1.0, i.e. the scaled product is accumulated onto the existing C.
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5> holds; only compiled if BLAZE_BLAS_MODE is set.
//
//   C  Target matrix (updated in place through C.data()).
//   A  Left-hand side matrix operand.
//   B  Right-hand side matrix operand.
2105  template< typename MT3 // Type of the left-hand side target matrix
2106  , typename MT4 // Type of the left-hand side matrix operand
2107  , typename MT5 > // Type of the right-hand side matrix operand
2108  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2109  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2110  {
2111  using boost::numeric_cast;
2112 
2113  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
2114  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
2115  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
2116 
      // CBLAS takes int dimensions; numeric_cast converts the size_t values with a range check.
2117  const int M ( numeric_cast<int>( A.rows() ) );
2118  const int N ( numeric_cast<int>( B.columns() ) );
2119  const int K ( numeric_cast<int>( A.columns() ) );
2120  const int lda( numeric_cast<int>( A.spacing() ) );
2121  const int ldb( numeric_cast<int>( B.spacing() ) );
2122  const int ldc( numeric_cast<int>( C.spacing() ) );
2123 
      // The storage order passed to BLAS follows the target C. For a row-major target A is flagged
      // as transposed, for a column-major target B is -- presumably because A is stored
      // column-major and B row-major in this expression (TODO confirm).
2124  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2125  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2126  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2127  M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
2128  }
2130 #endif
2131  //**********************************************************************************************
2132 
2133  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
2134 #if BLAZE_BLAS_MODE
2135 
// Single precision complex BLAS kernel for the subtraction assignment: calls cblas_cgemm with
// alpha = (-1,0) and beta = (1,0), i.e. the scaled product is accumulated onto the existing C.
// Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds; only compiled if
// BLAZE_BLAS_MODE is set.
//
//   C  Target matrix (updated in place through C.data()).
//   A  Left-hand side matrix operand.
//   B  Right-hand side matrix operand.
2148  template< typename MT3 // Type of the left-hand side target matrix
2149  , typename MT4 // Type of the left-hand side matrix operand
2150  , typename MT5 > // Type of the right-hand side matrix operand
2151  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2152  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2153  {
2154  using boost::numeric_cast;
2155 
2156  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2157  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2158  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2159  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
2160  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
2161  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
2162 
      // CBLAS takes int dimensions; numeric_cast converts the size_t values with a range check.
2163  const int M ( numeric_cast<int>( A.rows() ) );
2164  const int N ( numeric_cast<int>( B.columns() ) );
2165  const int K ( numeric_cast<int>( A.columns() ) );
2166  const int lda( numeric_cast<int>( A.spacing() ) );
2167  const int ldb( numeric_cast<int>( B.spacing() ) );
2168  const int ldc( numeric_cast<int>( C.spacing() ) );
      // The complex gemm routine receives its scaling factors by address, hence the named constants.
2169  const complex<float> alpha( -1.0F, 0.0F );
2170  const complex<float> beta ( 1.0F, 0.0F );
2171 
      // The storage order passed to BLAS follows the target C. For a row-major target A is flagged
      // as transposed, for a column-major target B is -- presumably because A is stored
      // column-major and B row-major in this expression (TODO confirm).
2172  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2173  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2174  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2175  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2176  }
2178 #endif
2179  //**********************************************************************************************
2180 
2181  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
2182 #if BLAZE_BLAS_MODE
2183 
// Double precision complex BLAS kernel for the subtraction assignment: calls cblas_zgemm with
// alpha = (-1,0) and beta = (1,0), i.e. the scaled product is accumulated onto the existing C.
// Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds; only compiled if
// BLAZE_BLAS_MODE is set.
//
//   C  Target matrix (updated in place through C.data()).
//   A  Left-hand side matrix operand.
//   B  Right-hand side matrix operand.
2196  template< typename MT3 // Type of the left-hand side target matrix
2197  , typename MT4 // Type of the left-hand side matrix operand
2198  , typename MT5 > // Type of the right-hand side matrix operand
2199  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2200  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2201  {
2202  using boost::numeric_cast;
2203 
2204  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2205  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2206  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2207  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
2208  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
2209  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
2210 
      // CBLAS takes int dimensions; numeric_cast converts the size_t values with a range check.
2211  const int M ( numeric_cast<int>( A.rows() ) );
2212  const int N ( numeric_cast<int>( B.columns() ) );
2213  const int K ( numeric_cast<int>( A.columns() ) );
2214  const int lda( numeric_cast<int>( A.spacing() ) );
2215  const int ldb( numeric_cast<int>( B.spacing() ) );
2216  const int ldc( numeric_cast<int>( C.spacing() ) );
      // The complex gemm routine receives its scaling factors by address, hence the named constants.
2217  const complex<double> alpha( -1.0, 0.0 );
2218  const complex<double> beta ( 1.0, 0.0 );
2219 
      // The storage order passed to BLAS follows the target C. For a row-major target A is flagged
      // as transposed, for a column-major target B is -- presumably because A is stored
      // column-major and B row-major in this expression (TODO confirm).
2220  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2221  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2222  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2223  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2224  }
2226 #endif
2227  //**********************************************************************************************
2228 
2229  //**Subtraction assignment to sparse matrices***************************************************
2230  // No special implementation for the subtraction assignment to sparse matrices.
2231  //**********************************************************************************************
2232 
2233  //**Multiplication assignment to dense matrices*************************************************
2234  // No special implementation for the multiplication assignment to dense matrices.
2235  //**********************************************************************************************
2236 
2237  //**Multiplication assignment to sparse matrices************************************************
2238  // No special implementation for the multiplication assignment to sparse matrices.
2239  //**********************************************************************************************
2240 
2241  //**Compile time checks*************************************************************************
2248  //**********************************************************************************************
2249 };
2250 //*************************************************************************************************
2251 
2252 
2253 
2254 
2255 //=================================================================================================
2256 //
2257 // DMATSCALARMULTEXPR SPECIALIZATION
2258 //
2259 //=================================================================================================
2260 
2261 //*************************************************************************************************
2269 template< typename MT1 // Type of the left-hand side dense matrix
2270  , typename MT2 // Type of the right-hand side dense matrix
2271  , typename ST > // Type of the right-hand side scalar value
2272 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >
2273  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2274  , private Expression
2275  , private Computation
2276 {
2277  private:
2278  //**Type definitions****************************************************************************
// Private shorthand types used throughout this specialization.
2279  typedef TDMatDMatMultExpr<MT1,MT2> MMM; // The wrapped matrix/matrix multiplication expression
2280  typedef typename MMM::ResultType RES; // Result type of the multiplication expression
2281  typedef typename MT1::ResultType RT1; // Result type of the left-hand side matrix operand
2282  typedef typename MT2::ResultType RT2; // Result type of the right-hand side matrix operand
2283  typedef typename MT1::CompositeType CT1; // Composite type of the left-hand side matrix operand
2284  typedef typename MT2::CompositeType CT2; // Composite type of the right-hand side matrix operand
2285  //**********************************************************************************************
2286 
2287  //**********************************************************************************************
2289 
// Compile-time predicate: value is non-zero when the element types of T1, T2 and T3 all satisfy
// IsFloat and T4 is not a complex type. Within this class T4 is instantiated with the scalar
// type (see UseDefaultKernel / UseVectorizedDefaultKernel usage).
2292  template< typename T1, typename T2, typename T3, typename T4 >
2293  struct UseSinglePrecisionKernel {
2294  enum { value = IsFloat<typename T1::ElementType>::value &&
2295  IsFloat<typename T2::ElementType>::value &&
2296  IsFloat<typename T3::ElementType>::value &&
2297  !IsComplex<T4>::value };
2298  };
2299  //**********************************************************************************************
2300 
2301  //**********************************************************************************************
2303 
// Compile-time predicate: value is non-zero when the element types of T1, T2 and T3 all satisfy
// IsDouble and T4 is not a complex type.
2306  template< typename T1, typename T2, typename T3, typename T4 >
2307  struct UseDoublePrecisionKernel {
2308  enum { value = IsDouble<typename T1::ElementType>::value &&
2309  IsDouble<typename T2::ElementType>::value &&
2310  IsDouble<typename T3::ElementType>::value &&
2311  !IsComplex<T4>::value };
2312  };
2313  //**********************************************************************************************
2314 
2315  //**********************************************************************************************
2317 
// Compile-time predicate: value is non-zero when the element types of T1, T2 and T3 are all
// exactly complex<float>.
2320  template< typename T1, typename T2, typename T3 >
2321  struct UseSinglePrecisionComplexKernel {
2322  typedef complex<float> Type;
2323  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2324  IsSame<typename T2::ElementType,Type>::value &&
2325  IsSame<typename T3::ElementType,Type>::value };
2326  };
2327  //**********************************************************************************************
2328 
2329  //**********************************************************************************************
2331 
// Compile-time predicate: value is non-zero when the element types of T1, T2 and T3 are all
// exactly complex<double>.
2334  template< typename T1, typename T2, typename T3 >
2335  struct UseDoublePrecisionComplexKernel {
2336  typedef complex<double> Type;
2337  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2338  IsSame<typename T2::ElementType,Type>::value &&
2339  IsSame<typename T3::ElementType,Type>::value };
2340  };
2341  //**********************************************************************************************
2342 
2343  //**********************************************************************************************
2345 
// Compile-time predicate selecting the non-BLAS fallback: value is non-zero when BLAZE_BLAS_MODE
// is disabled, or when none of the four BLAS kernel predicates above applies to the given types.
2347  template< typename T1, typename T2, typename T3, typename T4 >
2348  struct UseDefaultKernel {
2349  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2350  !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2351  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2352  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
2353  };
2354  //**********************************************************************************************
2355 
2356  //**********************************************************************************************
2358 
// Compile-time predicate selecting the vectorized default kernels: value is non-zero when T1, T2
// and T3 are all flagged vectorizable, share the same element type, T4 is that same element
// type, and IntrinsicTrait reports both addition and multiplication for the element type.
2360  template< typename T1, typename T2, typename T3, typename T4 >
2361  struct UseVectorizedDefaultKernel {
2362  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2363  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2364  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2365  IsSame<typename T1::ElementType,T4>::value &&
2366  IntrinsicTrait<typename T1::ElementType>::addition &&
2367  IntrinsicTrait<typename T1::ElementType>::multiplication };
2368  };
2369  //**********************************************************************************************
2370 
2371  public:
2372  //**Type definitions****************************************************************************
// Public type definitions of the scaled multiplication expression.
2373  typedef DMatScalarMultExpr<MMM,ST,true> This; // Type of this expression specialization
2374  typedef typename MultTrait<RES,ST>::Type ResultType; // Result type of (A*B)*scalar as given by MultTrait
2375  typedef typename ResultType::OppositeType OppositeType; // OppositeType as exported by ResultType
2376  typedef typename ResultType::TransposeType TransposeType; // TransposeType as exported by ResultType
2377  typedef typename ResultType::ElementType ElementType; // Element type of the result
2378  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType; // Intrinsic (SIMD) type for ElementType
2379  typedef const ElementType ReturnType; // Return type for element access
2380  typedef const ResultType CompositeType; // Type used when this expression is an operand
2381 
// NOTE(review): members below use a LeftOperand type, but its typedef is not visible in this
// listing (original line 2383 is missing) -- confirm it is declared in the actual header.
2384 
// Scalar operand type: ElementType if ElementType is numeric, otherwise ST
// (assuming SelectType<cond,T1,T2> yields T1 for a true condition -- TODO confirm).
2386  typedef typename SelectType< IsNumeric<ElementType>::value, ElementType, ST >::Type RightOperand;
2387 
// Type used to evaluate the left matrix operand: const RT1 if MT1 is a computation, else CT1.
2389  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
2390 
// Type used to evaluate the right matrix operand: const RT2 if MT2 is a computation, else CT2.
2392  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
2393  //**********************************************************************************************
2394 
2395  //**Compilation flags***************************************************************************
// Compilation flag: this expression type reports itself as not vectorizable.
2397  enum { vectorizable = 0 };
2398 
// Compilation flag: aliasing capability is inherited from the wrapped multiplication expression.
2400  enum { canAlias = CanAlias<MMM>::value };
2401  //**********************************************************************************************
2402 
2403  //**Constructor*********************************************************************************
// Constructor: stores the matrix multiplication expression and the scalar factor.
//
//   matrix  The left-hand side matrix multiplication expression.
//   scalar  The right-hand side scalar of the scaling operation.
2409  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
2410  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
2411  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2412  {}
2413  //**********************************************************************************************
2414 
2415  //**Access operator*****************************************************************************
// 2D element access: returns element (i,j) of the wrapped multiplication expression multiplied
// by the stored scalar.
//
//   i  Row index; asserted (internal assertion) to be less than the number of rows.
//   j  Column index; asserted (internal assertion) to be less than the number of columns.
//
// The return type is ReturnType (const ElementType): the value computed here is a single
// element, not a whole ResultType matrix.
2422  inline ReturnType operator()( size_t i, size_t j ) const {
2423  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
2424  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
2425  return matrix_(i,j) * scalar_;
2426  }
2427  //**********************************************************************************************
2428 
2429  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, taken from the wrapped multiplication expression.
2434  inline size_t rows() const {
2435  return matrix_.rows();
2436  }
2437  //**********************************************************************************************
2438 
2439  //**Columns function****************************************************************************
// Returns the number of columns of the expression, taken from the wrapped multiplication expression.
2444  inline size_t columns() const {
2445  return matrix_.columns();
2446  }
2447  //**********************************************************************************************
2448 
2449  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the wrapped matrix multiplication expression.
2454  inline LeftOperand leftOperand() const {
2455  return matrix_;
2456  }
2457  //**********************************************************************************************
2458 
2459  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the stored scalar factor.
2464  inline RightOperand rightOperand() const {
2465  return scalar_;
2466  }
2467  //**********************************************************************************************
2468 
2469  //**********************************************************************************************
// Returns whether the expression is aliased with the given address: true only if the wrapped
// multiplication expression can alias at all (CanAlias<MMM>) and reports aliasing with 'alias'.
//
//   alias  The address to be checked against the operands of the expression.
2475  template< typename T >
2476  inline bool isAliased( const T* alias ) const {
2477  return CanAlias<MMM>::value && matrix_.isAliased( alias );
2478  }
2479  //**********************************************************************************************
2480 
2481  private:
2482  //**Member variables****************************************************************************
2483  LeftOperand matrix_; // Left-hand side: the wrapped matrix multiplication expression
2484  RightOperand scalar_; // Right-hand side: the scalar factor
2485  //**********************************************************************************************
2486 
2487  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled matrix product (A*B)*scalar to a dense matrix of either storage order.
//
//   lhs  The target dense matrix; asserted to match the dimensions of rhs.
//   rhs  The scaled multiplication expression to be assigned.
//
// The two matrix operands are taken from the wrapped multiplication expression, evaluated into
// LT / RT objects, and handed to either the default kernel or the BLAS kernel depending on the
// size of the target.
2496  template< typename MT3 // Type of the target dense matrix
2497  , bool SO > // Storage order of the target dense matrix
2498  friend inline void assign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2499  {
2500  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2501  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2502 
2503  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2504  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2505 
      // Empty target: nothing to do. Zero inner dimension: the kernels below read index 0 of the
      // inner dimension unconditionally, so this case is handled here by resetting the target.
2506  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
2507  return;
2508  }
2509  else if( left.columns() == 0UL ) {
2510  reset( ~lhs );
2511  return;
2512  }
2513 
2514  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2515  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2516 
2517  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2518  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2519  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2520  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2521  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2522  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2523 
      // Targets with fewer than TDMATDMATMULT_THRESHOLD elements use the default kernel,
      // larger ones are dispatched to the BLAS kernel selection.
2524  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
2525  DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2526  else
2527  DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
2528  }
2529  //**********************************************************************************************
2530 
2531  //**Default assignment to dense matrices********************************************************
// Non-vectorized default kernel for the assignment C = (A * B) * scalar. Enabled (via DisableIf)
// whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold.
//
//   C       Target matrix.
//   A       Left-hand side matrix operand.
//   B       Right-hand side matrix operand.
//   scalar  Scaling factor applied to every element of the product.
//
// Each row i of C is first initialized from the j == 0 term, then the remaining terms are
// accumulated, and finally the row is scaled. The unconditional access to A(i,0) and B(0,k)
// requires A.columns() > 0; the visible caller (assign) filters out the zero case beforehand.
2545  template< typename MT3 // Type of the left-hand side target matrix
2546  , typename MT4 // Type of the left-hand side matrix operand
2547  , typename MT5 // Type of the right-hand side matrix operand
2548  , typename ST2 > // Type of the scalar value
2549  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2550  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2551  {
2552  for( size_t i=0UL; i<A.rows(); ++i ) {
      // Initialize row i with the first term of the dot products (overwrites previous contents).
2553  for( size_t k=0UL; k<B.columns(); ++k ) {
2554  C(i,k) = A(i,0UL) * B(0UL,k);
2555  }
      // Accumulate the remaining terms; here j runs over the inner dimension and k over columns.
2556  for( size_t j=1UL; j<A.columns(); ++j ) {
2557  for( size_t k=0UL; k<B.columns(); ++k ) {
2558  C(i,k) += A(i,j) * B(j,k);
2559  }
2560  }
      // Apply the scalar factor to the finished row.
2561  for( size_t k=0UL; k<B.columns(); ++k ) {
2562  C(i,k) *= scalar;
2563  }
2564  }
2565  }
2566  //**********************************************************************************************
2567 
2568  //**Vectorized default assignment to row-major dense matrices***********************************
// Vectorized default kernel for the assignment C = (A * B) * scalar where the target C is a
// row-major dense matrix (DenseMatrix<MT3,false>). The overload is enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
//   C       Target matrix; every processed element is overwritten via store().
//   A       Left-hand side operand, read element-wise via A(i,k) and passed through set().
//   B       Right-hand side operand, read as whole intrinsic vectors along the column index.
//   scalar  Scaling factor; expanded once into the intrinsic value 'factor'.
//
// The column index j advances in multiples of IT::size (presumably the number of elements per
// intrinsic vector -- TODO confirm against IntrinsicTrait). The column range is processed in
// progressively smaller blocks: 8 vectors, then 4 vectors, then 2 vectors, then one final vector.
//
// NOTE(review): the xmm accumulators are declared without an initializer and then used in
// 'xmm = xmm + ...'. This relies on IntrinsicType's default constructor producing zero -- confirm.
2582  template< typename MT3 // Type of the left-hand side target matrix
2583  , typename MT4 // Type of the left-hand side matrix operand
2584  , typename MT5 // Type of the right-hand side matrix operand
2585  , typename ST2 > // Type of the scalar value
2586  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2587  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2588  {
2589  typedef IntrinsicTrait<ElementType> IT;
2590 
      // NOTE(review): N is taken from B.spacing(), not B.columns() -- presumably the padded row
      // length, so that every block below can load/store full vectors. Confirm with the matrix type.
2591  const size_t M( A.rows() );
2592  const size_t N( B.spacing() );
2593  const size_t K( A.columns() );
2594 
2595  const IntrinsicType factor( set( scalar ) );
2596 
2597  size_t j( 0UL );
2598 
      // Blocks of 8 vectors of columns, one row of C at a time.
2599  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2600  for( size_t i=0UL; i<M; ++i ) {
2601  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2602  for( size_t k=0UL; k<K; ++k ) {
2603  const IntrinsicType a1( set( A(i,k) ) );
2604  xmm1 = xmm1 + a1 * B.get(k,j );
2605  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2606  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2607  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2608  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2609  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2610  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2611  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
2612  }
2613  store( &(~C)(i,j ), xmm1 * factor );
2614  store( &(~C)(i,j+IT::size ), xmm2 * factor );
2615  store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2616  store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2617  store( &(~C)(i,j+IT::size*4UL), xmm5 * factor );
2618  store( &(~C)(i,j+IT::size*5UL), xmm6 * factor );
2619  store( &(~C)(i,j+IT::size*6UL), xmm7 * factor );
2620  store( &(~C)(i,j+IT::size*7UL), xmm8 * factor );
2621  }
2622  }
      // Blocks of 4 vectors of columns, two rows of C at a time (xmm1-4: row i, xmm5-8: row i+1).
2623  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2624  size_t i( 0UL );
2625  for( ; (i+2UL) <= M; i+=2UL ) {
2626  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2627  for( size_t k=0UL; k<K; ++k ) {
2628  const IntrinsicType a1( set( A(i ,k) ) );
2629  const IntrinsicType a2( set( A(i+1UL,k) ) );
2630  const IntrinsicType b1( B.get(k,j ) );
2631  const IntrinsicType b2( B.get(k,j+IT::size ) );
2632  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
2633  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
2634  xmm1 = xmm1 + a1 * b1;
2635  xmm2 = xmm2 + a1 * b2;
2636  xmm3 = xmm3 + a1 * b3;
2637  xmm4 = xmm4 + a1 * b4;
2638  xmm5 = xmm5 + a2 * b1;
2639  xmm6 = xmm6 + a2 * b2;
2640  xmm7 = xmm7 + a2 * b3;
2641  xmm8 = xmm8 + a2 * b4;
2642  }
2643  store( &(~C)(i ,j ), xmm1 * factor );
2644  store( &(~C)(i ,j+IT::size ), xmm2 * factor );
2645  store( &(~C)(i ,j+IT::size*2UL), xmm3 * factor );
2646  store( &(~C)(i ,j+IT::size*3UL), xmm4 * factor );
2647  store( &(~C)(i+1UL,j ), xmm5 * factor );
2648  store( &(~C)(i+1UL,j+IT::size ), xmm6 * factor );
2649  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 * factor );
2650  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 * factor );
2651  }
      // Odd number of rows: handle the single leftover row for this column block.
2652  if( i < M ) {
2653  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2654  for( size_t k=0UL; k<K; ++k ) {
2655  const IntrinsicType a1( set( A(i,k) ) );
2656  xmm1 = xmm1 + a1 * B.get(k,j );
2657  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2658  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2659  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2660  }
2661  store( &(~C)(i,j ), xmm1 * factor );
2662  store( &(~C)(i,j+IT::size ), xmm2 * factor );
2663  store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2664  store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2665  }
2666  }
      // Blocks of 2 vectors of columns, two rows of C at a time.
2667  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2668  size_t i( 0UL );
2669  for( ; (i+2UL) <= M; i+=2UL ) {
2670  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2671  for( size_t k=0UL; k<K; ++k ) {
2672  const IntrinsicType a1( set( A(i ,k) ) );
2673  const IntrinsicType a2( set( A(i+1UL,k) ) );
2674  const IntrinsicType b1( B.get(k,j ) );
2675  const IntrinsicType b2( B.get(k,j+IT::size) );
2676  xmm1 = xmm1 + a1 * b1;
2677  xmm2 = xmm2 + a1 * b2;
2678  xmm3 = xmm3 + a2 * b1;
2679  xmm4 = xmm4 + a2 * b2;
2680  }
2681  store( &(~C)(i ,j ), xmm1 * factor );
2682  store( &(~C)(i ,j+IT::size), xmm2 * factor );
2683  store( &(~C)(i+1UL,j ), xmm3 * factor );
2684  store( &(~C)(i+1UL,j+IT::size), xmm4 * factor );
2685  }
      // Single leftover row for this column block.
2686  if( i < M ) {
2687  IntrinsicType xmm1, xmm2;
2688  for( size_t k=0UL; k<K; ++k ) {
2689  const IntrinsicType a1( set( A(i,k) ) );
2690  xmm1 = xmm1 + a1 * B.get(k,j );
2691  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2692  }
2693  store( &(~C)(i,j ), xmm1 * factor );
2694  store( &(~C)(i,j+IT::size), xmm2 * factor );
2695  }
2696  }
      // Remaining columns: exactly one more vector is processed starting at column j.
      // NOTE(review): this covers all remaining columns only if N - j <= IT::size here, i.e. if N
      // is a multiple of IT::size -- presumably guaranteed by spacing(); verify.
2697  if( j < N ) {
2698  size_t i( 0UL );
2699  for( ; (i+2UL) <= M; i+=2UL ) {
2700  IntrinsicType xmm1, xmm2;
2701  for( size_t k=0UL; k<K; ++k ) {
2702  const IntrinsicType b1( B.get(k,j) );
2703  xmm1 = xmm1 + set( A(i ,k) ) * b1;
2704  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
2705  }
2706  store( &(~C)(i ,j), xmm1 * factor );
2707  store( &(~C)(i+1UL,j), xmm2 * factor );
2708  }
2709  if( i < M ) {
2710  IntrinsicType xmm1;
2711  for( size_t k=0UL; k<K; ++k ) {
2712  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
2713  }
2714  store( &(~C)(i,j), xmm1 * factor );
2715  }
2716  }
2717  }
2718  //**********************************************************************************************
2719 
2720  //**Vectorized default assignment to column-major dense matrices********************************
// Vectorized default kernel for the assignment of a scaled matrix product to a
// column-major dense matrix: each vector of C receives the sum over k of the
// products of A and B, multiplied by the scalar. The overload only takes part in
// overload resolution when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied
// (EnableIf).
//
// C       target dense matrix, written vector-wise through store()
// A       left-hand side operand, read vector-wise through A.get(i,k)
// B       right-hand side operand, read element-wise through B(k,j)
// scalar  scaling factor applied to every accumulated vector before it is stored
2734  template< typename MT3 // Type of the left-hand side target matrix
2735  , typename MT4 // Type of the left-hand side matrix operand
2736  , typename MT5 // Type of the right-hand side matrix operand
2737  , typename ST2 > // Type of the scalar value
2738  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2739  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
2740  {
2741  typedef IntrinsicTrait<ElementType> IT;
2742 
// NOTE(review): M comes from A.spacing(), not A.rows(); presumably the spacing
// includes padding so that only whole vectors are processed -- confirm.
2743  const size_t M( A.spacing() );
2744  const size_t N( B.columns() );
2745  const size_t K( A.columns() );
2746 
// The scalar is converted to an IntrinsicType once, outside of all loops.
2747  const IntrinsicType factor( set( scalar ) );
2748 
// Row index shared by all following loops; each loop continues where the previous one stopped.
2749  size_t i( 0UL );
2750 
// Strips of 8*IT::size rows, one column j of C per iteration.
2751  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2752  for( size_t j=0UL; j<N; ++j ) {
// NOTE(review): the accumulators are never explicitly zeroed; this relies on the
// default constructor of IntrinsicType producing zero -- confirm.
2753  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2754  for( size_t k=0UL; k<K; ++k ) {
2755  const IntrinsicType b1( set( B(k,j) ) );
2756  xmm1 = xmm1 + A.get(i ,k) * b1;
2757  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2758  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2759  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2760  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2761  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2762  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2763  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
2764  }
2765  store( &(~C)(i ,j), xmm1 * factor );
2766  store( &(~C)(i+IT::size ,j), xmm2 * factor );
2767  store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2768  store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2769  store( &(~C)(i+IT::size*4UL,j), xmm5 * factor );
2770  store( &(~C)(i+IT::size*5UL,j), xmm6 * factor );
2771  store( &(~C)(i+IT::size*6UL,j), xmm7 * factor );
2772  store( &(~C)(i+IT::size*7UL,j), xmm8 * factor );
2773  }
2774  }
// Strips of 4*IT::size rows; columns are handled in pairs (j, j+1) so that every
// vector loaded from A is used for two columns of C.
2775  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2776  size_t j( 0UL );
2777  for( ; (j+2UL) <= N; j+=2UL ) {
2778  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2779  for( size_t k=0UL; k<K; ++k ) {
2780  const IntrinsicType a1( A.get(i ,k) );
2781  const IntrinsicType a2( A.get(i+IT::size ,k) );
2782  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
2783  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
2784  const IntrinsicType b1( set( B(k,j ) ) );
2785  const IntrinsicType b2( set( B(k,j+1UL) ) );
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
2786  xmm1 = xmm1 + a1 * b1;
2787  xmm2 = xmm2 + a2 * b1;
2788  xmm3 = xmm3 + a3 * b1;
2789  xmm4 = xmm4 + a4 * b1;
2790  xmm5 = xmm5 + a1 * b2;
2791  xmm6 = xmm6 + a2 * b2;
2792  xmm7 = xmm7 + a3 * b2;
2793  xmm8 = xmm8 + a4 * b2;
2794  }
2795  store( &(~C)(i ,j ), xmm1 * factor );
2796  store( &(~C)(i+IT::size ,j ), xmm2 * factor );
2797  store( &(~C)(i+IT::size*2UL,j ), xmm3 * factor );
2798  store( &(~C)(i+IT::size*3UL,j ), xmm4 * factor );
2799  store( &(~C)(i ,j+1UL), xmm5 * factor );
2800  store( &(~C)(i+IT::size ,j+1UL), xmm6 * factor );
2801  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 * factor );
2802  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 * factor );
2803  }
// Odd N: one column is left over from the pairwise loop.
2804  if( j < N ) {
2805  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2806  for( size_t k=0UL; k<K; ++k ) {
2807  const IntrinsicType b1( set( B(k,j) ) );
2808  xmm1 = xmm1 + A.get(i ,k) * b1;
2809  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2810  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2811  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2812  }
2813  store( &(~C)(i ,j), xmm1 * factor );
2814  store( &(~C)(i+IT::size ,j), xmm2 * factor );
2815  store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2816  store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2817  }
2818  }
// Strips of 2*IT::size rows, again with columns handled in pairs.
2819  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2820  size_t j( 0UL );
2821  for( ; (j+2UL) <= N; j+=2UL ) {
2822  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2823  for( size_t k=0UL; k<K; ++k ) {
2824  const IntrinsicType a1( A.get(i ,k) );
2825  const IntrinsicType a2( A.get(i+IT::size,k) );
2826  const IntrinsicType b1( set( B(k,j ) ) );
2827  const IntrinsicType b2( set( B(k,j+1UL) ) );
// xmm1/xmm2 belong to column j, xmm3/xmm4 to column j+1.
2828  xmm1 = xmm1 + a1 * b1;
2829  xmm2 = xmm2 + a2 * b1;
2830  xmm3 = xmm3 + a1 * b2;
2831  xmm4 = xmm4 + a2 * b2;
2832  }
2833  store( &(~C)(i ,j ), xmm1 * factor );
2834  store( &(~C)(i+IT::size,j ), xmm2 * factor );
2835  store( &(~C)(i ,j+1UL), xmm3 * factor );
2836  store( &(~C)(i+IT::size,j+1UL), xmm4 * factor );
2837  }
// Odd N: one column is left over from the pairwise loop.
2838  if( j < N ) {
2839  IntrinsicType xmm1, xmm2;
2840  for( size_t k=0UL; k<K; ++k ) {
2841  const IntrinsicType b1( set( B(k,j) ) );
2842  xmm1 = xmm1 + A.get(i ,k) * b1;
2843  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2844  }
2845  store( &(~C)(i ,j), xmm1 * factor );
2846  store( &(~C)(i+IT::size,j), xmm2 * factor );
2847  }
2848  }
// Fewer than 2*IT::size rows remain here; they are processed as a single vector
// starting at row i, with columns still handled in pairs.
2849  if( i < M ) {
2850  size_t j( 0UL );
2851  for( ; (j+2UL) <= N; j+=2UL ) {
2852  IntrinsicType xmm1, xmm2;
2853  for( size_t k=0UL; k<K; ++k ) {
2854  const IntrinsicType a1( A.get(i,k) );
2855  xmm1 = xmm1 + a1 * set( B(k,j ) );
2856  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
2857  }
2858  store( &(~C)(i,j ), xmm1 * factor );
2859  store( &(~C)(i,j+1UL), xmm2 * factor );
2860  }
// Odd N: final single column for the remaining vector of rows.
2861  if( j < N ) {
2862  IntrinsicType xmm1;
2863  for( size_t k=0UL; k<K; ++k ) {
2864  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
2865  }
2866  store( &(~C)(i,j), xmm1 * factor );
2867  }
2868  }
2869  }
2870  //**********************************************************************************************
2871 
2872  //**BLAS-based assignment to dense matrices (default)*******************************************
2886  template< typename MT3 // Type of the left-hand side target matrix
2887  , typename MT4 // Type of the left-hand side matrix operand
2888  , typename MT5 // Type of the right-hand side matrix operand
2889  , typename ST2 > // Type of the scalar value
2890  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2891  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2892  {
2893  selectDefaultAssignKernel( C, A, B, scalar );
2894  }
2895  //**********************************************************************************************
2896 
2897  //**BLAS-based assignment to dense matrices (single precision)**********************************
2898 #if BLAZE_BLAS_MODE
2899 
2912  template< typename MT3 // Type of the left-hand side target matrix
2913  , typename MT4 // Type of the left-hand side matrix operand
2914  , typename MT5 // Type of the right-hand side matrix operand
2915  , typename ST2 > // Type of the scalar value
2916  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2917  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2918  {
2919  using boost::numeric_cast;
2920 
2921  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
2922  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
2923  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
2924 
2925  const int M ( numeric_cast<int>( A.rows() ) );
2926  const int N ( numeric_cast<int>( B.columns() ) );
2927  const int K ( numeric_cast<int>( A.columns() ) );
2928  const int lda( numeric_cast<int>( A.spacing() ) );
2929  const int ldb( numeric_cast<int>( B.spacing() ) );
2930  const int ldc( numeric_cast<int>( C.spacing() ) );
2931 
2932  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2933  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2934  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2935  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
2936  }
2937 #endif
2938  //**********************************************************************************************
2939 
2940  //**BLAS-based assignment to dense matrices (double precision)**********************************
2941 #if BLAZE_BLAS_MODE
2942 
2955  template< typename MT3 // Type of the left-hand side target matrix
2956  , typename MT4 // Type of the left-hand side matrix operand
2957  , typename MT5 // Type of the right-hand side matrix operand
2958  , typename ST2 > // Type of the scalar value
2959  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2960  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2961  {
2962  using boost::numeric_cast;
2963 
2964  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
2965  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
2966  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
2967 
2968  const int M ( numeric_cast<int>( A.rows() ) );
2969  const int N ( numeric_cast<int>( B.columns() ) );
2970  const int K ( numeric_cast<int>( A.columns() ) );
2971  const int lda( numeric_cast<int>( A.spacing() ) );
2972  const int ldb( numeric_cast<int>( B.spacing() ) );
2973  const int ldc( numeric_cast<int>( C.spacing() ) );
2974 
2975  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2976  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2977  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2978  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
2979  }
2980 #endif
2981  //**********************************************************************************************
2982 
2983  //**BLAS-based assignment to dense matrices (single precision complex)**************************
2984 #if BLAZE_BLAS_MODE
2985 
2998  template< typename MT3 // Type of the left-hand side target matrix
2999  , typename MT4 // Type of the left-hand side matrix operand
3000  , typename MT5 // Type of the right-hand side matrix operand
3001  , typename ST2 > // Type of the scalar value
3002  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3003  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3004  {
3005  using boost::numeric_cast;
3006 
3007  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3008  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3009  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3011  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3012  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3013  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3014 
3015  const int M ( numeric_cast<int>( A.rows() ) );
3016  const int N ( numeric_cast<int>( B.columns() ) );
3017  const int K ( numeric_cast<int>( A.columns() ) );
3018  const int lda( numeric_cast<int>( A.spacing() ) );
3019  const int ldb( numeric_cast<int>( B.spacing() ) );
3020  const int ldc( numeric_cast<int>( C.spacing() ) );
3021  const complex<float> alpha( scalar );
3022  const complex<float> beta ( 0.0F, 0.0F );
3023 
3024  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3025  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3026  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3027  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3028  }
3029 #endif
3030  //**********************************************************************************************
3031 
3032  //**BLAS-based assignment to dense matrices (double precision complex)**************************
3033 #if BLAZE_BLAS_MODE
3034 
3047  template< typename MT3 // Type of the left-hand side target matrix
3048  , typename MT4 // Type of the left-hand side matrix operand
3049  , typename MT5 // Type of the right-hand side matrix operand
3050  , typename ST2 > // Type of the scalar value
3051  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3052  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3053  {
3054  using boost::numeric_cast;
3055 
3056  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3057  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3058  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3060  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3061  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3062  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3063 
3064  const int M ( numeric_cast<int>( A.rows() ) );
3065  const int N ( numeric_cast<int>( B.columns() ) );
3066  const int K ( numeric_cast<int>( A.columns() ) );
3067  const int lda( numeric_cast<int>( A.spacing() ) );
3068  const int ldb( numeric_cast<int>( B.spacing() ) );
3069  const int ldc( numeric_cast<int>( C.spacing() ) );
3070  const complex<double> alpha( scalar );
3071  const complex<double> beta ( 0.0, 0.0 );
3072 
3073  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3074  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3075  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3076  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3077  }
3078 #endif
3079  //**********************************************************************************************
3080 
3081  //**Assignment to sparse matrices***************************************************************
3093  template< typename MT // Type of the target sparse matrix
3094  , bool SO > // Storage order of the target sparse matrix
3095  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
3096  {
3097  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3098 
3104  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
3105 
3106  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3107  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3108 
3109  const TmpType tmp( rhs );
3110  assign( ~lhs, tmp );
3111  }
3112  //**********************************************************************************************
3113 
3114  //**Addition assignment to dense matrices*******************************************************
3126  template< typename MT3 // Type of the target dense matrix
3127  , bool SO > // Storage order of the target dense matrix
3128  friend inline void addAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3129  {
3130  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3131  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3132 
3133  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3134  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3135 
3136  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3137  return;
3138  }
3139 
3140  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3141  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3142 
3143  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3144  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3145  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3146  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3147  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3148  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3149 
3150  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
3151  DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3152  else
3153  DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3154  }
3155  //**********************************************************************************************
3156 
3157  //**Default addition assignment to dense matrices***********************************************
3171  template< typename MT3 // Type of the left-hand side target matrix
3172  , typename MT4 // Type of the left-hand side matrix operand
3173  , typename MT5 // Type of the right-hand side matrix operand
3174  , typename ST2 > // Type of the scalar value
3175  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3176  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3177  {
3178  const ResultType tmp( A * B * scalar );
3179  addAssign( C, tmp );
3180  }
3181  //**********************************************************************************************
3182 
3183  //**Vectorized default addition assignment to row-major dense matrices**************************
// Vectorized default kernel for the addition assignment of a scaled matrix product
// to a row-major dense matrix: each vector of C is loaded, increased by the sum over
// k of the products of A and B multiplied by the scalar, and stored back. The
// overload only takes part in overload resolution when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied (EnableIf).
//
// C       target dense matrix, read through load() and written through store()
// A       left-hand side operand, read element-wise through A(i,k)
// B       right-hand side operand, read vector-wise through B.get(k,j)
// scalar  scaling factor applied to every accumulated vector before it is added
3197  template< typename MT3 // Type of the left-hand side target matrix
3198  , typename MT4 // Type of the left-hand side matrix operand
3199  , typename MT5 // Type of the right-hand side matrix operand
3200  , typename ST2 > // Type of the scalar value
3201  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3202  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3203  {
3204  typedef IntrinsicTrait<ElementType> IT;
3205 
// NOTE(review): N comes from B.spacing(), not B.columns(); presumably the spacing
// includes padding so that only whole vectors are processed -- confirm.
3206  const size_t M( A.rows() );
3207  const size_t N( B.spacing() );
3208  const size_t K( A.columns() );
3209 
// The scalar is converted to an IntrinsicType once, outside of all loops.
3210  const IntrinsicType factor( set( scalar ) );
3211 
// Column index shared by all following loops; each loop continues where the previous one stopped.
3212  size_t j( 0UL );
3213 
// Strips of 8*IT::size columns, one row i of C per iteration.
3214  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3215  for( size_t i=0UL; i<M; ++i ) {
// NOTE(review): the accumulators are never explicitly zeroed; this relies on the
// default constructor of IntrinsicType producing zero -- confirm.
3216  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3217  for( size_t k=0UL; k<K; ++k ) {
3218  const IntrinsicType a1( set( A(i,k) ) );
3219  xmm1 = xmm1 + a1 * B.get(k,j );
3220  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3221  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3222  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3223  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3224  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3225  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3226  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3227  }
// Read-modify-write: existing content of C plus the scaled accumulator.
3228  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3229  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3230  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3231  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
3232  store( &(~C)(i,j+IT::size*4UL), load( &(~C)(i,j+IT::size*4UL) ) + xmm5 * factor );
3233  store( &(~C)(i,j+IT::size*5UL), load( &(~C)(i,j+IT::size*5UL) ) + xmm6 * factor );
3234  store( &(~C)(i,j+IT::size*6UL), load( &(~C)(i,j+IT::size*6UL) ) + xmm7 * factor );
3235  store( &(~C)(i,j+IT::size*7UL), load( &(~C)(i,j+IT::size*7UL) ) + xmm8 * factor );
3236  }
3237  }
// Strips of 4*IT::size columns; rows are handled in pairs (i, i+1) so that every
// vector loaded from B is used for two rows of C.
3238  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3239  size_t i( 0UL );
3240  for( ; (i+2UL) <= M; i+=2UL ) {
3241  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3242  for( size_t k=0UL; k<K; ++k ) {
3243  const IntrinsicType a1( set( A(i ,k) ) );
3244  const IntrinsicType a2( set( A(i+1UL,k) ) );
3245  const IntrinsicType b1( B.get(k,j ) );
3246  const IntrinsicType b2( B.get(k,j+IT::size ) );
3247  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
3248  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
3249  xmm1 = xmm1 + a1 * b1;
3250  xmm2 = xmm2 + a1 * b2;
3251  xmm3 = xmm3 + a1 * b3;
3252  xmm4 = xmm4 + a1 * b4;
3253  xmm5 = xmm5 + a2 * b1;
3254  xmm6 = xmm6 + a2 * b2;
3255  xmm7 = xmm7 + a2 * b3;
3256  xmm8 = xmm8 + a2 * b4;
3257  }
3258  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3259  store( &(~C)(i ,j+IT::size ), load( &(~C)(i ,j+IT::size ) ) + xmm2 * factor );
3260  store( &(~C)(i ,j+IT::size*2UL), load( &(~C)(i ,j+IT::size*2UL) ) + xmm3 * factor );
3261  store( &(~C)(i ,j+IT::size*3UL), load( &(~C)(i ,j+IT::size*3UL) ) + xmm4 * factor );
3262  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) + xmm5 * factor );
3263  store( &(~C)(i+1UL,j+IT::size ), load( &(~C)(i+1UL,j+IT::size ) ) + xmm6 * factor );
3264  store( &(~C)(i+1UL,j+IT::size*2UL), load( &(~C)(i+1UL,j+IT::size*2UL) ) + xmm7 * factor );
3265  store( &(~C)(i+1UL,j+IT::size*3UL), load( &(~C)(i+1UL,j+IT::size*3UL) ) + xmm8 * factor );
3266  }
// Odd M: one row is left over from the pairwise loop.
3267  if( i < M ) {
3268  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3269  for( size_t k=0UL; k<K; ++k ) {
3270  const IntrinsicType a1( set( A(i,k) ) );
3271  xmm1 = xmm1 + a1 * B.get(k,j );
3272  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3273  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3274  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3275  }
3276  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3277  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3278  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3279  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
3280  }
3281  }
// Strips of 2*IT::size columns, again with rows handled in pairs.
3282  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3283  size_t i( 0UL );
3284  for( ; (i+2UL) <= M; i+=2UL ) {
3285  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3286  for( size_t k=0UL; k<K; ++k ) {
3287  const IntrinsicType a1( set( A(i ,k) ) );
3288  const IntrinsicType a2( set( A(i+1UL,k) ) );
3289  const IntrinsicType b1( B.get(k,j ) );
3290  const IntrinsicType b2( B.get(k,j+IT::size) );
// xmm1/xmm2 belong to row i, xmm3/xmm4 to row i+1.
3291  xmm1 = xmm1 + a1 * b1;
3292  xmm2 = xmm2 + a1 * b2;
3293  xmm3 = xmm3 + a2 * b1;
3294  xmm4 = xmm4 + a2 * b2;
3295  }
3296  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3297  store( &(~C)(i ,j+IT::size), load( &(~C)(i ,j+IT::size) ) + xmm2 * factor );
3298  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) + xmm3 * factor );
3299  store( &(~C)(i+1UL,j+IT::size), load( &(~C)(i+1UL,j+IT::size) ) + xmm4 * factor );
3300  }
// Odd M: one row is left over from the pairwise loop.
3301  if( i < M ) {
3302  IntrinsicType xmm1, xmm2;
3303  for( size_t k=0UL; k<K; ++k ) {
3304  const IntrinsicType a1( set( A(i,k) ) );
3305  xmm1 = xmm1 + a1 * B.get(k,j );
3306  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3307  }
3308  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3309  store( &(~C)(i,j+IT::size), load( &(~C)(i,j+IT::size) ) + xmm2 * factor );
3310  }
3311  }
// Fewer than 2*IT::size columns remain here; they are processed as a single vector
// starting at column j, with rows still handled in pairs.
3312  if( j < N ) {
3313  size_t i( 0UL );
3314  for( ; (i+2UL) <= M; i+=2UL ) {
3315  IntrinsicType xmm1, xmm2;
3316  for( size_t k=0UL; k<K; ++k ) {
3317  const IntrinsicType b1( B.get(k,j) );
3318  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3319  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3320  }
3321  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3322  store( &(~C)(i+1UL,j), load( &(~C)(i+1UL,j) ) + xmm2 * factor );
3323  }
// Odd M: final single row for the remaining vector of columns.
3324  if( i < M ) {
3325  IntrinsicType xmm1;
3326  for( size_t k=0UL; k<K; ++k ) {
3327  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
3328  }
3329  store( &(~C)(i,j), load( &(~C)(i,j) ) + xmm1 * factor );
3330  }
3331  }
3332  }
3333  //**********************************************************************************************
3334 
3335  //**Vectorized default addition assignment to column-major dense matrices***********************
// Vectorized default kernel for the addition assignment of a scaled matrix product
// to a column-major dense matrix: each vector of C is loaded, increased by the sum
// over k of the products of A and B multiplied by the scalar, and stored back. The
// overload only takes part in overload resolution when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied (EnableIf).
//
// C       target dense matrix, read through load() and written through store()
// A       left-hand side operand, read vector-wise through A.get(i,k)
// B       right-hand side operand, read element-wise through B(k,j)
// scalar  scaling factor applied to every accumulated vector before it is added
3349  template< typename MT3 // Type of the left-hand side target matrix
3350  , typename MT4 // Type of the left-hand side matrix operand
3351  , typename MT5 // Type of the right-hand side matrix operand
3352  , typename ST2 > // Type of the scalar value
3353  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3354  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3355  {
3356  typedef IntrinsicTrait<ElementType> IT;
3357 
// NOTE(review): M comes from A.spacing(), not A.rows(); presumably the spacing
// includes padding so that only whole vectors are processed -- confirm.
3358  const size_t M( A.spacing() );
3359  const size_t N( B.columns() );
3360  const size_t K( A.columns() );
3361 
// The scalar is converted to an IntrinsicType once, outside of all loops.
3362  const IntrinsicType factor( set( scalar ) );
3363 
// Row index shared by all following loops; each loop continues where the previous one stopped.
3364  size_t i( 0UL );
3365 
// Strips of 8*IT::size rows, one column j of C per iteration.
3366  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3367  for( size_t j=0UL; j<N; ++j ) {
// NOTE(review): the accumulators are never explicitly zeroed; this relies on the
// default constructor of IntrinsicType producing zero -- confirm.
3368  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3369  for( size_t k=0UL; k<K; ++k ) {
3370  const IntrinsicType b1( set( B(k,j) ) );
3371  xmm1 = xmm1 + A.get(i ,k) * b1;
3372  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3373  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3374  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3375  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3376  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3377  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3378  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
3379  }
// Read-modify-write: existing content of C plus the scaled accumulator.
3380  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3381  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3382  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3383  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
3384  store( &(~C)(i+IT::size*4UL,j), load( &(~C)(i+IT::size*4UL,j) ) + xmm5 * factor );
3385  store( &(~C)(i+IT::size*5UL,j), load( &(~C)(i+IT::size*5UL,j) ) + xmm6 * factor );
3386  store( &(~C)(i+IT::size*6UL,j), load( &(~C)(i+IT::size*6UL,j) ) + xmm7 * factor );
3387  store( &(~C)(i+IT::size*7UL,j), load( &(~C)(i+IT::size*7UL,j) ) + xmm8 * factor );
3388  }
3389  }
// Strips of 4*IT::size rows; columns are handled in pairs (j, j+1) so that every
// vector loaded from A is used for two columns of C.
3390  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3391  size_t j( 0UL );
3392  for( ; (j+2UL) <= N; j+=2UL ) {
3393  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3394  for( size_t k=0UL; k<K; ++k ) {
3395  const IntrinsicType a1( A.get(i ,k) );
3396  const IntrinsicType a2( A.get(i+IT::size ,k) );
3397  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
3398  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
3399  const IntrinsicType b1( set( B(k,j ) ) );
3400  const IntrinsicType b2( set( B(k,j+1UL) ) );
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
3401  xmm1 = xmm1 + a1 * b1;
3402  xmm2 = xmm2 + a2 * b1;
3403  xmm3 = xmm3 + a3 * b1;
3404  xmm4 = xmm4 + a4 * b1;
3405  xmm5 = xmm5 + a1 * b2;
3406  xmm6 = xmm6 + a2 * b2;
3407  xmm7 = xmm7 + a3 * b2;
3408  xmm8 = xmm8 + a4 * b2;
3409  }
3410  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3411  store( &(~C)(i+IT::size ,j ), load( &(~C)(i+IT::size ,j ) ) + xmm2 * factor );
3412  store( &(~C)(i+IT::size*2UL,j ), load( &(~C)(i+IT::size*2UL,j ) ) + xmm3 * factor );
3413  store( &(~C)(i+IT::size*3UL,j ), load( &(~C)(i+IT::size*3UL,j ) ) + xmm4 * factor );
3414  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) + xmm5 * factor );
3415  store( &(~C)(i+IT::size ,j+1UL), load( &(~C)(i+IT::size ,j+1UL) ) + xmm6 * factor );
3416  store( &(~C)(i+IT::size*2UL,j+1UL), load( &(~C)(i+IT::size*2UL,j+1UL) ) + xmm7 * factor );
3417  store( &(~C)(i+IT::size*3UL,j+1UL), load( &(~C)(i+IT::size*3UL,j+1UL) ) + xmm8 * factor );
3418  }
// Odd N: one column is left over from the pairwise loop.
3419  if( j < N ) {
3420  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3421  for( size_t k=0UL; k<K; ++k ) {
3422  const IntrinsicType b1( set( B(k,j) ) );
3423  xmm1 = xmm1 + A.get(i ,k) * b1;
3424  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3425  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3426  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3427  }
3428  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3429  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3430  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3431  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
3432  }
3433  }
// Strips of 2*IT::size rows, again with columns handled in pairs.
3434  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
3435  size_t j( 0UL );
3436  for( ; (j+2UL) <= N; j+=2UL ) {
3437  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3438  for( size_t k=0UL; k<K; ++k ) {
3439  const IntrinsicType a1( A.get(i ,k) );
3440  const IntrinsicType a2( A.get(i+IT::size,k) );
3441  const IntrinsicType b1( set( B(k,j ) ) );
3442  const IntrinsicType b2( set( B(k,j+1UL) ) );
// xmm1/xmm2 belong to column j, xmm3/xmm4 to column j+1.
3443  xmm1 = xmm1 + a1 * b1;
3444  xmm2 = xmm2 + a2 * b1;
3445  xmm3 = xmm3 + a1 * b2;
3446  xmm4 = xmm4 + a2 * b2;
3447  }
3448  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3449  store( &(~C)(i+IT::size,j ), load( &(~C)(i+IT::size,j ) ) + xmm2 * factor );
3450  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) + xmm3 * factor );
3451  store( &(~C)(i+IT::size,j+1UL), load( &(~C)(i+IT::size,j+1UL) ) + xmm4 * factor );
3452  }
// Odd N: one column is left over from the pairwise loop.
3453  if( j < N ) {
3454  IntrinsicType xmm1, xmm2;
3455  for( size_t k=0UL; k<K; ++k ) {
3456  const IntrinsicType b1( set( B(k,j) ) );
3457  xmm1 = xmm1 + A.get(i ,k) * b1;
3458  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
3459  }
3460  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3461  store( &(~C)(i+IT::size,j), load( &(~C)(i+IT::size,j) ) + xmm2 * factor );
3462  }
3463  }
// Fewer than 2*IT::size rows remain here; they are processed as a single vector
// starting at row i, with columns still handled in pairs.
3464  if( i < M ) {
3465  size_t j( 0UL );
3466  for( ; (j+2UL) <= N; j+=2UL ) {
3467  IntrinsicType xmm1, xmm2;
3468  for( size_t k=0UL; k<K; ++k ) {
3469  const IntrinsicType a1( A.get(i,k) );
3470  xmm1 = xmm1 + a1 * set( B(k,j ) );
3471  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3472  }
3473  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3474  store( &(~C)(i,j+1UL), load( &(~C)(i,j+1UL) ) + xmm2 * factor );
3475  }
// Odd N: final single column for the remaining vector of rows.
3476  if( j < N ) {
3477  IntrinsicType xmm1;
3478  for( size_t k=0UL; k<K; ++k ) {
3479  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
3480  }
3481  store( &(~C)(i,j), load( &(~C)(i,j) ) + xmm1 * factor );
3482  }
3483  }
3484  }
3485  //**********************************************************************************************
3486 
3487  //**BLAS-based addition assignment to dense matrices (default)**********************************
3501  template< typename MT3 // Type of the left-hand side target matrix
3502  , typename MT4 // Type of the left-hand side matrix operand
3503  , typename MT5 // Type of the right-hand side matrix operand
3504  , typename ST2 > // Type of the scalar value
3505  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3506  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3507  {
3508  selectDefaultAddAssignKernel( C, A, B, scalar );
3509  }
3510  //**********************************************************************************************
3511 
3512  //**BLAS-based addition assignment to dense matrices (single precision)*************************
3513 #if BLAZE_BLAS_MODE
3514 
3527  template< typename MT3 // Type of the left-hand side target matrix
3528  , typename MT4 // Type of the left-hand side matrix operand
3529  , typename MT5 // Type of the right-hand side matrix operand
3530  , typename ST2 > // Type of the scalar value
3531  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3532  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3533  {
3534  using boost::numeric_cast;
3535 
3536  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
3537  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
3538  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
3539 
3540  const int M ( numeric_cast<int>( A.rows() ) );
3541  const int N ( numeric_cast<int>( B.columns() ) );
3542  const int K ( numeric_cast<int>( A.columns() ) );
3543  const int lda( numeric_cast<int>( A.spacing() ) );
3544  const int ldb( numeric_cast<int>( B.spacing() ) );
3545  const int ldc( numeric_cast<int>( C.spacing() ) );
3546 
3547  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3548  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3549  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3550  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3551  }
3552 #endif
3553  //**********************************************************************************************
3554 
3555  //**BLAS-based addition assignment to dense matrices (double precision)*************************
3556 #if BLAZE_BLAS_MODE
3557 
// BLAS-based addition assignment kernel for double precision operands.
//
// Participates in overload resolution only when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>
// holds (see the EnableIf return type). The work is forwarded to cblas_dgemm with
// alpha = scalar and beta = 1.0, so the scaled product of A and B is accumulated onto the
// existing contents of C.
//
//   C       target matrix, updated in place through C.data()
//   A       left-hand side matrix operand
//   B       right-hand side matrix operand
//   scalar  scaling factor passed to BLAS as alpha
3570  template< typename MT3 // Type of the left-hand side target matrix
3571  , typename MT4 // Type of the left-hand side matrix operand
3572  , typename MT5 // Type of the right-hand side matrix operand
3573  , typename ST2 > // Type of the scalar value
3574  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3575  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3576  {
3577  using boost::numeric_cast;
3578 
// All three matrices must hold double elements for the dgemm call below.
3579  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
3580  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
3581  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
3582 
// CBLAS takes int sizes and leading dimensions; numeric_cast is a range-checked conversion.
3583  const int M ( numeric_cast<int>( A.rows() ) );
3584  const int N ( numeric_cast<int>( B.columns() ) );
3585  const int K ( numeric_cast<int>( A.columns() ) );
3586  const int lda( numeric_cast<int>( A.spacing() ) );
3587  const int ldb( numeric_cast<int>( B.spacing() ) );
3588  const int ldc( numeric_cast<int>( C.spacing() ) );
3589 
// The layout flag follows the storage order of the target C. A is flagged as transposed for
// a row-major target and B for a column-major target.
// NOTE(review): presumably because A is stored column-major and B row-major in this
// expression -- confirm against the operand types of the enclosing class.
3590  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3591  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3592  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3593  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3594  }
3595 #endif
3596  //**********************************************************************************************
3597 
3598  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
3599 #if BLAZE_BLAS_MODE
3600 
// BLAS-based addition assignment kernel for single precision complex operands.
//
// Participates in overload resolution only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5>
// holds (the scalar type ST2 is not part of that condition). The work is forwarded to
// cblas_cgemm with alpha = scalar and beta = (1,0), so the scaled product of A and B is
// accumulated onto the existing contents of C.
//
//   C       target matrix, updated in place through C.data()
//   A       left-hand side matrix operand
//   B       right-hand side matrix operand
//   scalar  scaling factor, converted to complex<float> before the call
3613  template< typename MT3 // Type of the left-hand side target matrix
3614  , typename MT4 // Type of the left-hand side matrix operand
3615  , typename MT5 // Type of the right-hand side matrix operand
3616  , typename ST2 > // Type of the scalar value
3617  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3618  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3619  {
3620  using boost::numeric_cast;
3621 
// All three matrices must hold complex elements whose value_type is float.
3622  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3623  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3624  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3626  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3627  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3628  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3629 
// CBLAS takes int sizes and leading dimensions; numeric_cast is a range-checked conversion.
3630  const int M ( numeric_cast<int>( A.rows() ) );
3631  const int N ( numeric_cast<int>( B.columns() ) );
3632  const int K ( numeric_cast<int>( A.columns() ) );
3633  const int lda( numeric_cast<int>( A.spacing() ) );
3634  const int ldb( numeric_cast<int>( B.spacing() ) );
3635  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address, hence the named locals.
3636  const complex<float> alpha( scalar );
3637  const complex<float> beta ( 1.0F, 0.0F );
3638 
// The layout flag follows the storage order of the target C. A is flagged as transposed for
// a row-major target and B for a column-major target.
// NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
3639  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3640  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3641  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3642  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3643  }
3644 #endif
3645  //**********************************************************************************************
3646 
3647  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
3648 #if BLAZE_BLAS_MODE
3649 
// BLAS-based addition assignment kernel for double precision complex operands.
//
// Participates in overload resolution only when UseDoublePrecisionComplexKernel<MT3,MT4,MT5>
// holds (the scalar type ST2 is not part of that condition). The work is forwarded to
// cblas_zgemm with alpha = scalar and beta = (1,0), so the scaled product of A and B is
// accumulated onto the existing contents of C.
//
//   C       target matrix, updated in place through C.data()
//   A       left-hand side matrix operand
//   B       right-hand side matrix operand
//   scalar  scaling factor, converted to complex<double> before the call
3662  template< typename MT3 // Type of the left-hand side target matrix
3663  , typename MT4 // Type of the left-hand side matrix operand
3664  , typename MT5 // Type of the right-hand side matrix operand
3665  , typename ST2 > // Type of the scalar value
3666  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3667  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3668  {
3669  using boost::numeric_cast;
3670 
// All three matrices must hold complex elements whose value_type is double.
3671  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3672  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3673  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3675  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3676  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3677  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3678 
// CBLAS takes int sizes and leading dimensions; numeric_cast is a range-checked conversion.
3679  const int M ( numeric_cast<int>( A.rows() ) );
3680  const int N ( numeric_cast<int>( B.columns() ) );
3681  const int K ( numeric_cast<int>( A.columns() ) );
3682  const int lda( numeric_cast<int>( A.spacing() ) );
3683  const int ldb( numeric_cast<int>( B.spacing() ) );
3684  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address, hence the named locals.
3685  const complex<double> alpha( scalar );
3686  const complex<double> beta ( 1.0, 0.0 );
3687 
// The layout flag follows the storage order of the target C. A is flagged as transposed for
// a row-major target and B for a column-major target.
// NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
3688  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3689  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3690  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3691  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3692  }
3693 #endif
3694  //**********************************************************************************************
3695 
3696  //**Addition assignment to sparse matrices******************************************************
3697  // No special implementation for the addition assignment to sparse matrices.
3698  //**********************************************************************************************
3699 
3700  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a scaled matrix-matrix product to a dense matrix.
//
// Extracts the two operands of the wrapped multiplication expression (rhs.matrix_),
// evaluates them into A and B, and subtracts the product scaled by rhs.scalar_ from lhs
// by dispatching to one of the sub-assign kernels of this class.
//
//   lhs  target dense matrix; its dimensions must already match those of rhs
//   rhs  scaled multiplication expression to be subtracted
3712  template< typename MT3 // Type of the target dense matrix
3713  , bool SO > // Storage order of the target dense matrix
3714  friend inline void subAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3715  {
3716  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3717  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3718 
3719  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3720  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3721 
// Nothing to subtract if the target is empty or the inner dimension of the product is zero.
3722  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3723  return;
3724  }
3725 
3726  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3727  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3728 
3729  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3730  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3731  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3732  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3733  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3734  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3735 
// Targets with fewer than TDMATDMATMULT_THRESHOLD elements use the default kernel; larger
// ones are routed to the BLAS kernel selection.
3736  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
3737  DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3738  else
3739  DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3740  }
3741  //**********************************************************************************************
3742 
3743  //**Default subtraction assignment to dense matrices********************************************
// Default (non-vectorized) subtraction assignment kernel.
//
// Participates in overload resolution only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// does NOT hold (see the DisableIf return type). The scaled product A * B * scalar is first
// evaluated into a temporary of type ResultType, which is then subtracted from C via
// subAssign.
//
//   C       target matrix
//   A       left-hand side matrix operand
//   B       right-hand side matrix operand
//   scalar  scaling factor applied to the product
3757  template< typename MT3 // Type of the left-hand side target matrix
3758  , typename MT4 // Type of the left-hand side matrix operand
3759  , typename MT5 // Type of the right-hand side matrix operand
3760  , typename ST2 > // Type of the scalar value
3761  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3762  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3763  {
3764  const ResultType tmp( A * B * scalar );
3765  subAssign( C, tmp );
3766  }
3767  //**********************************************************************************************
3768 
3769  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default subtraction assignment kernel for a row-major target matrix.
//
// Participates in overload resolution only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds and C is a DenseMatrix<MT3,false>. For each tile of C the products A(i,k) * B(k,j..)
// are accumulated over k in SIMD accumulators; afterwards C is overwritten with
// C - accumulator * factor, where factor is the broadcast scalar.
//
// Column tiling: 8, then 4, then 2 SIMD vectors per step, followed by a single-vector
// remainder. In the 4-, 2- and 1-vector tiles two rows of C are processed per iteration,
// with a trailing single-row step when M is odd.
//
//   C       target matrix, updated in place
//   A       left-hand side matrix operand (elements are broadcast with set())
//   B       right-hand side matrix operand (read one SIMD vector at a time with get())
//   scalar  scaling factor applied to the product
//
// NOTE(review): the accumulators xmm1..xmm8 are declared without an initializer, so this
// kernel relies on IntrinsicType default-constructing to zero -- confirm.
// NOTE(review): N is taken from B.spacing() rather than B.columns(), and vectors are read
// and written at every IT::size step below N; this presumably relies on padded, suitably
// aligned rows in both B and C -- confirm.
3783  template< typename MT3 // Type of the left-hand side target matrix
3784  , typename MT4 // Type of the left-hand side matrix operand
3785  , typename MT5 // Type of the right-hand side matrix operand
3786  , typename ST2 > // Type of the scalar value
3787  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3788  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3789  {
3790  typedef IntrinsicTrait<ElementType> IT;
3791 
3792  const size_t M( A.rows() );
3793  const size_t N( B.spacing() );
3794  const size_t K( A.columns() );
3795 
// Broadcast of the scalar, multiplied onto every accumulator before the subtraction.
3796  const IntrinsicType factor( set( scalar ) );
3797 
// j is shared by all tiling stages below: each stage continues where the previous one stopped.
3798  size_t j( 0UL );
3799 
// Stage 1: tiles of 1 row x 8 SIMD vectors.
3800  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3801  for( size_t i=0UL; i<M; ++i ) {
3802  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3803  for( size_t k=0UL; k<K; ++k ) {
3804  const IntrinsicType a1( set( A(i,k) ) );
3805  xmm1 = xmm1 + a1 * B.get(k,j );
3806  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3807  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3808  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3809  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3810  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3811  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3812  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3813  }
3814  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
3815  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3816  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3817  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3818  store( &(~C)(i,j+IT::size*4UL), load( &(~C)(i,j+IT::size*4UL) ) - xmm5 * factor );
3819  store( &(~C)(i,j+IT::size*5UL), load( &(~C)(i,j+IT::size*5UL) ) - xmm6 * factor );
3820  store( &(~C)(i,j+IT::size*6UL), load( &(~C)(i,j+IT::size*6UL) ) - xmm7 * factor );
3821  store( &(~C)(i,j+IT::size*7UL), load( &(~C)(i,j+IT::size*7UL) ) - xmm8 * factor );
3822  }
3823  }
// Stage 2: tiles of 2 rows x 4 SIMD vectors (xmm1-4 hold row i, xmm5-8 hold row i+1).
3824  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3825  size_t i( 0UL );
3826  for( ; (i+2UL) <= M; i+=2UL ) {
3827  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3828  for( size_t k=0UL; k<K; ++k ) {
3829  const IntrinsicType a1( set( A(i ,k) ) );
3830  const IntrinsicType a2( set( A(i+1UL,k) ) );
3831  const IntrinsicType b1( B.get(k,j ) );
3832  const IntrinsicType b2( B.get(k,j+IT::size ) );
3833  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
3834  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
3835  xmm1 = xmm1 + a1 * b1;
3836  xmm2 = xmm2 + a1 * b2;
3837  xmm3 = xmm3 + a1 * b3;
3838  xmm4 = xmm4 + a1 * b4;
3839  xmm5 = xmm5 + a2 * b1;
3840  xmm6 = xmm6 + a2 * b2;
3841  xmm7 = xmm7 + a2 * b3;
3842  xmm8 = xmm8 + a2 * b4;
3843  }
3844  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
3845  store( &(~C)(i ,j+IT::size ), load( &(~C)(i ,j+IT::size ) ) - xmm2 * factor );
3846  store( &(~C)(i ,j+IT::size*2UL), load( &(~C)(i ,j+IT::size*2UL) ) - xmm3 * factor );
3847  store( &(~C)(i ,j+IT::size*3UL), load( &(~C)(i ,j+IT::size*3UL) ) - xmm4 * factor );
3848  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) - xmm5 * factor );
3849  store( &(~C)(i+1UL,j+IT::size ), load( &(~C)(i+1UL,j+IT::size ) ) - xmm6 * factor );
3850  store( &(~C)(i+1UL,j+IT::size*2UL), load( &(~C)(i+1UL,j+IT::size*2UL) ) - xmm7 * factor );
3851  store( &(~C)(i+1UL,j+IT::size*3UL), load( &(~C)(i+1UL,j+IT::size*3UL) ) - xmm8 * factor );
3852  }
// Remaining single row when M is odd.
3853  if( i < M ) {
3854  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3855  for( size_t k=0UL; k<K; ++k ) {
3856  const IntrinsicType a1( set( A(i,k) ) );
3857  xmm1 = xmm1 + a1 * B.get(k,j );
3858  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3859  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3860  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3861  }
3862  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
3863  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3864  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3865  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3866  }
3867  }
// Stage 3: tiles of 2 rows x 2 SIMD vectors (xmm1-2 hold row i, xmm3-4 hold row i+1).
3868  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3869  size_t i( 0UL );
3870  for( ; (i+2UL) <= M; i+=2UL ) {
3871  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3872  for( size_t k=0UL; k<K; ++k ) {
3873  const IntrinsicType a1( set( A(i ,k) ) );
3874  const IntrinsicType a2( set( A(i+1UL,k) ) );
3875  const IntrinsicType b1( B.get(k,j ) );
3876  const IntrinsicType b2( B.get(k,j+IT::size) );
3877  xmm1 = xmm1 + a1 * b1;
3878  xmm2 = xmm2 + a1 * b2;
3879  xmm3 = xmm3 + a2 * b1;
3880  xmm4 = xmm4 + a2 * b2;
3881  }
3882  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
3883  store( &(~C)(i ,j+IT::size), load( &(~C)(i ,j+IT::size) ) - xmm2 * factor );
3884  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) - xmm3 * factor );
3885  store( &(~C)(i+1UL,j+IT::size), load( &(~C)(i+1UL,j+IT::size) ) - xmm4 * factor );
3886  }
// Remaining single row when M is odd.
3887  if( i < M ) {
3888  IntrinsicType xmm1, xmm2;
3889  for( size_t k=0UL; k<K; ++k ) {
3890  const IntrinsicType a1( set( A(i,k) ) );
3891  xmm1 = xmm1 + a1 * B.get(k,j );
3892  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3893  }
3894  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
3895  store( &(~C)(i,j+IT::size), load( &(~C)(i,j+IT::size) ) - xmm2 * factor );
3896  }
3897  }
// Stage 4: one final SIMD vector of columns starting at j, again two rows at a time.
3898  if( j < N ) {
3899  size_t i( 0UL );
3900  for( ; (i+2UL) <= M; i+=2UL ) {
3901  IntrinsicType xmm1, xmm2;
3902  for( size_t k=0UL; k<K; ++k ) {
3903  const IntrinsicType b1( B.get(k,j) );
3904  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3905  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3906  }
3907  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
3908  store( &(~C)(i+1UL,j), load( &(~C)(i+1UL,j) ) - xmm2 * factor );
3909  }
// Remaining single row when M is odd.
3910  if( i < M ) {
3911  IntrinsicType xmm1;
3912  for( size_t k=0UL; k<K; ++k ) {
3913  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
3914  }
3915  store( &(~C)(i,j), load( &(~C)(i,j) ) - xmm1 * factor );
3916  }
3917  }
3918  }
3919  //**********************************************************************************************
3920 
3921  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default subtraction assignment kernel for a column-major target matrix.
//
// Participates in overload resolution only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds and C is a DenseMatrix<MT3,true>. For each tile of C the products A(i..,k) * B(k,j)
// are accumulated over k in SIMD accumulators; afterwards C is overwritten with
// C - accumulator * factor, where factor is the broadcast scalar.
//
// Row tiling: 8, then 4, then 2 SIMD vectors per step, followed by a single-vector
// remainder. In the 4-, 2- and 1-vector tiles two columns of C are processed per iteration,
// with a trailing single-column step when N is odd.
//
//   C       target matrix, updated in place
//   A       left-hand side matrix operand (read one SIMD vector at a time with get())
//   B       right-hand side matrix operand (elements are broadcast with set())
//   scalar  scaling factor applied to the product
//
// NOTE(review): the accumulators xmm1..xmm8 are declared without an initializer, so this
// kernel relies on IntrinsicType default-constructing to zero -- confirm.
// NOTE(review): M is taken from A.spacing() rather than A.rows(), and vectors are read
// and written at every IT::size step below M; this presumably relies on padded, suitably
// aligned columns in both A and C -- confirm.
3935  template< typename MT3 // Type of the left-hand side target matrix
3936  , typename MT4 // Type of the left-hand side matrix operand
3937  , typename MT5 // Type of the right-hand side matrix operand
3938  , typename ST2 > // Type of the scalar value
3939  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3940  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3941  {
3942  typedef IntrinsicTrait<ElementType> IT;
3943 
3944  const size_t M( A.spacing() );
3945  const size_t N( B.columns() );
3946  const size_t K( A.columns() );
3947 
// Broadcast of the scalar, multiplied onto every accumulator before the subtraction.
3948  const IntrinsicType factor( set( scalar ) );
3949 
// i is shared by all tiling stages below: each stage continues where the previous one stopped.
3950  size_t i( 0UL );
3951 
// Stage 1: tiles of 8 SIMD vectors x 1 column.
3952  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3953  for( size_t j=0UL; j<N; ++j ) {
3954  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3955  for( size_t k=0UL; k<K; ++k ) {
3956  const IntrinsicType b1( set( B(k,j) ) );
3957  xmm1 = xmm1 + A.get(i ,k) * b1;
3958  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3959  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3960  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3961  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3962  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3963  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3964  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
3965  }
3966  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
3967  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
3968  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
3969  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
3970  store( &(~C)(i+IT::size*4UL,j), load( &(~C)(i+IT::size*4UL,j) ) - xmm5 * factor );
3971  store( &(~C)(i+IT::size*5UL,j), load( &(~C)(i+IT::size*5UL,j) ) - xmm6 * factor );
3972  store( &(~C)(i+IT::size*6UL,j), load( &(~C)(i+IT::size*6UL,j) ) - xmm7 * factor );
3973  store( &(~C)(i+IT::size*7UL,j), load( &(~C)(i+IT::size*7UL,j) ) - xmm8 * factor );
3974  }
3975  }
// Stage 2: tiles of 4 SIMD vectors x 2 columns (xmm1-4 hold column j, xmm5-8 hold column j+1).
3976  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3977  size_t j( 0UL );
3978  for( ; (j+2UL) <= N; j+=2UL ) {
3979  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3980  for( size_t k=0UL; k<K; ++k ) {
3981  const IntrinsicType a1( A.get(i ,k) );
3982  const IntrinsicType a2( A.get(i+IT::size ,k) );
3983  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
3984  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
3985  const IntrinsicType b1( set( B(k,j ) ) );
3986  const IntrinsicType b2( set( B(k,j+1UL) ) );
3987  xmm1 = xmm1 + a1 * b1;
3988  xmm2 = xmm2 + a2 * b1;
3989  xmm3 = xmm3 + a3 * b1;
3990  xmm4 = xmm4 + a4 * b1;
3991  xmm5 = xmm5 + a1 * b2;
3992  xmm6 = xmm6 + a2 * b2;
3993  xmm7 = xmm7 + a3 * b2;
3994  xmm8 = xmm8 + a4 * b2;
3995  }
3996  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
3997  store( &(~C)(i+IT::size ,j ), load( &(~C)(i+IT::size ,j ) ) - xmm2 * factor );
3998  store( &(~C)(i+IT::size*2UL,j ), load( &(~C)(i+IT::size*2UL,j ) ) - xmm3 * factor );
3999  store( &(~C)(i+IT::size*3UL,j ), load( &(~C)(i+IT::size*3UL,j ) ) - xmm4 * factor );
4000  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) - xmm5 * factor );
4001  store( &(~C)(i+IT::size ,j+1UL), load( &(~C)(i+IT::size ,j+1UL) ) - xmm6 * factor );
4002  store( &(~C)(i+IT::size*2UL,j+1UL), load( &(~C)(i+IT::size*2UL,j+1UL) ) - xmm7 * factor );
4003  store( &(~C)(i+IT::size*3UL,j+1UL), load( &(~C)(i+IT::size*3UL,j+1UL) ) - xmm8 * factor );
4004  }
// Remaining single column when N is odd.
4005  if( j < N ) {
4006  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4007  for( size_t k=0UL; k<K; ++k ) {
4008  const IntrinsicType b1( set( B(k,j) ) );
4009  xmm1 = xmm1 + A.get(i ,k) * b1;
4010  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
4011  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
4012  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
4013  }
4014  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
4015  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4016  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4017  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
4018  }
4019  }
// Stage 3: tiles of 2 SIMD vectors x 2 columns (xmm1-2 hold column j, xmm3-4 hold column j+1).
4020  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
4021  size_t j( 0UL );
4022  for( ; (j+2UL) <= N; j+=2UL ) {
4023  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4024  for( size_t k=0UL; k<K; ++k ) {
4025  const IntrinsicType a1( A.get(i ,k) );
4026  const IntrinsicType a2( A.get(i+IT::size,k) );
4027  const IntrinsicType b1( set( B(k,j ) ) );
4028  const IntrinsicType b2( set( B(k,j+1UL) ) );
4029  xmm1 = xmm1 + a1 * b1;
4030  xmm2 = xmm2 + a2 * b1;
4031  xmm3 = xmm3 + a1 * b2;
4032  xmm4 = xmm4 + a2 * b2;
4033  }
4034  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
4035  store( &(~C)(i+IT::size,j ), load( &(~C)(i+IT::size,j ) ) - xmm2 * factor );
4036  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) - xmm3 * factor );
4037  store( &(~C)(i+IT::size,j+1UL), load( &(~C)(i+IT::size,j+1UL) ) - xmm4 * factor );
4038  }
// Remaining single column when N is odd.
4039  if( j < N ) {
4040  IntrinsicType xmm1, xmm2;
4041  for( size_t k=0UL; k<K; ++k ) {
4042  const IntrinsicType b1( set( B(k,j) ) );
4043  xmm1 = xmm1 + A.get(i ,k) * b1;
4044  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
4045  }
4046  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
4047  store( &(~C)(i+IT::size,j), load( &(~C)(i+IT::size,j) ) - xmm2 * factor );
4048  }
4049  }
// Stage 4: one final SIMD vector of rows starting at i, again two columns at a time.
4050  if( i < M ) {
4051  size_t j( 0UL );
4052  for( ; (j+2UL) <= N; j+=2UL ) {
4053  IntrinsicType xmm1, xmm2;
4054  for( size_t k=0UL; k<K; ++k ) {
4055  const IntrinsicType a1( A.get(i,k) );
4056  xmm1 = xmm1 + a1 * set( B(k,j ) );
4057  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
4058  }
4059  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
4060  store( &(~C)(i,j+1UL), load( &(~C)(i,j+1UL) ) - xmm2 * factor );
4061  }
// Remaining single column when N is odd.
4062  if( j < N ) {
4063  IntrinsicType xmm1;
4064  for( size_t k=0UL; k<K; ++k ) {
4065  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
4066  }
4067  store( &(~C)(i,j), load( &(~C)(i,j) ) - xmm1 * factor );
4068  }
4069  }
4070  }
4071  //**********************************************************************************************
4072 
4073  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback overload of the BLAS-based subtraction assignment kernel.
//
// Participates in overload resolution only when UseDefaultKernel<MT3,MT4,MT5,ST2> holds and
// simply forwards all arguments to selectDefaultSubAssignKernel, i.e. no BLAS routine is
// called on this path.
//
//   C       target matrix
//   A       left-hand side matrix operand
//   B       right-hand side matrix operand
//   scalar  scaling factor applied to the product
4087  template< typename MT3 // Type of the left-hand side target matrix
4088  , typename MT4 // Type of the left-hand side matrix operand
4089  , typename MT5 // Type of the right-hand side matrix operand
4090  , typename ST2 > // Type of the scalar value
4091  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4092  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4093  {
4094  selectDefaultSubAssignKernel( C, A, B, scalar );
4095  }
4096  //**********************************************************************************************
4097 
4098  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
4099 #if BLAZE_BLAS_MODE
4100 
// BLAS-based subtraction assignment kernel for single precision operands.
//
// Participates in overload resolution only when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>
// holds. The work is forwarded to cblas_sgemm with alpha = -scalar and beta = 1.0F, so the
// scaled product of A and B is subtracted from the existing contents of C.
//
//   C       target matrix, updated in place through C.data()
//   A       left-hand side matrix operand
//   B       right-hand side matrix operand
//   scalar  scaling factor; its negation is passed to BLAS as alpha
4113  template< typename MT3 // Type of the left-hand side target matrix
4114  , typename MT4 // Type of the left-hand side matrix operand
4115  , typename MT5 // Type of the right-hand side matrix operand
4116  , typename ST2 > // Type of the scalar value
4117  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4118  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4119  {
4120  using boost::numeric_cast;
4121 
// All three matrices must hold float elements for the sgemm call below.
4122  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
4123  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
4124  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
4125 
// CBLAS takes int sizes and leading dimensions; numeric_cast is a range-checked conversion.
4126  const int M ( numeric_cast<int>( A.rows() ) );
4127  const int N ( numeric_cast<int>( B.columns() ) );
4128  const int K ( numeric_cast<int>( A.columns() ) );
4129  const int lda( numeric_cast<int>( A.spacing() ) );
4130  const int ldb( numeric_cast<int>( B.spacing() ) );
4131  const int ldc( numeric_cast<int>( C.spacing() ) );
4132 
// The layout flag follows the storage order of the target C. A is flagged as transposed for
// a row-major target and B for a column-major target.
// NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
4133  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4134  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4135  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4136  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
4137  }
4138 #endif
4139  //**********************************************************************************************
4140 
4141  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
4142 #if BLAZE_BLAS_MODE
4143 
// BLAS-based subtraction assignment kernel for double precision operands.
//
// Participates in overload resolution only when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>
// holds. The work is forwarded to cblas_dgemm with alpha = -scalar and beta = 1.0, so the
// scaled product of A and B is subtracted from the existing contents of C.
//
//   C       target matrix, updated in place through C.data()
//   A       left-hand side matrix operand
//   B       right-hand side matrix operand
//   scalar  scaling factor; its negation is passed to BLAS as alpha
4156  template< typename MT3 // Type of the left-hand side target matrix
4157  , typename MT4 // Type of the left-hand side matrix operand
4158  , typename MT5 // Type of the right-hand side matrix operand
4159  , typename ST2 > // Type of the scalar value
4160  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4161  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4162  {
4163  using boost::numeric_cast;
4164 
// All three matrices must hold double elements for the dgemm call below.
4165  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
4166  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
4167  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
4168 
// CBLAS takes int sizes and leading dimensions; numeric_cast is a range-checked conversion.
4169  const int M ( numeric_cast<int>( A.rows() ) );
4170  const int N ( numeric_cast<int>( B.columns() ) );
4171  const int K ( numeric_cast<int>( A.columns() ) );
4172  const int lda( numeric_cast<int>( A.spacing() ) );
4173  const int ldb( numeric_cast<int>( B.spacing() ) );
4174  const int ldc( numeric_cast<int>( C.spacing() ) );
4175 
// The layout flag follows the storage order of the target C. A is flagged as transposed for
// a row-major target and B for a column-major target.
// NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
4176  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4177  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4178  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4179  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
4180  }
4181 #endif
4182  //**********************************************************************************************
4183 
4184  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
4185 #if BLAZE_BLAS_MODE
4186 
// BLAS-based subtraction assignment kernel for single precision complex operands.
//
// Participates in overload resolution only when UseSinglePrecisionComplexKernel<MT3,MT4,MT5>
// holds (the scalar type ST2 is not part of that condition). The work is forwarded to
// cblas_cgemm with alpha = -scalar and beta = (1,0), so the scaled product of A and B is
// subtracted from the existing contents of C.
//
//   C       target matrix, updated in place through C.data()
//   A       left-hand side matrix operand
//   B       right-hand side matrix operand
//   scalar  scaling factor; its negation is converted to complex<float> before the call
4199  template< typename MT3 // Type of the left-hand side target matrix
4200  , typename MT4 // Type of the left-hand side matrix operand
4201  , typename MT5 // Type of the right-hand side matrix operand
4202  , typename ST2 > // Type of the scalar value
4203  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4204  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4205  {
4206  using boost::numeric_cast;
4207 
// All three matrices must hold complex elements whose value_type is float.
4208  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
4209  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
4210  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
4212  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
4213  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
4214  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
4215 
// CBLAS takes int sizes and leading dimensions; numeric_cast is a range-checked conversion.
4216  const int M ( numeric_cast<int>( A.rows() ) );
4217  const int N ( numeric_cast<int>( B.columns() ) );
4218  const int K ( numeric_cast<int>( A.columns() ) );
4219  const int lda( numeric_cast<int>( A.spacing() ) );
4220  const int ldb( numeric_cast<int>( B.spacing() ) );
4221  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address, hence the named locals.
4222  const complex<float> alpha( -scalar );
4223  const complex<float> beta ( 1.0F, 0.0F );
4224 
// The layout flag follows the storage order of the target C. A is flagged as transposed for
// a row-major target and B for a column-major target.
// NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
4225  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4226  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4227  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4228  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4229  }
4230 #endif
4231  //**********************************************************************************************
4232 
4233  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
4234 #if BLAZE_BLAS_MODE
4235 
// BLAS-based subtraction assignment kernel for double precision complex operands.
//
// Participates in overload resolution only when UseDoublePrecisionComplexKernel<MT3,MT4,MT5>
// holds (the scalar type ST2 is not part of that condition). The work is forwarded to
// cblas_zgemm with alpha = -scalar and beta = (1,0), so the scaled product of A and B is
// subtracted from the existing contents of C.
//
//   C       target matrix, updated in place through C.data()
//   A       left-hand side matrix operand
//   B       right-hand side matrix operand
//   scalar  scaling factor; its negation is converted to complex<double> before the call
4248  template< typename MT3 // Type of the left-hand side target matrix
4249  , typename MT4 // Type of the left-hand side matrix operand
4250  , typename MT5 // Type of the right-hand side matrix operand
4251  , typename ST2 > // Type of the scalar value
4252  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4253  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4254  {
4255  using boost::numeric_cast;
4256 
// All three matrices must hold complex elements whose value_type is double.
4257  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
4258  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
4259  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
4261  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
4262  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
4263  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
4264 
// CBLAS takes int sizes and leading dimensions; numeric_cast is a range-checked conversion.
4265  const int M ( numeric_cast<int>( A.rows() ) );
4266  const int N ( numeric_cast<int>( B.columns() ) );
4267  const int K ( numeric_cast<int>( A.columns() ) );
4268  const int lda( numeric_cast<int>( A.spacing() ) );
4269  const int ldb( numeric_cast<int>( B.spacing() ) );
4270  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex gemm routines take alpha and beta by address, hence the named locals.
4271  const complex<double> alpha( -scalar );
4272  const complex<double> beta ( 1.0, 0.0 );
4273 
// The layout flag follows the storage order of the target C. A is flagged as transposed for
// a row-major target and B for a column-major target.
// NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
4274  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4275  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4276  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4277  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4278  }
4279 #endif
4280  //**********************************************************************************************
4281 
4282  //**Subtraction assignment to sparse matrices***************************************************
4283  // No special implementation for the subtraction assignment to sparse matrices.
4284  //**********************************************************************************************
4285 
4286  //**Multiplication assignment to dense matrices*************************************************
4287  // No special implementation for the multiplication assignment to dense matrices.
4288  //**********************************************************************************************
4289 
4290  //**Multiplication assignment to sparse matrices************************************************
4291  // No special implementation for the multiplication assignment to sparse matrices.
4292  //**********************************************************************************************
4293 
4294  //**Compile time checks*************************************************************************
4302  //**********************************************************************************************
4303 };
4305 //*************************************************************************************************
4306 
4307 
4308 
4309 
4310 //=================================================================================================
4311 //
4312 // GLOBAL BINARY ARITHMETIC OPERATORS
4313 //
4314 //=================================================================================================
4315 
4316 //*************************************************************************************************
// Multiplication operator for the product of a column-major dense matrix and a row-major
// dense matrix (lhs * rhs).
//
// The product is not computed here: the function only validates the operand sizes and
// returns a TDMatDMatMultExpr<T1,T2> expression object built from the two operands.
//
//   lhs  left-hand side dense matrix (storage-order flag true, i.e. column-major)
//   rhs  right-hand side dense matrix (storage-order flag false, i.e. row-major)
//
// Returns the expression object representing lhs * rhs.
// Throws std::invalid_argument if the number of columns of lhs differs from the number of
// rows of rhs.
//
// NOTE(review): the declarator on listing line 4348 was missing from this listing and has
// been restored from the body's use of lhs/rhs and the operand storage orders of
// TDMatDMatMultExpr -- confirm against the original header.
4345 template< typename T1 // Type of the left-hand side dense matrix
4346  , typename T2 > // Type of the right-hand side dense matrix
4347 inline const TDMatDMatMultExpr<T1,T2>
4348  operator*( const DenseMatrix<T1,true>& lhs, const DenseMatrix<T2,false>& rhs )
4349 {
// The inner dimensions must agree for the product to be defined.
4350  if( (~lhs).columns() != (~rhs).rows() )
4351  throw std::invalid_argument( "Matrix sizes do not match" );
4352 
4353  return TDMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
4354 }
4355 //*************************************************************************************************
4356 
4357 
4358 
4359 
4360 //=================================================================================================
4361 //
4362 // EXPRESSION TRAIT SPECIALIZATIONS
4363 //
4364 //=================================================================================================
4365 
4366 //*************************************************************************************************
// Specialization of TDMatDVecMultExprTrait for a TDMatDMatMultExpr left operand, i.e. the
// result type of (A * B) * v for a dense, non-transpose vector v.
//
// When MT1 is a column-major dense matrix, MT2 a row-major dense matrix and VT a dense
// non-transpose vector, Type is the type obtained by regrouping as A * (B * v): the inner
// DMatDVecMultExprTrait<MT2,VT> is resolved first and its result is fed to
// TDMatDVecMultExprTrait together with MT1. Otherwise Type is INVALID_TYPE.
4368 template< typename MT1, typename MT2, typename VT >
4369 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4370 {
4371  public:
4372  //**********************************************************************************************
4373  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4374  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4375  IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
4376  , typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4377  , INVALID_TYPE >::Type Type;
4378  //**********************************************************************************************
4379 };
4381 //*************************************************************************************************
4382 
4383 
4384 //*************************************************************************************************
// Specialization of TDMatSVecMultExprTrait for a TDMatDMatMultExpr left operand, i.e. the
// result type of (A * B) * v for a sparse, non-transpose vector v.
//
// When MT1 is a column-major dense matrix, MT2 a row-major dense matrix and VT a sparse
// non-transpose vector, Type is the type obtained by regrouping as A * (B * v): the inner
// DMatSVecMultExprTrait<MT2,VT> is resolved first and its result is fed to
// TDMatDVecMultExprTrait together with MT1 (the outer trait is the dense-vector one).
// Otherwise Type is INVALID_TYPE.
4386 template< typename MT1, typename MT2, typename VT >
4387 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4388 {
4389  public:
4390  //**********************************************************************************************
4391  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4392  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4393  IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
4394  , typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4395  , INVALID_TYPE >::Type Type;
4396  //**********************************************************************************************
4397 };
4399 //*************************************************************************************************
4400 
4401 
4402 //*************************************************************************************************
// Specialization of TDVecTDMatMultExprTrait for a TDMatDMatMultExpr right operand, i.e. the
// result type of v * (A * B) for a dense transpose vector v.
//
// When VT is a dense transpose vector, MT1 a column-major dense matrix and MT2 a row-major
// dense matrix, Type is the type obtained by regrouping as (v * A) * B: the inner
// TDVecTDMatMultExprTrait<VT,MT1> is resolved first and its result is fed to
// TDVecDMatMultExprTrait together with MT2. Otherwise Type is INVALID_TYPE.
4404 template< typename VT, typename MT1, typename MT2 >
4405 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4406 {
4407  public:
4408  //**********************************************************************************************
4409  typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4410  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4411  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4412  , typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4413  , INVALID_TYPE >::Type Type;
4414  //**********************************************************************************************
4415 };
4417 //*************************************************************************************************
4418 
4419 
4420 //*************************************************************************************************
// Specialization of TSVecTDMatMultExprTrait for a TDMatDMatMultExpr right operand, i.e. the
// result type of v * (A * B) for a sparse transpose vector v.
//
// When VT is a sparse transpose vector, MT1 a column-major dense matrix and MT2 a row-major
// dense matrix, Type is the type obtained by regrouping as (v * A) * B: the inner
// TSVecTDMatMultExprTrait<VT,MT1> is resolved first and its result is fed to
// TDVecDMatMultExprTrait together with MT2 (the outer trait is the dense-vector one).
// Otherwise Type is INVALID_TYPE.
4422 template< typename VT, typename MT1, typename MT2 >
4423 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4424 {
4425  public:
4426  //**********************************************************************************************
4427  typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4428  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4429  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4430  , typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4431  , INVALID_TYPE >::Type Type;
4432  //**********************************************************************************************
4433 };
4435 //*************************************************************************************************
4436 
4437 } // namespace blaze
4438 
4439 #endif