All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
20 //=================================================================================================
21 
22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
24 
25 
26 //*************************************************************************************************
27 // Includes
28 //*************************************************************************************************
29 
30 #include <stdexcept>
31 #include <boost/cast.hpp>
39 #include <blaze/math/Intrinsics.h>
40 #include <blaze/math/shims/Reset.h>
60 #include <blaze/system/BLAS.h>
62 #include <blaze/util/Assert.h>
63 #include <blaze/util/Complex.h>
69 #include <blaze/util/DisableIf.h>
70 #include <blaze/util/EnableIf.h>
71 #include <blaze/util/InvalidType.h>
73 #include <blaze/util/SelectType.h>
74 #include <blaze/util/Types.h>
80 
81 
82 namespace blaze {
83 
84 //=================================================================================================
85 //
86 // CLASS TDMATDMATMULTEXPR
87 //
88 //=================================================================================================
89 
90 //*************************************************************************************************
97 template< typename MT1 // Type of the left-hand side dense matrix
98  , typename MT2 > // Type of the right-hand side dense matrix
99 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
100  , private MatMatMultExpr
101  , private Computation
102 {
103  private:
104  //**Type definitions****************************************************************************
// Shorthand names for the nested types of the two operand matrix types MT1 and MT2.
105  typedef typename MT1::ResultType RT1;  // ResultType of the left-hand side operand type
106  typedef typename MT2::ResultType RT2;  // ResultType of the right-hand side operand type
107  typedef typename MT1::CompositeType CT1;  // CompositeType of the left-hand side operand type
108  typedef typename MT2::CompositeType CT2;  // CompositeType of the right-hand side operand type
109  //**********************************************************************************************
110 
111  //**********************************************************************************************
113 
114 
// Compile-time switch for the single precision BLAS kernel: UseDefaultKernel reads its
// nested 'value', and the type is passed as the EnableIf condition of a
// selectBlasAssignKernel overload.
// NOTE(review): lines 118-120 of the listing are not present in this view, so the
// definition of 'value' cannot be seen here -- confirm against the real header.
116  template< typename T1, typename T2, typename T3 >
117  struct UseSinglePrecisionKernel {
121  };
123  //**********************************************************************************************
124 
125  //**********************************************************************************************
127 
128 
// Compile-time switch for the double precision BLAS kernel: UseDefaultKernel reads its
// nested 'value', and the type is passed as the EnableIf condition of a
// selectBlasAssignKernel overload.
// NOTE(review): lines 132-134 of the listing are not present in this view, so the
// definition of 'value' cannot be seen here -- confirm against the real header.
130  template< typename T1, typename T2, typename T3 >
131  struct UseDoublePrecisionKernel {
135  };
137  //**********************************************************************************************
138 
139  //**********************************************************************************************
141 
142 
145  template< typename T1, typename T2, typename T3 >
146  struct UseSinglePrecisionComplexKernel {
147  typedef complex<float> Type;
148  enum { value = IsSame<typename T1::ElementType,Type>::value &&
149  IsSame<typename T2::ElementType,Type>::value &&
150  IsSame<typename T3::ElementType,Type>::value };
151  };
153  //**********************************************************************************************
154 
155  //**********************************************************************************************
157 
158 
161  template< typename T1, typename T2, typename T3 >
162  struct UseDoublePrecisionComplexKernel {
163  typedef complex<double> Type;
164  enum { value = IsSame<typename T1::ElementType,Type>::value &&
165  IsSame<typename T2::ElementType,Type>::value &&
166  IsSame<typename T3::ElementType,Type>::value };
167  };
169  //**********************************************************************************************
170 
171  //**********************************************************************************************
173 
174 
176  template< typename T1, typename T2, typename T3 >
177  struct UseDefaultKernel {
178  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
179  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
180  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
181  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
182  };
184  //**********************************************************************************************
185 
186  //**********************************************************************************************
188 
189 
191  template< typename T1, typename T2, typename T3 >
192  struct UseVectorizedDefaultKernel {
193  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
194  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
195  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
196  IntrinsicTrait<typename T1::ElementType>::addition &&
197  IntrinsicTrait<typename T1::ElementType>::multiplication };
198  };
200  //**********************************************************************************************
201 
202  public:
203  //**Type definitions****************************************************************************
// Public type aliases of the expression.
// NOTE(review): 'ResultType' is used below but its typedef is not part of this view
// (listing lines 204-205 are missing) -- confirm against the real header.
206  typedef typename ResultType::OppositeType OppositeType;  // OppositeType nested in ResultType
207  typedef typename ResultType::TransposeType TransposeType;  // TransposeType nested in ResultType
208  typedef typename ResultType::ElementType ElementType;  // Element type nested in ResultType
210  typedef const ElementType ReturnType;  // Type returned by operator(): a const element by value
211  typedef const ResultType CompositeType;  // Composite type: a const ResultType
212 
// Member type used to hold the left operand; the choice between 'const MT1' and
// 'const MT1&' is keyed on IsExpression<MT1>::value (presumably SelectType picks the
// first alternative when the condition is true -- confirm).
214  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
215 
// Member type used to hold the right operand, chosen the same way for MT2.
217  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
218 
// Type the left operand is converted to inside the assignment functions; the choice
// between 'const RT1' and CT1 is keyed on IsComputation<MT1>::value.
220  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
221 
// Type the right operand is converted to inside the assignment functions; the choice
// between 'const RT2' and CT2 is keyed on IsComputation<MT2>::value.
223  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
224  //**********************************************************************************************
225 
226  //**Compilation flags***************************************************************************
// Compilation flag: this expression type always reports vectorizable == 0.
228  enum { vectorizable = 0 };
229  //**********************************************************************************************
230 
231  //**Constructor*********************************************************************************
237  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs )
238  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
239  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
240  {
241  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
242  }
243  //**********************************************************************************************
244 
245  //**Access operator*****************************************************************************
252  inline ReturnType operator()( size_t i, size_t j ) const {
253  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
254  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
255 
256  ElementType tmp;
257 
258  if( lhs_.columns() != 0UL ) {
259  const size_t end( ( ( lhs_.columns()-1UL ) & size_t(-2) ) + 1UL );
260  tmp = lhs_(i,0UL) * rhs_(0UL,j);
261  for( size_t k=1UL; k<end; k+=2UL ) {
262  tmp += lhs_(i,k ) * rhs_(k ,j);
263  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
264  }
265  if( end < lhs_.columns() ) {
266  tmp += lhs_(i,end) * rhs_(end,j);
267  }
268  }
269  else {
270  reset( tmp );
271  }
272 
273  return tmp;
274  }
275  //**********************************************************************************************
276 
277  //**Rows function*******************************************************************************
282  inline size_t rows() const {
283  return lhs_.rows();
284  }
285  //**********************************************************************************************
286 
287  //**Columns function****************************************************************************
292  inline size_t columns() const {
293  return rhs_.columns();
294  }
295  //**********************************************************************************************
296 
297  //**Left operand access*************************************************************************
302  inline LeftOperand leftOperand() const {
303  return lhs_;
304  }
305  //**********************************************************************************************
306 
307  //**Right operand access************************************************************************
312  inline RightOperand rightOperand() const {
313  return rhs_;
314  }
315  //**********************************************************************************************
316 
317  //**********************************************************************************************
323  template< typename T >
324  inline bool canAlias( const T* alias ) const {
325  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
326  }
327  //**********************************************************************************************
328 
329  //**********************************************************************************************
335  template< typename T >
336  inline bool isAliased( const T* alias ) const {
337  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
338  }
339  //**********************************************************************************************
340 
341  private:
342  //**Member variables****************************************************************************
345  //**********************************************************************************************
346 
347  //**Assignment to dense matrices****************************************************************
356  template< typename MT // Type of the target dense matrix
357  , bool SO > // Storage order of the target dense matrix
358  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
359  {
361 
362  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
363  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
364 
365  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
366  return;
367  }
368  else if( rhs.lhs_.columns() == 0UL ) {
369  reset( ~lhs );
370  return;
371  }
372 
373  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
374  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
375 
376  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
377  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
378  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
379  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
380  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
381  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
382 
383  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
384  TDMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
385  else
386  TDMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
387  }
389  //**********************************************************************************************
390 
391  //**Default assignment to dense matrices********************************************************
405  template< typename MT3 // Type of the left-hand side target matrix
406  , typename MT4 // Type of the left-hand side matrix operand
407  , typename MT5 > // Type of the right-hand side matrix operand
408  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
409  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
410  {
411  const size_t M( A.rows() );
412  const size_t N( B.columns() );
413  const size_t K( A.columns() );
414 
415  for( size_t i=0UL; i<M; ++i ) {
416  for( size_t j=0UL; j<N; ++j ) {
417  C(i,j) = A(i,0UL) * B(0UL,j);
418  }
419  for( size_t k=1UL; k<K; ++k ) {
420  for( size_t j=0UL; j<N; ++j ) {
421  C(i,j) += A(i,k) * B(k,j);
422  }
423  }
424  }
425  }
427  //**********************************************************************************************
428 
429  //**Vectorized default assignment to row-major dense matrices***********************************
// Vectorized default kernel for C = A * B, enabled when UseVectorizedDefaultKernel
// holds and the target is a DenseMatrix with storage-order flag 'false' (the
// row-major case according to the section banner).
//   C: target matrix, written through store() at the address of its elements.
//   A: left-hand side operand, read one element at a time and passed through set().
//   B: right-hand side operand, read through get().
// The column range [0,N) is walked in blocks of 8, 4, 2 and finally 1 step(s) of
// IT::size columns. Within a block the products are accumulated over k in [0,K) and
// the accumulators are then stored to C.
// NOTE(review): N is taken from B.spacing() rather than B.columns() -- presumably the
// padded row length, a multiple of IT::size; confirm.
// NOTE(review): the xmm accumulators are used right after default construction, so
// IntrinsicType is assumed to default-initialize to zero; confirm.
443  template< typename MT3 // Type of the left-hand side target matrix
444  , typename MT4 // Type of the left-hand side matrix operand
445  , typename MT5 > // Type of the right-hand side matrix operand
446  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
447  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
448  {
449  typedef IntrinsicTrait<ElementType> IT;
450 
451  const size_t M( A.rows() );
452  const size_t N( B.spacing() );
453  const size_t K( A.columns() );
454 
455  size_t j( 0UL );
456 
// Blocks of 8*IT::size columns, one row of C per iteration of the i loop.
457  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
458  for( size_t i=0UL; i<M; ++i ) {
459  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
460  for( size_t k=0UL; k<K; ++k ) {
461  const IntrinsicType a1( set( A(i,k) ) );
462  xmm1 = xmm1 + a1 * B.get(k,j );
463  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
464  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
465  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
466  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
467  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
468  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
469  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
470  }
471  store( &(~C)(i,j ), xmm1 );
472  store( &(~C)(i,j+IT::size ), xmm2 );
473  store( &(~C)(i,j+IT::size*2UL), xmm3 );
474  store( &(~C)(i,j+IT::size*3UL), xmm4 );
475  store( &(~C)(i,j+IT::size*4UL), xmm5 );
476  store( &(~C)(i,j+IT::size*5UL), xmm6 );
477  store( &(~C)(i,j+IT::size*6UL), xmm7 );
478  store( &(~C)(i,j+IT::size*7UL), xmm8 );
479  }
480  }
// Blocks of 4*IT::size columns, two rows of C at a time (xmm1-4: row i, xmm5-8: row i+1).
481  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
482  size_t i( 0UL );
483  for( ; (i+2UL) <= M; i+=2UL ) {
484  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
485  for( size_t k=0UL; k<K; ++k ) {
486  const IntrinsicType a1( set( A(i ,k) ) );
487  const IntrinsicType a2( set( A(i+1UL,k) ) );
488  const IntrinsicType b1( B.get(k,j ) );
489  const IntrinsicType b2( B.get(k,j+IT::size ) );
490  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
491  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
492  xmm1 = xmm1 + a1 * b1;
493  xmm2 = xmm2 + a1 * b2;
494  xmm3 = xmm3 + a1 * b3;
495  xmm4 = xmm4 + a1 * b4;
496  xmm5 = xmm5 + a2 * b1;
497  xmm6 = xmm6 + a2 * b2;
498  xmm7 = xmm7 + a2 * b3;
499  xmm8 = xmm8 + a2 * b4;
500  }
501  store( &(~C)(i ,j ), xmm1 );
502  store( &(~C)(i ,j+IT::size ), xmm2 );
503  store( &(~C)(i ,j+IT::size*2UL), xmm3 );
504  store( &(~C)(i ,j+IT::size*3UL), xmm4 );
505  store( &(~C)(i+1UL,j ), xmm5 );
506  store( &(~C)(i+1UL,j+IT::size ), xmm6 );
507  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
508  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
509  }
// Remaining single row when M is odd.
510  if( i < M ) {
511  IntrinsicType xmm1, xmm2, xmm3, xmm4;
512  for( size_t k=0UL; k<K; ++k ) {
513  const IntrinsicType a1( set( A(i,k) ) );
514  xmm1 = xmm1 + a1 * B.get(k,j );
515  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
516  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
517  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
518  }
519  store( &(~C)(i,j ), xmm1 );
520  store( &(~C)(i,j+IT::size ), xmm2 );
521  store( &(~C)(i,j+IT::size*2UL), xmm3 );
522  store( &(~C)(i,j+IT::size*3UL), xmm4 );
523  }
524  }
// Blocks of 2*IT::size columns, two rows of C at a time (xmm1-2: row i, xmm3-4: row i+1).
525  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
526  size_t i( 0UL );
527  for( ; (i+2UL) <= M; i+=2UL ) {
528  IntrinsicType xmm1, xmm2, xmm3, xmm4;
529  for( size_t k=0UL; k<K; ++k ) {
530  const IntrinsicType a1( set( A(i ,k) ) );
531  const IntrinsicType a2( set( A(i+1UL,k) ) );
532  const IntrinsicType b1( B.get(k,j ) );
533  const IntrinsicType b2( B.get(k,j+IT::size) );
534  xmm1 = xmm1 + a1 * b1;
535  xmm2 = xmm2 + a1 * b2;
536  xmm3 = xmm3 + a2 * b1;
537  xmm4 = xmm4 + a2 * b2;
538  }
539  store( &(~C)(i ,j ), xmm1 );
540  store( &(~C)(i ,j+IT::size), xmm2 );
541  store( &(~C)(i+1UL,j ), xmm3 );
542  store( &(~C)(i+1UL,j+IT::size), xmm4 );
543  }
// Remaining single row when M is odd.
544  if( i < M ) {
545  IntrinsicType xmm1, xmm2;
546  for( size_t k=0UL; k<K; ++k ) {
547  const IntrinsicType a1( set( A(i,k) ) );
548  xmm1 = xmm1 + a1 * B.get(k,j );
549  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
550  }
551  store( &(~C)(i,j ), xmm1 );
552  store( &(~C)(i,j+IT::size), xmm2 );
553  }
554  }
// Final step: whatever is left of [j,N) is handled as a single IT::size-wide column
// step, again two rows at a time plus an odd-row tail.
555  if( j < N ) {
556  size_t i( 0UL );
557  for( ; (i+2UL) <= M; i+=2UL ) {
558  IntrinsicType xmm1, xmm2;
559  for( size_t k=0UL; k<K; ++k ) {
560  const IntrinsicType b1( B.get(k,j) );
561  xmm1 = xmm1 + set( A(i ,k) ) * b1;
562  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
563  }
564  store( &(~C)(i ,j), xmm1 );
565  store( &(~C)(i+1UL,j), xmm2 );
566  }
567  if( i < M ) {
568  IntrinsicType xmm1;
569  for( size_t k=0UL; k<K; ++k ) {
570  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
571  }
572  store( &(~C)(i,j), xmm1 );
573  }
574  }
575  }
577  //**********************************************************************************************
578 
579  //**Vectorized default assignment to column-major dense matrices********************************
// Vectorized default kernel for C = A * B, enabled when UseVectorizedDefaultKernel
// holds and the target is a DenseMatrix with storage-order flag 'true' (the
// column-major case according to the section banner).
//   C: target matrix, written through store() at the address of its elements.
//   A: left-hand side operand, read through get().
//   B: right-hand side operand, read one element at a time and passed through set().
// The row range [0,M) is walked in blocks of 8, 4, 2 and finally 1 step(s) of
// IT::size rows. Within a block the products are accumulated over k in [0,K) and the
// accumulators are then stored to C.
// NOTE(review): M is taken from A.spacing() rather than A.rows() -- presumably the
// padded column length, a multiple of IT::size; confirm.
// NOTE(review): the xmm accumulators are used right after default construction, so
// IntrinsicType is assumed to default-initialize to zero; confirm.
593  template< typename MT3 // Type of the left-hand side target matrix
594  , typename MT4 // Type of the left-hand side matrix operand
595  , typename MT5 > // Type of the right-hand side matrix operand
596  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
597  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
598  {
599  typedef IntrinsicTrait<ElementType> IT;
600 
601  const size_t M( A.spacing() );
602  const size_t N( B.columns() );
603  const size_t K( A.columns() );
604 
605  size_t i( 0UL );
606 
// Blocks of 8*IT::size rows, one column of C per iteration of the j loop.
607  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
608  for( size_t j=0UL; j<N; ++j ) {
609  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
610  for( size_t k=0UL; k<K; ++k ) {
611  const IntrinsicType b1( set( B(k,j) ) );
612  xmm1 = xmm1 + A.get(i ,k) * b1;
613  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
614  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
615  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
616  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
617  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
618  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
619  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
620  }
621  store( &(~C)(i ,j), xmm1 );
622  store( &(~C)(i+IT::size ,j), xmm2 );
623  store( &(~C)(i+IT::size*2UL,j), xmm3 );
624  store( &(~C)(i+IT::size*3UL,j), xmm4 );
625  store( &(~C)(i+IT::size*4UL,j), xmm5 );
626  store( &(~C)(i+IT::size*5UL,j), xmm6 );
627  store( &(~C)(i+IT::size*6UL,j), xmm7 );
628  store( &(~C)(i+IT::size*7UL,j), xmm8 );
629  }
630  }
// Blocks of 4*IT::size rows, two columns of C at a time (xmm1-4: column j, xmm5-8: column j+1).
631  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
632  size_t j( 0UL );
633  for( ; (j+2UL) <= N; j+=2UL ) {
634  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
635  for( size_t k=0UL; k<K; ++k ) {
636  const IntrinsicType a1( A.get(i ,k) );
637  const IntrinsicType a2( A.get(i+IT::size ,k) );
638  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
639  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
640  const IntrinsicType b1( set( B(k,j ) ) );
641  const IntrinsicType b2( set( B(k,j+1UL) ) );
642  xmm1 = xmm1 + a1 * b1;
643  xmm2 = xmm2 + a2 * b1;
644  xmm3 = xmm3 + a3 * b1;
645  xmm4 = xmm4 + a4 * b1;
646  xmm5 = xmm5 + a1 * b2;
647  xmm6 = xmm6 + a2 * b2;
648  xmm7 = xmm7 + a3 * b2;
649  xmm8 = xmm8 + a4 * b2;
650  }
651  store( &(~C)(i ,j ), xmm1 );
652  store( &(~C)(i+IT::size ,j ), xmm2 );
653  store( &(~C)(i+IT::size*2UL,j ), xmm3 );
654  store( &(~C)(i+IT::size*3UL,j ), xmm4 );
655  store( &(~C)(i ,j+1UL), xmm5 );
656  store( &(~C)(i+IT::size ,j+1UL), xmm6 );
657  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
658  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
659  }
// Remaining single column when N is odd.
660  if( j < N ) {
661  IntrinsicType xmm1, xmm2, xmm3, xmm4;
662  for( size_t k=0UL; k<K; ++k ) {
663  const IntrinsicType b1( set( B(k,j) ) );
664  xmm1 = xmm1 + A.get(i ,k) * b1;
665  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
666  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
667  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
668  }
669  store( &(~C)(i ,j), xmm1 );
670  store( &(~C)(i+IT::size ,j), xmm2 );
671  store( &(~C)(i+IT::size*2UL,j), xmm3 );
672  store( &(~C)(i+IT::size*3UL,j), xmm4 );
673  }
674  }
// Blocks of 2*IT::size rows, two columns of C at a time (xmm1-2: column j, xmm3-4: column j+1).
675  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
676  size_t j( 0UL );
677  for( ; (j+2UL) <= N; j+=2UL ) {
678  IntrinsicType xmm1, xmm2, xmm3, xmm4;
679  for( size_t k=0UL; k<K; ++k ) {
680  const IntrinsicType a1( A.get(i ,k) );
681  const IntrinsicType a2( A.get(i+IT::size,k) );
682  const IntrinsicType b1( set( B(k,j ) ) );
683  const IntrinsicType b2( set( B(k,j+1UL) ) );
684  xmm1 = xmm1 + a1 * b1;
685  xmm2 = xmm2 + a2 * b1;
686  xmm3 = xmm3 + a1 * b2;
687  xmm4 = xmm4 + a2 * b2;
688  }
689  store( &(~C)(i ,j ), xmm1 );
690  store( &(~C)(i+IT::size,j ), xmm2 );
691  store( &(~C)(i ,j+1UL), xmm3 );
692  store( &(~C)(i+IT::size,j+1UL), xmm4 );
693  }
// Remaining single column when N is odd.
694  if( j < N ) {
695  IntrinsicType xmm1, xmm2;
696  for( size_t k=0UL; k<K; ++k ) {
697  const IntrinsicType b1( set( B(k,j) ) );
698  xmm1 = xmm1 + A.get(i ,k) * b1;
699  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
700  }
701  store( &(~C)(i ,j), xmm1 );
702  store( &(~C)(i+IT::size,j), xmm2 );
703  }
704  }
// Final step: whatever is left of [i,M) is handled as a single IT::size-wide row
// step, again two columns at a time plus an odd-column tail.
705  if( i < M ) {
706  size_t j( 0UL );
707  for( ; (j+2UL) <= N; j+=2UL ) {
708  IntrinsicType xmm1, xmm2;
709  for( size_t k=0UL; k<K; ++k ) {
710  const IntrinsicType a1( A.get(i,k) );
711  xmm1 = xmm1 + a1 * set( B(k,j ) );
712  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
713  }
714  store( &(~C)(i,j ), xmm1 );
715  store( &(~C)(i,j+1UL), xmm2 );
716  }
717  if( j < N ) {
718  IntrinsicType xmm1;
719  for( size_t k=0UL; k<K; ++k ) {
720  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
721  }
722  store( &(~C)(i,j), xmm1 );
723  }
724  }
725  }
727  //**********************************************************************************************
728 
729  //**BLAS-based assignment to dense matrices (default)*******************************************
743  template< typename MT3 // Type of the left-hand side target matrix
744  , typename MT4 // Type of the left-hand side matrix operand
745  , typename MT5 > // Type of the right-hand side matrix operand
746  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
747  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
748  {
749  selectDefaultAssignKernel( C, A, B );
750  }
752  //**********************************************************************************************
753 
754  //**BLAS-based assignment to dense matrices (single precision)**********************************
755 #if BLAZE_BLAS_MODE
756 
769  template< typename MT3 // Type of the left-hand side target matrix
770  , typename MT4 // Type of the left-hand side matrix operand
771  , typename MT5 > // Type of the right-hand side matrix operand
772  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
773  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
774  {
775  using boost::numeric_cast;
776 
777  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
778  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
779  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
780 
781  const int M ( numeric_cast<int>( A.rows() ) );
782  const int N ( numeric_cast<int>( B.columns() ) );
783  const int K ( numeric_cast<int>( A.columns() ) );
784  const int lda( numeric_cast<int>( A.spacing() ) );
785  const int ldb( numeric_cast<int>( B.spacing() ) );
786  const int ldc( numeric_cast<int>( C.spacing() ) );
787 
788  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
789  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
790  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
791  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
792  }
794 #endif
795  //**********************************************************************************************
796 
797  //**BLAS-based assignment to dense matrices (double precision)**********************************
798 #if BLAZE_BLAS_MODE
799 
812  template< typename MT3 // Type of the left-hand side target matrix
813  , typename MT4 // Type of the left-hand side matrix operand
814  , typename MT5 > // Type of the right-hand side matrix operand
815  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
816  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
817  {
818  using boost::numeric_cast;
819 
820  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
821  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
822  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
823 
824  const int M ( numeric_cast<int>( A.rows() ) );
825  const int N ( numeric_cast<int>( B.columns() ) );
826  const int K ( numeric_cast<int>( A.columns() ) );
827  const int lda( numeric_cast<int>( A.spacing() ) );
828  const int ldb( numeric_cast<int>( B.spacing() ) );
829  const int ldc( numeric_cast<int>( C.spacing() ) );
830 
831  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
832  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
833  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
834  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
835  }
837 #endif
838  //**********************************************************************************************
839 
840  //**BLAS-based assignment to dense matrices (single precision complex)**************************
841 #if BLAZE_BLAS_MODE
842 
855  template< typename MT3 // Type of the left-hand side target matrix
856  , typename MT4 // Type of the left-hand side matrix operand
857  , typename MT5 > // Type of the right-hand side matrix operand
858  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
859  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
860  {
861  using boost::numeric_cast;
862 
863  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
864  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
865  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
866  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
867  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
868  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
869 
870  const int M ( numeric_cast<int>( A.rows() ) );
871  const int N ( numeric_cast<int>( B.columns() ) );
872  const int K ( numeric_cast<int>( A.columns() ) );
873  const int lda( numeric_cast<int>( A.spacing() ) );
874  const int ldb( numeric_cast<int>( B.spacing() ) );
875  const int ldc( numeric_cast<int>( C.spacing() ) );
876  const complex<float> alpha( 1.0F, 0.0F );
877  const complex<float> beta ( 0.0F, 0.0F );
878 
879  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
880  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
881  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
882  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
883  }
885 #endif
886  //**********************************************************************************************
887 
888  //**BLAS-based assignment to dense matrices (double precision complex)**************************
889 #if BLAZE_BLAS_MODE
890 
903  template< typename MT3 // Type of the left-hand side target matrix
904  , typename MT4 // Type of the left-hand side matrix operand
905  , typename MT5 > // Type of the right-hand side matrix operand
906  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
907  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
908  {
909  using boost::numeric_cast;
910 
911  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
912  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
913  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
914  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
915  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
916  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
917 
918  const int M ( numeric_cast<int>( A.rows() ) );
919  const int N ( numeric_cast<int>( B.columns() ) );
920  const int K ( numeric_cast<int>( A.columns() ) );
921  const int lda( numeric_cast<int>( A.spacing() ) );
922  const int ldb( numeric_cast<int>( B.spacing() ) );
923  const int ldc( numeric_cast<int>( C.spacing() ) );
924  const complex<double> alpha( 1.0, 0.0 );
925  const complex<double> beta ( 0.0, 0.0 );
926 
927  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
928  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
929  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
930  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
931  }
933 #endif
934  //**********************************************************************************************
935 
936  //**Assignment to sparse matrices***************************************************************
948  template< typename MT // Type of the target sparse matrix
949  , bool SO > // Storage order of the target sparse matrix
950  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
951  {
953 
954  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
955 
961  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
962 
963  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
964  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
965 
966  const TmpType tmp( rhs );
967  assign( ~lhs, tmp );
968  }
970  //**********************************************************************************************
971 
972  //**Addition assignment to dense matrices*******************************************************
985  template< typename MT // Type of the target dense matrix
986  , bool SO > // Storage order of the target dense matrix
987  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
988  {
990 
991  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
992  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
993 
994  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
995  return;
996  }
997 
998  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
999  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1000 
1001  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1002  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1003  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1004  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1005  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1006  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1007 
1008  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
1009  TDMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
1010  else
1011  TDMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
1012  }
1014  //**********************************************************************************************
1015 
1016  //**Default addition assignment to dense matrices***********************************************
1030  template< typename MT3 // Type of the left-hand side target matrix
1031  , typename MT4 // Type of the left-hand side matrix operand
1032  , typename MT5 > // Type of the right-hand side matrix operand
1033  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1034  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1035  {
1036  const size_t M( A.rows() );
1037  const size_t N( B.columns() );
1038  const size_t K( A.columns() );
1039 
1040  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1041  const size_t end( N & size_t(-2) );
1042 
1043  for( size_t i=0UL; i<M; ++i ) {
1044  for( size_t k=0UL; k<K; ++k ) {
1045  for( size_t j=0UL; j<end; j+=2UL ) {
1046  C(i,j ) += A(i,k) * B(k,j );
1047  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1048  }
1049  if( end < N ) {
1050  C(i,end) += A(i,k) * B(k,end);
1051  }
1052  }
1053  }
1054  }
1056  //**********************************************************************************************
1057 
1058  //**Vectorized default addition assignment to row-major dense matrices**************************
1072  template< typename MT3 // Type of the left-hand side target matrix
1073  , typename MT4 // Type of the left-hand side matrix operand
1074  , typename MT5 > // Type of the right-hand side matrix operand
1075  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1076  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1077  {
1078  typedef IntrinsicTrait<ElementType> IT;
1079 
1080  const size_t M( A.rows() );
1081  const size_t N( B.spacing() );
1082  const size_t K( A.columns() );
1083 
1084  size_t j( 0UL );
1085 
1086  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1087  for( size_t i=0UL; i<M; ++i ) {
1088  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1089  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1090  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1091  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1092  IntrinsicType xmm5( load( &(~C)(i,j+IT::size*4UL) ) );
1093  IntrinsicType xmm6( load( &(~C)(i,j+IT::size*5UL) ) );
1094  IntrinsicType xmm7( load( &(~C)(i,j+IT::size*6UL) ) );
1095  IntrinsicType xmm8( load( &(~C)(i,j+IT::size*7UL) ) );
1096  for( size_t k=0UL; k<K; ++k ) {
1097  const IntrinsicType a1( set( A(i,k) ) );
1098  xmm1 = xmm1 + a1 * B.get(k,j );
1099  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1100  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1101  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1102  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
1103  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
1104  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
1105  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
1106  }
1107  store( &(~C)(i,j ), xmm1 );
1108  store( &(~C)(i,j+IT::size ), xmm2 );
1109  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1110  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1111  store( &(~C)(i,j+IT::size*4UL), xmm5 );
1112  store( &(~C)(i,j+IT::size*5UL), xmm6 );
1113  store( &(~C)(i,j+IT::size*6UL), xmm7 );
1114  store( &(~C)(i,j+IT::size*7UL), xmm8 );
1115  }
1116  }
1117  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1118  size_t i( 0UL );
1119  for( ; (i+2UL) <= M; i+=2UL ) {
1120  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1121  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size ) ) );
1122  IntrinsicType xmm3( load( &(~C)(i ,j+IT::size*2UL) ) );
1123  IntrinsicType xmm4( load( &(~C)(i ,j+IT::size*3UL) ) );
1124  IntrinsicType xmm5( load( &(~C)(i+1UL,j ) ) );
1125  IntrinsicType xmm6( load( &(~C)(i+1UL,j+IT::size ) ) );
1126  IntrinsicType xmm7( load( &(~C)(i+1UL,j+IT::size*2UL) ) );
1127  IntrinsicType xmm8( load( &(~C)(i+1UL,j+IT::size*3UL) ) );
1128  for( size_t k=0UL; k<K; ++k ) {
1129  const IntrinsicType a1( set( A(i ,k) ) );
1130  const IntrinsicType a2( set( A(i+1UL,k) ) );
1131  const IntrinsicType b1( B.get(k,j ) );
1132  const IntrinsicType b2( B.get(k,j+IT::size ) );
1133  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
1134  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
1135  xmm1 = xmm1 + a1 * b1;
1136  xmm2 = xmm2 + a1 * b2;
1137  xmm3 = xmm3 + a1 * b3;
1138  xmm4 = xmm4 + a1 * b4;
1139  xmm5 = xmm5 + a2 * b1;
1140  xmm6 = xmm6 + a2 * b2;
1141  xmm7 = xmm7 + a2 * b3;
1142  xmm8 = xmm8 + a2 * b4;
1143  }
1144  store( &(~C)(i ,j ), xmm1 );
1145  store( &(~C)(i ,j+IT::size ), xmm2 );
1146  store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1147  store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1148  store( &(~C)(i+1UL,j ), xmm5 );
1149  store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1150  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1151  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1152  }
1153  if( i < M ) {
1154  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1155  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1156  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1157  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1158  for( size_t k=0UL; k<K; ++k ) {
1159  const IntrinsicType a1( set( A(i,k) ) );
1160  xmm1 = xmm1 + a1 * B.get(k,j );
1161  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
1162  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
1163  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
1164  }
1165  store( &(~C)(i,j ), xmm1 );
1166  store( &(~C)(i,j+IT::size ), xmm2 );
1167  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1168  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1169  }
1170  }
1171  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1172  size_t i( 0UL );
1173  for( ; (i+2UL) <= M; i+=2UL ) {
1174  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1175  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size) ) );
1176  IntrinsicType xmm3( load( &(~C)(i+1UL,j ) ) );
1177  IntrinsicType xmm4( load( &(~C)(i+1UL,j+IT::size) ) );
1178  for( size_t k=0UL; k<K; ++k ) {
1179  const IntrinsicType a1( set( A(i ,k) ) );
1180  const IntrinsicType a2( set( A(i+1UL,k) ) );
1181  const IntrinsicType b1( B.get(k,j ) );
1182  const IntrinsicType b2( B.get(k,j+IT::size) );
1183  xmm1 = xmm1 + a1 * b1;
1184  xmm2 = xmm2 + a1 * b2;
1185  xmm3 = xmm3 + a2 * b1;
1186  xmm4 = xmm4 + a2 * b2;
1187  }
1188  store( &(~C)(i ,j ), xmm1 );
1189  store( &(~C)(i ,j+IT::size), xmm2 );
1190  store( &(~C)(i+1UL,j ), xmm3 );
1191  store( &(~C)(i+1UL,j+IT::size), xmm4 );
1192  }
1193  if( i < M ) {
1194  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1195  IntrinsicType xmm2( load( &(~C)(i,j+IT::size) ) );
1196  for( size_t k=0UL; k<K; ++k ) {
1197  const IntrinsicType a1( set( A(i,k) ) );
1198  xmm1 = xmm1 + a1 * B.get(k,j );
1199  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
1200  }
1201  store( &(~C)(i,j ), xmm1 );
1202  store( &(~C)(i,j+IT::size), xmm2 );
1203  }
1204  }
1205  if( j < N ) {
1206  size_t i( 0UL );
1207  for( ; (i+2UL) <= M; i+=2UL ) {
1208  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1209  IntrinsicType xmm2( load( &(~C)(i+1UL,j) ) );
1210  for( size_t k=0UL; k<K; ++k ) {
1211  const IntrinsicType b1( B.get(k,j) );
1212  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1213  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1214  }
1215  store( &(~C)(i ,j), xmm1 );
1216  store( &(~C)(i+1UL,j), xmm2 );
1217  }
1218  if( i < M ) {
1219  IntrinsicType xmm1( load( &(~C)(i,j) ) );
1220  for( size_t k=0UL; k<K; ++k ) {
1221  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
1222  }
1223  store( &(~C)(i,j), xmm1 );
1224  }
1225  }
1226  }
1228  //**********************************************************************************************
1229 
1230  //**Vectorized default addition assignment to column-major dense matrices***********************
1244  template< typename MT3 // Type of the left-hand side target matrix
1245  , typename MT4 // Type of the left-hand side matrix operand
1246  , typename MT5 > // Type of the right-hand side matrix operand
1247  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1248  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1249  {
1250  typedef IntrinsicTrait<ElementType> IT;
1251 
1252  const size_t M( A.spacing() );
1253  const size_t N( B.columns() );
1254  const size_t K( A.columns() );
1255 
1256  size_t i( 0UL );
1257 
1258  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1259  for( size_t j=0UL; j<N; ++j ) {
1260  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1261  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1262  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1263  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1264  IntrinsicType xmm5( load( &(~C)(i+IT::size*4UL,j) ) );
1265  IntrinsicType xmm6( load( &(~C)(i+IT::size*5UL,j) ) );
1266  IntrinsicType xmm7( load( &(~C)(i+IT::size*6UL,j) ) );
1267  IntrinsicType xmm8( load( &(~C)(i+IT::size*7UL,j) ) );
1268  for( size_t k=0UL; k<K; ++k ) {
1269  const IntrinsicType b1( set( B(k,j) ) );
1270  xmm1 = xmm1 + A.get(i ,k) * b1;
1271  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1272  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1273  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1274  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
1275  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
1276  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
1277  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
1278  }
1279  store( &(~C)(i ,j), xmm1 );
1280  store( &(~C)(i+IT::size ,j), xmm2 );
1281  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1282  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1283  store( &(~C)(i+IT::size*4UL,j), xmm5 );
1284  store( &(~C)(i+IT::size*5UL,j), xmm6 );
1285  store( &(~C)(i+IT::size*6UL,j), xmm7 );
1286  store( &(~C)(i+IT::size*7UL,j), xmm8 );
1287  }
1288  }
1289  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1290  size_t j( 0UL );
1291  for( ; (j+2UL) <= N; j+=2UL ) {
1292  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1293  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j ) ) );
1294  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j ) ) );
1295  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j ) ) );
1296  IntrinsicType xmm5( load( &(~C)(i ,j+1UL) ) );
1297  IntrinsicType xmm6( load( &(~C)(i+IT::size ,j+1UL) ) );
1298  IntrinsicType xmm7( load( &(~C)(i+IT::size*2UL,j+1UL) ) );
1299  IntrinsicType xmm8( load( &(~C)(i+IT::size*3UL,j+1UL) ) );
1300  for( size_t k=0UL; k<K; ++k ) {
1301  const IntrinsicType a1( A.get(i ,k) );
1302  const IntrinsicType a2( A.get(i+IT::size ,k) );
1303  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
1304  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
1305  const IntrinsicType b1( set( B(k,j ) ) );
1306  const IntrinsicType b2( set( B(k,j+1UL) ) );
1307  xmm1 = xmm1 + a1 * b1;
1308  xmm2 = xmm2 + a2 * b1;
1309  xmm3 = xmm3 + a3 * b1;
1310  xmm4 = xmm4 + a4 * b1;
1311  xmm5 = xmm5 + a1 * b2;
1312  xmm6 = xmm6 + a2 * b2;
1313  xmm7 = xmm7 + a3 * b2;
1314  xmm8 = xmm8 + a4 * b2;
1315  }
1316  store( &(~C)(i ,j ), xmm1 );
1317  store( &(~C)(i+IT::size ,j ), xmm2 );
1318  store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1319  store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1320  store( &(~C)(i ,j+1UL), xmm5 );
1321  store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1322  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1323  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1324  }
1325  if( j < N ) {
1326  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1327  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1328  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1329  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1330  for( size_t k=0UL; k<K; ++k ) {
1331  const IntrinsicType b1( set( B(k,j) ) );
1332  xmm1 = xmm1 + A.get(i ,k) * b1;
1333  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
1334  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
1335  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
1336  }
1337  store( &(~C)(i ,j), xmm1 );
1338  store( &(~C)(i+IT::size ,j), xmm2 );
1339  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1340  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1341  }
1342  }
1343  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1344  size_t j( 0UL );
1345  for( ; (j+2UL) <= N; j+=2UL ) {
1346  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1347  IntrinsicType xmm2( load( &(~C)(i+IT::size,j ) ) );
1348  IntrinsicType xmm3( load( &(~C)(i ,j+1UL) ) );
1349  IntrinsicType xmm4( load( &(~C)(i+IT::size,j+1UL) ) );
1350  for( size_t k=0UL; k<K; ++k ) {
1351  const IntrinsicType a1( A.get(i ,k) );
1352  const IntrinsicType a2( A.get(i+IT::size,k) );
1353  const IntrinsicType b1( set( B(k,j ) ) );
1354  const IntrinsicType b2( set( B(k,j+1UL) ) );
1355  xmm1 = xmm1 + a1 * b1;
1356  xmm2 = xmm2 + a2 * b1;
1357  xmm3 = xmm3 + a1 * b2;
1358  xmm4 = xmm4 + a2 * b2;
1359  }
1360  store( &(~C)(i ,j ), xmm1 );
1361  store( &(~C)(i+IT::size,j ), xmm2 );
1362  store( &(~C)(i ,j+1UL), xmm3 );
1363  store( &(~C)(i+IT::size,j+1UL), xmm4 );
1364  }
1365  if( j < N ) {
1366  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1367  IntrinsicType xmm2( load( &(~C)(i+IT::size,j) ) );
1368  for( size_t k=0UL; k<K; ++k ) {
1369  const IntrinsicType b1( set( B(k,j) ) );
1370  xmm1 = xmm1 + A.get(i ,k) * b1;
1371  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
1372  }
1373  store( &(~C)(i ,j), xmm1 );
1374  store( &(~C)(i+IT::size,j), xmm2 );
1375  }
1376  }
1377  if( i < M ) {
1378  size_t j( 0UL );
1379  for( ; (j+2UL) <= N; j+=2UL ) {
1380  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1381  IntrinsicType xmm2( load( &(~C)(i,j+1UL) ) );
1382  for( size_t k=0UL; k<K; ++k ) {
1383  const IntrinsicType a1( A.get(i,k) );
1384  xmm1 = xmm1 + a1 * set( B(k,j ) );
1385  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1386  }
1387  store( &(~C)(i,j ), xmm1 );
1388  store( &(~C)(i,j+1UL), xmm2 );
1389  }
1390  if( j < N ) {
1391  IntrinsicType xmm1( load( &(~C)(i,j) ) );
1392  for( size_t k=0UL; k<K; ++k ) {
1393  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
1394  }
1395  store( &(~C)(i,j), xmm1 );
1396  }
1397  }
1398  }
1400  //**********************************************************************************************
1401 
1402  //**BLAS-based addition assignment to dense matrices (default)**********************************
1416  template< typename MT3 // Type of the left-hand side target matrix
1417  , typename MT4 // Type of the left-hand side matrix operand
1418  , typename MT5 > // Type of the right-hand side matrix operand
1419  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1420  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1421  {
1422  selectDefaultAddAssignKernel( C, A, B );
1423  }
1425  //**********************************************************************************************
1426 
1427  //**BLAS-based addition assignment to dense matrices (single precision)*************************
#if BLAZE_BLAS_MODE

/*!\brief BLAS-based kernel computing \f$ C+=A*B \f$ for single precision matrices.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Passes the complete operation to \c cblas_sgemm with both scaling factors set to one, so
// the product is accumulated onto the existing content of \a C. The storage order of \a C
// selects the BLAS layout: for a row-major target \a A is flagged as transposed, for a
// column-major target \a B is. All sizes and leading dimensions are converted to \c int by
// means of \c boost::numeric_cast. Only compiled if \c BLAZE_BLAS_MODE is enabled.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
{
   BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );

   // Problem sizes and leading dimensions as expected by the BLAS interface
   const int m  ( boost::numeric_cast<int>( A.rows()    ) );
   const int n  ( boost::numeric_cast<int>( B.columns() ) );
   const int k  ( boost::numeric_cast<int>( A.columns() ) );
   const int lda( boost::numeric_cast<int>( A.spacing() ) );
   const int ldb( boost::numeric_cast<int>( B.spacing() ) );
   const int ldc( boost::numeric_cast<int>( C.spacing() ) );

   const bool rowMajorTarget( IsRowMajorMatrix<MT3>::value );

   cblas_sgemm( ( rowMajorTarget )?( CblasRowMajor ):( CblasColMajor ),
                ( rowMajorTarget )?( CblasTrans    ):( CblasNoTrans  ),
                ( rowMajorTarget )?( CblasNoTrans  ):( CblasTrans    ),
                m, n, k, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
}
#endif
1468  //**********************************************************************************************
1469 
1470  //**BLAS-based addition assignment to dense matrices (double precision)*************************
#if BLAZE_BLAS_MODE

/*!\brief BLAS-based kernel computing \f$ C+=A*B \f$ for double precision matrices.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Passes the complete operation to \c cblas_dgemm with both scaling factors set to one, so
// the product is accumulated onto the existing content of \a C. The storage order of \a C
// selects the BLAS layout: for a row-major target \a A is flagged as transposed, for a
// column-major target \a B is. All sizes and leading dimensions are converted to \c int by
// means of \c boost::numeric_cast. Only compiled if \c BLAZE_BLAS_MODE is enabled.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
{
   BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );

   // Problem sizes and leading dimensions as expected by the BLAS interface
   const int m  ( boost::numeric_cast<int>( A.rows()    ) );
   const int n  ( boost::numeric_cast<int>( B.columns() ) );
   const int k  ( boost::numeric_cast<int>( A.columns() ) );
   const int lda( boost::numeric_cast<int>( A.spacing() ) );
   const int ldb( boost::numeric_cast<int>( B.spacing() ) );
   const int ldc( boost::numeric_cast<int>( C.spacing() ) );

   const bool rowMajorTarget( IsRowMajorMatrix<MT3>::value );

   cblas_dgemm( ( rowMajorTarget )?( CblasRowMajor ):( CblasColMajor ),
                ( rowMajorTarget )?( CblasTrans    ):( CblasNoTrans  ),
                ( rowMajorTarget )?( CblasNoTrans  ):( CblasTrans    ),
                m, n, k, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
}
#endif
1511  //**********************************************************************************************
1512 
1513  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
#if BLAZE_BLAS_MODE

/*!\brief BLAS-based kernel computing \f$ C+=A*B \f$ for single precision complex matrices.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Passes the complete operation to \c cblas_cgemm. Both scaling factors are the complex
// value (1,0) and are handed over by address, so the product is accumulated onto the
// existing content of \a C. The storage order of \a C selects the BLAS layout: for a
// row-major target \a A is flagged as transposed, for a column-major target \a B is. All
// sizes and leading dimensions are converted to \c int by means of \c boost::numeric_cast.
// Only compiled if \c BLAZE_BLAS_MODE is enabled.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
{
   BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE  ( typename MT3::ElementType::value_type );
   BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE  ( typename MT4::ElementType::value_type );
   BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE  ( typename MT5::ElementType::value_type );

   // Problem sizes and leading dimensions as expected by the BLAS interface
   const int m  ( boost::numeric_cast<int>( A.rows()    ) );
   const int n  ( boost::numeric_cast<int>( B.columns() ) );
   const int k  ( boost::numeric_cast<int>( A.columns() ) );
   const int lda( boost::numeric_cast<int>( A.spacing() ) );
   const int ldb( boost::numeric_cast<int>( B.spacing() ) );
   const int ldc( boost::numeric_cast<int>( C.spacing() ) );

   // Scaling factors of the product (alpha) and of the existing target values (beta)
   const complex<float> alpha( 1.0F, 0.0F );
   const complex<float> beta ( 1.0F, 0.0F );

   const bool rowMajorTarget( IsRowMajorMatrix<MT3>::value );

   cblas_cgemm( ( rowMajorTarget )?( CblasRowMajor ):( CblasColMajor ),
                ( rowMajorTarget )?( CblasTrans    ):( CblasNoTrans  ),
                ( rowMajorTarget )?( CblasNoTrans  ):( CblasTrans    ),
                m, n, k, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
}
#endif
1559  //**********************************************************************************************
1560 
1561  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
#if BLAZE_BLAS_MODE

/*!\brief BLAS-based kernel computing \f$ C+=A*B \f$ for double precision complex matrices.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Passes the complete operation to \c cblas_zgemm. Both scaling factors are the complex
// value (1,0) and are handed over by address, so the product is accumulated onto the
// existing content of \a C. The storage order of \a C selects the BLAS layout: for a
// row-major target \a A is flagged as transposed, for a column-major target \a B is. All
// sizes and leading dimensions are converted to \c int by means of \c boost::numeric_cast.
// Only compiled if \c BLAZE_BLAS_MODE is enabled.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
   selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
{
   BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
   BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
   BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
   BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );

   // Problem sizes and leading dimensions as expected by the BLAS interface
   const int m  ( boost::numeric_cast<int>( A.rows()    ) );
   const int n  ( boost::numeric_cast<int>( B.columns() ) );
   const int k  ( boost::numeric_cast<int>( A.columns() ) );
   const int lda( boost::numeric_cast<int>( A.spacing() ) );
   const int ldb( boost::numeric_cast<int>( B.spacing() ) );
   const int ldc( boost::numeric_cast<int>( C.spacing() ) );

   // Scaling factors of the product (alpha) and of the existing target values (beta)
   const complex<double> alpha( 1.0, 0.0 );
   const complex<double> beta ( 1.0, 0.0 );

   const bool rowMajorTarget( IsRowMajorMatrix<MT3>::value );

   cblas_zgemm( ( rowMajorTarget )?( CblasRowMajor ):( CblasColMajor ),
                ( rowMajorTarget )?( CblasTrans    ):( CblasNoTrans  ),
                ( rowMajorTarget )?( CblasNoTrans  ):( CblasTrans    ),
                m, n, k, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
}
#endif
1607  //**********************************************************************************************
1608 
1609  //**Addition assignment to sparse matrices******************************************************
1610  // No special implementation for the addition assignment to sparse matrices.
1611  //**********************************************************************************************
1612 
1613  //**Subtraction assignment to dense matrices****************************************************
1626  template< typename MT // Type of the target dense matrix
1627  , bool SO > // Storage order of the target dense matrix
1628  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1629  {
1631 
1632  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1633  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1634 
1635  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1636  return;
1637  }
1638 
1639  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1640  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1641 
1642  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1643  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1644  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1645  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1646  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1647  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1648 
1649  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
1650  TDMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1651  else
1652  TDMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
1653  }
1655  //**********************************************************************************************
1656 
1657  //**Default subtraction assignment to dense matrices********************************************
1671  template< typename MT3 // Type of the left-hand side target matrix
1672  , typename MT4 // Type of the left-hand side matrix operand
1673  , typename MT5 > // Type of the right-hand side matrix operand
1674  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1675  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1676  {
1677  const size_t M( A.rows() );
1678  const size_t N( B.columns() );
1679  const size_t K( A.columns() );
1680 
1681  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1682  const size_t end( N & size_t(-2) );
1683 
1684  for( size_t i=0UL; i<M; ++i ) {
1685  for( size_t k=0UL; k<K; ++k ) {
1686  for( size_t j=0UL; j<end; j+=2UL ) {
1687  C(i,j ) -= A(i,k) * B(k,j );
1688  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
1689  }
1690  if( end < N ) {
1691  C(i,end) -= A(i,k) * B(k,end);
1692  }
1693  }
1694  }
1695  }
1697  //**********************************************************************************************
1698 
1699  //**Vectorized default subtraction assignment to row-major dense matrices***********************
1713  template< typename MT3 // Type of the left-hand side target matrix
1714  , typename MT4 // Type of the left-hand side matrix operand
1715  , typename MT5 > // Type of the right-hand side matrix operand
1716  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1717  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1718  {
1719  typedef IntrinsicTrait<ElementType> IT;
1720 
1721  const size_t M( A.rows() );
1722  const size_t N( B.spacing() );
1723  const size_t K( A.columns() );
1724 
1725  size_t j( 0UL );
1726 
1727  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1728  for( size_t i=0UL; i<M; ++i ) {
1729  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1730  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1731  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1732  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1733  IntrinsicType xmm5( load( &(~C)(i,j+IT::size*4UL) ) );
1734  IntrinsicType xmm6( load( &(~C)(i,j+IT::size*5UL) ) );
1735  IntrinsicType xmm7( load( &(~C)(i,j+IT::size*6UL) ) );
1736  IntrinsicType xmm8( load( &(~C)(i,j+IT::size*7UL) ) );
1737  for( size_t k=0UL; k<K; ++k ) {
1738  const IntrinsicType a1( set( A(i,k) ) );
1739  xmm1 = xmm1 - a1 * B.get(k,j );
1740  xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1741  xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1742  xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1743  xmm5 = xmm5 - a1 * B.get(k,j+IT::size*4UL);
1744  xmm6 = xmm6 - a1 * B.get(k,j+IT::size*5UL);
1745  xmm7 = xmm7 - a1 * B.get(k,j+IT::size*6UL);
1746  xmm8 = xmm8 - a1 * B.get(k,j+IT::size*7UL);
1747  }
1748  store( &(~C)(i,j ), xmm1 );
1749  store( &(~C)(i,j+IT::size ), xmm2 );
1750  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1751  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1752  store( &(~C)(i,j+IT::size*4UL), xmm5 );
1753  store( &(~C)(i,j+IT::size*5UL), xmm6 );
1754  store( &(~C)(i,j+IT::size*6UL), xmm7 );
1755  store( &(~C)(i,j+IT::size*7UL), xmm8 );
1756  }
1757  }
1758  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1759  size_t i( 0UL );
1760  for( ; (i+2UL) <= M; i+=2UL ) {
1761  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1762  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size ) ) );
1763  IntrinsicType xmm3( load( &(~C)(i ,j+IT::size*2UL) ) );
1764  IntrinsicType xmm4( load( &(~C)(i ,j+IT::size*3UL) ) );
1765  IntrinsicType xmm5( load( &(~C)(i+1UL,j ) ) );
1766  IntrinsicType xmm6( load( &(~C)(i+1UL,j+IT::size ) ) );
1767  IntrinsicType xmm7( load( &(~C)(i+1UL,j+IT::size*2UL) ) );
1768  IntrinsicType xmm8( load( &(~C)(i+1UL,j+IT::size*3UL) ) );
1769  for( size_t k=0UL; k<K; ++k ) {
1770  const IntrinsicType a1( set( A(i ,k) ) );
1771  const IntrinsicType a2( set( A(i+1UL,k) ) );
1772  const IntrinsicType b1( B.get(k,j ) );
1773  const IntrinsicType b2( B.get(k,j+IT::size ) );
1774  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
1775  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
1776  xmm1 = xmm1 - a1 * b1;
1777  xmm2 = xmm2 - a1 * b2;
1778  xmm3 = xmm3 - a1 * b3;
1779  xmm4 = xmm4 - a1 * b4;
1780  xmm5 = xmm5 - a2 * b1;
1781  xmm6 = xmm6 - a2 * b2;
1782  xmm7 = xmm7 - a2 * b3;
1783  xmm8 = xmm8 - a2 * b4;
1784  }
1785  store( &(~C)(i ,j ), xmm1 );
1786  store( &(~C)(i ,j+IT::size ), xmm2 );
1787  store( &(~C)(i ,j+IT::size*2UL), xmm3 );
1788  store( &(~C)(i ,j+IT::size*3UL), xmm4 );
1789  store( &(~C)(i+1UL,j ), xmm5 );
1790  store( &(~C)(i+1UL,j+IT::size ), xmm6 );
1791  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 );
1792  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 );
1793  }
1794  if( i < M ) {
1795  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1796  IntrinsicType xmm2( load( &(~C)(i,j+IT::size ) ) );
1797  IntrinsicType xmm3( load( &(~C)(i,j+IT::size*2UL) ) );
1798  IntrinsicType xmm4( load( &(~C)(i,j+IT::size*3UL) ) );
1799  for( size_t k=0UL; k<K; ++k ) {
1800  const IntrinsicType a1( set( A(i,k) ) );
1801  xmm1 = xmm1 - a1 * B.get(k,j );
1802  xmm2 = xmm2 - a1 * B.get(k,j+IT::size );
1803  xmm3 = xmm3 - a1 * B.get(k,j+IT::size*2UL);
1804  xmm4 = xmm4 - a1 * B.get(k,j+IT::size*3UL);
1805  }
1806  store( &(~C)(i,j ), xmm1 );
1807  store( &(~C)(i,j+IT::size ), xmm2 );
1808  store( &(~C)(i,j+IT::size*2UL), xmm3 );
1809  store( &(~C)(i,j+IT::size*3UL), xmm4 );
1810  }
1811  }
1812  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1813  size_t i( 0UL );
1814  for( ; (i+2UL) <= M; i+=2UL ) {
1815  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1816  IntrinsicType xmm2( load( &(~C)(i ,j+IT::size) ) );
1817  IntrinsicType xmm3( load( &(~C)(i+1UL,j ) ) );
1818  IntrinsicType xmm4( load( &(~C)(i+1UL,j+IT::size) ) );
1819  for( size_t k=0UL; k<K; ++k ) {
1820  const IntrinsicType a1( set( A(i ,k) ) );
1821  const IntrinsicType a2( set( A(i+1UL,k) ) );
1822  const IntrinsicType b1( B.get(k,j ) );
1823  const IntrinsicType b2( B.get(k,j+IT::size) );
1824  xmm1 = xmm1 - a1 * b1;
1825  xmm2 = xmm2 - a1 * b2;
1826  xmm3 = xmm3 - a2 * b1;
1827  xmm4 = xmm4 - a2 * b2;
1828  }
1829  store( &(~C)(i ,j ), xmm1 );
1830  store( &(~C)(i ,j+IT::size), xmm2 );
1831  store( &(~C)(i+1UL,j ), xmm3 );
1832  store( &(~C)(i+1UL,j+IT::size), xmm4 );
1833  }
1834  if( i < M ) {
1835  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
1836  IntrinsicType xmm2( load( &(~C)(i,j+IT::size) ) );
1837  for( size_t k=0UL; k<K; ++k ) {
1838  const IntrinsicType a1( set( A(i,k) ) );
1839  xmm1 = xmm1 - a1 * B.get(k,j );
1840  xmm2 = xmm2 - a1 * B.get(k,j+IT::size);
1841  }
1842  store( &(~C)(i,j ), xmm1 );
1843  store( &(~C)(i,j+IT::size), xmm2 );
1844  }
1845  }
1846  if( j < N ) {
1847  size_t i( 0UL );
1848  for( ; (i+2UL) <= M; i+=2UL ) {
1849  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1850  IntrinsicType xmm2( load( &(~C)(i+1UL,j) ) );
1851  for( size_t k=0UL; k<K; ++k ) {
1852  const IntrinsicType b1( B.get(k,j) );
1853  xmm1 = xmm1 - set( A(i ,k) ) * b1;
1854  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
1855  }
1856  store( &(~C)(i ,j), xmm1 );
1857  store( &(~C)(i+1UL,j), xmm2 );
1858  }
1859  if( i < M ) {
1860  IntrinsicType xmm1( load( &(~C)(i,j) ) );
1861  for( size_t k=0UL; k<K; ++k ) {
1862  xmm1 = xmm1 - set( A(i,k) ) * B.get(k,j);
1863  }
1864  store( &(~C)(i,j), xmm1 );
1865  }
1866  }
1867  }
1869  //**********************************************************************************************
1870 
1871  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default kernel for the subtraction assignment C -= A * B. This overload takes the
// target as DenseMatrix<MT3,true> (the column-major case) and is enabled through
// UseVectorizedDefaultKernel<MT3,MT4,MT5>.
//
// Every accumulator is loaded from C, reduced by the products A.get(...,k) * set( B(k,j) ) over the
// complete inner dimension K, and stored back to the same position of C. The rows are processed in
// blocks of 8, 4, 2 and finally 1 intrinsic vector(s); one vector spans IT::size elements.
//
// C: target matrix, read and overwritten in place
// A: left-hand side operand, read through get() at row offsets that are multiples of IT::size
// B: right-hand side operand, read element-wise through B(k,j) and passed through set()
//
// NOTE(review): M is taken from A.spacing() and not from A.rows(), so the loops also cover the
// range between rows() and spacing(). This presumably relies on C and A sharing the same padded
// column length and on spacing() being a multiple of IT::size -- confirm against the matrix types.
1885  template< typename MT3 // Type of the left-hand side target matrix
1886  , typename MT4 // Type of the left-hand side matrix operand
1887  , typename MT5 > // Type of the right-hand side matrix operand
1888  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1889  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1890  {
1891  typedef IntrinsicTrait<ElementType> IT;
1892 
1893  const size_t M( A.spacing() );
1894  const size_t N( B.columns() );
1895  const size_t K( A.columns() );
1896 
1897  size_t i( 0UL );
1898 
// Blocks of eight vectors along the rows, one column of C at a time.
1899  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1900  for( size_t j=0UL; j<N; ++j ) {
1901  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1902  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1903  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1904  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1905  IntrinsicType xmm5( load( &(~C)(i+IT::size*4UL,j) ) );
1906  IntrinsicType xmm6( load( &(~C)(i+IT::size*5UL,j) ) );
1907  IntrinsicType xmm7( load( &(~C)(i+IT::size*6UL,j) ) );
1908  IntrinsicType xmm8( load( &(~C)(i+IT::size*7UL,j) ) );
1909  for( size_t k=0UL; k<K; ++k ) {
1910  const IntrinsicType b1( set( B(k,j) ) );
1911  xmm1 = xmm1 - A.get(i ,k) * b1;
1912  xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1913  xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1914  xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1915  xmm5 = xmm5 - A.get(i+IT::size*4UL,k) * b1;
1916  xmm6 = xmm6 - A.get(i+IT::size*5UL,k) * b1;
1917  xmm7 = xmm7 - A.get(i+IT::size*6UL,k) * b1;
1918  xmm8 = xmm8 - A.get(i+IT::size*7UL,k) * b1;
1919  }
1920  store( &(~C)(i ,j), xmm1 );
1921  store( &(~C)(i+IT::size ,j), xmm2 );
1922  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1923  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1924  store( &(~C)(i+IT::size*4UL,j), xmm5 );
1925  store( &(~C)(i+IT::size*5UL,j), xmm6 );
1926  store( &(~C)(i+IT::size*6UL,j), xmm7 );
1927  store( &(~C)(i+IT::size*7UL,j), xmm8 );
1928  }
1929  }
// Blocks of four vectors along the rows, two columns of C at a time (xmm1-4: column j,
// xmm5-8: column j+1).
1930  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1931  size_t j( 0UL );
1932  for( ; (j+2UL) <= N; j+=2UL ) {
1933  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1934  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j ) ) );
1935  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j ) ) );
1936  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j ) ) );
1937  IntrinsicType xmm5( load( &(~C)(i ,j+1UL) ) );
1938  IntrinsicType xmm6( load( &(~C)(i+IT::size ,j+1UL) ) );
1939  IntrinsicType xmm7( load( &(~C)(i+IT::size*2UL,j+1UL) ) );
1940  IntrinsicType xmm8( load( &(~C)(i+IT::size*3UL,j+1UL) ) );
1941  for( size_t k=0UL; k<K; ++k ) {
1942  const IntrinsicType a1( A.get(i ,k) );
1943  const IntrinsicType a2( A.get(i+IT::size ,k) );
1944  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
1945  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
1946  const IntrinsicType b1( set( B(k,j ) ) );
1947  const IntrinsicType b2( set( B(k,j+1UL) ) );
1948  xmm1 = xmm1 - a1 * b1;
1949  xmm2 = xmm2 - a2 * b1;
1950  xmm3 = xmm3 - a3 * b1;
1951  xmm4 = xmm4 - a4 * b1;
1952  xmm5 = xmm5 - a1 * b2;
1953  xmm6 = xmm6 - a2 * b2;
1954  xmm7 = xmm7 - a3 * b2;
1955  xmm8 = xmm8 - a4 * b2;
1956  }
1957  store( &(~C)(i ,j ), xmm1 );
1958  store( &(~C)(i+IT::size ,j ), xmm2 );
1959  store( &(~C)(i+IT::size*2UL,j ), xmm3 );
1960  store( &(~C)(i+IT::size*3UL,j ), xmm4 );
1961  store( &(~C)(i ,j+1UL), xmm5 );
1962  store( &(~C)(i+IT::size ,j+1UL), xmm6 );
1963  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 );
1964  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 );
1965  }
// Leftover single column when N is odd.
1966  if( j < N ) {
1967  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
1968  IntrinsicType xmm2( load( &(~C)(i+IT::size ,j) ) );
1969  IntrinsicType xmm3( load( &(~C)(i+IT::size*2UL,j) ) );
1970  IntrinsicType xmm4( load( &(~C)(i+IT::size*3UL,j) ) );
1971  for( size_t k=0UL; k<K; ++k ) {
1972  const IntrinsicType b1( set( B(k,j) ) );
1973  xmm1 = xmm1 - A.get(i ,k) * b1;
1974  xmm2 = xmm2 - A.get(i+IT::size ,k) * b1;
1975  xmm3 = xmm3 - A.get(i+IT::size*2UL,k) * b1;
1976  xmm4 = xmm4 - A.get(i+IT::size*3UL,k) * b1;
1977  }
1978  store( &(~C)(i ,j), xmm1 );
1979  store( &(~C)(i+IT::size ,j), xmm2 );
1980  store( &(~C)(i+IT::size*2UL,j), xmm3 );
1981  store( &(~C)(i+IT::size*3UL,j), xmm4 );
1982  }
1983  }
// Blocks of two vectors along the rows, two columns of C at a time.
1984  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1985  size_t j( 0UL );
1986  for( ; (j+2UL) <= N; j+=2UL ) {
1987  IntrinsicType xmm1( load( &(~C)(i ,j ) ) );
1988  IntrinsicType xmm2( load( &(~C)(i+IT::size,j ) ) );
1989  IntrinsicType xmm3( load( &(~C)(i ,j+1UL) ) );
1990  IntrinsicType xmm4( load( &(~C)(i+IT::size,j+1UL) ) );
1991  for( size_t k=0UL; k<K; ++k ) {
1992  const IntrinsicType a1( A.get(i ,k) );
1993  const IntrinsicType a2( A.get(i+IT::size,k) );
1994  const IntrinsicType b1( set( B(k,j ) ) );
1995  const IntrinsicType b2( set( B(k,j+1UL) ) );
1996  xmm1 = xmm1 - a1 * b1;
1997  xmm2 = xmm2 - a2 * b1;
1998  xmm3 = xmm3 - a1 * b2;
1999  xmm4 = xmm4 - a2 * b2;
2000  }
2001  store( &(~C)(i ,j ), xmm1 );
2002  store( &(~C)(i+IT::size,j ), xmm2 );
2003  store( &(~C)(i ,j+1UL), xmm3 );
2004  store( &(~C)(i+IT::size,j+1UL), xmm4 );
2005  }
// Leftover single column when N is odd.
2006  if( j < N ) {
2007  IntrinsicType xmm1( load( &(~C)(i ,j) ) );
2008  IntrinsicType xmm2( load( &(~C)(i+IT::size,j) ) );
2009  for( size_t k=0UL; k<K; ++k ) {
2010  const IntrinsicType b1( set( B(k,j) ) );
2011  xmm1 = xmm1 - A.get(i ,k) * b1;
2012  xmm2 = xmm2 - A.get(i+IT::size,k) * b1;
2013  }
2014  store( &(~C)(i ,j), xmm1 );
2015  store( &(~C)(i+IT::size,j), xmm2 );
2016  }
2017  }
// Remaining rows: exactly one vector starting at row offset i is processed here.
2018  if( i < M ) {
2019  size_t j( 0UL );
2020  for( ; (j+2UL) <= N; j+=2UL ) {
2021  IntrinsicType xmm1( load( &(~C)(i,j ) ) );
2022  IntrinsicType xmm2( load( &(~C)(i,j+1UL) ) );
2023  for( size_t k=0UL; k<K; ++k ) {
2024  const IntrinsicType a1( A.get(i,k) );
2025  xmm1 = xmm1 - a1 * set( B(k,j ) );
2026  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
2027  }
2028  store( &(~C)(i,j ), xmm1 );
2029  store( &(~C)(i,j+1UL), xmm2 );
2030  }
// Leftover single column when N is odd.
2031  if( j < N ) {
2032  IntrinsicType xmm1( load( &(~C)(i,j) ) );
2033  for( size_t k=0UL; k<K; ++k ) {
2034  xmm1 = xmm1 - A.get(i,k) * set( B(k,j) );
2035  }
2036  store( &(~C)(i,j), xmm1 );
2037  }
2038  }
2039  }
2041  //**********************************************************************************************
2042 
2043  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
2057  template< typename MT3 // Type of the left-hand side target matrix
2058  , typename MT4 // Type of the left-hand side matrix operand
2059  , typename MT5 > // Type of the right-hand side matrix operand
2060  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2061  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2062  {
2063  selectDefaultSubAssignKernel( C, A, B );
2064  }
2066  //**********************************************************************************************
2067 
2068  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
2069 #if BLAZE_BLAS_MODE
2070 
2083  template< typename MT3 // Type of the left-hand side target matrix
2084  , typename MT4 // Type of the left-hand side matrix operand
2085  , typename MT5 > // Type of the right-hand side matrix operand
2086  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2087  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2088  {
2089  using boost::numeric_cast;
2090 
2091  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
2092  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
2093  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
2094 
2095  const int M ( numeric_cast<int>( A.rows() ) );
2096  const int N ( numeric_cast<int>( B.columns() ) );
2097  const int K ( numeric_cast<int>( A.columns() ) );
2098  const int lda( numeric_cast<int>( A.spacing() ) );
2099  const int ldb( numeric_cast<int>( B.spacing() ) );
2100  const int ldc( numeric_cast<int>( C.spacing() ) );
2101 
2102  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2103  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2104  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2105  M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
2106  }
2108 #endif
2109  //**********************************************************************************************
2110 
2111  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
2112 #if BLAZE_BLAS_MODE
2113 
2126  template< typename MT3 // Type of the left-hand side target matrix
2127  , typename MT4 // Type of the left-hand side matrix operand
2128  , typename MT5 > // Type of the right-hand side matrix operand
2129  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2130  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2131  {
2132  using boost::numeric_cast;
2133 
2134  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
2135  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
2136  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
2137 
2138  const int M ( numeric_cast<int>( A.rows() ) );
2139  const int N ( numeric_cast<int>( B.columns() ) );
2140  const int K ( numeric_cast<int>( A.columns() ) );
2141  const int lda( numeric_cast<int>( A.spacing() ) );
2142  const int ldb( numeric_cast<int>( B.spacing() ) );
2143  const int ldc( numeric_cast<int>( C.spacing() ) );
2144 
2145  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2146  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2147  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2148  M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
2149  }
2151 #endif
2152  //**********************************************************************************************
2153 
2154  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
2155 #if BLAZE_BLAS_MODE
2156 
2169  template< typename MT3 // Type of the left-hand side target matrix
2170  , typename MT4 // Type of the left-hand side matrix operand
2171  , typename MT5 > // Type of the right-hand side matrix operand
2172  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2173  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2174  {
2175  using boost::numeric_cast;
2176 
2177  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2178  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2179  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2180  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
2181  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
2182  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
2183 
2184  const int M ( numeric_cast<int>( A.rows() ) );
2185  const int N ( numeric_cast<int>( B.columns() ) );
2186  const int K ( numeric_cast<int>( A.columns() ) );
2187  const int lda( numeric_cast<int>( A.spacing() ) );
2188  const int ldb( numeric_cast<int>( B.spacing() ) );
2189  const int ldc( numeric_cast<int>( C.spacing() ) );
2190  const complex<float> alpha( -1.0F, 0.0F );
2191  const complex<float> beta ( 1.0F, 0.0F );
2192 
2193  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2194  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2195  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2196  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2197  }
2199 #endif
2200  //**********************************************************************************************
2201 
2202  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
2203 #if BLAZE_BLAS_MODE
2204 
2217  template< typename MT3 // Type of the left-hand side target matrix
2218  , typename MT4 // Type of the left-hand side matrix operand
2219  , typename MT5 > // Type of the right-hand side matrix operand
2220  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2221  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2222  {
2223  using boost::numeric_cast;
2224 
2225  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
2226  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
2227  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
2228  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
2229  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
2230  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
2231 
2232  const int M ( numeric_cast<int>( A.rows() ) );
2233  const int N ( numeric_cast<int>( B.columns() ) );
2234  const int K ( numeric_cast<int>( A.columns() ) );
2235  const int lda( numeric_cast<int>( A.spacing() ) );
2236  const int ldb( numeric_cast<int>( B.spacing() ) );
2237  const int ldc( numeric_cast<int>( C.spacing() ) );
2238  const complex<double> alpha( -1.0, 0.0 );
2239  const complex<double> beta ( 1.0, 0.0 );
2240 
2241  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2242  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2243  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2244  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2245  }
2247 #endif
2248  //**********************************************************************************************
2249 
2250  //**Subtraction assignment to sparse matrices***************************************************
2251  // No special implementation for the subtraction assignment to sparse matrices.
2252  //**********************************************************************************************
2253 
2254  //**Multiplication assignment to dense matrices*************************************************
2255  // No special implementation for the multiplication assignment to dense matrices.
2256  //**********************************************************************************************
2257 
2258  //**Multiplication assignment to sparse matrices************************************************
2259  // No special implementation for the multiplication assignment to sparse matrices.
2260  //**********************************************************************************************
2261 
2262  //**Compile time checks*************************************************************************
2269  //**********************************************************************************************
2270 };
2271 //*************************************************************************************************
2272 
2273 
2274 
2275 
2276 //=================================================================================================
2277 //
2278 // DMATSCALARMULTEXPR SPECIALIZATION
2279 //
2280 //=================================================================================================
2281 
2282 //*************************************************************************************************
2290 template< typename MT1 // Type of the left-hand side dense matrix
2291  , typename MT2 // Type of the right-hand side dense matrix
2292  , typename ST > // Type of the right-hand side scalar value
2293 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >
2294  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2295  , private MatScalarMultExpr
2296  , private Computation
2297 {
2298  private:
2299  //**Type definitions****************************************************************************
// Shorthand types for the wrapped multiplication expression and its two matrix operands.
2300  typedef TDMatDMatMultExpr<MT1,MT2> MMM; // Type of the wrapped matrix/matrix multiplication expression
2301  typedef typename MMM::ResultType RES; // Result type of the wrapped multiplication expression
2302  typedef typename MT1::ResultType RT1; // Result type of the left-hand side matrix type MT1
2303  typedef typename MT2::ResultType RT2; // Result type of the right-hand side matrix type MT2
2304  typedef typename MT1::CompositeType CT1; // Composite type of the left-hand side matrix type MT1
2305  typedef typename MT2::CompositeType CT2; // Composite type of the right-hand side matrix type MT2
2306  //**********************************************************************************************
2307 
2308  //**********************************************************************************************
2310 
2313  template< typename T1, typename T2, typename T3, typename T4 >
2314  struct UseSinglePrecisionKernel {
2315  enum { value = IsFloat<typename T1::ElementType>::value &&
2316  IsFloat<typename T2::ElementType>::value &&
2317  IsFloat<typename T3::ElementType>::value &&
2318  !IsComplex<T4>::value };
2319  };
2320  //**********************************************************************************************
2321 
2322  //**********************************************************************************************
2324 
2327  template< typename T1, typename T2, typename T3, typename T4 >
2328  struct UseDoublePrecisionKernel {
2329  enum { value = IsDouble<typename T1::ElementType>::value &&
2330  IsDouble<typename T2::ElementType>::value &&
2331  IsDouble<typename T3::ElementType>::value &&
2332  !IsComplex<T4>::value };
2333  };
2334  //**********************************************************************************************
2335 
2336  //**********************************************************************************************
2338 
2341  template< typename T1, typename T2, typename T3 >
2342  struct UseSinglePrecisionComplexKernel {
2343  typedef complex<float> Type;
2344  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2345  IsSame<typename T2::ElementType,Type>::value &&
2346  IsSame<typename T3::ElementType,Type>::value };
2347  };
2348  //**********************************************************************************************
2349 
2350  //**********************************************************************************************
2352 
2355  template< typename T1, typename T2, typename T3 >
2356  struct UseDoublePrecisionComplexKernel {
2357  typedef complex<double> Type;
2358  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2359  IsSame<typename T2::ElementType,Type>::value &&
2360  IsSame<typename T3::ElementType,Type>::value };
2361  };
2362  //**********************************************************************************************
2363 
2364  //**********************************************************************************************
2366 
2368  template< typename T1, typename T2, typename T3, typename T4 >
2369  struct UseDefaultKernel {
2370  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2371  !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2372  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2373  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
2374  };
2375  //**********************************************************************************************
2376 
2377  //**********************************************************************************************
2379 
2381  template< typename T1, typename T2, typename T3, typename T4 >
2382  struct UseVectorizedDefaultKernel {
2383  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2384  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2385  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2386  IsSame<typename T1::ElementType,T4>::value &&
2387  IntrinsicTrait<typename T1::ElementType>::addition &&
2388  IntrinsicTrait<typename T1::ElementType>::multiplication };
2389  };
2390  //**********************************************************************************************
2391 
2392  public:
2393  //**Type definitions****************************************************************************
// Public type definitions of the scaled matrix/matrix multiplication expression.
2394  typedef DMatScalarMultExpr<MMM,ST,true> This; // Type of this expression specialization
2395  typedef typename MultTrait<RES,ST>::Type ResultType; // Result type obtained from MultTrait for RES and the scalar type ST
2396  typedef typename ResultType::OppositeType OppositeType; // OppositeType as defined by ResultType
2397  typedef typename ResultType::TransposeType TransposeType; // TransposeType as defined by ResultType
2398  typedef typename ResultType::ElementType ElementType; // Element type of ResultType
2399  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType; // Intrinsic (SIMD) type associated with ElementType
2400  typedef const ElementType ReturnType; // Const-qualified element type
2401  typedef const ResultType CompositeType; // Const-qualified result type
2402 
// Type of the left-hand side operand: the wrapped matrix/matrix multiplication expression.
2404  typedef const TDMatDMatMultExpr<MT1,MT2> LeftOperand;
2405 
// Type of the right-hand side operand: the scalar.
2407  typedef ST RightOperand;
2408 
// Type used by assign() to hold the evaluated left-hand side matrix operand; selected between
// const RT1 and CT1 depending on IsComputation<MT1>.
2410  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
2411 
// Type used by assign() to hold the evaluated right-hand side matrix operand; selected between
// const RT2 and CT2 depending on IsComputation<MT2>.
2413  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
2414  //**********************************************************************************************
2415 
2416  //**Compilation flags***************************************************************************
// Compilation flag: this expression reports itself as not vectorizable (value 0).
2418  enum { vectorizable = 0 };
2419  //**********************************************************************************************
2420 
2421  //**Constructor*********************************************************************************
2427  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
2428  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
2429  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2430  {}
2431  //**********************************************************************************************
2432 
2433  //**Access operator*****************************************************************************
2440  inline ResultType operator()( size_t i, size_t j ) const {
2441  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
2442  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
2443  return matrix_(i,j) * scalar_;
2444  }
2445  //**********************************************************************************************
2446 
2447  //**Rows function*******************************************************************************
2452  inline size_t rows() const {
2453  return matrix_.rows();
2454  }
2455  //**********************************************************************************************
2456 
2457  //**Columns function****************************************************************************
2462  inline size_t columns() const {
2463  return matrix_.columns();
2464  }
2465  //**********************************************************************************************
2466 
2467  //**Left operand access*************************************************************************
2472  inline LeftOperand leftOperand() const {
2473  return matrix_;
2474  }
2475  //**********************************************************************************************
2476 
2477  //**Right operand access************************************************************************
2482  inline RightOperand rightOperand() const {
2483  return scalar_;
2484  }
2485  //**********************************************************************************************
2486 
2487  //**********************************************************************************************
2493  template< typename T >
2494  inline bool canAlias( const T* alias ) const {
2495  return matrix_.canAlias( alias );
2496  }
2497  //**********************************************************************************************
2498 
2499  //**********************************************************************************************
2505  template< typename T >
2506  inline bool isAliased( const T* alias ) const {
2507  return matrix_.isAliased( alias );
2508  }
2509  //**********************************************************************************************
2510 
2511  private:
2512  //**Member variables****************************************************************************
2513  LeftOperand matrix_; // Left-hand side operand: the matrix/matrix multiplication expression being scaled
2514  RightOperand scalar_; // Right-hand side operand: the scalar the product is multiplied by
2515  //**********************************************************************************************
2516 
2517  //**Assignment to dense matrices****************************************************************
2526  template< typename MT3 // Type of the target dense matrix
2527  , bool SO > // Storage order of the target dense matrix
2528  friend inline void assign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2529  {
2531 
2532  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2533  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2534 
2535  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2536  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2537 
2538  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
2539  return;
2540  }
2541  else if( left.columns() == 0UL ) {
2542  reset( ~lhs );
2543  return;
2544  }
2545 
2546  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2547  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2548 
2549  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2550  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2551  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2552  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2553  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2554  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2555 
2556  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
2557  DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2558  else
2559  DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
2560  }
2561  //**********************************************************************************************
2562 
2563  //**Default assignment to dense matrices********************************************************
2577  template< typename MT3 // Type of the left-hand side target matrix
2578  , typename MT4 // Type of the left-hand side matrix operand
2579  , typename MT5 // Type of the right-hand side matrix operand
2580  , typename ST2 > // Type of the scalar value
2581  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2582  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2583  {
2584  for( size_t i=0UL; i<A.rows(); ++i ) {
2585  for( size_t k=0UL; k<B.columns(); ++k ) {
2586  C(i,k) = A(i,0UL) * B(0UL,k);
2587  }
2588  for( size_t j=1UL; j<A.columns(); ++j ) {
2589  for( size_t k=0UL; k<B.columns(); ++k ) {
2590  C(i,k) += A(i,j) * B(j,k);
2591  }
2592  }
2593  for( size_t k=0UL; k<B.columns(); ++k ) {
2594  C(i,k) *= scalar;
2595  }
2596  }
2597  }
2598  //**********************************************************************************************
2599 
2600  //**Vectorized default assignment to row-major dense matrices***********************************
// Vectorized default kernel for the assignment C = ( A * B ) * scalar. This overload takes the
// target as DenseMatrix<MT3,false> (the row-major case) and is enabled through
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>.
//
// The products set( A(i,k) ) * B.get(k,...) are accumulated over the complete inner dimension K,
// the sum is multiplied by 'factor' (built from the scalar via set()) and stored to C. The columns
// are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s); one vector spans IT::size
// elements.
//
// C: target matrix, overwritten
// A: left-hand side operand, read element-wise through A(i,k) and passed through set()
// B: right-hand side operand, read through get() at column offsets that are multiples of IT::size
// scalar: scaling factor
//
// NOTE(review): the accumulators xmm* are declared without an initializer; the kernel relies on a
// default-constructed IntrinsicType starting at zero -- confirm against the intrinsic types.
// NOTE(review): N is taken from B.spacing() and not from B.columns(), so the loops also cover the
// range between columns() and spacing(). This presumably relies on C and B sharing the same padded
// row length and on spacing() being a multiple of IT::size -- confirm against the matrix types.
2614  template< typename MT3 // Type of the left-hand side target matrix
2615  , typename MT4 // Type of the left-hand side matrix operand
2616  , typename MT5 // Type of the right-hand side matrix operand
2617  , typename ST2 > // Type of the scalar value
2618  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2619  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2620  {
2621  typedef IntrinsicTrait<ElementType> IT;
2622 
2623  const size_t M( A.rows() );
2624  const size_t N( B.spacing() );
2625  const size_t K( A.columns() );
2626 
2627  const IntrinsicType factor( set( scalar ) );
2628 
2629  size_t j( 0UL );
2630 
// Blocks of eight vectors along the columns, one row of C at a time.
2631  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2632  for( size_t i=0UL; i<M; ++i ) {
2633  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2634  for( size_t k=0UL; k<K; ++k ) {
2635  const IntrinsicType a1( set( A(i,k) ) );
2636  xmm1 = xmm1 + a1 * B.get(k,j );
2637  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2638  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2639  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2640  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
2641  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
2642  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
2643  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
2644  }
2645  store( &(~C)(i,j ), xmm1 * factor );
2646  store( &(~C)(i,j+IT::size ), xmm2 * factor );
2647  store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2648  store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2649  store( &(~C)(i,j+IT::size*4UL), xmm5 * factor );
2650  store( &(~C)(i,j+IT::size*5UL), xmm6 * factor );
2651  store( &(~C)(i,j+IT::size*6UL), xmm7 * factor );
2652  store( &(~C)(i,j+IT::size*7UL), xmm8 * factor );
2653  }
2654  }
// Blocks of four vectors along the columns, two rows of C at a time (xmm1-4: row i,
// xmm5-8: row i+1).
2655  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2656  size_t i( 0UL );
2657  for( ; (i+2UL) <= M; i+=2UL ) {
2658  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2659  for( size_t k=0UL; k<K; ++k ) {
2660  const IntrinsicType a1( set( A(i ,k) ) );
2661  const IntrinsicType a2( set( A(i+1UL,k) ) );
2662  const IntrinsicType b1( B.get(k,j ) );
2663  const IntrinsicType b2( B.get(k,j+IT::size ) );
2664  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
2665  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
2666  xmm1 = xmm1 + a1 * b1;
2667  xmm2 = xmm2 + a1 * b2;
2668  xmm3 = xmm3 + a1 * b3;
2669  xmm4 = xmm4 + a1 * b4;
2670  xmm5 = xmm5 + a2 * b1;
2671  xmm6 = xmm6 + a2 * b2;
2672  xmm7 = xmm7 + a2 * b3;
2673  xmm8 = xmm8 + a2 * b4;
2674  }
2675  store( &(~C)(i ,j ), xmm1 * factor );
2676  store( &(~C)(i ,j+IT::size ), xmm2 * factor );
2677  store( &(~C)(i ,j+IT::size*2UL), xmm3 * factor );
2678  store( &(~C)(i ,j+IT::size*3UL), xmm4 * factor );
2679  store( &(~C)(i+1UL,j ), xmm5 * factor );
2680  store( &(~C)(i+1UL,j+IT::size ), xmm6 * factor );
2681  store( &(~C)(i+1UL,j+IT::size*2UL), xmm7 * factor );
2682  store( &(~C)(i+1UL,j+IT::size*3UL), xmm8 * factor );
2683  }
// Leftover single row when M is odd.
2684  if( i < M ) {
2685  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2686  for( size_t k=0UL; k<K; ++k ) {
2687  const IntrinsicType a1( set( A(i,k) ) );
2688  xmm1 = xmm1 + a1 * B.get(k,j );
2689  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
2690  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
2691  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
2692  }
2693  store( &(~C)(i,j ), xmm1 * factor );
2694  store( &(~C)(i,j+IT::size ), xmm2 * factor );
2695  store( &(~C)(i,j+IT::size*2UL), xmm3 * factor );
2696  store( &(~C)(i,j+IT::size*3UL), xmm4 * factor );
2697  }
2698  }
// Blocks of two vectors along the columns, two rows of C at a time.
2699  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2700  size_t i( 0UL );
2701  for( ; (i+2UL) <= M; i+=2UL ) {
2702  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2703  for( size_t k=0UL; k<K; ++k ) {
2704  const IntrinsicType a1( set( A(i ,k) ) );
2705  const IntrinsicType a2( set( A(i+1UL,k) ) );
2706  const IntrinsicType b1( B.get(k,j ) );
2707  const IntrinsicType b2( B.get(k,j+IT::size) );
2708  xmm1 = xmm1 + a1 * b1;
2709  xmm2 = xmm2 + a1 * b2;
2710  xmm3 = xmm3 + a2 * b1;
2711  xmm4 = xmm4 + a2 * b2;
2712  }
2713  store( &(~C)(i ,j ), xmm1 * factor );
2714  store( &(~C)(i ,j+IT::size), xmm2 * factor );
2715  store( &(~C)(i+1UL,j ), xmm3 * factor );
2716  store( &(~C)(i+1UL,j+IT::size), xmm4 * factor );
2717  }
// Leftover single row when M is odd.
2718  if( i < M ) {
2719  IntrinsicType xmm1, xmm2;
2720  for( size_t k=0UL; k<K; ++k ) {
2721  const IntrinsicType a1( set( A(i,k) ) );
2722  xmm1 = xmm1 + a1 * B.get(k,j );
2723  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
2724  }
2725  store( &(~C)(i,j ), xmm1 * factor );
2726  store( &(~C)(i,j+IT::size), xmm2 * factor );
2727  }
2728  }
// Remaining columns: exactly one vector starting at column offset j is processed here.
2729  if( j < N ) {
2730  size_t i( 0UL );
2731  for( ; (i+2UL) <= M; i+=2UL ) {
2732  IntrinsicType xmm1, xmm2;
2733  for( size_t k=0UL; k<K; ++k ) {
2734  const IntrinsicType b1( B.get(k,j) );
2735  xmm1 = xmm1 + set( A(i ,k) ) * b1;
2736  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
2737  }
2738  store( &(~C)(i ,j), xmm1 * factor );
2739  store( &(~C)(i+1UL,j), xmm2 * factor );
2740  }
// Leftover single row when M is odd.
2741  if( i < M ) {
2742  IntrinsicType xmm1;
2743  for( size_t k=0UL; k<K; ++k ) {
2744  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
2745  }
2746  store( &(~C)(i,j), xmm1 * factor );
2747  }
2748  }
2749  }
2750  //**********************************************************************************************
2751 
2752  //**Vectorized default assignment to column-major dense matrices********************************
// \brief Vectorized default assignment kernel, C = (A * B) * scalar, for column-major targets.
//
// \param C The target dense matrix (DenseMatrix<MT3,true>); its elements are overwritten.
// \param A The left-hand side multiplication operand (read as intrinsic vectors via A.get).
// \param B The right-hand side multiplication operand (read element-wise and broadcast via set).
// \param scalar The scaling factor; broadcast once into \a factor and applied at store time.
// \return void (spelled as EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type).
//
// The rows are traversed in blocks of IT::size*8, IT::size*4 and IT::size*2 elements, followed
// by a final single-vector block. For every block the products are accumulated over k in
// intrinsic registers and written to C with one store per vector.
// NOTE(review): the row bound M is A.spacing(), not A.rows(); this presumably relies on padded
// storage in both A and C -- confirm.
// NOTE(review): the xmm accumulators are only default-constructed; this presumably relies on
// IntrinsicType initializing itself to zero -- confirm against the intrinsic type definition.
2766  template< typename MT3 // Type of the left-hand side target matrix
2767  , typename MT4 // Type of the left-hand side matrix operand
2768  , typename MT5 // Type of the right-hand side matrix operand
2769  , typename ST2 > // Type of the scalar value
2770  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2771  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
2772  {
2773  typedef IntrinsicTrait<ElementType> IT;
2774 
2775  const size_t M( A.spacing() );
2776  const size_t N( B.columns() );
2777  const size_t K( A.columns() );
2778 
      // Broadcast the scalar once; every result vector is multiplied by it before the store.
2779  const IntrinsicType factor( set( scalar ) );
2780 
2781  size_t i( 0UL );
2782 
      // Row blocks of 8 intrinsic vectors, one column of C per inner iteration.
2783  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2784  for( size_t j=0UL; j<N; ++j ) {
2785  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2786  for( size_t k=0UL; k<K; ++k ) {
2787  const IntrinsicType b1( set( B(k,j) ) );
2788  xmm1 = xmm1 + A.get(i ,k) * b1;
2789  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2790  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2791  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2792  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
2793  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
2794  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
2795  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
2796  }
2797  store( &(~C)(i ,j), xmm1 * factor );
2798  store( &(~C)(i+IT::size ,j), xmm2 * factor );
2799  store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2800  store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2801  store( &(~C)(i+IT::size*4UL,j), xmm5 * factor );
2802  store( &(~C)(i+IT::size*5UL,j), xmm6 * factor );
2803  store( &(~C)(i+IT::size*6UL,j), xmm7 * factor );
2804  store( &(~C)(i+IT::size*7UL,j), xmm8 * factor );
2805  }
2806  }
      // Row blocks of 4 intrinsic vectors, two columns of C per inner iteration
      // (xmm1-xmm4 belong to column j, xmm5-xmm8 to column j+1).
2807  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2808  size_t j( 0UL );
2809  for( ; (j+2UL) <= N; j+=2UL ) {
2810  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2811  for( size_t k=0UL; k<K; ++k ) {
2812  const IntrinsicType a1( A.get(i ,k) );
2813  const IntrinsicType a2( A.get(i+IT::size ,k) );
2814  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
2815  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
2816  const IntrinsicType b1( set( B(k,j ) ) );
2817  const IntrinsicType b2( set( B(k,j+1UL) ) );
2818  xmm1 = xmm1 + a1 * b1;
2819  xmm2 = xmm2 + a2 * b1;
2820  xmm3 = xmm3 + a3 * b1;
2821  xmm4 = xmm4 + a4 * b1;
2822  xmm5 = xmm5 + a1 * b2;
2823  xmm6 = xmm6 + a2 * b2;
2824  xmm7 = xmm7 + a3 * b2;
2825  xmm8 = xmm8 + a4 * b2;
2826  }
2827  store( &(~C)(i ,j ), xmm1 * factor );
2828  store( &(~C)(i+IT::size ,j ), xmm2 * factor );
2829  store( &(~C)(i+IT::size*2UL,j ), xmm3 * factor );
2830  store( &(~C)(i+IT::size*3UL,j ), xmm4 * factor );
2831  store( &(~C)(i ,j+1UL), xmm5 * factor );
2832  store( &(~C)(i+IT::size ,j+1UL), xmm6 * factor );
2833  store( &(~C)(i+IT::size*2UL,j+1UL), xmm7 * factor );
2834  store( &(~C)(i+IT::size*3UL,j+1UL), xmm8 * factor );
2835  }
      // Odd number of columns: handle the last column of this row block on its own.
2836  if( j < N ) {
2837  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2838  for( size_t k=0UL; k<K; ++k ) {
2839  const IntrinsicType b1( set( B(k,j) ) );
2840  xmm1 = xmm1 + A.get(i ,k) * b1;
2841  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
2842  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
2843  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
2844  }
2845  store( &(~C)(i ,j), xmm1 * factor );
2846  store( &(~C)(i+IT::size ,j), xmm2 * factor );
2847  store( &(~C)(i+IT::size*2UL,j), xmm3 * factor );
2848  store( &(~C)(i+IT::size*3UL,j), xmm4 * factor );
2849  }
2850  }
      // Row blocks of 2 intrinsic vectors, two columns of C per inner iteration.
2851  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2852  size_t j( 0UL );
2853  for( ; (j+2UL) <= N; j+=2UL ) {
2854  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2855  for( size_t k=0UL; k<K; ++k ) {
2856  const IntrinsicType a1( A.get(i ,k) );
2857  const IntrinsicType a2( A.get(i+IT::size,k) );
2858  const IntrinsicType b1( set( B(k,j ) ) );
2859  const IntrinsicType b2( set( B(k,j+1UL) ) );
2860  xmm1 = xmm1 + a1 * b1;
2861  xmm2 = xmm2 + a2 * b1;
2862  xmm3 = xmm3 + a1 * b2;
2863  xmm4 = xmm4 + a2 * b2;
2864  }
2865  store( &(~C)(i ,j ), xmm1 * factor );
2866  store( &(~C)(i+IT::size,j ), xmm2 * factor );
2867  store( &(~C)(i ,j+1UL), xmm3 * factor );
2868  store( &(~C)(i+IT::size,j+1UL), xmm4 * factor );
2869  }
      // Odd number of columns: last column of this row block.
2870  if( j < N ) {
2871  IntrinsicType xmm1, xmm2;
2872  for( size_t k=0UL; k<K; ++k ) {
2873  const IntrinsicType b1( set( B(k,j) ) );
2874  xmm1 = xmm1 + A.get(i ,k) * b1;
2875  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
2876  }
2877  store( &(~C)(i ,j), xmm1 * factor );
2878  store( &(~C)(i+IT::size,j), xmm2 * factor );
2879  }
2880  }
      // Remaining rows: a single intrinsic vector starting at row i.
2881  if( i < M ) {
2882  size_t j( 0UL );
2883  for( ; (j+2UL) <= N; j+=2UL ) {
2884  IntrinsicType xmm1, xmm2;
2885  for( size_t k=0UL; k<K; ++k ) {
2886  const IntrinsicType a1( A.get(i,k) );
2887  xmm1 = xmm1 + a1 * set( B(k,j ) );
2888  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
2889  }
2890  store( &(~C)(i,j ), xmm1 * factor );
2891  store( &(~C)(i,j+1UL), xmm2 * factor );
2892  }
      // Odd number of columns: last column of the final row vector.
2893  if( j < N ) {
2894  IntrinsicType xmm1;
2895  for( size_t k=0UL; k<K; ++k ) {
2896  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
2897  }
2898  store( &(~C)(i,j), xmm1 * factor );
2899  }
2900  }
2901  }
2902  //**********************************************************************************************
2903 
2904  //**BLAS-based assignment to dense matrices (default)*******************************************
// \brief Fallback "BLAS" assignment kernel used when UseDefaultKernel<MT3,MT4,MT5,ST2> is selected.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void (spelled as EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type).
//
// No BLAS routine is called here; the arguments are forwarded unchanged to
// selectDefaultAssignKernel().
2918  template< typename MT3 // Type of the left-hand side target matrix
2919  , typename MT4 // Type of the left-hand side matrix operand
2920  , typename MT5 // Type of the right-hand side matrix operand
2921  , typename ST2 > // Type of the scalar value
2922  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2923  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2924  {
2925  selectDefaultAssignKernel( C, A, B, scalar );
2926  }
2927  //**********************************************************************************************
2928 
2929  //**BLAS-based assignment to dense matrices (single precision)**********************************
2930 #if BLAZE_BLAS_MODE
2931 
// \brief BLAS-based assignment kernel for single precision operands (cblas_sgemm).
//
// \param C The target left-hand side dense matrix; receives scalar * A * B.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor, passed to BLAS as alpha.
// \return void (spelled as EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type).
//
// beta is 0.0F, so the gemm result replaces the contents of C instead of being added to them.
// All dimensions and leading dimensions are converted from size_t to int via
// boost::numeric_cast before being handed to the BLAS interface.
2944  template< typename MT3 // Type of the left-hand side target matrix
2945  , typename MT4 // Type of the left-hand side matrix operand
2946  , typename MT5 // Type of the right-hand side matrix operand
2947  , typename ST2 > // Type of the scalar value
2948  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2949  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2950  {
2951  using boost::numeric_cast;
2952 
      // Going by the macro names, all three element types are required to be float.
2953  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
2954  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
2955  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
2956 
      // Problem size (M x K times K x N) and the leading dimensions of the three buffers.
2957  const int M ( numeric_cast<int>( A.rows() ) );
2958  const int N ( numeric_cast<int>( B.columns() ) );
2959  const int K ( numeric_cast<int>( A.columns() ) );
2960  const int lda( numeric_cast<int>( A.spacing() ) );
2961  const int ldb( numeric_cast<int>( B.spacing() ) );
2962  const int ldc( numeric_cast<int>( C.spacing() ) );
2963 
      // The storage order passed to BLAS follows the target C. For a row-major C the operand A
      // is flagged as transposed, for a column-major C the operand B is flagged as transposed.
2964  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2965  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2966  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2967  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
2968  }
2969 #endif
2970  //**********************************************************************************************
2971 
2972  //**BLAS-based assignment to dense matrices (double precision)**********************************
2973 #if BLAZE_BLAS_MODE
2974 
// \brief BLAS-based assignment kernel for double precision operands (cblas_dgemm).
//
// \param C The target left-hand side dense matrix; receives scalar * A * B.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor, passed to BLAS as alpha.
// \return void (spelled as EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type).
//
// beta is 0.0, so the gemm result replaces the contents of C instead of being added to them.
// All dimensions and leading dimensions are converted from size_t to int via
// boost::numeric_cast before being handed to the BLAS interface.
2987  template< typename MT3 // Type of the left-hand side target matrix
2988  , typename MT4 // Type of the left-hand side matrix operand
2989  , typename MT5 // Type of the right-hand side matrix operand
2990  , typename ST2 > // Type of the scalar value
2991  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2992  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2993  {
2994  using boost::numeric_cast;
2995 
      // Going by the macro names, all three element types are required to be double.
2996  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
2997  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
2998  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
2999 
      // Problem size (M x K times K x N) and the leading dimensions of the three buffers.
3000  const int M ( numeric_cast<int>( A.rows() ) );
3001  const int N ( numeric_cast<int>( B.columns() ) );
3002  const int K ( numeric_cast<int>( A.columns() ) );
3003  const int lda( numeric_cast<int>( A.spacing() ) );
3004  const int ldb( numeric_cast<int>( B.spacing() ) );
3005  const int ldc( numeric_cast<int>( C.spacing() ) );
3006 
      // The storage order passed to BLAS follows the target C. For a row-major C the operand A
      // is flagged as transposed, for a column-major C the operand B is flagged as transposed.
3007  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3008  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3009  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3010  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
3011  }
3012 #endif
3013  //**********************************************************************************************
3014 
3015  //**BLAS-based assignment to dense matrices (single precision complex)**************************
3016 #if BLAZE_BLAS_MODE
3017 
// \brief BLAS-based assignment kernel for single precision complex operands (cblas_cgemm).
//
// \param C The target left-hand side dense matrix; receives scalar * A * B.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor; converted to complex<float> and passed as alpha.
// \return void (spelled as EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type).
//
// Unlike the real-valued kernels, the enabling trait does not take the scalar type ST2.
// alpha and beta are handed to BLAS by address. beta is (0,0), so the gemm result replaces the
// contents of C instead of being added to them.
3030  template< typename MT3 // Type of the left-hand side target matrix
3031  , typename MT4 // Type of the left-hand side matrix operand
3032  , typename MT5 // Type of the right-hand side matrix operand
3033  , typename ST2 > // Type of the scalar value
3034  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3035  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3036  {
3037  using boost::numeric_cast;
3038 
      // Going by the macro names, all element types must be complex with a float value_type.
3039  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3040  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3041  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3042  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3043  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3044  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3045 
      // Problem size, leading dimensions and the two complex gemm coefficients.
3046  const int M ( numeric_cast<int>( A.rows() ) );
3047  const int N ( numeric_cast<int>( B.columns() ) );
3048  const int K ( numeric_cast<int>( A.columns() ) );
3049  const int lda( numeric_cast<int>( A.spacing() ) );
3050  const int ldb( numeric_cast<int>( B.spacing() ) );
3051  const int ldc( numeric_cast<int>( C.spacing() ) );
3052  const complex<float> alpha( scalar );
3053  const complex<float> beta ( 0.0F, 0.0F );
3054 
      // The storage order passed to BLAS follows the target C. For a row-major C the operand A
      // is flagged as transposed, for a column-major C the operand B is flagged as transposed.
3055  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3056  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3057  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3058  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3059  }
3060 #endif
3061  //**********************************************************************************************
3062 
3063  //**BLAS-based assignment to dense matrices (double precision complex)**************************
3064 #if BLAZE_BLAS_MODE
3065 
// \brief BLAS-based assignment kernel for double precision complex operands (cblas_zgemm).
//
// \param C The target left-hand side dense matrix; receives scalar * A * B.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor; converted to complex<double> and passed as alpha.
// \return void (spelled as EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type).
//
// Unlike the real-valued kernels, the enabling trait does not take the scalar type ST2.
// alpha and beta are handed to BLAS by address. beta is (0,0), so the gemm result replaces the
// contents of C instead of being added to them.
3078  template< typename MT3 // Type of the left-hand side target matrix
3079  , typename MT4 // Type of the left-hand side matrix operand
3080  , typename MT5 // Type of the right-hand side matrix operand
3081  , typename ST2 > // Type of the scalar value
3082  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3083  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3084  {
3085  using boost::numeric_cast;
3086 
      // Going by the macro names, all element types must be complex with a double value_type.
3087  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3088  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3089  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3090  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3091  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3092  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3093 
      // Problem size, leading dimensions and the two complex gemm coefficients.
3094  const int M ( numeric_cast<int>( A.rows() ) );
3095  const int N ( numeric_cast<int>( B.columns() ) );
3096  const int K ( numeric_cast<int>( A.columns() ) );
3097  const int lda( numeric_cast<int>( A.spacing() ) );
3098  const int ldb( numeric_cast<int>( B.spacing() ) );
3099  const int ldc( numeric_cast<int>( C.spacing() ) );
3100  const complex<double> alpha( scalar );
3101  const complex<double> beta ( 0.0, 0.0 );
3102 
      // The storage order passed to BLAS follows the target C. For a row-major C the operand A
      // is flagged as transposed, for a column-major C the operand B is flagged as transposed.
3103  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3104  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3105  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3106  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3107  }
3108 #endif
3109  //**********************************************************************************************
3110 
3111  //**Assignment to sparse matrices***************************************************************
// \brief Assignment of the scaled matrix multiplication expression to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// The expression is first evaluated into a temporary and the temporary is then assigned to
// \a lhs. The temporary is of type ResultType when SO is true and OppositeType otherwise.
3123  template< typename MT // Type of the target sparse matrix
3124  , bool SO > // Storage order of the target sparse matrix
3125  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
3126  {
3128 
      // Pick the temporary's type based on the storage order flag of the target.
3129  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3130 
3136  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( typename TmpType::CompositeType );
3137 
3138  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3139  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3140 
      // Evaluate the whole expression once, then hand the result to the matching assign().
3141  const TmpType tmp( rhs );
3142  assign( ~lhs, tmp );
3143  }
3144  //**********************************************************************************************
3145 
3146  //**Addition assignment to dense matrices*******************************************************
// \brief Addition assignment of the scaled matrix multiplication expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
// \return void
//
// The function returns without touching \a lhs if the target has no rows or no columns or if
// the left operand has no columns. Otherwise both operands are evaluated (types LT and RT) and
// one of two kernels is chosen: the default kernel when rows*columns of the target is below
// TDMATDMATMULT_THRESHOLD, the BLAS kernel otherwise.
3158  template< typename MT3 // Type of the target dense matrix
3159  , bool SO > // Storage order of the target dense matrix
3160  friend inline void addAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3161  {
3163 
3164  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3165  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3166 
      // Operands of the wrapped matrix multiplication.
3167  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3168  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3169 
      // Nothing to add for an empty target or an empty inner dimension.
3170  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3171  return;
3172  }
3173 
3174  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3175  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3176 
3177  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3178  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3179  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3180  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3181  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3182  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3183 
      // Size-based kernel selection; the scalar of the expression is passed along unchanged.
3184  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
3185  DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3186  else
3187  DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3188  }
3189  //**********************************************************************************************
3190 
3191  //**Default addition assignment to dense matrices***********************************************
// \brief Non-vectorized default addition assignment kernel, C += A * B * scalar.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void (spelled as DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type).
//
// This overload is the DisableIf counterpart of the vectorized kernels. It evaluates the
// scaled product into a temporary of type ResultType and adds that temporary to \a C through
// addAssign().
3205  template< typename MT3 // Type of the left-hand side target matrix
3206  , typename MT4 // Type of the left-hand side matrix operand
3207  , typename MT5 // Type of the right-hand side matrix operand
3208  , typename ST2 > // Type of the scalar value
3209  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3210  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3211  {
3212  const ResultType tmp( A * B * scalar );
3213  addAssign( C, tmp );
3214  }
3215  //**********************************************************************************************
3216 
3217  //**Vectorized default addition assignment to row-major dense matrices**************************
// \brief Vectorized default addition assignment kernel, C += (A * B) * scalar, for row-major
//        targets.
//
// \param C The target dense matrix (DenseMatrix<MT3,false>); updated in place.
// \param A The left-hand side multiplication operand (read element-wise and broadcast via set).
// \param B The right-hand side multiplication operand (read as intrinsic vectors via B.get).
// \param scalar The scaling factor; broadcast once into \a factor and applied at store time.
// \return void (spelled as EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type).
//
// The columns are traversed in blocks of IT::size*8, IT::size*4 and IT::size*2 elements,
// followed by a final single-vector block. For every block the products are accumulated over k
// in intrinsic registers; each store writes load(C) + accumulator * factor back to C.
// NOTE(review): the column bound N is B.spacing(), not B.columns(); this presumably relies on
// padded storage in both B and C -- confirm.
// NOTE(review): the xmm accumulators are only default-constructed; this presumably relies on
// IntrinsicType initializing itself to zero -- confirm against the intrinsic type definition.
3231  template< typename MT3 // Type of the left-hand side target matrix
3232  , typename MT4 // Type of the left-hand side matrix operand
3233  , typename MT5 // Type of the right-hand side matrix operand
3234  , typename ST2 > // Type of the scalar value
3235  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3236  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3237  {
3238  typedef IntrinsicTrait<ElementType> IT;
3239 
3240  const size_t M( A.rows() );
3241  const size_t N( B.spacing() );
3242  const size_t K( A.columns() );
3243 
      // Broadcast the scalar once; every result vector is multiplied by it before the update.
3244  const IntrinsicType factor( set( scalar ) );
3245 
3246  size_t j( 0UL );
3247 
      // Column blocks of 8 intrinsic vectors, one row of C per inner iteration.
3248  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3249  for( size_t i=0UL; i<M; ++i ) {
3250  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3251  for( size_t k=0UL; k<K; ++k ) {
3252  const IntrinsicType a1( set( A(i,k) ) );
3253  xmm1 = xmm1 + a1 * B.get(k,j );
3254  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3255  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3256  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3257  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3258  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3259  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3260  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3261  }
3262  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3263  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3264  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3265  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
3266  store( &(~C)(i,j+IT::size*4UL), load( &(~C)(i,j+IT::size*4UL) ) + xmm5 * factor );
3267  store( &(~C)(i,j+IT::size*5UL), load( &(~C)(i,j+IT::size*5UL) ) + xmm6 * factor );
3268  store( &(~C)(i,j+IT::size*6UL), load( &(~C)(i,j+IT::size*6UL) ) + xmm7 * factor );
3269  store( &(~C)(i,j+IT::size*7UL), load( &(~C)(i,j+IT::size*7UL) ) + xmm8 * factor );
3270  }
3271  }
      // Column blocks of 4 intrinsic vectors, two rows of C per inner iteration
      // (xmm1-xmm4 belong to row i, xmm5-xmm8 to row i+1).
3272  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3273  size_t i( 0UL );
3274  for( ; (i+2UL) <= M; i+=2UL ) {
3275  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3276  for( size_t k=0UL; k<K; ++k ) {
3277  const IntrinsicType a1( set( A(i ,k) ) );
3278  const IntrinsicType a2( set( A(i+1UL,k) ) );
3279  const IntrinsicType b1( B.get(k,j ) );
3280  const IntrinsicType b2( B.get(k,j+IT::size ) );
3281  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
3282  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
3283  xmm1 = xmm1 + a1 * b1;
3284  xmm2 = xmm2 + a1 * b2;
3285  xmm3 = xmm3 + a1 * b3;
3286  xmm4 = xmm4 + a1 * b4;
3287  xmm5 = xmm5 + a2 * b1;
3288  xmm6 = xmm6 + a2 * b2;
3289  xmm7 = xmm7 + a2 * b3;
3290  xmm8 = xmm8 + a2 * b4;
3291  }
3292  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3293  store( &(~C)(i ,j+IT::size ), load( &(~C)(i ,j+IT::size ) ) + xmm2 * factor );
3294  store( &(~C)(i ,j+IT::size*2UL), load( &(~C)(i ,j+IT::size*2UL) ) + xmm3 * factor );
3295  store( &(~C)(i ,j+IT::size*3UL), load( &(~C)(i ,j+IT::size*3UL) ) + xmm4 * factor );
3296  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) + xmm5 * factor );
3297  store( &(~C)(i+1UL,j+IT::size ), load( &(~C)(i+1UL,j+IT::size ) ) + xmm6 * factor );
3298  store( &(~C)(i+1UL,j+IT::size*2UL), load( &(~C)(i+1UL,j+IT::size*2UL) ) + xmm7 * factor );
3299  store( &(~C)(i+1UL,j+IT::size*3UL), load( &(~C)(i+1UL,j+IT::size*3UL) ) + xmm8 * factor );
3300  }
      // Odd number of rows: handle the last row of this column block on its own.
3301  if( i < M ) {
3302  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3303  for( size_t k=0UL; k<K; ++k ) {
3304  const IntrinsicType a1( set( A(i,k) ) );
3305  xmm1 = xmm1 + a1 * B.get(k,j );
3306  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3307  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3308  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3309  }
3310  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3311  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) + xmm2 * factor );
3312  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) + xmm3 * factor );
3313  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) + xmm4 * factor );
3314  }
3315  }
      // Column blocks of 2 intrinsic vectors, two rows of C per inner iteration.
3316  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3317  size_t i( 0UL );
3318  for( ; (i+2UL) <= M; i+=2UL ) {
3319  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3320  for( size_t k=0UL; k<K; ++k ) {
3321  const IntrinsicType a1( set( A(i ,k) ) );
3322  const IntrinsicType a2( set( A(i+1UL,k) ) );
3323  const IntrinsicType b1( B.get(k,j ) );
3324  const IntrinsicType b2( B.get(k,j+IT::size) );
3325  xmm1 = xmm1 + a1 * b1;
3326  xmm2 = xmm2 + a1 * b2;
3327  xmm3 = xmm3 + a2 * b1;
3328  xmm4 = xmm4 + a2 * b2;
3329  }
3330  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3331  store( &(~C)(i ,j+IT::size), load( &(~C)(i ,j+IT::size) ) + xmm2 * factor );
3332  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) + xmm3 * factor );
3333  store( &(~C)(i+1UL,j+IT::size), load( &(~C)(i+1UL,j+IT::size) ) + xmm4 * factor );
3334  }
      // Odd number of rows: last row of this column block.
3335  if( i < M ) {
3336  IntrinsicType xmm1, xmm2;
3337  for( size_t k=0UL; k<K; ++k ) {
3338  const IntrinsicType a1( set( A(i,k) ) );
3339  xmm1 = xmm1 + a1 * B.get(k,j );
3340  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3341  }
3342  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3343  store( &(~C)(i,j+IT::size), load( &(~C)(i,j+IT::size) ) + xmm2 * factor );
3344  }
3345  }
      // Remaining columns: a single intrinsic vector starting at column j.
3346  if( j < N ) {
3347  size_t i( 0UL );
3348  for( ; (i+2UL) <= M; i+=2UL ) {
3349  IntrinsicType xmm1, xmm2;
3350  for( size_t k=0UL; k<K; ++k ) {
3351  const IntrinsicType b1( B.get(k,j) );
3352  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3353  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3354  }
3355  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3356  store( &(~C)(i+1UL,j), load( &(~C)(i+1UL,j) ) + xmm2 * factor );
3357  }
      // Odd number of rows: last row of the final column vector.
3358  if( i < M ) {
3359  IntrinsicType xmm1;
3360  for( size_t k=0UL; k<K; ++k ) {
3361  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
3362  }
3363  store( &(~C)(i,j), load( &(~C)(i,j) ) + xmm1 * factor );
3364  }
3365  }
3366  }
3367  //**********************************************************************************************
3368 
3369  //**Vectorized default addition assignment to column-major dense matrices***********************
// \brief Vectorized default addition assignment kernel, C += (A * B) * scalar, for column-major
//        targets.
//
// \param C The target dense matrix (DenseMatrix<MT3,true>); updated in place.
// \param A The left-hand side multiplication operand (read as intrinsic vectors via A.get).
// \param B The right-hand side multiplication operand (read element-wise and broadcast via set).
// \param scalar The scaling factor; broadcast once into \a factor and applied at store time.
// \return void (spelled as EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type).
//
// The rows are traversed in blocks of IT::size*8, IT::size*4 and IT::size*2 elements, followed
// by a final single-vector block. For every block the products are accumulated over k in
// intrinsic registers; each store writes load(C) + accumulator * factor back to C.
// NOTE(review): the row bound M is A.spacing(), not A.rows(); this presumably relies on padded
// storage in both A and C -- confirm.
// NOTE(review): the xmm accumulators are only default-constructed; this presumably relies on
// IntrinsicType initializing itself to zero -- confirm against the intrinsic type definition.
3383  template< typename MT3 // Type of the left-hand side target matrix
3384  , typename MT4 // Type of the left-hand side matrix operand
3385  , typename MT5 // Type of the right-hand side matrix operand
3386  , typename ST2 > // Type of the scalar value
3387  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3388  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3389  {
3390  typedef IntrinsicTrait<ElementType> IT;
3391 
3392  const size_t M( A.spacing() );
3393  const size_t N( B.columns() );
3394  const size_t K( A.columns() );
3395 
      // Broadcast the scalar once; every result vector is multiplied by it before the update.
3396  const IntrinsicType factor( set( scalar ) );
3397 
3398  size_t i( 0UL );
3399 
      // Row blocks of 8 intrinsic vectors, one column of C per inner iteration.
3400  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3401  for( size_t j=0UL; j<N; ++j ) {
3402  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3403  for( size_t k=0UL; k<K; ++k ) {
3404  const IntrinsicType b1( set( B(k,j) ) );
3405  xmm1 = xmm1 + A.get(i ,k) * b1;
3406  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3407  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3408  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3409  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3410  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3411  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3412  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
3413  }
3414  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3415  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3416  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3417  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
3418  store( &(~C)(i+IT::size*4UL,j), load( &(~C)(i+IT::size*4UL,j) ) + xmm5 * factor );
3419  store( &(~C)(i+IT::size*5UL,j), load( &(~C)(i+IT::size*5UL,j) ) + xmm6 * factor );
3420  store( &(~C)(i+IT::size*6UL,j), load( &(~C)(i+IT::size*6UL,j) ) + xmm7 * factor );
3421  store( &(~C)(i+IT::size*7UL,j), load( &(~C)(i+IT::size*7UL,j) ) + xmm8 * factor );
3422  }
3423  }
      // Row blocks of 4 intrinsic vectors, two columns of C per inner iteration
      // (xmm1-xmm4 belong to column j, xmm5-xmm8 to column j+1).
3424  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
3425  size_t j( 0UL );
3426  for( ; (j+2UL) <= N; j+=2UL ) {
3427  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3428  for( size_t k=0UL; k<K; ++k ) {
3429  const IntrinsicType a1( A.get(i ,k) );
3430  const IntrinsicType a2( A.get(i+IT::size ,k) );
3431  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
3432  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
3433  const IntrinsicType b1( set( B(k,j ) ) );
3434  const IntrinsicType b2( set( B(k,j+1UL) ) );
3435  xmm1 = xmm1 + a1 * b1;
3436  xmm2 = xmm2 + a2 * b1;
3437  xmm3 = xmm3 + a3 * b1;
3438  xmm4 = xmm4 + a4 * b1;
3439  xmm5 = xmm5 + a1 * b2;
3440  xmm6 = xmm6 + a2 * b2;
3441  xmm7 = xmm7 + a3 * b2;
3442  xmm8 = xmm8 + a4 * b2;
3443  }
3444  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3445  store( &(~C)(i+IT::size ,j ), load( &(~C)(i+IT::size ,j ) ) + xmm2 * factor );
3446  store( &(~C)(i+IT::size*2UL,j ), load( &(~C)(i+IT::size*2UL,j ) ) + xmm3 * factor );
3447  store( &(~C)(i+IT::size*3UL,j ), load( &(~C)(i+IT::size*3UL,j ) ) + xmm4 * factor );
3448  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) + xmm5 * factor );
3449  store( &(~C)(i+IT::size ,j+1UL), load( &(~C)(i+IT::size ,j+1UL) ) + xmm6 * factor );
3450  store( &(~C)(i+IT::size*2UL,j+1UL), load( &(~C)(i+IT::size*2UL,j+1UL) ) + xmm7 * factor );
3451  store( &(~C)(i+IT::size*3UL,j+1UL), load( &(~C)(i+IT::size*3UL,j+1UL) ) + xmm8 * factor );
3452  }
      // Odd number of columns: handle the last column of this row block on its own.
3453  if( j < N ) {
3454  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3455  for( size_t k=0UL; k<K; ++k ) {
3456  const IntrinsicType b1( set( B(k,j) ) );
3457  xmm1 = xmm1 + A.get(i ,k) * b1;
3458  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3459  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3460  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3461  }
3462  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3463  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) + xmm2 * factor );
3464  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) + xmm3 * factor );
3465  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) + xmm4 * factor );
3466  }
3467  }
      // Row blocks of 2 intrinsic vectors, two columns of C per inner iteration.
3468  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
3469  size_t j( 0UL );
3470  for( ; (j+2UL) <= N; j+=2UL ) {
3471  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3472  for( size_t k=0UL; k<K; ++k ) {
3473  const IntrinsicType a1( A.get(i ,k) );
3474  const IntrinsicType a2( A.get(i+IT::size,k) );
3475  const IntrinsicType b1( set( B(k,j ) ) );
3476  const IntrinsicType b2( set( B(k,j+1UL) ) );
3477  xmm1 = xmm1 + a1 * b1;
3478  xmm2 = xmm2 + a2 * b1;
3479  xmm3 = xmm3 + a1 * b2;
3480  xmm4 = xmm4 + a2 * b2;
3481  }
3482  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) + xmm1 * factor );
3483  store( &(~C)(i+IT::size,j ), load( &(~C)(i+IT::size,j ) ) + xmm2 * factor );
3484  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) + xmm3 * factor );
3485  store( &(~C)(i+IT::size,j+1UL), load( &(~C)(i+IT::size,j+1UL) ) + xmm4 * factor );
3486  }
      // Odd number of columns: last column of this row block.
3487  if( j < N ) {
3488  IntrinsicType xmm1, xmm2;
3489  for( size_t k=0UL; k<K; ++k ) {
3490  const IntrinsicType b1( set( B(k,j) ) );
3491  xmm1 = xmm1 + A.get(i ,k) * b1;
3492  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
3493  }
3494  store( &(~C)(i ,j), load( &(~C)(i ,j) ) + xmm1 * factor );
3495  store( &(~C)(i+IT::size,j), load( &(~C)(i+IT::size,j) ) + xmm2 * factor );
3496  }
3497  }
      // Remaining rows: a single intrinsic vector starting at row i.
3498  if( i < M ) {
3499  size_t j( 0UL );
3500  for( ; (j+2UL) <= N; j+=2UL ) {
3501  IntrinsicType xmm1, xmm2;
3502  for( size_t k=0UL; k<K; ++k ) {
3503  const IntrinsicType a1( A.get(i,k) );
3504  xmm1 = xmm1 + a1 * set( B(k,j ) );
3505  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3506  }
3507  store( &(~C)(i,j ), load( &(~C)(i,j ) ) + xmm1 * factor );
3508  store( &(~C)(i,j+1UL), load( &(~C)(i,j+1UL) ) + xmm2 * factor );
3509  }
      // Odd number of columns: last column of the final row vector.
3510  if( j < N ) {
3511  IntrinsicType xmm1;
3512  for( size_t k=0UL; k<K; ++k ) {
3513  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
3514  }
3515  store( &(~C)(i,j), load( &(~C)(i,j) ) + xmm1 * factor );
3516  }
3517  }
3518  }
3519  //**********************************************************************************************
3520 
3521  //**BLAS-based addition assignment to dense matrices (default)**********************************
// \brief Fallback "BLAS" addition assignment kernel used when UseDefaultKernel<MT3,MT4,MT5,ST2>
//        is selected.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void (spelled as EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type).
//
// No BLAS routine is called here; the arguments are forwarded unchanged to
// selectDefaultAddAssignKernel().
3535  template< typename MT3 // Type of the left-hand side target matrix
3536  , typename MT4 // Type of the left-hand side matrix operand
3537  , typename MT5 // Type of the right-hand side matrix operand
3538  , typename ST2 > // Type of the scalar value
3539  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3540  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3541  {
3542  selectDefaultAddAssignKernel( C, A, B, scalar );
3543  }
3544  //**********************************************************************************************
3545 
3546  //**BLAS-based addition assignment to dense matrices (single precision)*************************
3547 #if BLAZE_BLAS_MODE
3548 
// \brief BLAS-based addition assignment kernel for single precision operands (cblas_sgemm).
//
// \param C The target left-hand side dense matrix; scalar * A * B is added to it.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor, passed to BLAS as alpha.
// \return void (spelled as EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type).
//
// beta is 1.0F, so the gemm result is accumulated onto the existing contents of C.
// All dimensions and leading dimensions are converted from size_t to int via
// boost::numeric_cast before being handed to the BLAS interface.
3561  template< typename MT3 // Type of the left-hand side target matrix
3562  , typename MT4 // Type of the left-hand side matrix operand
3563  , typename MT5 // Type of the right-hand side matrix operand
3564  , typename ST2 > // Type of the scalar value
3565  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3566  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3567  {
3568  using boost::numeric_cast;
3569 
      // Going by the macro names, all three element types are required to be float.
3570  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
3571  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
3572  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
3573 
      // Problem size (M x K times K x N) and the leading dimensions of the three buffers.
3574  const int M ( numeric_cast<int>( A.rows() ) );
3575  const int N ( numeric_cast<int>( B.columns() ) );
3576  const int K ( numeric_cast<int>( A.columns() ) );
3577  const int lda( numeric_cast<int>( A.spacing() ) );
3578  const int ldb( numeric_cast<int>( B.spacing() ) );
3579  const int ldc( numeric_cast<int>( C.spacing() ) );
3580 
      // The storage order passed to BLAS follows the target C. For a row-major C the operand A
      // is flagged as transposed, for a column-major C the operand B is flagged as transposed.
3581  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3582  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3583  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3584  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3585  }
3586 #endif
3587  //**********************************************************************************************
3588 
3589  //**BLAS-based addition assignment to dense matrices (double precision)*************************
3590 #if BLAZE_BLAS_MODE
3591 
// \brief BLAS-based addition assignment kernel for double precision operands (cblas_dgemm).
//
// \param C The target left-hand side dense matrix; scalar * A * B is added to it.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor, passed to BLAS as alpha.
// \return void (spelled as EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type).
//
// beta is 1.0, so the gemm result is accumulated onto the existing contents of C.
// All dimensions and leading dimensions are converted from size_t to int via
// boost::numeric_cast before being handed to the BLAS interface.
3604  template< typename MT3 // Type of the left-hand side target matrix
3605  , typename MT4 // Type of the left-hand side matrix operand
3606  , typename MT5 // Type of the right-hand side matrix operand
3607  , typename ST2 > // Type of the scalar value
3608  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3609  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3610  {
3611  using boost::numeric_cast;
3612 
      // Going by the macro names, all three element types are required to be double.
3613  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
3614  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
3615  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
3616 
      // Problem size (M x K times K x N) and the leading dimensions of the three buffers.
3617  const int M ( numeric_cast<int>( A.rows() ) );
3618  const int N ( numeric_cast<int>( B.columns() ) );
3619  const int K ( numeric_cast<int>( A.columns() ) );
3620  const int lda( numeric_cast<int>( A.spacing() ) );
3621  const int ldb( numeric_cast<int>( B.spacing() ) );
3622  const int ldc( numeric_cast<int>( C.spacing() ) );
3623 
      // The storage order passed to BLAS follows the target C. For a row-major C the operand A
      // is flagged as transposed, for a column-major C the operand B is flagged as transposed.
3624  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3625  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3626  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3627  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3628  }
3629 #endif
3630  //**********************************************************************************************
3631 
3632  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
3633 #if BLAZE_BLAS_MODE
3634 
// BLAS kernel (single precision complex) for the addition assignment of a
// scaled matrix product. Calls cblas_cgemm with alpha = scalar and
// beta = (1,0), i.e. the product of A and B scaled by 'scalar' is accumulated
// onto the existing C.
//
//   C       target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor; converted to complex<float> before the BLAS call
//
// The overload takes part in overload resolution only if
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
3647  template< typename MT3 // Type of the left-hand side target matrix
3648  , typename MT4 // Type of the left-hand side matrix operand
3649  , typename MT5 // Type of the right-hand side matrix operand
3650  , typename ST2 > // Type of the scalar value
3651  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3652  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3653  {
3654  using boost::numeric_cast;
3655  // All three element types must be complex numbers with a float value_type.
3656  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3657  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3658  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3659  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3660  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3661  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3662  // Problem sizes and leading dimensions, converted to int for the CBLAS interface.
3663  const int M ( numeric_cast<int>( A.rows() ) );
3664  const int N ( numeric_cast<int>( B.columns() ) );
3665  const int K ( numeric_cast<int>( A.columns() ) );
3666  const int lda( numeric_cast<int>( A.spacing() ) );
3667  const int ldb( numeric_cast<int>( B.spacing() ) );
3668  const int ldc( numeric_cast<int>( C.spacing() ) );
3669  const complex<float> alpha( scalar ); // complex scaling factors are handed to cgemm by address
3670  const complex<float> beta ( 1.0F, 0.0F );
3671  // The storage order of C selects the BLAS layout; exactly one of A/B is flagged as transposed.
     // NOTE(review): presumably A is stored column-major and B row-major here - confirm.
3672  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3673  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3674  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3675  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3676  }
3677 #endif
3678  //**********************************************************************************************
3679 
3680  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
3681 #if BLAZE_BLAS_MODE
3682 
// BLAS kernel (double precision complex) for the addition assignment of a
// scaled matrix product. Calls cblas_zgemm with alpha = scalar and
// beta = (1,0), i.e. the product of A and B scaled by 'scalar' is accumulated
// onto the existing C.
//
//   C       target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor; converted to complex<double> before the BLAS call
//
// The overload takes part in overload resolution only if
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
3695  template< typename MT3 // Type of the left-hand side target matrix
3696  , typename MT4 // Type of the left-hand side matrix operand
3697  , typename MT5 // Type of the right-hand side matrix operand
3698  , typename ST2 > // Type of the scalar value
3699  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3700  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3701  {
3702  using boost::numeric_cast;
3703  // All three element types must be complex numbers with a double value_type.
3704  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
3705  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
3706  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
3707  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3708  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3709  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3710  // Problem sizes and leading dimensions, converted to int for the CBLAS interface.
3711  const int M ( numeric_cast<int>( A.rows() ) );
3712  const int N ( numeric_cast<int>( B.columns() ) );
3713  const int K ( numeric_cast<int>( A.columns() ) );
3714  const int lda( numeric_cast<int>( A.spacing() ) );
3715  const int ldb( numeric_cast<int>( B.spacing() ) );
3716  const int ldc( numeric_cast<int>( C.spacing() ) );
3717  const complex<double> alpha( scalar ); // complex scaling factors are handed to zgemm by address
3718  const complex<double> beta ( 1.0, 0.0 );
3719  // The storage order of C selects the BLAS layout; exactly one of A/B is flagged as transposed.
     // NOTE(review): presumably A is stored column-major and B row-major here - confirm.
3720  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3721  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3722  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3723  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3724  }
3725 #endif
3726  //**********************************************************************************************
3727 
3728  //**Addition assignment to sparse matrices******************************************************
3729  // No special implementation for the addition assignment to sparse matrices.
3730  //**********************************************************************************************
3731 
3732  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a scaled matrix product to a dense matrix.
//
//   lhs  target dense matrix (of either storage order), updated in place
//   rhs  scaled product expression; its operands are read from rhs.matrix_
//        and its scaling factor from rhs.scalar_
//
// Both operands are evaluated into LT/RT objects first. Afterwards either the
// default kernel or the BLAS kernel is selected, depending on the number of
// elements of the target compared to TDMATDMATMULT_THRESHOLD.
3744  template< typename MT3 // Type of the target dense matrix
3745  , bool SO > // Storage order of the target dense matrix
3746  friend inline void subAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3747  {
3749 
3750  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3751  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3752 
3753  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3754  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3755  // Nothing to subtract if the target is empty or the inner dimension is zero.
3756  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3757  return;
3758  }
3759 
3760  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3761  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3762 
3763  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3764  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3765  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3766  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3767  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3768  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3769  // Targets below the element-count threshold use the default kernel, all others the BLAS kernel.
3770  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
3771  DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3772  else
3773  DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3774  }
3775  //**********************************************************************************************
3776 
3777  //**Default subtraction assignment to dense matrices********************************************
// Default (non-vectorized) kernel for the subtraction assignment of a scaled
// matrix product. Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// does not hold.
//
//   C       target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor of the product
//
// The scaled product is first evaluated into a temporary of type ResultType,
// which is then subtracted from C via subAssign().
3791  template< typename MT3 // Type of the left-hand side target matrix
3792  , typename MT4 // Type of the left-hand side matrix operand
3793  , typename MT5 // Type of the right-hand side matrix operand
3794  , typename ST2 > // Type of the scalar value
3795  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3796  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3797  {
3798  const ResultType tmp( A * B * scalar );
3799  subAssign( C, tmp );
3800  }
3801  //**********************************************************************************************
3802 
3803  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default kernel for the subtraction assignment of a scaled matrix
// product to a row-major dense matrix: every stored vector of C is replaced by
// load(C) - (sum over k of A(i,k) * B(k,j..)) * scalar.
//
//   C       row-major target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor of the product
//
// The columns of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic
// vector(s) of IT::size elements. In the 4-, 2- and 1-vector blocks two rows of
// C are handled per iteration, followed by a single-row tail if M is odd.
//
// NOTE(review): the accumulators xmm1..xmm8 are default-constructed and summed
// into directly, so this relies on IntrinsicType's default constructor yielding
// zero - confirm.
// NOTE(review): N is taken from B.spacing() rather than B.columns(), presumably
// so that the (padded) rows are always covered by whole vectors - confirm.
3817  template< typename MT3 // Type of the left-hand side target matrix
3818  , typename MT4 // Type of the left-hand side matrix operand
3819  , typename MT5 // Type of the right-hand side matrix operand
3820  , typename ST2 > // Type of the scalar value
3821  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3822  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3823  {
3824  typedef IntrinsicTrait<ElementType> IT;
3825 
3826  const size_t M( A.rows() );
3827  const size_t N( B.spacing() );
3828  const size_t K( A.columns() );
3829 
3830  const IntrinsicType factor( set( scalar ) ); // scaling factor as an intrinsic vector
3831 
3832  size_t j( 0UL );
3833  // Column blocks of 8 intrinsic vectors, one row of C at a time
3834  for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
3835  for( size_t i=0UL; i<M; ++i ) {
3836  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3837  for( size_t k=0UL; k<K; ++k ) {
3838  const IntrinsicType a1( set( A(i,k) ) );
3839  xmm1 = xmm1 + a1 * B.get(k,j );
3840  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3841  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3842  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3843  xmm5 = xmm5 + a1 * B.get(k,j+IT::size*4UL);
3844  xmm6 = xmm6 + a1 * B.get(k,j+IT::size*5UL);
3845  xmm7 = xmm7 + a1 * B.get(k,j+IT::size*6UL);
3846  xmm8 = xmm8 + a1 * B.get(k,j+IT::size*7UL);
3847  }
3848  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
3849  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3850  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3851  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3852  store( &(~C)(i,j+IT::size*4UL), load( &(~C)(i,j+IT::size*4UL) ) - xmm5 * factor );
3853  store( &(~C)(i,j+IT::size*5UL), load( &(~C)(i,j+IT::size*5UL) ) - xmm6 * factor );
3854  store( &(~C)(i,j+IT::size*6UL), load( &(~C)(i,j+IT::size*6UL) ) - xmm7 * factor );
3855  store( &(~C)(i,j+IT::size*7UL), load( &(~C)(i,j+IT::size*7UL) ) - xmm8 * factor );
3856  }
3857  }
     // Column blocks of 4 intrinsic vectors, two rows of C at a time
3858  for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
3859  size_t i( 0UL );
3860  for( ; (i+2UL) <= M; i+=2UL ) {
3861  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3862  for( size_t k=0UL; k<K; ++k ) {
3863  const IntrinsicType a1( set( A(i ,k) ) );
3864  const IntrinsicType a2( set( A(i+1UL,k) ) );
3865  const IntrinsicType b1( B.get(k,j ) );
3866  const IntrinsicType b2( B.get(k,j+IT::size ) );
3867  const IntrinsicType b3( B.get(k,j+IT::size*2UL) );
3868  const IntrinsicType b4( B.get(k,j+IT::size*3UL) );
3869  xmm1 = xmm1 + a1 * b1;
3870  xmm2 = xmm2 + a1 * b2;
3871  xmm3 = xmm3 + a1 * b3;
3872  xmm4 = xmm4 + a1 * b4;
3873  xmm5 = xmm5 + a2 * b1;
3874  xmm6 = xmm6 + a2 * b2;
3875  xmm7 = xmm7 + a2 * b3;
3876  xmm8 = xmm8 + a2 * b4;
3877  }
3878  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
3879  store( &(~C)(i ,j+IT::size ), load( &(~C)(i ,j+IT::size ) ) - xmm2 * factor );
3880  store( &(~C)(i ,j+IT::size*2UL), load( &(~C)(i ,j+IT::size*2UL) ) - xmm3 * factor );
3881  store( &(~C)(i ,j+IT::size*3UL), load( &(~C)(i ,j+IT::size*3UL) ) - xmm4 * factor );
3882  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) - xmm5 * factor );
3883  store( &(~C)(i+1UL,j+IT::size ), load( &(~C)(i+1UL,j+IT::size ) ) - xmm6 * factor );
3884  store( &(~C)(i+1UL,j+IT::size*2UL), load( &(~C)(i+1UL,j+IT::size*2UL) ) - xmm7 * factor );
3885  store( &(~C)(i+1UL,j+IT::size*3UL), load( &(~C)(i+1UL,j+IT::size*3UL) ) - xmm8 * factor );
3886  }
3887  if( i < M ) { // remaining single row
3888  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3889  for( size_t k=0UL; k<K; ++k ) {
3890  const IntrinsicType a1( set( A(i,k) ) );
3891  xmm1 = xmm1 + a1 * B.get(k,j );
3892  xmm2 = xmm2 + a1 * B.get(k,j+IT::size );
3893  xmm3 = xmm3 + a1 * B.get(k,j+IT::size*2UL);
3894  xmm4 = xmm4 + a1 * B.get(k,j+IT::size*3UL);
3895  }
3896  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
3897  store( &(~C)(i,j+IT::size ), load( &(~C)(i,j+IT::size ) ) - xmm2 * factor );
3898  store( &(~C)(i,j+IT::size*2UL), load( &(~C)(i,j+IT::size*2UL) ) - xmm3 * factor );
3899  store( &(~C)(i,j+IT::size*3UL), load( &(~C)(i,j+IT::size*3UL) ) - xmm4 * factor );
3900  }
3901  }
     // Column blocks of 2 intrinsic vectors, two rows of C at a time
3902  for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
3903  size_t i( 0UL );
3904  for( ; (i+2UL) <= M; i+=2UL ) {
3905  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3906  for( size_t k=0UL; k<K; ++k ) {
3907  const IntrinsicType a1( set( A(i ,k) ) );
3908  const IntrinsicType a2( set( A(i+1UL,k) ) );
3909  const IntrinsicType b1( B.get(k,j ) );
3910  const IntrinsicType b2( B.get(k,j+IT::size) );
3911  xmm1 = xmm1 + a1 * b1;
3912  xmm2 = xmm2 + a1 * b2;
3913  xmm3 = xmm3 + a2 * b1;
3914  xmm4 = xmm4 + a2 * b2;
3915  }
3916  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
3917  store( &(~C)(i ,j+IT::size), load( &(~C)(i ,j+IT::size) ) - xmm2 * factor );
3918  store( &(~C)(i+1UL,j ), load( &(~C)(i+1UL,j ) ) - xmm3 * factor );
3919  store( &(~C)(i+1UL,j+IT::size), load( &(~C)(i+1UL,j+IT::size) ) - xmm4 * factor );
3920  }
3921  if( i < M ) { // remaining single row
3922  IntrinsicType xmm1, xmm2;
3923  for( size_t k=0UL; k<K; ++k ) {
3924  const IntrinsicType a1( set( A(i,k) ) );
3925  xmm1 = xmm1 + a1 * B.get(k,j );
3926  xmm2 = xmm2 + a1 * B.get(k,j+IT::size);
3927  }
3928  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
3929  store( &(~C)(i,j+IT::size), load( &(~C)(i,j+IT::size) ) - xmm2 * factor );
3930  }
3931  }
     // Final column block of a single intrinsic vector, two rows of C at a time
3932  if( j < N ) {
3933  size_t i( 0UL );
3934  for( ; (i+2UL) <= M; i+=2UL ) {
3935  IntrinsicType xmm1, xmm2;
3936  for( size_t k=0UL; k<K; ++k ) {
3937  const IntrinsicType b1( B.get(k,j) );
3938  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3939  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3940  }
3941  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
3942  store( &(~C)(i+1UL,j), load( &(~C)(i+1UL,j) ) - xmm2 * factor );
3943  }
3944  if( i < M ) { // remaining single row
3945  IntrinsicType xmm1;
3946  for( size_t k=0UL; k<K; ++k ) {
3947  xmm1 = xmm1 + set( A(i,k) ) * B.get(k,j);
3948  }
3949  store( &(~C)(i,j), load( &(~C)(i,j) ) - xmm1 * factor );
3950  }
3951  }
3952  }
3953  //**********************************************************************************************
3954 
3955  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default kernel for the subtraction assignment of a scaled matrix
// product to a column-major dense matrix: every stored vector of C is replaced
// by load(C) - (sum over k of A(i..,k) * B(k,j)) * scalar.
//
//   C       column-major target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor of the product
//
// The rows of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic
// vector(s) of IT::size elements. In the 4-, 2- and 1-vector blocks two columns
// of C are handled per iteration, followed by a single-column tail if N is odd.
//
// NOTE(review): the accumulators xmm1..xmm8 are default-constructed and summed
// into directly, so this relies on IntrinsicType's default constructor yielding
// zero - confirm.
// NOTE(review): M is taken from A.spacing() rather than A.rows(), presumably so
// that the (padded) columns are always covered by whole vectors - confirm.
3969  template< typename MT3 // Type of the left-hand side target matrix
3970  , typename MT4 // Type of the left-hand side matrix operand
3971  , typename MT5 // Type of the right-hand side matrix operand
3972  , typename ST2 > // Type of the scalar value
3973  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3974  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3975  {
3976  typedef IntrinsicTrait<ElementType> IT;
3977 
3978  const size_t M( A.spacing() );
3979  const size_t N( B.columns() );
3980  const size_t K( A.columns() );
3981 
3982  const IntrinsicType factor( set( scalar ) ); // scaling factor as an intrinsic vector
3983 
3984  size_t i( 0UL );
3985  // Row blocks of 8 intrinsic vectors, one column of C at a time
3986  for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
3987  for( size_t j=0UL; j<N; ++j ) {
3988  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3989  for( size_t k=0UL; k<K; ++k ) {
3990  const IntrinsicType b1( set( B(k,j) ) );
3991  xmm1 = xmm1 + A.get(i ,k) * b1;
3992  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
3993  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
3994  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
3995  xmm5 = xmm5 + A.get(i+IT::size*4UL,k) * b1;
3996  xmm6 = xmm6 + A.get(i+IT::size*5UL,k) * b1;
3997  xmm7 = xmm7 + A.get(i+IT::size*6UL,k) * b1;
3998  xmm8 = xmm8 + A.get(i+IT::size*7UL,k) * b1;
3999  }
4000  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
4001  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4002  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4003  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
4004  store( &(~C)(i+IT::size*4UL,j), load( &(~C)(i+IT::size*4UL,j) ) - xmm5 * factor );
4005  store( &(~C)(i+IT::size*5UL,j), load( &(~C)(i+IT::size*5UL,j) ) - xmm6 * factor );
4006  store( &(~C)(i+IT::size*6UL,j), load( &(~C)(i+IT::size*6UL,j) ) - xmm7 * factor );
4007  store( &(~C)(i+IT::size*7UL,j), load( &(~C)(i+IT::size*7UL,j) ) - xmm8 * factor );
4008  }
4009  }
     // Row blocks of 4 intrinsic vectors, two columns of C at a time
4010  for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
4011  size_t j( 0UL );
4012  for( ; (j+2UL) <= N; j+=2UL ) {
4013  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4014  for( size_t k=0UL; k<K; ++k ) {
4015  const IntrinsicType a1( A.get(i ,k) );
4016  const IntrinsicType a2( A.get(i+IT::size ,k) );
4017  const IntrinsicType a3( A.get(i+IT::size*2UL,k) );
4018  const IntrinsicType a4( A.get(i+IT::size*3UL,k) );
4019  const IntrinsicType b1( set( B(k,j ) ) );
4020  const IntrinsicType b2( set( B(k,j+1UL) ) );
4021  xmm1 = xmm1 + a1 * b1;
4022  xmm2 = xmm2 + a2 * b1;
4023  xmm3 = xmm3 + a3 * b1;
4024  xmm4 = xmm4 + a4 * b1;
4025  xmm5 = xmm5 + a1 * b2;
4026  xmm6 = xmm6 + a2 * b2;
4027  xmm7 = xmm7 + a3 * b2;
4028  xmm8 = xmm8 + a4 * b2;
4029  }
4030  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
4031  store( &(~C)(i+IT::size ,j ), load( &(~C)(i+IT::size ,j ) ) - xmm2 * factor );
4032  store( &(~C)(i+IT::size*2UL,j ), load( &(~C)(i+IT::size*2UL,j ) ) - xmm3 * factor );
4033  store( &(~C)(i+IT::size*3UL,j ), load( &(~C)(i+IT::size*3UL,j ) ) - xmm4 * factor );
4034  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) - xmm5 * factor );
4035  store( &(~C)(i+IT::size ,j+1UL), load( &(~C)(i+IT::size ,j+1UL) ) - xmm6 * factor );
4036  store( &(~C)(i+IT::size*2UL,j+1UL), load( &(~C)(i+IT::size*2UL,j+1UL) ) - xmm7 * factor );
4037  store( &(~C)(i+IT::size*3UL,j+1UL), load( &(~C)(i+IT::size*3UL,j+1UL) ) - xmm8 * factor );
4038  }
4039  if( j < N ) { // remaining single column
4040  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4041  for( size_t k=0UL; k<K; ++k ) {
4042  const IntrinsicType b1( set( B(k,j) ) );
4043  xmm1 = xmm1 + A.get(i ,k) * b1;
4044  xmm2 = xmm2 + A.get(i+IT::size ,k) * b1;
4045  xmm3 = xmm3 + A.get(i+IT::size*2UL,k) * b1;
4046  xmm4 = xmm4 + A.get(i+IT::size*3UL,k) * b1;
4047  }
4048  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
4049  store( &(~C)(i+IT::size ,j), load( &(~C)(i+IT::size ,j) ) - xmm2 * factor );
4050  store( &(~C)(i+IT::size*2UL,j), load( &(~C)(i+IT::size*2UL,j) ) - xmm3 * factor );
4051  store( &(~C)(i+IT::size*3UL,j), load( &(~C)(i+IT::size*3UL,j) ) - xmm4 * factor );
4052  }
4053  }
     // Row blocks of 2 intrinsic vectors, two columns of C at a time
4054  for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
4055  size_t j( 0UL );
4056  for( ; (j+2UL) <= N; j+=2UL ) {
4057  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4058  for( size_t k=0UL; k<K; ++k ) {
4059  const IntrinsicType a1( A.get(i ,k) );
4060  const IntrinsicType a2( A.get(i+IT::size,k) );
4061  const IntrinsicType b1( set( B(k,j ) ) );
4062  const IntrinsicType b2( set( B(k,j+1UL) ) );
4063  xmm1 = xmm1 + a1 * b1;
4064  xmm2 = xmm2 + a2 * b1;
4065  xmm3 = xmm3 + a1 * b2;
4066  xmm4 = xmm4 + a2 * b2;
4067  }
4068  store( &(~C)(i ,j ), load( &(~C)(i ,j ) ) - xmm1 * factor );
4069  store( &(~C)(i+IT::size,j ), load( &(~C)(i+IT::size,j ) ) - xmm2 * factor );
4070  store( &(~C)(i ,j+1UL), load( &(~C)(i ,j+1UL) ) - xmm3 * factor );
4071  store( &(~C)(i+IT::size,j+1UL), load( &(~C)(i+IT::size,j+1UL) ) - xmm4 * factor );
4072  }
4073  if( j < N ) { // remaining single column
4074  IntrinsicType xmm1, xmm2;
4075  for( size_t k=0UL; k<K; ++k ) {
4076  const IntrinsicType b1( set( B(k,j) ) );
4077  xmm1 = xmm1 + A.get(i ,k) * b1;
4078  xmm2 = xmm2 + A.get(i+IT::size,k) * b1;
4079  }
4080  store( &(~C)(i ,j), load( &(~C)(i ,j) ) - xmm1 * factor );
4081  store( &(~C)(i+IT::size,j), load( &(~C)(i+IT::size,j) ) - xmm2 * factor );
4082  }
4083  }
     // Final row block of a single intrinsic vector, two columns of C at a time
4084  if( i < M ) {
4085  size_t j( 0UL );
4086  for( ; (j+2UL) <= N; j+=2UL ) {
4087  IntrinsicType xmm1, xmm2;
4088  for( size_t k=0UL; k<K; ++k ) {
4089  const IntrinsicType a1( A.get(i,k) );
4090  xmm1 = xmm1 + a1 * set( B(k,j ) );
4091  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
4092  }
4093  store( &(~C)(i,j ), load( &(~C)(i,j ) ) - xmm1 * factor );
4094  store( &(~C)(i,j+1UL), load( &(~C)(i,j+1UL) ) - xmm2 * factor );
4095  }
4096  if( j < N ) { // remaining single column
4097  IntrinsicType xmm1;
4098  for( size_t k=0UL; k<K; ++k ) {
4099  xmm1 = xmm1 + A.get(i,k) * set( B(k,j) );
4100  }
4101  store( &(~C)(i,j), load( &(~C)(i,j) ) - xmm1 * factor );
4102  }
4103  }
4104  }
4105  //**********************************************************************************************
4106 
4107  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback of the BLAS-based subtraction assignment: selected when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds and simply forwards all arguments to
// selectDefaultSubAssignKernel().
//
//   C       target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor of the product
4121  template< typename MT3 // Type of the left-hand side target matrix
4122  , typename MT4 // Type of the left-hand side matrix operand
4123  , typename MT5 // Type of the right-hand side matrix operand
4124  , typename ST2 > // Type of the scalar value
4125  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4126  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4127  {
4128  selectDefaultSubAssignKernel( C, A, B, scalar );
4129  }
4130  //**********************************************************************************************
4131 
4132  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
4133 #if BLAZE_BLAS_MODE
4134 
// BLAS kernel (single precision) for the subtraction assignment of a scaled
// matrix product. Calls cblas_sgemm with alpha = -scalar and beta = 1.0F, i.e.
// the product of A and B scaled by 'scalar' is subtracted from the existing C.
//
//   C       target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor of the product (negated before it is passed as alpha)
//
// The overload takes part in overload resolution only if
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
4147  template< typename MT3 // Type of the left-hand side target matrix
4148  , typename MT4 // Type of the left-hand side matrix operand
4149  , typename MT5 // Type of the right-hand side matrix operand
4150  , typename ST2 > // Type of the scalar value
4151  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4152  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4153  {
4154  using boost::numeric_cast;
4155  // All three element types must be float for the sgemm call below.
4156  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT3::ElementType );
4157  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT4::ElementType );
4158  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE( typename MT5::ElementType );
4159  // Problem sizes and leading dimensions, converted to int for the CBLAS interface.
4160  const int M ( numeric_cast<int>( A.rows() ) );
4161  const int N ( numeric_cast<int>( B.columns() ) );
4162  const int K ( numeric_cast<int>( A.columns() ) );
4163  const int lda( numeric_cast<int>( A.spacing() ) );
4164  const int ldb( numeric_cast<int>( B.spacing() ) );
4165  const int ldc( numeric_cast<int>( C.spacing() ) );
4166  // The storage order of C selects the BLAS layout; exactly one of A/B is flagged as transposed.
     // NOTE(review): presumably A is stored column-major and B row-major here - confirm.
4167  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4168  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4169  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4170  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
4171  }
4172 #endif
4173  //**********************************************************************************************
4174 
4175  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
4176 #if BLAZE_BLAS_MODE
4177 
// BLAS kernel (double precision) for the subtraction assignment of a scaled
// matrix product. Calls cblas_dgemm with alpha = -scalar and beta = 1.0, i.e.
// the product of A and B scaled by 'scalar' is subtracted from the existing C.
//
//   C       target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor of the product (negated before it is passed as alpha)
//
// The overload takes part in overload resolution only if
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
4190  template< typename MT3 // Type of the left-hand side target matrix
4191  , typename MT4 // Type of the left-hand side matrix operand
4192  , typename MT5 // Type of the right-hand side matrix operand
4193  , typename ST2 > // Type of the scalar value
4194  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4195  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4196  {
4197  using boost::numeric_cast;
4198  // All three element types must be double for the dgemm call below.
4199  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT3::ElementType );
4200  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT4::ElementType );
4201  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE( typename MT5::ElementType );
4202  // Problem sizes and leading dimensions, converted to int for the CBLAS interface.
4203  const int M ( numeric_cast<int>( A.rows() ) );
4204  const int N ( numeric_cast<int>( B.columns() ) );
4205  const int K ( numeric_cast<int>( A.columns() ) );
4206  const int lda( numeric_cast<int>( A.spacing() ) );
4207  const int ldb( numeric_cast<int>( B.spacing() ) );
4208  const int ldc( numeric_cast<int>( C.spacing() ) );
4209  // The storage order of C selects the BLAS layout; exactly one of A/B is flagged as transposed.
     // NOTE(review): presumably A is stored column-major and B row-major here - confirm.
4210  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4211  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4212  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4213  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
4214  }
4215 #endif
4216  //**********************************************************************************************
4217 
4218  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
4219 #if BLAZE_BLAS_MODE
4220 
// BLAS kernel (single precision complex) for the subtraction assignment of a
// scaled matrix product. Calls cblas_cgemm with alpha = -scalar and
// beta = (1,0), i.e. the product of A and B scaled by 'scalar' is subtracted
// from the existing C.
//
//   C       target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor; negated and converted to complex<float> before the call
//
// The overload takes part in overload resolution only if
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
4233  template< typename MT3 // Type of the left-hand side target matrix
4234  , typename MT4 // Type of the left-hand side matrix operand
4235  , typename MT5 // Type of the right-hand side matrix operand
4236  , typename ST2 > // Type of the scalar value
4237  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4238  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4239  {
4240  using boost::numeric_cast;
4241  // All three element types must be complex numbers with a float value_type.
4242  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
4243  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
4244  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
4245  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
4246  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
4247  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
4248  // Problem sizes and leading dimensions, converted to int for the CBLAS interface.
4249  const int M ( numeric_cast<int>( A.rows() ) );
4250  const int N ( numeric_cast<int>( B.columns() ) );
4251  const int K ( numeric_cast<int>( A.columns() ) );
4252  const int lda( numeric_cast<int>( A.spacing() ) );
4253  const int ldb( numeric_cast<int>( B.spacing() ) );
4254  const int ldc( numeric_cast<int>( C.spacing() ) );
4255  const complex<float> alpha( -scalar ); // negated factor turns the gemm update into a subtraction
4256  const complex<float> beta ( 1.0F, 0.0F );
4257  // The storage order of C selects the BLAS layout; exactly one of A/B is flagged as transposed.
     // NOTE(review): presumably A is stored column-major and B row-major here - confirm.
4258  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4259  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4260  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4261  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4262  }
4263 #endif
4264  //**********************************************************************************************
4265 
4266  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
4267 #if BLAZE_BLAS_MODE
4268 
// BLAS kernel (double precision complex) for the subtraction assignment of a
// scaled matrix product. Calls cblas_zgemm with alpha = -scalar and
// beta = (1,0), i.e. the product of A and B scaled by 'scalar' is subtracted
// from the existing C.
//
//   C       target matrix, updated in place
//   A, B    left- and right-hand side operands of the product
//   scalar  scaling factor; negated and converted to complex<double> before the call
//
// The overload takes part in overload resolution only if
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
4281  template< typename MT3 // Type of the left-hand side target matrix
4282  , typename MT4 // Type of the left-hand side matrix operand
4283  , typename MT5 // Type of the right-hand side matrix operand
4284  , typename ST2 > // Type of the scalar value
4285  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4286  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4287  {
4288  using boost::numeric_cast;
4289  // All three element types must be complex numbers with a double value_type.
4290  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT3::ElementType );
4291  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT4::ElementType );
4292  BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE( typename MT5::ElementType );
4293  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
4294  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
4295  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
4296  // Problem sizes and leading dimensions, converted to int for the CBLAS interface.
4297  const int M ( numeric_cast<int>( A.rows() ) );
4298  const int N ( numeric_cast<int>( B.columns() ) );
4299  const int K ( numeric_cast<int>( A.columns() ) );
4300  const int lda( numeric_cast<int>( A.spacing() ) );
4301  const int ldb( numeric_cast<int>( B.spacing() ) );
4302  const int ldc( numeric_cast<int>( C.spacing() ) );
4303  const complex<double> alpha( -scalar ); // negated factor turns the gemm update into a subtraction
4304  const complex<double> beta ( 1.0, 0.0 );
4305  // The storage order of C selects the BLAS layout; exactly one of A/B is flagged as transposed.
     // NOTE(review): presumably A is stored column-major and B row-major here - confirm.
4306  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4307  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4308  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4309  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4310  }
4311 #endif
4312  //**********************************************************************************************
4313 
4314  //**Subtraction assignment to sparse matrices***************************************************
4315  // No special implementation for the subtraction assignment to sparse matrices.
4316  //**********************************************************************************************
4317 
4318  //**Multiplication assignment to dense matrices*************************************************
4319  // No special implementation for the multiplication assignment to dense matrices.
4320  //**********************************************************************************************
4321 
4322  //**Multiplication assignment to sparse matrices************************************************
4323  // No special implementation for the multiplication assignment to sparse matrices.
4324  //**********************************************************************************************
4325 
4326  //**Compile time checks*************************************************************************
4335  //**********************************************************************************************
4336 };
4338 //*************************************************************************************************
4339 
4340 
4341 
4342 
4343 //=================================================================================================
4344 //
4345 // GLOBAL BINARY ARITHMETIC OPERATORS
4346 //
4347 //=================================================================================================
4348 
4349 //*************************************************************************************************
// Global multiplication operator creating a TDMatDMatMultExpr<T1,T2> from the
// two dense matrix operands 'lhs' and 'rhs'.
//
// Returns the expression object built from ~lhs and ~rhs.
// Throws std::invalid_argument if the number of columns of 'lhs' differs from
// the number of rows of 'rhs'.
//
// NOTE(review): listing line 4381, which must carry the function name and the
// parameter list declaring 'lhs' and 'rhs', is absent from this listing (the
// numbering jumps from 4380 to 4382) - restore it from the original header.
4378 template< typename T1 // Type of the left-hand side dense matrix
4379  , typename T2 > // Type of the right-hand side dense matrix
4380 inline const TDMatDMatMultExpr<T1,T2>
4382 {
4384 
4385  if( (~lhs).columns() != (~rhs).rows() )
4386  throw std::invalid_argument( "Matrix sizes do not match" );
4387 
4388  return TDMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
4389 }
4390 //*************************************************************************************************
4391 
4392 
4393 
4394 
4395 //=================================================================================================
4396 //
4397 // EXPRESSION TRAIT SPECIALIZATIONS
4398 //
4399 //=================================================================================================
4400 
4401 //*************************************************************************************************
// Specialization of TDMatDVecMultExprTrait for a TDMatDMatMultExpr<MT1,MT2>
// left-hand side operand and a vector of type VT.
//
// If MT1 is a column-major dense matrix, MT2 is a row-major dense matrix and VT
// is a non-transpose dense vector, 'Type' is obtained by nesting the traits as
// MT1 * ( MT2 * VT ); otherwise 'Type' is INVALID_TYPE.
4403 template< typename MT1, typename MT2, typename VT >
4404 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4405 {
4406  public:
4407  //**********************************************************************************************
4408  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4409  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4410  IsDenseVector<VT>::value && !IsTransposeVector<VT>::value
4411  , typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4412  , INVALID_TYPE >::Type Type;
4413  //**********************************************************************************************
4414 };
4416 //*************************************************************************************************
4417 
4418 
4419 //*************************************************************************************************
// Specialization of TDMatSVecMultExprTrait for a TDMatDMatMultExpr<MT1,MT2>
// left-hand side operand and a vector of type VT.
//
// If MT1 is a column-major dense matrix, MT2 is a row-major dense matrix and VT
// is a non-transpose sparse vector, 'Type' is obtained by nesting the traits as
// MT1 * ( MT2 * VT ): the inner step uses DMatSVecMultExprTrait and the outer
// step TDMatDVecMultExprTrait. Otherwise 'Type' is INVALID_TYPE.
4421 template< typename MT1, typename MT2, typename VT >
4422 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4423 {
4424  public:
4425  //**********************************************************************************************
4426  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4427  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4428  IsSparseVector<VT>::value && !IsTransposeVector<VT>::value
4429  , typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4430  , INVALID_TYPE >::Type Type;
4431  //**********************************************************************************************
4432 };
4434 //*************************************************************************************************
4435 
4436 
4437 //*************************************************************************************************
// Specialization of TDVecTDMatMultExprTrait for a left-hand side operand of type VT and a
// right-hand side operand of type TDMatDMatMultExpr<MT1,MT2>.
//
// The nested 'Type' is chosen via SelectType. The condition requires
//   - VT  to satisfy IsDenseVector and IsTransposeVector,
//   - MT1 to satisfy IsDenseMatrix and IsColumnMajorMatrix,
//   - MT2 to satisfy IsDenseMatrix and IsRowMajorMatrix.
// The first alternative nests the traits as ( VT * MT1 ) * MT2: the result type of
// TDVecTDMatMultExprTrait<VT,MT1> is determined first and is then used as the vector operand
// of TDVecDMatMultExprTrait together with MT2. The second alternative is INVALID_TYPE
// (presumably the type selected whenever the condition does not hold -- confirm against the
// definition of SelectType).
4439 template< typename VT, typename MT1, typename MT2 >
4440 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4441 {
4442  public:
4443  //**********************************************************************************************
 // Resulting expression type; see the description of the selection condition above.
4444  typedef typename SelectType< IsDenseVector<VT>::value && IsTransposeVector<VT>::value &&
4445  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4446  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4447  , typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4448  , INVALID_TYPE >::Type Type;
4449  //**********************************************************************************************
4450 };
4452 //*************************************************************************************************
4453 
4454 
4455 //*************************************************************************************************
// Specialization of TSVecTDMatMultExprTrait for a left-hand side operand of type VT and a
// right-hand side operand of type TDMatDMatMultExpr<MT1,MT2>.
//
// The nested 'Type' is chosen via SelectType. The condition requires
//   - VT  to satisfy IsSparseVector and IsTransposeVector,
//   - MT1 to satisfy IsDenseMatrix and IsColumnMajorMatrix,
//   - MT2 to satisfy IsDenseMatrix and IsRowMajorMatrix.
// The first alternative nests the traits as ( VT * MT1 ) * MT2: the result type of
// TSVecTDMatMultExprTrait<VT,MT1> is determined first. That result is then passed on as the
// vector operand of TDVecDMatMultExprTrait (the dense-vector trait, not the sparse one)
// together with MT2. The second alternative is INVALID_TYPE (presumably the type selected
// whenever the condition does not hold -- confirm against the definition of SelectType).
4457 template< typename VT, typename MT1, typename MT2 >
4458 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4459 {
4460  public:
4461  //**********************************************************************************************
 // Resulting expression type; see the description of the selection condition above.
4462  typedef typename SelectType< IsSparseVector<VT>::value && IsTransposeVector<VT>::value &&
4463  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4464  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4465  , typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4466  , INVALID_TYPE >::Type Type;
4467  //**********************************************************************************************
4468 };
4470 //*************************************************************************************************
4471 
4472 
4473 //*************************************************************************************************
// Specialization of RowExprTrait for TDMatDMatMultExpr<MT1,MT2>.
//
// The nested 'Type' is defined in terms of the two operands of the multiplication expression:
// RowExprTrait is applied to the (const-qualified) left-hand side operand MT1, and the
// resulting type is combined with the complete right-hand side operand MT2 via MultExprTrait.
// In other words, the type is formed as row( MT1 ) * MT2.
4475 template< typename MT1, typename MT2 >
4476 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >
4477 {
4478  public:
4479  //**********************************************************************************************
 // Type obtained from MultExprTrait for the row type of 'const MT1' and MT2.
4480  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
4481  //**********************************************************************************************
4482 };
4484 //*************************************************************************************************
4485 
4486 
4487 //*************************************************************************************************
// Specialization of ColumnExprTrait for TDMatDMatMultExpr<MT1,MT2>.
//
// The nested 'Type' is defined in terms of the two operands of the multiplication expression:
// ColumnExprTrait is applied to the (const-qualified) right-hand side operand MT2, and the
// complete left-hand side operand MT1 is combined with that resulting type via MultExprTrait.
// In other words, the type is formed as MT1 * column( MT2 ).
4489 template< typename MT1, typename MT2 >
4490 struct ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >
4491 {
4492  public:
4493  //**********************************************************************************************
 // Type obtained from MultExprTrait for MT1 and the column type of 'const MT2'.
4494  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
4495  //**********************************************************************************************
4496 };
4498 //*************************************************************************************************
4499 
4500 } // namespace blaze
4501 
4502 #endif