All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <stdexcept>
44 #include <boost/cast.hpp>
52 #include <blaze/math/Intrinsics.h>
53 #include <blaze/math/shims/Reset.h>
76 #include <blaze/system/BLAS.h>
78 #include <blaze/util/Assert.h>
79 #include <blaze/util/Complex.h>
85 #include <blaze/util/DisableIf.h>
86 #include <blaze/util/EnableIf.h>
87 #include <blaze/util/InvalidType.h>
89 #include <blaze/util/SelectType.h>
90 #include <blaze/util/Types.h>
96 
97 
98 namespace blaze {
99 
100 //=================================================================================================
101 //
102 // CLASS TDMATDMATMULTEXPR
103 //
104 //=================================================================================================
105 
106 //*************************************************************************************************
113 template< typename MT1 // Type of the left-hand side dense matrix
114  , typename MT2 > // Type of the right-hand side dense matrix
115 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
116  , private MatMatMultExpr
117  , private Computation
118 {
119  private:
120  //**Type definitions****************************************************************************
// Local shorthands for nested types of the two operand matrix types MT1 and MT2.
121  typedef typename MT1::ResultType RT1; // ResultType of the left-hand side operand type
122  typedef typename MT2::ResultType RT2; // ResultType of the right-hand side operand type
123  typedef typename RT1::ElementType ET1; // ElementType of RT1
124  typedef typename RT2::ElementType ET2; // ElementType of RT2
125  typedef typename MT1::CompositeType CT1; // CompositeType of the left-hand side operand type
126  typedef typename MT2::CompositeType CT2; // CompositeType of the right-hand side operand type
127  //**********************************************************************************************
128 
129  //**********************************************************************************************
132  //**********************************************************************************************
133 
134  //**********************************************************************************************
// Compile-time flag for the right-hand side operand type: non-zero when either
// IsComputation<MT2> or RequiresEvaluation<MT2> reports true.
136  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
137  //**********************************************************************************************
138 
139  //**********************************************************************************************
141 
// Helper trait whose value is non-zero when at least one of the operand flags
// (evaluateLeft, evaluateRight) is set. The template parameter MT does not appear in the
// expression, so the result is the same for every MT.
// NOTE(review): evaluateLeft is not declared in the visible part of this listing.
145  template< typename MT >
146  struct UseSMPAssign {
147  enum { value = ( evaluateLeft || evaluateRight ) };
148  };
150  //**********************************************************************************************
151 
152  //**********************************************************************************************
154 
// Helper trait: value is non-zero only if IsFloat holds for the element types of all three
// matrix types T1, T2 and T3. It gates the cblas_sgemm overload of selectBlasAssignKernel.
157  template< typename T1, typename T2, typename T3 >
158  struct UseSinglePrecisionKernel {
159  enum { value = IsFloat<typename T1::ElementType>::value &&
160  IsFloat<typename T2::ElementType>::value &&
161  IsFloat<typename T3::ElementType>::value };
162  };
164  //**********************************************************************************************
165 
166  //**********************************************************************************************
168 
// Helper trait: value is non-zero only if IsDouble holds for the element types of all three
// matrix types T1, T2 and T3. It gates the cblas_dgemm overload of selectBlasAssignKernel.
171  template< typename T1, typename T2, typename T3 >
172  struct UseDoublePrecisionKernel {
173  enum { value = IsDouble<typename T1::ElementType>::value &&
174  IsDouble<typename T2::ElementType>::value &&
175  IsDouble<typename T3::ElementType>::value };
176  };
178  //**********************************************************************************************
179 
180  //**********************************************************************************************
182 
// Helper trait: value is non-zero only if the element types of all three matrix types T1, T2
// and T3 are exactly complex<float>. It gates the cblas_cgemm overload of
// selectBlasAssignKernel.
186  template< typename T1, typename T2, typename T3 >
187  struct UseSinglePrecisionComplexKernel {
188  typedef complex<float> Type; // Element type all three matrices are compared against
189  enum { value = IsSame<typename T1::ElementType,Type>::value &&
190  IsSame<typename T2::ElementType,Type>::value &&
191  IsSame<typename T3::ElementType,Type>::value };
192  };
194  //**********************************************************************************************
195 
196  //**********************************************************************************************
198 
// Helper trait: value is non-zero only if the element types of all three matrix types T1, T2
// and T3 are exactly complex<double>. It gates the cblas_zgemm overload of
// selectBlasAssignKernel.
202  template< typename T1, typename T2, typename T3 >
203  struct UseDoublePrecisionComplexKernel {
204  typedef complex<double> Type; // Element type all three matrices are compared against
205  enum { value = IsSame<typename T1::ElementType,Type>::value &&
206  IsSame<typename T2::ElementType,Type>::value &&
207  IsSame<typename T3::ElementType,Type>::value };
208  };
210  //**********************************************************************************************
211 
212  //**********************************************************************************************
214 
217  template< typename T1, typename T2, typename T3 >
218  struct UseDefaultKernel {
219  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
220  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
221  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
222  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
223  };
225  //**********************************************************************************************
226 
227  //**********************************************************************************************
229 
// Helper trait that gates the vectorized default kernels. The value is non-zero only if
//  - all three matrix types report vectorizable,
//  - T2 and T3 have the same element type as T1, and
//  - IntrinsicTrait of that element type reports addition, subtraction and multiplication.
232  template< typename T1, typename T2, typename T3 >
233  struct UseVectorizedDefaultKernel {
234  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
235  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
236  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
237  IntrinsicTrait<typename T1::ElementType>::addition &&
238  IntrinsicTrait<typename T1::ElementType>::subtraction &&
239  IntrinsicTrait<typename T1::ElementType>::multiplication };
240  };
242  //**********************************************************************************************
243 
244  public:
245  //**Type definitions****************************************************************************
// NOTE(review): ElementType and ResultType are not declared in the visible part of this listing.
252  typedef const ElementType ReturnType; // Return type of operator()
253  typedef const ResultType CompositeType; // Const-qualified ResultType
254 
// Type returned by leftOperand(). Presumably SelectType picks `const MT1` when MT1 is an
// expression and `const MT1&` otherwise - confirm against SelectType.
256  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
257 
// Type returned by rightOperand(); same selection scheme applied to MT2.
259  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
260 
263 
266  //**********************************************************************************************
267 
268  //**Compilation flags***************************************************************************
270  enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
274 
// Compile-time flag: non-zero only when neither operand flag (evaluateLeft, evaluateRight) is
// set and both operand types report smpAssignable.
276  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
277  !evaluateRight && MT2::smpAssignable };
278  //**********************************************************************************************
279 
280  //**Constructor*********************************************************************************
// Constructs the multiplication expression from its two operands.
//   lhs  left-hand side operand of the multiplication
//   rhs  right-hand side operand of the multiplication
// The inner dimensions must agree (lhs.columns() == rhs.rows()); this is only checked through
// BLAZE_INTERNAL_ASSERT.
286  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs )
287  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
288  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
289  {
290  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
291  }
292  //**********************************************************************************************
293 
294  //**Access operator*****************************************************************************
301  inline ReturnType operator()( size_t i, size_t j ) const {
302  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
303  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
304 
305  ElementType tmp;
306 
307  if( lhs_.columns() != 0UL ) {
308  const size_t end( ( ( lhs_.columns()-1UL ) & size_t(-2) ) + 1UL );
309  tmp = lhs_(i,0UL) * rhs_(0UL,j);
310  for( size_t k=1UL; k<end; k+=2UL ) {
311  tmp += lhs_(i,k ) * rhs_(k ,j);
312  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
313  }
314  if( end < lhs_.columns() ) {
315  tmp += lhs_(i,end) * rhs_(end,j);
316  }
317  }
318  else {
319  reset( tmp );
320  }
321 
322  return tmp;
323  }
324  //**********************************************************************************************
325 
326  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, taken from the left-hand side operand.
331  inline size_t rows() const {
332  return lhs_.rows();
333  }
334  //**********************************************************************************************
335 
336  //**Columns function****************************************************************************
// Returns the number of columns of the expression, taken from the right-hand side operand.
341  inline size_t columns() const {
342  return rhs_.columns();
343  }
344  //**********************************************************************************************
345 
346  //**Left operand access*************************************************************************
// Returns the left-hand side operand (lhs_) of the multiplication expression.
351  inline LeftOperand leftOperand() const {
352  return lhs_;
353  }
354  //**********************************************************************************************
355 
356  //**Right operand access************************************************************************
// Returns the right-hand side operand (rhs_) of the multiplication expression.
361  inline RightOperand rightOperand() const {
362  return rhs_;
363  }
364  //**********************************************************************************************
365 
366  //**********************************************************************************************
// Returns true if either operand reports being aliased with the given address.
//   alias  address to check the operands against
// The test is the same one isAliased() performs.
372  template< typename T >
373  inline bool canAlias( const T* alias ) const {
374  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
375  }
376  //**********************************************************************************************
377 
378  //**********************************************************************************************
// Returns true if either operand reports being aliased with the given address.
//   alias  address to check the operands against
384  template< typename T >
385  inline bool isAliased( const T* alias ) const {
386  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
387  }
388  //**********************************************************************************************
389 
390  //**********************************************************************************************
// Returns true only if both operands report being aligned.
395  inline bool isAligned() const {
396  return lhs_.isAligned() && rhs_.isAligned();
397  }
398  //**********************************************************************************************
399 
400  //**********************************************************************************************
// Presumably reports whether the expression may take part in an SMP assignment (inferred
// from the name - confirm).
// NOTE(review): the return expression ends in a dangling '&&'; its last operand is missing
// from this listing and must be restored from the original header. The visible part is true
// when BLAZE_BLAS_IS_PARALLEL is zero or rows() * columns() is below TDMATDMATMULT_THRESHOLD.
405  inline bool canSMPAssign() const {
406  return ( !BLAZE_BLAS_IS_PARALLEL ||
407  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
409  }
410  //**********************************************************************************************
411 
412  private:
413  //**Member variables****************************************************************************
416  //**********************************************************************************************
417 
418  //**Assignment to dense matrices****************************************************************
// Assigns the matrix product held by the expression rhs to the dense matrix lhs.
//   lhs  target dense matrix; its dimensions must already match those of rhs
//   rhs  multiplication expression to evaluate
// NOTE(review): the types LT and RT used below are not declared in the visible part of this
// listing.
431  template< typename MT // Type of the target dense matrix
432  , bool SO > // Storage order of the target dense matrix
433  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
434  {
436 
437  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
438  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
439 
// An empty target needs no work; an empty inner dimension leaves only a reset of the target.
440  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
441  return;
442  }
443  else if( rhs.lhs_.columns() == 0UL ) {
444  reset( ~lhs );
445  return;
446  }
447 
448  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
449  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
450 
451  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
452  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
453  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
454  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
455  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
456  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
457 
// The kernels can rely on a non-empty target and a non-zero inner dimension from here on.
458  TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
459  }
461  //**********************************************************************************************
462 
463  //**Assignment to dense matrices (kernel selection)*********************************************
474  template< typename MT3 // Type of the left-hand side target matrix
475  , typename MT4 // Type of the left-hand side matrix operand
476  , typename MT5 > // Type of the right-hand side matrix operand
477  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
478  {
479  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
480  TDMatDMatMultExpr::selectDefaultAssignKernel( C, A, B );
481  else
482  TDMatDMatMultExpr::selectBlasAssignKernel( C, A, B );
483  }
485  //**********************************************************************************************
486 
487  //**Default assignment to dense matrices********************************************************
501  template< typename MT3 // Type of the left-hand side target matrix
502  , typename MT4 // Type of the left-hand side matrix operand
503  , typename MT5 > // Type of the right-hand side matrix operand
504  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
505  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
506  {
507  const size_t M( A.rows() );
508  const size_t N( B.columns() );
509  const size_t K( A.columns() );
510 
511  for( size_t i=0UL; i<M; ++i ) {
512  for( size_t j=0UL; j<N; ++j ) {
513  C(i,j) = A(i,0UL) * B(0UL,j);
514  }
515  for( size_t k=1UL; k<K; ++k ) {
516  for( size_t j=0UL; j<N; ++j ) {
517  C(i,j) += A(i,k) * B(k,j);
518  }
519  }
520  }
521  }
523  //**********************************************************************************************
524 
525  //**Vectorized default assignment to row-major dense matrices***********************************
// Vectorized default kernel for C = A * B with a row-major target (DenseMatrix<MT3,false>).
//   C  target matrix, written through store()
//   A  left-hand side matrix operand, read element-wise and passed through set()
//   B  right-hand side matrix operand, read through load()
// The columns of C are processed in blocks of 8, 4, 2 and finally 1 vector(s) of IT::size
// elements. In the 4-, 2- and 1-vector blocks the rows are handled two at a time, followed by
// a single-row remainder.
// NOTE(review): the accumulators are default-constructed and then read, so the kernel
// presumably relies on IntrinsicType default-constructing to zero - confirm.
// NOTE(review): every block is entered as soon as the start of its last vector lies below N,
// so a vector may extend past column N; presumably the matrices provide padded rows - confirm.
// NOTE(review): ElementType and IntrinsicType are not declared in the visible part of this
// listing.
539  template< typename MT3 // Type of the left-hand side target matrix
540  , typename MT4 // Type of the left-hand side matrix operand
541  , typename MT5 > // Type of the right-hand side matrix operand
542  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
543  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
544  {
545  typedef IntrinsicTrait<ElementType> IT;
546 
547  const size_t M( A.rows() );
548  const size_t N( B.columns() );
549  const size_t K( A.columns() );
550 
551  size_t j( 0UL );
552 
// Column blocks of 8 vectors, one row of C at a time.
553  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
554  for( size_t i=0UL; i<M; ++i ) {
555  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
556  for( size_t k=0UL; k<K; ++k ) {
557  const IntrinsicType a1( set( A(i,k) ) );
558  xmm1 = xmm1 + a1 * B.load(k,j );
559  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
560  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
561  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
562  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
563  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
564  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
565  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
566  }
567  (~C).store( i, j , xmm1 );
568  (~C).store( i, j+IT::size , xmm2 );
569  (~C).store( i, j+IT::size*2UL, xmm3 );
570  (~C).store( i, j+IT::size*3UL, xmm4 );
571  (~C).store( i, j+IT::size*4UL, xmm5 );
572  (~C).store( i, j+IT::size*5UL, xmm6 );
573  (~C).store( i, j+IT::size*6UL, xmm7 );
574  (~C).store( i, j+IT::size*7UL, xmm8 );
575  }
576  }
// Column blocks of 4 vectors, two rows of C at a time (xmm1-4: row i, xmm5-8: row i+1).
577  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
578  size_t i( 0UL );
579  for( ; (i+2UL) <= M; i+=2UL ) {
580  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
581  for( size_t k=0UL; k<K; ++k ) {
582  const IntrinsicType a1( set( A(i ,k) ) );
583  const IntrinsicType a2( set( A(i+1UL,k) ) );
584  const IntrinsicType b1( B.load(k,j ) );
585  const IntrinsicType b2( B.load(k,j+IT::size ) );
586  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
587  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
588  xmm1 = xmm1 + a1 * b1;
589  xmm2 = xmm2 + a1 * b2;
590  xmm3 = xmm3 + a1 * b3;
591  xmm4 = xmm4 + a1 * b4;
592  xmm5 = xmm5 + a2 * b1;
593  xmm6 = xmm6 + a2 * b2;
594  xmm7 = xmm7 + a2 * b3;
595  xmm8 = xmm8 + a2 * b4;
596  }
597  (~C).store( i , j , xmm1 );
598  (~C).store( i , j+IT::size , xmm2 );
599  (~C).store( i , j+IT::size*2UL, xmm3 );
600  (~C).store( i , j+IT::size*3UL, xmm4 );
601  (~C).store( i+1UL, j , xmm5 );
602  (~C).store( i+1UL, j+IT::size , xmm6 );
603  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
604  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
605  }
// Remaining single row when M is odd.
606  if( i < M ) {
607  IntrinsicType xmm1, xmm2, xmm3, xmm4;
608  for( size_t k=0UL; k<K; ++k ) {
609  const IntrinsicType a1( set( A(i,k) ) );
610  xmm1 = xmm1 + a1 * B.load(k,j );
611  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
612  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
613  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
614  }
615  (~C).store( i, j , xmm1 );
616  (~C).store( i, j+IT::size , xmm2 );
617  (~C).store( i, j+IT::size*2UL, xmm3 );
618  (~C).store( i, j+IT::size*3UL, xmm4 );
619  }
620  }
// Column blocks of 2 vectors, two rows of C at a time (xmm1-2: row i, xmm3-4: row i+1).
621  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
622  size_t i( 0UL );
623  for( ; (i+2UL) <= M; i+=2UL ) {
624  IntrinsicType xmm1, xmm2, xmm3, xmm4;
625  for( size_t k=0UL; k<K; ++k ) {
626  const IntrinsicType a1( set( A(i ,k) ) );
627  const IntrinsicType a2( set( A(i+1UL,k) ) );
628  const IntrinsicType b1( B.load(k,j ) );
629  const IntrinsicType b2( B.load(k,j+IT::size) );
630  xmm1 = xmm1 + a1 * b1;
631  xmm2 = xmm2 + a1 * b2;
632  xmm3 = xmm3 + a2 * b1;
633  xmm4 = xmm4 + a2 * b2;
634  }
635  (~C).store( i , j , xmm1 );
636  (~C).store( i , j+IT::size, xmm2 );
637  (~C).store( i+1UL, j , xmm3 );
638  (~C).store( i+1UL, j+IT::size, xmm4 );
639  }
// Remaining single row when M is odd.
640  if( i < M ) {
641  IntrinsicType xmm1, xmm2;
642  for( size_t k=0UL; k<K; ++k ) {
643  const IntrinsicType a1( set( A(i,k) ) );
644  xmm1 = xmm1 + a1 * B.load(k,j );
645  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
646  }
647  (~C).store( i, j , xmm1 );
648  (~C).store( i, j+IT::size, xmm2 );
649  }
650  }
// Final single-vector column block, again two rows at a time plus a single-row remainder.
651  if( j < N ) {
652  size_t i( 0UL );
653  for( ; (i+2UL) <= M; i+=2UL ) {
654  IntrinsicType xmm1, xmm2;
655  for( size_t k=0UL; k<K; ++k ) {
656  const IntrinsicType b1( B.load(k,j) );
657  xmm1 = xmm1 + set( A(i ,k) ) * b1;
658  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
659  }
660  (~C).store( i , j, xmm1 );
661  (~C).store( i+1UL, j, xmm2 );
662  }
663  if( i < M ) {
664  IntrinsicType xmm1;
665  for( size_t k=0UL; k<K; ++k ) {
666  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
667  }
668  (~C).store( i, j, xmm1 );
669  }
670  }
671  }
673  //**********************************************************************************************
674 
675  //**Vectorized default assignment to column-major dense matrices********************************
// Vectorized default kernel for C = A * B with a column-major target (DenseMatrix<MT3,true>).
//   C  target matrix, written through store()
//   A  left-hand side matrix operand, read through load()
//   B  right-hand side matrix operand, read element-wise and passed through set()
// Mirror image of the row-major kernel: the rows of C are processed in blocks of 8, 4, 2 and
// finally 1 vector(s) of IT::size elements. In the 4-, 2- and 1-vector blocks the columns are
// handled two at a time, followed by a single-column remainder.
// NOTE(review): the accumulators are default-constructed and then read, so the kernel
// presumably relies on IntrinsicType default-constructing to zero - confirm.
// NOTE(review): every block is entered as soon as the start of its last vector lies below M,
// so a vector may extend past row M; presumably the matrices provide padded columns - confirm.
// NOTE(review): ElementType and IntrinsicType are not declared in the visible part of this
// listing.
689  template< typename MT3 // Type of the left-hand side target matrix
690  , typename MT4 // Type of the left-hand side matrix operand
691  , typename MT5 > // Type of the right-hand side matrix operand
692  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
693  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
694  {
695  typedef IntrinsicTrait<ElementType> IT;
696 
697  const size_t M( A.rows() );
698  const size_t N( B.columns() );
699  const size_t K( A.columns() );
700 
701  size_t i( 0UL );
702 
// Row blocks of 8 vectors, one column of C at a time.
703  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
704  for( size_t j=0UL; j<N; ++j ) {
705  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
706  for( size_t k=0UL; k<K; ++k ) {
707  const IntrinsicType b1( set( B(k,j) ) );
708  xmm1 = xmm1 + A.load(i ,k) * b1;
709  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
710  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
711  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
712  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
713  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
714  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
715  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
716  }
717  (~C).store( i , j, xmm1 );
718  (~C).store( i+IT::size , j, xmm2 );
719  (~C).store( i+IT::size*2UL, j, xmm3 );
720  (~C).store( i+IT::size*3UL, j, xmm4 );
721  (~C).store( i+IT::size*4UL, j, xmm5 );
722  (~C).store( i+IT::size*5UL, j, xmm6 );
723  (~C).store( i+IT::size*6UL, j, xmm7 );
724  (~C).store( i+IT::size*7UL, j, xmm8 );
725  }
726  }
// Row blocks of 4 vectors, two columns of C at a time (xmm1-4: column j, xmm5-8: column j+1).
727  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
728  size_t j( 0UL );
729  for( ; (j+2UL) <= N; j+=2UL ) {
730  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
731  for( size_t k=0UL; k<K; ++k ) {
732  const IntrinsicType a1( A.load(i ,k) );
733  const IntrinsicType a2( A.load(i+IT::size ,k) );
734  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
735  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
736  const IntrinsicType b1( set( B(k,j ) ) );
737  const IntrinsicType b2( set( B(k,j+1UL) ) );
738  xmm1 = xmm1 + a1 * b1;
739  xmm2 = xmm2 + a2 * b1;
740  xmm3 = xmm3 + a3 * b1;
741  xmm4 = xmm4 + a4 * b1;
742  xmm5 = xmm5 + a1 * b2;
743  xmm6 = xmm6 + a2 * b2;
744  xmm7 = xmm7 + a3 * b2;
745  xmm8 = xmm8 + a4 * b2;
746  }
747  (~C).store( i , j , xmm1 );
748  (~C).store( i+IT::size , j , xmm2 );
749  (~C).store( i+IT::size*2UL, j , xmm3 );
750  (~C).store( i+IT::size*3UL, j , xmm4 );
751  (~C).store( i , j+1UL, xmm5 );
752  (~C).store( i+IT::size , j+1UL, xmm6 );
753  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
754  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
755  }
// Remaining single column when N is odd.
756  if( j < N ) {
757  IntrinsicType xmm1, xmm2, xmm3, xmm4;
758  for( size_t k=0UL; k<K; ++k ) {
759  const IntrinsicType b1( set( B(k,j) ) );
760  xmm1 = xmm1 + A.load(i ,k) * b1;
761  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
762  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
763  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
764  }
765  (~C).store( i , j, xmm1 );
766  (~C).store( i+IT::size , j, xmm2 );
767  (~C).store( i+IT::size*2UL, j, xmm3 );
768  (~C).store( i+IT::size*3UL, j, xmm4 );
769  }
770  }
// Row blocks of 2 vectors, two columns of C at a time (xmm1-2: column j, xmm3-4: column j+1).
771  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
772  size_t j( 0UL );
773  for( ; (j+2UL) <= N; j+=2UL ) {
774  IntrinsicType xmm1, xmm2, xmm3, xmm4;
775  for( size_t k=0UL; k<K; ++k ) {
776  const IntrinsicType a1( A.load(i ,k) );
777  const IntrinsicType a2( A.load(i+IT::size,k) );
778  const IntrinsicType b1( set( B(k,j ) ) );
779  const IntrinsicType b2( set( B(k,j+1UL) ) );
780  xmm1 = xmm1 + a1 * b1;
781  xmm2 = xmm2 + a2 * b1;
782  xmm3 = xmm3 + a1 * b2;
783  xmm4 = xmm4 + a2 * b2;
784  }
785  (~C).store( i , j , xmm1 );
786  (~C).store( i+IT::size, j , xmm2 );
787  (~C).store( i , j+1UL, xmm3 );
788  (~C).store( i+IT::size, j+1UL, xmm4 );
789  }
// Remaining single column when N is odd.
790  if( j < N ) {
791  IntrinsicType xmm1, xmm2;
792  for( size_t k=0UL; k<K; ++k ) {
793  const IntrinsicType b1( set( B(k,j) ) );
794  xmm1 = xmm1 + A.load(i ,k) * b1;
795  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
796  }
797  (~C).store( i , j, xmm1 );
798  (~C).store( i+IT::size, j, xmm2 );
799  }
800  }
// Final single-vector row block, again two columns at a time plus a single-column remainder.
801  if( i < M ) {
802  size_t j( 0UL );
803  for( ; (j+2UL) <= N; j+=2UL ) {
804  IntrinsicType xmm1, xmm2;
805  for( size_t k=0UL; k<K; ++k ) {
806  const IntrinsicType a1( A.load(i,k) );
807  xmm1 = xmm1 + a1 * set( B(k,j ) );
808  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
809  }
810  (~C).store( i, j , xmm1 );
811  (~C).store( i, j+1UL, xmm2 );
812  }
813  if( j < N ) {
814  IntrinsicType xmm1;
815  for( size_t k=0UL; k<K; ++k ) {
816  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
817  }
818  (~C).store( i, j, xmm1 );
819  }
820  }
821  }
823  //**********************************************************************************************
824 
825  //**BLAS-based assignment to dense matrices (default)*******************************************
// Fallback of the BLAS kernel selection: enabled when UseDefaultKernel<MT3,MT4,MT5> holds and
// simply forwards to selectDefaultAssignKernel.
//   C  target matrix
//   A  left-hand side matrix operand
//   B  right-hand side matrix operand
839  template< typename MT3 // Type of the left-hand side target matrix
840  , typename MT4 // Type of the left-hand side matrix operand
841  , typename MT5 > // Type of the right-hand side matrix operand
842  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
843  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
844  {
845  selectDefaultAssignKernel( C, A, B );
846  }
848  //**********************************************************************************************
849 
850  //**BLAS-based assignment to dense matrices (single precision)**********************************
851 #if BLAZE_BLAS_MODE
852 
// BLAS kernel for single precision operands: computes C = A * B through cblas_sgemm with
// alpha = 1.0F and beta = 0.0F. Enabled only when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
//   C  target matrix
//   A  left-hand side matrix operand
//   B  right-hand side matrix operand
865  template< typename MT3 // Type of the left-hand side target matrix
866  , typename MT4 // Type of the left-hand side matrix operand
867  , typename MT5 > // Type of the right-hand side matrix operand
868  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
869  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
870  {
871  using boost::numeric_cast;
872 
876 
// The CBLAS interface takes int arguments; the sizes and spacings are converted with
// boost::numeric_cast.
877  const int M ( numeric_cast<int>( A.rows() ) );
878  const int N ( numeric_cast<int>( B.columns() ) );
879  const int K ( numeric_cast<int>( A.columns() ) );
880  const int lda( numeric_cast<int>( A.spacing() ) );
881  const int ldb( numeric_cast<int>( B.spacing() ) );
882  const int ldc( numeric_cast<int>( C.spacing() ) );
883 
// The layout follows the storage order of C. For a row-major C the operand A is passed as
// transposed, for a column-major C the operand B is - presumably because A is stored
// column-major and B row-major (confirm).
884  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
885  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
886  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
887  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
888  }
890 #endif
891  //**********************************************************************************************
892 
893  //**BLAS-based assignment to dense matrices (double precision)**********************************
894 #if BLAZE_BLAS_MODE
895 
// BLAS kernel for double precision operands: computes C = A * B through cblas_dgemm with
// alpha = 1.0 and beta = 0.0. Enabled only when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
//   C  target matrix
//   A  left-hand side matrix operand
//   B  right-hand side matrix operand
908  template< typename MT3 // Type of the left-hand side target matrix
909  , typename MT4 // Type of the left-hand side matrix operand
910  , typename MT5 > // Type of the right-hand side matrix operand
911  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
912  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
913  {
914  using boost::numeric_cast;
915 
919 
// The CBLAS interface takes int arguments; the sizes and spacings are converted with
// boost::numeric_cast.
920  const int M ( numeric_cast<int>( A.rows() ) );
921  const int N ( numeric_cast<int>( B.columns() ) );
922  const int K ( numeric_cast<int>( A.columns() ) );
923  const int lda( numeric_cast<int>( A.spacing() ) );
924  const int ldb( numeric_cast<int>( B.spacing() ) );
925  const int ldc( numeric_cast<int>( C.spacing() ) );
926 
// The layout follows the storage order of C. For a row-major C the operand A is passed as
// transposed, for a column-major C the operand B is - presumably because A is stored
// column-major and B row-major (confirm).
927  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
928  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
929  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
930  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
931  }
933 #endif
934  //**********************************************************************************************
935 
936  //**BLAS-based assignment to dense matrices (single precision complex)**************************
937 #if BLAZE_BLAS_MODE
938 
// BLAS kernel for single precision complex operands: computes C = A * B through cblas_cgemm
// with alpha = (1,0) and beta = (0,0). Enabled only when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
//   C  target matrix
//   A  left-hand side matrix operand
//   B  right-hand side matrix operand
951  template< typename MT3 // Type of the left-hand side target matrix
952  , typename MT4 // Type of the left-hand side matrix operand
953  , typename MT5 > // Type of the right-hand side matrix operand
954  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
955  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
956  {
957  using boost::numeric_cast;
958 
// Compile-time checks on the value_type of the three complex element types.
962  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
963  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
964  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
965 
// The CBLAS interface takes int arguments; the sizes and spacings are converted with
// boost::numeric_cast.
966  const int M ( numeric_cast<int>( A.rows() ) );
967  const int N ( numeric_cast<int>( B.columns() ) );
968  const int K ( numeric_cast<int>( A.columns() ) );
969  const int lda( numeric_cast<int>( A.spacing() ) );
970  const int ldb( numeric_cast<int>( B.spacing() ) );
971  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to cblas_cgemm by address.
972  const complex<float> alpha( 1.0F, 0.0F );
973  const complex<float> beta ( 0.0F, 0.0F );
974 
// The layout follows the storage order of C. For a row-major C the operand A is passed as
// transposed, for a column-major C the operand B is - presumably because A is stored
// column-major and B row-major (confirm).
975  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
976  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
977  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
978  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
979  }
981 #endif
982  //**********************************************************************************************
983 
984  //**BLAS-based assignment to dense matrices (double precision complex)**************************
985 #if BLAZE_BLAS_MODE
986 
// BLAS kernel for double precision complex operands: computes C = A * B through cblas_zgemm
// with alpha = (1,0) and beta = (0,0). Enabled only when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
//   C  target matrix
//   A  left-hand side matrix operand
//   B  right-hand side matrix operand
999  template< typename MT3 // Type of the left-hand side target matrix
1000  , typename MT4 // Type of the left-hand side matrix operand
1001  , typename MT5 > // Type of the right-hand side matrix operand
1002  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1003  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1004  {
1005  using boost::numeric_cast;
1006 
// Compile-time checks on the value_type of the three complex element types.
1010  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1011  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1012  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1013 
// The CBLAS interface takes int arguments; the sizes and spacings are converted with
// boost::numeric_cast.
1014  const int M ( numeric_cast<int>( A.rows() ) );
1015  const int N ( numeric_cast<int>( B.columns() ) );
1016  const int K ( numeric_cast<int>( A.columns() ) );
1017  const int lda( numeric_cast<int>( A.spacing() ) );
1018  const int ldb( numeric_cast<int>( B.spacing() ) );
1019  const int ldc( numeric_cast<int>( C.spacing() ) );
// The complex scaling factors are handed to cblas_zgemm by address.
1020  const complex<double> alpha( 1.0, 0.0 );
1021  const complex<double> beta ( 0.0, 0.0 );
1022 
// The layout follows the storage order of C. For a row-major C the operand A is passed as
// transposed, for a column-major C the operand B is - presumably because A is stored
// column-major and B row-major (confirm).
1023  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1024  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1025  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1026  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1027  }
1029 #endif
1030  //**********************************************************************************************
1031 
1032  //**Assignment to sparse matrices***************************************************************
// Assigns the matrix product held by the expression rhs to the sparse matrix lhs.
//   lhs  target sparse matrix; its dimensions must already match those of rhs
//   rhs  multiplication expression to evaluate
// The product is first evaluated into a temporary of type TmpType, which is then assigned to
// the target.
// NOTE(review): ResultType and OppositeType are not declared in the visible part of this
// listing.
1044  template< typename MT // Type of the target sparse matrix
1045  , bool SO > // Storage order of the target sparse matrix
1046  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1047  {
1049 
// Temporary type chosen from ResultType and OppositeType according to the storage order SO
// of the target (presumably ResultType when SO is true - confirm against SelectType).
1050  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
1051 
1058 
1059  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1060  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1061 
1062  const TmpType tmp( serial( rhs ) );
1063  assign( ~lhs, tmp );
1064  }
1066  //**********************************************************************************************
1067 
1068  //**Addition assignment to dense matrices*******************************************************
1081  template< typename MT // Type of the target dense matrix
1082  , bool SO > // Storage order of the target dense matrix
1083  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1084  {
1086 
1087  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1088  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1089 
1090  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1091  return;
1092  }
1093 
1094  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1095  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1096 
1097  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1098  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1099  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1100  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1101  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1102  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1103 
1104  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1105  }
1107  //**********************************************************************************************
1108 
1109  //**Addition assignment to dense matrices (kernel selection)************************************
1120  template< typename MT3 // Type of the left-hand side target matrix
1121  , typename MT4 // Type of the left-hand side matrix operand
1122  , typename MT5 > // Type of the right-hand side matrix operand
1123  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1124  {
1125  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
1126  TDMatDMatMultExpr::selectDefaultAddAssignKernel( C, A, B );
1127  else
1128  TDMatDMatMultExpr::selectBlasAddAssignKernel( C, A, B );
1129  }
1131  //**********************************************************************************************
1132 
1133  //**Default addition assignment to dense matrices***********************************************
1147  template< typename MT3 // Type of the left-hand side target matrix
1148  , typename MT4 // Type of the left-hand side matrix operand
1149  , typename MT5 > // Type of the right-hand side matrix operand
1150  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1151  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1152  {
1153  const size_t M( A.rows() );
1154  const size_t N( B.columns() );
1155  const size_t K( A.columns() );
1156 
1157  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1158  const size_t end( N & size_t(-2) );
1159 
1160  for( size_t i=0UL; i<M; ++i ) {
1161  for( size_t k=0UL; k<K; ++k ) {
1162  for( size_t j=0UL; j<end; j+=2UL ) {
1163  C(i,j ) += A(i,k) * B(k,j );
1164  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1165  }
1166  if( end < N ) {
1167  C(i,end) += A(i,k) * B(k,end);
1168  }
1169  }
1170  }
1171  }
1173  //**********************************************************************************************
1174 
1175  //**Vectorized default addition assignment to row-major dense matrices**************************
// Vectorized default kernel for the addition assignment C += A * B to a row-major
// target matrix (DenseMatrix<MT3,false>).
//
// C  The target left-hand side dense matrix.
// A  The left-hand side multiplication operand.
// B  The right-hand side multiplication operand.
//
// This overload is only available if UseVectorizedDefaultKernel<MT3,MT4,MT5> applies
// (EnableIf). The columns of C are traversed in blocks of 8, 4, 2 and finally 1 intrinsic
// vector(s) of IT::size elements each. For every block the current values of C are loaded
// into local IntrinsicType variables, the products set( A(i,k) ) * B.load(k,...) are
// accumulated over all k, and the result is stored back to C.
//
// NOTE(review): the loop conditions only require the start of the last vector of a block
// to be smaller than N, so a vector may extend beyond column N-1. This presumably relies
// on padded storage of B and C - confirm.
1189  template< typename MT3 // Type of the left-hand side target matrix
1190  , typename MT4 // Type of the left-hand side matrix operand
1191  , typename MT5 > // Type of the right-hand side matrix operand
1192  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1193  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1194  {
1195  typedef IntrinsicTrait<ElementType> IT;
1196 
1197  const size_t M( A.rows() );
1198  const size_t N( B.columns() );
1199  const size_t K( A.columns() );
1200 
      // Column index; shared by all block loops below so that each loop continues
      // where the previous one stopped
1201  size_t j( 0UL );
1202 
      // Blocks of eight vectors, one row of C per iteration
1203  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1204  for( size_t i=0UL; i<M; ++i ) {
1205  IntrinsicType xmm1( (~C).load(i,j ) );
1206  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1207  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1208  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1209  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
1210  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
1211  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
1212  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
1213  for( size_t k=0UL; k<K; ++k ) {
1214  const IntrinsicType a1( set( A(i,k) ) );
1215  xmm1 = xmm1 + a1 * B.load(k,j );
1216  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1217  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1218  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1219  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1220  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1221  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1222  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1223  }
1224  (~C).store( i, j , xmm1 );
1225  (~C).store( i, j+IT::size , xmm2 );
1226  (~C).store( i, j+IT::size*2UL, xmm3 );
1227  (~C).store( i, j+IT::size*3UL, xmm4 );
1228  (~C).store( i, j+IT::size*4UL, xmm5 );
1229  (~C).store( i, j+IT::size*5UL, xmm6 );
1230  (~C).store( i, j+IT::size*6UL, xmm7 );
1231  (~C).store( i, j+IT::size*7UL, xmm8 );
1232  }
1233  }
      // Blocks of four vectors, two rows of C per iteration (xmm1-4: row i, xmm5-8: row i+1)
1234  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1235  size_t i( 0UL );
1236  for( ; (i+2UL) <= M; i+=2UL ) {
1237  IntrinsicType xmm1( (~C).load(i ,j ) );
1238  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
1239  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
1240  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
1241  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1242  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
1243  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
1244  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
1245  for( size_t k=0UL; k<K; ++k ) {
1246  const IntrinsicType a1( set( A(i ,k) ) );
1247  const IntrinsicType a2( set( A(i+1UL,k) ) );
1248  const IntrinsicType b1( B.load(k,j ) );
1249  const IntrinsicType b2( B.load(k,j+IT::size ) );
1250  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
1251  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
1252  xmm1 = xmm1 + a1 * b1;
1253  xmm2 = xmm2 + a1 * b2;
1254  xmm3 = xmm3 + a1 * b3;
1255  xmm4 = xmm4 + a1 * b4;
1256  xmm5 = xmm5 + a2 * b1;
1257  xmm6 = xmm6 + a2 * b2;
1258  xmm7 = xmm7 + a2 * b3;
1259  xmm8 = xmm8 + a2 * b4;
1260  }
1261  (~C).store( i , j , xmm1 );
1262  (~C).store( i , j+IT::size , xmm2 );
1263  (~C).store( i , j+IT::size*2UL, xmm3 );
1264  (~C).store( i , j+IT::size*3UL, xmm4 );
1265  (~C).store( i+1UL, j , xmm5 );
1266  (~C).store( i+1UL, j+IT::size , xmm6 );
1267  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
1268  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
1269  }
      // Single remaining row if M is odd
1270  if( i < M ) {
1271  IntrinsicType xmm1( (~C).load(i,j ) );
1272  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1273  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1274  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1275  for( size_t k=0UL; k<K; ++k ) {
1276  const IntrinsicType a1( set( A(i,k) ) );
1277  xmm1 = xmm1 + a1 * B.load(k,j );
1278  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1279  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1280  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1281  }
1282  (~C).store( i, j , xmm1 );
1283  (~C).store( i, j+IT::size , xmm2 );
1284  (~C).store( i, j+IT::size*2UL, xmm3 );
1285  (~C).store( i, j+IT::size*3UL, xmm4 );
1286  }
1287  }
      // Blocks of two vectors, two rows of C per iteration (xmm1-2: row i, xmm3-4: row i+1)
1288  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1289  size_t i( 0UL );
1290  for( ; (i+2UL) <= M; i+=2UL ) {
1291  IntrinsicType xmm1( (~C).load(i ,j ) );
1292  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
1293  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1294  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
1295  for( size_t k=0UL; k<K; ++k ) {
1296  const IntrinsicType a1( set( A(i ,k) ) );
1297  const IntrinsicType a2( set( A(i+1UL,k) ) );
1298  const IntrinsicType b1( B.load(k,j ) );
1299  const IntrinsicType b2( B.load(k,j+IT::size) );
1300  xmm1 = xmm1 + a1 * b1;
1301  xmm2 = xmm2 + a1 * b2;
1302  xmm3 = xmm3 + a2 * b1;
1303  xmm4 = xmm4 + a2 * b2;
1304  }
1305  (~C).store( i , j , xmm1 );
1306  (~C).store( i , j+IT::size, xmm2 );
1307  (~C).store( i+1UL, j , xmm3 );
1308  (~C).store( i+1UL, j+IT::size, xmm4 );
1309  }
      // Single remaining row if M is odd
1310  if( i < M ) {
1311  IntrinsicType xmm1( (~C).load(i,j ) );
1312  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
1313  for( size_t k=0UL; k<K; ++k ) {
1314  const IntrinsicType a1( set( A(i,k) ) );
1315  xmm1 = xmm1 + a1 * B.load(k,j );
1316  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
1317  }
1318  (~C).store( i, j , xmm1 );
1319  (~C).store( i, j+IT::size, xmm2 );
1320  }
1321  }
      // Final block of a single vector; after the loop above j+IT::size >= N holds,
      // so one vector starting at j covers all columns that are left
1322  if( j < N ) {
1323  size_t i( 0UL );
1324  for( ; (i+2UL) <= M; i+=2UL ) {
1325  IntrinsicType xmm1( (~C).load(i ,j) );
1326  IntrinsicType xmm2( (~C).load(i+1UL,j) );
1327  for( size_t k=0UL; k<K; ++k ) {
1328  const IntrinsicType b1( B.load(k,j) );
1329  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1330  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1331  }
1332  (~C).store( i , j, xmm1 );
1333  (~C).store( i+1UL, j, xmm2 );
1334  }
      // Single remaining row if M is odd
1335  if( i < M ) {
1336  IntrinsicType xmm1( (~C).load(i,j) );
1337  for( size_t k=0UL; k<K; ++k ) {
1338  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
1339  }
1340  (~C).store( i, j, xmm1 );
1341  }
1342  }
1343  }
1345  //**********************************************************************************************
1346 
1347  //**Vectorized default addition assignment to column-major dense matrices***********************
// Vectorized default kernel for the addition assignment C += A * B to a column-major
// target matrix (DenseMatrix<MT3,true>).
//
// C  The target left-hand side dense matrix.
// A  The left-hand side multiplication operand.
// B  The right-hand side multiplication operand.
//
// This overload is only available if UseVectorizedDefaultKernel<MT3,MT4,MT5> applies
// (EnableIf). The rows of C are traversed in blocks of 8, 4, 2 and finally 1 intrinsic
// vector(s) of IT::size elements each. For every block the current values of C are loaded
// into local IntrinsicType variables, the products A.load(...,k) * set( B(k,j) ) are
// accumulated over all k, and the result is stored back to C.
//
// NOTE(review): the loop conditions only require the start of the last vector of a block
// to be smaller than M, so a vector may extend beyond row M-1. This presumably relies
// on padded storage of A and C - confirm.
1361  template< typename MT3 // Type of the left-hand side target matrix
1362  , typename MT4 // Type of the left-hand side matrix operand
1363  , typename MT5 > // Type of the right-hand side matrix operand
1364  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1365  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1366  {
1367  typedef IntrinsicTrait<ElementType> IT;
1368 
1369  const size_t M( A.rows() );
1370  const size_t N( B.columns() );
1371  const size_t K( A.columns() );
1372 
      // Row index; shared by all block loops below so that each loop continues
      // where the previous one stopped
1373  size_t i( 0UL );
1374 
      // Blocks of eight vectors, one column of C per iteration
1375  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1376  for( size_t j=0UL; j<N; ++j ) {
1377  IntrinsicType xmm1( (~C).load(i ,j) );
1378  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
1379  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
1380  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
1381  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
1382  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
1383  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
1384  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
1385  for( size_t k=0UL; k<K; ++k ) {
1386  const IntrinsicType b1( set( B(k,j) ) );
1387  xmm1 = xmm1 + A.load(i ,k) * b1;
1388  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1389  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1390  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1391  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
1392  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
1393  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
1394  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
1395  }
1396  (~C).store( i , j, xmm1 );
1397  (~C).store( i+IT::size , j, xmm2 );
1398  (~C).store( i+IT::size*2UL, j, xmm3 );
1399  (~C).store( i+IT::size*3UL, j, xmm4 );
1400  (~C).store( i+IT::size*4UL, j, xmm5 );
1401  (~C).store( i+IT::size*5UL, j, xmm6 );
1402  (~C).store( i+IT::size*6UL, j, xmm7 );
1403  (~C).store( i+IT::size*7UL, j, xmm8 );
1404  }
1405  }
      // Blocks of four vectors, two columns of C per iteration (xmm1-4: column j, xmm5-8: column j+1)
1406  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1407  size_t j( 0UL );
1408  for( ; (j+2UL) <= N; j+=2UL ) {
1409  IntrinsicType xmm1( (~C).load(i ,j ) );
1410  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
1411  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
1412  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
1413  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
1414  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
1415  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
1416  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
1417  for( size_t k=0UL; k<K; ++k ) {
1418  const IntrinsicType a1( A.load(i ,k) );
1419  const IntrinsicType a2( A.load(i+IT::size ,k) );
1420  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
1421  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
1422  const IntrinsicType b1( set( B(k,j ) ) );
1423  const IntrinsicType b2( set( B(k,j+1UL) ) );
1424  xmm1 = xmm1 + a1 * b1;
1425  xmm2 = xmm2 + a2 * b1;
1426  xmm3 = xmm3 + a3 * b1;
1427  xmm4 = xmm4 + a4 * b1;
1428  xmm5 = xmm5 + a1 * b2;
1429  xmm6 = xmm6 + a2 * b2;
1430  xmm7 = xmm7 + a3 * b2;
1431  xmm8 = xmm8 + a4 * b2;
1432  }
1433  (~C).store( i , j , xmm1 );
1434  (~C).store( i+IT::size , j , xmm2 );
1435  (~C).store( i+IT::size*2UL, j , xmm3 );
1436  (~C).store( i+IT::size*3UL, j , xmm4 );
1437  (~C).store( i , j+1UL, xmm5 );
1438  (~C).store( i+IT::size , j+1UL, xmm6 );
1439  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
1440  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
1441  }
      // Single remaining column if N is odd
1442  if( j < N ) {
1443  IntrinsicType xmm1( (~C).load(i ,j) );
1444  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
1445  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
1446  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
1447  for( size_t k=0UL; k<K; ++k ) {
1448  const IntrinsicType b1( set( B(k,j) ) );
1449  xmm1 = xmm1 + A.load(i ,k) * b1;
1450  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1451  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1452  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1453  }
1454  (~C).store( i , j, xmm1 );
1455  (~C).store( i+IT::size , j, xmm2 );
1456  (~C).store( i+IT::size*2UL, j, xmm3 );
1457  (~C).store( i+IT::size*3UL, j, xmm4 );
1458  }
1459  }
      // Blocks of two vectors, two columns of C per iteration (xmm1-2: column j, xmm3-4: column j+1)
1460  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1461  size_t j( 0UL );
1462  for( ; (j+2UL) <= N; j+=2UL ) {
1463  IntrinsicType xmm1( (~C).load(i ,j ) );
1464  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
1465  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
1466  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
1467  for( size_t k=0UL; k<K; ++k ) {
1468  const IntrinsicType a1( A.load(i ,k) );
1469  const IntrinsicType a2( A.load(i+IT::size,k) );
1470  const IntrinsicType b1( set( B(k,j ) ) );
1471  const IntrinsicType b2( set( B(k,j+1UL) ) );
1472  xmm1 = xmm1 + a1 * b1;
1473  xmm2 = xmm2 + a2 * b1;
1474  xmm3 = xmm3 + a1 * b2;
1475  xmm4 = xmm4 + a2 * b2;
1476  }
1477  (~C).store( i , j , xmm1 );
1478  (~C).store( i+IT::size, j , xmm2 );
1479  (~C).store( i , j+1UL, xmm3 );
1480  (~C).store( i+IT::size, j+1UL, xmm4 );
1481  }
      // Single remaining column if N is odd
1482  if( j < N ) {
1483  IntrinsicType xmm1( (~C).load(i ,j) );
1484  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
1485  for( size_t k=0UL; k<K; ++k ) {
1486  const IntrinsicType b1( set( B(k,j) ) );
1487  xmm1 = xmm1 + A.load(i ,k) * b1;
1488  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
1489  }
1490  (~C).store( i , j, xmm1 );
1491  (~C).store( i+IT::size, j, xmm2 );
1492  }
1493  }
      // Final block of a single vector; after the loop above i+IT::size >= M holds,
      // so one vector starting at i covers all rows that are left
1494  if( i < M ) {
1495  size_t j( 0UL );
1496  for( ; (j+2UL) <= N; j+=2UL ) {
1497  IntrinsicType xmm1( (~C).load(i,j ) );
1498  IntrinsicType xmm2( (~C).load(i,j+1UL) );
1499  for( size_t k=0UL; k<K; ++k ) {
1500  const IntrinsicType a1( A.load(i,k) );
1501  xmm1 = xmm1 + a1 * set( B(k,j ) );
1502  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1503  }
1504  (~C).store( i, j , xmm1 );
1505  (~C).store( i, j+1UL, xmm2 );
1506  }
      // Single remaining column if N is odd
1507  if( j < N ) {
1508  IntrinsicType xmm1( (~C).load(i,j) );
1509  for( size_t k=0UL; k<K; ++k ) {
1510  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
1511  }
1512  (~C).store( i, j, xmm1 );
1513  }
1514  }
1515  }
1517  //**********************************************************************************************
1518 
1519  //**BLAS-based addition assignment to dense matrices (default)**********************************
1533  template< typename MT3 // Type of the left-hand side target matrix
1534  , typename MT4 // Type of the left-hand side matrix operand
1535  , typename MT5 > // Type of the right-hand side matrix operand
1536  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1537  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1538  {
1539  selectDefaultAddAssignKernel( C, A, B );
1540  }
1542  //**********************************************************************************************
1543 
1544  //**BLAS-based addition assignment to dense matrices (single precision)*************************
1545 #if BLAZE_BLAS_MODE
1546 
1559  template< typename MT3 // Type of the left-hand side target matrix
1560  , typename MT4 // Type of the left-hand side matrix operand
1561  , typename MT5 > // Type of the right-hand side matrix operand
1562  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1563  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1564  {
1565  using boost::numeric_cast;
1566 
1570 
1571  const int M ( numeric_cast<int>( A.rows() ) );
1572  const int N ( numeric_cast<int>( B.columns() ) );
1573  const int K ( numeric_cast<int>( A.columns() ) );
1574  const int lda( numeric_cast<int>( A.spacing() ) );
1575  const int ldb( numeric_cast<int>( B.spacing() ) );
1576  const int ldc( numeric_cast<int>( C.spacing() ) );
1577 
1578  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1579  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1580  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1581  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1582  }
1584 #endif
1585  //**********************************************************************************************
1586 
1587  //**BLAS-based addition assignment to dense matrices (double precision)*************************
1588 #if BLAZE_BLAS_MODE
1589 
1602  template< typename MT3 // Type of the left-hand side target matrix
1603  , typename MT4 // Type of the left-hand side matrix operand
1604  , typename MT5 > // Type of the right-hand side matrix operand
1605  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1606  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1607  {
1608  using boost::numeric_cast;
1609 
1613 
1614  const int M ( numeric_cast<int>( A.rows() ) );
1615  const int N ( numeric_cast<int>( B.columns() ) );
1616  const int K ( numeric_cast<int>( A.columns() ) );
1617  const int lda( numeric_cast<int>( A.spacing() ) );
1618  const int ldb( numeric_cast<int>( B.spacing() ) );
1619  const int ldc( numeric_cast<int>( C.spacing() ) );
1620 
1621  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1622  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1623  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1624  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1625  }
1627 #endif
1628  //**********************************************************************************************
1629 
1630  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
1631 #if BLAZE_BLAS_MODE
1632 
1645  template< typename MT3 // Type of the left-hand side target matrix
1646  , typename MT4 // Type of the left-hand side matrix operand
1647  , typename MT5 > // Type of the right-hand side matrix operand
1648  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1649  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1650  {
1651  using boost::numeric_cast;
1652 
1656  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1657  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1658  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1659 
1660  const int M ( numeric_cast<int>( A.rows() ) );
1661  const int N ( numeric_cast<int>( B.columns() ) );
1662  const int K ( numeric_cast<int>( A.columns() ) );
1663  const int lda( numeric_cast<int>( A.spacing() ) );
1664  const int ldb( numeric_cast<int>( B.spacing() ) );
1665  const int ldc( numeric_cast<int>( C.spacing() ) );
1666  const complex<float> alpha( 1.0F, 0.0F );
1667  const complex<float> beta ( 1.0F, 0.0F );
1668 
1669  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1670  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1671  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1672  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1673  }
1675 #endif
1676  //**********************************************************************************************
1677 
1678  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
1679 #if BLAZE_BLAS_MODE
1680 
1693  template< typename MT3 // Type of the left-hand side target matrix
1694  , typename MT4 // Type of the left-hand side matrix operand
1695  , typename MT5 > // Type of the right-hand side matrix operand
1696  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1697  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1698  {
1699  using boost::numeric_cast;
1700 
1704  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1705  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1706  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1707 
1708  const int M ( numeric_cast<int>( A.rows() ) );
1709  const int N ( numeric_cast<int>( B.columns() ) );
1710  const int K ( numeric_cast<int>( A.columns() ) );
1711  const int lda( numeric_cast<int>( A.spacing() ) );
1712  const int ldb( numeric_cast<int>( B.spacing() ) );
1713  const int ldc( numeric_cast<int>( C.spacing() ) );
1714  const complex<double> alpha( 1.0, 0.0 );
1715  const complex<double> beta ( 1.0, 0.0 );
1716 
1717  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1718  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1719  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1720  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1721  }
1723 #endif
1724  //**********************************************************************************************
1725 
1726  //**Addition assignment to sparse matrices******************************************************
1727  // No special implementation for the addition assignment to sparse matrices.
1728  //**********************************************************************************************
1729 
1730  //**Subtraction assignment to dense matrices****************************************************
1743  template< typename MT // Type of the target dense matrix
1744  , bool SO > // Storage order of the target dense matrix
1745  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1746  {
1748 
1749  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1750  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1751 
1752  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1753  return;
1754  }
1755 
1756  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1757  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1758 
1759  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1760  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1761  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1762  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1763  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1764  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1765 
1766  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
1767  }
1769  //**********************************************************************************************
1770 
1771  //**Subtraction assignment to dense matrices (kernel selection)*********************************
1782  template< typename MT3 // Type of the left-hand side target matrix
1783  , typename MT4 // Type of the left-hand side matrix operand
1784  , typename MT5 > // Type of the right-hand side matrix operand
1785  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1786  {
1787  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
1788  TDMatDMatMultExpr::selectDefaultSubAssignKernel( C, A, B );
1789  else
1790  TDMatDMatMultExpr::selectBlasSubAssignKernel( C, A, B );
1791  }
1793  //**********************************************************************************************
1794 
1795  //**Default subtraction assignment to dense matrices********************************************
1809  template< typename MT3 // Type of the left-hand side target matrix
1810  , typename MT4 // Type of the left-hand side matrix operand
1811  , typename MT5 > // Type of the right-hand side matrix operand
1812  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1813  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1814  {
1815  const size_t M( A.rows() );
1816  const size_t N( B.columns() );
1817  const size_t K( A.columns() );
1818 
1819  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1820  const size_t end( N & size_t(-2) );
1821 
1822  for( size_t i=0UL; i<M; ++i ) {
1823  for( size_t k=0UL; k<K; ++k ) {
1824  for( size_t j=0UL; j<end; j+=2UL ) {
1825  C(i,j ) -= A(i,k) * B(k,j );
1826  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
1827  }
1828  if( end < N ) {
1829  C(i,end) -= A(i,k) * B(k,end);
1830  }
1831  }
1832  }
1833  }
1835  //**********************************************************************************************
1836 
1837  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default kernel for the subtraction assignment C -= A * B to a row-major
// target matrix (DenseMatrix<MT3,false>).
//
// C  The target left-hand side dense matrix.
// A  The left-hand side multiplication operand.
// B  The right-hand side multiplication operand.
//
// This overload is only available if UseVectorizedDefaultKernel<MT3,MT4,MT5> applies
// (EnableIf). The columns of C are traversed in blocks of 8, 4, 2 and finally 1 intrinsic
// vector(s) of IT::size elements each. For every block the current values of C are loaded
// into local IntrinsicType variables, the products set( A(i,k) ) * B.load(k,...) are
// subtracted for all k, and the result is stored back to C.
//
// NOTE(review): the loop conditions only require the start of the last vector of a block
// to be smaller than N, so a vector may extend beyond column N-1. This presumably relies
// on padded storage of B and C - confirm.
1851  template< typename MT3 // Type of the left-hand side target matrix
1852  , typename MT4 // Type of the left-hand side matrix operand
1853  , typename MT5 > // Type of the right-hand side matrix operand
1854  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1855  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1856  {
1857  typedef IntrinsicTrait<ElementType> IT;
1858 
1859  const size_t M( A.rows() );
1860  const size_t N( B.columns() );
1861  const size_t K( A.columns() );
1862 
      // Column index; shared by all block loops below so that each loop continues
      // where the previous one stopped
1863  size_t j( 0UL );
1864 
      // Blocks of eight vectors, one row of C per iteration
1865  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1866  for( size_t i=0UL; i<M; ++i ) {
1867  IntrinsicType xmm1( (~C).load(i,j ) );
1868  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1869  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1870  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1871  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
1872  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
1873  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
1874  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
1875  for( size_t k=0UL; k<K; ++k ) {
1876  const IntrinsicType a1( set( A(i,k) ) );
1877  xmm1 = xmm1 - a1 * B.load(k,j );
1878  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1879  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1880  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1881  xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
1882  xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
1883  xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
1884  xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
1885  }
1886  (~C).store( i, j , xmm1 );
1887  (~C).store( i, j+IT::size , xmm2 );
1888  (~C).store( i, j+IT::size*2UL, xmm3 );
1889  (~C).store( i, j+IT::size*3UL, xmm4 );
1890  (~C).store( i, j+IT::size*4UL, xmm5 );
1891  (~C).store( i, j+IT::size*5UL, xmm6 );
1892  (~C).store( i, j+IT::size*6UL, xmm7 );
1893  (~C).store( i, j+IT::size*7UL, xmm8 );
1894  }
1895  }
      // Blocks of four vectors, two rows of C per iteration (xmm1-4: row i, xmm5-8: row i+1)
1896  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1897  size_t i( 0UL );
1898  for( ; (i+2UL) <= M; i+=2UL ) {
1899  IntrinsicType xmm1( (~C).load(i ,j ) );
1900  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
1901  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
1902  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
1903  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1904  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
1905  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
1906  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
1907  for( size_t k=0UL; k<K; ++k ) {
1908  const IntrinsicType a1( set( A(i ,k) ) );
1909  const IntrinsicType a2( set( A(i+1UL,k) ) );
1910  const IntrinsicType b1( B.load(k,j ) );
1911  const IntrinsicType b2( B.load(k,j+IT::size ) );
1912  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
1913  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
1914  xmm1 = xmm1 - a1 * b1;
1915  xmm2 = xmm2 - a1 * b2;
1916  xmm3 = xmm3 - a1 * b3;
1917  xmm4 = xmm4 - a1 * b4;
1918  xmm5 = xmm5 - a2 * b1;
1919  xmm6 = xmm6 - a2 * b2;
1920  xmm7 = xmm7 - a2 * b3;
1921  xmm8 = xmm8 - a2 * b4;
1922  }
1923  (~C).store( i , j , xmm1 );
1924  (~C).store( i , j+IT::size , xmm2 );
1925  (~C).store( i , j+IT::size*2UL, xmm3 );
1926  (~C).store( i , j+IT::size*3UL, xmm4 );
1927  (~C).store( i+1UL, j , xmm5 );
1928  (~C).store( i+1UL, j+IT::size , xmm6 );
1929  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
1930  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
1931  }
      // Single remaining row if M is odd
1932  if( i < M ) {
1933  IntrinsicType xmm1( (~C).load(i,j ) );
1934  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1935  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1936  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1937  for( size_t k=0UL; k<K; ++k ) {
1938  const IntrinsicType a1( set( A(i,k) ) );
1939  xmm1 = xmm1 - a1 * B.load(k,j );
1940  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1941  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1942  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1943  }
1944  (~C).store( i, j , xmm1 );
1945  (~C).store( i, j+IT::size , xmm2 );
1946  (~C).store( i, j+IT::size*2UL, xmm3 );
1947  (~C).store( i, j+IT::size*3UL, xmm4 );
1948  }
1949  }
      // Blocks of two vectors, two rows of C per iteration (xmm1-2: row i, xmm3-4: row i+1)
1950  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1951  size_t i( 0UL );
1952  for( ; (i+2UL) <= M; i+=2UL ) {
1953  IntrinsicType xmm1( (~C).load(i ,j ) );
1954  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
1955  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1956  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
1957  for( size_t k=0UL; k<K; ++k ) {
1958  const IntrinsicType a1( set( A(i ,k) ) );
1959  const IntrinsicType a2( set( A(i+1UL,k) ) );
1960  const IntrinsicType b1( B.load(k,j ) );
1961  const IntrinsicType b2( B.load(k,j+IT::size) );
1962  xmm1 = xmm1 - a1 * b1;
1963  xmm2 = xmm2 - a1 * b2;
1964  xmm3 = xmm3 - a2 * b1;
1965  xmm4 = xmm4 - a2 * b2;
1966  }
1967  (~C).store( i , j , xmm1 );
1968  (~C).store( i , j+IT::size, xmm2 );
1969  (~C).store( i+1UL, j , xmm3 );
1970  (~C).store( i+1UL, j+IT::size, xmm4 );
1971  }
      // Single remaining row if M is odd
1972  if( i < M ) {
1973  IntrinsicType xmm1( (~C).load(i,j ) );
1974  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
1975  for( size_t k=0UL; k<K; ++k ) {
1976  const IntrinsicType a1( set( A(i,k) ) );
1977  xmm1 = xmm1 - a1 * B.load(k,j );
1978  xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
1979  }
1980  (~C).store( i, j , xmm1 );
1981  (~C).store( i, j+IT::size, xmm2 );
1982  }
1983  }
      // Final block of a single vector; after the loop above j+IT::size >= N holds,
      // so one vector starting at j covers all columns that are left
1984  if( j < N ) {
1985  size_t i( 0UL );
1986  for( ; (i+2UL) <= M; i+=2UL ) {
1987  IntrinsicType xmm1( (~C).load(i ,j) );
1988  IntrinsicType xmm2( (~C).load(i+1UL,j) );
1989  for( size_t k=0UL; k<K; ++k ) {
1990  const IntrinsicType b1( B.load(k,j) );
1991  xmm1 = xmm1 - set( A(i ,k) ) * b1;
1992  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
1993  }
1994  (~C).store( i , j, xmm1 );
1995  (~C).store( i+1UL, j, xmm2 );
1996  }
      // Single remaining row if M is odd
1997  if( i < M ) {
1998  IntrinsicType xmm1( (~C).load(i,j) );
1999  for( size_t k=0UL; k<K; ++k ) {
2000  xmm1 = xmm1 - set( A(i,k) ) * B.load(k,j);
2001  }
2002  (~C).store( i, j, xmm1 );
2003  }
2004  }
2005  }
2007  //**********************************************************************************************
2008 
2009  //**Vectorized default subtraction assignment to column-major dense matrices********************
/*!\brief Vectorized default subtraction assignment kernel for column-major targets (C -= A*B).
//
// \param C The target left-hand side dense matrix (second DenseMatrix template argument is true).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload only takes part in overload resolution if UseVectorizedDefaultKernel<MT3,MT4,MT5>
// is fulfilled. The rows of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic
// vector(s) of IT::size elements each. Every block of C is loaded once, updated over the complete
// inner dimension K and then stored back.
//
// NOTE(review): the block loops test e.g. (i+IT::size*7UL) < M but load/store complete intrinsic
// vectors starting at that offset, so the last vector may reach past row M-1. This presumably
// relies on padded storage of A and C - confirm against the matrix classes.
*/
2023  template< typename MT3 // Type of the left-hand side target matrix
2024  , typename MT4 // Type of the left-hand side matrix operand
2025  , typename MT5 > // Type of the right-hand side matrix operand
2026  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2027  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2028  {
2029  typedef IntrinsicTrait<ElementType> IT;
2030 
2031  const size_t M( A.rows() );
2032  const size_t N( B.columns() );
2033  const size_t K( A.columns() );
2034 
// Row index shared by all block loops below; each loop continues where the previous one stopped.
2035  size_t i( 0UL );
2036 
// Blocks of 8 intrinsic vectors along the rows, one column of C per iteration.
2037  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2038  for( size_t j=0UL; j<N; ++j ) {
2039  IntrinsicType xmm1( (~C).load(i ,j) );
2040  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
2041  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
2042  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
2043  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
2044  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
2045  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
2046  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
2047  for( size_t k=0UL; k<K; ++k ) {
// set() wraps the scalar B(k,j) in an IntrinsicType (presumably a broadcast to all lanes).
2048  const IntrinsicType b1( set( B(k,j) ) );
2049  xmm1 = xmm1 - A.load(i ,k) * b1;
2050  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
2051  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
2052  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
2053  xmm5 = xmm5 - A.load(i+IT::size*4UL,k) * b1;
2054  xmm6 = xmm6 - A.load(i+IT::size*5UL,k) * b1;
2055  xmm7 = xmm7 - A.load(i+IT::size*6UL,k) * b1;
2056  xmm8 = xmm8 - A.load(i+IT::size*7UL,k) * b1;
2057  }
2058  (~C).store( i , j, xmm1 );
2059  (~C).store( i+IT::size , j, xmm2 );
2060  (~C).store( i+IT::size*2UL, j, xmm3 );
2061  (~C).store( i+IT::size*3UL, j, xmm4 );
2062  (~C).store( i+IT::size*4UL, j, xmm5 );
2063  (~C).store( i+IT::size*5UL, j, xmm6 );
2064  (~C).store( i+IT::size*6UL, j, xmm7 );
2065  (~C).store( i+IT::size*7UL, j, xmm8 );
2066  }
2067  }
// Blocks of 4 intrinsic vectors along the rows, two columns of C per iteration
// (xmm1-xmm4 hold column j, xmm5-xmm8 hold column j+1).
2068  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2069  size_t j( 0UL );
2070  for( ; (j+2UL) <= N; j+=2UL ) {
2071  IntrinsicType xmm1( (~C).load(i ,j ) );
2072  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
2073  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
2074  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
2075  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
2076  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
2077  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
2078  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
2079  for( size_t k=0UL; k<K; ++k ) {
2080  const IntrinsicType a1( A.load(i ,k) );
2081  const IntrinsicType a2( A.load(i+IT::size ,k) );
2082  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
2083  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
2084  const IntrinsicType b1( set( B(k,j ) ) );
2085  const IntrinsicType b2( set( B(k,j+1UL) ) );
2086  xmm1 = xmm1 - a1 * b1;
2087  xmm2 = xmm2 - a2 * b1;
2088  xmm3 = xmm3 - a3 * b1;
2089  xmm4 = xmm4 - a4 * b1;
2090  xmm5 = xmm5 - a1 * b2;
2091  xmm6 = xmm6 - a2 * b2;
2092  xmm7 = xmm7 - a3 * b2;
2093  xmm8 = xmm8 - a4 * b2;
2094  }
2095  (~C).store( i , j , xmm1 );
2096  (~C).store( i+IT::size , j , xmm2 );
2097  (~C).store( i+IT::size*2UL, j , xmm3 );
2098  (~C).store( i+IT::size*3UL, j , xmm4 );
2099  (~C).store( i , j+1UL, xmm5 );
2100  (~C).store( i+IT::size , j+1UL, xmm6 );
2101  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
2102  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
2103  }
// Remaining single column in case N is odd.
2104  if( j < N ) {
2105  IntrinsicType xmm1( (~C).load(i ,j) );
2106  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
2107  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
2108  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
2109  for( size_t k=0UL; k<K; ++k ) {
2110  const IntrinsicType b1( set( B(k,j) ) );
2111  xmm1 = xmm1 - A.load(i ,k) * b1;
2112  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
2113  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
2114  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
2115  }
2116  (~C).store( i , j, xmm1 );
2117  (~C).store( i+IT::size , j, xmm2 );
2118  (~C).store( i+IT::size*2UL, j, xmm3 );
2119  (~C).store( i+IT::size*3UL, j, xmm4 );
2120  }
2121  }
// Blocks of 2 intrinsic vectors along the rows, two columns of C per iteration
// (xmm1/xmm2 hold column j, xmm3/xmm4 hold column j+1).
2122  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2123  size_t j( 0UL );
2124  for( ; (j+2UL) <= N; j+=2UL ) {
2125  IntrinsicType xmm1( (~C).load(i ,j ) );
2126  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
2127  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2128  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
2129  for( size_t k=0UL; k<K; ++k ) {
2130  const IntrinsicType a1( A.load(i ,k) );
2131  const IntrinsicType a2( A.load(i+IT::size,k) );
2132  const IntrinsicType b1( set( B(k,j ) ) );
2133  const IntrinsicType b2( set( B(k,j+1UL) ) );
2134  xmm1 = xmm1 - a1 * b1;
2135  xmm2 = xmm2 - a2 * b1;
2136  xmm3 = xmm3 - a1 * b2;
2137  xmm4 = xmm4 - a2 * b2;
2138  }
2139  (~C).store( i , j , xmm1 );
2140  (~C).store( i+IT::size, j , xmm2 );
2141  (~C).store( i , j+1UL, xmm3 );
2142  (~C).store( i+IT::size, j+1UL, xmm4 );
2143  }
// Remaining single column in case N is odd.
2144  if( j < N ) {
2145  IntrinsicType xmm1( (~C).load(i ,j) );
2146  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
2147  for( size_t k=0UL; k<K; ++k ) {
2148  const IntrinsicType b1( set( B(k,j) ) );
2149  xmm1 = xmm1 - A.load(i ,k) * b1;
2150  xmm2 = xmm2 - A.load(i+IT::size,k) * b1;
2151  }
2152  (~C).store( i , j, xmm1 );
2153  (~C).store( i+IT::size, j, xmm2 );
2154  }
2155  }
// Final single intrinsic vector of rows (at most one is left after the loops above).
2156  if( i < M ) {
2157  size_t j( 0UL );
2158  for( ; (j+2UL) <= N; j+=2UL ) {
2159  IntrinsicType xmm1( (~C).load(i,j ) );
2160  IntrinsicType xmm2( (~C).load(i,j+1UL) );
2161  for( size_t k=0UL; k<K; ++k ) {
2162  const IntrinsicType a1( A.load(i,k) );
2163  xmm1 = xmm1 - a1 * set( B(k,j ) );
2164  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
2165  }
2166  (~C).store( i, j , xmm1 );
2167  (~C).store( i, j+1UL, xmm2 );
2168  }
// Remaining single column in case N is odd.
2169  if( j < N ) {
2170  IntrinsicType xmm1( (~C).load(i,j) );
2171  for( size_t k=0UL; k<K; ++k ) {
2172  xmm1 = xmm1 - A.load(i,k) * set( B(k,j) );
2173  }
2174  (~C).store( i, j, xmm1 );
2175  }
2176  }
2177  }
2179  //**********************************************************************************************
2180 
2181  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
2195  template< typename MT3 // Type of the left-hand side target matrix
2196  , typename MT4 // Type of the left-hand side matrix operand
2197  , typename MT5 > // Type of the right-hand side matrix operand
2198  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2199  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2200  {
2201  selectDefaultSubAssignKernel( C, A, B );
2202  }
2204  //**********************************************************************************************
2205 
2206  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
2207 #if BLAZE_BLAS_MODE
2208 
2221  template< typename MT3 // Type of the left-hand side target matrix
2222  , typename MT4 // Type of the left-hand side matrix operand
2223  , typename MT5 > // Type of the right-hand side matrix operand
2224  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2225  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2226  {
2227  using boost::numeric_cast;
2228 
2232 
2233  const int M ( numeric_cast<int>( A.rows() ) );
2234  const int N ( numeric_cast<int>( B.columns() ) );
2235  const int K ( numeric_cast<int>( A.columns() ) );
2236  const int lda( numeric_cast<int>( A.spacing() ) );
2237  const int ldb( numeric_cast<int>( B.spacing() ) );
2238  const int ldc( numeric_cast<int>( C.spacing() ) );
2239 
2240  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2241  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2242  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2243  M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
2244  }
2246 #endif
2247  //**********************************************************************************************
2248 
2249  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
2250 #if BLAZE_BLAS_MODE
2251 
2264  template< typename MT3 // Type of the left-hand side target matrix
2265  , typename MT4 // Type of the left-hand side matrix operand
2266  , typename MT5 > // Type of the right-hand side matrix operand
2267  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2268  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2269  {
2270  using boost::numeric_cast;
2271 
2275 
2276  const int M ( numeric_cast<int>( A.rows() ) );
2277  const int N ( numeric_cast<int>( B.columns() ) );
2278  const int K ( numeric_cast<int>( A.columns() ) );
2279  const int lda( numeric_cast<int>( A.spacing() ) );
2280  const int ldb( numeric_cast<int>( B.spacing() ) );
2281  const int ldc( numeric_cast<int>( C.spacing() ) );
2282 
2283  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2284  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2285  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2286  M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
2287  }
2289 #endif
2290  //**********************************************************************************************
2291 
2292  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
2293 #if BLAZE_BLAS_MODE
2294 
2307  template< typename MT3 // Type of the left-hand side target matrix
2308  , typename MT4 // Type of the left-hand side matrix operand
2309  , typename MT5 > // Type of the right-hand side matrix operand
2310  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2311  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2312  {
2313  using boost::numeric_cast;
2314 
2318  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
2319  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
2320  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
2321 
2322  const int M ( numeric_cast<int>( A.rows() ) );
2323  const int N ( numeric_cast<int>( B.columns() ) );
2324  const int K ( numeric_cast<int>( A.columns() ) );
2325  const int lda( numeric_cast<int>( A.spacing() ) );
2326  const int ldb( numeric_cast<int>( B.spacing() ) );
2327  const int ldc( numeric_cast<int>( C.spacing() ) );
2328  const complex<float> alpha( -1.0F, 0.0F );
2329  const complex<float> beta ( 1.0F, 0.0F );
2330 
2331  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2332  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2333  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2334  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2335  }
2337 #endif
2338  //**********************************************************************************************
2339 
2340  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
2341 #if BLAZE_BLAS_MODE
2342 
2355  template< typename MT3 // Type of the left-hand side target matrix
2356  , typename MT4 // Type of the left-hand side matrix operand
2357  , typename MT5 > // Type of the right-hand side matrix operand
2358  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2359  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2360  {
2361  using boost::numeric_cast;
2362 
2366  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
2367  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
2368  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
2369 
2370  const int M ( numeric_cast<int>( A.rows() ) );
2371  const int N ( numeric_cast<int>( B.columns() ) );
2372  const int K ( numeric_cast<int>( A.columns() ) );
2373  const int lda( numeric_cast<int>( A.spacing() ) );
2374  const int ldb( numeric_cast<int>( B.spacing() ) );
2375  const int ldc( numeric_cast<int>( C.spacing() ) );
2376  const complex<double> alpha( -1.0, 0.0 );
2377  const complex<double> beta ( 1.0, 0.0 );
2378 
2379  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2380  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2381  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2382  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2383  }
2385 #endif
2386  //**********************************************************************************************
2387 
2388  //**Subtraction assignment to sparse matrices***************************************************
2389  // No special implementation for the subtraction assignment to sparse matrices.
2390  //**********************************************************************************************
2391 
2392  //**Multiplication assignment to dense matrices*************************************************
2393  // No special implementation for the multiplication assignment to dense matrices.
2394  //**********************************************************************************************
2395 
2396  //**Multiplication assignment to sparse matrices************************************************
2397  // No special implementation for the multiplication assignment to sparse matrices.
2398  //**********************************************************************************************
2399 
2400  //**SMP assignment to dense matrices************************************************************
2415  template< typename MT // Type of the target dense matrix
2416  , bool SO > // Storage order of the target dense matrix
2417  friend inline typename EnableIf< UseSMPAssign<MT> >::Type
2418  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2419  {
2421 
2422  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2423  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2424 
2425  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
2426  return;
2427  }
2428  else if( rhs.lhs_.columns() == 0UL ) {
2429  reset( ~lhs );
2430  return;
2431  }
2432 
2433  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
2434  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
2435 
2436  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2437  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2438  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2439  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2440  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2441  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2442 
2443  smpAssign( ~lhs, A * B );
2444  }
2446  //**********************************************************************************************
2447 
2448  //**SMP assignment to sparse matrices***********************************************************
2462  template< typename MT // Type of the target sparse matrix
2463  , bool SO > // Storage order of the target sparse matrix
2464  friend inline typename EnableIf< UseSMPAssign<MT> >::Type
2465  smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2466  {
2468 
2469  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
2470 
2477 
2478  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2479  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2480 
2481  const TmpType tmp( rhs );
2482  smpAssign( ~lhs, tmp );
2483  }
2485  //**********************************************************************************************
2486 
2487  //**SMP addition assignment to dense matrices***************************************************
2502  template< typename MT // Type of the target dense matrix
2503  , bool SO > // Storage order of the target dense matrix
2504  friend inline typename EnableIf< UseSMPAssign<MT> >::Type
2505  smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2506  {
2508 
2509  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2510  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2511 
2512  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2513  return;
2514  }
2515 
2516  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
2517  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
2518 
2519  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2520  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2521  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2522  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2523  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2524  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2525 
2526  smpAddAssign( ~lhs, A * B );
2527  }
2529  //**********************************************************************************************
2530 
2531  //**SMP addition assignment to sparse matrices**************************************************
2532  // No special implementation for the SMP addition assignment to sparse matrices.
2533  //**********************************************************************************************
2534 
2535  //**SMP subtraction assignment to dense matrices************************************************
2550  template< typename MT // Type of the target dense matrix
2551  , bool SO > // Storage order of the target dense matrix
2552  friend inline typename EnableIf< UseSMPAssign<MT> >::Type
2553  smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2554  {
2556 
2557  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2558  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2559 
2560  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2561  return;
2562  }
2563 
2564  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
2565  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
2566 
2567  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2568  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2569  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2570  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2571  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2572  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2573 
2574  smpSubAssign( ~lhs, A * B );
2575  }
2577  //**********************************************************************************************
2578 
2579  //**SMP subtraction assignment to sparse matrices***********************************************
2580  // No special implementation for the SMP subtraction assignment to sparse matrices.
2581  //**********************************************************************************************
2582 
2583  //**SMP multiplication assignment to dense matrices*********************************************
2584  // No special implementation for the SMP multiplication assignment to dense matrices.
2585  //**********************************************************************************************
2586 
2587  //**SMP multiplication assignment to sparse matrices********************************************
2588  // No special implementation for the SMP multiplication assignment to sparse matrices.
2589  //**********************************************************************************************
2590 
2591  //**Compile time checks*************************************************************************
2598  //**********************************************************************************************
2599 };
2600 //*************************************************************************************************
2601 
2602 
2603 
2604 
2605 //=================================================================================================
2606 //
2607 // DMATSCALARMULTEXPR SPECIALIZATION
2608 //
2609 //=================================================================================================
2610 
2611 //*************************************************************************************************
2619 template< typename MT1 // Type of the left-hand side dense matrix
2620  , typename MT2 // Type of the right-hand side dense matrix
2621  , typename ST > // Type of the right-hand side scalar value
2622 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >
2623  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2624  , private MatScalarMultExpr
2625  , private Computation
2626 {
2627  private:
2628  //**Type definitions****************************************************************************
2629  typedef TDMatDMatMultExpr<MT1,MT2> MMM;
2630  typedef typename MMM::ResultType RES;
2631  typedef typename MT1::ResultType RT1;
2632  typedef typename MT2::ResultType RT2;
2633  typedef typename RT1::ElementType ET1;
2634  typedef typename RT2::ElementType ET2;
2635  typedef typename MT1::CompositeType CT1;
2636  typedef typename MT2::CompositeType CT2;
2637  //**********************************************************************************************
2638 
2639  //**********************************************************************************************
2641  enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
2642  //**********************************************************************************************
2643 
2644  //**********************************************************************************************
2646  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
2647  //**********************************************************************************************
2648 
2649  //**********************************************************************************************
2651 
2654  template< typename MT >
2655  struct UseSMPAssign {
2656  enum { value = ( evaluateLeft || evaluateRight ) };
2657  };
2658  //**********************************************************************************************
2659 
2660  //**********************************************************************************************
2662 
2665  template< typename T1, typename T2, typename T3, typename T4 >
2666  struct UseSinglePrecisionKernel {
2667  enum { value = IsFloat<typename T1::ElementType>::value &&
2668  IsFloat<typename T2::ElementType>::value &&
2669  IsFloat<typename T3::ElementType>::value &&
2670  !IsComplex<T4>::value };
2671  };
2672  //**********************************************************************************************
2673 
2674  //**********************************************************************************************
2676 
2679  template< typename T1, typename T2, typename T3, typename T4 >
2680  struct UseDoublePrecisionKernel {
2681  enum { value = IsDouble<typename T1::ElementType>::value &&
2682  IsDouble<typename T2::ElementType>::value &&
2683  IsDouble<typename T3::ElementType>::value &&
2684  !IsComplex<T4>::value };
2685  };
2686  //**********************************************************************************************
2687 
2688  //**********************************************************************************************
2690 
2693  template< typename T1, typename T2, typename T3 >
2694  struct UseSinglePrecisionComplexKernel {
2695  typedef complex<float> Type;
2696  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2697  IsSame<typename T2::ElementType,Type>::value &&
2698  IsSame<typename T3::ElementType,Type>::value };
2699  };
2700  //**********************************************************************************************
2701 
2702  //**********************************************************************************************
2704 
2707  template< typename T1, typename T2, typename T3 >
2708  struct UseDoublePrecisionComplexKernel {
2709  typedef complex<double> Type;
2710  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2711  IsSame<typename T2::ElementType,Type>::value &&
2712  IsSame<typename T3::ElementType,Type>::value };
2713  };
2714  //**********************************************************************************************
2715 
2716  //**********************************************************************************************
2718 
2720  template< typename T1, typename T2, typename T3, typename T4 >
2721  struct UseDefaultKernel {
2722  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2723  !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2724  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2725  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
2726  };
2727  //**********************************************************************************************
2728 
2729  //**********************************************************************************************
2731 
2733  template< typename T1, typename T2, typename T3, typename T4 >
2734  struct UseVectorizedDefaultKernel {
2735  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2736  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2737  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2738  IsSame<typename T1::ElementType,T4>::value &&
2739  IntrinsicTrait<typename T1::ElementType>::addition &&
2740  IntrinsicTrait<typename T1::ElementType>::subtraction &&
2741  IntrinsicTrait<typename T1::ElementType>::multiplication };
2742  };
2743  //**********************************************************************************************
2744 
2745  public:
2746  //**Type definitions****************************************************************************
2747  typedef DMatScalarMultExpr<MMM,ST,true> This;
2748  typedef typename MultTrait<RES,ST>::Type ResultType;
2749  typedef typename ResultType::OppositeType OppositeType;
2750  typedef typename ResultType::TransposeType TransposeType;
2751  typedef typename ResultType::ElementType ElementType;
2752  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType;
2753  typedef const ElementType ReturnType;
2754  typedef const ResultType CompositeType;
2755 
2757  typedef const TDMatDMatMultExpr<MT1,MT2> LeftOperand;
2758 
2760  typedef ST RightOperand;
2761 
2763  typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type LT;
2764 
2766  typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type RT;
2767  //**********************************************************************************************
2768 
2769  //**Compilation flags***************************************************************************
2771  enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
2772  IsSame<ET1,ET2>::value &&
2773  IsSame<ET1,ST>::value &&
2774  IntrinsicTrait<ET1>::addition &&
2775  IntrinsicTrait<ET1>::multiplication };
2776 
2778  enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
2779  !evaluateRight && MT2::smpAssignable };
2780  //**********************************************************************************************
2781 
2782  //**Constructor*********************************************************************************
2788  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
2789  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
2790  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2791  {}
2792  //**********************************************************************************************
2793 
2794  //**Access operator*****************************************************************************
2801  inline ResultType operator()( size_t i, size_t j ) const {
2802  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
2803  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
2804  return matrix_(i,j) * scalar_;
2805  }
2806  //**********************************************************************************************
2807 
2808  //**Rows function*******************************************************************************
2813  inline size_t rows() const {
2814  return matrix_.rows();
2815  }
2816  //**********************************************************************************************
2817 
2818  //**Columns function****************************************************************************
2823  inline size_t columns() const {
2824  return matrix_.columns();
2825  }
2826  //**********************************************************************************************
2827 
2828  //**Left operand access*************************************************************************
2833  inline LeftOperand leftOperand() const {
2834  return matrix_;
2835  }
2836  //**********************************************************************************************
2837 
2838  //**Right operand access************************************************************************
2843  inline RightOperand rightOperand() const {
2844  return scalar_;
2845  }
2846  //**********************************************************************************************
2847 
2848  //**********************************************************************************************
2854  template< typename T >
2855  inline bool canAlias( const T* alias ) const {
2856  return matrix_.canAlias( alias );
2857  }
2858  //**********************************************************************************************
2859 
2860  //**********************************************************************************************
2866  template< typename T >
2867  inline bool isAliased( const T* alias ) const {
2868  return matrix_.isAliased( alias );
2869  }
2870  //**********************************************************************************************
2871 
2872  //**********************************************************************************************
2877  inline bool isAligned() const {
2878  return matrix_.isAligned();
2879  }
2880  //**********************************************************************************************
2881 
2882  //**********************************************************************************************
2887  inline bool canSMPAssign() const {
2888  typename MMM::RightOperand B( matrix_.rightOperand() );
2889  return ( !BLAZE_BLAS_IS_PARALLEL ||
2890  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
2891  ( B.columns() > SMP_TDMATDMATMULT_THRESHOLD );
2892  }
2893  //**********************************************************************************************
2894 
2895  private:
2896  //**Member variables****************************************************************************
2897  LeftOperand matrix_;  //!< Left-hand side operand; returned by leftOperand() and queried for its own leftOperand()/rightOperand() by the assignment functions.
2898  RightOperand scalar_;  //!< Right-hand side operand; returned by rightOperand() and handed to the kernels as the scaling factor.
2899  //**********************************************************************************************
2900 
2901  //**Assignment to dense matrices****************************************************************
2913  template< typename MT // Type of the target dense matrix
2914  , bool SO > // Storage order of the target dense matrix
2915  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
2916  {
2918 
2919  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2920  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2921 
2922  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2923  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2924 
2925  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
2926  return;
2927  }
2928  else if( left.columns() == 0UL ) {
2929  reset( ~lhs );
2930  return;
2931  }
2932 
2933  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
2934  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
2935 
2936  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2937  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2938  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2939  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2940  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2941  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2942 
2943  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
2944  }
2945  //**********************************************************************************************
2946 
2947  //**Assignment to dense matrices (kernel selection)*********************************************
2958  template< typename MT3 // Type of the left-hand side target matrix
2959  , typename MT4 // Type of the left-hand side matrix operand
2960  , typename MT5 // Type of the right-hand side matrix operand
2961  , typename ST2 > // Type of the scalar value
2962  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2963  {
2964  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
2965  DMatScalarMultExpr::selectDefaultAssignKernel( C, A, B, scalar );
2966  else
2967  DMatScalarMultExpr::selectBlasAssignKernel( C, A, B, scalar );
2968  }
2969  //**********************************************************************************************
2970 
2971  //**Default assignment to dense matrices********************************************************
2985  template< typename MT3 // Type of the left-hand side target matrix
2986  , typename MT4 // Type of the left-hand side matrix operand
2987  , typename MT5 // Type of the right-hand side matrix operand
2988  , typename ST2 > // Type of the scalar value
2989  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2990  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2991  {
2992  for( size_t i=0UL; i<A.rows(); ++i ) {
2993  for( size_t k=0UL; k<B.columns(); ++k ) {
2994  C(i,k) = A(i,0UL) * B(0UL,k);
2995  }
2996  for( size_t j=1UL; j<A.columns(); ++j ) {
2997  for( size_t k=0UL; k<B.columns(); ++k ) {
2998  C(i,k) += A(i,j) * B(j,k);
2999  }
3000  }
3001  for( size_t k=0UL; k<B.columns(); ++k ) {
3002  C(i,k) *= scalar;
3003  }
3004  }
3005  }
3006  //**********************************************************************************************
3007 
3008  //**Vectorized default assignment to row-major dense matrices***********************************
/*!\brief Vectorized default assignment of a scaled matrix product to a row-major dense matrix.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Selected when UseVectorizedDefaultKernel is satisfied. The columns of \a C are traversed in
// panels of 8, 4, 2 and finally 1 SIMD vector(s) of IT::size elements each. For every panel the
// products are accumulated over the inner dimension K in IntrinsicType accumulators, multiplied
// by the packed scalar and stored to \a C, overwriting its previous content.
*/
3022  template< typename MT3 // Type of the left-hand side target matrix
3023  , typename MT4 // Type of the left-hand side matrix operand
3024  , typename MT5 // Type of the right-hand side matrix operand
3025  , typename ST2 > // Type of the scalar value
3026  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3027  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3028  {
3029  typedef IntrinsicTrait<ElementType> IT;
3030 
3031  const size_t M( A.rows() );
3032  const size_t N( B.columns() );
3033  const size_t K( A.columns() );
3034 
      // The scalar packed into an IntrinsicType via set(); every accumulator is multiplied by it
      // right before it is stored.
3035  const IntrinsicType factor( set( scalar ) );
3036 
3037  size_t j( 0UL );
3038 
      // Panels of 8 vectors, one row of C at a time. The loop is entered as soon as the first
      // element of the 8th vector lies inside [0,N), so the last vector of a panel may reach
      // beyond column N.
      // NOTE(review): presumably relies on padded rows in B and C -- confirm.
3039  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3040  for( size_t i=0UL; i<M; ++i ) {
      // The accumulators are not explicitly initialized.
      // NOTE(review): assumes a default-constructed IntrinsicType is zero -- confirm.
3041  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3042  for( size_t k=0UL; k<K; ++k ) {
3043  const IntrinsicType a1( set( A(i,k) ) );
3044  xmm1 = xmm1 + a1 * B.load(k,j );
3045  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3046  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3047  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3048  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3049  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3050  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3051  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
3052  }
3053  (~C).store( i, j , xmm1 * factor );
3054  (~C).store( i, j+IT::size , xmm2 * factor );
3055  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
3056  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
3057  (~C).store( i, j+IT::size*4UL, xmm5 * factor );
3058  (~C).store( i, j+IT::size*5UL, xmm6 * factor );
3059  (~C).store( i, j+IT::size*6UL, xmm7 * factor );
3060  (~C).store( i, j+IT::size*7UL, xmm8 * factor );
3061  }
3062  }
      // Panels of 4 vectors. Rows are processed in pairs: xmm1-xmm4 belong to row i,
      // xmm5-xmm8 to row i+1, and each loaded vector of B is used for both rows.
3063  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3064  size_t i( 0UL );
3065  for( ; (i+2UL) <= M; i+=2UL ) {
3066  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3067  for( size_t k=0UL; k<K; ++k ) {
3068  const IntrinsicType a1( set( A(i ,k) ) );
3069  const IntrinsicType a2( set( A(i+1UL,k) ) );
3070  const IntrinsicType b1( B.load(k,j ) );
3071  const IntrinsicType b2( B.load(k,j+IT::size ) );
3072  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
3073  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
3074  xmm1 = xmm1 + a1 * b1;
3075  xmm2 = xmm2 + a1 * b2;
3076  xmm3 = xmm3 + a1 * b3;
3077  xmm4 = xmm4 + a1 * b4;
3078  xmm5 = xmm5 + a2 * b1;
3079  xmm6 = xmm6 + a2 * b2;
3080  xmm7 = xmm7 + a2 * b3;
3081  xmm8 = xmm8 + a2 * b4;
3082  }
3083  (~C).store( i , j , xmm1 * factor );
3084  (~C).store( i , j+IT::size , xmm2 * factor );
3085  (~C).store( i , j+IT::size*2UL, xmm3 * factor );
3086  (~C).store( i , j+IT::size*3UL, xmm4 * factor );
3087  (~C).store( i+1UL, j , xmm5 * factor );
3088  (~C).store( i+1UL, j+IT::size , xmm6 * factor );
3089  (~C).store( i+1UL, j+IT::size*2UL, xmm7 * factor );
3090  (~C).store( i+1UL, j+IT::size*3UL, xmm8 * factor );
3091  }
      // Last row of the panel when M is odd
3092  if( i < M ) {
3093  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3094  for( size_t k=0UL; k<K; ++k ) {
3095  const IntrinsicType a1( set( A(i,k) ) );
3096  xmm1 = xmm1 + a1 * B.load(k,j );
3097  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3098  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3099  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3100  }
3101  (~C).store( i, j , xmm1 * factor );
3102  (~C).store( i, j+IT::size , xmm2 * factor );
3103  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
3104  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
3105  }
3106  }
      // Panels of 2 vectors, rows again in pairs: xmm1/xmm2 belong to row i, xmm3/xmm4 to row i+1
3107  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3108  size_t i( 0UL );
3109  for( ; (i+2UL) <= M; i+=2UL ) {
3110  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3111  for( size_t k=0UL; k<K; ++k ) {
3112  const IntrinsicType a1( set( A(i ,k) ) );
3113  const IntrinsicType a2( set( A(i+1UL,k) ) );
3114  const IntrinsicType b1( B.load(k,j ) );
3115  const IntrinsicType b2( B.load(k,j+IT::size) );
3116  xmm1 = xmm1 + a1 * b1;
3117  xmm2 = xmm2 + a1 * b2;
3118  xmm3 = xmm3 + a2 * b1;
3119  xmm4 = xmm4 + a2 * b2;
3120  }
3121  (~C).store( i , j , xmm1 * factor );
3122  (~C).store( i , j+IT::size, xmm2 * factor );
3123  (~C).store( i+1UL, j , xmm3 * factor );
3124  (~C).store( i+1UL, j+IT::size, xmm4 * factor );
3125  }
      // Last row of the panel when M is odd
3126  if( i < M ) {
3127  IntrinsicType xmm1, xmm2;
3128  for( size_t k=0UL; k<K; ++k ) {
3129  const IntrinsicType a1( set( A(i,k) ) );
3130  xmm1 = xmm1 + a1 * B.load(k,j );
3131  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3132  }
3133  (~C).store( i, j , xmm1 * factor );
3134  (~C).store( i, j+IT::size, xmm2 * factor );
3135  }
3136  }
      // Final single vector of columns (at most one is left after the 2-vector loop)
3137  if( j < N ) {
3138  size_t i( 0UL );
3139  for( ; (i+2UL) <= M; i+=2UL ) {
3140  IntrinsicType xmm1, xmm2;
3141  for( size_t k=0UL; k<K; ++k ) {
3142  const IntrinsicType b1( B.load(k,j) );
3143  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3144  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3145  }
3146  (~C).store( i , j, xmm1 * factor );
3147  (~C).store( i+1UL, j, xmm2 * factor );
3148  }
      // Last row when M is odd
3149  if( i < M ) {
3150  IntrinsicType xmm1;
3151  for( size_t k=0UL; k<K; ++k ) {
3152  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
3153  }
3154  (~C).store( i, j, xmm1 * factor );
3155  }
3156  }
3157  }
3158  //**********************************************************************************************
3159 
3160  //**Vectorized default assignment to column-major dense matrices********************************
/*!\brief Vectorized default assignment of a scaled matrix product to a column-major dense matrix.
//
// \param C The target left-hand side dense matrix (column-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Selected when UseVectorizedDefaultKernel is satisfied. The rows of \a C are traversed in
// panels of 8, 4, 2 and finally 1 SIMD vector(s) of IT::size elements each. Vectors are loaded
// from \a A, single elements of \a B are packed via set(), and the products are accumulated
// over the inner dimension K. The accumulators are multiplied by the packed scalar and stored
// to \a C, overwriting its previous content.
*/
3174  template< typename MT3 // Type of the left-hand side target matrix
3175  , typename MT4 // Type of the left-hand side matrix operand
3176  , typename MT5 // Type of the right-hand side matrix operand
3177  , typename ST2 > // Type of the scalar value
3178  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3179  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3180  {
3181  typedef IntrinsicTrait<ElementType> IT;
3182 
3183  const size_t M( A.rows() );
3184  const size_t N( B.columns() );
3185  const size_t K( A.columns() );
3186 
      // The scalar packed into an IntrinsicType via set(); every accumulator is multiplied by it
      // right before it is stored.
3187  const IntrinsicType factor( set( scalar ) );
3188 
3189  size_t i( 0UL );
3190 
      // Panels of 8 vectors, one column of C at a time. The loop is entered as soon as the first
      // element of the 8th vector lies inside [0,M), so the last vector of a panel may reach
      // beyond row M.
      // NOTE(review): presumably relies on padded columns in A and C -- confirm.
3191  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3192  for( size_t j=0UL; j<N; ++j ) {
      // The accumulators are not explicitly initialized.
      // NOTE(review): assumes a default-constructed IntrinsicType is zero -- confirm.
3193  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3194  for( size_t k=0UL; k<K; ++k ) {
3195  const IntrinsicType b1( set( B(k,j) ) );
3196  xmm1 = xmm1 + A.load(i ,k) * b1;
3197  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3198  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3199  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3200  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3201  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3202  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3203  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
3204  }
3205  (~C).store( i , j, xmm1 * factor );
3206  (~C).store( i+IT::size , j, xmm2 * factor );
3207  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
3208  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
3209  (~C).store( i+IT::size*4UL, j, xmm5 * factor );
3210  (~C).store( i+IT::size*5UL, j, xmm6 * factor );
3211  (~C).store( i+IT::size*6UL, j, xmm7 * factor );
3212  (~C).store( i+IT::size*7UL, j, xmm8 * factor );
3213  }
3214  }
      // Panels of 4 vectors. Columns are processed in pairs: xmm1-xmm4 belong to column j,
      // xmm5-xmm8 to column j+1, and each loaded vector of A is used for both columns.
3215  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3216  size_t j( 0UL );
3217  for( ; (j+2UL) <= N; j+=2UL ) {
3218  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3219  for( size_t k=0UL; k<K; ++k ) {
3220  const IntrinsicType a1( A.load(i ,k) );
3221  const IntrinsicType a2( A.load(i+IT::size ,k) );
3222  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
3223  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
3224  const IntrinsicType b1( set( B(k,j ) ) );
3225  const IntrinsicType b2( set( B(k,j+1UL) ) );
3226  xmm1 = xmm1 + a1 * b1;
3227  xmm2 = xmm2 + a2 * b1;
3228  xmm3 = xmm3 + a3 * b1;
3229  xmm4 = xmm4 + a4 * b1;
3230  xmm5 = xmm5 + a1 * b2;
3231  xmm6 = xmm6 + a2 * b2;
3232  xmm7 = xmm7 + a3 * b2;
3233  xmm8 = xmm8 + a4 * b2;
3234  }
3235  (~C).store( i , j , xmm1 * factor );
3236  (~C).store( i+IT::size , j , xmm2 * factor );
3237  (~C).store( i+IT::size*2UL, j , xmm3 * factor );
3238  (~C).store( i+IT::size*3UL, j , xmm4 * factor );
3239  (~C).store( i , j+1UL, xmm5 * factor );
3240  (~C).store( i+IT::size , j+1UL, xmm6 * factor );
3241  (~C).store( i+IT::size*2UL, j+1UL, xmm7 * factor );
3242  (~C).store( i+IT::size*3UL, j+1UL, xmm8 * factor );
3243  }
      // Last column of the panel when N is odd
3244  if( j < N ) {
3245  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3246  for( size_t k=0UL; k<K; ++k ) {
3247  const IntrinsicType b1( set( B(k,j) ) );
3248  xmm1 = xmm1 + A.load(i ,k) * b1;
3249  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3250  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3251  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3252  }
3253  (~C).store( i , j, xmm1 * factor );
3254  (~C).store( i+IT::size , j, xmm2 * factor );
3255  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
3256  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
3257  }
3258  }
      // Panels of 2 vectors, columns again in pairs: xmm1/xmm2 belong to column j,
      // xmm3/xmm4 to column j+1
3259  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3260  size_t j( 0UL );
3261  for( ; (j+2UL) <= N; j+=2UL ) {
3262  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3263  for( size_t k=0UL; k<K; ++k ) {
3264  const IntrinsicType a1( A.load(i ,k) );
3265  const IntrinsicType a2( A.load(i+IT::size,k) );
3266  const IntrinsicType b1( set( B(k,j ) ) );
3267  const IntrinsicType b2( set( B(k,j+1UL) ) );
3268  xmm1 = xmm1 + a1 * b1;
3269  xmm2 = xmm2 + a2 * b1;
3270  xmm3 = xmm3 + a1 * b2;
3271  xmm4 = xmm4 + a2 * b2;
3272  }
3273  (~C).store( i , j , xmm1 * factor );
3274  (~C).store( i+IT::size, j , xmm2 * factor );
3275  (~C).store( i , j+1UL, xmm3 * factor );
3276  (~C).store( i+IT::size, j+1UL, xmm4 * factor );
3277  }
      // Last column of the panel when N is odd
3278  if( j < N ) {
3279  IntrinsicType xmm1, xmm2;
3280  for( size_t k=0UL; k<K; ++k ) {
3281  const IntrinsicType b1( set( B(k,j) ) );
3282  xmm1 = xmm1 + A.load(i ,k) * b1;
3283  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3284  }
3285  (~C).store( i , j, xmm1 * factor );
3286  (~C).store( i+IT::size, j, xmm2 * factor );
3287  }
3288  }
      // Final single vector of rows (at most one is left after the 2-vector loop)
3289  if( i < M ) {
3290  size_t j( 0UL );
3291  for( ; (j+2UL) <= N; j+=2UL ) {
3292  IntrinsicType xmm1, xmm2;
3293  for( size_t k=0UL; k<K; ++k ) {
3294  const IntrinsicType a1( A.load(i,k) );
3295  xmm1 = xmm1 + a1 * set( B(k,j ) );
3296  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3297  }
3298  (~C).store( i, j , xmm1 * factor );
3299  (~C).store( i, j+1UL, xmm2 * factor );
3300  }
      // Last column when N is odd
3301  if( j < N ) {
3302  IntrinsicType xmm1;
3303  for( size_t k=0UL; k<K; ++k ) {
3304  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
3305  }
3306  (~C).store( i, j, xmm1 * factor );
3307  }
3308  }
3309  }
3310  //**********************************************************************************************
3311 
3312  //**BLAS-based assignment to dense matrices (default)*******************************************
3326  template< typename MT3 // Type of the left-hand side target matrix
3327  , typename MT4 // Type of the left-hand side matrix operand
3328  , typename MT5 // Type of the right-hand side matrix operand
3329  , typename ST2 > // Type of the scalar value
3330  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3331  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3332  {
3333  selectDefaultAssignKernel( C, A, B, scalar );
3334  }
3335  //**********************************************************************************************
3336 
3337  //**BLAS-based assignment to dense matrices (single precision)**********************************
3338 #if BLAZE_BLAS_MODE
3339 
3352  template< typename MT3 // Type of the left-hand side target matrix
3353  , typename MT4 // Type of the left-hand side matrix operand
3354  , typename MT5 // Type of the right-hand side matrix operand
3355  , typename ST2 > // Type of the scalar value
3356  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3357  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3358  {
3359  using boost::numeric_cast;
3360 
3364 
3365  const int M ( numeric_cast<int>( A.rows() ) );
3366  const int N ( numeric_cast<int>( B.columns() ) );
3367  const int K ( numeric_cast<int>( A.columns() ) );
3368  const int lda( numeric_cast<int>( A.spacing() ) );
3369  const int ldb( numeric_cast<int>( B.spacing() ) );
3370  const int ldc( numeric_cast<int>( C.spacing() ) );
3371 
3372  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3373  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3374  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3375  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
3376  }
3377 #endif
3378  //**********************************************************************************************
3379 
3380  //**BLAS-based assignment to dense matrices (double precision)**********************************
3381 #if BLAZE_BLAS_MODE
3382 
3395  template< typename MT3 // Type of the left-hand side target matrix
3396  , typename MT4 // Type of the left-hand side matrix operand
3397  , typename MT5 // Type of the right-hand side matrix operand
3398  , typename ST2 > // Type of the scalar value
3399  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3400  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3401  {
3402  using boost::numeric_cast;
3403 
3407 
3408  const int M ( numeric_cast<int>( A.rows() ) );
3409  const int N ( numeric_cast<int>( B.columns() ) );
3410  const int K ( numeric_cast<int>( A.columns() ) );
3411  const int lda( numeric_cast<int>( A.spacing() ) );
3412  const int ldb( numeric_cast<int>( B.spacing() ) );
3413  const int ldc( numeric_cast<int>( C.spacing() ) );
3414 
3415  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3416  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3417  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3418  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
3419  }
3420 #endif
3421  //**********************************************************************************************
3422 
3423  //**BLAS-based assignment to dense matrices (single precision complex)**************************
3424 #if BLAZE_BLAS_MODE
3425 
3438  template< typename MT3 // Type of the left-hand side target matrix
3439  , typename MT4 // Type of the left-hand side matrix operand
3440  , typename MT5 // Type of the right-hand side matrix operand
3441  , typename ST2 > // Type of the scalar value
3442  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3443  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3444  {
3445  using boost::numeric_cast;
3446 
3450  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3451  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3452  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3453 
3454  const int M ( numeric_cast<int>( A.rows() ) );
3455  const int N ( numeric_cast<int>( B.columns() ) );
3456  const int K ( numeric_cast<int>( A.columns() ) );
3457  const int lda( numeric_cast<int>( A.spacing() ) );
3458  const int ldb( numeric_cast<int>( B.spacing() ) );
3459  const int ldc( numeric_cast<int>( C.spacing() ) );
3460  const complex<float> alpha( scalar );
3461  const complex<float> beta ( 0.0F, 0.0F );
3462 
3463  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3464  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3465  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3466  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3467  }
3468 #endif
3469  //**********************************************************************************************
3470 
3471  //**BLAS-based assignment to dense matrices (double precision complex)**************************
3472 #if BLAZE_BLAS_MODE
3473 
3486  template< typename MT3 // Type of the left-hand side target matrix
3487  , typename MT4 // Type of the left-hand side matrix operand
3488  , typename MT5 // Type of the right-hand side matrix operand
3489  , typename ST2 > // Type of the scalar value
3490  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3491  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3492  {
3493  using boost::numeric_cast;
3494 
3498  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3499  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3500  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3501 
3502  const int M ( numeric_cast<int>( A.rows() ) );
3503  const int N ( numeric_cast<int>( B.columns() ) );
3504  const int K ( numeric_cast<int>( A.columns() ) );
3505  const int lda( numeric_cast<int>( A.spacing() ) );
3506  const int ldb( numeric_cast<int>( B.spacing() ) );
3507  const int ldc( numeric_cast<int>( C.spacing() ) );
3508  const complex<double> alpha( scalar );
3509  const complex<double> beta ( 0.0, 0.0 );
3510 
3511  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3512  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3513  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3514  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3515  }
3516 #endif
3517  //**********************************************************************************************
3518 
3519  //**Assignment to sparse matrices***************************************************************
3531  template< typename MT // Type of the target sparse matrix
3532  , bool SO > // Storage order of the target sparse matrix
3533  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
3534  {
3536 
3537  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3538 
3545 
3546  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3547  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3548 
3549  const TmpType tmp( serial( rhs ) );
3550  assign( ~lhs, tmp );
3551  }
3552  //**********************************************************************************************
3553 
3554  //**Addition assignment to dense matrices*******************************************************
3566  template< typename MT // Type of the target dense matrix
3567  , bool SO > // Storage order of the target dense matrix
3568  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
3569  {
3571 
3572  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3573  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3574 
3575  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3576  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3577 
3578  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3579  return;
3580  }
3581 
3582  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3583  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3584 
3585  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3586  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3587  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3588  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3589  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3590  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3591 
3592  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3593  }
3594  //**********************************************************************************************
3595 
3596  //**Addition assignment to dense matrices (kernel selection)************************************
3607  template< typename MT3 // Type of the left-hand side target matrix
3608  , typename MT4 // Type of the left-hand side matrix operand
3609  , typename MT5 // Type of the right-hand side matrix operand
3610  , typename ST2 > // Type of the scalar value
3611  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3612  {
3613  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
3614  DMatScalarMultExpr::selectDefaultAddAssignKernel( C, A, B, scalar );
3615  else
3616  DMatScalarMultExpr::selectBlasAddAssignKernel( C, A, B, scalar );
3617  }
3618  //**********************************************************************************************
3619 
3620  //**Default addition assignment to dense matrices***********************************************
3634  template< typename MT3 // Type of the left-hand side target matrix
3635  , typename MT4 // Type of the left-hand side matrix operand
3636  , typename MT5 // Type of the right-hand side matrix operand
3637  , typename ST2 > // Type of the scalar value
3638  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3639  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3640  {
3641  const ResultType tmp( serial( A * B * scalar ) );
3642  addAssign( C, tmp );
3643  }
3644  //**********************************************************************************************
3645 
3646  //**Vectorized default addition assignment to row-major dense matrices**************************
/*!\brief Vectorized default addition assignment of a scaled matrix product to a row-major dense matrix.
//
// \param C The target left-hand side dense matrix (row-major).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Selected when UseVectorizedDefaultKernel is satisfied. The columns of \a C are traversed in
// panels of 8, 4, 2 and finally 1 SIMD vector(s) of IT::size elements each. For every panel the
// products are accumulated over the inner dimension K in IntrinsicType accumulators. Each
// accumulator is multiplied by the packed scalar, added to the vector currently stored in \a C
// and written back.
*/
3660  template< typename MT3 // Type of the left-hand side target matrix
3661  , typename MT4 // Type of the left-hand side matrix operand
3662  , typename MT5 // Type of the right-hand side matrix operand
3663  , typename ST2 > // Type of the scalar value
3664  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3665  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3666  {
3667  typedef IntrinsicTrait<ElementType> IT;
3668 
3669  const size_t M( A.rows() );
3670  const size_t N( B.columns() );
3671  const size_t K( A.columns() );
3672 
      // The scalar packed into an IntrinsicType via set(); every accumulator is multiplied by it
      // before being added to the content of C.
3673  const IntrinsicType factor( set( scalar ) );
3674 
3675  size_t j( 0UL );
3676 
      // Panels of 8 vectors, one row of C at a time. The loop is entered as soon as the first
      // element of the 8th vector lies inside [0,N), so the last vector of a panel may reach
      // beyond column N.
      // NOTE(review): presumably relies on padded rows in B and C -- confirm.
3677  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3678  for( size_t i=0UL; i<M; ++i ) {
      // The accumulators are not explicitly initialized.
      // NOTE(review): assumes a default-constructed IntrinsicType is zero -- confirm.
3679  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3680  for( size_t k=0UL; k<K; ++k ) {
3681  const IntrinsicType a1( set( A(i,k) ) );
3682  xmm1 = xmm1 + a1 * B.load(k,j );
3683  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3684  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3685  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3686  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3687  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3688  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3689  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
3690  }
      // Read-modify-write: the scaled accumulators are added to the existing content of C
3691  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3692  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3693  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3694  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
3695  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
3696  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
3697  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
3698  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
3699  }
3700  }
      // Panels of 4 vectors. Rows are processed in pairs: xmm1-xmm4 belong to row i,
      // xmm5-xmm8 to row i+1, and each loaded vector of B is used for both rows.
3701  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3702  size_t i( 0UL );
3703  for( ; (i+2UL) <= M; i+=2UL ) {
3704  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3705  for( size_t k=0UL; k<K; ++k ) {
3706  const IntrinsicType a1( set( A(i ,k) ) );
3707  const IntrinsicType a2( set( A(i+1UL,k) ) );
3708  const IntrinsicType b1( B.load(k,j ) );
3709  const IntrinsicType b2( B.load(k,j+IT::size ) );
3710  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
3711  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
3712  xmm1 = xmm1 + a1 * b1;
3713  xmm2 = xmm2 + a1 * b2;
3714  xmm3 = xmm3 + a1 * b3;
3715  xmm4 = xmm4 + a1 * b4;
3716  xmm5 = xmm5 + a2 * b1;
3717  xmm6 = xmm6 + a2 * b2;
3718  xmm7 = xmm7 + a2 * b3;
3719  xmm8 = xmm8 + a2 * b4;
3720  }
3721  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3722  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
3723  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
3724  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
3725  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
3726  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
3727  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
3728  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
3729  }
      // Last row of the panel when M is odd
3730  if( i < M ) {
3731  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3732  for( size_t k=0UL; k<K; ++k ) {
3733  const IntrinsicType a1( set( A(i,k) ) );
3734  xmm1 = xmm1 + a1 * B.load(k,j );
3735  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3736  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3737  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3738  }
3739  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3740  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3741  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3742  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
3743  }
3744  }
      // Panels of 2 vectors, rows again in pairs: xmm1/xmm2 belong to row i, xmm3/xmm4 to row i+1
3745  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3746  size_t i( 0UL );
3747  for( ; (i+2UL) <= M; i+=2UL ) {
3748  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3749  for( size_t k=0UL; k<K; ++k ) {
3750  const IntrinsicType a1( set( A(i ,k) ) );
3751  const IntrinsicType a2( set( A(i+1UL,k) ) );
3752  const IntrinsicType b1( B.load(k,j ) );
3753  const IntrinsicType b2( B.load(k,j+IT::size) );
3754  xmm1 = xmm1 + a1 * b1;
3755  xmm2 = xmm2 + a1 * b2;
3756  xmm3 = xmm3 + a2 * b1;
3757  xmm4 = xmm4 + a2 * b2;
3758  }
3759  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3760  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
3761  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
3762  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
3763  }
      // Last row of the panel when M is odd
3764  if( i < M ) {
3765  IntrinsicType xmm1, xmm2;
3766  for( size_t k=0UL; k<K; ++k ) {
3767  const IntrinsicType a1( set( A(i,k) ) );
3768  xmm1 = xmm1 + a1 * B.load(k,j );
3769  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3770  }
3771  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3772  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
3773  }
3774  }
      // Final single vector of columns (at most one is left after the 2-vector loop)
3775  if( j < N ) {
3776  size_t i( 0UL );
3777  for( ; (i+2UL) <= M; i+=2UL ) {
3778  IntrinsicType xmm1, xmm2;
3779  for( size_t k=0UL; k<K; ++k ) {
3780  const IntrinsicType b1( B.load(k,j) );
3781  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3782  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3783  }
3784  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3785  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
3786  }
      // Last row when M is odd
3787  if( i < M ) {
3788  IntrinsicType xmm1;
3789  for( size_t k=0UL; k<K; ++k ) {
3790  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
3791  }
3792  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
3793  }
3794  }
3795  }
3796  //**********************************************************************************************
3797 
3798  //**Vectorized default addition assignment to column-major dense matrices***********************
// Vectorized default kernel for the addition assignment of a scaled dense matrix product to a
// column-major dense matrix: C += ( A * B ) * scalar.
//
//   C       Target dense matrix (taken as DenseMatrix<MT3,true>); updated in place.
//   A       Left-hand side operand of the multiplication.
//   B       Right-hand side operand of the multiplication.
//   scalar  Scaling factor; converted once into the intrinsic value 'factor' via set().
//
// The overload participates only if UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
// Rows of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) (IT::size
// elements each). A is read with load() at row offsets that are multiples of IT::size, every
// element of B is passed through set() (presumably a broadcast to all lanes -- confirm).
// NOTE(review): the accumulators xmm1..xmm8 are declared without an initializer, so the kernel
// assumes that a default-constructed IntrinsicType is zero -- confirm.
// NOTE(review): the block conditions only test the first row of the last vector against M, so
// the trailing load()/store() may reach past row M; presumably the operands are padded to a
// multiple of IT::size -- confirm.
3812  template< typename MT3 // Type of the left-hand side target matrix
3813  , typename MT4 // Type of the left-hand side matrix operand
3814  , typename MT5 // Type of the right-hand side matrix operand
3815  , typename ST2 > // Type of the scalar value
3816  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3817  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3818  {
3819  typedef IntrinsicTrait<ElementType> IT;
3820 
      // M x N is the size of the product, K is the shared inner dimension
3821  const size_t M( A.rows() );
3822  const size_t N( B.columns() );
3823  const size_t K( A.columns() );
3824 
3825  const IntrinsicType factor( set( scalar ) );
3826 
      // Row index shared by all block tiers below; each tier continues where the previous stopped
3827  size_t i( 0UL );
3828 
      // Tier 1: blocks of IT::size*8 rows, one column of C per iteration (8 accumulators)
3829  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3830  for( size_t j=0UL; j<N; ++j ) {
3831  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3832  for( size_t k=0UL; k<K; ++k ) {
3833  const IntrinsicType b1( set( B(k,j) ) );
3834  xmm1 = xmm1 + A.load(i ,k) * b1;
3835  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3836  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3837  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3838  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3839  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3840  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3841  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
3842  }
      // Scale the accumulated products and add them to the current content of C
3843  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3844  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3845  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3846  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
3847  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) + xmm5 * factor );
3848  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) + xmm6 * factor );
3849  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) + xmm7 * factor );
3850  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) + xmm8 * factor );
3851  }
3852  }
      // Tier 2: blocks of IT::size*4 rows, two columns of C per iteration (4x2 accumulators)
3853  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3854  size_t j( 0UL );
3855  for( ; (j+2UL) <= N; j+=2UL ) {
3856  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3857  for( size_t k=0UL; k<K; ++k ) {
3858  const IntrinsicType a1( A.load(i ,k) );
3859  const IntrinsicType a2( A.load(i+IT::size ,k) );
3860  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
3861  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
3862  const IntrinsicType b1( set( B(k,j ) ) );
3863  const IntrinsicType b2( set( B(k,j+1UL) ) );
3864  xmm1 = xmm1 + a1 * b1;
3865  xmm2 = xmm2 + a2 * b1;
3866  xmm3 = xmm3 + a3 * b1;
3867  xmm4 = xmm4 + a4 * b1;
3868  xmm5 = xmm5 + a1 * b2;
3869  xmm6 = xmm6 + a2 * b2;
3870  xmm7 = xmm7 + a3 * b2;
3871  xmm8 = xmm8 + a4 * b2;
3872  }
      // xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1
3873  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3874  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) + xmm2 * factor );
3875  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) + xmm3 * factor );
3876  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) + xmm4 * factor );
3877  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
3878  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) + xmm6 * factor );
3879  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) + xmm7 * factor );
3880  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) + xmm8 * factor );
3881  }
      // Last column in case N is odd
3882  if( j < N ) {
3883  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3884  for( size_t k=0UL; k<K; ++k ) {
3885  const IntrinsicType b1( set( B(k,j) ) );
3886  xmm1 = xmm1 + A.load(i ,k) * b1;
3887  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3888  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3889  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3890  }
3891  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3892  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3893  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3894  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
3895  }
3896  }
      // Tier 3: blocks of IT::size*2 rows, two columns of C per iteration (2x2 accumulators)
3897  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3898  size_t j( 0UL );
3899  for( ; (j+2UL) <= N; j+=2UL ) {
3900  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3901  for( size_t k=0UL; k<K; ++k ) {
3902  const IntrinsicType a1( A.load(i ,k) );
3903  const IntrinsicType a2( A.load(i+IT::size,k) );
3904  const IntrinsicType b1( set( B(k,j ) ) );
3905  const IntrinsicType b2( set( B(k,j+1UL) ) );
3906  xmm1 = xmm1 + a1 * b1;
3907  xmm2 = xmm2 + a2 * b1;
3908  xmm3 = xmm3 + a1 * b2;
3909  xmm4 = xmm4 + a2 * b2;
3910  }
3911  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3912  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) + xmm2 * factor );
3913  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
3914  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) + xmm4 * factor );
3915  }
      // Last column in case N is odd
3916  if( j < N ) {
3917  IntrinsicType xmm1, xmm2;
3918  for( size_t k=0UL; k<K; ++k ) {
3919  const IntrinsicType b1( set( B(k,j) ) );
3920  xmm1 = xmm1 + A.load(i ,k) * b1;
3921  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3922  }
3923  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3924  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) + xmm2 * factor );
3925  }
3926  }
      // Tier 4: the final intrinsic vector of rows (at most one is left at this point)
3927  if( i < M ) {
3928  size_t j( 0UL );
3929  for( ; (j+2UL) <= N; j+=2UL ) {
3930  IntrinsicType xmm1, xmm2;
3931  for( size_t k=0UL; k<K; ++k ) {
3932  const IntrinsicType a1( A.load(i,k) );
3933  xmm1 = xmm1 + a1 * set( B(k,j ) );
3934  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3935  }
3936  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3937  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
3938  }
      // Last column in case N is odd
3939  if( j < N ) {
3940  IntrinsicType xmm1;
3941  for( size_t k=0UL; k<K; ++k ) {
3942  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
3943  }
3944  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
3945  }
3946  }
3947  }
3948  //**********************************************************************************************
3949 
3950  //**BLAS-based addition assignment to dense matrices (default)**********************************
3964  template< typename MT3 // Type of the left-hand side target matrix
3965  , typename MT4 // Type of the left-hand side matrix operand
3966  , typename MT5 // Type of the right-hand side matrix operand
3967  , typename ST2 > // Type of the scalar value
3968  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3969  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3970  {
3971  selectDefaultAddAssignKernel( C, A, B, scalar );
3972  }
3973  //**********************************************************************************************
3974 
3975  //**BLAS-based addition assignment to dense matrices (single precision)*************************
3976 #if BLAZE_BLAS_MODE
3977 
3990  template< typename MT3 // Type of the left-hand side target matrix
3991  , typename MT4 // Type of the left-hand side matrix operand
3992  , typename MT5 // Type of the right-hand side matrix operand
3993  , typename ST2 > // Type of the scalar value
3994  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3995  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3996  {
3997  using boost::numeric_cast;
3998 
4002 
4003  const int M ( numeric_cast<int>( A.rows() ) );
4004  const int N ( numeric_cast<int>( B.columns() ) );
4005  const int K ( numeric_cast<int>( A.columns() ) );
4006  const int lda( numeric_cast<int>( A.spacing() ) );
4007  const int ldb( numeric_cast<int>( B.spacing() ) );
4008  const int ldc( numeric_cast<int>( C.spacing() ) );
4009 
4010  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4011  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4012  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4013  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
4014  }
4015 #endif
4016  //**********************************************************************************************
4017 
4018  //**BLAS-based addition assignment to dense matrices (double precision)*************************
4019 #if BLAZE_BLAS_MODE
4020 
4033  template< typename MT3 // Type of the left-hand side target matrix
4034  , typename MT4 // Type of the left-hand side matrix operand
4035  , typename MT5 // Type of the right-hand side matrix operand
4036  , typename ST2 > // Type of the scalar value
4037  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4038  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4039  {
4040  using boost::numeric_cast;
4041 
4045 
4046  const int M ( numeric_cast<int>( A.rows() ) );
4047  const int N ( numeric_cast<int>( B.columns() ) );
4048  const int K ( numeric_cast<int>( A.columns() ) );
4049  const int lda( numeric_cast<int>( A.spacing() ) );
4050  const int ldb( numeric_cast<int>( B.spacing() ) );
4051  const int ldc( numeric_cast<int>( C.spacing() ) );
4052 
4053  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4054  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4055  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4056  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
4057  }
4058 #endif
4059  //**********************************************************************************************
4060 
4061  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
4062 #if BLAZE_BLAS_MODE
4063 
4076  template< typename MT3 // Type of the left-hand side target matrix
4077  , typename MT4 // Type of the left-hand side matrix operand
4078  , typename MT5 // Type of the right-hand side matrix operand
4079  , typename ST2 > // Type of the scalar value
4080  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4081  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4082  {
4083  using boost::numeric_cast;
4084 
4088  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
4089  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
4090  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
4091 
4092  const int M ( numeric_cast<int>( A.rows() ) );
4093  const int N ( numeric_cast<int>( B.columns() ) );
4094  const int K ( numeric_cast<int>( A.columns() ) );
4095  const int lda( numeric_cast<int>( A.spacing() ) );
4096  const int ldb( numeric_cast<int>( B.spacing() ) );
4097  const int ldc( numeric_cast<int>( C.spacing() ) );
4098  const complex<float> alpha( scalar );
4099  const complex<float> beta ( 1.0F, 0.0F );
4100 
4101  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4102  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4103  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4104  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4105  }
4106 #endif
4107  //**********************************************************************************************
4108 
4109  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
4110 #if BLAZE_BLAS_MODE
4111 
4124  template< typename MT3 // Type of the left-hand side target matrix
4125  , typename MT4 // Type of the left-hand side matrix operand
4126  , typename MT5 // Type of the right-hand side matrix operand
4127  , typename ST2 > // Type of the scalar value
4128  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4129  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4130  {
4131  using boost::numeric_cast;
4132 
4136  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
4137  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
4138  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
4139 
4140  const int M ( numeric_cast<int>( A.rows() ) );
4141  const int N ( numeric_cast<int>( B.columns() ) );
4142  const int K ( numeric_cast<int>( A.columns() ) );
4143  const int lda( numeric_cast<int>( A.spacing() ) );
4144  const int ldb( numeric_cast<int>( B.spacing() ) );
4145  const int ldc( numeric_cast<int>( C.spacing() ) );
4146  const complex<double> alpha( scalar );
4147  const complex<double> beta ( 1.0, 0.0 );
4148 
4149  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4150  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4151  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4152  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4153  }
4154 #endif
4155  //**********************************************************************************************
4156 
4157  //**Addition assignment to sparse matrices******************************************************
4158  // No special implementation for the addition assignment to sparse matrices.
4159  //**********************************************************************************************
4160 
4161  //**Subtraction assignment to dense matrices****************************************************
4173  template< typename MT // Type of the target dense matrix
4174  , bool SO > // Storage order of the target dense matrix
4175  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4176  {
4178 
4179  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4180  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4181 
4182  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4183  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4184 
4185  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
4186  return;
4187  }
4188 
4189  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4190  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4191 
4192  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4193  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4194  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4195  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4196  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4197  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4198 
4199  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
4200  }
4201  //**********************************************************************************************
4202 
4203  //**Subtraction assignment to dense matrices (kernel selection)*********************************
4214  template< typename MT3 // Type of the left-hand side target matrix
4215  , typename MT4 // Type of the left-hand side matrix operand
4216  , typename MT5 // Type of the right-hand side matrix operand
4217  , typename ST2 > // Type of the scalar value
4218  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4219  {
4220  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
4221  DMatScalarMultExpr::selectDefaultSubAssignKernel( C, A, B, scalar );
4222  else
4223  DMatScalarMultExpr::selectBlasSubAssignKernel( C, A, B, scalar );
4224  }
4225  //**********************************************************************************************
4226 
4227  //**Default subtraction assignment to dense matrices********************************************
4241  template< typename MT3 // Type of the left-hand side target matrix
4242  , typename MT4 // Type of the left-hand side matrix operand
4243  , typename MT5 // Type of the right-hand side matrix operand
4244  , typename ST2 > // Type of the scalar value
4245  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4246  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4247  {
4248  const ResultType tmp( serial( A * B * scalar ) );
4249  subAssign( C, tmp );
4250  }
4251  //**********************************************************************************************
4252 
4253  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default kernel for the subtraction assignment of a scaled dense matrix product to
// a row-major dense matrix: C -= ( A * B ) * scalar.
//
//   C       Target dense matrix (taken as DenseMatrix<MT3,false>); updated in place.
//   A       Left-hand side operand of the multiplication.
//   B       Right-hand side operand of the multiplication.
//   scalar  Scaling factor; converted once into the intrinsic value 'factor' via set().
//
// The overload participates only if UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
// Columns of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) (IT::size
// elements each). B is read with load() at column offsets that are multiples of IT::size,
// every element of A is passed through set() (presumably a broadcast to all lanes -- confirm).
// NOTE(review): the accumulators xmm1..xmm8 are declared without an initializer, so the kernel
// assumes that a default-constructed IntrinsicType is zero -- confirm.
// NOTE(review): the block conditions only test the first column of the last vector against N,
// so the trailing load()/store() may reach past column N; presumably the operands are padded
// to a multiple of IT::size -- confirm.
4267  template< typename MT3 // Type of the left-hand side target matrix
4268  , typename MT4 // Type of the left-hand side matrix operand
4269  , typename MT5 // Type of the right-hand side matrix operand
4270  , typename ST2 > // Type of the scalar value
4271  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4272  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4273  {
4274  typedef IntrinsicTrait<ElementType> IT;
4275 
      // M x N is the size of the product, K is the shared inner dimension
4276  const size_t M( A.rows() );
4277  const size_t N( B.columns() );
4278  const size_t K( A.columns() );
4279 
4280  const IntrinsicType factor( set( scalar ) );
4281 
      // Column index shared by all block tiers below; each tier continues where the previous stopped
4282  size_t j( 0UL );
4283 
      // Tier 1: blocks of IT::size*8 columns, one row of C per iteration (8 accumulators)
4284  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
4285  for( size_t i=0UL; i<M; ++i ) {
4286  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4287  for( size_t k=0UL; k<K; ++k ) {
4288  const IntrinsicType a1( set( A(i,k) ) );
4289  xmm1 = xmm1 + a1 * B.load(k,j );
4290  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4291  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4292  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4293  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
4294  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
4295  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
4296  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
4297  }
      // Scale the accumulated products and subtract them from the current content of C
4298  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
4299  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
4300  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
4301  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
4302  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
4303  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
4304  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
4305  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
4306  }
4307  }
      // Tier 2: blocks of IT::size*4 columns, two rows of C per iteration (2x4 accumulators)
4308  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
4309  size_t i( 0UL );
4310  for( ; (i+2UL) <= M; i+=2UL ) {
4311  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4312  for( size_t k=0UL; k<K; ++k ) {
4313  const IntrinsicType a1( set( A(i ,k) ) );
4314  const IntrinsicType a2( set( A(i+1UL,k) ) );
4315  const IntrinsicType b1( B.load(k,j ) );
4316  const IntrinsicType b2( B.load(k,j+IT::size ) );
4317  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
4318  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
4319  xmm1 = xmm1 + a1 * b1;
4320  xmm2 = xmm2 + a1 * b2;
4321  xmm3 = xmm3 + a1 * b3;
4322  xmm4 = xmm4 + a1 * b4;
4323  xmm5 = xmm5 + a2 * b1;
4324  xmm6 = xmm6 + a2 * b2;
4325  xmm7 = xmm7 + a2 * b3;
4326  xmm8 = xmm8 + a2 * b4;
4327  }
      // xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1
4328  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4329  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
4330  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
4331  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
4332  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
4333  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
4334  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
4335  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
4336  }
      // Last row in case M is odd
4337  if( i < M ) {
4338  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4339  for( size_t k=0UL; k<K; ++k ) {
4340  const IntrinsicType a1( set( A(i,k) ) );
4341  xmm1 = xmm1 + a1 * B.load(k,j );
4342  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4343  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4344  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4345  }
4346  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
4347  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
4348  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
4349  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
4350  }
4351  }
      // Tier 3: blocks of IT::size*2 columns, two rows of C per iteration (2x2 accumulators)
4352  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
4353  size_t i( 0UL );
4354  for( ; (i+2UL) <= M; i+=2UL ) {
4355  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4356  for( size_t k=0UL; k<K; ++k ) {
4357  const IntrinsicType a1( set( A(i ,k) ) );
4358  const IntrinsicType a2( set( A(i+1UL,k) ) );
4359  const IntrinsicType b1( B.load(k,j ) );
4360  const IntrinsicType b2( B.load(k,j+IT::size) );
4361  xmm1 = xmm1 + a1 * b1;
4362  xmm2 = xmm2 + a1 * b2;
4363  xmm3 = xmm3 + a2 * b1;
4364  xmm4 = xmm4 + a2 * b2;
4365  }
4366  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4367  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
4368  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
4369  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
4370  }
      // Last row in case M is odd
4371  if( i < M ) {
4372  IntrinsicType xmm1, xmm2;
4373  for( size_t k=0UL; k<K; ++k ) {
4374  const IntrinsicType a1( set( A(i,k) ) );
4375  xmm1 = xmm1 + a1 * B.load(k,j );
4376  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
4377  }
4378  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
4379  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
4380  }
4381  }
      // Tier 4: the final intrinsic vector of columns (at most one is left at this point)
4382  if( j < N ) {
4383  size_t i( 0UL );
4384  for( ; (i+2UL) <= M; i+=2UL ) {
4385  IntrinsicType xmm1, xmm2;
4386  for( size_t k=0UL; k<K; ++k ) {
4387  const IntrinsicType b1( B.load(k,j) );
4388  xmm1 = xmm1 + set( A(i ,k) ) * b1;
4389  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
4390  }
4391  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4392  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
4393  }
      // Last row in case M is odd
4394  if( i < M ) {
4395  IntrinsicType xmm1;
4396  for( size_t k=0UL; k<K; ++k ) {
4397  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
4398  }
4399  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
4400  }
4401  }
4402  }
4403  //**********************************************************************************************
4404 
4405  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default kernel for the subtraction assignment of a scaled dense matrix product to
// a column-major dense matrix: C -= ( A * B ) * scalar.
//
//   C       Target dense matrix (taken as DenseMatrix<MT3,true>); updated in place.
//   A       Left-hand side operand of the multiplication.
//   B       Right-hand side operand of the multiplication.
//   scalar  Scaling factor; converted once into the intrinsic value 'factor' via set().
//
// The overload participates only if UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
// Rows of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) (IT::size
// elements each). A is read with load() at row offsets that are multiples of IT::size, every
// element of B is passed through set() (presumably a broadcast to all lanes -- confirm).
// NOTE(review): the accumulators xmm1..xmm8 are declared without an initializer, so the kernel
// assumes that a default-constructed IntrinsicType is zero -- confirm.
// NOTE(review): the block conditions only test the first row of the last vector against M, so
// the trailing load()/store() may reach past row M; presumably the operands are padded to a
// multiple of IT::size -- confirm.
4419  template< typename MT3 // Type of the left-hand side target matrix
4420  , typename MT4 // Type of the left-hand side matrix operand
4421  , typename MT5 // Type of the right-hand side matrix operand
4422  , typename ST2 > // Type of the scalar value
4423  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4424  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
4425  {
4426  typedef IntrinsicTrait<ElementType> IT;
4427 
      // M x N is the size of the product, K is the shared inner dimension
4428  const size_t M( A.rows() );
4429  const size_t N( B.columns() );
4430  const size_t K( A.columns() );
4431 
4432  const IntrinsicType factor( set( scalar ) );
4433 
      // Row index shared by all block tiers below; each tier continues where the previous stopped
4434  size_t i( 0UL );
4435 
      // Tier 1: blocks of IT::size*8 rows, one column of C per iteration (8 accumulators)
4436  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
4437  for( size_t j=0UL; j<N; ++j ) {
4438  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4439  for( size_t k=0UL; k<K; ++k ) {
4440  const IntrinsicType b1( set( B(k,j) ) );
4441  xmm1 = xmm1 + A.load(i ,k) * b1;
4442  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4443  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4444  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4445  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
4446  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
4447  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
4448  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
4449  }
      // Scale the accumulated products and subtract them from the current content of C
4450  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4451  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4452  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4453  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
4454  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) - xmm5 * factor );
4455  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) - xmm6 * factor );
4456  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) - xmm7 * factor );
4457  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) - xmm8 * factor );
4458  }
4459  }
      // Tier 2: blocks of IT::size*4 rows, two columns of C per iteration (4x2 accumulators)
4460  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
4461  size_t j( 0UL );
4462  for( ; (j+2UL) <= N; j+=2UL ) {
4463  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4464  for( size_t k=0UL; k<K; ++k ) {
4465  const IntrinsicType a1( A.load(i ,k) );
4466  const IntrinsicType a2( A.load(i+IT::size ,k) );
4467  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
4468  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
4469  const IntrinsicType b1( set( B(k,j ) ) );
4470  const IntrinsicType b2( set( B(k,j+1UL) ) );
4471  xmm1 = xmm1 + a1 * b1;
4472  xmm2 = xmm2 + a2 * b1;
4473  xmm3 = xmm3 + a3 * b1;
4474  xmm4 = xmm4 + a4 * b1;
4475  xmm5 = xmm5 + a1 * b2;
4476  xmm6 = xmm6 + a2 * b2;
4477  xmm7 = xmm7 + a3 * b2;
4478  xmm8 = xmm8 + a4 * b2;
4479  }
      // xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1
4480  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4481  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) - xmm2 * factor );
4482  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) - xmm3 * factor );
4483  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) - xmm4 * factor );
4484  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
4485  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) - xmm6 * factor );
4486  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) - xmm7 * factor );
4487  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) - xmm8 * factor );
4488  }
      // Last column in case N is odd
4489  if( j < N ) {
4490  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4491  for( size_t k=0UL; k<K; ++k ) {
4492  const IntrinsicType b1( set( B(k,j) ) );
4493  xmm1 = xmm1 + A.load(i ,k) * b1;
4494  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4495  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4496  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4497  }
4498  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4499  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4500  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4501  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
4502  }
4503  }
      // Tier 3: blocks of IT::size*2 rows, two columns of C per iteration (2x2 accumulators)
4504  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
4505  size_t j( 0UL );
4506  for( ; (j+2UL) <= N; j+=2UL ) {
4507  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4508  for( size_t k=0UL; k<K; ++k ) {
4509  const IntrinsicType a1( A.load(i ,k) );
4510  const IntrinsicType a2( A.load(i+IT::size,k) );
4511  const IntrinsicType b1( set( B(k,j ) ) );
4512  const IntrinsicType b2( set( B(k,j+1UL) ) );
4513  xmm1 = xmm1 + a1 * b1;
4514  xmm2 = xmm2 + a2 * b1;
4515  xmm3 = xmm3 + a1 * b2;
4516  xmm4 = xmm4 + a2 * b2;
4517  }
4518  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4519  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) - xmm2 * factor );
4520  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
4521  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) - xmm4 * factor );
4522  }
      // Last column in case N is odd
4523  if( j < N ) {
4524  IntrinsicType xmm1, xmm2;
4525  for( size_t k=0UL; k<K; ++k ) {
4526  const IntrinsicType b1( set( B(k,j) ) );
4527  xmm1 = xmm1 + A.load(i ,k) * b1;
4528  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
4529  }
4530  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4531  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) - xmm2 * factor );
4532  }
4533  }
      // Tier 4: the final intrinsic vector of rows (at most one is left at this point)
4534  if( i < M ) {
4535  size_t j( 0UL );
4536  for( ; (j+2UL) <= N; j+=2UL ) {
4537  IntrinsicType xmm1, xmm2;
4538  for( size_t k=0UL; k<K; ++k ) {
4539  const IntrinsicType a1( A.load(i,k) );
4540  xmm1 = xmm1 + a1 * set( B(k,j ) );
4541  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
4542  }
4543  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
4544  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
4545  }
      // Last column in case N is odd
4546  if( j < N ) {
4547  IntrinsicType xmm1;
4548  for( size_t k=0UL; k<K; ++k ) {
4549  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
4550  }
4551  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
4552  }
4553  }
4554  }
4555  //**********************************************************************************************
4556 
4557  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
4571  template< typename MT3 // Type of the left-hand side target matrix
4572  , typename MT4 // Type of the left-hand side matrix operand
4573  , typename MT5 // Type of the right-hand side matrix operand
4574  , typename ST2 > // Type of the scalar value
4575  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4576  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4577  {
4578  selectDefaultSubAssignKernel( C, A, B, scalar );
4579  }
4580  //**********************************************************************************************
4581 
4582  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
4583 #if BLAZE_BLAS_MODE
4584 
4597  template< typename MT3 // Type of the left-hand side target matrix
4598  , typename MT4 // Type of the left-hand side matrix operand
4599  , typename MT5 // Type of the right-hand side matrix operand
4600  , typename ST2 > // Type of the scalar value
4601  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4602  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4603  {
4604  using boost::numeric_cast;
4605 
4609 
4610  const int M ( numeric_cast<int>( A.rows() ) );
4611  const int N ( numeric_cast<int>( B.columns() ) );
4612  const int K ( numeric_cast<int>( A.columns() ) );
4613  const int lda( numeric_cast<int>( A.spacing() ) );
4614  const int ldb( numeric_cast<int>( B.spacing() ) );
4615  const int ldc( numeric_cast<int>( C.spacing() ) );
4616 
4617  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4618  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4619  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4620  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
4621  }
4622 #endif
4623  //**********************************************************************************************
4624 
4625  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
4626 #if BLAZE_BLAS_MODE
4627 
4640  template< typename MT3 // Type of the left-hand side target matrix
4641  , typename MT4 // Type of the left-hand side matrix operand
4642  , typename MT5 // Type of the right-hand side matrix operand
4643  , typename ST2 > // Type of the scalar value
4644  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4645  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4646  {
4647  using boost::numeric_cast;
4648 
4652 
4653  const int M ( numeric_cast<int>( A.rows() ) );
4654  const int N ( numeric_cast<int>( B.columns() ) );
4655  const int K ( numeric_cast<int>( A.columns() ) );
4656  const int lda( numeric_cast<int>( A.spacing() ) );
4657  const int ldb( numeric_cast<int>( B.spacing() ) );
4658  const int ldc( numeric_cast<int>( C.spacing() ) );
4659 
4660  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4661  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4662  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4663  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
4664  }
4665 #endif
4666  //**********************************************************************************************
4667 
4668  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
4669 #if BLAZE_BLAS_MODE
4670 
4683  template< typename MT3 // Type of the left-hand side target matrix
4684  , typename MT4 // Type of the left-hand side matrix operand
4685  , typename MT5 // Type of the right-hand side matrix operand
4686  , typename ST2 > // Type of the scalar value
4687  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4688  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4689  {
4690  using boost::numeric_cast;
4691 
4695  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
4696  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
4697  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
4698 
4699  const int M ( numeric_cast<int>( A.rows() ) );
4700  const int N ( numeric_cast<int>( B.columns() ) );
4701  const int K ( numeric_cast<int>( A.columns() ) );
4702  const int lda( numeric_cast<int>( A.spacing() ) );
4703  const int ldb( numeric_cast<int>( B.spacing() ) );
4704  const int ldc( numeric_cast<int>( C.spacing() ) );
4705  const complex<float> alpha( -scalar );
4706  const complex<float> beta ( 1.0F, 0.0F );
4707 
4708  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4709  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4710  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4711  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4712  }
4713 #endif
4714  //**********************************************************************************************
4715 
4716  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
4717 #if BLAZE_BLAS_MODE
4718 
4731  template< typename MT3 // Type of the left-hand side target matrix
4732  , typename MT4 // Type of the left-hand side matrix operand
4733  , typename MT5 // Type of the right-hand side matrix operand
4734  , typename ST2 > // Type of the scalar value
4735  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4736  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4737  {
4738  using boost::numeric_cast;
4739 
4743  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
4744  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
4745  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
4746 
4747  const int M ( numeric_cast<int>( A.rows() ) );
4748  const int N ( numeric_cast<int>( B.columns() ) );
4749  const int K ( numeric_cast<int>( A.columns() ) );
4750  const int lda( numeric_cast<int>( A.spacing() ) );
4751  const int ldb( numeric_cast<int>( B.spacing() ) );
4752  const int ldc( numeric_cast<int>( C.spacing() ) );
4753  const complex<double> alpha( -scalar );
4754  const complex<double> beta ( 1.0, 0.0 );
4755 
4756  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4757  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4758  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4759  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4760  }
4761 #endif
4762  //**********************************************************************************************
4763 
4764  //**Subtraction assignment to sparse matrices***************************************************
4765  // No special implementation for the subtraction assignment to sparse matrices.
4766  //**********************************************************************************************
4767 
4768  //**Multiplication assignment to dense matrices*************************************************
4769  // No special implementation for the multiplication assignment to dense matrices.
4770  //**********************************************************************************************
4771 
4772  //**Multiplication assignment to sparse matrices************************************************
4773  // No special implementation for the multiplication assignment to sparse matrices.
4774  //**********************************************************************************************
4775 
4776  //**SMP assignment to dense matrices************************************************************
4790  template< typename MT // Type of the target dense matrix
4791  , bool SO > // Storage order of the target dense matrix
4792  friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4793  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4794  {
4796 
4797  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4798  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4799 
4800  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4801  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4802 
4803  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4804  return;
4805  }
4806  else if( left.columns() == 0UL ) {
4807  reset( ~lhs );
4808  return;
4809  }
4810 
4811  LT A( left ); // Evaluation of the left-hand side dense matrix operand
4812  RT B( right ); // Evaluation of the right-hand side dense matrix operand
4813 
4814  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4815  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4816  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4817  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4818  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4819  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4820 
4821  smpAssign( ~lhs, A * B * rhs.scalar_ );
4822  }
4823  //**********************************************************************************************
4824 
4825  //**SMP assignment to sparse matrices***********************************************************
4839  template< typename MT // Type of the target sparse matrix
4840  , bool SO > // Storage order of the target sparse matrix
4841  friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4842  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4843  {
4845 
4846  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
4847 
4854 
4855  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4856  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4857 
4858  const TmpType tmp( rhs );
4859  smpAssign( ~lhs, tmp );
4860  }
4861  //**********************************************************************************************
4862 
4863  //**SMP addition assignment to dense matrices***************************************************
4877  template< typename MT // Type of the target dense matrix
4878  , bool SO > // Storage order of the target dense matrix
4879  friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4880  smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4881  {
4883 
4884  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4885  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4886 
4887  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4888  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4889 
4890  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
4891  return;
4892  }
4893 
4894  LT A( left ); // Evaluation of the left-hand side dense matrix operand
4895  RT B( right ); // Evaluation of the right-hand side dense matrix operand
4896 
4897  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4898  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4899  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4900  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4901  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4902  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4903 
4904  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
4905  }
4906  //**********************************************************************************************
4907 
4908  //**SMP addition assignment to sparse matrices**************************************************
4909  // No special implementation for the SMP addition assignment to sparse matrices.
4910  //**********************************************************************************************
4911 
4912  //**SMP subtraction assignment to dense matrices************************************************
4926  template< typename MT // Type of the target dense matrix
4927  , bool SO > // Storage order of the target dense matrix
4928  friend inline typename EnableIf< UseSMPAssign<MT> >::Type
4929  smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4930  {
4932 
4933  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4934  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4935 
4936  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4937  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4938 
4939  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
4940  return;
4941  }
4942 
4943  LT A( left ); // Evaluation of the left-hand side dense matrix operand
4944  RT B( right ); // Evaluation of the right-hand side dense matrix operand
4945 
4946  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4947  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4948  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4949  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4950  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4951  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4952 
4953  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
4954  }
4955  //**********************************************************************************************
4956 
4957  //**SMP subtraction assignment to sparse matrices***********************************************
4958  // No special implementation for the SMP subtraction assignment to sparse matrices.
4959  //**********************************************************************************************
4960 
4961  //**SMP multiplication assignment to dense matrices*********************************************
4962  // No special implementation for the SMP multiplication assignment to dense matrices.
4963  //**********************************************************************************************
4964 
4965  //**SMP multiplication assignment to sparse matrices********************************************
4966  // No special implementation for the SMP multiplication assignment to sparse matrices.
4967  //**********************************************************************************************
4968 
4969  //**Compile time checks*************************************************************************
4978  //**********************************************************************************************
4979 };
4981 //*************************************************************************************************
4982 
4983 
4984 
4985 
4986 //=================================================================================================
4987 //
4988 // GLOBAL BINARY ARITHMETIC OPERATORS
4989 //
4990 //=================================================================================================
4991 
4992 //*************************************************************************************************
5021 template< typename T1 // Type of the left-hand side dense matrix
5022  , typename T2 > // Type of the right-hand side dense matrix
5023 inline const TDMatDMatMultExpr<T1,T2>
5025 {
5027 
5028  if( (~lhs).columns() != (~rhs).rows() )
5029  throw std::invalid_argument( "Matrix sizes do not match" );
5030 
5031  return TDMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
5032 }
5033 //*************************************************************************************************
5034 
5035 
5036 
5037 
5038 //=================================================================================================
5039 //
5040 // EXPRESSION TRAIT SPECIALIZATIONS
5041 //
5042 //=================================================================================================
5043 
5044 //*************************************************************************************************
5046 template< typename MT1, typename MT2, typename VT >
5047 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
5048 {
5049  public:
5050  //**********************************************************************************************
5051  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5052  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
5053  IsDenseVector<VT>::value && IsColumnVector<VT>::value
5054  , typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
5055  , INVALID_TYPE >::Type Type;
5056  //**********************************************************************************************
5057 };
5059 //*************************************************************************************************
5060 
5061 
5062 //*************************************************************************************************
5064 template< typename MT1, typename MT2, typename VT >
5065 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
5066 {
5067  public:
5068  //**********************************************************************************************
5069  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5070  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
5071  IsSparseVector<VT>::value && IsColumnVector<VT>::value
5072  , typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
5073  , INVALID_TYPE >::Type Type;
5074  //**********************************************************************************************
5075 };
5077 //*************************************************************************************************
5078 
5079 
5080 //*************************************************************************************************
5082 template< typename VT, typename MT1, typename MT2 >
5083 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
5084 {
5085  public:
5086  //**********************************************************************************************
5087  typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
5088  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5089  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
5090  , typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
5091  , INVALID_TYPE >::Type Type;
5092  //**********************************************************************************************
5093 };
5095 //*************************************************************************************************
5096 
5097 
5098 //*************************************************************************************************
5100 template< typename VT, typename MT1, typename MT2 >
5101 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
5102 {
5103  public:
5104  //**********************************************************************************************
5105  typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
5106  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
5107  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
5108  , typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
5109  , INVALID_TYPE >::Type Type;
5110  //**********************************************************************************************
5111 };
5113 //*************************************************************************************************
5114 
5115 
5116 //*************************************************************************************************
5118 template< typename MT1, typename MT2, bool AF >
5119 struct SubmatrixExprTrait< TDMatDMatMultExpr<MT1,MT2>, AF >
5120 {
5121  public:
5122  //**********************************************************************************************
5123  typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
5124  , typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
5125  //**********************************************************************************************
5126 };
5128 //*************************************************************************************************
5129 
5130 
5131 //*************************************************************************************************
5133 template< typename MT1, typename MT2 >
5134 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >
5135 {
5136  public:
5137  //**********************************************************************************************
5138  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
5139  //**********************************************************************************************
5140 };
5142 //*************************************************************************************************
5143 
5144 
5145 //*************************************************************************************************
5147 template< typename MT1, typename MT2 >
5148 struct ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >
5149 {
5150  public:
5151  //**********************************************************************************************
5152  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
5153  //**********************************************************************************************
5154 };
5156 //*************************************************************************************************
5157 
5158 } // namespace blaze
5159 
5160 #endif
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4599
EnableIf< IsIntegral< T > >::Type store(T *address, const typename Store< T, sizeof(T)>::Type &value)
Aligned store of a vector of integral values.
Definition: Store.h:223
EnableIf< IsIntegral< T >, Load< T, sizeof(T)> >::Type::Type load(const T *address)
Loads a vector of integral values.
Definition: Load.h:222
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:341
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A=B*C$ ).
Definition: DMatDMatMultExpr.h:4329
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:414
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:152
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:199
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
Header file for the IsColumnMajorMatrix type trait.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:395
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2408
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:251
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:122
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:249
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:361
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:690
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:250
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
const size_t TDMATDMATMULT_THRESHOLD
Column-major dense matrix/row-major dense matrix multiplication threshold.This setting specifies the ...
Definition: Thresholds.h:159
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Expression object for transpose dense matrix-dense matrix multiplications.The TDMatDMatMultExpr class...
Definition: Forward.h:125
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:121
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:122
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:125
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:126
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:286
Header file for the multiplication trait.
Header file for the IsDouble type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:262
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the TDMatSVecMultExprTrait class template.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:251
Header file for the DenseMatrix base class.
const size_t SMP_TDMATDMATMULT_THRESHOLD
SMP column-major dense matrix/row-major dense matrix multiplication threshold.This threshold specifie...
Definition: Thresholds.h:880
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:271
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:259
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:246
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:265
Constraints on the storage order of matrix types.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2406
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:256
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the serial shim.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:252
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:92
Header file for the IsNumeric type trait.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:123
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:301
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:405
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:331
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:124
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:249
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:385
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:351
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:250
Header file for the TDVecDMatMultExprTrait class template.
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:248
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2403
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the complex data type.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:331
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:253
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:415
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:301
Constraint on the data type.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:247
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
Header file for the IsExpression type trait class.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:373
Header file for the FunctionTrace class.