All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <stdexcept>
44 #include <boost/cast.hpp>
52 #include <blaze/math/Intrinsics.h>
53 #include <blaze/math/shims/Reset.h>
77 #include <blaze/system/BLAS.h>
79 #include <blaze/util/Assert.h>
80 #include <blaze/util/Complex.h>
86 #include <blaze/util/DisableIf.h>
87 #include <blaze/util/EnableIf.h>
88 #include <blaze/util/InvalidType.h>
90 #include <blaze/util/SelectType.h>
91 #include <blaze/util/Types.h>
97 
98 
99 namespace blaze {
100 
101 //=================================================================================================
102 //
103 // CLASS TDMATDMATMULTEXPR
104 //
105 //=================================================================================================
106 
107 //*************************************************************************************************
114 template< typename MT1 // Type of the left-hand side dense matrix
115  , typename MT2 > // Type of the right-hand side dense matrix
116 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
117  , private MatMatMultExpr
118  , private Computation
119 {
120  private:
121  //**Type definitions****************************************************************************
// Private shorthand aliases for nested types of the two operand types MT1 and MT2.
122  typedef typename MT1::ResultType RT1;  // Result type of the left-hand side operand type
123  typedef typename MT2::ResultType RT2;  // Result type of the right-hand side operand type
124  typedef typename RT1::ElementType ET1;  // Element type of RT1
125  typedef typename RT2::ElementType ET2;  // Element type of RT2
126  typedef typename MT1::CompositeType CT1;  // Composite type of the left-hand side operand type
127  typedef typename MT2::CompositeType CT2;  // Composite type of the right-hand side operand type
128  //**********************************************************************************************
129 
130  //**********************************************************************************************
133  //**********************************************************************************************
134 
135  //**********************************************************************************************
// Compile-time flag for the right-hand side operand: non-zero when MT2 is classified by
// IsComputation or by RequiresEvaluation. It is read by UseSMPAssignKernel and by the
// smpAssignable flag of this class.
// NOTE(review): the matching 'evaluateLeft' flag is referenced in this class but its
// definition is not present in this listing.
137  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
138  //**********************************************************************************************
139 
140  //**********************************************************************************************
142 
145  template< typename T1, typename T2, typename T3 >
146  struct UseSMPAssignKernel {
147  enum { value = evaluateLeft || evaluateRight };
148  };
150  //**********************************************************************************************
151 
152  //**********************************************************************************************
154 
157  template< typename T1, typename T2, typename T3 >
158  struct UseSinglePrecisionKernel {
159  enum { value = IsFloat<typename T1::ElementType>::value &&
160  IsFloat<typename T2::ElementType>::value &&
161  IsFloat<typename T3::ElementType>::value };
162  };
164  //**********************************************************************************************
165 
166  //**********************************************************************************************
168 
171  template< typename T1, typename T2, typename T3 >
172  struct UseDoublePrecisionKernel {
173  enum { value = IsDouble<typename T1::ElementType>::value &&
174  IsDouble<typename T2::ElementType>::value &&
175  IsDouble<typename T3::ElementType>::value };
176  };
178  //**********************************************************************************************
179 
180  //**********************************************************************************************
182 
186  template< typename T1, typename T2, typename T3 >
187  struct UseSinglePrecisionComplexKernel {
188  typedef complex<float> Type;
189  enum { value = IsSame<typename T1::ElementType,Type>::value &&
190  IsSame<typename T2::ElementType,Type>::value &&
191  IsSame<typename T3::ElementType,Type>::value };
192  };
194  //**********************************************************************************************
195 
196  //**********************************************************************************************
198 
202  template< typename T1, typename T2, typename T3 >
203  struct UseDoublePrecisionComplexKernel {
204  typedef complex<double> Type;
205  enum { value = IsSame<typename T1::ElementType,Type>::value &&
206  IsSame<typename T2::ElementType,Type>::value &&
207  IsSame<typename T3::ElementType,Type>::value };
208  };
210  //**********************************************************************************************
211 
212  //**********************************************************************************************
214 
217  template< typename T1, typename T2, typename T3 >
218  struct UseDefaultKernel {
219  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
220  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
221  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
222  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
223  };
225  //**********************************************************************************************
226 
227  //**********************************************************************************************
229 
232  template< typename T1, typename T2, typename T3 >
233  struct UseVectorizedDefaultKernel {
234  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
235  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
236  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
237  IntrinsicTrait<typename T1::ElementType>::addition &&
238  IntrinsicTrait<typename T1::ElementType>::subtraction &&
239  IntrinsicTrait<typename T1::ElementType>::multiplication };
240  };
242  //**********************************************************************************************
243 
244  public:
245  //**Type definitions****************************************************************************
// Public type aliases of the expression.
// NOTE(review): ElementType and ResultType are used below but their typedefs are not
// present in this listing.
252  typedef const ElementType ReturnType;  // Type returned by operator()
253  typedef const ResultType CompositeType;
254 
// Member type used to store the left-hand side operand: 'const MT1' or 'const MT1&',
// chosen by SelectType on IsExpression<MT1>::value.
256  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
257 
// Member type used to store the right-hand side operand: 'const MT2' or 'const MT2&',
// chosen by SelectType on IsExpression<MT2>::value.
259  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
260 
263 
266  //**********************************************************************************************
267 
268  //**Compilation flags***************************************************************************
// Vectorization flag of the expression.
// NOTE(review): the initializer is cut off in this listing after the second '&&';
// the remaining operands and the closing '};' must be restored before compiling.
270  enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
274 
// SMP-assignability flag: non-zero only when neither evaluateLeft nor evaluateRight is set.
276  enum { smpAssignable = !evaluateLeft && !evaluateRight };
277  //**********************************************************************************************
278 
279  //**Constructor*********************************************************************************
285  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs )
286  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
287  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
288  {
289  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
290  }
291  //**********************************************************************************************
292 
293  //**Access operator*****************************************************************************
300  inline ReturnType operator()( size_t i, size_t j ) const {
301  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
302  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
303 
304  ElementType tmp;
305 
306  if( lhs_.columns() != 0UL ) {
307  const size_t end( ( ( lhs_.columns()-1UL ) & size_t(-2) ) + 1UL );
308  tmp = lhs_(i,0UL) * rhs_(0UL,j);
309  for( size_t k=1UL; k<end; k+=2UL ) {
310  tmp += lhs_(i,k ) * rhs_(k ,j);
311  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
312  }
313  if( end < lhs_.columns() ) {
314  tmp += lhs_(i,end) * rhs_(end,j);
315  }
316  }
317  else {
318  reset( tmp );
319  }
320 
321  return tmp;
322  }
323  //**********************************************************************************************
324 
325  //**Rows function*******************************************************************************
330  inline size_t rows() const {
331  return lhs_.rows();
332  }
333  //**********************************************************************************************
334 
335  //**Columns function****************************************************************************
340  inline size_t columns() const {
341  return rhs_.columns();
342  }
343  //**********************************************************************************************
344 
345  //**Left operand access*************************************************************************
350  inline LeftOperand leftOperand() const {
351  return lhs_;
352  }
353  //**********************************************************************************************
354 
355  //**Right operand access************************************************************************
360  inline RightOperand rightOperand() const {
361  return rhs_;
362  }
363  //**********************************************************************************************
364 
365  //**********************************************************************************************
371  template< typename T >
372  inline bool canAlias( const T* alias ) const {
373  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
374  }
375  //**********************************************************************************************
376 
377  //**********************************************************************************************
383  template< typename T >
384  inline bool isAliased( const T* alias ) const {
385  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
386  }
387  //**********************************************************************************************
388 
389  //**********************************************************************************************
394  inline bool isAligned() const {
395  return lhs_.isAligned() && rhs_.isAligned();
396  }
397  //**********************************************************************************************
398 
399  //**********************************************************************************************
// Reports whether the expression can be used in SMP assignments (presumably; inferred
// from the name). The visible part of the condition holds when BLAS is not parallel
// (BLAZE_BLAS_IS_PARALLEL is zero) or when rows()*columns() is below
// TDMATDMATMULT_THRESHOLD.
// NOTE(review): listing line 407, the right operand of the trailing '&&' and the end of
// the return statement, is missing from this listing and must be restored.
404  inline bool canSMPAssign() const {
405  return ( !BLAZE_BLAS_IS_PARALLEL ||
406  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
408  }
409  //**********************************************************************************************
410 
411  private:
412  //**Member variables****************************************************************************
415  //**********************************************************************************************
416 
417  //**Assignment to dense matrices****************************************************************
// Assigns the matrix product 'rhs' to the dense matrix 'lhs'.
//  lhs  target dense matrix; its dimensions are asserted to match those of rhs
//  rhs  multiplication expression to be assigned
// NOTE(review): listing line 431 is missing from this listing, and the LT/RT typedefs
// used below are not visible here either.
427  template< typename MT // Type of the target dense matrix
428  , bool SO > // Storage order of the target dense matrix
429  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
430  {
432 
433  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
434  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
435 
// An empty target needs no work; an empty inner dimension yields a reset target.
436  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
437  return;
438  }
439  else if( rhs.lhs_.columns() == 0UL ) {
440  reset( ~lhs );
441  return;
442  }
443 
444  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
445  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
446 
// Sanity checks: A and B keep the operand dimensions and fit the target.
447  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
448  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
449  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
450  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
451  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
452  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
453 
// Hand over to the compile-time selected assignment kernel.
454  TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
455  }
457  //**********************************************************************************************
458 
459  //**Assignment to dense matrices (kernel selection)*********************************************
470  template< typename MT3 // Type of the left-hand side target matrix
471  , typename MT4 // Type of the left-hand side matrix operand
472  , typename MT5 > // Type of the right-hand side matrix operand
473  static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
474  selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
475  {
476  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
477  TDMatDMatMultExpr::selectDefaultAssignKernel( C, A, B );
478  else
479  TDMatDMatMultExpr::selectBlasAssignKernel( C, A, B );
480  }
482  //**********************************************************************************************
483 
484  //**Assignment to dense matrices (kernel selection)*********************************************
495  template< typename MT3 // Type of the left-hand side target matrix
496  , typename MT4 // Type of the left-hand side matrix operand
497  , typename MT5 > // Type of the right-hand side matrix operand
498  static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
499  selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
500  {
501  smpAssign( C, A * B );
502  }
504  //**********************************************************************************************
505 
506  //**Default assignment to dense matrices********************************************************
520  template< typename MT3 // Type of the left-hand side target matrix
521  , typename MT4 // Type of the left-hand side matrix operand
522  , typename MT5 > // Type of the right-hand side matrix operand
523  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
524  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
525  {
526  const size_t M( A.rows() );
527  const size_t N( B.columns() );
528  const size_t K( A.columns() );
529 
530  for( size_t i=0UL; i<M; ++i ) {
531  for( size_t j=0UL; j<N; ++j ) {
532  C(i,j) = A(i,0UL) * B(0UL,j);
533  }
534  for( size_t k=1UL; k<K; ++k ) {
535  for( size_t j=0UL; j<N; ++j ) {
536  C(i,j) += A(i,k) * B(k,j);
537  }
538  }
539  }
540  }
542  //**********************************************************************************************
543 
544  //**Vectorized default assignment to row-major dense matrices***********************************
// Vectorized default kernel for C = A * B with a row-major target (DenseMatrix<MT3,false>);
// this overload is enabled when UseVectorizedDefaultKernel applies to the type triple.
//  C  target matrix, written through (~C).store()
//  A  left-hand side matrix operand, read element-wise and broadcast via set()
//  B  right-hand side matrix operand, read through B.load()
// The columns of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector
// (IT::size elements each).
// NOTE(review): the accumulators are declared without an initializer and immediately
// used in 'xmm = xmm + ...'; this presumably relies on IntrinsicType default-constructing
// to zero - confirm.
// NOTE(review): the block loop conditions only require the start of the last vector to
// be below N, so loads/stores can extend past column N; presumably the matrices are
// padded accordingly - confirm.
558  template< typename MT3 // Type of the left-hand side target matrix
559  , typename MT4 // Type of the left-hand side matrix operand
560  , typename MT5 > // Type of the right-hand side matrix operand
561  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
562  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
563  {
564  typedef IntrinsicTrait<ElementType> IT;
565 
566  const size_t M( A.rows() );
567  const size_t N( B.columns() );
568  const size_t K( A.columns() );
569 
570  size_t j( 0UL );
571 
// Column blocks of 8 vectors, one row of C at a time.
572  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
573  for( size_t i=0UL; i<M; ++i ) {
574  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
575  for( size_t k=0UL; k<K; ++k ) {
576  const IntrinsicType a1( set( A(i,k) ) );
577  xmm1 = xmm1 + a1 * B.load(k,j );
578  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
579  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
580  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
581  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
582  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
583  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
584  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
585  }
586  (~C).store( i, j , xmm1 );
587  (~C).store( i, j+IT::size , xmm2 );
588  (~C).store( i, j+IT::size*2UL, xmm3 );
589  (~C).store( i, j+IT::size*3UL, xmm4 );
590  (~C).store( i, j+IT::size*4UL, xmm5 );
591  (~C).store( i, j+IT::size*5UL, xmm6 );
592  (~C).store( i, j+IT::size*6UL, xmm7 );
593  (~C).store( i, j+IT::size*7UL, xmm8 );
594  }
595  }
// Column blocks of 4 vectors, two rows of C at a time (xmm1-4: row i, xmm5-8: row i+1).
596  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
597  size_t i( 0UL );
598  for( ; (i+2UL) <= M; i+=2UL ) {
599  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
600  for( size_t k=0UL; k<K; ++k ) {
601  const IntrinsicType a1( set( A(i ,k) ) );
602  const IntrinsicType a2( set( A(i+1UL,k) ) );
603  const IntrinsicType b1( B.load(k,j ) );
604  const IntrinsicType b2( B.load(k,j+IT::size ) );
605  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
606  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
607  xmm1 = xmm1 + a1 * b1;
608  xmm2 = xmm2 + a1 * b2;
609  xmm3 = xmm3 + a1 * b3;
610  xmm4 = xmm4 + a1 * b4;
611  xmm5 = xmm5 + a2 * b1;
612  xmm6 = xmm6 + a2 * b2;
613  xmm7 = xmm7 + a2 * b3;
614  xmm8 = xmm8 + a2 * b4;
615  }
616  (~C).store( i , j , xmm1 );
617  (~C).store( i , j+IT::size , xmm2 );
618  (~C).store( i , j+IT::size*2UL, xmm3 );
619  (~C).store( i , j+IT::size*3UL, xmm4 );
620  (~C).store( i+1UL, j , xmm5 );
621  (~C).store( i+1UL, j+IT::size , xmm6 );
622  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
623  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
624  }
// Last row when M is odd.
625  if( i < M ) {
626  IntrinsicType xmm1, xmm2, xmm3, xmm4;
627  for( size_t k=0UL; k<K; ++k ) {
628  const IntrinsicType a1( set( A(i,k) ) );
629  xmm1 = xmm1 + a1 * B.load(k,j );
630  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
631  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
632  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
633  }
634  (~C).store( i, j , xmm1 );
635  (~C).store( i, j+IT::size , xmm2 );
636  (~C).store( i, j+IT::size*2UL, xmm3 );
637  (~C).store( i, j+IT::size*3UL, xmm4 );
638  }
639  }
// Column blocks of 2 vectors, two rows of C at a time (xmm1-2: row i, xmm3-4: row i+1).
640  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
641  size_t i( 0UL );
642  for( ; (i+2UL) <= M; i+=2UL ) {
643  IntrinsicType xmm1, xmm2, xmm3, xmm4;
644  for( size_t k=0UL; k<K; ++k ) {
645  const IntrinsicType a1( set( A(i ,k) ) );
646  const IntrinsicType a2( set( A(i+1UL,k) ) );
647  const IntrinsicType b1( B.load(k,j ) );
648  const IntrinsicType b2( B.load(k,j+IT::size) );
649  xmm1 = xmm1 + a1 * b1;
650  xmm2 = xmm2 + a1 * b2;
651  xmm3 = xmm3 + a2 * b1;
652  xmm4 = xmm4 + a2 * b2;
653  }
654  (~C).store( i , j , xmm1 );
655  (~C).store( i , j+IT::size, xmm2 );
656  (~C).store( i+1UL, j , xmm3 );
657  (~C).store( i+1UL, j+IT::size, xmm4 );
658  }
// Last row when M is odd.
659  if( i < M ) {
660  IntrinsicType xmm1, xmm2;
661  for( size_t k=0UL; k<K; ++k ) {
662  const IntrinsicType a1( set( A(i,k) ) );
663  xmm1 = xmm1 + a1 * B.load(k,j );
664  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
665  }
666  (~C).store( i, j , xmm1 );
667  (~C).store( i, j+IT::size, xmm2 );
668  }
669  }
// Remaining columns (at most IT::size of them): a single vector, two rows at a time.
670  if( j < N ) {
671  size_t i( 0UL );
672  for( ; (i+2UL) <= M; i+=2UL ) {
673  IntrinsicType xmm1, xmm2;
674  for( size_t k=0UL; k<K; ++k ) {
675  const IntrinsicType b1( B.load(k,j) );
676  xmm1 = xmm1 + set( A(i ,k) ) * b1;
677  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
678  }
679  (~C).store( i , j, xmm1 );
680  (~C).store( i+1UL, j, xmm2 );
681  }
// Last row when M is odd.
682  if( i < M ) {
683  IntrinsicType xmm1;
684  for( size_t k=0UL; k<K; ++k ) {
685  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
686  }
687  (~C).store( i, j, xmm1 );
688  }
689  }
690  }
692  //**********************************************************************************************
693 
694  //**Vectorized default assignment to column-major dense matrices********************************
// Vectorized default kernel for C = A * B with a column-major target
// (DenseMatrix<MT3,true>); this overload is enabled when UseVectorizedDefaultKernel
// applies to the type triple.
//  C  target matrix, written through (~C).store()
//  A  left-hand side matrix operand, read through A.load()
//  B  right-hand side matrix operand, read element-wise and broadcast via set()
// The rows of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector
// (IT::size elements each).
// NOTE(review): the accumulators are declared without an initializer and immediately
// used in 'xmm = xmm + ...'; this presumably relies on IntrinsicType default-constructing
// to zero - confirm.
// NOTE(review): the block loop conditions only require the start of the last vector to
// be below M, so loads/stores can extend past row M; presumably the matrices are
// padded accordingly - confirm.
708  template< typename MT3 // Type of the left-hand side target matrix
709  , typename MT4 // Type of the left-hand side matrix operand
710  , typename MT5 > // Type of the right-hand side matrix operand
711  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
712  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
713  {
714  typedef IntrinsicTrait<ElementType> IT;
715 
716  const size_t M( A.rows() );
717  const size_t N( B.columns() );
718  const size_t K( A.columns() );
719 
720  size_t i( 0UL );
721 
// Row blocks of 8 vectors, one column of C at a time.
722  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
723  for( size_t j=0UL; j<N; ++j ) {
724  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
725  for( size_t k=0UL; k<K; ++k ) {
726  const IntrinsicType b1( set( B(k,j) ) );
727  xmm1 = xmm1 + A.load(i ,k) * b1;
728  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
729  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
730  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
731  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
732  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
733  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
734  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
735  }
736  (~C).store( i , j, xmm1 );
737  (~C).store( i+IT::size , j, xmm2 );
738  (~C).store( i+IT::size*2UL, j, xmm3 );
739  (~C).store( i+IT::size*3UL, j, xmm4 );
740  (~C).store( i+IT::size*4UL, j, xmm5 );
741  (~C).store( i+IT::size*5UL, j, xmm6 );
742  (~C).store( i+IT::size*6UL, j, xmm7 );
743  (~C).store( i+IT::size*7UL, j, xmm8 );
744  }
745  }
// Row blocks of 4 vectors, two columns of C at a time (xmm1-4: column j, xmm5-8: column j+1).
746  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
747  size_t j( 0UL );
748  for( ; (j+2UL) <= N; j+=2UL ) {
749  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
750  for( size_t k=0UL; k<K; ++k ) {
751  const IntrinsicType a1( A.load(i ,k) );
752  const IntrinsicType a2( A.load(i+IT::size ,k) );
753  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
754  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
755  const IntrinsicType b1( set( B(k,j ) ) );
756  const IntrinsicType b2( set( B(k,j+1UL) ) );
757  xmm1 = xmm1 + a1 * b1;
758  xmm2 = xmm2 + a2 * b1;
759  xmm3 = xmm3 + a3 * b1;
760  xmm4 = xmm4 + a4 * b1;
761  xmm5 = xmm5 + a1 * b2;
762  xmm6 = xmm6 + a2 * b2;
763  xmm7 = xmm7 + a3 * b2;
764  xmm8 = xmm8 + a4 * b2;
765  }
766  (~C).store( i , j , xmm1 );
767  (~C).store( i+IT::size , j , xmm2 );
768  (~C).store( i+IT::size*2UL, j , xmm3 );
769  (~C).store( i+IT::size*3UL, j , xmm4 );
770  (~C).store( i , j+1UL, xmm5 );
771  (~C).store( i+IT::size , j+1UL, xmm6 );
772  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
773  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
774  }
// Last column when N is odd.
775  if( j < N ) {
776  IntrinsicType xmm1, xmm2, xmm3, xmm4;
777  for( size_t k=0UL; k<K; ++k ) {
778  const IntrinsicType b1( set( B(k,j) ) );
779  xmm1 = xmm1 + A.load(i ,k) * b1;
780  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
781  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
782  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
783  }
784  (~C).store( i , j, xmm1 );
785  (~C).store( i+IT::size , j, xmm2 );
786  (~C).store( i+IT::size*2UL, j, xmm3 );
787  (~C).store( i+IT::size*3UL, j, xmm4 );
788  }
789  }
// Row blocks of 2 vectors, two columns of C at a time (xmm1-2: column j, xmm3-4: column j+1).
790  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
791  size_t j( 0UL );
792  for( ; (j+2UL) <= N; j+=2UL ) {
793  IntrinsicType xmm1, xmm2, xmm3, xmm4;
794  for( size_t k=0UL; k<K; ++k ) {
795  const IntrinsicType a1( A.load(i ,k) );
796  const IntrinsicType a2( A.load(i+IT::size,k) );
797  const IntrinsicType b1( set( B(k,j ) ) );
798  const IntrinsicType b2( set( B(k,j+1UL) ) );
799  xmm1 = xmm1 + a1 * b1;
800  xmm2 = xmm2 + a2 * b1;
801  xmm3 = xmm3 + a1 * b2;
802  xmm4 = xmm4 + a2 * b2;
803  }
804  (~C).store( i , j , xmm1 );
805  (~C).store( i+IT::size, j , xmm2 );
806  (~C).store( i , j+1UL, xmm3 );
807  (~C).store( i+IT::size, j+1UL, xmm4 );
808  }
// Last column when N is odd.
809  if( j < N ) {
810  IntrinsicType xmm1, xmm2;
811  for( size_t k=0UL; k<K; ++k ) {
812  const IntrinsicType b1( set( B(k,j) ) );
813  xmm1 = xmm1 + A.load(i ,k) * b1;
814  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
815  }
816  (~C).store( i , j, xmm1 );
817  (~C).store( i+IT::size, j, xmm2 );
818  }
819  }
// Remaining rows (at most IT::size of them): a single vector, two columns at a time.
820  if( i < M ) {
821  size_t j( 0UL );
822  for( ; (j+2UL) <= N; j+=2UL ) {
823  IntrinsicType xmm1, xmm2;
824  for( size_t k=0UL; k<K; ++k ) {
825  const IntrinsicType a1( A.load(i,k) );
826  xmm1 = xmm1 + a1 * set( B(k,j ) );
827  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
828  }
829  (~C).store( i, j , xmm1 );
830  (~C).store( i, j+1UL, xmm2 );
831  }
// Last column when N is odd.
832  if( j < N ) {
833  IntrinsicType xmm1;
834  for( size_t k=0UL; k<K; ++k ) {
835  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
836  }
837  (~C).store( i, j, xmm1 );
838  }
839  }
840  }
842  //**********************************************************************************************
843 
844  //**BLAS-based assignment to dense matrices (default)*******************************************
858  template< typename MT3 // Type of the left-hand side target matrix
859  , typename MT4 // Type of the left-hand side matrix operand
860  , typename MT5 > // Type of the right-hand side matrix operand
861  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
862  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
863  {
864  selectDefaultAssignKernel( C, A, B );
865  }
867  //**********************************************************************************************
868 
869  //**BLAS-based assignment to dense matrices (single precision)**********************************
870 #if BLAZE_BLAS_MODE
871 
// BLAS-based assignment C = A * B via cblas_sgemm; this overload is enabled when
// UseSinglePrecisionKernel applies to the type triple.
//  C  target matrix
//  A  left-hand side matrix operand
//  B  right-hand side matrix operand
// All dimensions and leading dimensions (spacing()) are converted to int with
// boost::numeric_cast before the call.
// NOTE(review): listing lines 892-894 are missing from this listing.
884  template< typename MT3 // Type of the left-hand side target matrix
885  , typename MT4 // Type of the left-hand side matrix operand
886  , typename MT5 > // Type of the right-hand side matrix operand
887  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
888  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
889  {
890  using boost::numeric_cast;
891 
895 
896  const int M ( numeric_cast<int>( A.rows() ) );
897  const int N ( numeric_cast<int>( B.columns() ) );
898  const int K ( numeric_cast<int>( A.columns() ) );
899  const int lda( numeric_cast<int>( A.spacing() ) );
900  const int ldb( numeric_cast<int>( B.spacing() ) );
901  const int ldc( numeric_cast<int>( C.spacing() ) );
902 
// The layout follows the storage order of the target: for a row-major C, A is passed as
// transposed and B as non-transposed; for a column-major C it is the other way round
// (presumably because A is stored column-major and B row-major - confirm).
// alpha = 1 and beta = 0, so C is overwritten with the product.
903  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
904  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
905  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
906  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
907  }
909 #endif
910  //**********************************************************************************************
911 
912  //**BLAS-based assignment to dense matrices (double precision)**********************************
913 #if BLAZE_BLAS_MODE
914 
// BLAS-based assignment C = A * B via cblas_dgemm; this overload is enabled when
// UseDoublePrecisionKernel applies to the type triple.
//  C  target matrix
//  A  left-hand side matrix operand
//  B  right-hand side matrix operand
// All dimensions and leading dimensions (spacing()) are converted to int with
// boost::numeric_cast before the call.
// NOTE(review): listing lines 935-937 are missing from this listing.
927  template< typename MT3 // Type of the left-hand side target matrix
928  , typename MT4 // Type of the left-hand side matrix operand
929  , typename MT5 > // Type of the right-hand side matrix operand
930  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
931  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
932  {
933  using boost::numeric_cast;
934 
938 
939  const int M ( numeric_cast<int>( A.rows() ) );
940  const int N ( numeric_cast<int>( B.columns() ) );
941  const int K ( numeric_cast<int>( A.columns() ) );
942  const int lda( numeric_cast<int>( A.spacing() ) );
943  const int ldb( numeric_cast<int>( B.spacing() ) );
944  const int ldc( numeric_cast<int>( C.spacing() ) );
945 
// The layout follows the storage order of the target: for a row-major C, A is passed as
// transposed and B as non-transposed; for a column-major C it is the other way round
// (presumably because A is stored column-major and B row-major - confirm).
// alpha = 1 and beta = 0, so C is overwritten with the product.
946  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
947  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
948  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
949  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
950  }
952 #endif
953  //**********************************************************************************************
954 
955  //**BLAS-based assignment to dense matrices (single precision complex)**************************
956 #if BLAZE_BLAS_MODE
957 
// BLAS-based assignment C = A * B via cblas_cgemm; this overload is enabled when
// UseSinglePrecisionComplexKernel applies to the type triple.
//  C  target matrix
//  A  left-hand side matrix operand
//  B  right-hand side matrix operand
// All dimensions and leading dimensions (spacing()) are converted to int with
// boost::numeric_cast before the call.
// NOTE(review): listing lines 978-980 are missing from this listing.
970  template< typename MT3 // Type of the left-hand side target matrix
971  , typename MT4 // Type of the left-hand side matrix operand
972  , typename MT5 > // Type of the right-hand side matrix operand
973  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
974  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
975  {
976  using boost::numeric_cast;
977 
// Compile-time checks on the value_type of each complex element type.
981  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
982  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
983  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
984 
985  const int M ( numeric_cast<int>( A.rows() ) );
986  const int N ( numeric_cast<int>( B.columns() ) );
987  const int K ( numeric_cast<int>( A.columns() ) );
988  const int lda( numeric_cast<int>( A.spacing() ) );
989  const int ldb( numeric_cast<int>( B.spacing() ) );
990  const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed by address: alpha = 1+0i, beta = 0+0i.
991  const complex<float> alpha( 1.0F, 0.0F );
992  const complex<float> beta ( 0.0F, 0.0F );
993 
// The layout follows the storage order of the target: for a row-major C, A is passed as
// transposed and B as non-transposed; for a column-major C it is the other way round
// (presumably because A is stored column-major and B row-major - confirm).
994  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
995  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
996  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
997  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
998  }
1000 #endif
1001  //**********************************************************************************************
1002 
1003  //**BLAS-based assignment to dense matrices (double precision complex)**************************
1004 #if BLAZE_BLAS_MODE
1005 
// BLAS-based assignment C = A * B via cblas_zgemm; this overload is enabled when
// UseDoublePrecisionComplexKernel applies to the type triple.
//  C  target matrix
//  A  left-hand side matrix operand
//  B  right-hand side matrix operand
// All dimensions and leading dimensions (spacing()) are converted to int with
// boost::numeric_cast before the call.
// NOTE(review): listing lines 1026-1028 are missing from this listing.
1018  template< typename MT3 // Type of the left-hand side target matrix
1019  , typename MT4 // Type of the left-hand side matrix operand
1020  , typename MT5 > // Type of the right-hand side matrix operand
1021  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1022  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1023  {
1024  using boost::numeric_cast;
1025 
// Compile-time checks on the value_type of each complex element type.
1029  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1030  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1031  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1032 
1033  const int M ( numeric_cast<int>( A.rows() ) );
1034  const int N ( numeric_cast<int>( B.columns() ) );
1035  const int K ( numeric_cast<int>( A.columns() ) );
1036  const int lda( numeric_cast<int>( A.spacing() ) );
1037  const int ldb( numeric_cast<int>( B.spacing() ) );
1038  const int ldc( numeric_cast<int>( C.spacing() ) );
// Complex scaling factors are passed by address: alpha = 1+0i, beta = 0+0i.
1039  const complex<double> alpha( 1.0, 0.0 );
1040  const complex<double> beta ( 0.0, 0.0 );
1041 
// The layout follows the storage order of the target: for a row-major C, A is passed as
// transposed and B as non-transposed; for a column-major C it is the other way round
// (presumably because A is stored column-major and B row-major - confirm).
1042  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1043  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1044  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1045  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1046  }
1048 #endif
1049  //**********************************************************************************************
1050 
1051  //**Assignment to sparse matrices***************************************************************
// Assigns the matrix product 'rhs' to the sparse matrix 'lhs'.
//  lhs  target sparse matrix; its dimensions are asserted to match those of rhs
//  rhs  multiplication expression to be assigned
// The product is first evaluated into a temporary of type TmpType, which is then
// handed to smpAssign().
// NOTE(review): listing lines 1067 and 1071-1076 are missing from this listing.
1063  template< typename MT // Type of the target sparse matrix
1064  , bool SO > // Storage order of the target sparse matrix
1065  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1066  {
1068 
// Type of the temporary, selected by the storage-order flag SO of the target between
// ResultType and OppositeType.
1069  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
1070 
1077 
1078  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1079  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1080 
1081  const TmpType tmp( rhs );
1082  smpAssign( ~lhs, tmp );
1083  }
1085  //**********************************************************************************************
1086 
1087  //**Addition assignment to dense matrices*******************************************************
1100  template< typename MT // Type of the target dense matrix
1101  , bool SO > // Storage order of the target dense matrix
1102  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1103  {
1105 
1106  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1107  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1108 
1109  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1110  return;
1111  }
1112 
1113  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1114  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1115 
1116  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1117  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1118  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1119  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1120  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1121  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1122 
1123  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1124  }
1126  //**********************************************************************************************
1127 
1128  //**Addition assignment to dense matrices (kernel selection)************************************
1139  template< typename MT3 // Type of the left-hand side target matrix
1140  , typename MT4 // Type of the left-hand side matrix operand
1141  , typename MT5 > // Type of the right-hand side matrix operand
1142  static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1143  selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1144  {
1145  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
1146  TDMatDMatMultExpr::selectDefaultAddAssignKernel( C, A, B );
1147  else
1148  TDMatDMatMultExpr::selectBlasAddAssignKernel( C, A, B );
1149  }
1151  //**********************************************************************************************
1152 
1153  //**Addition assignment to dense matrices (kernel selection)************************************
1164  template< typename MT3 // Type of the left-hand side target matrix
1165  , typename MT4 // Type of the left-hand side matrix operand
1166  , typename MT5 > // Type of the right-hand side matrix operand
1167  static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1168  selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1169  {
1170  smpAddAssign( C, A * B );
1171  }
1173  //**********************************************************************************************
1174 
1175  //**Default addition assignment to dense matrices***********************************************
1189  template< typename MT3 // Type of the left-hand side target matrix
1190  , typename MT4 // Type of the left-hand side matrix operand
1191  , typename MT5 > // Type of the right-hand side matrix operand
1192  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1193  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1194  {
1195  const size_t M( A.rows() );
1196  const size_t N( B.columns() );
1197  const size_t K( A.columns() );
1198 
1199  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1200  const size_t end( N & size_t(-2) );
1201 
1202  for( size_t i=0UL; i<M; ++i ) {
1203  for( size_t k=0UL; k<K; ++k ) {
1204  for( size_t j=0UL; j<end; j+=2UL ) {
1205  C(i,j ) += A(i,k) * B(k,j );
1206  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1207  }
1208  if( end < N ) {
1209  C(i,end) += A(i,k) * B(k,end);
1210  }
1211  }
1212  }
1213  }
1215  //**********************************************************************************************
1216 
1217  //**Vectorized default addition assignment to row-major dense matrices**************************
/*!\brief Vectorized default addition assignment kernel for a row-major target (C += A*B).
//
// \param C The row-major target dense matrix; accessed via (~C).load() and (~C).store().
// \param A The left-hand side multiplication operand; read element-wise via A(i,k).
// \param B The right-hand side multiplication operand; read via B.load(k,j).
// \return void
//
// The columns of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements each. Every block loads the current values of C, accumulates the products
// over all k in ascending order, and stores the result back to C.
//
// NOTE(review): each tier's loop condition only guarantees that the first element of the last
// vector lies below N, so the last load/store of a block may reach past column N-1. This
// presumably relies on the rows of B and C being padded to a multiple of IT::size -- confirm.
// NOTE(review): set() is presumably a broadcast of one scalar into all vector lanes -- confirm.
*/
1231  template< typename MT3 // Type of the left-hand side target matrix
1232  , typename MT4 // Type of the left-hand side matrix operand
1233  , typename MT5 > // Type of the right-hand side matrix operand
1234  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1235  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1236  {
1237  typedef IntrinsicTrait<ElementType> IT;
1238 
1239  const size_t M( A.rows() );
1240  const size_t N( B.columns() );
1241  const size_t K( A.columns() );
1242 
// Column offset of the current block; shared by all tiers below so each tier continues
// where the previous one stopped.
1243  size_t j( 0UL );
1244 
// Tier 1: blocks of 8 vectors per row, one row of C at a time.
1245  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1246  for( size_t i=0UL; i<M; ++i ) {
1247  IntrinsicType xmm1( (~C).load(i,j ) );
1248  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1249  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1250  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1251  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
1252  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
1253  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
1254  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
1255  for( size_t k=0UL; k<K; ++k ) {
1256  const IntrinsicType a1( set( A(i,k) ) );
1257  xmm1 = xmm1 + a1 * B.load(k,j );
1258  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1259  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1260  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1261  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1262  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1263  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1264  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1265  }
1266  (~C).store( i, j , xmm1 );
1267  (~C).store( i, j+IT::size , xmm2 );
1268  (~C).store( i, j+IT::size*2UL, xmm3 );
1269  (~C).store( i, j+IT::size*3UL, xmm4 );
1270  (~C).store( i, j+IT::size*4UL, xmm5 );
1271  (~C).store( i, j+IT::size*5UL, xmm6 );
1272  (~C).store( i, j+IT::size*6UL, xmm7 );
1273  (~C).store( i, j+IT::size*7UL, xmm8 );
1274  }
1275  }
// Tier 2: blocks of 4 vectors per row, two rows of C at a time so every loaded vector of B
// is used for both rows; a single leftover row is handled after the paired loop.
1276  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1277  size_t i( 0UL );
1278  for( ; (i+2UL) <= M; i+=2UL ) {
1279  IntrinsicType xmm1( (~C).load(i ,j ) );
1280  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
1281  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
1282  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
1283  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1284  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
1285  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
1286  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
1287  for( size_t k=0UL; k<K; ++k ) {
1288  const IntrinsicType a1( set( A(i ,k) ) );
1289  const IntrinsicType a2( set( A(i+1UL,k) ) );
1290  const IntrinsicType b1( B.load(k,j ) );
1291  const IntrinsicType b2( B.load(k,j+IT::size ) );
1292  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
1293  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
1294  xmm1 = xmm1 + a1 * b1;
1295  xmm2 = xmm2 + a1 * b2;
1296  xmm3 = xmm3 + a1 * b3;
1297  xmm4 = xmm4 + a1 * b4;
1298  xmm5 = xmm5 + a2 * b1;
1299  xmm6 = xmm6 + a2 * b2;
1300  xmm7 = xmm7 + a2 * b3;
1301  xmm8 = xmm8 + a2 * b4;
1302  }
1303  (~C).store( i , j , xmm1 );
1304  (~C).store( i , j+IT::size , xmm2 );
1305  (~C).store( i , j+IT::size*2UL, xmm3 );
1306  (~C).store( i , j+IT::size*3UL, xmm4 );
1307  (~C).store( i+1UL, j , xmm5 );
1308  (~C).store( i+1UL, j+IT::size , xmm6 );
1309  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
1310  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
1311  }
// Leftover row when M is odd.
1312  if( i < M ) {
1313  IntrinsicType xmm1( (~C).load(i,j ) );
1314  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1315  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1316  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1317  for( size_t k=0UL; k<K; ++k ) {
1318  const IntrinsicType a1( set( A(i,k) ) );
1319  xmm1 = xmm1 + a1 * B.load(k,j );
1320  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1321  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1322  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1323  }
1324  (~C).store( i, j , xmm1 );
1325  (~C).store( i, j+IT::size , xmm2 );
1326  (~C).store( i, j+IT::size*2UL, xmm3 );
1327  (~C).store( i, j+IT::size*3UL, xmm4 );
1328  }
1329  }
// Tier 3: blocks of 2 vectors per row, two rows of C at a time plus a leftover row.
1330  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1331  size_t i( 0UL );
1332  for( ; (i+2UL) <= M; i+=2UL ) {
1333  IntrinsicType xmm1( (~C).load(i ,j ) );
1334  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
1335  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1336  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
1337  for( size_t k=0UL; k<K; ++k ) {
1338  const IntrinsicType a1( set( A(i ,k) ) );
1339  const IntrinsicType a2( set( A(i+1UL,k) ) );
1340  const IntrinsicType b1( B.load(k,j ) );
1341  const IntrinsicType b2( B.load(k,j+IT::size) );
1342  xmm1 = xmm1 + a1 * b1;
1343  xmm2 = xmm2 + a1 * b2;
1344  xmm3 = xmm3 + a2 * b1;
1345  xmm4 = xmm4 + a2 * b2;
1346  }
1347  (~C).store( i , j , xmm1 );
1348  (~C).store( i , j+IT::size, xmm2 );
1349  (~C).store( i+1UL, j , xmm3 );
1350  (~C).store( i+1UL, j+IT::size, xmm4 );
1351  }
1352  if( i < M ) {
1353  IntrinsicType xmm1( (~C).load(i,j ) );
1354  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
1355  for( size_t k=0UL; k<K; ++k ) {
1356  const IntrinsicType a1( set( A(i,k) ) );
1357  xmm1 = xmm1 + a1 * B.load(k,j );
1358  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
1359  }
1360  (~C).store( i, j , xmm1 );
1361  (~C).store( i, j+IT::size, xmm2 );
1362  }
1363  }
// Tier 4: at most one block of a single vector remains; again two rows at a time plus a
// leftover row.
1364  if( j < N ) {
1365  size_t i( 0UL );
1366  for( ; (i+2UL) <= M; i+=2UL ) {
1367  IntrinsicType xmm1( (~C).load(i ,j) );
1368  IntrinsicType xmm2( (~C).load(i+1UL,j) );
1369  for( size_t k=0UL; k<K; ++k ) {
1370  const IntrinsicType b1( B.load(k,j) );
1371  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1372  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1373  }
1374  (~C).store( i , j, xmm1 );
1375  (~C).store( i+1UL, j, xmm2 );
1376  }
1377  if( i < M ) {
1378  IntrinsicType xmm1( (~C).load(i,j) );
1379  for( size_t k=0UL; k<K; ++k ) {
1380  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
1381  }
1382  (~C).store( i, j, xmm1 );
1383  }
1384  }
1385  }
1387  //**********************************************************************************************
1388 
1389  //**Vectorized default addition assignment to column-major dense matrices***********************
/*!\brief Vectorized default addition assignment kernel for a column-major target (C += A*B).
//
// \param C The column-major target dense matrix; accessed via (~C).load() and (~C).store().
// \param A The left-hand side multiplication operand; read via A.load(i,k).
// \param B The right-hand side multiplication operand; read element-wise via B(k,j).
// \return void
//
// The rows of C are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements each. Every block loads the current values of C, accumulates the products
// over all k in ascending order, and stores the result back to C.
//
// NOTE(review): each tier's loop condition only guarantees that the first element of the last
// vector lies below M, so the last load/store of a block may reach past row M-1. This
// presumably relies on the columns of A and C being padded to a multiple of IT::size -- confirm.
// NOTE(review): set() is presumably a broadcast of one scalar into all vector lanes -- confirm.
*/
1403  template< typename MT3 // Type of the left-hand side target matrix
1404  , typename MT4 // Type of the left-hand side matrix operand
1405  , typename MT5 > // Type of the right-hand side matrix operand
1406  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1407  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1408  {
1409  typedef IntrinsicTrait<ElementType> IT;
1410 
1411  const size_t M( A.rows() );
1412  const size_t N( B.columns() );
1413  const size_t K( A.columns() );
1414 
// Row offset of the current block; shared by all tiers below so each tier continues where
// the previous one stopped.
1415  size_t i( 0UL );
1416 
// Tier 1: blocks of 8 vectors per column, one column of C at a time.
1417  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1418  for( size_t j=0UL; j<N; ++j ) {
1419  IntrinsicType xmm1( (~C).load(i ,j) );
1420  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
1421  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
1422  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
1423  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
1424  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
1425  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
1426  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
1427  for( size_t k=0UL; k<K; ++k ) {
1428  const IntrinsicType b1( set( B(k,j) ) );
1429  xmm1 = xmm1 + A.load(i ,k) * b1;
1430  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1431  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1432  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1433  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
1434  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
1435  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
1436  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
1437  }
1438  (~C).store( i , j, xmm1 );
1439  (~C).store( i+IT::size , j, xmm2 );
1440  (~C).store( i+IT::size*2UL, j, xmm3 );
1441  (~C).store( i+IT::size*3UL, j, xmm4 );
1442  (~C).store( i+IT::size*4UL, j, xmm5 );
1443  (~C).store( i+IT::size*5UL, j, xmm6 );
1444  (~C).store( i+IT::size*6UL, j, xmm7 );
1445  (~C).store( i+IT::size*7UL, j, xmm8 );
1446  }
1447  }
// Tier 2: blocks of 4 vectors per column, two columns of C at a time so every loaded vector
// of A is used for both columns; a single leftover column is handled after the paired loop.
1448  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1449  size_t j( 0UL );
1450  for( ; (j+2UL) <= N; j+=2UL ) {
1451  IntrinsicType xmm1( (~C).load(i ,j ) );
1452  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
1453  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
1454  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
1455  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
1456  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
1457  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
1458  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
1459  for( size_t k=0UL; k<K; ++k ) {
1460  const IntrinsicType a1( A.load(i ,k) );
1461  const IntrinsicType a2( A.load(i+IT::size ,k) );
1462  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
1463  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
1464  const IntrinsicType b1( set( B(k,j ) ) );
1465  const IntrinsicType b2( set( B(k,j+1UL) ) );
1466  xmm1 = xmm1 + a1 * b1;
1467  xmm2 = xmm2 + a2 * b1;
1468  xmm3 = xmm3 + a3 * b1;
1469  xmm4 = xmm4 + a4 * b1;
1470  xmm5 = xmm5 + a1 * b2;
1471  xmm6 = xmm6 + a2 * b2;
1472  xmm7 = xmm7 + a3 * b2;
1473  xmm8 = xmm8 + a4 * b2;
1474  }
1475  (~C).store( i , j , xmm1 );
1476  (~C).store( i+IT::size , j , xmm2 );
1477  (~C).store( i+IT::size*2UL, j , xmm3 );
1478  (~C).store( i+IT::size*3UL, j , xmm4 );
1479  (~C).store( i , j+1UL, xmm5 );
1480  (~C).store( i+IT::size , j+1UL, xmm6 );
1481  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
1482  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
1483  }
// Leftover column when N is odd.
1484  if( j < N ) {
1485  IntrinsicType xmm1( (~C).load(i ,j) );
1486  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
1487  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
1488  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
1489  for( size_t k=0UL; k<K; ++k ) {
1490  const IntrinsicType b1( set( B(k,j) ) );
1491  xmm1 = xmm1 + A.load(i ,k) * b1;
1492  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1493  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1494  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1495  }
1496  (~C).store( i , j, xmm1 );
1497  (~C).store( i+IT::size , j, xmm2 );
1498  (~C).store( i+IT::size*2UL, j, xmm3 );
1499  (~C).store( i+IT::size*3UL, j, xmm4 );
1500  }
1501  }
// Tier 3: blocks of 2 vectors per column, two columns of C at a time plus a leftover column.
1502  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1503  size_t j( 0UL );
1504  for( ; (j+2UL) <= N; j+=2UL ) {
1505  IntrinsicType xmm1( (~C).load(i ,j ) );
1506  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
1507  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
1508  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
1509  for( size_t k=0UL; k<K; ++k ) {
1510  const IntrinsicType a1( A.load(i ,k) );
1511  const IntrinsicType a2( A.load(i+IT::size,k) );
1512  const IntrinsicType b1( set( B(k,j ) ) );
1513  const IntrinsicType b2( set( B(k,j+1UL) ) );
1514  xmm1 = xmm1 + a1 * b1;
1515  xmm2 = xmm2 + a2 * b1;
1516  xmm3 = xmm3 + a1 * b2;
1517  xmm4 = xmm4 + a2 * b2;
1518  }
1519  (~C).store( i , j , xmm1 );
1520  (~C).store( i+IT::size, j , xmm2 );
1521  (~C).store( i , j+1UL, xmm3 );
1522  (~C).store( i+IT::size, j+1UL, xmm4 );
1523  }
1524  if( j < N ) {
1525  IntrinsicType xmm1( (~C).load(i ,j) );
1526  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
1527  for( size_t k=0UL; k<K; ++k ) {
1528  const IntrinsicType b1( set( B(k,j) ) );
1529  xmm1 = xmm1 + A.load(i ,k) * b1;
1530  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
1531  }
1532  (~C).store( i , j, xmm1 );
1533  (~C).store( i+IT::size, j, xmm2 );
1534  }
1535  }
// Tier 4: at most one block of a single vector remains; again two columns at a time plus a
// leftover column.
1536  if( i < M ) {
1537  size_t j( 0UL );
1538  for( ; (j+2UL) <= N; j+=2UL ) {
1539  IntrinsicType xmm1( (~C).load(i,j ) );
1540  IntrinsicType xmm2( (~C).load(i,j+1UL) );
1541  for( size_t k=0UL; k<K; ++k ) {
1542  const IntrinsicType a1( A.load(i,k) );
1543  xmm1 = xmm1 + a1 * set( B(k,j ) );
1544  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1545  }
1546  (~C).store( i, j , xmm1 );
1547  (~C).store( i, j+1UL, xmm2 );
1548  }
1549  if( j < N ) {
1550  IntrinsicType xmm1( (~C).load(i,j) );
1551  for( size_t k=0UL; k<K; ++k ) {
1552  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
1553  }
1554  (~C).store( i, j, xmm1 );
1555  }
1556  }
1557  }
1559  //**********************************************************************************************
1560 
1561  //**BLAS-based addition assignment to dense matrices (default)**********************************
1575  template< typename MT3 // Type of the left-hand side target matrix
1576  , typename MT4 // Type of the left-hand side matrix operand
1577  , typename MT5 > // Type of the right-hand side matrix operand
1578  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1579  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1580  {
1581  selectDefaultAddAssignKernel( C, A, B );
1582  }
1584  //**********************************************************************************************
1585 
1586  //**BLAS-based addition assignment to dense matrices (single precision)*************************
1587 #if BLAZE_BLAS_MODE
1588 
1601  template< typename MT3 // Type of the left-hand side target matrix
1602  , typename MT4 // Type of the left-hand side matrix operand
1603  , typename MT5 > // Type of the right-hand side matrix operand
1604  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1605  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1606  {
1607  using boost::numeric_cast;
1608 
1612 
1613  const int M ( numeric_cast<int>( A.rows() ) );
1614  const int N ( numeric_cast<int>( B.columns() ) );
1615  const int K ( numeric_cast<int>( A.columns() ) );
1616  const int lda( numeric_cast<int>( A.spacing() ) );
1617  const int ldb( numeric_cast<int>( B.spacing() ) );
1618  const int ldc( numeric_cast<int>( C.spacing() ) );
1619 
1620  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1621  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1622  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1623  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1624  }
1626 #endif
1627  //**********************************************************************************************
1628 
1629  //**BLAS-based addition assignment to dense matrices (double precision)*************************
1630 #if BLAZE_BLAS_MODE
1631 
1644  template< typename MT3 // Type of the left-hand side target matrix
1645  , typename MT4 // Type of the left-hand side matrix operand
1646  , typename MT5 > // Type of the right-hand side matrix operand
1647  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1648  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1649  {
1650  using boost::numeric_cast;
1651 
1655 
1656  const int M ( numeric_cast<int>( A.rows() ) );
1657  const int N ( numeric_cast<int>( B.columns() ) );
1658  const int K ( numeric_cast<int>( A.columns() ) );
1659  const int lda( numeric_cast<int>( A.spacing() ) );
1660  const int ldb( numeric_cast<int>( B.spacing() ) );
1661  const int ldc( numeric_cast<int>( C.spacing() ) );
1662 
1663  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1664  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1665  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1666  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1667  }
1669 #endif
1670  //**********************************************************************************************
1671 
1672  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
1673 #if BLAZE_BLAS_MODE
1674 
1687  template< typename MT3 // Type of the left-hand side target matrix
1688  , typename MT4 // Type of the left-hand side matrix operand
1689  , typename MT5 > // Type of the right-hand side matrix operand
1690  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1691  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1692  {
1693  using boost::numeric_cast;
1694 
1698  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1699  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1700  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1701 
1702  const int M ( numeric_cast<int>( A.rows() ) );
1703  const int N ( numeric_cast<int>( B.columns() ) );
1704  const int K ( numeric_cast<int>( A.columns() ) );
1705  const int lda( numeric_cast<int>( A.spacing() ) );
1706  const int ldb( numeric_cast<int>( B.spacing() ) );
1707  const int ldc( numeric_cast<int>( C.spacing() ) );
1708  const complex<float> alpha( 1.0F, 0.0F );
1709  const complex<float> beta ( 1.0F, 0.0F );
1710 
1711  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1712  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1713  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1714  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1715  }
1717 #endif
1718  //**********************************************************************************************
1719 
1720  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
1721 #if BLAZE_BLAS_MODE
1722 
1735  template< typename MT3 // Type of the left-hand side target matrix
1736  , typename MT4 // Type of the left-hand side matrix operand
1737  , typename MT5 > // Type of the right-hand side matrix operand
1738  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1739  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1740  {
1741  using boost::numeric_cast;
1742 
1746  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1747  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1748  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1749 
1750  const int M ( numeric_cast<int>( A.rows() ) );
1751  const int N ( numeric_cast<int>( B.columns() ) );
1752  const int K ( numeric_cast<int>( A.columns() ) );
1753  const int lda( numeric_cast<int>( A.spacing() ) );
1754  const int ldb( numeric_cast<int>( B.spacing() ) );
1755  const int ldc( numeric_cast<int>( C.spacing() ) );
1756  const complex<double> alpha( 1.0, 0.0 );
1757  const complex<double> beta ( 1.0, 0.0 );
1758 
1759  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1760  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1761  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1762  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1763  }
1765 #endif
1766  //**********************************************************************************************
1767 
1768  //**Addition assignment to sparse matrices******************************************************
1769  // No special implementation for the addition assignment to sparse matrices.
1770  //**********************************************************************************************
1771 
1772  //**Subtraction assignment to dense matrices****************************************************
1785  template< typename MT // Type of the target dense matrix
1786  , bool SO > // Storage order of the target dense matrix
1787  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1788  {
1790 
1791  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1792  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1793 
1794  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1795  return;
1796  }
1797 
1798  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1799  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1800 
1801  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1802  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1803  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1804  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1805  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1806  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1807 
1808  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
1809  }
1811  //**********************************************************************************************
1812 
1813  //**Subtraction assignment to dense matrices (kernel selection)*********************************
1824  template< typename MT3 // Type of the left-hand side target matrix
1825  , typename MT4 // Type of the left-hand side matrix operand
1826  , typename MT5 > // Type of the right-hand side matrix operand
1827  static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1828  selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1829  {
1830  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
1831  TDMatDMatMultExpr::selectDefaultSubAssignKernel( C, A, B );
1832  else
1833  TDMatDMatMultExpr::selectBlasSubAssignKernel( C, A, B );
1834  }
1836  //**********************************************************************************************
1837 
1838  //**Subtraction assignment to dense matrices (kernel selection)*********************************
1849  template< typename MT3 // Type of the left-hand side target matrix
1850  , typename MT4 // Type of the left-hand side matrix operand
1851  , typename MT5 > // Type of the right-hand side matrix operand
1852  static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5> >::Type
1853  selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1854  {
1855  smpSubAssign( C, A * B );
1856  }
1858  //**********************************************************************************************
1859 
1860  //**Default subtraction assignment to dense matrices********************************************
1874  template< typename MT3 // Type of the left-hand side target matrix
1875  , typename MT4 // Type of the left-hand side matrix operand
1876  , typename MT5 > // Type of the right-hand side matrix operand
1877  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1878  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1879  {
1880  const size_t M( A.rows() );
1881  const size_t N( B.columns() );
1882  const size_t K( A.columns() );
1883 
1884  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1885  const size_t end( N & size_t(-2) );
1886 
1887  for( size_t i=0UL; i<M; ++i ) {
1888  for( size_t k=0UL; k<K; ++k ) {
1889  for( size_t j=0UL; j<end; j+=2UL ) {
1890  C(i,j ) -= A(i,k) * B(k,j );
1891  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
1892  }
1893  if( end < N ) {
1894  C(i,end) -= A(i,k) * B(k,end);
1895  }
1896  }
1897  }
1898  }
1900  //**********************************************************************************************
1901 
1902  //**Vectorized default subtraction assignment to row-major dense matrices***********************
1916  template< typename MT3 // Type of the left-hand side target matrix
1917  , typename MT4 // Type of the left-hand side matrix operand
1918  , typename MT5 > // Type of the right-hand side matrix operand
1919  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1920  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1921  {
1922  typedef IntrinsicTrait<ElementType> IT;
1923 
1924  const size_t M( A.rows() );
1925  const size_t N( B.columns() );
1926  const size_t K( A.columns() );
1927 
1928  size_t j( 0UL );
1929 
1930  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1931  for( size_t i=0UL; i<M; ++i ) {
1932  IntrinsicType xmm1( (~C).load(i,j ) );
1933  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1934  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1935  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1936  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
1937  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
1938  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
1939  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
1940  for( size_t k=0UL; k<K; ++k ) {
1941  const IntrinsicType a1( set( A(i,k) ) );
1942  xmm1 = xmm1 - a1 * B.load(k,j );
1943  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1944  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1945  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1946  xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
1947  xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
1948  xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
1949  xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
1950  }
1951  (~C).store( i, j , xmm1 );
1952  (~C).store( i, j+IT::size , xmm2 );
1953  (~C).store( i, j+IT::size*2UL, xmm3 );
1954  (~C).store( i, j+IT::size*3UL, xmm4 );
1955  (~C).store( i, j+IT::size*4UL, xmm5 );
1956  (~C).store( i, j+IT::size*5UL, xmm6 );
1957  (~C).store( i, j+IT::size*6UL, xmm7 );
1958  (~C).store( i, j+IT::size*7UL, xmm8 );
1959  }
1960  }
1961  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1962  size_t i( 0UL );
1963  for( ; (i+2UL) <= M; i+=2UL ) {
1964  IntrinsicType xmm1( (~C).load(i ,j ) );
1965  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
1966  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
1967  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
1968  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1969  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
1970  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
1971  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
1972  for( size_t k=0UL; k<K; ++k ) {
1973  const IntrinsicType a1( set( A(i ,k) ) );
1974  const IntrinsicType a2( set( A(i+1UL,k) ) );
1975  const IntrinsicType b1( B.load(k,j ) );
1976  const IntrinsicType b2( B.load(k,j+IT::size ) );
1977  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
1978  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
1979  xmm1 = xmm1 - a1 * b1;
1980  xmm2 = xmm2 - a1 * b2;
1981  xmm3 = xmm3 - a1 * b3;
1982  xmm4 = xmm4 - a1 * b4;
1983  xmm5 = xmm5 - a2 * b1;
1984  xmm6 = xmm6 - a2 * b2;
1985  xmm7 = xmm7 - a2 * b3;
1986  xmm8 = xmm8 - a2 * b4;
1987  }
1988  (~C).store( i , j , xmm1 );
1989  (~C).store( i , j+IT::size , xmm2 );
1990  (~C).store( i , j+IT::size*2UL, xmm3 );
1991  (~C).store( i , j+IT::size*3UL, xmm4 );
1992  (~C).store( i+1UL, j , xmm5 );
1993  (~C).store( i+1UL, j+IT::size , xmm6 );
1994  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
1995  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
1996  }
1997  if( i < M ) {
1998  IntrinsicType xmm1( (~C).load(i,j ) );
1999  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
2000  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
2001  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
2002  for( size_t k=0UL; k<K; ++k ) {
2003  const IntrinsicType a1( set( A(i,k) ) );
2004  xmm1 = xmm1 - a1 * B.load(k,j );
2005  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
2006  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
2007  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
2008  }
2009  (~C).store( i, j , xmm1 );
2010  (~C).store( i, j+IT::size , xmm2 );
2011  (~C).store( i, j+IT::size*2UL, xmm3 );
2012  (~C).store( i, j+IT::size*3UL, xmm4 );
2013  }
2014  }
2015  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2016  size_t i( 0UL );
2017  for( ; (i+2UL) <= M; i+=2UL ) {
2018  IntrinsicType xmm1( (~C).load(i ,j ) );
2019  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
2020  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2021  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
2022  for( size_t k=0UL; k<K; ++k ) {
2023  const IntrinsicType a1( set( A(i ,k) ) );
2024  const IntrinsicType a2( set( A(i+1UL,k) ) );
2025  const IntrinsicType b1( B.load(k,j ) );
2026  const IntrinsicType b2( B.load(k,j+IT::size) );
2027  xmm1 = xmm1 - a1 * b1;
2028  xmm2 = xmm2 - a1 * b2;
2029  xmm3 = xmm3 - a2 * b1;
2030  xmm4 = xmm4 - a2 * b2;
2031  }
2032  (~C).store( i , j , xmm1 );
2033  (~C).store( i , j+IT::size, xmm2 );
2034  (~C).store( i+1UL, j , xmm3 );
2035  (~C).store( i+1UL, j+IT::size, xmm4 );
2036  }
2037  if( i < M ) {
2038  IntrinsicType xmm1( (~C).load(i,j ) );
2039  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
2040  for( size_t k=0UL; k<K; ++k ) {
2041  const IntrinsicType a1( set( A(i,k) ) );
2042  xmm1 = xmm1 - a1 * B.load(k,j );
2043  xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
2044  }
2045  (~C).store( i, j , xmm1 );
2046  (~C).store( i, j+IT::size, xmm2 );
2047  }
2048  }
2049  if( j < N ) {
2050  size_t i( 0UL );
2051  for( ; (i+2UL) <= M; i+=2UL ) {
2052  IntrinsicType xmm1( (~C).load(i ,j) );
2053  IntrinsicType xmm2( (~C).load(i+1UL,j) );
2054  for( size_t k=0UL; k<K; ++k ) {
2055  const IntrinsicType b1( B.load(k,j) );
2056  xmm1 = xmm1 - set( A(i ,k) ) * b1;
2057  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
2058  }
2059  (~C).store( i , j, xmm1 );
2060  (~C).store( i+1UL, j, xmm2 );
2061  }
2062  if( i < M ) {
2063  IntrinsicType xmm1( (~C).load(i,j) );
2064  for( size_t k=0UL; k<K; ++k ) {
2065  xmm1 = xmm1 - set( A(i,k) ) * B.load(k,j);
2066  }
2067  (~C).store( i, j, xmm1 );
2068  }
2069  }
2070  }
2072  //**********************************************************************************************
2073 
2074  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default kernel for the subtraction assignment C -= A * B to a column-major
// dense matrix target. Enabled only if UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// C: target matrix, read and written through (~C).load()/(~C).store().
// A: left-hand side operand, read vector-wise with A.load(i,k).
// B: right-hand side operand, read element-wise as B(k,j) and broadcast with set().
//
// With M = A.rows(), N = B.columns() and K = A.columns(), the rows of C are processed in
// blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of IT::size elements. Every accumulator
// is loaded from C, reduced by A.load(i,k) * B(k,j) for all k in [0,K), and stored back.
// NOTE(review): every block loads/stores full vectors even when fewer than IT::size rows
// remain (e.g. the final `if( i < M )` block), so this presumably relies on padded storage of
// A and C -- confirm.
2088  template< typename MT3 // Type of the left-hand side target matrix
2089  , typename MT4 // Type of the left-hand side matrix operand
2090  , typename MT5 > // Type of the right-hand side matrix operand
2091  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2092  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2093  {
2094  typedef IntrinsicTrait<ElementType> IT;
2095 
2096  const size_t M( A.rows() );
2097  const size_t N( B.columns() );
2098  const size_t K( A.columns() );
2099 
2100  size_t i( 0UL );
2101 
      // Row blocks of 8 vectors (8*IT::size rows), one column j of C per iteration.
2102  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2103  for( size_t j=0UL; j<N; ++j ) {
2104  IntrinsicType xmm1( (~C).load(i ,j) );
2105  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
2106  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
2107  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
2108  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
2109  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
2110  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
2111  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
2112  for( size_t k=0UL; k<K; ++k ) {
2113  const IntrinsicType b1( set( B(k,j) ) );
2114  xmm1 = xmm1 - A.load(i ,k) * b1;
2115  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
2116  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
2117  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
2118  xmm5 = xmm5 - A.load(i+IT::size*4UL,k) * b1;
2119  xmm6 = xmm6 - A.load(i+IT::size*5UL,k) * b1;
2120  xmm7 = xmm7 - A.load(i+IT::size*6UL,k) * b1;
2121  xmm8 = xmm8 - A.load(i+IT::size*7UL,k) * b1;
2122  }
2123  (~C).store( i , j, xmm1 );
2124  (~C).store( i+IT::size , j, xmm2 );
2125  (~C).store( i+IT::size*2UL, j, xmm3 );
2126  (~C).store( i+IT::size*3UL, j, xmm4 );
2127  (~C).store( i+IT::size*4UL, j, xmm5 );
2128  (~C).store( i+IT::size*5UL, j, xmm6 );
2129  (~C).store( i+IT::size*6UL, j, xmm7 );
2130  (~C).store( i+IT::size*7UL, j, xmm8 );
2131  }
2132  }
      // Row blocks of 4 vectors; two columns (j, j+1) of C per iteration so that each loaded
      // vector of A is reused for both columns.
2133  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2134  size_t j( 0UL );
2135  for( ; (j+2UL) <= N; j+=2UL ) {
2136  IntrinsicType xmm1( (~C).load(i ,j ) );
2137  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
2138  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
2139  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
2140  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
2141  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
2142  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
2143  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
2144  for( size_t k=0UL; k<K; ++k ) {
2145  const IntrinsicType a1( A.load(i ,k) );
2146  const IntrinsicType a2( A.load(i+IT::size ,k) );
2147  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
2148  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
2149  const IntrinsicType b1( set( B(k,j ) ) );
2150  const IntrinsicType b2( set( B(k,j+1UL) ) );
2151  xmm1 = xmm1 - a1 * b1;
2152  xmm2 = xmm2 - a2 * b1;
2153  xmm3 = xmm3 - a3 * b1;
2154  xmm4 = xmm4 - a4 * b1;
2155  xmm5 = xmm5 - a1 * b2;
2156  xmm6 = xmm6 - a2 * b2;
2157  xmm7 = xmm7 - a3 * b2;
2158  xmm8 = xmm8 - a4 * b2;
2159  }
2160  (~C).store( i , j , xmm1 );
2161  (~C).store( i+IT::size , j , xmm2 );
2162  (~C).store( i+IT::size*2UL, j , xmm3 );
2163  (~C).store( i+IT::size*3UL, j , xmm4 );
2164  (~C).store( i , j+1UL, xmm5 );
2165  (~C).store( i+IT::size , j+1UL, xmm6 );
2166  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
2167  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
2168  }
      // Remaining single column when N is odd.
2169  if( j < N ) {
2170  IntrinsicType xmm1( (~C).load(i ,j) );
2171  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
2172  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
2173  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
2174  for( size_t k=0UL; k<K; ++k ) {
2175  const IntrinsicType b1( set( B(k,j) ) );
2176  xmm1 = xmm1 - A.load(i ,k) * b1;
2177  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
2178  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
2179  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
2180  }
2181  (~C).store( i , j, xmm1 );
2182  (~C).store( i+IT::size , j, xmm2 );
2183  (~C).store( i+IT::size*2UL, j, xmm3 );
2184  (~C).store( i+IT::size*3UL, j, xmm4 );
2185  }
2186  }
      // Row blocks of 2 vectors; again two columns per iteration plus an odd-column remainder.
2187  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2188  size_t j( 0UL );
2189  for( ; (j+2UL) <= N; j+=2UL ) {
2190  IntrinsicType xmm1( (~C).load(i ,j ) );
2191  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
2192  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2193  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
2194  for( size_t k=0UL; k<K; ++k ) {
2195  const IntrinsicType a1( A.load(i ,k) );
2196  const IntrinsicType a2( A.load(i+IT::size,k) );
2197  const IntrinsicType b1( set( B(k,j ) ) );
2198  const IntrinsicType b2( set( B(k,j+1UL) ) );
2199  xmm1 = xmm1 - a1 * b1;
2200  xmm2 = xmm2 - a2 * b1;
2201  xmm3 = xmm3 - a1 * b2;
2202  xmm4 = xmm4 - a2 * b2;
2203  }
2204  (~C).store( i , j , xmm1 );
2205  (~C).store( i+IT::size, j , xmm2 );
2206  (~C).store( i , j+1UL, xmm3 );
2207  (~C).store( i+IT::size, j+1UL, xmm4 );
2208  }
2209  if( j < N ) {
2210  IntrinsicType xmm1( (~C).load(i ,j) );
2211  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
2212  for( size_t k=0UL; k<K; ++k ) {
2213  const IntrinsicType b1( set( B(k,j) ) );
2214  xmm1 = xmm1 - A.load(i ,k) * b1;
2215  xmm2 = xmm2 - A.load(i+IT::size,k) * b1;
2216  }
2217  (~C).store( i , j, xmm1 );
2218  (~C).store( i+IT::size, j, xmm2 );
2219  }
2220  }
      // Last single vector of rows starting at i (if any rows are left).
2221  if( i < M ) {
2222  size_t j( 0UL );
2223  for( ; (j+2UL) <= N; j+=2UL ) {
2224  IntrinsicType xmm1( (~C).load(i,j ) );
2225  IntrinsicType xmm2( (~C).load(i,j+1UL) );
2226  for( size_t k=0UL; k<K; ++k ) {
2227  const IntrinsicType a1( A.load(i,k) );
2228  xmm1 = xmm1 - a1 * set( B(k,j ) );
2229  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
2230  }
2231  (~C).store( i, j , xmm1 );
2232  (~C).store( i, j+1UL, xmm2 );
2233  }
2234  if( j < N ) {
2235  IntrinsicType xmm1( (~C).load(i,j) );
2236  for( size_t k=0UL; k<K; ++k ) {
2237  xmm1 = xmm1 - A.load(i,k) * set( B(k,j) );
2238  }
2239  (~C).store( i, j, xmm1 );
2240  }
2241  }
2242  }
2244  //**********************************************************************************************
2245 
2246  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback overload of the BLAS-based subtraction assignment kernel. It is enabled when
// UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards C, A and B to
// selectDefaultSubAssignKernel(), i.e. no BLAS routine is called.
2260  template< typename MT3 // Type of the left-hand side target matrix
2261  , typename MT4 // Type of the left-hand side matrix operand
2262  , typename MT5 > // Type of the right-hand side matrix operand
2263  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2264  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2265  {
2266  selectDefaultSubAssignKernel( C, A, B );
2267  }
2269  //**********************************************************************************************
2270 
2271  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
2272 #if BLAZE_BLAS_MODE
2273 
// BLAS kernel for the single precision subtraction assignment C -= A * B. Enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5> holds; the surrounding #if compiles it only in
// BLAZE_BLAS_MODE. The work is delegated to cblas_sgemm with alpha = -1.0F and beta = 1.0F
// (gemm computes C = alpha*op(A)*op(B) + beta*C).
2286  template< typename MT3 // Type of the left-hand side target matrix
2287  , typename MT4 // Type of the left-hand side matrix operand
2288  , typename MT5 > // Type of the right-hand side matrix operand
2289  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2290  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2291  {
2292  using boost::numeric_cast;
2293 
2297 
      // CBLAS takes int arguments; numeric_cast converts the size_t dimensions and spacings
      // (range-checked conversion, see the Boost numeric_cast documentation).
2298  const int M ( numeric_cast<int>( A.rows() ) );
2299  const int N ( numeric_cast<int>( B.columns() ) );
2300  const int K ( numeric_cast<int>( A.columns() ) );
2301  const int lda( numeric_cast<int>( A.spacing() ) );
2302  const int ldb( numeric_cast<int>( B.spacing() ) );
2303  const int ldc( numeric_cast<int>( C.spacing() ) );
2304 
      // The storage order passed to BLAS follows the target C. A is flagged as transposed for a
      // row-major C, B for a column-major C.
      // NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
2305  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2306  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2307  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2308  M, N, K, -1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
2309  }
2311 #endif
2312  //**********************************************************************************************
2313 
2314  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
2315 #if BLAZE_BLAS_MODE
2316 
// BLAS kernel for the double precision subtraction assignment C -= A * B. Enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5> holds; the surrounding #if compiles it only in
// BLAZE_BLAS_MODE. The work is delegated to cblas_dgemm with alpha = -1.0 and beta = 1.0
// (gemm computes C = alpha*op(A)*op(B) + beta*C).
2329  template< typename MT3 // Type of the left-hand side target matrix
2330  , typename MT4 // Type of the left-hand side matrix operand
2331  , typename MT5 > // Type of the right-hand side matrix operand
2332  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2333  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2334  {
2335  using boost::numeric_cast;
2336 
2340 
      // CBLAS takes int arguments; numeric_cast converts the size_t dimensions and spacings
      // (range-checked conversion, see the Boost numeric_cast documentation).
2341  const int M ( numeric_cast<int>( A.rows() ) );
2342  const int N ( numeric_cast<int>( B.columns() ) );
2343  const int K ( numeric_cast<int>( A.columns() ) );
2344  const int lda( numeric_cast<int>( A.spacing() ) );
2345  const int ldb( numeric_cast<int>( B.spacing() ) );
2346  const int ldc( numeric_cast<int>( C.spacing() ) );
2347 
      // The storage order passed to BLAS follows the target C. A is flagged as transposed for a
      // row-major C, B for a column-major C.
      // NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
2348  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2349  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2350  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2351  M, N, K, -1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
2352  }
2354 #endif
2355  //**********************************************************************************************
2356 
2357  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
2358 #if BLAZE_BLAS_MODE
2359 
// BLAS kernel for the single precision complex subtraction assignment C -= A * B. Enabled
// when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds; the surrounding #if compiles it
// only in BLAZE_BLAS_MODE. The work is delegated to cblas_cgemm with alpha = (-1,0) and
// beta = (1,0) (gemm computes C = alpha*op(A)*op(B) + beta*C).
2372  template< typename MT3 // Type of the left-hand side target matrix
2373  , typename MT4 // Type of the left-hand side matrix operand
2374  , typename MT5 > // Type of the right-hand side matrix operand
2375  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2376  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2377  {
2378  using boost::numeric_cast;
2379 
      // Compile-time checks on the underlying value type of the complex elements.
2383  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
2384  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
2385  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
2386 
      // CBLAS takes int arguments; numeric_cast converts the size_t dimensions and spacings
      // (range-checked conversion, see the Boost numeric_cast documentation).
2387  const int M ( numeric_cast<int>( A.rows() ) );
2388  const int N ( numeric_cast<int>( B.columns() ) );
2389  const int K ( numeric_cast<int>( A.columns() ) );
2390  const int lda( numeric_cast<int>( A.spacing() ) );
2391  const int ldb( numeric_cast<int>( B.spacing() ) );
2392  const int ldc( numeric_cast<int>( C.spacing() ) );
      // The complex gemm routine receives its scaling factors by address.
2393  const complex<float> alpha( -1.0F, 0.0F );
2394  const complex<float> beta ( 1.0F, 0.0F );
2395 
      // The storage order passed to BLAS follows the target C. A is flagged as transposed for a
      // row-major C, B for a column-major C.
      // NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
2396  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2397  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2398  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2399  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2400  }
2402 #endif
2403  //**********************************************************************************************
2404 
2405  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
2406 #if BLAZE_BLAS_MODE
2407 
// BLAS kernel for the double precision complex subtraction assignment C -= A * B. Enabled
// when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds; the surrounding #if compiles it
// only in BLAZE_BLAS_MODE. The work is delegated to cblas_zgemm with alpha = (-1,0) and
// beta = (1,0) (gemm computes C = alpha*op(A)*op(B) + beta*C).
2420  template< typename MT3 // Type of the left-hand side target matrix
2421  , typename MT4 // Type of the left-hand side matrix operand
2422  , typename MT5 > // Type of the right-hand side matrix operand
2423  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2424  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2425  {
2426  using boost::numeric_cast;
2427 
      // Compile-time checks on the underlying value type of the complex elements.
2431  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
2432  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
2433  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
2434 
      // CBLAS takes int arguments; numeric_cast converts the size_t dimensions and spacings
      // (range-checked conversion, see the Boost numeric_cast documentation).
2435  const int M ( numeric_cast<int>( A.rows() ) );
2436  const int N ( numeric_cast<int>( B.columns() ) );
2437  const int K ( numeric_cast<int>( A.columns() ) );
2438  const int lda( numeric_cast<int>( A.spacing() ) );
2439  const int ldb( numeric_cast<int>( B.spacing() ) );
2440  const int ldc( numeric_cast<int>( C.spacing() ) );
      // The complex gemm routine receives its scaling factors by address.
2441  const complex<double> alpha( -1.0, 0.0 );
2442  const complex<double> beta ( 1.0, 0.0 );
2443 
      // The storage order passed to BLAS follows the target C. A is flagged as transposed for a
      // row-major C, B for a column-major C.
      // NOTE(review): presumably because A is stored column-major and B row-major -- confirm.
2444  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2445  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2446  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2447  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
2448  }
2450 #endif
2451  //**********************************************************************************************
2452 
2453  //**Subtraction assignment to sparse matrices***************************************************
2454  // No special implementation for the subtraction assignment to sparse matrices.
2455  //**********************************************************************************************
2456 
2457  //**Multiplication assignment to dense matrices*************************************************
2458  // No special implementation for the multiplication assignment to dense matrices.
2459  //**********************************************************************************************
2460 
2461  //**Multiplication assignment to sparse matrices************************************************
2462  // No special implementation for the multiplication assignment to sparse matrices.
2463  //**********************************************************************************************
2464 
2465  //**Compile time checks*************************************************************************
2472  //**********************************************************************************************
2473 };
2474 //*************************************************************************************************
2475 
2476 
2477 
2478 
2479 //=================================================================================================
2480 //
2481 // DMATSCALARMULTEXPR SPECIALIZATION
2482 //
2483 //=================================================================================================
2484 
2485 //*************************************************************************************************
2493 template< typename MT1 // Type of the left-hand side dense matrix
2494  , typename MT2 // Type of the right-hand side dense matrix
2495  , typename ST > // Type of the right-hand side scalar value
2496 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >
2497  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2498  , private MatScalarMultExpr
2499  , private Computation
2500 {
2501  private:
2502  //**Type definitions****************************************************************************
2503  typedef TDMatDMatMultExpr<MT1,MT2> MMM; // Type of the wrapped matrix/matrix multiplication expression
2504  typedef typename MMM::ResultType RES; // Result type of the matrix/matrix multiplication expression
2505  typedef typename MT1::ResultType RT1; // Result type of the left-hand side matrix operand
2506  typedef typename MT2::ResultType RT2; // Result type of the right-hand side matrix operand
2507  typedef typename RT1::ElementType ET1; // Element type of the left-hand side result type
2508  typedef typename RT2::ElementType ET2; // Element type of the right-hand side result type
2509  typedef typename MT1::CompositeType CT1; // Composite type of the left-hand side matrix operand
2510  typedef typename MT2::CompositeType CT2; // Composite type of the right-hand side matrix operand
2511  //**********************************************************************************************
2512 
2513  //**********************************************************************************************
// Compile-time flag: set when IsComputation<MT1> or RequiresEvaluation<MT1> is true for the
// left-hand side matrix operand. It selects the evaluation type LT and feeds into
// UseSMPAssignKernel and smpAssignable.
2515  enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
2516  //**********************************************************************************************
2517 
2518  //**********************************************************************************************
// Compile-time flag: set when IsComputation<MT2> or RequiresEvaluation<MT2> is true for the
// right-hand side matrix operand. It selects the evaluation type RT and feeds into
// UseSMPAssignKernel and smpAssignable.
2520  enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
2521  //**********************************************************************************************
2522 
2523  //**********************************************************************************************
2525 
// Helper trait for the selection of the SMP assignment kernel. The template arguments are
// not inspected: value is set when either operand has to be evaluated (evaluateLeft or
// evaluateRight).
2527  template< typename T1, typename T2, typename T3, typename T4 >
2528  struct UseSMPAssignKernel {
2529  enum { value = evaluateLeft || evaluateRight };
2530  };
2531  //**********************************************************************************************
2532 
2533  //**********************************************************************************************
2535 
// Helper trait for the selection of the single precision BLAS kernel: value is set when
// IsFloat holds for the element types of T1, T2 and T3 and the scalar type T4 is not
// complex (per IsComplex<T4>).
2538  template< typename T1, typename T2, typename T3, typename T4 >
2539  struct UseSinglePrecisionKernel {
2540  enum { value = IsFloat<typename T1::ElementType>::value &&
2541  IsFloat<typename T2::ElementType>::value &&
2542  IsFloat<typename T3::ElementType>::value &&
2543  !IsComplex<T4>::value };
2544  };
2545  //**********************************************************************************************
2546 
2547  //**********************************************************************************************
2549 
// Helper trait for the selection of the double precision BLAS kernel: value is set when
// IsDouble holds for the element types of T1, T2 and T3 and the scalar type T4 is not
// complex (per IsComplex<T4>).
2552  template< typename T1, typename T2, typename T3, typename T4 >
2553  struct UseDoublePrecisionKernel {
2554  enum { value = IsDouble<typename T1::ElementType>::value &&
2555  IsDouble<typename T2::ElementType>::value &&
2556  IsDouble<typename T3::ElementType>::value &&
2557  !IsComplex<T4>::value };
2558  };
2559  //**********************************************************************************************
2560 
2561  //**********************************************************************************************
2563 
// Helper trait for the selection of the single precision complex BLAS kernel: value is set
// when the element types of T1, T2 and T3 are all exactly complex<float>.
2566  template< typename T1, typename T2, typename T3 >
2567  struct UseSinglePrecisionComplexKernel {
2568  typedef complex<float> Type;
2569  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2570  IsSame<typename T2::ElementType,Type>::value &&
2571  IsSame<typename T3::ElementType,Type>::value };
2572  };
2573  //**********************************************************************************************
2574 
2575  //**********************************************************************************************
2577 
// Helper trait for the selection of the double precision complex BLAS kernel: value is set
// when the element types of T1, T2 and T3 are all exactly complex<double>.
2580  template< typename T1, typename T2, typename T3 >
2581  struct UseDoublePrecisionComplexKernel {
2582  typedef complex<double> Type;
2583  enum { value = IsSame<typename T1::ElementType,Type>::value &&
2584  IsSame<typename T2::ElementType,Type>::value &&
2585  IsSame<typename T3::ElementType,Type>::value };
2586  };
2587  //**********************************************************************************************
2588 
2589  //**********************************************************************************************
2591 
// Helper trait for the selection of the default (non-BLAS) kernel: value is set when
// BLAZE_BLAS_MODE is off, or when none of the four BLAS kernel traits (single/double
// precision, real/complex) applies to the given types.
2593  template< typename T1, typename T2, typename T3, typename T4 >
2594  struct UseDefaultKernel {
2595  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2596  !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2597  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2598  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
2599  };
2600  //**********************************************************************************************
2601 
2602  //**********************************************************************************************
2604 
// Helper trait for the selection of the vectorized default kernel: value is set when T1, T2
// and T3 are all vectorizable, their element types and the scalar type T4 are identical,
// and IntrinsicTrait of that element type reports addition, subtraction and multiplication.
2606  template< typename T1, typename T2, typename T3, typename T4 >
2607  struct UseVectorizedDefaultKernel {
2608  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2609  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2610  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2611  IsSame<typename T1::ElementType,T4>::value &&
2612  IntrinsicTrait<typename T1::ElementType>::addition &&
2613  IntrinsicTrait<typename T1::ElementType>::subtraction &&
2614  IntrinsicTrait<typename T1::ElementType>::multiplication };
2615  };
2616  //**********************************************************************************************
2617 
2618  public:
2619  //**Type definitions****************************************************************************
2620  typedef DMatScalarMultExpr<MMM,ST,true> This; // Type of this DMatScalarMultExpr specialization
2621  typedef typename MultTrait<RES,ST>::Type ResultType; // Result type: MultTrait of the product's result type and the scalar type
2622  typedef typename ResultType::OppositeType OppositeType; // OppositeType of ResultType (presumably the opposite storage order -- confirm)
2623  typedef typename ResultType::TransposeType TransposeType; // TransposeType of ResultType
2624  typedef typename ResultType::ElementType ElementType; // Element type of ResultType
2625  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType; // Intrinsic type that IntrinsicTrait associates with ElementType
2626  typedef const ElementType ReturnType; // Const-qualified element type
2627  typedef const ResultType CompositeType; // Const-qualified result type
2628 
      // Type of the left-hand side operand: the wrapped matrix/matrix multiplication expression.
2630  typedef const TDMatDMatMultExpr<MT1,MT2> LeftOperand;
2631 
      // Type of the right-hand side operand: the scalar.
2633  typedef ST RightOperand;
2634 
      // Type used in assign() to evaluate the left matrix operand, chosen between const RT1 and
      // CT1 by evaluateLeft (presumably RT1 when the flag is set -- confirm SelectType semantics).
2636  typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type LT;
2637 
      // Type used in assign() to evaluate the right matrix operand, chosen between const RT2 and
      // CT2 by evaluateRight (presumably RT2 when the flag is set -- confirm SelectType semantics).
2639  typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type RT;
2640  //**********************************************************************************************
2641 
2642  //**Compilation flags***************************************************************************
// Compilation flag for vectorization: set when both matrix operands are vectorizable, their
// element types and the scalar type are identical, and IntrinsicTrait<ET1> reports addition
// and multiplication.
2644  enum { vectorizable = MT1::vectorizable && MT2::vectorizable &&
2645  IsSame<ET1,ET2>::value &&
2646  IsSame<ET1,ST>::value &&
2647  IntrinsicTrait<ET1>::addition &&
2648  IntrinsicTrait<ET1>::multiplication };
2649 
      // Compilation flag for SMP assignments: set only when neither matrix operand has to be
      // evaluated (neither evaluateLeft nor evaluateRight).
2651  enum { smpAssignable = !evaluateLeft && !evaluateRight };
2652  //**********************************************************************************************
2653 
2654  //**Constructor*********************************************************************************
// Constructor: stores the matrix/matrix multiplication expression and the scalar it is
// scaled by.
// matrix: left-hand side multiplication expression; scalar: right-hand side scalar.
2660  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
2661  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
2662  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2663  {}
2664  //**********************************************************************************************
2665 
2666  //**Access operator*****************************************************************************
// 2D element access: returns element (i,j) of the wrapped matrix product scaled by scalar_.
// i: row index, expected to be < rows(); j: column index, expected to be < columns(). Both
// are checked only through BLAZE_INTERNAL_ASSERT.
// The return type is ReturnType (const ElementType): matrix_(i,j) * scalar_ is a single
// element, so declaring the matrix type ResultType here was a mismatch between declaration
// and returned value.
2673  inline ReturnType operator()( size_t i, size_t j ) const {
2674  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
2675  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
2676  return matrix_(i,j) * scalar_;
2677  }
2678  //**********************************************************************************************
2679 
2680  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, i.e. matrix_.rows().
2685  inline size_t rows() const {
2686  return matrix_.rows();
2687  }
2688  //**********************************************************************************************
2689 
2690  //**Columns function****************************************************************************
// Returns the number of columns of the expression, i.e. matrix_.columns().
2695  inline size_t columns() const {
2696  return matrix_.columns();
2697  }
2698  //**********************************************************************************************
2699 
2700  //**Left operand access*************************************************************************
// Returns the left-hand side operand: the stored matrix/matrix multiplication expression.
2705  inline LeftOperand leftOperand() const {
2706  return matrix_;
2707  }
2708  //**********************************************************************************************
2709 
2710  //**Right operand access************************************************************************
// Returns the right-hand side operand: the stored scalar.
2715  inline RightOperand rightOperand() const {
2716  return scalar_;
2717  }
2718  //**********************************************************************************************
2719 
2720  //**********************************************************************************************
// Returns whether the expression can alias with the given address; the query is forwarded
// unchanged to matrix_.canAlias().
// alias: address to check.
2726  template< typename T >
2727  inline bool canAlias( const T* alias ) const {
2728  return matrix_.canAlias( alias );
2729  }
2730  //**********************************************************************************************
2731 
2732  //**********************************************************************************************
// Returns whether the expression is aliased with the given address; the query is forwarded
// unchanged to matrix_.isAliased().
// alias: address to check.
2738  template< typename T >
2739  inline bool isAliased( const T* alias ) const {
2740  return matrix_.isAliased( alias );
2741  }
2742  //**********************************************************************************************
2743 
2744  //**********************************************************************************************
// Returns the alignment state of the expression as reported by matrix_.isAligned().
2749  inline bool isAligned() const {
2750  return matrix_.isAligned();
2751  }
2752  //**********************************************************************************************
2753 
2754  //**********************************************************************************************
// Returns whether the expression can be used in an SMP assignment. Two conditions must hold:
//  - BLAZE_BLAS_IS_PARALLEL is off, or the result has fewer than TDMATDMATMULT_THRESHOLD
//    elements (rows() * columns());
//  - the right operand of the wrapped product has more than SMP_TDMATDMATMULT_THRESHOLD
//    columns.
2759  inline bool canSMPAssign() const {
2760  typename MMM::RightOperand B( matrix_.rightOperand() );
2761  return ( !BLAZE_BLAS_IS_PARALLEL ||
2762  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
2763  ( B.columns() > SMP_TDMATDMATMULT_THRESHOLD );
2764  }
2765  //**********************************************************************************************
2766 
2767  private:
2768  //**Member variables****************************************************************************
2769  LeftOperand matrix_; // Left-hand side operand: the matrix/matrix multiplication expression
2770  RightOperand scalar_; // Right-hand side operand: the scalar of the scaling
2771  //**********************************************************************************************
2772 
2773  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled matrix product (A * B * scalar) to the dense matrix lhs.
// lhs: target dense matrix; its dimensions are asserted to match rhs.
// rhs: the scaled multiplication expression to be assigned.
// An empty target is left untouched. If the inner dimension (left.columns()) is zero, the
// target is reset. Otherwise both matrix operands are evaluated into LT/RT and the work is
// handed to selectAssignKernel() together with the scalar.
2782  template< typename MT3 // Type of the target dense matrix
2783  , bool SO > // Storage order of the target dense matrix
2784  friend inline void assign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2785  {
2787 
2788  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2789  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2790 
2791  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2792  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2793 
      // Nothing to do for an empty target; a product over an empty inner dimension only resets
      // the target. This also guarantees K > 0 for the kernels below.
2794  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
2795  return;
2796  }
2797  else if( left.columns() == 0UL ) {
2798  reset( ~lhs );
2799  return;
2800  }
2801 
2802  LT A( left ); // Evaluation of the left-hand side dense matrix operand
2803  RT B( right ); // Evaluation of the right-hand side dense matrix operand
2804 
2805  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2806  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2807  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2808  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2809  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2810  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2811 
2812  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
2813  }
2814  //**********************************************************************************************
2815 
2816  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = A * B * scalar when UseSMPAssignKernel is not set
// (DisableIf). Targets with fewer than TDMATDMATMULT_THRESHOLD elements
// (C.rows() * C.columns()) use the default kernel, all others the BLAS kernel.
2827  template< typename MT3 // Type of the left-hand side target matrix
2828  , typename MT4 // Type of the left-hand side matrix operand
2829  , typename MT5 // Type of the right-hand side matrix operand
2830  , typename ST2 > // Type of the scalar value
2831  static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
2832  selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2833  {
2834  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
2835  DMatScalarMultExpr::selectDefaultAssignKernel( C, A, B, scalar );
2836  else
2837  DMatScalarMultExpr::selectBlasAssignKernel( C, A, B, scalar );
2838  }
2839  //**********************************************************************************************
2840 
2841  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = A * B * scalar when UseSMPAssignKernel is set
// (EnableIf): the expression A * B * scalar is rebuilt from the operands and handed to
// smpAssign().
2852  template< typename MT3 // Type of the left-hand side target matrix
2853  , typename MT4 // Type of the left-hand side matrix operand
2854  , typename MT5 // Type of the right-hand side matrix operand
2855  , typename ST2 > // Type of the scalar value
2856  static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
2857  selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2858  {
2859  smpAssign( C, A * B * scalar );
2860  }
2861  //**********************************************************************************************
2862 
2863  //**Default assignment to dense matrices********************************************************
// Default (non-vectorized) kernel for the assignment C = A * B * scalar; selected when
// UseVectorizedDefaultKernel is not set (DisableIf).
// Each row i of C is computed in three passes: initialization with A(i,0) * B(0,k),
// accumulation of A(i,j) * B(j,k) for j >= 1, and scaling of the finished row by scalar.
// The first pass reads column 0 of A and row 0 of B unconditionally; assign() returns early
// when the inner dimension is zero, so this kernel is reached with A.columns() > 0.
2877  template< typename MT3 // Type of the left-hand side target matrix
2878  , typename MT4 // Type of the left-hand side matrix operand
2879  , typename MT5 // Type of the right-hand side matrix operand
2880  , typename ST2 > // Type of the scalar value
2881  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2882  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2883  {
2884  for( size_t i=0UL; i<A.rows(); ++i ) {
      // Pass 1: overwrite row i of C with the first partial product.
2885  for( size_t k=0UL; k<B.columns(); ++k ) {
2886  C(i,k) = A(i,0UL) * B(0UL,k);
2887  }
      // Pass 2: accumulate the remaining partial products.
2888  for( size_t j=1UL; j<A.columns(); ++j ) {
2889  for( size_t k=0UL; k<B.columns(); ++k ) {
2890  C(i,k) += A(i,j) * B(j,k);
2891  }
2892  }
      // Pass 3: apply the scalar once per element of the finished row.
2893  for( size_t k=0UL; k<B.columns(); ++k ) {
2894  C(i,k) *= scalar;
2895  }
2896  }
2897  }
2898  //**********************************************************************************************
2899 
2900  //**Vectorized default assignment to row-major dense matrices***********************************
2914  template< typename MT3 // Type of the left-hand side target matrix
2915  , typename MT4 // Type of the left-hand side matrix operand
2916  , typename MT5 // Type of the right-hand side matrix operand
2917  , typename ST2 > // Type of the scalar value
2918  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2919  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2920  {
2921  typedef IntrinsicTrait<ElementType> IT;
2922 
2923  const size_t M( A.rows() );
2924  const size_t N( B.columns() );
2925  const size_t K( A.columns() );
2926 
2927  const IntrinsicType factor( set( scalar ) );
2928 
2929  size_t j( 0UL );
2930 
2931  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2932  for( size_t i=0UL; i<M; ++i ) {
2933  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934  for( size_t k=0UL; k<K; ++k ) {
2935  const IntrinsicType a1( set( A(i,k) ) );
2936  xmm1 = xmm1 + a1 * B.load(k,j );
2937  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2938  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2939  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2940  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
2941  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
2942  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
2943  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
2944  }
2945  (~C).store( i, j , xmm1 * factor );
2946  (~C).store( i, j+IT::size , xmm2 * factor );
2947  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
2948  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
2949  (~C).store( i, j+IT::size*4UL, xmm5 * factor );
2950  (~C).store( i, j+IT::size*5UL, xmm6 * factor );
2951  (~C).store( i, j+IT::size*6UL, xmm7 * factor );
2952  (~C).store( i, j+IT::size*7UL, xmm8 * factor );
2953  }
2954  }
2955  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2956  size_t i( 0UL );
2957  for( ; (i+2UL) <= M; i+=2UL ) {
2958  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2959  for( size_t k=0UL; k<K; ++k ) {
2960  const IntrinsicType a1( set( A(i ,k) ) );
2961  const IntrinsicType a2( set( A(i+1UL,k) ) );
2962  const IntrinsicType b1( B.load(k,j ) );
2963  const IntrinsicType b2( B.load(k,j+IT::size ) );
2964  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
2965  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
2966  xmm1 = xmm1 + a1 * b1;
2967  xmm2 = xmm2 + a1 * b2;
2968  xmm3 = xmm3 + a1 * b3;
2969  xmm4 = xmm4 + a1 * b4;
2970  xmm5 = xmm5 + a2 * b1;
2971  xmm6 = xmm6 + a2 * b2;
2972  xmm7 = xmm7 + a2 * b3;
2973  xmm8 = xmm8 + a2 * b4;
2974  }
2975  (~C).store( i , j , xmm1 * factor );
2976  (~C).store( i , j+IT::size , xmm2 * factor );
2977  (~C).store( i , j+IT::size*2UL, xmm3 * factor );
2978  (~C).store( i , j+IT::size*3UL, xmm4 * factor );
2979  (~C).store( i+1UL, j , xmm5 * factor );
2980  (~C).store( i+1UL, j+IT::size , xmm6 * factor );
2981  (~C).store( i+1UL, j+IT::size*2UL, xmm7 * factor );
2982  (~C).store( i+1UL, j+IT::size*3UL, xmm8 * factor );
2983  }
2984  if( i < M ) {
2985  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2986  for( size_t k=0UL; k<K; ++k ) {
2987  const IntrinsicType a1( set( A(i,k) ) );
2988  xmm1 = xmm1 + a1 * B.load(k,j );
2989  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2990  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2991  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2992  }
2993  (~C).store( i, j , xmm1 * factor );
2994  (~C).store( i, j+IT::size , xmm2 * factor );
2995  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
2996  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
2997  }
2998  }
2999  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3000  size_t i( 0UL );
3001  for( ; (i+2UL) <= M; i+=2UL ) {
3002  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3003  for( size_t k=0UL; k<K; ++k ) {
3004  const IntrinsicType a1( set( A(i ,k) ) );
3005  const IntrinsicType a2( set( A(i+1UL,k) ) );
3006  const IntrinsicType b1( B.load(k,j ) );
3007  const IntrinsicType b2( B.load(k,j+IT::size) );
3008  xmm1 = xmm1 + a1 * b1;
3009  xmm2 = xmm2 + a1 * b2;
3010  xmm3 = xmm3 + a2 * b1;
3011  xmm4 = xmm4 + a2 * b2;
3012  }
3013  (~C).store( i , j , xmm1 * factor );
3014  (~C).store( i , j+IT::size, xmm2 * factor );
3015  (~C).store( i+1UL, j , xmm3 * factor );
3016  (~C).store( i+1UL, j+IT::size, xmm4 * factor );
3017  }
3018  if( i < M ) {
3019  IntrinsicType xmm1, xmm2;
3020  for( size_t k=0UL; k<K; ++k ) {
3021  const IntrinsicType a1( set( A(i,k) ) );
3022  xmm1 = xmm1 + a1 * B.load(k,j );
3023  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3024  }
3025  (~C).store( i, j , xmm1 * factor );
3026  (~C).store( i, j+IT::size, xmm2 * factor );
3027  }
3028  }
3029  if( j < N ) {
3030  size_t i( 0UL );
3031  for( ; (i+2UL) <= M; i+=2UL ) {
3032  IntrinsicType xmm1, xmm2;
3033  for( size_t k=0UL; k<K; ++k ) {
3034  const IntrinsicType b1( B.load(k,j) );
3035  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3036  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3037  }
3038  (~C).store( i , j, xmm1 * factor );
3039  (~C).store( i+1UL, j, xmm2 * factor );
3040  }
3041  if( i < M ) {
3042  IntrinsicType xmm1;
3043  for( size_t k=0UL; k<K; ++k ) {
3044  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
3045  }
3046  (~C).store( i, j, xmm1 * factor );
3047  }
3048  }
3049  }
3050  //**********************************************************************************************
3051 
3052  //**Vectorized default assignment to column-major dense matrices********************************
3066  template< typename MT3 // Type of the left-hand side target matrix
3067  , typename MT4 // Type of the left-hand side matrix operand
3068  , typename MT5 // Type of the right-hand side matrix operand
3069  , typename ST2 > // Type of the scalar value
3070  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3071  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3072  {
3073  typedef IntrinsicTrait<ElementType> IT;
3074 
3075  const size_t M( A.rows() );
3076  const size_t N( B.columns() );
3077  const size_t K( A.columns() );
3078 
3079  const IntrinsicType factor( set( scalar ) );
3080 
3081  size_t i( 0UL );
3082 
3083  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3084  for( size_t j=0UL; j<N; ++j ) {
3085  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3086  for( size_t k=0UL; k<K; ++k ) {
3087  const IntrinsicType b1( set( B(k,j) ) );
3088  xmm1 = xmm1 + A.load(i ,k) * b1;
3089  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3090  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3091  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3092  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3093  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3094  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3095  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
3096  }
3097  (~C).store( i , j, xmm1 * factor );
3098  (~C).store( i+IT::size , j, xmm2 * factor );
3099  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
3100  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
3101  (~C).store( i+IT::size*4UL, j, xmm5 * factor );
3102  (~C).store( i+IT::size*5UL, j, xmm6 * factor );
3103  (~C).store( i+IT::size*6UL, j, xmm7 * factor );
3104  (~C).store( i+IT::size*7UL, j, xmm8 * factor );
3105  }
3106  }
3107  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3108  size_t j( 0UL );
3109  for( ; (j+2UL) <= N; j+=2UL ) {
3110  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3111  for( size_t k=0UL; k<K; ++k ) {
3112  const IntrinsicType a1( A.load(i ,k) );
3113  const IntrinsicType a2( A.load(i+IT::size ,k) );
3114  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
3115  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
3116  const IntrinsicType b1( set( B(k,j ) ) );
3117  const IntrinsicType b2( set( B(k,j+1UL) ) );
3118  xmm1 = xmm1 + a1 * b1;
3119  xmm2 = xmm2 + a2 * b1;
3120  xmm3 = xmm3 + a3 * b1;
3121  xmm4 = xmm4 + a4 * b1;
3122  xmm5 = xmm5 + a1 * b2;
3123  xmm6 = xmm6 + a2 * b2;
3124  xmm7 = xmm7 + a3 * b2;
3125  xmm8 = xmm8 + a4 * b2;
3126  }
3127  (~C).store( i , j , xmm1 * factor );
3128  (~C).store( i+IT::size , j , xmm2 * factor );
3129  (~C).store( i+IT::size*2UL, j , xmm3 * factor );
3130  (~C).store( i+IT::size*3UL, j , xmm4 * factor );
3131  (~C).store( i , j+1UL, xmm5 * factor );
3132  (~C).store( i+IT::size , j+1UL, xmm6 * factor );
3133  (~C).store( i+IT::size*2UL, j+1UL, xmm7 * factor );
3134  (~C).store( i+IT::size*3UL, j+1UL, xmm8 * factor );
3135  }
3136  if( j < N ) {
3137  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3138  for( size_t k=0UL; k<K; ++k ) {
3139  const IntrinsicType b1( set( B(k,j) ) );
3140  xmm1 = xmm1 + A.load(i ,k) * b1;
3141  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3142  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3143  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3144  }
3145  (~C).store( i , j, xmm1 * factor );
3146  (~C).store( i+IT::size , j, xmm2 * factor );
3147  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
3148  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
3149  }
3150  }
3151  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3152  size_t j( 0UL );
3153  for( ; (j+2UL) <= N; j+=2UL ) {
3154  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3155  for( size_t k=0UL; k<K; ++k ) {
3156  const IntrinsicType a1( A.load(i ,k) );
3157  const IntrinsicType a2( A.load(i+IT::size,k) );
3158  const IntrinsicType b1( set( B(k,j ) ) );
3159  const IntrinsicType b2( set( B(k,j+1UL) ) );
3160  xmm1 = xmm1 + a1 * b1;
3161  xmm2 = xmm2 + a2 * b1;
3162  xmm3 = xmm3 + a1 * b2;
3163  xmm4 = xmm4 + a2 * b2;
3164  }
3165  (~C).store( i , j , xmm1 * factor );
3166  (~C).store( i+IT::size, j , xmm2 * factor );
3167  (~C).store( i , j+1UL, xmm3 * factor );
3168  (~C).store( i+IT::size, j+1UL, xmm4 * factor );
3169  }
3170  if( j < N ) {
3171  IntrinsicType xmm1, xmm2;
3172  for( size_t k=0UL; k<K; ++k ) {
3173  const IntrinsicType b1( set( B(k,j) ) );
3174  xmm1 = xmm1 + A.load(i ,k) * b1;
3175  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3176  }
3177  (~C).store( i , j, xmm1 * factor );
3178  (~C).store( i+IT::size, j, xmm2 * factor );
3179  }
3180  }
3181  if( i < M ) {
3182  size_t j( 0UL );
3183  for( ; (j+2UL) <= N; j+=2UL ) {
3184  IntrinsicType xmm1, xmm2;
3185  for( size_t k=0UL; k<K; ++k ) {
3186  const IntrinsicType a1( A.load(i,k) );
3187  xmm1 = xmm1 + a1 * set( B(k,j ) );
3188  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3189  }
3190  (~C).store( i, j , xmm1 * factor );
3191  (~C).store( i, j+1UL, xmm2 * factor );
3192  }
3193  if( j < N ) {
3194  IntrinsicType xmm1;
3195  for( size_t k=0UL; k<K; ++k ) {
3196  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
3197  }
3198  (~C).store( i, j, xmm1 * factor );
3199  }
3200  }
3201  }
3202  //**********************************************************************************************
3203 
3204  //**BLAS-based assignment to dense matrices (default)*******************************************
3218  template< typename MT3 // Type of the left-hand side target matrix
3219  , typename MT4 // Type of the left-hand side matrix operand
3220  , typename MT5 // Type of the right-hand side matrix operand
3221  , typename ST2 > // Type of the scalar value
3222  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3223  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3224  {
3225  selectDefaultAssignKernel( C, A, B, scalar );
3226  }
3227  //**********************************************************************************************
3228 
3229  //**BLAS-based assignment to dense matrices (single precision)**********************************
3230 #if BLAZE_BLAS_MODE
3231 
3244  template< typename MT3 // Type of the left-hand side target matrix
3245  , typename MT4 // Type of the left-hand side matrix operand
3246  , typename MT5 // Type of the right-hand side matrix operand
3247  , typename ST2 > // Type of the scalar value
3248  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3249  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3250  {
3251  using boost::numeric_cast;
3252 
3256 
3257  const int M ( numeric_cast<int>( A.rows() ) );
3258  const int N ( numeric_cast<int>( B.columns() ) );
3259  const int K ( numeric_cast<int>( A.columns() ) );
3260  const int lda( numeric_cast<int>( A.spacing() ) );
3261  const int ldb( numeric_cast<int>( B.spacing() ) );
3262  const int ldc( numeric_cast<int>( C.spacing() ) );
3263 
3264  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3265  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3266  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3267  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
3268  }
3269 #endif
3270  //**********************************************************************************************
3271 
3272  //**BLAS-based assignment to dense matrices (double precision)**********************************
3273 #if BLAZE_BLAS_MODE
3274 
3287  template< typename MT3 // Type of the left-hand side target matrix
3288  , typename MT4 // Type of the left-hand side matrix operand
3289  , typename MT5 // Type of the right-hand side matrix operand
3290  , typename ST2 > // Type of the scalar value
3291  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3292  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3293  {
3294  using boost::numeric_cast;
3295 
3299 
3300  const int M ( numeric_cast<int>( A.rows() ) );
3301  const int N ( numeric_cast<int>( B.columns() ) );
3302  const int K ( numeric_cast<int>( A.columns() ) );
3303  const int lda( numeric_cast<int>( A.spacing() ) );
3304  const int ldb( numeric_cast<int>( B.spacing() ) );
3305  const int ldc( numeric_cast<int>( C.spacing() ) );
3306 
3307  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3308  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3309  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3310  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
3311  }
3312 #endif
3313  //**********************************************************************************************
3314 
3315  //**BLAS-based assignment to dense matrices (single precision complex)**************************
3316 #if BLAZE_BLAS_MODE
3317 
3330  template< typename MT3 // Type of the left-hand side target matrix
3331  , typename MT4 // Type of the left-hand side matrix operand
3332  , typename MT5 // Type of the right-hand side matrix operand
3333  , typename ST2 > // Type of the scalar value
3334  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3335  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3336  {
3337  using boost::numeric_cast;
3338 
3342  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3343  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3344  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3345 
3346  const int M ( numeric_cast<int>( A.rows() ) );
3347  const int N ( numeric_cast<int>( B.columns() ) );
3348  const int K ( numeric_cast<int>( A.columns() ) );
3349  const int lda( numeric_cast<int>( A.spacing() ) );
3350  const int ldb( numeric_cast<int>( B.spacing() ) );
3351  const int ldc( numeric_cast<int>( C.spacing() ) );
3352  const complex<float> alpha( scalar );
3353  const complex<float> beta ( 0.0F, 0.0F );
3354 
3355  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3356  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3357  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3358  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3359  }
3360 #endif
3361  //**********************************************************************************************
3362 
3363  //**BLAS-based assignment to dense matrices (double precision complex)**************************
3364 #if BLAZE_BLAS_MODE
3365 
3378  template< typename MT3 // Type of the left-hand side target matrix
3379  , typename MT4 // Type of the left-hand side matrix operand
3380  , typename MT5 // Type of the right-hand side matrix operand
3381  , typename ST2 > // Type of the scalar value
3382  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3383  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3384  {
3385  using boost::numeric_cast;
3386 
3390  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3391  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3392  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3393 
3394  const int M ( numeric_cast<int>( A.rows() ) );
3395  const int N ( numeric_cast<int>( B.columns() ) );
3396  const int K ( numeric_cast<int>( A.columns() ) );
3397  const int lda( numeric_cast<int>( A.spacing() ) );
3398  const int ldb( numeric_cast<int>( B.spacing() ) );
3399  const int ldc( numeric_cast<int>( C.spacing() ) );
3400  const complex<double> alpha( scalar );
3401  const complex<double> beta ( 0.0, 0.0 );
3402 
3403  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3404  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3405  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3406  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3407  }
3408 #endif
3409  //**********************************************************************************************
3410 
3411  //**Assignment to sparse matrices***************************************************************
3423  template< typename MT // Type of the target sparse matrix
3424  , bool SO > // Storage order of the target sparse matrix
3425  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
3426  {
3428 
3429  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3430 
3437 
3438  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3439  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3440 
3441  const TmpType tmp( rhs );
3442  smpAssign( ~lhs, tmp );
3443  }
3444  //**********************************************************************************************
3445 
3446  //**Addition assignment to dense matrices*******************************************************
3458  template< typename MT3 // Type of the target dense matrix
3459  , bool SO > // Storage order of the target dense matrix
3460  friend inline void addAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3461  {
3463 
3464  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3465  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3466 
3467  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3468  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3469 
3470  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3471  return;
3472  }
3473 
3474  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3475  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3476 
3477  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3478  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3479  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3480  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3481  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3482  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3483 
3484  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3485  }
3486  //**********************************************************************************************
3487 
3488  //**Addition assignment to dense matrices (kernel selection)************************************
3499  template< typename MT3 // Type of the left-hand side target matrix
3500  , typename MT4 // Type of the left-hand side matrix operand
3501  , typename MT5 // Type of the right-hand side matrix operand
3502  , typename ST2 > // Type of the scalar value
3503  static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
3504  selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3505  {
3506  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
3507  DMatScalarMultExpr::selectDefaultAddAssignKernel( C, A, B, scalar );
3508  else
3509  DMatScalarMultExpr::selectBlasAddAssignKernel( C, A, B, scalar );
3510  }
3511  //**********************************************************************************************
3512 
3513  //**Addition assignment to dense matrices (kernel selection)************************************
3524  template< typename MT3 // Type of the left-hand side target matrix
3525  , typename MT4 // Type of the left-hand side matrix operand
3526  , typename MT5 // Type of the right-hand side matrix operand
3527  , typename ST2 > // Type of the scalar value
3528  static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
3529  selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3530  {
3531  smpAddAssign( C, A * B * scalar );
3532  }
3533  //**********************************************************************************************
3534 
3535  //**Default addition assignment to dense matrices***********************************************
3549  template< typename MT3 // Type of the left-hand side target matrix
3550  , typename MT4 // Type of the left-hand side matrix operand
3551  , typename MT5 // Type of the right-hand side matrix operand
3552  , typename ST2 > // Type of the scalar value
3553  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3554  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3555  {
3556  const ResultType tmp( A * B * scalar );
3557  addAssign( C, tmp );
3558  }
3559  //**********************************************************************************************
3560 
3561  //**Vectorized default addition assignment to row-major dense matrices**************************
3575  template< typename MT3 // Type of the left-hand side target matrix
3576  , typename MT4 // Type of the left-hand side matrix operand
3577  , typename MT5 // Type of the right-hand side matrix operand
3578  , typename ST2 > // Type of the scalar value
3579  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3580  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3581  {
3582  typedef IntrinsicTrait<ElementType> IT;
3583 
3584  const size_t M( A.rows() );
3585  const size_t N( B.columns() );
3586  const size_t K( A.columns() );
3587 
3588  const IntrinsicType factor( set( scalar ) );
3589 
3590  size_t j( 0UL );
3591 
3592  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3593  for( size_t i=0UL; i<M; ++i ) {
3594  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3595  for( size_t k=0UL; k<K; ++k ) {
3596  const IntrinsicType a1( set( A(i,k) ) );
3597  xmm1 = xmm1 + a1 * B.load(k,j );
3598  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3599  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3600  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3601  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3602  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3603  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3604  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
3605  }
3606  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3607  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3608  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3609  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
3610  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
3611  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
3612  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
3613  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
3614  }
3615  }
3616  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3617  size_t i( 0UL );
3618  for( ; (i+2UL) <= M; i+=2UL ) {
3619  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3620  for( size_t k=0UL; k<K; ++k ) {
3621  const IntrinsicType a1( set( A(i ,k) ) );
3622  const IntrinsicType a2( set( A(i+1UL,k) ) );
3623  const IntrinsicType b1( B.load(k,j ) );
3624  const IntrinsicType b2( B.load(k,j+IT::size ) );
3625  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
3626  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
3627  xmm1 = xmm1 + a1 * b1;
3628  xmm2 = xmm2 + a1 * b2;
3629  xmm3 = xmm3 + a1 * b3;
3630  xmm4 = xmm4 + a1 * b4;
3631  xmm5 = xmm5 + a2 * b1;
3632  xmm6 = xmm6 + a2 * b2;
3633  xmm7 = xmm7 + a2 * b3;
3634  xmm8 = xmm8 + a2 * b4;
3635  }
3636  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3637  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
3638  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
3639  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
3640  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
3641  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
3642  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
3643  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
3644  }
3645  if( i < M ) {
3646  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3647  for( size_t k=0UL; k<K; ++k ) {
3648  const IntrinsicType a1( set( A(i,k) ) );
3649  xmm1 = xmm1 + a1 * B.load(k,j );
3650  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3651  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3652  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3653  }
3654  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3655  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3656  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3657  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
3658  }
3659  }
3660  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3661  size_t i( 0UL );
3662  for( ; (i+2UL) <= M; i+=2UL ) {
3663  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3664  for( size_t k=0UL; k<K; ++k ) {
3665  const IntrinsicType a1( set( A(i ,k) ) );
3666  const IntrinsicType a2( set( A(i+1UL,k) ) );
3667  const IntrinsicType b1( B.load(k,j ) );
3668  const IntrinsicType b2( B.load(k,j+IT::size) );
3669  xmm1 = xmm1 + a1 * b1;
3670  xmm2 = xmm2 + a1 * b2;
3671  xmm3 = xmm3 + a2 * b1;
3672  xmm4 = xmm4 + a2 * b2;
3673  }
3674  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3675  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
3676  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
3677  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
3678  }
3679  if( i < M ) {
3680  IntrinsicType xmm1, xmm2;
3681  for( size_t k=0UL; k<K; ++k ) {
3682  const IntrinsicType a1( set( A(i,k) ) );
3683  xmm1 = xmm1 + a1 * B.load(k,j );
3684  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3685  }
3686  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3687  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
3688  }
3689  }
3690  if( j < N ) {
3691  size_t i( 0UL );
3692  for( ; (i+2UL) <= M; i+=2UL ) {
3693  IntrinsicType xmm1, xmm2;
3694  for( size_t k=0UL; k<K; ++k ) {
3695  const IntrinsicType b1( B.load(k,j) );
3696  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3697  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3698  }
3699  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3700  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
3701  }
3702  if( i < M ) {
3703  IntrinsicType xmm1;
3704  for( size_t k=0UL; k<K; ++k ) {
3705  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
3706  }
3707  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
3708  }
3709  }
3710  }
3711  //**********************************************************************************************
3712 
3713  //**Vectorized default addition assignment to column-major dense matrices***********************
3727  template< typename MT3 // Type of the left-hand side target matrix
3728  , typename MT4 // Type of the left-hand side matrix operand
3729  , typename MT5 // Type of the right-hand side matrix operand
3730  , typename ST2 > // Type of the scalar value
3731  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3732  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3733  {
3734  typedef IntrinsicTrait<ElementType> IT;
3735 
3736  const size_t M( A.rows() );
3737  const size_t N( B.columns() );
3738  const size_t K( A.columns() );
3739 
3740  const IntrinsicType factor( set( scalar ) );
3741 
3742  size_t i( 0UL );
3743 
3744  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3745  for( size_t j=0UL; j<N; ++j ) {
3746  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3747  for( size_t k=0UL; k<K; ++k ) {
3748  const IntrinsicType b1( set( B(k,j) ) );
3749  xmm1 = xmm1 + A.load(i ,k) * b1;
3750  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3751  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3752  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3753  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3754  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3755  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3756  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
3757  }
3758  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3759  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3760  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3761  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
3762  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) + xmm5 * factor );
3763  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) + xmm6 * factor );
3764  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) + xmm7 * factor );
3765  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) + xmm8 * factor );
3766  }
3767  }
3768  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3769  size_t j( 0UL );
3770  for( ; (j+2UL) <= N; j+=2UL ) {
3771  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3772  for( size_t k=0UL; k<K; ++k ) {
3773  const IntrinsicType a1( A.load(i ,k) );
3774  const IntrinsicType a2( A.load(i+IT::size ,k) );
3775  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
3776  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
3777  const IntrinsicType b1( set( B(k,j ) ) );
3778  const IntrinsicType b2( set( B(k,j+1UL) ) );
3779  xmm1 = xmm1 + a1 * b1;
3780  xmm2 = xmm2 + a2 * b1;
3781  xmm3 = xmm3 + a3 * b1;
3782  xmm4 = xmm4 + a4 * b1;
3783  xmm5 = xmm5 + a1 * b2;
3784  xmm6 = xmm6 + a2 * b2;
3785  xmm7 = xmm7 + a3 * b2;
3786  xmm8 = xmm8 + a4 * b2;
3787  }
3788  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3789  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) + xmm2 * factor );
3790  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) + xmm3 * factor );
3791  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) + xmm4 * factor );
3792  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
3793  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) + xmm6 * factor );
3794  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) + xmm7 * factor );
3795  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) + xmm8 * factor );
3796  }
3797  if( j < N ) {
3798  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3799  for( size_t k=0UL; k<K; ++k ) {
3800  const IntrinsicType b1( set( B(k,j) ) );
3801  xmm1 = xmm1 + A.load(i ,k) * b1;
3802  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3803  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3804  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3805  }
3806  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3807  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3808  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3809  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
3810  }
3811  }
3812  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3813  size_t j( 0UL );
3814  for( ; (j+2UL) <= N; j+=2UL ) {
3815  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3816  for( size_t k=0UL; k<K; ++k ) {
3817  const IntrinsicType a1( A.load(i ,k) );
3818  const IntrinsicType a2( A.load(i+IT::size,k) );
3819  const IntrinsicType b1( set( B(k,j ) ) );
3820  const IntrinsicType b2( set( B(k,j+1UL) ) );
3821  xmm1 = xmm1 + a1 * b1;
3822  xmm2 = xmm2 + a2 * b1;
3823  xmm3 = xmm3 + a1 * b2;
3824  xmm4 = xmm4 + a2 * b2;
3825  }
3826  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3827  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) + xmm2 * factor );
3828  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
3829  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) + xmm4 * factor );
3830  }
3831  if( j < N ) {
3832  IntrinsicType xmm1, xmm2;
3833  for( size_t k=0UL; k<K; ++k ) {
3834  const IntrinsicType b1( set( B(k,j) ) );
3835  xmm1 = xmm1 + A.load(i ,k) * b1;
3836  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3837  }
3838  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3839  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) + xmm2 * factor );
3840  }
3841  }
3842  if( i < M ) {
3843  size_t j( 0UL );
3844  for( ; (j+2UL) <= N; j+=2UL ) {
3845  IntrinsicType xmm1, xmm2;
3846  for( size_t k=0UL; k<K; ++k ) {
3847  const IntrinsicType a1( A.load(i,k) );
3848  xmm1 = xmm1 + a1 * set( B(k,j ) );
3849  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3850  }
3851  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3852  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
3853  }
3854  if( j < N ) {
3855  IntrinsicType xmm1;
3856  for( size_t k=0UL; k<K; ++k ) {
3857  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
3858  }
3859  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
3860  }
3861  }
3862  }
3863  //**********************************************************************************************
3864 
3865  //**BLAS-based addition assignment to dense matrices (default)**********************************
3879  template< typename MT3 // Type of the left-hand side target matrix
3880  , typename MT4 // Type of the left-hand side matrix operand
3881  , typename MT5 // Type of the right-hand side matrix operand
3882  , typename ST2 > // Type of the scalar value
3883  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3884  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3885  {
3886  selectDefaultAddAssignKernel( C, A, B, scalar );
3887  }
3888  //**********************************************************************************************
3889 
3890  //**BLAS-based addition assignment to dense matrices (single precision)*************************
3891 #if BLAZE_BLAS_MODE
3892 
3905  template< typename MT3 // Type of the left-hand side target matrix
3906  , typename MT4 // Type of the left-hand side matrix operand
3907  , typename MT5 // Type of the right-hand side matrix operand
3908  , typename ST2 > // Type of the scalar value
3909  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3910  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3911  {
3912  using boost::numeric_cast;
3913 
3917 
3918  const int M ( numeric_cast<int>( A.rows() ) );
3919  const int N ( numeric_cast<int>( B.columns() ) );
3920  const int K ( numeric_cast<int>( A.columns() ) );
3921  const int lda( numeric_cast<int>( A.spacing() ) );
3922  const int ldb( numeric_cast<int>( B.spacing() ) );
3923  const int ldc( numeric_cast<int>( C.spacing() ) );
3924 
3925  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3926  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3927  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3928  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3929  }
3930 #endif
3931  //**********************************************************************************************
3932 
3933  //**BLAS-based addition assignment to dense matrices (double precision)*************************
3934 #if BLAZE_BLAS_MODE
3935 
3948  template< typename MT3 // Type of the left-hand side target matrix
3949  , typename MT4 // Type of the left-hand side matrix operand
3950  , typename MT5 // Type of the right-hand side matrix operand
3951  , typename ST2 > // Type of the scalar value
3952  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3953  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3954  {
3955  using boost::numeric_cast;
3956 
3960 
3961  const int M ( numeric_cast<int>( A.rows() ) );
3962  const int N ( numeric_cast<int>( B.columns() ) );
3963  const int K ( numeric_cast<int>( A.columns() ) );
3964  const int lda( numeric_cast<int>( A.spacing() ) );
3965  const int ldb( numeric_cast<int>( B.spacing() ) );
3966  const int ldc( numeric_cast<int>( C.spacing() ) );
3967 
3968  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3969  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3970  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3971  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3972  }
3973 #endif
3974  //**********************************************************************************************
3975 
3976  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
3977 #if BLAZE_BLAS_MODE
3978 
3991  template< typename MT3 // Type of the left-hand side target matrix
3992  , typename MT4 // Type of the left-hand side matrix operand
3993  , typename MT5 // Type of the right-hand side matrix operand
3994  , typename ST2 > // Type of the scalar value
3995  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3996  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3997  {
3998  using boost::numeric_cast;
3999 
4003  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
4004  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
4005  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
4006 
4007  const int M ( numeric_cast<int>( A.rows() ) );
4008  const int N ( numeric_cast<int>( B.columns() ) );
4009  const int K ( numeric_cast<int>( A.columns() ) );
4010  const int lda( numeric_cast<int>( A.spacing() ) );
4011  const int ldb( numeric_cast<int>( B.spacing() ) );
4012  const int ldc( numeric_cast<int>( C.spacing() ) );
4013  const complex<float> alpha( scalar );
4014  const complex<float> beta ( 1.0F, 0.0F );
4015 
4016  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4017  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4018  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4019  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4020  }
4021 #endif
4022  //**********************************************************************************************
4023 
4024  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
4025 #if BLAZE_BLAS_MODE
4026 
4039  template< typename MT3 // Type of the left-hand side target matrix
4040  , typename MT4 // Type of the left-hand side matrix operand
4041  , typename MT5 // Type of the right-hand side matrix operand
4042  , typename ST2 > // Type of the scalar value
4043  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4044  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4045  {
4046  using boost::numeric_cast;
4047 
4051  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
4052  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
4053  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
4054 
4055  const int M ( numeric_cast<int>( A.rows() ) );
4056  const int N ( numeric_cast<int>( B.columns() ) );
4057  const int K ( numeric_cast<int>( A.columns() ) );
4058  const int lda( numeric_cast<int>( A.spacing() ) );
4059  const int ldb( numeric_cast<int>( B.spacing() ) );
4060  const int ldc( numeric_cast<int>( C.spacing() ) );
4061  const complex<double> alpha( scalar );
4062  const complex<double> beta ( 1.0, 0.0 );
4063 
4064  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4065  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4066  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4067  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4068  }
4069 #endif
4070  //**********************************************************************************************
4071 
4072  //**Addition assignment to sparse matrices******************************************************
4073  // No special implementation for the addition assignment to sparse matrices.
4074  //**********************************************************************************************
4075 
4076  //**Subtraction assignment to dense matrices****************************************************
4088  template< typename MT3 // Type of the target dense matrix
4089  , bool SO > // Storage order of the target dense matrix
4090  friend inline void subAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
4091  {
4093 
4094  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4095  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4096 
4097  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4098  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4099 
4100  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
4101  return;
4102  }
4103 
4104  LT A( left ); // Evaluation of the left-hand side dense matrix operand
4105  RT B( right ); // Evaluation of the right-hand side dense matrix operand
4106 
4107  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4108  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4109  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4110  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4111  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4112  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4113 
4114  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
4115  }
4116  //**********************************************************************************************
4117 
4118  //**Subtraction assignment to dense matrices (kernel selection)*********************************
4129  template< typename MT3 // Type of the left-hand side target matrix
4130  , typename MT4 // Type of the left-hand side matrix operand
4131  , typename MT5 // Type of the right-hand side matrix operand
4132  , typename ST2 > // Type of the scalar value
4133  static inline typename DisableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
4134  selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4135  {
4136  if( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD )
4137  DMatScalarMultExpr::selectDefaultSubAssignKernel( C, A, B, scalar );
4138  else
4139  DMatScalarMultExpr::selectBlasSubAssignKernel( C, A, B, scalar );
4140  }
4141  //**********************************************************************************************
4142 
4143  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the scaled subtraction assignment (variant enabled when
// UseSMPAssignKernel<MT3,MT4,MT5,ST2> holds).
// Rebuilds the expression A * B * scalar from the already evaluated operands and passes
// it to smpSubAssign() together with the target matrix C.
//   C      - target matrix
//   A, B   - left-hand and right-hand side multiplication operands
//   scalar - scaling factor of the product
4154  template< typename MT3 // Type of the left-hand side target matrix
4155  , typename MT4 // Type of the left-hand side matrix operand
4156  , typename MT5 // Type of the right-hand side matrix operand
4157  , typename ST2 > // Type of the scalar value
4158  static inline typename EnableIf< UseSMPAssignKernel<MT3,MT4,MT5,ST2> >::Type
4159  selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4160  {
4161  smpSubAssign( C, A * B * scalar );
4162  }
4163  //**********************************************************************************************
4164 
4165  //**Default subtraction assignment to dense matrices********************************************
4179  template< typename MT3 // Type of the left-hand side target matrix
4180  , typename MT4 // Type of the left-hand side matrix operand
4181  , typename MT5 // Type of the right-hand side matrix operand
4182  , typename ST2 > // Type of the scalar value
4183  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4184  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4185  {
4186  const ResultType tmp( A * B * scalar );
4187  subAssign( C, tmp );
4188  }
4189  //**********************************************************************************************
4190 
4191  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default kernel of the scaled subtraction assignment for row-major targets.
// For every processed block of C the kernel accumulates sum_k A(i,k) * B(k,j..) in
// intrinsic accumulators, multiplies the sums by the broadcast scalar and subtracts the
// result from the current content of C (load, subtract, store).
//   C      - row-major target matrix, updated in place
//   A      - left-hand side operand; accessed element-wise and broadcast via set()
//   B      - right-hand side operand; accessed with intrinsic loads along its rows
//   scalar - scaling factor of the product
// The columns of C are processed in panels of 8, 4, 2 and finally 1 intrinsic vector(s).
// The column index j is shared by all stages, so each stage resumes where the previous
// one stopped.
// NOTE(review): the xmm accumulators are declared without an initializer and accumulated
// into right away -- presumably IntrinsicType default-constructs to zero; confirm.
4205  template< typename MT3 // Type of the left-hand side target matrix
4206  , typename MT4 // Type of the left-hand side matrix operand
4207  , typename MT5 // Type of the right-hand side matrix operand
4208  , typename ST2 > // Type of the scalar value
4209  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4210  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
4211  {
4212  typedef IntrinsicTrait<ElementType> IT;
4213 
4214  const size_t M( A.rows() );
4215  const size_t N( B.columns() );
4216  const size_t K( A.columns() );
4217 
// Scalar broadcast once; applied to every accumulated sum before the subtraction
4218  const IntrinsicType factor( set( scalar ) );
4219 
4220  size_t j( 0UL );
4221 
// Stage 1: panels of 8 intrinsic vectors per row, one row of C at a time
4222  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
4223  for( size_t i=0UL; i<M; ++i ) {
4224  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4225  for( size_t k=0UL; k<K; ++k ) {
4226  const IntrinsicType a1( set( A(i,k) ) );
4227  xmm1 = xmm1 + a1 * B.load(k,j );
4228  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4229  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4230  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4231  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
4232  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
4233  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
4234  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
4235  }
4236  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
4237  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
4238  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
4239  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
4240  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
4241  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
4242  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
4243  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
4244  }
4245  }
// Stage 2: panels of 4 intrinsic vectors, two rows of C per iteration
// (xmm1-4 belong to row i, xmm5-8 to row i+1)
4246  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
4247  size_t i( 0UL );
4248  for( ; (i+2UL) <= M; i+=2UL ) {
4249  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4250  for( size_t k=0UL; k<K; ++k ) {
4251  const IntrinsicType a1( set( A(i ,k) ) );
4252  const IntrinsicType a2( set( A(i+1UL,k) ) );
4253  const IntrinsicType b1( B.load(k,j ) );
4254  const IntrinsicType b2( B.load(k,j+IT::size ) );
4255  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
4256  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
4257  xmm1 = xmm1 + a1 * b1;
4258  xmm2 = xmm2 + a1 * b2;
4259  xmm3 = xmm3 + a1 * b3;
4260  xmm4 = xmm4 + a1 * b4;
4261  xmm5 = xmm5 + a2 * b1;
4262  xmm6 = xmm6 + a2 * b2;
4263  xmm7 = xmm7 + a2 * b3;
4264  xmm8 = xmm8 + a2 * b4;
4265  }
4266  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4267  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
4268  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
4269  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
4270  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
4271  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
4272  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
4273  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
4274  }
// Trailing single row when M is odd
4275  if( i < M ) {
4276  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4277  for( size_t k=0UL; k<K; ++k ) {
4278  const IntrinsicType a1( set( A(i,k) ) );
4279  xmm1 = xmm1 + a1 * B.load(k,j );
4280  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
4281  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
4282  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
4283  }
4284  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
4285  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
4286  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
4287  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
4288  }
4289  }
// Stage 3: panels of 2 intrinsic vectors, two rows of C per iteration
// (xmm1-2 belong to row i, xmm3-4 to row i+1)
4290  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
4291  size_t i( 0UL );
4292  for( ; (i+2UL) <= M; i+=2UL ) {
4293  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4294  for( size_t k=0UL; k<K; ++k ) {
4295  const IntrinsicType a1( set( A(i ,k) ) );
4296  const IntrinsicType a2( set( A(i+1UL,k) ) );
4297  const IntrinsicType b1( B.load(k,j ) );
4298  const IntrinsicType b2( B.load(k,j+IT::size) );
4299  xmm1 = xmm1 + a1 * b1;
4300  xmm2 = xmm2 + a1 * b2;
4301  xmm3 = xmm3 + a2 * b1;
4302  xmm4 = xmm4 + a2 * b2;
4303  }
4304  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4305  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
4306  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
4307  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
4308  }
// Trailing single row when M is odd
4309  if( i < M ) {
4310  IntrinsicType xmm1, xmm2;
4311  for( size_t k=0UL; k<K; ++k ) {
4312  const IntrinsicType a1( set( A(i,k) ) );
4313  xmm1 = xmm1 + a1 * B.load(k,j );
4314  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
4315  }
4316  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
4317  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
4318  }
4319  }
// Stage 4: one last intrinsic vector starting at column j, two rows per iteration.
// NOTE(review): a full vector is loaded/stored even if fewer than IT::size columns
// remain -- presumably the matrices are padded accordingly; confirm.
4320  if( j < N ) {
4321  size_t i( 0UL );
4322  for( ; (i+2UL) <= M; i+=2UL ) {
4323  IntrinsicType xmm1, xmm2;
4324  for( size_t k=0UL; k<K; ++k ) {
4325  const IntrinsicType b1( B.load(k,j) );
4326  xmm1 = xmm1 + set( A(i ,k) ) * b1;
4327  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
4328  }
4329  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4330  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
4331  }
// Trailing single row when M is odd
4332  if( i < M ) {
4333  IntrinsicType xmm1;
4334  for( size_t k=0UL; k<K; ++k ) {
4335  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
4336  }
4337  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
4338  }
4339  }
4340  }
4341  //**********************************************************************************************
4342 
4343  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default kernel of the scaled subtraction assignment for column-major targets.
// For every processed block of C the kernel accumulates sum_k A(i..,k) * B(k,j) in
// intrinsic accumulators, multiplies the sums by the broadcast scalar and subtracts the
// result from the current content of C (load, subtract, store).
//   C      - column-major target matrix, updated in place
//   A      - left-hand side operand; accessed with intrinsic loads along its columns
//   B      - right-hand side operand; accessed element-wise and broadcast via set()
//   scalar - scaling factor of the product
// The rows of C are processed in panels of 8, 4, 2 and finally 1 intrinsic vector(s).
// The row index i is shared by all stages, so each stage resumes where the previous one
// stopped.
// NOTE(review): the xmm accumulators are declared without an initializer and accumulated
// into right away -- presumably IntrinsicType default-constructs to zero; confirm.
4357  template< typename MT3 // Type of the left-hand side target matrix
4358  , typename MT4 // Type of the left-hand side matrix operand
4359  , typename MT5 // Type of the right-hand side matrix operand
4360  , typename ST2 > // Type of the scalar value
4361  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4362  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
4363  {
4364  typedef IntrinsicTrait<ElementType> IT;
4365 
4366  const size_t M( A.rows() );
4367  const size_t N( B.columns() );
4368  const size_t K( A.columns() );
4369 
// Scalar broadcast once; applied to every accumulated sum before the subtraction
4370  const IntrinsicType factor( set( scalar ) );
4371 
4372  size_t i( 0UL );
4373 
// Stage 1: panels of 8 intrinsic vectors per column, one column of C at a time
4374  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
4375  for( size_t j=0UL; j<N; ++j ) {
4376  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4377  for( size_t k=0UL; k<K; ++k ) {
4378  const IntrinsicType b1( set( B(k,j) ) );
4379  xmm1 = xmm1 + A.load(i ,k) * b1;
4380  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4381  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4382  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4383  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
4384  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
4385  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
4386  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
4387  }
4388  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4389  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4390  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4391  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
4392  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) - xmm5 * factor );
4393  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) - xmm6 * factor );
4394  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) - xmm7 * factor );
4395  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) - xmm8 * factor );
4396  }
4397  }
// Stage 2: panels of 4 intrinsic vectors, two columns of C per iteration
// (xmm1-4 belong to column j, xmm5-8 to column j+1)
4398  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
4399  size_t j( 0UL );
4400  for( ; (j+2UL) <= N; j+=2UL ) {
4401  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4402  for( size_t k=0UL; k<K; ++k ) {
4403  const IntrinsicType a1( A.load(i ,k) );
4404  const IntrinsicType a2( A.load(i+IT::size ,k) );
4405  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
4406  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
4407  const IntrinsicType b1( set( B(k,j ) ) );
4408  const IntrinsicType b2( set( B(k,j+1UL) ) );
4409  xmm1 = xmm1 + a1 * b1;
4410  xmm2 = xmm2 + a2 * b1;
4411  xmm3 = xmm3 + a3 * b1;
4412  xmm4 = xmm4 + a4 * b1;
4413  xmm5 = xmm5 + a1 * b2;
4414  xmm6 = xmm6 + a2 * b2;
4415  xmm7 = xmm7 + a3 * b2;
4416  xmm8 = xmm8 + a4 * b2;
4417  }
4418  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4419  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) - xmm2 * factor );
4420  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) - xmm3 * factor );
4421  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) - xmm4 * factor );
4422  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
4423  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) - xmm6 * factor );
4424  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) - xmm7 * factor );
4425  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) - xmm8 * factor );
4426  }
// Trailing single column when N is odd
4427  if( j < N ) {
4428  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4429  for( size_t k=0UL; k<K; ++k ) {
4430  const IntrinsicType b1( set( B(k,j) ) );
4431  xmm1 = xmm1 + A.load(i ,k) * b1;
4432  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4433  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4434  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4435  }
4436  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4437  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4438  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4439  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
4440  }
4441  }
// Stage 3: panels of 2 intrinsic vectors, two columns of C per iteration
// (xmm1-2 belong to column j, xmm3-4 to column j+1)
4442  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
4443  size_t j( 0UL );
4444  for( ; (j+2UL) <= N; j+=2UL ) {
4445  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4446  for( size_t k=0UL; k<K; ++k ) {
4447  const IntrinsicType a1( A.load(i ,k) );
4448  const IntrinsicType a2( A.load(i+IT::size,k) );
4449  const IntrinsicType b1( set( B(k,j ) ) );
4450  const IntrinsicType b2( set( B(k,j+1UL) ) );
4451  xmm1 = xmm1 + a1 * b1;
4452  xmm2 = xmm2 + a2 * b1;
4453  xmm3 = xmm3 + a1 * b2;
4454  xmm4 = xmm4 + a2 * b2;
4455  }
4456  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4457  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) - xmm2 * factor );
4458  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
4459  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) - xmm4 * factor );
4460  }
// Trailing single column when N is odd
4461  if( j < N ) {
4462  IntrinsicType xmm1, xmm2;
4463  for( size_t k=0UL; k<K; ++k ) {
4464  const IntrinsicType b1( set( B(k,j) ) );
4465  xmm1 = xmm1 + A.load(i ,k) * b1;
4466  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
4467  }
4468  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4469  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) - xmm2 * factor );
4470  }
4471  }
// Stage 4: one last intrinsic vector starting at row i, two columns per iteration.
// NOTE(review): a full vector is loaded/stored even if fewer than IT::size rows
// remain -- presumably the matrices are padded accordingly; confirm.
4472  if( i < M ) {
4473  size_t j( 0UL );
4474  for( ; (j+2UL) <= N; j+=2UL ) {
4475  IntrinsicType xmm1, xmm2;
4476  for( size_t k=0UL; k<K; ++k ) {
4477  const IntrinsicType a1( A.load(i,k) );
4478  xmm1 = xmm1 + a1 * set( B(k,j ) );
4479  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
4480  }
4481  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
4482  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
4483  }
// Trailing single column when N is odd
4484  if( j < N ) {
4485  IntrinsicType xmm1;
4486  for( size_t k=0UL; k<K; ++k ) {
4487  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
4488  }
4489  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
4490  }
4491  }
4492  }
4493  //**********************************************************************************************
4494 
4495  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
4509  template< typename MT3 // Type of the left-hand side target matrix
4510  , typename MT4 // Type of the left-hand side matrix operand
4511  , typename MT5 // Type of the right-hand side matrix operand
4512  , typename ST2 > // Type of the scalar value
4513  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4514  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4515  {
4516  selectDefaultSubAssignKernel( C, A, B, scalar );
4517  }
4518  //**********************************************************************************************
4519 
4520  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
4521 #if BLAZE_BLAS_MODE
4522 
4535  template< typename MT3 // Type of the left-hand side target matrix
4536  , typename MT4 // Type of the left-hand side matrix operand
4537  , typename MT5 // Type of the right-hand side matrix operand
4538  , typename ST2 > // Type of the scalar value
4539  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4540  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4541  {
4542  using boost::numeric_cast;
4543 
4547 
4548  const int M ( numeric_cast<int>( A.rows() ) );
4549  const int N ( numeric_cast<int>( B.columns() ) );
4550  const int K ( numeric_cast<int>( A.columns() ) );
4551  const int lda( numeric_cast<int>( A.spacing() ) );
4552  const int ldb( numeric_cast<int>( B.spacing() ) );
4553  const int ldc( numeric_cast<int>( C.spacing() ) );
4554 
4555  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4556  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4557  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4558  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
4559  }
4560 #endif
4561  //**********************************************************************************************
4562 
4563  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
4564 #if BLAZE_BLAS_MODE
4565 
4578  template< typename MT3 // Type of the left-hand side target matrix
4579  , typename MT4 // Type of the left-hand side matrix operand
4580  , typename MT5 // Type of the right-hand side matrix operand
4581  , typename ST2 > // Type of the scalar value
4582  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4583  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4584  {
4585  using boost::numeric_cast;
4586 
4590 
4591  const int M ( numeric_cast<int>( A.rows() ) );
4592  const int N ( numeric_cast<int>( B.columns() ) );
4593  const int K ( numeric_cast<int>( A.columns() ) );
4594  const int lda( numeric_cast<int>( A.spacing() ) );
4595  const int ldb( numeric_cast<int>( B.spacing() ) );
4596  const int ldc( numeric_cast<int>( C.spacing() ) );
4597 
4598  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4599  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4600  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4601  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
4602  }
4603 #endif
4604  //**********************************************************************************************
4605 
4606  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
4607 #if BLAZE_BLAS_MODE
4608 
4621  template< typename MT3 // Type of the left-hand side target matrix
4622  , typename MT4 // Type of the left-hand side matrix operand
4623  , typename MT5 // Type of the right-hand side matrix operand
4624  , typename ST2 > // Type of the scalar value
4625  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4626  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4627  {
4628  using boost::numeric_cast;
4629 
4633  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
4634  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
4635  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
4636 
4637  const int M ( numeric_cast<int>( A.rows() ) );
4638  const int N ( numeric_cast<int>( B.columns() ) );
4639  const int K ( numeric_cast<int>( A.columns() ) );
4640  const int lda( numeric_cast<int>( A.spacing() ) );
4641  const int ldb( numeric_cast<int>( B.spacing() ) );
4642  const int ldc( numeric_cast<int>( C.spacing() ) );
4643  const complex<float> alpha( -scalar );
4644  const complex<float> beta ( 1.0F, 0.0F );
4645 
4646  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4647  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4648  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4649  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4650  }
4651 #endif
4652  //**********************************************************************************************
4653 
4654  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
4655 #if BLAZE_BLAS_MODE
4656 
// Subtraction-assignment kernel that forwards the product to the BLAS routine cblas_zgemm().
//
// The overload is enabled through EnableIf on UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// The call passes alpha = -scalar and beta = (1,0); under the BLAS GEMM contract
// C := alpha*op(A)*op(B) + beta*C this amounts to C -= scalar * A * B.
//
// Parameters:
//   C       Target matrix, updated in place through C.data().
//   A       Left-hand side operand of the product.
//   B       Right-hand side operand of the product.
//   scalar  Scaling factor; it is negated and converted to complex<double>.
//
// All sizes and spacings are converted with boost::numeric_cast<int>, which reports a value
// that does not fit into int by throwing instead of truncating silently.
4669  template< typename MT3 // Type of the left-hand side target matrix
4670  , typename MT4 // Type of the left-hand side matrix operand
4671  , typename MT5 // Type of the right-hand side matrix operand
4672  , typename ST2 > // Type of the scalar value
4673  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4674  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4675  {
4676  using boost::numeric_cast;
4677 
      // Compile time checks on the value_type nested in each matrix element type
      // (double is required for the double precision complex kernel).
4681  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
4682  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
4683  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
4684 
      // GEMM dimensions: M = rows of A, N = columns of B, K = columns of A (shared dimension).
      // lda/ldb/ldc are the spacings reported by the matrices, passed as leading dimensions.
4685  const int M ( numeric_cast<int>( A.rows() ) );
4686  const int N ( numeric_cast<int>( B.columns() ) );
4687  const int K ( numeric_cast<int>( A.columns() ) );
4688  const int lda( numeric_cast<int>( A.spacing() ) );
4689  const int ldb( numeric_cast<int>( B.spacing() ) );
4690  const int ldc( numeric_cast<int>( C.spacing() ) );
      // alpha = -scalar turns the GEMM accumulation into a subtraction; beta = 1 keeps the
      // current contents of C.
4691  const complex<double> alpha( -scalar );
4692  const complex<double> beta ( 1.0, 0.0 );
4693 
      // The storage order handed to BLAS follows the target C; exactly one operand is flagged
      // as transposed relative to that order.
      // NOTE(review): this presumably relies on A being column-major and B being row-major;
      // confirm against the enclosing class, which is not visible in this part of the listing.
4694  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4695  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4696  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4697  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4698  }
4699 #endif
4700  //**********************************************************************************************
4701 
4702  //**Subtraction assignment to sparse matrices***************************************************
4703  // No special implementation for the subtraction assignment to sparse matrices.
4704  //**********************************************************************************************
4705 
4706  //**Multiplication assignment to dense matrices*************************************************
4707  // No special implementation for the multiplication assignment to dense matrices.
4708  //**********************************************************************************************
4709 
4710  //**Multiplication assignment to sparse matrices************************************************
4711  // No special implementation for the multiplication assignment to sparse matrices.
4712  //**********************************************************************************************
4713 
4714  //**Compile time checks*************************************************************************
4723  //**********************************************************************************************
4724 };
4726 //*************************************************************************************************
4727 
4728 
4729 
4730 
4731 //=================================================================================================
4732 //
4733 // GLOBAL BINARY ARITHMETIC OPERATORS
4734 //
4735 //=================================================================================================
4736 
4737 //*************************************************************************************************
// Creates the TDMatDMatMultExpr expression object for two dense matrix operands.
//
// NOTE(review): the embedded numbering of this listing jumps from 4768 to 4770 and from 4770
// to 4772, so the declarator line (function name plus the lhs/rhs parameters) and one further
// line are not visible here. Presumably this is operator*(), as the section title suggests;
// confirm against the original header.
//
// Throws std::invalid_argument when the number of columns of lhs differs from the number of
// rows of rhs. Otherwise it returns a TDMatDMatMultExpr<T1,T2> constructed from ~lhs and ~rhs.
4766 template< typename T1 // Type of the left-hand side dense matrix
4767  , typename T2 > // Type of the right-hand side dense matrix
4768 inline const TDMatDMatMultExpr<T1,T2>
4770 {
4772 
      // The inner dimensions must agree for the product to be defined.
4773  if( (~lhs).columns() != (~rhs).rows() )
4774  throw std::invalid_argument( "Matrix sizes do not match" );
4775 
4776  return TDMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
4777 }
4778 //*************************************************************************************************
4779 
4780 
4781 
4782 
4783 //=================================================================================================
4784 //
4785 // EXPRESSION TRAIT SPECIALIZATIONS
4786 //
4787 //=================================================================================================
4788 
4789 //*************************************************************************************************
// Specialization of TDMatDVecMultExprTrait for a TDMatDMatMultExpr<MT1,MT2> matrix operand and
// a vector operand VT.
//
// Type is selected by SelectType: when MT1 is a column-major dense matrix, MT2 is a row-major
// dense matrix and VT is a dense column vector, it is the nested trait
// TDMatDVecMultExprTrait< MT1, DMatDVecMultExprTrait<MT2,VT>::Type >::Type, which mirrors the
// grouping MT1 * ( MT2 * VT ). In all other cases Type is INVALID_TYPE.
4791 template< typename MT1, typename MT2, typename VT >
4792 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4793 {
4794  public:
4795  //**********************************************************************************************
4796  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4797  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4798  IsDenseVector<VT>::value && IsColumnVector<VT>::value
4799  , typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4800  , INVALID_TYPE >::Type Type;
4801  //**********************************************************************************************
4802 };
4804 //*************************************************************************************************
4805 
4806 
4807 //*************************************************************************************************
// Specialization of TDMatSVecMultExprTrait for a TDMatDMatMultExpr<MT1,MT2> matrix operand and
// a vector operand VT.
//
// Type is selected by SelectType: when MT1 is a column-major dense matrix, MT2 is a row-major
// dense matrix and VT is a sparse column vector, it is the nested trait
// TDMatDVecMultExprTrait< MT1, DMatSVecMultExprTrait<MT2,VT>::Type >::Type, which mirrors the
// grouping MT1 * ( MT2 * VT ). Note that the outer trait is the dense-vector one, applied to
// the result type of the inner matrix/sparse-vector trait. In all other cases Type is
// INVALID_TYPE.
4809 template< typename MT1, typename MT2, typename VT >
4810 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4811 {
4812  public:
4813  //**********************************************************************************************
4814  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4815  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4816  IsSparseVector<VT>::value && IsColumnVector<VT>::value
4817  , typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4818  , INVALID_TYPE >::Type Type;
4819  //**********************************************************************************************
4820 };
4822 //*************************************************************************************************
4823 
4824 
4825 //*************************************************************************************************
// Specialization of TDVecTDMatMultExprTrait for a vector operand VT and a
// TDMatDMatMultExpr<MT1,MT2> matrix operand.
//
// Type is selected by SelectType: when VT is a dense row vector, MT1 is a column-major dense
// matrix and MT2 is a row-major dense matrix, it is the nested trait
// TDVecDMatMultExprTrait< TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type, which mirrors
// the grouping ( VT * MT1 ) * MT2. In all other cases Type is INVALID_TYPE.
4827 template< typename VT, typename MT1, typename MT2 >
4828 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4829 {
4830  public:
4831  //**********************************************************************************************
4832  typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
4833  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4834  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4835  , typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4836  , INVALID_TYPE >::Type Type;
4837  //**********************************************************************************************
4838 };
4840 //*************************************************************************************************
4841 
4842 
4843 //*************************************************************************************************
// Specialization of TSVecTDMatMultExprTrait for a vector operand VT and a
// TDMatDMatMultExpr<MT1,MT2> matrix operand.
//
// Type is selected by SelectType: when VT is a sparse row vector, MT1 is a column-major dense
// matrix and MT2 is a row-major dense matrix, it is the nested trait
// TDVecDMatMultExprTrait< TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type, which mirrors
// the grouping ( VT * MT1 ) * MT2. Note that the outer trait is the dense-vector one, applied
// to the result type of the inner sparse-vector/matrix trait. In all other cases Type is
// INVALID_TYPE.
4845 template< typename VT, typename MT1, typename MT2 >
4846 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4847 {
4848  public:
4849  //**********************************************************************************************
4850  typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
4851  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4852  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4853  , typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4854  , INVALID_TYPE >::Type Type;
4855  //**********************************************************************************************
4856 };
4858 //*************************************************************************************************
4859 
4860 
4861 //*************************************************************************************************
// Specialization of SubmatrixExprTrait for TDMatDMatMultExpr<MT1,MT2>.
//
// Type is the MultExprTrait result of the two operand submatrix types: SubmatrixExprTrait is
// applied to const MT1 and to const MT2, both with the same AF template argument, and the two
// resulting types are combined through MultExprTrait.
4863 template< typename MT1, typename MT2, bool AF >
4864 struct SubmatrixExprTrait< TDMatDMatMultExpr<MT1,MT2>, AF >
4865 {
4866  public:
4867  //**********************************************************************************************
4868  typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
4869  , typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
4870  //**********************************************************************************************
4871 };
4873 //*************************************************************************************************
4874 
4875 
4876 //*************************************************************************************************
// Specialization of RowExprTrait for TDMatDMatMultExpr<MT1,MT2>.
//
// Type is MultExprTrait applied to the row type of the left operand
// (RowExprTrait<const MT1>::Type) and the unchanged right operand MT2. Only the left operand
// is reduced to a row, presumably because a row of a product is that row of the left factor
// times the right factor.
4878 template< typename MT1, typename MT2 >
4879 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >
4880 {
4881  public:
4882  //**********************************************************************************************
4883  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
4884  //**********************************************************************************************
4885 };
4887 //*************************************************************************************************
4888 
4889 
4890 //*************************************************************************************************
// Specialization of ColumnExprTrait for TDMatDMatMultExpr<MT1,MT2>.
//
// Type is MultExprTrait applied to the unchanged left operand MT1 and the column type of the
// right operand (ColumnExprTrait<const MT2>::Type). Only the right operand is reduced to a
// column, presumably because a column of a product is the left factor times that column of
// the right factor.
4892 template< typename MT1, typename MT2 >
4893 struct ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >
4894 {
4895  public:
4896  //**********************************************************************************************
4897  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
4898  //**********************************************************************************************
4899 };
4901 //*************************************************************************************************
4902 
4903 } // namespace blaze
4904 
4905 #endif
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4579
EnableIf< IsIntegral< T >, Load< T, sizeof(T)> >::Type::Type load(const T *address)
Loads a vector of integral values.
Definition: Load.h:222
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:340
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:4075
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:413
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:151
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:197
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
Header file for the IsColumnMajorMatrix type trait.
Header file for the sparse matrix SMP implementation.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:394
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2384
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:249
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:123
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:247
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:360
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:250
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
const size_t TDMATDMATMULT_THRESHOLD
Column-major dense matrix/row-major dense matrix multiplication threshold.This setting specifies the ...
Definition: Thresholds.h:159
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Expression object for transpose dense matrix-dense matrix multiplications.The TDMatDMatMultExpr class...
Definition: Forward.h:121
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:122
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:121
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:126
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:127
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:285
Header file for the multiplication trait.
Header file for the IsDouble type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:262
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the TDMatSVecMultExprTrait class template.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:251
Header file for the dense matrix SMP implementation.
const size_t SMP_TDMATDMATMULT_THRESHOLD
SMP column-major dense matrix/row-major dense matrix multiplication threshold.This threshold represen...
Definition: Thresholds.h:459
Header file for the DenseMatrix base class.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:259
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:246
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:265
Constraints on the storage order of matrix types.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2382
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:256
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:252
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:91
Header file for the IsNumeric type trait.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:124
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:404
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:239
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:125
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:249
Substitution Failure Is Not An Error (SFINAE) class.The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:384
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:350
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:248
Header file for the TDVecDMatMultExprTrait class template.
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:248
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2379
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the complex data type.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:330
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:253
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:414
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:300
Constraint on the data type.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:247
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
void store(float *address, const sse_float_t &value)
Aligned store of a vector of 'float' values.
Definition: Store.h:242
Header file for the IsExpression type trait class.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:372
Header file for the FunctionTrace class.