All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <stdexcept>
44 #include <boost/cast.hpp>
52 #include <blaze/math/Intrinsics.h>
53 #include <blaze/math/shims/Reset.h>
75 #include <blaze/system/BLAS.h>
77 #include <blaze/util/Assert.h>
78 #include <blaze/util/Complex.h>
84 #include <blaze/util/DisableIf.h>
85 #include <blaze/util/EnableIf.h>
86 #include <blaze/util/InvalidType.h>
88 #include <blaze/util/SelectType.h>
89 #include <blaze/util/Types.h>
95 
96 
97 namespace blaze {
98 
99 //=================================================================================================
100 //
101 // CLASS TDMATDMATMULTEXPR
102 //
103 //=================================================================================================
104 
105 //*************************************************************************************************
112 template< typename MT1 // Type of the left-hand side dense matrix
113  , typename MT2 > // Type of the right-hand side dense matrix
114 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
115  , private MatMatMultExpr
116  , private Computation
117 {
118  private:
119  //**Type definitions****************************************************************************
// Private shorthand aliases for the nested types of the two operand matrix types.
120  typedef typename MT1::ResultType RT1; // ResultType of the left-hand side operand type
121  typedef typename MT2::ResultType RT2; // ResultType of the right-hand side operand type
122  typedef typename MT1::CompositeType CT1; // CompositeType of the left-hand side operand type
123  typedef typename MT2::CompositeType CT2; // CompositeType of the right-hand side operand type
124  //**********************************************************************************************
125 
126  //**********************************************************************************************
128 
// Compile-time predicate used to enable the cblas_sgemm-based kernel and consulted by
// UseDefaultKernel through its ::value member.
// NOTE(review): the struct body (listing lines 133-135) is absent from this extract, so the exact
// condition behind `value` cannot be confirmed here — presumably it tests for float elements.
131  template< typename T1, typename T2, typename T3 >
132  struct UseSinglePrecisionKernel {
136  };
138  //**********************************************************************************************
139 
140  //**********************************************************************************************
142 
// Compile-time predicate used to enable the cblas_dgemm-based kernel and consulted by
// UseDefaultKernel through its ::value member.
// NOTE(review): the struct body (listing lines 147-149) is absent from this extract, so the exact
// condition behind `value` cannot be confirmed here — presumably it tests for double elements.
145  template< typename T1, typename T2, typename T3 >
146  struct UseDoublePrecisionKernel {
150  };
152  //**********************************************************************************************
153 
154  //**********************************************************************************************
156 
160  template< typename T1, typename T2, typename T3 >
161  struct UseSinglePrecisionComplexKernel {
162  typedef complex<float> Type;
163  enum { value = IsSame<typename T1::ElementType,Type>::value &&
164  IsSame<typename T2::ElementType,Type>::value &&
165  IsSame<typename T3::ElementType,Type>::value };
166  };
168  //**********************************************************************************************
169 
170  //**********************************************************************************************
172 
176  template< typename T1, typename T2, typename T3 >
177  struct UseDoublePrecisionComplexKernel {
178  typedef complex<double> Type;
179  enum { value = IsSame<typename T1::ElementType,Type>::value &&
180  IsSame<typename T2::ElementType,Type>::value &&
181  IsSame<typename T3::ElementType,Type>::value };
182  };
184  //**********************************************************************************************
185 
186  //**********************************************************************************************
188 
191  template< typename T1, typename T2, typename T3 >
192  struct UseDefaultKernel {
193  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
194  !UseDoublePrecisionKernel<T1,T2,T3>::value &&
195  !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
196  !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
197  };
199  //**********************************************************************************************
200 
201  //**********************************************************************************************
203 
206  template< typename T1, typename T2, typename T3 >
207  struct UseVectorizedDefaultKernel {
208  enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
209  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
210  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
211  IntrinsicTrait<typename T1::ElementType>::addition &&
212  IntrinsicTrait<typename T1::ElementType>::multiplication };
213  };
215  //**********************************************************************************************
216 
217  public:
218  //**Type definitions****************************************************************************
// NOTE(review): ElementType and ResultType are used below, but their typedefs (listing lines
// 219-224) are absent from this extract.
// ReturnType is the return type of operator(); CompositeType is a const ResultType.
225  typedef const ElementType ReturnType;
226  typedef const ResultType CompositeType;
227 
// Type of the stored left operand: `const MT1` or `const MT1&`, switched on IsExpression<MT1>
// (presumably by value for expressions, by reference otherwise — confirm SelectType's ordering).
229  typedef typename SelectType< IsExpression<MT1>::value, const MT1, const MT1& >::Type LeftOperand;
230 
// Type of the stored right operand, chosen the same way for MT2.
232  typedef typename SelectType< IsExpression<MT2>::value, const MT2, const MT2& >::Type RightOperand;
233 
// Type of the local `A` that the assignment functions construct from the left operand:
// `const RT1` or `CT1`, switched on IsComputation<MT1>.
235  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
236 
// Type of the local `B` that the assignment functions construct from the right operand:
// `const RT2` or `CT2`, switched on IsComputation<MT2>.
238  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
239  //**********************************************************************************************
240 
241  //**Compilation flags***************************************************************************
// Compilation flag: the multiplication expression itself reports vectorizable = 0.
243  enum { vectorizable = 0 };
244  //**********************************************************************************************
245 
246  //**Constructor*********************************************************************************
252  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs )
253  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
254  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
255  {
256  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
257  }
258  //**********************************************************************************************
259 
260  //**Access operator*****************************************************************************
267  inline ReturnType operator()( size_t i, size_t j ) const {
268  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
269  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
270 
271  ElementType tmp;
272 
273  if( lhs_.columns() != 0UL ) {
274  const size_t end( ( ( lhs_.columns()-1UL ) & size_t(-2) ) + 1UL );
275  tmp = lhs_(i,0UL) * rhs_(0UL,j);
276  for( size_t k=1UL; k<end; k+=2UL ) {
277  tmp += lhs_(i,k ) * rhs_(k ,j);
278  tmp += lhs_(i,k+1UL) * rhs_(k+1UL,j);
279  }
280  if( end < lhs_.columns() ) {
281  tmp += lhs_(i,end) * rhs_(end,j);
282  }
283  }
284  else {
285  reset( tmp );
286  }
287 
288  return tmp;
289  }
290  //**********************************************************************************************
291 
292  //**Rows function*******************************************************************************
297  inline size_t rows() const {
298  return lhs_.rows();
299  }
300  //**********************************************************************************************
301 
302  //**Columns function****************************************************************************
307  inline size_t columns() const {
308  return rhs_.columns();
309  }
310  //**********************************************************************************************
311 
312  //**Left operand access*************************************************************************
317  inline LeftOperand leftOperand() const {
318  return lhs_;
319  }
320  //**********************************************************************************************
321 
322  //**Right operand access************************************************************************
327  inline RightOperand rightOperand() const {
328  return rhs_;
329  }
330  //**********************************************************************************************
331 
332  //**********************************************************************************************
338  template< typename T >
339  inline bool canAlias( const T* alias ) const {
340  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
341  }
342  //**********************************************************************************************
343 
344  //**********************************************************************************************
350  template< typename T >
351  inline bool isAliased( const T* alias ) const {
352  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
353  }
354  //**********************************************************************************************
355 
356  private:
357  //**Member variables****************************************************************************
360  //**********************************************************************************************
361 
362  //**Assignment to dense matrices****************************************************************
371  template< typename MT // Type of the target dense matrix
372  , bool SO > // Storage order of the target dense matrix
373  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
374  {
376 
377  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
378  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
379 
380  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
381  return;
382  }
383  else if( rhs.lhs_.columns() == 0UL ) {
384  reset( ~lhs );
385  return;
386  }
387 
388  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
389  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
390 
391  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
392  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
393  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
394  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
395  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
396  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
397 
398  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
399  TDMatDMatMultExpr::selectDefaultAssignKernel( ~lhs, A, B );
400  else
401  TDMatDMatMultExpr::selectBlasAssignKernel( ~lhs, A, B );
402  }
404  //**********************************************************************************************
405 
406  //**Default assignment to dense matrices********************************************************
420  template< typename MT3 // Type of the left-hand side target matrix
421  , typename MT4 // Type of the left-hand side matrix operand
422  , typename MT5 > // Type of the right-hand side matrix operand
423  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
424  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
425  {
426  const size_t M( A.rows() );
427  const size_t N( B.columns() );
428  const size_t K( A.columns() );
429 
430  for( size_t i=0UL; i<M; ++i ) {
431  for( size_t j=0UL; j<N; ++j ) {
432  C(i,j) = A(i,0UL) * B(0UL,j);
433  }
434  for( size_t k=1UL; k<K; ++k ) {
435  for( size_t j=0UL; j<N; ++j ) {
436  C(i,j) += A(i,k) * B(k,j);
437  }
438  }
439  }
440  }
442  //**********************************************************************************************
443 
444  //**Vectorized default assignment to row-major dense matrices***********************************
// Vectorized default kernel computing C = A * B for a row-major target (DenseMatrix<MT3,false>).
// Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> applies. The columns of C are
// processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of IT::size elements. Each
// accumulator is summed over the whole inner dimension K and then written once via store().
// NOTE(review): the accumulators are default-constructed and never explicitly zeroed — this
// assumes IntrinsicType default-initializes to zero; confirm against its definition.
// NOTE(review): whole vectors are loaded/stored even if fewer than IT::size columns remain, so
// the matrices presumably provide padded rows — TODO confirm.
458  template< typename MT3 // Type of the left-hand side target matrix
459  , typename MT4 // Type of the left-hand side matrix operand
460  , typename MT5 > // Type of the right-hand side matrix operand
461  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
462  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
463  {
464  typedef IntrinsicTrait<ElementType> IT;
465 
466  const size_t M( A.rows() );
467  const size_t N( B.columns() );
468  const size_t K( A.columns() );
469 
470  size_t j( 0UL );
471 
// Column blocks of 8 vectors: one row of C per pass, eight accumulators.
472  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
473  for( size_t i=0UL; i<M; ++i ) {
474  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
475  for( size_t k=0UL; k<K; ++k ) {
// Broadcast A(i,k) and multiply it with eight consecutive vectors of row k of B.
476  const IntrinsicType a1( set( A(i,k) ) );
477  xmm1 = xmm1 + a1 * B.load(k,j );
478  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
479  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
480  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
481  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
482  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
483  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
484  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
485  }
486  (~C).store( i, j , xmm1 );
487  (~C).store( i, j+IT::size , xmm2 );
488  (~C).store( i, j+IT::size*2UL, xmm3 );
489  (~C).store( i, j+IT::size*3UL, xmm4 );
490  (~C).store( i, j+IT::size*4UL, xmm5 );
491  (~C).store( i, j+IT::size*5UL, xmm6 );
492  (~C).store( i, j+IT::size*6UL, xmm7 );
493  (~C).store( i, j+IT::size*7UL, xmm8 );
494  }
495  }
// Column blocks of 4 vectors: two rows of C per pass (xmm1-4 for row i, xmm5-8 for row i+1).
496  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
497  size_t i( 0UL );
498  for( ; (i+2UL) <= M; i+=2UL ) {
499  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
500  for( size_t k=0UL; k<K; ++k ) {
501  const IntrinsicType a1( set( A(i ,k) ) );
502  const IntrinsicType a2( set( A(i+1UL,k) ) );
503  const IntrinsicType b1( B.load(k,j ) );
504  const IntrinsicType b2( B.load(k,j+IT::size ) );
505  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
506  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
507  xmm1 = xmm1 + a1 * b1;
508  xmm2 = xmm2 + a1 * b2;
509  xmm3 = xmm3 + a1 * b3;
510  xmm4 = xmm4 + a1 * b4;
511  xmm5 = xmm5 + a2 * b1;
512  xmm6 = xmm6 + a2 * b2;
513  xmm7 = xmm7 + a2 * b3;
514  xmm8 = xmm8 + a2 * b4;
515  }
516  (~C).store( i , j , xmm1 );
517  (~C).store( i , j+IT::size , xmm2 );
518  (~C).store( i , j+IT::size*2UL, xmm3 );
519  (~C).store( i , j+IT::size*3UL, xmm4 );
520  (~C).store( i+1UL, j , xmm5 );
521  (~C).store( i+1UL, j+IT::size , xmm6 );
522  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
523  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
524  }
// Odd number of rows: handle the last row on its own.
525  if( i < M ) {
526  IntrinsicType xmm1, xmm2, xmm3, xmm4;
527  for( size_t k=0UL; k<K; ++k ) {
528  const IntrinsicType a1( set( A(i,k) ) );
529  xmm1 = xmm1 + a1 * B.load(k,j );
530  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
531  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
532  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
533  }
534  (~C).store( i, j , xmm1 );
535  (~C).store( i, j+IT::size , xmm2 );
536  (~C).store( i, j+IT::size*2UL, xmm3 );
537  (~C).store( i, j+IT::size*3UL, xmm4 );
538  }
539  }
// Column blocks of 2 vectors: two rows of C per pass (xmm1-2 for row i, xmm3-4 for row i+1).
540  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
541  size_t i( 0UL );
542  for( ; (i+2UL) <= M; i+=2UL ) {
543  IntrinsicType xmm1, xmm2, xmm3, xmm4;
544  for( size_t k=0UL; k<K; ++k ) {
545  const IntrinsicType a1( set( A(i ,k) ) );
546  const IntrinsicType a2( set( A(i+1UL,k) ) );
547  const IntrinsicType b1( B.load(k,j ) );
548  const IntrinsicType b2( B.load(k,j+IT::size) );
549  xmm1 = xmm1 + a1 * b1;
550  xmm2 = xmm2 + a1 * b2;
551  xmm3 = xmm3 + a2 * b1;
552  xmm4 = xmm4 + a2 * b2;
553  }
554  (~C).store( i , j , xmm1 );
555  (~C).store( i , j+IT::size, xmm2 );
556  (~C).store( i+1UL, j , xmm3 );
557  (~C).store( i+1UL, j+IT::size, xmm4 );
558  }
// Odd number of rows: handle the last row on its own.
559  if( i < M ) {
560  IntrinsicType xmm1, xmm2;
561  for( size_t k=0UL; k<K; ++k ) {
562  const IntrinsicType a1( set( A(i,k) ) );
563  xmm1 = xmm1 + a1 * B.load(k,j );
564  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
565  }
566  (~C).store( i, j , xmm1 );
567  (~C).store( i, j+IT::size, xmm2 );
568  }
569  }
// Remaining columns (at most one vector wide): two rows per pass, then a possible last row.
570  if( j < N ) {
571  size_t i( 0UL );
572  for( ; (i+2UL) <= M; i+=2UL ) {
573  IntrinsicType xmm1, xmm2;
574  for( size_t k=0UL; k<K; ++k ) {
575  const IntrinsicType b1( B.load(k,j) );
576  xmm1 = xmm1 + set( A(i ,k) ) * b1;
577  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
578  }
579  (~C).store( i , j, xmm1 );
580  (~C).store( i+1UL, j, xmm2 );
581  }
582  if( i < M ) {
583  IntrinsicType xmm1;
584  for( size_t k=0UL; k<K; ++k ) {
585  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
586  }
587  (~C).store( i, j, xmm1 );
588  }
589  }
590  }
592  //**********************************************************************************************
593 
594  //**Vectorized default assignment to column-major dense matrices********************************
// Vectorized default kernel computing C = A * B for a column-major target (DenseMatrix<MT3,true>).
// Enabled only when UseVectorizedDefaultKernel<MT3,MT4,MT5> applies. The rows of C are processed
// in blocks of 8, 4, 2 and finally 1 intrinsic vector(s) of IT::size elements. Vectors are loaded
// from A along its rows index, elements of B are broadcast with set(), and each accumulator is
// summed over the whole inner dimension K before being written once via store().
// NOTE(review): the accumulators are default-constructed and never explicitly zeroed — this
// assumes IntrinsicType default-initializes to zero; confirm against its definition.
// NOTE(review): whole vectors are loaded/stored even if fewer than IT::size rows remain, so the
// matrices presumably provide padded columns — TODO confirm.
608  template< typename MT3 // Type of the left-hand side target matrix
609  , typename MT4 // Type of the left-hand side matrix operand
610  , typename MT5 > // Type of the right-hand side matrix operand
611  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
612  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
613  {
614  typedef IntrinsicTrait<ElementType> IT;
615 
616  const size_t M( A.rows() );
617  const size_t N( B.columns() );
618  const size_t K( A.columns() );
619 
620  size_t i( 0UL );
621 
// Row blocks of 8 vectors: one column of C per pass, eight accumulators.
622  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
623  for( size_t j=0UL; j<N; ++j ) {
624  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
625  for( size_t k=0UL; k<K; ++k ) {
// Broadcast B(k,j) and multiply it with eight consecutive vectors of column k of A.
626  const IntrinsicType b1( set( B(k,j) ) );
627  xmm1 = xmm1 + A.load(i ,k) * b1;
628  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
629  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
630  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
631  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
632  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
633  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
634  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
635  }
636  (~C).store( i , j, xmm1 );
637  (~C).store( i+IT::size , j, xmm2 );
638  (~C).store( i+IT::size*2UL, j, xmm3 );
639  (~C).store( i+IT::size*3UL, j, xmm4 );
640  (~C).store( i+IT::size*4UL, j, xmm5 );
641  (~C).store( i+IT::size*5UL, j, xmm6 );
642  (~C).store( i+IT::size*6UL, j, xmm7 );
643  (~C).store( i+IT::size*7UL, j, xmm8 );
644  }
645  }
// Row blocks of 4 vectors: two columns of C per pass (xmm1-4 for column j, xmm5-8 for j+1).
646  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
647  size_t j( 0UL );
648  for( ; (j+2UL) <= N; j+=2UL ) {
649  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
650  for( size_t k=0UL; k<K; ++k ) {
651  const IntrinsicType a1( A.load(i ,k) );
652  const IntrinsicType a2( A.load(i+IT::size ,k) );
653  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
654  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
655  const IntrinsicType b1( set( B(k,j ) ) );
656  const IntrinsicType b2( set( B(k,j+1UL) ) );
657  xmm1 = xmm1 + a1 * b1;
658  xmm2 = xmm2 + a2 * b1;
659  xmm3 = xmm3 + a3 * b1;
660  xmm4 = xmm4 + a4 * b1;
661  xmm5 = xmm5 + a1 * b2;
662  xmm6 = xmm6 + a2 * b2;
663  xmm7 = xmm7 + a3 * b2;
664  xmm8 = xmm8 + a4 * b2;
665  }
666  (~C).store( i , j , xmm1 );
667  (~C).store( i+IT::size , j , xmm2 );
668  (~C).store( i+IT::size*2UL, j , xmm3 );
669  (~C).store( i+IT::size*3UL, j , xmm4 );
670  (~C).store( i , j+1UL, xmm5 );
671  (~C).store( i+IT::size , j+1UL, xmm6 );
672  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
673  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
674  }
// Odd number of columns: handle the last column on its own.
675  if( j < N ) {
676  IntrinsicType xmm1, xmm2, xmm3, xmm4;
677  for( size_t k=0UL; k<K; ++k ) {
678  const IntrinsicType b1( set( B(k,j) ) );
679  xmm1 = xmm1 + A.load(i ,k) * b1;
680  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
681  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
682  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
683  }
684  (~C).store( i , j, xmm1 );
685  (~C).store( i+IT::size , j, xmm2 );
686  (~C).store( i+IT::size*2UL, j, xmm3 );
687  (~C).store( i+IT::size*3UL, j, xmm4 );
688  }
689  }
// Row blocks of 2 vectors: two columns of C per pass (xmm1-2 for column j, xmm3-4 for j+1).
690  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
691  size_t j( 0UL );
692  for( ; (j+2UL) <= N; j+=2UL ) {
693  IntrinsicType xmm1, xmm2, xmm3, xmm4;
694  for( size_t k=0UL; k<K; ++k ) {
695  const IntrinsicType a1( A.load(i ,k) );
696  const IntrinsicType a2( A.load(i+IT::size,k) );
697  const IntrinsicType b1( set( B(k,j ) ) );
698  const IntrinsicType b2( set( B(k,j+1UL) ) );
699  xmm1 = xmm1 + a1 * b1;
700  xmm2 = xmm2 + a2 * b1;
701  xmm3 = xmm3 + a1 * b2;
702  xmm4 = xmm4 + a2 * b2;
703  }
704  (~C).store( i , j , xmm1 );
705  (~C).store( i+IT::size, j , xmm2 );
706  (~C).store( i , j+1UL, xmm3 );
707  (~C).store( i+IT::size, j+1UL, xmm4 );
708  }
// Odd number of columns: handle the last column on its own.
709  if( j < N ) {
710  IntrinsicType xmm1, xmm2;
711  for( size_t k=0UL; k<K; ++k ) {
712  const IntrinsicType b1( set( B(k,j) ) );
713  xmm1 = xmm1 + A.load(i ,k) * b1;
714  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
715  }
716  (~C).store( i , j, xmm1 );
717  (~C).store( i+IT::size, j, xmm2 );
718  }
719  }
// Remaining rows (at most one vector tall): two columns per pass, then a possible last column.
720  if( i < M ) {
721  size_t j( 0UL );
722  for( ; (j+2UL) <= N; j+=2UL ) {
723  IntrinsicType xmm1, xmm2;
724  for( size_t k=0UL; k<K; ++k ) {
725  const IntrinsicType a1( A.load(i,k) );
726  xmm1 = xmm1 + a1 * set( B(k,j ) );
727  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
728  }
729  (~C).store( i, j , xmm1 );
730  (~C).store( i, j+1UL, xmm2 );
731  }
732  if( j < N ) {
733  IntrinsicType xmm1;
734  for( size_t k=0UL; k<K; ++k ) {
735  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
736  }
737  (~C).store( i, j, xmm1 );
738  }
739  }
740  }
742  //**********************************************************************************************
743 
744  //**BLAS-based assignment to dense matrices (default)*******************************************
758  template< typename MT3 // Type of the left-hand side target matrix
759  , typename MT4 // Type of the left-hand side matrix operand
760  , typename MT5 > // Type of the right-hand side matrix operand
761  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
762  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
763  {
764  selectDefaultAssignKernel( C, A, B );
765  }
767  //**********************************************************************************************
768 
769  //**BLAS-based assignment to dense matrices (single precision)**********************************
770 #if BLAZE_BLAS_MODE
771 
784  template< typename MT3 // Type of the left-hand side target matrix
785  , typename MT4 // Type of the left-hand side matrix operand
786  , typename MT5 > // Type of the right-hand side matrix operand
787  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
788  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
789  {
790  using boost::numeric_cast;
791 
795 
796  const int M ( numeric_cast<int>( A.rows() ) );
797  const int N ( numeric_cast<int>( B.columns() ) );
798  const int K ( numeric_cast<int>( A.columns() ) );
799  const int lda( numeric_cast<int>( A.spacing() ) );
800  const int ldb( numeric_cast<int>( B.spacing() ) );
801  const int ldc( numeric_cast<int>( C.spacing() ) );
802 
803  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
804  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
805  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
806  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
807  }
809 #endif
810  //**********************************************************************************************
811 
812  //**BLAS-based assignment to dense matrices (double precision)**********************************
813 #if BLAZE_BLAS_MODE
814 
827  template< typename MT3 // Type of the left-hand side target matrix
828  , typename MT4 // Type of the left-hand side matrix operand
829  , typename MT5 > // Type of the right-hand side matrix operand
830  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
831  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
832  {
833  using boost::numeric_cast;
834 
838 
839  const int M ( numeric_cast<int>( A.rows() ) );
840  const int N ( numeric_cast<int>( B.columns() ) );
841  const int K ( numeric_cast<int>( A.columns() ) );
842  const int lda( numeric_cast<int>( A.spacing() ) );
843  const int ldb( numeric_cast<int>( B.spacing() ) );
844  const int ldc( numeric_cast<int>( C.spacing() ) );
845 
846  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
847  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
848  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
849  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
850  }
852 #endif
853  //**********************************************************************************************
854 
855  //**BLAS-based assignment to dense matrices (single precision complex)**************************
856 #if BLAZE_BLAS_MODE
857 
870  template< typename MT3 // Type of the left-hand side target matrix
871  , typename MT4 // Type of the left-hand side matrix operand
872  , typename MT5 > // Type of the right-hand side matrix operand
873  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
874  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
875  {
876  using boost::numeric_cast;
877 
881  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
882  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
883  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
884 
885  const int M ( numeric_cast<int>( A.rows() ) );
886  const int N ( numeric_cast<int>( B.columns() ) );
887  const int K ( numeric_cast<int>( A.columns() ) );
888  const int lda( numeric_cast<int>( A.spacing() ) );
889  const int ldb( numeric_cast<int>( B.spacing() ) );
890  const int ldc( numeric_cast<int>( C.spacing() ) );
891  const complex<float> alpha( 1.0F, 0.0F );
892  const complex<float> beta ( 0.0F, 0.0F );
893 
894  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
895  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
896  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
897  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
898  }
900 #endif
901  //**********************************************************************************************
902 
903  //**BLAS-based assignment to dense matrices (double precision complex)**************************
904 #if BLAZE_BLAS_MODE
905 
918  template< typename MT3 // Type of the left-hand side target matrix
919  , typename MT4 // Type of the left-hand side matrix operand
920  , typename MT5 > // Type of the right-hand side matrix operand
921  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
922  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
923  {
924  using boost::numeric_cast;
925 
929  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
930  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
931  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
932 
933  const int M ( numeric_cast<int>( A.rows() ) );
934  const int N ( numeric_cast<int>( B.columns() ) );
935  const int K ( numeric_cast<int>( A.columns() ) );
936  const int lda( numeric_cast<int>( A.spacing() ) );
937  const int ldb( numeric_cast<int>( B.spacing() ) );
938  const int ldc( numeric_cast<int>( C.spacing() ) );
939  const complex<double> alpha( 1.0, 0.0 );
940  const complex<double> beta ( 0.0, 0.0 );
941 
942  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
943  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
944  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
945  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
946  }
948 #endif
949  //**********************************************************************************************
950 
951  //**Assignment to sparse matrices***************************************************************
963  template< typename MT // Type of the target sparse matrix
964  , bool SO > // Storage order of the target sparse matrix
965  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
966  {
968 
969  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
970 
977 
978  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
979  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
980 
981  const TmpType tmp( rhs );
982  assign( ~lhs, tmp );
983  }
985  //**********************************************************************************************
986 
987  //**Addition assignment to dense matrices*******************************************************
1000  template< typename MT // Type of the target dense matrix
1001  , bool SO > // Storage order of the target dense matrix
1002  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1003  {
1005 
1006  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1007  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1008 
1009  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1010  return;
1011  }
1012 
1013  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1014  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1015 
1016  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1017  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1018  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1019  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1020  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1021  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1022 
1023  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
1024  TDMatDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B );
1025  else
1026  TDMatDMatMultExpr::selectBlasAddAssignKernel( ~lhs, A, B );
1027  }
1029  //**********************************************************************************************
1030 
1031  //**Default addition assignment to dense matrices***********************************************
1045  template< typename MT3 // Type of the left-hand side target matrix
1046  , typename MT4 // Type of the left-hand side matrix operand
1047  , typename MT5 > // Type of the right-hand side matrix operand
1048  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1049  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1050  {
1051  const size_t M( A.rows() );
1052  const size_t N( B.columns() );
1053  const size_t K( A.columns() );
1054 
1055  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1056  const size_t end( N & size_t(-2) );
1057 
1058  for( size_t i=0UL; i<M; ++i ) {
1059  for( size_t k=0UL; k<K; ++k ) {
1060  for( size_t j=0UL; j<end; j+=2UL ) {
1061  C(i,j ) += A(i,k) * B(k,j );
1062  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1063  }
1064  if( end < N ) {
1065  C(i,end) += A(i,k) * B(k,end);
1066  }
1067  }
1068  }
1069  }
1071  //**********************************************************************************************
1072 
1073  //**Vectorized default addition assignment to row-major dense matrices**************************
// Vectorized default kernel for the addition assignment C += A*B. This overload accepts
// targets derived from DenseMatrix<MT3,false> and is enabled only if
// UseVectorizedDefaultKernel<MT3,MT4,MT5> applies.
//
// The columns of C are processed in tiles of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements each. For every tile the current values of C are loaded into the
// accumulators xmm1..xmm8, the products of A(i,k) and the matching vectors of row k of B are
// added for all k, and the accumulators are stored back into C.
//
// NOTE(review): set() presumably broadcasts the scalar A(i,k) to all vector elements -- confirm.
// NOTE(review): the loop conditions only guarantee that the first element of the last vector
// of a tile lies below N; load()/store() presumably rely on padded rows -- confirm.
1087  template< typename MT3 // Type of the left-hand side target matrix
1088  , typename MT4 // Type of the left-hand side matrix operand
1089  , typename MT5 > // Type of the right-hand side matrix operand
1090  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1091  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1092  {
1093  typedef IntrinsicTrait<ElementType> IT;
1094 
1095  const size_t M( A.rows() );
1096  const size_t N( B.columns() );
1097  const size_t K( A.columns() );
1098 
// Column index; shared by all tile loops below so that each loop continues where the
// previous one stopped
1099  size_t j( 0UL );
1100 
// Tiles of 8 intrinsic vectors: one row of C per iteration of the i-loop
1101  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1102  for( size_t i=0UL; i<M; ++i ) {
1103  IntrinsicType xmm1( (~C).load(i,j ) );
1104  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1105  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1106  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1107  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
1108  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
1109  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
1110  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
1111  for( size_t k=0UL; k<K; ++k ) {
1112  const IntrinsicType a1( set( A(i,k) ) );
1113  xmm1 = xmm1 + a1 * B.load(k,j );
1114  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1115  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1116  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1117  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
1118  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
1119  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
1120  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
1121  }
1122  (~C).store( i, j , xmm1 );
1123  (~C).store( i, j+IT::size , xmm2 );
1124  (~C).store( i, j+IT::size*2UL, xmm3 );
1125  (~C).store( i, j+IT::size*3UL, xmm4 );
1126  (~C).store( i, j+IT::size*4UL, xmm5 );
1127  (~C).store( i, j+IT::size*5UL, xmm6 );
1128  (~C).store( i, j+IT::size*6UL, xmm7 );
1129  (~C).store( i, j+IT::size*7UL, xmm8 );
1130  }
1131  }
// Tiles of 4 intrinsic vectors: two rows of C per iteration, so that each loaded vector of B
// (b1..b4) is used for both rows
1132  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1133  size_t i( 0UL );
1134  for( ; (i+2UL) <= M; i+=2UL ) {
1135  IntrinsicType xmm1( (~C).load(i ,j ) );
1136  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
1137  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
1138  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
1139  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1140  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
1141  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
1142  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
1143  for( size_t k=0UL; k<K; ++k ) {
1144  const IntrinsicType a1( set( A(i ,k) ) );
1145  const IntrinsicType a2( set( A(i+1UL,k) ) );
1146  const IntrinsicType b1( B.load(k,j ) );
1147  const IntrinsicType b2( B.load(k,j+IT::size ) );
1148  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
1149  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
1150  xmm1 = xmm1 + a1 * b1;
1151  xmm2 = xmm2 + a1 * b2;
1152  xmm3 = xmm3 + a1 * b3;
1153  xmm4 = xmm4 + a1 * b4;
1154  xmm5 = xmm5 + a2 * b1;
1155  xmm6 = xmm6 + a2 * b2;
1156  xmm7 = xmm7 + a2 * b3;
1157  xmm8 = xmm8 + a2 * b4;
1158  }
1159  (~C).store( i , j , xmm1 );
1160  (~C).store( i , j+IT::size , xmm2 );
1161  (~C).store( i , j+IT::size*2UL, xmm3 );
1162  (~C).store( i , j+IT::size*3UL, xmm4 );
1163  (~C).store( i+1UL, j , xmm5 );
1164  (~C).store( i+1UL, j+IT::size , xmm6 );
1165  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
1166  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
1167  }
// Single remaining row in case M is odd
1168  if( i < M ) {
1169  IntrinsicType xmm1( (~C).load(i,j ) );
1170  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1171  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1172  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1173  for( size_t k=0UL; k<K; ++k ) {
1174  const IntrinsicType a1( set( A(i,k) ) );
1175  xmm1 = xmm1 + a1 * B.load(k,j );
1176  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
1177  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
1178  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
1179  }
1180  (~C).store( i, j , xmm1 );
1181  (~C).store( i, j+IT::size , xmm2 );
1182  (~C).store( i, j+IT::size*2UL, xmm3 );
1183  (~C).store( i, j+IT::size*3UL, xmm4 );
1184  }
1185  }
// Tiles of 2 intrinsic vectors: again two rows of C per iteration plus an optional last row
1186  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1187  size_t i( 0UL );
1188  for( ; (i+2UL) <= M; i+=2UL ) {
1189  IntrinsicType xmm1( (~C).load(i ,j ) );
1190  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
1191  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1192  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
1193  for( size_t k=0UL; k<K; ++k ) {
1194  const IntrinsicType a1( set( A(i ,k) ) );
1195  const IntrinsicType a2( set( A(i+1UL,k) ) );
1196  const IntrinsicType b1( B.load(k,j ) );
1197  const IntrinsicType b2( B.load(k,j+IT::size) );
1198  xmm1 = xmm1 + a1 * b1;
1199  xmm2 = xmm2 + a1 * b2;
1200  xmm3 = xmm3 + a2 * b1;
1201  xmm4 = xmm4 + a2 * b2;
1202  }
1203  (~C).store( i , j , xmm1 );
1204  (~C).store( i , j+IT::size, xmm2 );
1205  (~C).store( i+1UL, j , xmm3 );
1206  (~C).store( i+1UL, j+IT::size, xmm4 );
1207  }
// Single remaining row in case M is odd
1208  if( i < M ) {
1209  IntrinsicType xmm1( (~C).load(i,j ) );
1210  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
1211  for( size_t k=0UL; k<K; ++k ) {
1212  const IntrinsicType a1( set( A(i,k) ) );
1213  xmm1 = xmm1 + a1 * B.load(k,j );
1214  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
1215  }
1216  (~C).store( i, j , xmm1 );
1217  (~C).store( i, j+IT::size, xmm2 );
1218  }
1219  }
// Last intrinsic vector of columns (executed at most once since j is not advanced any further)
1220  if( j < N ) {
1221  size_t i( 0UL );
1222  for( ; (i+2UL) <= M; i+=2UL ) {
1223  IntrinsicType xmm1( (~C).load(i ,j) );
1224  IntrinsicType xmm2( (~C).load(i+1UL,j) );
1225  for( size_t k=0UL; k<K; ++k ) {
1226  const IntrinsicType b1( B.load(k,j) );
1227  xmm1 = xmm1 + set( A(i ,k) ) * b1;
1228  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
1229  }
1230  (~C).store( i , j, xmm1 );
1231  (~C).store( i+1UL, j, xmm2 );
1232  }
// Single remaining row in case M is odd
1233  if( i < M ) {
1234  IntrinsicType xmm1( (~C).load(i,j) );
1235  for( size_t k=0UL; k<K; ++k ) {
1236  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
1237  }
1238  (~C).store( i, j, xmm1 );
1239  }
1240  }
1241  }
1243  //**********************************************************************************************
1244 
1245  //**Vectorized default addition assignment to column-major dense matrices***********************
// Vectorized default kernel for the addition assignment C += A*B. This overload accepts
// targets derived from DenseMatrix<MT3,true> and is enabled only if
// UseVectorizedDefaultKernel<MT3,MT4,MT5> applies.
//
// The rows of C are processed in tiles of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements each. For every tile the current values of C are loaded into the
// accumulators xmm1..xmm8, the products of the matching vectors of column k of A and B(k,j)
// are added for all k, and the accumulators are stored back into C.
//
// NOTE(review): set() presumably broadcasts the scalar B(k,j) to all vector elements -- confirm.
// NOTE(review): the loop conditions only guarantee that the first element of the last vector
// of a tile lies below M; load()/store() presumably rely on padded columns -- confirm.
1259  template< typename MT3 // Type of the left-hand side target matrix
1260  , typename MT4 // Type of the left-hand side matrix operand
1261  , typename MT5 > // Type of the right-hand side matrix operand
1262  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1263  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1264  {
1265  typedef IntrinsicTrait<ElementType> IT;
1266 
1267  const size_t M( A.rows() );
1268  const size_t N( B.columns() );
1269  const size_t K( A.columns() );
1270 
// Row index; shared by all tile loops below so that each loop continues where the
// previous one stopped
1271  size_t i( 0UL );
1272 
// Tiles of 8 intrinsic vectors: one column of C per iteration of the j-loop
1273  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1274  for( size_t j=0UL; j<N; ++j ) {
1275  IntrinsicType xmm1( (~C).load(i ,j) );
1276  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
1277  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
1278  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
1279  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
1280  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
1281  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
1282  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
1283  for( size_t k=0UL; k<K; ++k ) {
1284  const IntrinsicType b1( set( B(k,j) ) );
1285  xmm1 = xmm1 + A.load(i ,k) * b1;
1286  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1287  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1288  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1289  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
1290  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
1291  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
1292  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
1293  }
1294  (~C).store( i , j, xmm1 );
1295  (~C).store( i+IT::size , j, xmm2 );
1296  (~C).store( i+IT::size*2UL, j, xmm3 );
1297  (~C).store( i+IT::size*3UL, j, xmm4 );
1298  (~C).store( i+IT::size*4UL, j, xmm5 );
1299  (~C).store( i+IT::size*5UL, j, xmm6 );
1300  (~C).store( i+IT::size*6UL, j, xmm7 );
1301  (~C).store( i+IT::size*7UL, j, xmm8 );
1302  }
1303  }
// Tiles of 4 intrinsic vectors: two columns of C per iteration, so that each loaded vector of
// A (a1..a4) is used for both columns
1304  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1305  size_t j( 0UL );
1306  for( ; (j+2UL) <= N; j+=2UL ) {
1307  IntrinsicType xmm1( (~C).load(i ,j ) );
1308  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
1309  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
1310  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
1311  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
1312  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
1313  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
1314  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
1315  for( size_t k=0UL; k<K; ++k ) {
1316  const IntrinsicType a1( A.load(i ,k) );
1317  const IntrinsicType a2( A.load(i+IT::size ,k) );
1318  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
1319  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
1320  const IntrinsicType b1( set( B(k,j ) ) );
1321  const IntrinsicType b2( set( B(k,j+1UL) ) );
1322  xmm1 = xmm1 + a1 * b1;
1323  xmm2 = xmm2 + a2 * b1;
1324  xmm3 = xmm3 + a3 * b1;
1325  xmm4 = xmm4 + a4 * b1;
1326  xmm5 = xmm5 + a1 * b2;
1327  xmm6 = xmm6 + a2 * b2;
1328  xmm7 = xmm7 + a3 * b2;
1329  xmm8 = xmm8 + a4 * b2;
1330  }
1331  (~C).store( i , j , xmm1 );
1332  (~C).store( i+IT::size , j , xmm2 );
1333  (~C).store( i+IT::size*2UL, j , xmm3 );
1334  (~C).store( i+IT::size*3UL, j , xmm4 );
1335  (~C).store( i , j+1UL, xmm5 );
1336  (~C).store( i+IT::size , j+1UL, xmm6 );
1337  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
1338  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
1339  }
// Single remaining column in case N is odd
1340  if( j < N ) {
1341  IntrinsicType xmm1( (~C).load(i ,j) );
1342  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
1343  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
1344  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
1345  for( size_t k=0UL; k<K; ++k ) {
1346  const IntrinsicType b1( set( B(k,j) ) );
1347  xmm1 = xmm1 + A.load(i ,k) * b1;
1348  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
1349  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
1350  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
1351  }
1352  (~C).store( i , j, xmm1 );
1353  (~C).store( i+IT::size , j, xmm2 );
1354  (~C).store( i+IT::size*2UL, j, xmm3 );
1355  (~C).store( i+IT::size*3UL, j, xmm4 );
1356  }
1357  }
// Tiles of 2 intrinsic vectors: again two columns of C per iteration plus an optional last column
1358  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1359  size_t j( 0UL );
1360  for( ; (j+2UL) <= N; j+=2UL ) {
1361  IntrinsicType xmm1( (~C).load(i ,j ) );
1362  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
1363  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
1364  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
1365  for( size_t k=0UL; k<K; ++k ) {
1366  const IntrinsicType a1( A.load(i ,k) );
1367  const IntrinsicType a2( A.load(i+IT::size,k) );
1368  const IntrinsicType b1( set( B(k,j ) ) );
1369  const IntrinsicType b2( set( B(k,j+1UL) ) );
1370  xmm1 = xmm1 + a1 * b1;
1371  xmm2 = xmm2 + a2 * b1;
1372  xmm3 = xmm3 + a1 * b2;
1373  xmm4 = xmm4 + a2 * b2;
1374  }
1375  (~C).store( i , j , xmm1 );
1376  (~C).store( i+IT::size, j , xmm2 );
1377  (~C).store( i , j+1UL, xmm3 );
1378  (~C).store( i+IT::size, j+1UL, xmm4 );
1379  }
// Single remaining column in case N is odd
1380  if( j < N ) {
1381  IntrinsicType xmm1( (~C).load(i ,j) );
1382  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
1383  for( size_t k=0UL; k<K; ++k ) {
1384  const IntrinsicType b1( set( B(k,j) ) );
1385  xmm1 = xmm1 + A.load(i ,k) * b1;
1386  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
1387  }
1388  (~C).store( i , j, xmm1 );
1389  (~C).store( i+IT::size, j, xmm2 );
1390  }
1391  }
// Last intrinsic vector of rows (executed at most once since i is not advanced any further)
1392  if( i < M ) {
1393  size_t j( 0UL );
1394  for( ; (j+2UL) <= N; j+=2UL ) {
1395  IntrinsicType xmm1( (~C).load(i,j ) );
1396  IntrinsicType xmm2( (~C).load(i,j+1UL) );
1397  for( size_t k=0UL; k<K; ++k ) {
1398  const IntrinsicType a1( A.load(i,k) );
1399  xmm1 = xmm1 + a1 * set( B(k,j ) );
1400  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
1401  }
1402  (~C).store( i, j , xmm1 );
1403  (~C).store( i, j+1UL, xmm2 );
1404  }
// Single remaining column in case N is odd
1405  if( j < N ) {
1406  IntrinsicType xmm1( (~C).load(i,j) );
1407  for( size_t k=0UL; k<K; ++k ) {
1408  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
1409  }
1410  (~C).store( i, j, xmm1 );
1411  }
1412  }
1413  }
1415  //**********************************************************************************************
1416 
1417  //**BLAS-based addition assignment to dense matrices (default)**********************************
1431  template< typename MT3 // Type of the left-hand side target matrix
1432  , typename MT4 // Type of the left-hand side matrix operand
1433  , typename MT5 > // Type of the right-hand side matrix operand
1434  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1435  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1436  {
1437  selectDefaultAddAssignKernel( C, A, B );
1438  }
1440  //**********************************************************************************************
1441 
1442  //**BLAS-based addition assignment to dense matrices (single precision)*************************
1443 #if BLAZE_BLAS_MODE
1444 
1457  template< typename MT3 // Type of the left-hand side target matrix
1458  , typename MT4 // Type of the left-hand side matrix operand
1459  , typename MT5 > // Type of the right-hand side matrix operand
1460  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1461  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1462  {
1463  using boost::numeric_cast;
1464 
1468 
1469  const int M ( numeric_cast<int>( A.rows() ) );
1470  const int N ( numeric_cast<int>( B.columns() ) );
1471  const int K ( numeric_cast<int>( A.columns() ) );
1472  const int lda( numeric_cast<int>( A.spacing() ) );
1473  const int ldb( numeric_cast<int>( B.spacing() ) );
1474  const int ldc( numeric_cast<int>( C.spacing() ) );
1475 
1476  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1477  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1478  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1479  M, N, K, 1.0F, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
1480  }
1482 #endif
1483  //**********************************************************************************************
1484 
1485  //**BLAS-based addition assignment to dense matrices (double precision)*************************
1486 #if BLAZE_BLAS_MODE
1487 
1500  template< typename MT3 // Type of the left-hand side target matrix
1501  , typename MT4 // Type of the left-hand side matrix operand
1502  , typename MT5 > // Type of the right-hand side matrix operand
1503  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1504  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1505  {
1506  using boost::numeric_cast;
1507 
1511 
1512  const int M ( numeric_cast<int>( A.rows() ) );
1513  const int N ( numeric_cast<int>( B.columns() ) );
1514  const int K ( numeric_cast<int>( A.columns() ) );
1515  const int lda( numeric_cast<int>( A.spacing() ) );
1516  const int ldb( numeric_cast<int>( B.spacing() ) );
1517  const int ldc( numeric_cast<int>( C.spacing() ) );
1518 
1519  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1520  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1521  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1522  M, N, K, 1.0, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
1523  }
1525 #endif
1526  //**********************************************************************************************
1527 
1528  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
1529 #if BLAZE_BLAS_MODE
1530 
1543  template< typename MT3 // Type of the left-hand side target matrix
1544  , typename MT4 // Type of the left-hand side matrix operand
1545  , typename MT5 > // Type of the right-hand side matrix operand
1546  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1547  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1548  {
1549  using boost::numeric_cast;
1550 
1554  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
1555  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
1556  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
1557 
1558  const int M ( numeric_cast<int>( A.rows() ) );
1559  const int N ( numeric_cast<int>( B.columns() ) );
1560  const int K ( numeric_cast<int>( A.columns() ) );
1561  const int lda( numeric_cast<int>( A.spacing() ) );
1562  const int ldb( numeric_cast<int>( B.spacing() ) );
1563  const int ldc( numeric_cast<int>( C.spacing() ) );
1564  const complex<float> alpha( 1.0F, 0.0F );
1565  const complex<float> beta ( 1.0F, 0.0F );
1566 
1567  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1568  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1569  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1570  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1571  }
1573 #endif
1574  //**********************************************************************************************
1575 
1576  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
1577 #if BLAZE_BLAS_MODE
1578 
1591  template< typename MT3 // Type of the left-hand side target matrix
1592  , typename MT4 // Type of the left-hand side matrix operand
1593  , typename MT5 > // Type of the right-hand side matrix operand
1594  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1595  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1596  {
1597  using boost::numeric_cast;
1598 
1602  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
1603  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
1604  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
1605 
1606  const int M ( numeric_cast<int>( A.rows() ) );
1607  const int N ( numeric_cast<int>( B.columns() ) );
1608  const int K ( numeric_cast<int>( A.columns() ) );
1609  const int lda( numeric_cast<int>( A.spacing() ) );
1610  const int ldb( numeric_cast<int>( B.spacing() ) );
1611  const int ldc( numeric_cast<int>( C.spacing() ) );
1612  const complex<double> alpha( 1.0, 0.0 );
1613  const complex<double> beta ( 1.0, 0.0 );
1614 
1615  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
1616  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
1617  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
1618  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
1619  }
1621 #endif
1622  //**********************************************************************************************
1623 
1624  //**Addition assignment to sparse matrices******************************************************
1625  // No special implementation for the addition assignment to sparse matrices.
1626  //**********************************************************************************************
1627 
1628  //**Subtraction assignment to dense matrices****************************************************
1641  template< typename MT // Type of the target dense matrix
1642  , bool SO > // Storage order of the target dense matrix
1643  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
1644  {
1646 
1647  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1648  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1649 
1650  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1651  return;
1652  }
1653 
1654  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
1655  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
1656 
1657  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1658  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1659  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1660  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1661  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1662  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1663 
1664  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
1665  TDMatDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B );
1666  else
1667  TDMatDMatMultExpr::selectBlasSubAssignKernel( ~lhs, A, B );
1668  }
1670  //**********************************************************************************************
1671 
1672  //**Default subtraction assignment to dense matrices********************************************
1686  template< typename MT3 // Type of the left-hand side target matrix
1687  , typename MT4 // Type of the left-hand side matrix operand
1688  , typename MT5 > // Type of the right-hand side matrix operand
1689  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1690  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
1691  {
1692  const size_t M( A.rows() );
1693  const size_t N( B.columns() );
1694  const size_t K( A.columns() );
1695 
1696  BLAZE_INTERNAL_ASSERT( ( N - ( N % 2UL ) ) == ( N & size_t(-2) ), "Invalid end calculation" );
1697  const size_t end( N & size_t(-2) );
1698 
1699  for( size_t i=0UL; i<M; ++i ) {
1700  for( size_t k=0UL; k<K; ++k ) {
1701  for( size_t j=0UL; j<end; j+=2UL ) {
1702  C(i,j ) -= A(i,k) * B(k,j );
1703  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
1704  }
1705  if( end < N ) {
1706  C(i,end) -= A(i,k) * B(k,end);
1707  }
1708  }
1709  }
1710  }
1712  //**********************************************************************************************
1713 
1714  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default kernel for the subtraction assignment C -= A*B. This overload accepts
// targets derived from DenseMatrix<MT3,false> and is enabled only if
// UseVectorizedDefaultKernel<MT3,MT4,MT5> applies.
//
// The columns of C are processed in tiles of 8, 4, 2 and finally 1 intrinsic vector(s) of
// IT::size elements each. For every tile the current values of C are loaded into the
// accumulators xmm1..xmm8, the products of A(i,k) and the matching vectors of row k of B are
// subtracted for all k, and the accumulators are stored back into C.
//
// NOTE(review): set() presumably broadcasts the scalar A(i,k) to all vector elements -- confirm.
// NOTE(review): the loop conditions only guarantee that the first element of the last vector
// of a tile lies below N; load()/store() presumably rely on padded rows -- confirm.
1728  template< typename MT3 // Type of the left-hand side target matrix
1729  , typename MT4 // Type of the left-hand side matrix operand
1730  , typename MT5 > // Type of the right-hand side matrix operand
1731  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1732  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1733  {
1734  typedef IntrinsicTrait<ElementType> IT;
1735 
1736  const size_t M( A.rows() );
1737  const size_t N( B.columns() );
1738  const size_t K( A.columns() );
1739 
// Column index; shared by all tile loops below so that each loop continues where the
// previous one stopped
1740  size_t j( 0UL );
1741 
// Tiles of 8 intrinsic vectors: one row of C per iteration of the i-loop
1742  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1743  for( size_t i=0UL; i<M; ++i ) {
1744  IntrinsicType xmm1( (~C).load(i,j ) );
1745  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1746  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1747  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1748  IntrinsicType xmm5( (~C).load(i,j+IT::size*4UL) );
1749  IntrinsicType xmm6( (~C).load(i,j+IT::size*5UL) );
1750  IntrinsicType xmm7( (~C).load(i,j+IT::size*6UL) );
1751  IntrinsicType xmm8( (~C).load(i,j+IT::size*7UL) );
1752  for( size_t k=0UL; k<K; ++k ) {
1753  const IntrinsicType a1( set( A(i,k) ) );
1754  xmm1 = xmm1 - a1 * B.load(k,j );
1755  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1756  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1757  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1758  xmm5 = xmm5 - a1 * B.load(k,j+IT::size*4UL);
1759  xmm6 = xmm6 - a1 * B.load(k,j+IT::size*5UL);
1760  xmm7 = xmm7 - a1 * B.load(k,j+IT::size*6UL);
1761  xmm8 = xmm8 - a1 * B.load(k,j+IT::size*7UL);
1762  }
1763  (~C).store( i, j , xmm1 );
1764  (~C).store( i, j+IT::size , xmm2 );
1765  (~C).store( i, j+IT::size*2UL, xmm3 );
1766  (~C).store( i, j+IT::size*3UL, xmm4 );
1767  (~C).store( i, j+IT::size*4UL, xmm5 );
1768  (~C).store( i, j+IT::size*5UL, xmm6 );
1769  (~C).store( i, j+IT::size*6UL, xmm7 );
1770  (~C).store( i, j+IT::size*7UL, xmm8 );
1771  }
1772  }
// Tiles of 4 intrinsic vectors: two rows of C per iteration, so that each loaded vector of B
// (b1..b4) is used for both rows
1773  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1774  size_t i( 0UL );
1775  for( ; (i+2UL) <= M; i+=2UL ) {
1776  IntrinsicType xmm1( (~C).load(i ,j ) );
1777  IntrinsicType xmm2( (~C).load(i ,j+IT::size ) );
1778  IntrinsicType xmm3( (~C).load(i ,j+IT::size*2UL) );
1779  IntrinsicType xmm4( (~C).load(i ,j+IT::size*3UL) );
1780  IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1781  IntrinsicType xmm6( (~C).load(i+1UL,j+IT::size ) );
1782  IntrinsicType xmm7( (~C).load(i+1UL,j+IT::size*2UL) );
1783  IntrinsicType xmm8( (~C).load(i+1UL,j+IT::size*3UL) );
1784  for( size_t k=0UL; k<K; ++k ) {
1785  const IntrinsicType a1( set( A(i ,k) ) );
1786  const IntrinsicType a2( set( A(i+1UL,k) ) );
1787  const IntrinsicType b1( B.load(k,j ) );
1788  const IntrinsicType b2( B.load(k,j+IT::size ) );
1789  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
1790  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
1791  xmm1 = xmm1 - a1 * b1;
1792  xmm2 = xmm2 - a1 * b2;
1793  xmm3 = xmm3 - a1 * b3;
1794  xmm4 = xmm4 - a1 * b4;
1795  xmm5 = xmm5 - a2 * b1;
1796  xmm6 = xmm6 - a2 * b2;
1797  xmm7 = xmm7 - a2 * b3;
1798  xmm8 = xmm8 - a2 * b4;
1799  }
1800  (~C).store( i , j , xmm1 );
1801  (~C).store( i , j+IT::size , xmm2 );
1802  (~C).store( i , j+IT::size*2UL, xmm3 );
1803  (~C).store( i , j+IT::size*3UL, xmm4 );
1804  (~C).store( i+1UL, j , xmm5 );
1805  (~C).store( i+1UL, j+IT::size , xmm6 );
1806  (~C).store( i+1UL, j+IT::size*2UL, xmm7 );
1807  (~C).store( i+1UL, j+IT::size*3UL, xmm8 );
1808  }
// Single remaining row in case M is odd
1809  if( i < M ) {
1810  IntrinsicType xmm1( (~C).load(i,j ) );
1811  IntrinsicType xmm2( (~C).load(i,j+IT::size ) );
1812  IntrinsicType xmm3( (~C).load(i,j+IT::size*2UL) );
1813  IntrinsicType xmm4( (~C).load(i,j+IT::size*3UL) );
1814  for( size_t k=0UL; k<K; ++k ) {
1815  const IntrinsicType a1( set( A(i,k) ) );
1816  xmm1 = xmm1 - a1 * B.load(k,j );
1817  xmm2 = xmm2 - a1 * B.load(k,j+IT::size );
1818  xmm3 = xmm3 - a1 * B.load(k,j+IT::size*2UL);
1819  xmm4 = xmm4 - a1 * B.load(k,j+IT::size*3UL);
1820  }
1821  (~C).store( i, j , xmm1 );
1822  (~C).store( i, j+IT::size , xmm2 );
1823  (~C).store( i, j+IT::size*2UL, xmm3 );
1824  (~C).store( i, j+IT::size*3UL, xmm4 );
1825  }
1826  }
// Tiles of 2 intrinsic vectors: again two rows of C per iteration plus an optional last row
1827  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1828  size_t i( 0UL );
1829  for( ; (i+2UL) <= M; i+=2UL ) {
1830  IntrinsicType xmm1( (~C).load(i ,j ) );
1831  IntrinsicType xmm2( (~C).load(i ,j+IT::size) );
1832  IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1833  IntrinsicType xmm4( (~C).load(i+1UL,j+IT::size) );
1834  for( size_t k=0UL; k<K; ++k ) {
1835  const IntrinsicType a1( set( A(i ,k) ) );
1836  const IntrinsicType a2( set( A(i+1UL,k) ) );
1837  const IntrinsicType b1( B.load(k,j ) );
1838  const IntrinsicType b2( B.load(k,j+IT::size) );
1839  xmm1 = xmm1 - a1 * b1;
1840  xmm2 = xmm2 - a1 * b2;
1841  xmm3 = xmm3 - a2 * b1;
1842  xmm4 = xmm4 - a2 * b2;
1843  }
1844  (~C).store( i , j , xmm1 );
1845  (~C).store( i , j+IT::size, xmm2 );
1846  (~C).store( i+1UL, j , xmm3 );
1847  (~C).store( i+1UL, j+IT::size, xmm4 );
1848  }
// Single remaining row in case M is odd
1849  if( i < M ) {
1850  IntrinsicType xmm1( (~C).load(i,j ) );
1851  IntrinsicType xmm2( (~C).load(i,j+IT::size) );
1852  for( size_t k=0UL; k<K; ++k ) {
1853  const IntrinsicType a1( set( A(i,k) ) );
1854  xmm1 = xmm1 - a1 * B.load(k,j );
1855  xmm2 = xmm2 - a1 * B.load(k,j+IT::size);
1856  }
1857  (~C).store( i, j , xmm1 );
1858  (~C).store( i, j+IT::size, xmm2 );
1859  }
1860  }
// Last intrinsic vector of columns (executed at most once since j is not advanced any further)
1861  if( j < N ) {
1862  size_t i( 0UL );
1863  for( ; (i+2UL) <= M; i+=2UL ) {
1864  IntrinsicType xmm1( (~C).load(i ,j) );
1865  IntrinsicType xmm2( (~C).load(i+1UL,j) );
1866  for( size_t k=0UL; k<K; ++k ) {
1867  const IntrinsicType b1( B.load(k,j) );
1868  xmm1 = xmm1 - set( A(i ,k) ) * b1;
1869  xmm2 = xmm2 - set( A(i+1UL,k) ) * b1;
1870  }
1871  (~C).store( i , j, xmm1 );
1872  (~C).store( i+1UL, j, xmm2 );
1873  }
// Single remaining row in case M is odd
1874  if( i < M ) {
1875  IntrinsicType xmm1( (~C).load(i,j) );
1876  for( size_t k=0UL; k<K; ++k ) {
1877  xmm1 = xmm1 - set( A(i,k) ) * B.load(k,j);
1878  }
1879  (~C).store( i, j, xmm1 );
1880  }
1881  }
1882  }
1884  //**********************************************************************************************
1885 
1886  //**Vectorized default subtraction assignment to column-major dense matrices********************
1900  template< typename MT3 // Type of the left-hand side target matrix
1901  , typename MT4 // Type of the left-hand side matrix operand
1902  , typename MT5 > // Type of the right-hand side matrix operand
1903  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1904  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1905  {
1906  typedef IntrinsicTrait<ElementType> IT;
1907 
1908  const size_t M( A.rows() );
1909  const size_t N( B.columns() );
1910  const size_t K( A.columns() );
1911 
1912  size_t i( 0UL );
1913 
1914  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1915  for( size_t j=0UL; j<N; ++j ) {
1916  IntrinsicType xmm1( (~C).load(i ,j) );
1917  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
1918  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
1919  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
1920  IntrinsicType xmm5( (~C).load(i+IT::size*4UL,j) );
1921  IntrinsicType xmm6( (~C).load(i+IT::size*5UL,j) );
1922  IntrinsicType xmm7( (~C).load(i+IT::size*6UL,j) );
1923  IntrinsicType xmm8( (~C).load(i+IT::size*7UL,j) );
1924  for( size_t k=0UL; k<K; ++k ) {
1925  const IntrinsicType b1( set( B(k,j) ) );
1926  xmm1 = xmm1 - A.load(i ,k) * b1;
1927  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
1928  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
1929  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
1930  xmm5 = xmm5 - A.load(i+IT::size*4UL,k) * b1;
1931  xmm6 = xmm6 - A.load(i+IT::size*5UL,k) * b1;
1932  xmm7 = xmm7 - A.load(i+IT::size*6UL,k) * b1;
1933  xmm8 = xmm8 - A.load(i+IT::size*7UL,k) * b1;
1934  }
1935  (~C).store( i , j, xmm1 );
1936  (~C).store( i+IT::size , j, xmm2 );
1937  (~C).store( i+IT::size*2UL, j, xmm3 );
1938  (~C).store( i+IT::size*3UL, j, xmm4 );
1939  (~C).store( i+IT::size*4UL, j, xmm5 );
1940  (~C).store( i+IT::size*5UL, j, xmm6 );
1941  (~C).store( i+IT::size*6UL, j, xmm7 );
1942  (~C).store( i+IT::size*7UL, j, xmm8 );
1943  }
1944  }
1945  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1946  size_t j( 0UL );
1947  for( ; (j+2UL) <= N; j+=2UL ) {
1948  IntrinsicType xmm1( (~C).load(i ,j ) );
1949  IntrinsicType xmm2( (~C).load(i+IT::size ,j ) );
1950  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j ) );
1951  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j ) );
1952  IntrinsicType xmm5( (~C).load(i ,j+1UL) );
1953  IntrinsicType xmm6( (~C).load(i+IT::size ,j+1UL) );
1954  IntrinsicType xmm7( (~C).load(i+IT::size*2UL,j+1UL) );
1955  IntrinsicType xmm8( (~C).load(i+IT::size*3UL,j+1UL) );
1956  for( size_t k=0UL; k<K; ++k ) {
1957  const IntrinsicType a1( A.load(i ,k) );
1958  const IntrinsicType a2( A.load(i+IT::size ,k) );
1959  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
1960  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
1961  const IntrinsicType b1( set( B(k,j ) ) );
1962  const IntrinsicType b2( set( B(k,j+1UL) ) );
1963  xmm1 = xmm1 - a1 * b1;
1964  xmm2 = xmm2 - a2 * b1;
1965  xmm3 = xmm3 - a3 * b1;
1966  xmm4 = xmm4 - a4 * b1;
1967  xmm5 = xmm5 - a1 * b2;
1968  xmm6 = xmm6 - a2 * b2;
1969  xmm7 = xmm7 - a3 * b2;
1970  xmm8 = xmm8 - a4 * b2;
1971  }
1972  (~C).store( i , j , xmm1 );
1973  (~C).store( i+IT::size , j , xmm2 );
1974  (~C).store( i+IT::size*2UL, j , xmm3 );
1975  (~C).store( i+IT::size*3UL, j , xmm4 );
1976  (~C).store( i , j+1UL, xmm5 );
1977  (~C).store( i+IT::size , j+1UL, xmm6 );
1978  (~C).store( i+IT::size*2UL, j+1UL, xmm7 );
1979  (~C).store( i+IT::size*3UL, j+1UL, xmm8 );
1980  }
1981  if( j < N ) {
1982  IntrinsicType xmm1( (~C).load(i ,j) );
1983  IntrinsicType xmm2( (~C).load(i+IT::size ,j) );
1984  IntrinsicType xmm3( (~C).load(i+IT::size*2UL,j) );
1985  IntrinsicType xmm4( (~C).load(i+IT::size*3UL,j) );
1986  for( size_t k=0UL; k<K; ++k ) {
1987  const IntrinsicType b1( set( B(k,j) ) );
1988  xmm1 = xmm1 - A.load(i ,k) * b1;
1989  xmm2 = xmm2 - A.load(i+IT::size ,k) * b1;
1990  xmm3 = xmm3 - A.load(i+IT::size*2UL,k) * b1;
1991  xmm4 = xmm4 - A.load(i+IT::size*3UL,k) * b1;
1992  }
1993  (~C).store( i , j, xmm1 );
1994  (~C).store( i+IT::size , j, xmm2 );
1995  (~C).store( i+IT::size*2UL, j, xmm3 );
1996  (~C).store( i+IT::size*3UL, j, xmm4 );
1997  }
1998  }
1999  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2000  size_t j( 0UL );
2001  for( ; (j+2UL) <= N; j+=2UL ) {
2002  IntrinsicType xmm1( (~C).load(i ,j ) );
2003  IntrinsicType xmm2( (~C).load(i+IT::size,j ) );
2004  IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2005  IntrinsicType xmm4( (~C).load(i+IT::size,j+1UL) );
2006  for( size_t k=0UL; k<K; ++k ) {
2007  const IntrinsicType a1( A.load(i ,k) );
2008  const IntrinsicType a2( A.load(i+IT::size,k) );
2009  const IntrinsicType b1( set( B(k,j ) ) );
2010  const IntrinsicType b2( set( B(k,j+1UL) ) );
2011  xmm1 = xmm1 - a1 * b1;
2012  xmm2 = xmm2 - a2 * b1;
2013  xmm3 = xmm3 - a1 * b2;
2014  xmm4 = xmm4 - a2 * b2;
2015  }
2016  (~C).store( i , j , xmm1 );
2017  (~C).store( i+IT::size, j , xmm2 );
2018  (~C).store( i , j+1UL, xmm3 );
2019  (~C).store( i+IT::size, j+1UL, xmm4 );
2020  }
2021  if( j < N ) {
2022  IntrinsicType xmm1( (~C).load(i ,j) );
2023  IntrinsicType xmm2( (~C).load(i+IT::size,j) );
2024  for( size_t k=0UL; k<K; ++k ) {
2025  const IntrinsicType b1( set( B(k,j) ) );
2026  xmm1 = xmm1 - A.load(i ,k) * b1;
2027  xmm2 = xmm2 - A.load(i+IT::size,k) * b1;
2028  }
2029  (~C).store( i , j, xmm1 );
2030  (~C).store( i+IT::size, j, xmm2 );
2031  }
2032  }
2033  if( i < M ) {
2034  size_t j( 0UL );
2035  for( ; (j+2UL) <= N; j+=2UL ) {
2036  IntrinsicType xmm1( (~C).load(i,j ) );
2037  IntrinsicType xmm2( (~C).load(i,j+1UL) );
2038  for( size_t k=0UL; k<K; ++k ) {
2039  const IntrinsicType a1( A.load(i,k) );
2040  xmm1 = xmm1 - a1 * set( B(k,j ) );
2041  xmm2 = xmm2 - a1 * set( B(k,j+1UL) );
2042  }
2043  (~C).store( i, j , xmm1 );
2044  (~C).store( i, j+1UL, xmm2 );
2045  }
2046  if( j < N ) {
2047  IntrinsicType xmm1( (~C).load(i,j) );
2048  for( size_t k=0UL; k<K; ++k ) {
2049  xmm1 = xmm1 - A.load(i,k) * set( B(k,j) );
2050  }
2051  (~C).store( i, j, xmm1 );
2052  }
2053  }
2054  }
2056  //**********************************************************************************************
2057 
2058  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback overload of the BLAS-based subtraction assignment. It is selected through EnableIf
// when UseDefaultKernel<MT3,MT4,MT5> holds and simply forwards to the default kernel.
2072  template< typename MT3 // Target matrix type (left-hand side of the assignment)
2073  , typename MT4 // Type of the left multiplication operand
2074  , typename MT5 > // Type of the right multiplication operand
2075  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2076  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2077  {
2078  TDMatDMatMultExpr::selectDefaultSubAssignKernel( C, A, B );
2079  }
2081  //**********************************************************************************************
2082 
2083  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
2084 #if BLAZE_BLAS_MODE
2085 
// BLAS-based subtraction assignment for single precision operands. A single cblas_sgemm call
// with alpha = -1 and beta = 1 is issued, i.e. the product of A and B is subtracted from C.
2098  template< typename MT3 // Target matrix type (left-hand side of the assignment)
2099  , typename MT4 // Type of the left multiplication operand
2100  , typename MT5 > // Type of the right multiplication operand
2101  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2102  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2103  {
2109 
      // The CBLAS interface expects int arguments; every size is converted via boost::numeric_cast.
2110  const int m ( boost::numeric_cast<int>( A.rows() ) );
2111  const int n ( boost::numeric_cast<int>( B.columns() ) );
2112  const int k ( boost::numeric_cast<int>( A.columns() ) );
2113  const int strideA( boost::numeric_cast<int>( A.spacing() ) );
2114  const int strideB( boost::numeric_cast<int>( B.spacing() ) );
2115  const int strideC( boost::numeric_cast<int>( C.spacing() ) );
2116 
      // Row-major target: A is passed as transposed, B as is. Column-major target: the flags swap.
2117  cblas_sgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
2118  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
2119  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
2120  m, n, k, -1.0F, A.data(), strideA, B.data(), strideB, 1.0F, C.data(), strideC );
2121  }
2123 #endif
2124  //**********************************************************************************************
2125 
2126  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
2127 #if BLAZE_BLAS_MODE
2128 
// BLAS-based subtraction assignment for double precision operands. A single cblas_dgemm call
// with alpha = -1 and beta = 1 is issued, i.e. the product of A and B is subtracted from C.
2141  template< typename MT3 // Target matrix type (left-hand side of the assignment)
2142  , typename MT4 // Type of the left multiplication operand
2143  , typename MT5 > // Type of the right multiplication operand
2144  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2145  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2146  {
2152 
      // The CBLAS interface expects int arguments; every size is converted via boost::numeric_cast.
2153  const int m ( boost::numeric_cast<int>( A.rows() ) );
2154  const int n ( boost::numeric_cast<int>( B.columns() ) );
2155  const int k ( boost::numeric_cast<int>( A.columns() ) );
2156  const int strideA( boost::numeric_cast<int>( A.spacing() ) );
2157  const int strideB( boost::numeric_cast<int>( B.spacing() ) );
2158  const int strideC( boost::numeric_cast<int>( C.spacing() ) );
2159 
      // Row-major target: A is passed as transposed, B as is. Column-major target: the flags swap.
2160  cblas_dgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
2161  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
2162  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
2163  m, n, k, -1.0, A.data(), strideA, B.data(), strideB, 1.0, C.data(), strideC );
2164  }
2166 #endif
2167  //**********************************************************************************************
2168 
2169  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
2170 #if BLAZE_BLAS_MODE
2171 
// BLAS-based subtraction assignment for complex<float> operands. A single cblas_cgemm call with
// alpha = (-1,0) and beta = (1,0) is issued, i.e. the product of A and B is subtracted from C.
2184  template< typename MT3 // Target matrix type (left-hand side of the assignment)
2185  , typename MT4 // Type of the left multiplication operand
2186  , typename MT5 > // Type of the right multiplication operand
2187  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2188  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2189  {
      // Compile-time checks on the value type underlying each complex element type.
2195  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
2196  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
2197  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
2198 
      // The CBLAS interface expects int arguments; every size is converted via boost::numeric_cast.
2199  const int m ( boost::numeric_cast<int>( A.rows() ) );
2200  const int n ( boost::numeric_cast<int>( B.columns() ) );
2201  const int k ( boost::numeric_cast<int>( A.columns() ) );
2202  const int strideA( boost::numeric_cast<int>( A.spacing() ) );
2203  const int strideB( boost::numeric_cast<int>( B.spacing() ) );
2204  const int strideC( boost::numeric_cast<int>( C.spacing() ) );
      // The complex scaling factors are handed to cblas_cgemm by address.
2205  const complex<float> minusOne( -1.0F, 0.0F );
2206  const complex<float> plusOne ( 1.0F, 0.0F );
2207 
      // Row-major target: A is passed as transposed, B as is. Column-major target: the flags swap.
2208  cblas_cgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
2209  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
2210  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
2211  m, n, k, &minusOne, A.data(), strideA, B.data(), strideB, &plusOne, C.data(), strideC );
2212  }
2214 #endif
2215  //**********************************************************************************************
2216 
2217  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
2218 #if BLAZE_BLAS_MODE
2219 
// BLAS-based subtraction assignment for complex<double> operands. A single cblas_zgemm call with
// alpha = (-1,0) and beta = (1,0) is issued, i.e. the product of A and B is subtracted from C.
2232  template< typename MT3 // Target matrix type (left-hand side of the assignment)
2233  , typename MT4 // Type of the left multiplication operand
2234  , typename MT5 > // Type of the right multiplication operand
2235  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2236  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2237  {
      // Compile-time checks on the value type underlying each complex element type.
2243  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
2244  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
2245  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
2246 
      // The CBLAS interface expects int arguments; every size is converted via boost::numeric_cast.
2247  const int m ( boost::numeric_cast<int>( A.rows() ) );
2248  const int n ( boost::numeric_cast<int>( B.columns() ) );
2249  const int k ( boost::numeric_cast<int>( A.columns() ) );
2250  const int strideA( boost::numeric_cast<int>( A.spacing() ) );
2251  const int strideB( boost::numeric_cast<int>( B.spacing() ) );
2252  const int strideC( boost::numeric_cast<int>( C.spacing() ) );
      // The complex scaling factors are handed to cblas_zgemm by address.
2253  const complex<double> minusOne( -1.0, 0.0 );
2254  const complex<double> plusOne ( 1.0, 0.0 );
2255 
      // Row-major target: A is passed as transposed, B as is. Column-major target: the flags swap.
2256  cblas_zgemm( IsRowMajorMatrix<MT3>::value ? CblasRowMajor : CblasColMajor,
2257  IsRowMajorMatrix<MT3>::value ? CblasTrans : CblasNoTrans,
2258  IsRowMajorMatrix<MT3>::value ? CblasNoTrans : CblasTrans,
2259  m, n, k, &minusOne, A.data(), strideA, B.data(), strideB, &plusOne, C.data(), strideC );
2260  }
2262 #endif
2263  //**********************************************************************************************
2264 
2265  //**Subtraction assignment to sparse matrices***************************************************
2266  // No special implementation for the subtraction assignment to sparse matrices.
2267  //**********************************************************************************************
2268 
2269  //**Multiplication assignment to dense matrices*************************************************
2270  // No special implementation for the multiplication assignment to dense matrices.
2271  //**********************************************************************************************
2272 
2273  //**Multiplication assignment to sparse matrices************************************************
2274  // No special implementation for the multiplication assignment to sparse matrices.
2275  //**********************************************************************************************
2276 
2277  //**Compile time checks*************************************************************************
2284  //**********************************************************************************************
2285 };
2286 //*************************************************************************************************
2287 
2288 
2289 
2290 
2291 //=================================================================================================
2292 //
2293 // DMATSCALARMULTEXPR SPECIALIZATION
2294 //
2295 //=================================================================================================
2296 
2297 //*************************************************************************************************
2305 template< typename MT1 // Type of the left-hand side dense matrix
2306  , typename MT2 // Type of the right-hand side dense matrix
2307  , typename ST > // Type of the right-hand side scalar value
2308 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >
2309  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
2310  , private MatScalarMultExpr
2311  , private Computation
2312 {
2313  private:
2314  //**Type definitions****************************************************************************
2315  typedef TDMatDMatMultExpr<MT1,MT2> MMM;  // Type of the wrapped matrix/matrix multiplication expression
2316  typedef typename MMM::ResultType RES;  // Result type of the wrapped multiplication expression
2317  typedef typename MT1::ResultType RT1;  // Result type of the left-hand side matrix type MT1
2318  typedef typename MT2::ResultType RT2;  // Result type of the right-hand side matrix type MT2
2319  typedef typename MT1::CompositeType CT1;  // Composite type of the left-hand side matrix type MT1
2320  typedef typename MT2::CompositeType CT2;  // Composite type of the right-hand side matrix type MT2
2321  //**********************************************************************************************
2322 
2323  //**********************************************************************************************
2325 
// Compile-time switch for the single precision BLAS kernel: value is nonzero when IsFloat holds
// for the element types of all three matrix types and the scalar type is not complex.
2328  template< typename MatC, typename MatA, typename MatB, typename Scalar >
2329  struct UseSinglePrecisionKernel {
2330  enum { value = IsFloat<typename MatC::ElementType>::value &&
2331  IsFloat<typename MatA::ElementType>::value &&
2332  IsFloat<typename MatB::ElementType>::value &&
2333  !IsComplex<Scalar>::value };
2334  };
2335  //**********************************************************************************************
2336 
2337  //**********************************************************************************************
2339 
// Compile-time switch for the double precision BLAS kernel: value is nonzero when IsDouble holds
// for the element types of all three matrix types and the scalar type is not complex.
2342  template< typename MatC, typename MatA, typename MatB, typename Scalar >
2343  struct UseDoublePrecisionKernel {
2344  enum { value = IsDouble<typename MatC::ElementType>::value &&
2345  IsDouble<typename MatA::ElementType>::value &&
2346  IsDouble<typename MatB::ElementType>::value &&
2347  !IsComplex<Scalar>::value };
2348  };
2349  //**********************************************************************************************
2350 
2351  //**********************************************************************************************
2353 
// Compile-time switch for the single precision complex BLAS kernel: value is nonzero when the
// element type of each of the three matrix types is exactly complex<float>.
2356  template< typename MatC, typename MatA, typename MatB >
2357  struct UseSinglePrecisionComplexKernel {
2358  typedef complex<float> Type;
2359  enum { value = IsSame<typename MatC::ElementType,Type>::value &&
2360  IsSame<typename MatA::ElementType,Type>::value &&
2361  IsSame<typename MatB::ElementType,Type>::value };
2362  };
2363  //**********************************************************************************************
2364 
2365  //**********************************************************************************************
2367 
// Compile-time switch for the double precision complex BLAS kernel: value is nonzero when the
// element type of each of the three matrix types is exactly complex<double>.
2370  template< typename MatC, typename MatA, typename MatB >
2371  struct UseDoublePrecisionComplexKernel {
2372  typedef complex<double> Type;
2373  enum { value = IsSame<typename MatC::ElementType,Type>::value &&
2374  IsSame<typename MatA::ElementType,Type>::value &&
2375  IsSame<typename MatB::ElementType,Type>::value };
2376  };
2377  //**********************************************************************************************
2378 
2379  //**********************************************************************************************
2381 
// Compile-time switch for the non-BLAS fallback: value is nonzero when BLAZE_BLAS_MODE is off
// or when none of the four BLAS kernel switches applies to the given type combination.
2383  template< typename MatC, typename MatA, typename MatB, typename Scalar >
2384  struct UseDefaultKernel {
2385  enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<MatC,MatA,MatB,Scalar>::value &&
2386  !UseDoublePrecisionKernel<MatC,MatA,MatB,Scalar>::value &&
2387  !UseSinglePrecisionComplexKernel<MatC,MatA,MatB>::value &&
2388  !UseDoublePrecisionComplexKernel<MatC,MatA,MatB>::value ) };
2389  };
2390  //**********************************************************************************************
2391 
2392  //**********************************************************************************************
2394 
// Compile-time switch for the vectorized default kernel: value is nonzero when all three matrix
// types are flagged vectorizable, their element types and the scalar type are all the same type,
// and IntrinsicTrait reports both addition and multiplication for that element type.
2396  template< typename MatC, typename MatA, typename MatB, typename Scalar >
2397  struct UseVectorizedDefaultKernel {
2398  enum { value = MatC::vectorizable && MatA::vectorizable && MatB::vectorizable &&
2399  IsSame<typename MatC::ElementType,typename MatA::ElementType>::value &&
2400  IsSame<typename MatC::ElementType,typename MatB::ElementType>::value &&
2401  IsSame<typename MatC::ElementType,Scalar>::value &&
2402  IntrinsicTrait<typename MatC::ElementType>::addition &&
2403  IntrinsicTrait<typename MatC::ElementType>::multiplication };
2404  };
2405  //**********************************************************************************************
2406 
2407  public:
2408  //**Type definitions****************************************************************************
2409  typedef DMatScalarMultExpr<MMM,ST,true> This;  // Type of this expression specialization
2410  typedef typename MultTrait<RES,ST>::Type ResultType;  // Result type of multiplying RES by the scalar type ST
2411  typedef typename ResultType::OppositeType OppositeType;  // OppositeType as exported by ResultType
2412  typedef typename ResultType::TransposeType TransposeType;  // TransposeType as exported by ResultType
2413  typedef typename ResultType::ElementType ElementType;  // Element type of ResultType
2414  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType;  // Intrinsic type associated with ElementType
2415  typedef const ElementType ReturnType;  // Element-wise return type (a const element)
2416  typedef const ResultType CompositeType;  // Type used when this expression appears as an operand
2417 
      // Type of the left-hand side operand: the wrapped matrix/matrix multiplication expression.
2419  typedef const TDMatDMatMultExpr<MT1,MT2> LeftOperand;
2420 
      // Type of the right-hand side operand: the scalar.
2422  typedef ST RightOperand;
2423 
      // Type used by the kernels for the left matrix operand; SelectType picks between const RT1
      // and CT1 based on IsComputation<MT1> (presumably RT1 when MT1 is a computation - confirm).
2425  typedef typename SelectType< IsComputation<MT1>::value, const RT1, CT1 >::Type LT;
2426 
      // Type used by the kernels for the right matrix operand; SelectType picks between const RT2
      // and CT2 based on IsComputation<MT2> (presumably RT2 when MT2 is a computation - confirm).
2428  typedef typename SelectType< IsComputation<MT2>::value, const RT2, CT2 >::Type RT;
2429  //**********************************************************************************************
2430 
2431  //**Compilation flags***************************************************************************
2433  enum { vectorizable = 0 };  // Compilation flag: this expression type reports itself as not vectorizable
2434  //**********************************************************************************************
2435 
2436  //**Constructor*********************************************************************************
// Constructor: stores the matrix multiplication expression and the scalar factor.
// \param matrix The left-hand side matrix multiplication expression.
// \param scalar The right-hand side scalar of the multiplication expression.
2442  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
2443  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
2444  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2445  {}
2446  //**********************************************************************************************
2447 
2448  //**Access operator*****************************************************************************
// Element access: returns element (i,j) of the wrapped matrix multiplication expression
// multiplied by the scalar. The return type is ReturnType (const ElementType); a single
// element product must not be declared as the matrix-valued ResultType.
// \param i Row index; asserted to be smaller than the number of rows.
// \param j Column index; asserted to be smaller than the number of columns.
// \return The scaled element.
2455  inline ReturnType operator()( size_t i, size_t j ) const {
2456  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
2457  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
2458  return matrix_(i,j) * scalar_;
2459  }
2460  //**********************************************************************************************
2461 
2462  //**Rows function*******************************************************************************
// Returns the number of rows, as reported by the wrapped matrix multiplication expression.
2467  inline size_t rows() const {
2468  return matrix_.rows();
2469  }
2470  //**********************************************************************************************
2471 
2472  //**Columns function****************************************************************************
// Returns the number of columns, as reported by the wrapped matrix multiplication expression.
2477  inline size_t columns() const {
2478  return matrix_.columns();
2479  }
2480  //**********************************************************************************************
2481 
2482  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the wrapped matrix multiplication expression.
2487  inline LeftOperand leftOperand() const {
2488  return matrix_;
2489  }
2490  //**********************************************************************************************
2491 
2492  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the scalar factor.
2497  inline RightOperand rightOperand() const {
2498  return scalar_;
2499  }
2500  //**********************************************************************************************
2501 
2502  //**********************************************************************************************
// Aliasing query: forwards the given address to canAlias() of the wrapped matrix
// multiplication expression and returns its answer unchanged.
// \param alias The address to be checked.
2508  template< typename T >
2509  inline bool canAlias( const T* alias ) const {
2510  return matrix_.canAlias( alias );
2511  }
2512  //**********************************************************************************************
2513 
2514  //**********************************************************************************************
// Aliasing query: forwards the given address to isAliased() of the wrapped matrix
// multiplication expression and returns its answer unchanged.
// \param alias The address to be checked.
2520  template< typename T >
2521  inline bool isAliased( const T* alias ) const {
2522  return matrix_.isAliased( alias );
2523  }
2524  //**********************************************************************************************
2525 
2526  private:
2527  //**Member variables****************************************************************************
2529  RightOperand scalar_;  // Right-hand side scalar of the multiplication expression
      // NOTE(review): the constructor and accessors use a member matrix_, but its declaration
      // (listing line 2528) is absent from this listing - confirm against the original header.
2530  //**********************************************************************************************
2531 
2532  //**Assignment to dense matrices****************************************************************
// Assignment of a scaled matrix/matrix multiplication to a dense matrix. An empty target is left
// untouched, a multiplication with an empty inner dimension resets the target, and otherwise the
// operands are evaluated and handed to the default or the BLAS kernel depending on the target size.
2541  template< typename MT3 // Target matrix type
2542  , bool SO > // Storage order flag of the target matrix
2543  friend inline void assign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
2544  {
2546 
2547  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2548  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2549 
2550  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
2551  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
2552 
      // Guard 1: nothing to do for a target without elements.
2553  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL )
2554  return;
2555 
      // Guard 2: no inner dimension means there is nothing to accumulate; the target is reset.
2556  if( left.columns() == 0UL ) {
2557  reset( ~lhs );
2558  return;
2559  }
2560 
2561  LT A( left ); // Evaluated left multiplication operand
2562  RT B( right ); // Evaluated right multiplication operand
2563 
2564  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2565  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
2566  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
2567  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
2568  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2569  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
2570 
      // Targets with at least TDMATDMATMULT_THRESHOLD elements go to the BLAS kernel selection,
      // smaller ones to the default kernels.
2571  if( (~lhs).rows() * (~lhs).columns() >= TDMATDMATMULT_THRESHOLD )
2572  DMatScalarMultExpr::selectBlasAssignKernel( ~lhs, A, B, rhs.scalar_ );
2573  else
2574  DMatScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, B, rhs.scalar_ );
2575  }
2576  //**********************************************************************************************
2577 
2578  //**Default assignment to dense matrices********************************************************
// Non-vectorized default kernel, chosen via DisableIf when UseVectorizedDefaultKernel does not
// hold. Each row of C is initialized with the first term of the product, the remaining terms
// are accumulated, and finally the row is multiplied by the scalar.
2592  template< typename MT3 // Target matrix type
2593  , typename MT4 // Type of the left multiplication operand
2594  , typename MT5 // Type of the right multiplication operand
2595  , typename ST2 > // Scalar type
2596  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2597  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2598  {
2599  for( size_t row=0UL; row<A.rows(); ++row ) {
      // First term of every element in this row (inner index 0).
2600  for( size_t col=0UL; col<B.columns(); ++col ) {
2601  C(row,col) = A(row,0UL) * B(0UL,col);
2602  }
      // Remaining terms of the inner dimension.
2603  for( size_t inner=1UL; inner<A.columns(); ++inner ) {
2604  for( size_t col=0UL; col<B.columns(); ++col ) {
2605  C(row,col) += A(row,inner) * B(inner,col);
2606  }
2607  }
      // Scaling of the finished row.
2608  for( size_t col=0UL; col<B.columns(); ++col ) {
2609  C(row,col) *= scalar;
2610  }
2611  }
2612  }
2613  //**********************************************************************************************
2614 
2615  //**Vectorized default assignment to row-major dense matrices***********************************
// Vectorized default kernel for a row-major target, enabled via EnableIf when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. For every output position the products of
// elements of A with vectors loaded from B are accumulated over k, and the sum is multiplied by
// the scalar factor when stored to C. Columns are walked in steps of 8, 4, 2 and 1 times
// IT::size (presumably the number of elements per IntrinsicType - confirm in IntrinsicTrait).
// NOTE(review): all accumulators are default-constructed; the kernel relies on that yielding
// zeros - confirm against the IntrinsicType definition.
2629  template< typename MT3 // Type of the left-hand side target matrix
2630  , typename MT4 // Type of the left-hand side matrix operand
2631  , typename MT5 // Type of the right-hand side matrix operand
2632  , typename ST2 > // Type of the scalar value
2633  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2634  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
2635  {
2636  typedef IntrinsicTrait<ElementType> IT;
2637 
2638  const size_t M( A.rows() );
2639  const size_t N( B.columns() );
2640  const size_t K( A.columns() );
2641 
      // Scalar factor converted once via set() (presumably a broadcast to all lanes - confirm).
2642  const IntrinsicType factor( set( scalar ) );
2643 
2644  size_t j( 0UL );
2645 
      // Column blocks of 8*IT::size: one row of C at a time, eight accumulators.
2646  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2647  for( size_t i=0UL; i<M; ++i ) {
2648  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2649  for( size_t k=0UL; k<K; ++k ) {
2650  const IntrinsicType a1( set( A(i,k) ) );
2651  xmm1 = xmm1 + a1 * B.load(k,j );
2652  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2653  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2654  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2655  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
2656  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
2657  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
2658  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
2659  }
2660  (~C).store( i, j , xmm1 * factor );
2661  (~C).store( i, j+IT::size , xmm2 * factor );
2662  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
2663  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
2664  (~C).store( i, j+IT::size*4UL, xmm5 * factor );
2665  (~C).store( i, j+IT::size*5UL, xmm6 * factor );
2666  (~C).store( i, j+IT::size*6UL, xmm7 * factor );
2667  (~C).store( i, j+IT::size*7UL, xmm8 * factor );
2668  }
2669  }
      // Column blocks of 4*IT::size: two rows of C at a time (xmm1-4 for row i, xmm5-8 for row i+1).
2670  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2671  size_t i( 0UL );
2672  for( ; (i+2UL) <= M; i+=2UL ) {
2673  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2674  for( size_t k=0UL; k<K; ++k ) {
2675  const IntrinsicType a1( set( A(i ,k) ) );
2676  const IntrinsicType a2( set( A(i+1UL,k) ) );
2677  const IntrinsicType b1( B.load(k,j ) );
2678  const IntrinsicType b2( B.load(k,j+IT::size ) );
2679  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
2680  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
2681  xmm1 = xmm1 + a1 * b1;
2682  xmm2 = xmm2 + a1 * b2;
2683  xmm3 = xmm3 + a1 * b3;
2684  xmm4 = xmm4 + a1 * b4;
2685  xmm5 = xmm5 + a2 * b1;
2686  xmm6 = xmm6 + a2 * b2;
2687  xmm7 = xmm7 + a2 * b3;
2688  xmm8 = xmm8 + a2 * b4;
2689  }
2690  (~C).store( i , j , xmm1 * factor );
2691  (~C).store( i , j+IT::size , xmm2 * factor );
2692  (~C).store( i , j+IT::size*2UL, xmm3 * factor );
2693  (~C).store( i , j+IT::size*3UL, xmm4 * factor );
2694  (~C).store( i+1UL, j , xmm5 * factor );
2695  (~C).store( i+1UL, j+IT::size , xmm6 * factor );
2696  (~C).store( i+1UL, j+IT::size*2UL, xmm7 * factor );
2697  (~C).store( i+1UL, j+IT::size*3UL, xmm8 * factor );
2698  }
      // Odd number of rows: the last row is handled on its own.
2699  if( i < M ) {
2700  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2701  for( size_t k=0UL; k<K; ++k ) {
2702  const IntrinsicType a1( set( A(i,k) ) );
2703  xmm1 = xmm1 + a1 * B.load(k,j );
2704  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
2705  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
2706  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
2707  }
2708  (~C).store( i, j , xmm1 * factor );
2709  (~C).store( i, j+IT::size , xmm2 * factor );
2710  (~C).store( i, j+IT::size*2UL, xmm3 * factor );
2711  (~C).store( i, j+IT::size*3UL, xmm4 * factor );
2712  }
2713  }
      // Column blocks of 2*IT::size: two rows of C at a time (xmm1-2 for row i, xmm3-4 for row i+1).
2714  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2715  size_t i( 0UL );
2716  for( ; (i+2UL) <= M; i+=2UL ) {
2717  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2718  for( size_t k=0UL; k<K; ++k ) {
2719  const IntrinsicType a1( set( A(i ,k) ) );
2720  const IntrinsicType a2( set( A(i+1UL,k) ) );
2721  const IntrinsicType b1( B.load(k,j ) );
2722  const IntrinsicType b2( B.load(k,j+IT::size) );
2723  xmm1 = xmm1 + a1 * b1;
2724  xmm2 = xmm2 + a1 * b2;
2725  xmm3 = xmm3 + a2 * b1;
2726  xmm4 = xmm4 + a2 * b2;
2727  }
2728  (~C).store( i , j , xmm1 * factor );
2729  (~C).store( i , j+IT::size, xmm2 * factor );
2730  (~C).store( i+1UL, j , xmm3 * factor );
2731  (~C).store( i+1UL, j+IT::size, xmm4 * factor );
2732  }
      // Odd number of rows: the last row is handled on its own.
2733  if( i < M ) {
2734  IntrinsicType xmm1, xmm2;
2735  for( size_t k=0UL; k<K; ++k ) {
2736  const IntrinsicType a1( set( A(i,k) ) );
2737  xmm1 = xmm1 + a1 * B.load(k,j );
2738  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
2739  }
2740  (~C).store( i, j , xmm1 * factor );
2741  (~C).store( i, j+IT::size, xmm2 * factor );
2742  }
2743  }
      // Remaining columns: a single vector starting at column j.
      // NOTE(review): when N-j is smaller than IT::size this load/store reaches past column N-1;
      // presumably the matrices provide padded storage - confirm.
2744  if( j < N ) {
2745  size_t i( 0UL );
2746  for( ; (i+2UL) <= M; i+=2UL ) {
2747  IntrinsicType xmm1, xmm2;
2748  for( size_t k=0UL; k<K; ++k ) {
2749  const IntrinsicType b1( B.load(k,j) );
2750  xmm1 = xmm1 + set( A(i ,k) ) * b1;
2751  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
2752  }
2753  (~C).store( i , j, xmm1 * factor );
2754  (~C).store( i+1UL, j, xmm2 * factor );
2755  }
2756  if( i < M ) {
2757  IntrinsicType xmm1;
2758  for( size_t k=0UL; k<K; ++k ) {
2759  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
2760  }
2761  (~C).store( i, j, xmm1 * factor );
2762  }
2763  }
2764  }
2765  //**********************************************************************************************
2766 
2767  //**Vectorized default assignment to column-major dense matrices********************************
// Vectorized default kernel for a column-major target, enabled via EnableIf when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. For every output position the products of
// vectors loaded from A with elements of B are accumulated over k, and the sum is multiplied by
// the scalar factor when stored to C. Rows are walked in steps of 8, 4, 2 and 1 times
// IT::size (presumably the number of elements per IntrinsicType - confirm in IntrinsicTrait).
// NOTE(review): all accumulators are default-constructed; the kernel relies on that yielding
// zeros - confirm against the IntrinsicType definition.
2781  template< typename MT3 // Type of the left-hand side target matrix
2782  , typename MT4 // Type of the left-hand side matrix operand
2783  , typename MT5 // Type of the right-hand side matrix operand
2784  , typename ST2 > // Type of the scalar value
2785  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2786  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
2787  {
2788  typedef IntrinsicTrait<ElementType> IT;
2789 
2790  const size_t M( A.rows() );
2791  const size_t N( B.columns() );
2792  const size_t K( A.columns() );
2793 
      // Scalar factor converted once via set() (presumably a broadcast to all lanes - confirm).
2794  const IntrinsicType factor( set( scalar ) );
2795 
2796  size_t i( 0UL );
2797 
      // Row blocks of 8*IT::size: one column of C at a time, eight accumulators.
2798  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2799  for( size_t j=0UL; j<N; ++j ) {
2800  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2801  for( size_t k=0UL; k<K; ++k ) {
2802  const IntrinsicType b1( set( B(k,j) ) );
2803  xmm1 = xmm1 + A.load(i ,k) * b1;
2804  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
2805  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
2806  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
2807  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
2808  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
2809  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
2810  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
2811  }
2812  (~C).store( i , j, xmm1 * factor );
2813  (~C).store( i+IT::size , j, xmm2 * factor );
2814  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
2815  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
2816  (~C).store( i+IT::size*4UL, j, xmm5 * factor );
2817  (~C).store( i+IT::size*5UL, j, xmm6 * factor );
2818  (~C).store( i+IT::size*6UL, j, xmm7 * factor );
2819  (~C).store( i+IT::size*7UL, j, xmm8 * factor );
2820  }
2821  }
      // Row blocks of 4*IT::size: two columns of C at a time (xmm1-4 for column j, xmm5-8 for column j+1).
2822  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2823  size_t j( 0UL );
2824  for( ; (j+2UL) <= N; j+=2UL ) {
2825  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2826  for( size_t k=0UL; k<K; ++k ) {
2827  const IntrinsicType a1( A.load(i ,k) );
2828  const IntrinsicType a2( A.load(i+IT::size ,k) );
2829  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
2830  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
2831  const IntrinsicType b1( set( B(k,j ) ) );
2832  const IntrinsicType b2( set( B(k,j+1UL) ) );
2833  xmm1 = xmm1 + a1 * b1;
2834  xmm2 = xmm2 + a2 * b1;
2835  xmm3 = xmm3 + a3 * b1;
2836  xmm4 = xmm4 + a4 * b1;
2837  xmm5 = xmm5 + a1 * b2;
2838  xmm6 = xmm6 + a2 * b2;
2839  xmm7 = xmm7 + a3 * b2;
2840  xmm8 = xmm8 + a4 * b2;
2841  }
2842  (~C).store( i , j , xmm1 * factor );
2843  (~C).store( i+IT::size , j , xmm2 * factor );
2844  (~C).store( i+IT::size*2UL, j , xmm3 * factor );
2845  (~C).store( i+IT::size*3UL, j , xmm4 * factor );
2846  (~C).store( i , j+1UL, xmm5 * factor );
2847  (~C).store( i+IT::size , j+1UL, xmm6 * factor );
2848  (~C).store( i+IT::size*2UL, j+1UL, xmm7 * factor );
2849  (~C).store( i+IT::size*3UL, j+1UL, xmm8 * factor );
2850  }
      // Odd number of columns: the last column is handled on its own.
2851  if( j < N ) {
2852  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2853  for( size_t k=0UL; k<K; ++k ) {
2854  const IntrinsicType b1( set( B(k,j) ) );
2855  xmm1 = xmm1 + A.load(i ,k) * b1;
2856  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
2857  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
2858  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
2859  }
2860  (~C).store( i , j, xmm1 * factor );
2861  (~C).store( i+IT::size , j, xmm2 * factor );
2862  (~C).store( i+IT::size*2UL, j, xmm3 * factor );
2863  (~C).store( i+IT::size*3UL, j, xmm4 * factor );
2864  }
2865  }
      // Row blocks of 2*IT::size: two columns of C at a time (xmm1-2 for column j, xmm3-4 for column j+1).
2866  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2867  size_t j( 0UL );
2868  for( ; (j+2UL) <= N; j+=2UL ) {
2869  IntrinsicType xmm1, xmm2, xmm3, xmm4;
2870  for( size_t k=0UL; k<K; ++k ) {
2871  const IntrinsicType a1( A.load(i ,k) );
2872  const IntrinsicType a2( A.load(i+IT::size,k) );
2873  const IntrinsicType b1( set( B(k,j ) ) );
2874  const IntrinsicType b2( set( B(k,j+1UL) ) );
2875  xmm1 = xmm1 + a1 * b1;
2876  xmm2 = xmm2 + a2 * b1;
2877  xmm3 = xmm3 + a1 * b2;
2878  xmm4 = xmm4 + a2 * b2;
2879  }
2880  (~C).store( i , j , xmm1 * factor );
2881  (~C).store( i+IT::size, j , xmm2 * factor );
2882  (~C).store( i , j+1UL, xmm3 * factor );
2883  (~C).store( i+IT::size, j+1UL, xmm4 * factor );
2884  }
      // Odd number of columns: the last column is handled on its own.
2885  if( j < N ) {
2886  IntrinsicType xmm1, xmm2;
2887  for( size_t k=0UL; k<K; ++k ) {
2888  const IntrinsicType b1( set( B(k,j) ) );
2889  xmm1 = xmm1 + A.load(i ,k) * b1;
2890  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
2891  }
2892  (~C).store( i , j, xmm1 * factor );
2893  (~C).store( i+IT::size, j, xmm2 * factor );
2894  }
2895  }
      // Remaining rows: a single vector starting at row i.
      // NOTE(review): when M-i is smaller than IT::size this load/store reaches past row M-1;
      // presumably the matrices provide padded storage - confirm.
2896  if( i < M ) {
2897  size_t j( 0UL );
2898  for( ; (j+2UL) <= N; j+=2UL ) {
2899  IntrinsicType xmm1, xmm2;
2900  for( size_t k=0UL; k<K; ++k ) {
2901  const IntrinsicType a1( A.load(i,k) );
2902  xmm1 = xmm1 + a1 * set( B(k,j ) );
2903  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
2904  }
2905  (~C).store( i, j , xmm1 * factor );
2906  (~C).store( i, j+1UL, xmm2 * factor );
2907  }
2908  if( j < N ) {
2909  IntrinsicType xmm1;
2910  for( size_t k=0UL; k<K; ++k ) {
2911  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
2912  }
2913  (~C).store( i, j, xmm1 * factor );
2914  }
2915  }
2916  }
2917  //**********************************************************************************************
2918 
2919  //**BLAS-based assignment to dense matrices (default)*******************************************
2933  template< typename MT3 // Type of the left-hand side target matrix
2934  , typename MT4 // Type of the left-hand side matrix operand
2935  , typename MT5 // Type of the right-hand side matrix operand
2936  , typename ST2 > // Type of the scalar value
2937  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
2938  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2939  {
2940  selectDefaultAssignKernel( C, A, B, scalar );
2941  }
2942  //**********************************************************************************************
2943 
2944  //**BLAS-based assignment to dense matrices (single precision)**********************************
2945 #if BLAZE_BLAS_MODE
2946 
2959  template< typename MT3 // Type of the left-hand side target matrix
2960  , typename MT4 // Type of the left-hand side matrix operand
2961  , typename MT5 // Type of the right-hand side matrix operand
2962  , typename ST2 > // Type of the scalar value
2963  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
2964  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
2965  {
2966  using boost::numeric_cast;
2967 
2971 
2972  const int M ( numeric_cast<int>( A.rows() ) );
2973  const int N ( numeric_cast<int>( B.columns() ) );
2974  const int K ( numeric_cast<int>( A.columns() ) );
2975  const int lda( numeric_cast<int>( A.spacing() ) );
2976  const int ldb( numeric_cast<int>( B.spacing() ) );
2977  const int ldc( numeric_cast<int>( C.spacing() ) );
2978 
2979  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
2980  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
2981  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
2982  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0F, C.data(), ldc );
2983  }
2984 #endif
2985  //**********************************************************************************************
2986 
2987  //**BLAS-based assignment to dense matrices (double precision)**********************************
2988 #if BLAZE_BLAS_MODE
2989 
3002  template< typename MT3 // Type of the left-hand side target matrix
3003  , typename MT4 // Type of the left-hand side matrix operand
3004  , typename MT5 // Type of the right-hand side matrix operand
3005  , typename ST2 > // Type of the scalar value
3006  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3007  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3008  {
3009  using boost::numeric_cast;
3010 
3014 
3015  const int M ( numeric_cast<int>( A.rows() ) );
3016  const int N ( numeric_cast<int>( B.columns() ) );
3017  const int K ( numeric_cast<int>( A.columns() ) );
3018  const int lda( numeric_cast<int>( A.spacing() ) );
3019  const int ldb( numeric_cast<int>( B.spacing() ) );
3020  const int ldc( numeric_cast<int>( C.spacing() ) );
3021 
3022  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3023  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3024  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3025  M, N, K, scalar, A.data(), lda, B.data(), ldb, 0.0, C.data(), ldc );
3026  }
3027 #endif
3028  //**********************************************************************************************
3029 
3030  //**BLAS-based assignment to dense matrices (single precision complex)**************************
3031 #if BLAZE_BLAS_MODE
3032 
3045  template< typename MT3 // Type of the left-hand side target matrix
3046  , typename MT4 // Type of the left-hand side matrix operand
3047  , typename MT5 // Type of the right-hand side matrix operand
3048  , typename ST2 > // Type of the scalar value
3049  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3050  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3051  {
3052  using boost::numeric_cast;
3053 
3057  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3058  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3059  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3060 
3061  const int M ( numeric_cast<int>( A.rows() ) );
3062  const int N ( numeric_cast<int>( B.columns() ) );
3063  const int K ( numeric_cast<int>( A.columns() ) );
3064  const int lda( numeric_cast<int>( A.spacing() ) );
3065  const int ldb( numeric_cast<int>( B.spacing() ) );
3066  const int ldc( numeric_cast<int>( C.spacing() ) );
3067  const complex<float> alpha( scalar );
3068  const complex<float> beta ( 0.0F, 0.0F );
3069 
3070  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3071  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3072  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3073  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3074  }
3075 #endif
3076  //**********************************************************************************************
3077 
3078  //**BLAS-based assignment to dense matrices (double precision complex)**************************
3079 #if BLAZE_BLAS_MODE
3080 
3093  template< typename MT3 // Type of the left-hand side target matrix
3094  , typename MT4 // Type of the left-hand side matrix operand
3095  , typename MT5 // Type of the right-hand side matrix operand
3096  , typename ST2 > // Type of the scalar value
3097  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3098  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3099  {
3100  using boost::numeric_cast;
3101 
3105  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3106  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3107  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3108 
3109  const int M ( numeric_cast<int>( A.rows() ) );
3110  const int N ( numeric_cast<int>( B.columns() ) );
3111  const int K ( numeric_cast<int>( A.columns() ) );
3112  const int lda( numeric_cast<int>( A.spacing() ) );
3113  const int ldb( numeric_cast<int>( B.spacing() ) );
3114  const int ldc( numeric_cast<int>( C.spacing() ) );
3115  const complex<double> alpha( scalar );
3116  const complex<double> beta ( 0.0, 0.0 );
3117 
3118  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3119  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3120  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3121  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3122  }
3123 #endif
3124  //**********************************************************************************************
3125 
3126  //**Assignment to sparse matrices***************************************************************
3138  template< typename MT // Type of the target sparse matrix
3139  , bool SO > // Storage order of the target sparse matrix
3140  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
3141  {
3143 
3144  typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3145 
3152 
3153  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3154  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3155 
3156  const TmpType tmp( rhs );
3157  assign( ~lhs, tmp );
3158  }
3159  //**********************************************************************************************
3160 
3161  //**Addition assignment to dense matrices*******************************************************
3173  template< typename MT3 // Type of the target dense matrix
3174  , bool SO > // Storage order of the target dense matrix
3175  friend inline void addAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3176  {
3178 
3179  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3180  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3181 
3182  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3183  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3184 
3185  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3186  return;
3187  }
3188 
3189  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3190  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3191 
3192  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3193  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3194  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3195  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3196  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3197  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3198 
3199  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
3200  DMatScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3201  else
3202  DMatScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
3203  }
3204  //**********************************************************************************************
3205 
3206  //**Default addition assignment to dense matrices***********************************************
3220  template< typename MT3 // Type of the left-hand side target matrix
3221  , typename MT4 // Type of the left-hand side matrix operand
3222  , typename MT5 // Type of the right-hand side matrix operand
3223  , typename ST2 > // Type of the scalar value
3224  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3225  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3226  {
3227  const ResultType tmp( A * B * scalar );
3228  addAssign( C, tmp );
3229  }
3230  //**********************************************************************************************
3231 
3232  //**Vectorized default addition assignment to row-major dense matrices**************************
// Vectorized default kernel for the addition assignment to a row-major target
// (DenseMatrix<MT3,false>): every processed chunk of C becomes C + ( A * B ) * scalar.
//
// C      : target matrix, read and written in vector chunks through load()/store()
// A      : left-hand side operand, read element-wise via A(i,k) and broadcast with set()
// B      : right-hand side operand, read in vector chunks via B.load(k,j)
// scalar : scaling factor, broadcast once into 'factor'
//
// The columns of C are traversed in blocks of 8, 4, 2 and finally 1 vector(s) of IT::size
// elements. In the 8-vector stage the rows are handled one at a time; in the remaining stages
// two rows are handled per iteration, followed by a single leftover row if M is odd.
//
// NOTE(review): the accumulators xmm1..xmm8 are default-constructed and immediately
// accumulated into; this presumably relies on IntrinsicType's default constructor yielding a
// zero vector -- confirm.
// NOTE(review): the block conditions only compare the start of the last vector with N, so the
// last chunk of a row may extend past column N; presumably the storage is padded -- confirm.
3246  template< typename MT3 // Type of the left-hand side target matrix
3247  , typename MT4 // Type of the left-hand side matrix operand
3248  , typename MT5 // Type of the right-hand side matrix operand
3249  , typename ST2 > // Type of the scalar value
3250  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3251  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3252  {
3253  typedef IntrinsicTrait<ElementType> IT;
3254 
3255  const size_t M( A.rows() );
3256  const size_t N( B.columns() );
3257  const size_t K( A.columns() );
3258 
// Scalar broadcast into a vector; applied once per chunk after the k-loop
3259  const IntrinsicType factor( set( scalar ) );
3260 
// Column index into C and B; shared by all stages so each stage continues where the previous stopped
3261  size_t j( 0UL );
3262 
// Stage 1: eight vectors per block of columns, one row at a time
3263  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3264  for( size_t i=0UL; i<M; ++i ) {
3265  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3266  for( size_t k=0UL; k<K; ++k ) {
3267  const IntrinsicType a1( set( A(i,k) ) );
3268  xmm1 = xmm1 + a1 * B.load(k,j );
3269  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3270  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3271  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3272  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3273  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3274  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3275  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
3276  }
// Add the scaled accumulators onto the current content of C
3277  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3278  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3279  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3280  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
3281  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) + xmm5 * factor );
3282  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) + xmm6 * factor );
3283  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) + xmm7 * factor );
3284  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) + xmm8 * factor );
3285  }
3286  }
// Stage 2: four vectors per block of columns, two rows per iteration (xmm1-4: row i, xmm5-8: row i+1)
3287  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3288  size_t i( 0UL );
3289  for( ; (i+2UL) <= M; i+=2UL ) {
3290  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3291  for( size_t k=0UL; k<K; ++k ) {
3292  const IntrinsicType a1( set( A(i ,k) ) );
3293  const IntrinsicType a2( set( A(i+1UL,k) ) );
3294  const IntrinsicType b1( B.load(k,j ) );
3295  const IntrinsicType b2( B.load(k,j+IT::size ) );
3296  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
3297  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
3298  xmm1 = xmm1 + a1 * b1;
3299  xmm2 = xmm2 + a1 * b2;
3300  xmm3 = xmm3 + a1 * b3;
3301  xmm4 = xmm4 + a1 * b4;
3302  xmm5 = xmm5 + a2 * b1;
3303  xmm6 = xmm6 + a2 * b2;
3304  xmm7 = xmm7 + a2 * b3;
3305  xmm8 = xmm8 + a2 * b4;
3306  }
3307  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3308  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) + xmm2 * factor );
3309  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) + xmm3 * factor );
3310  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) + xmm4 * factor );
3311  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
3312  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) + xmm6 * factor );
3313  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) + xmm7 * factor );
3314  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) + xmm8 * factor );
3315  }
// Leftover single row when M is odd
3316  if( i < M ) {
3317  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3318  for( size_t k=0UL; k<K; ++k ) {
3319  const IntrinsicType a1( set( A(i,k) ) );
3320  xmm1 = xmm1 + a1 * B.load(k,j );
3321  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3322  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3323  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3324  }
3325  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3326  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) + xmm2 * factor );
3327  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) + xmm3 * factor );
3328  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) + xmm4 * factor );
3329  }
3330  }
// Stage 3: two vectors per block of columns, two rows per iteration (xmm1-2: row i, xmm3-4: row i+1)
3331  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3332  size_t i( 0UL );
3333  for( ; (i+2UL) <= M; i+=2UL ) {
3334  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3335  for( size_t k=0UL; k<K; ++k ) {
3336  const IntrinsicType a1( set( A(i ,k) ) );
3337  const IntrinsicType a2( set( A(i+1UL,k) ) );
3338  const IntrinsicType b1( B.load(k,j ) );
3339  const IntrinsicType b2( B.load(k,j+IT::size) );
3340  xmm1 = xmm1 + a1 * b1;
3341  xmm2 = xmm2 + a1 * b2;
3342  xmm3 = xmm3 + a2 * b1;
3343  xmm4 = xmm4 + a2 * b2;
3344  }
3345  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3346  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) + xmm2 * factor );
3347  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
3348  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) + xmm4 * factor );
3349  }
// Leftover single row when M is odd
3350  if( i < M ) {
3351  IntrinsicType xmm1, xmm2;
3352  for( size_t k=0UL; k<K; ++k ) {
3353  const IntrinsicType a1( set( A(i,k) ) );
3354  xmm1 = xmm1 + a1 * B.load(k,j );
3355  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3356  }
3357  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3358  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) + xmm2 * factor );
3359  }
3360  }
// Stage 4: one remaining vector of columns, two rows per iteration
3361  if( j < N ) {
3362  size_t i( 0UL );
3363  for( ; (i+2UL) <= M; i+=2UL ) {
3364  IntrinsicType xmm1, xmm2;
3365  for( size_t k=0UL; k<K; ++k ) {
3366  const IntrinsicType b1( B.load(k,j) );
3367  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3368  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3369  }
3370  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3371  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
3372  }
// Leftover single row when M is odd
3373  if( i < M ) {
3374  IntrinsicType xmm1;
3375  for( size_t k=0UL; k<K; ++k ) {
3376  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
3377  }
3378  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
3379  }
3380  }
3381  }
3382  //**********************************************************************************************
3383 
3384  //**Vectorized default addition assignment to column-major dense matrices***********************
// Vectorized default kernel for the addition assignment to a column-major target
// (DenseMatrix<MT3,true>): every processed chunk of C becomes C + ( A * B ) * scalar.
//
// C      : target matrix, read and written in vector chunks through load()/store()
// A      : left-hand side operand, read in vector chunks via A.load(i,k)
// B      : right-hand side operand, read element-wise via B(k,j) and broadcast with set()
// scalar : scaling factor, broadcast once into 'factor'
//
// The rows of C are traversed in blocks of 8, 4, 2 and finally 1 vector(s) of IT::size
// elements. In the 8-vector stage the columns are handled one at a time; in the remaining
// stages two columns are handled per iteration, followed by a single leftover column if N is
// odd.
//
// NOTE(review): the accumulators xmm1..xmm8 are default-constructed and immediately
// accumulated into; this presumably relies on IntrinsicType's default constructor yielding a
// zero vector -- confirm.
// NOTE(review): the block conditions only compare the start of the last vector with M, so the
// last chunk of a column may extend past row M; presumably the storage is padded -- confirm.
3398  template< typename MT3 // Type of the left-hand side target matrix
3399  , typename MT4 // Type of the left-hand side matrix operand
3400  , typename MT5 // Type of the right-hand side matrix operand
3401  , typename ST2 > // Type of the scalar value
3402  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3403  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3404  {
3405  typedef IntrinsicTrait<ElementType> IT;
3406 
3407  const size_t M( A.rows() );
3408  const size_t N( B.columns() );
3409  const size_t K( A.columns() );
3410 
// Scalar broadcast into a vector; applied once per chunk after the k-loop
3411  const IntrinsicType factor( set( scalar ) );
3412 
// Row index into C and A; shared by all stages so each stage continues where the previous stopped
3413  size_t i( 0UL );
3414 
// Stage 1: eight vectors per block of rows, one column at a time
3415  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3416  for( size_t j=0UL; j<N; ++j ) {
3417  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3418  for( size_t k=0UL; k<K; ++k ) {
3419  const IntrinsicType b1( set( B(k,j) ) );
3420  xmm1 = xmm1 + A.load(i ,k) * b1;
3421  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3422  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3423  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3424  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
3425  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
3426  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
3427  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
3428  }
// Add the scaled accumulators onto the current content of C
3429  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3430  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3431  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3432  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
3433  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) + xmm5 * factor );
3434  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) + xmm6 * factor );
3435  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) + xmm7 * factor );
3436  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) + xmm8 * factor );
3437  }
3438  }
// Stage 2: four vectors per block of rows, two columns per iteration (xmm1-4: column j, xmm5-8: column j+1)
3439  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3440  size_t j( 0UL );
3441  for( ; (j+2UL) <= N; j+=2UL ) {
3442  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3443  for( size_t k=0UL; k<K; ++k ) {
3444  const IntrinsicType a1( A.load(i ,k) );
3445  const IntrinsicType a2( A.load(i+IT::size ,k) );
3446  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
3447  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
3448  const IntrinsicType b1( set( B(k,j ) ) );
3449  const IntrinsicType b2( set( B(k,j+1UL) ) );
3450  xmm1 = xmm1 + a1 * b1;
3451  xmm2 = xmm2 + a2 * b1;
3452  xmm3 = xmm3 + a3 * b1;
3453  xmm4 = xmm4 + a4 * b1;
3454  xmm5 = xmm5 + a1 * b2;
3455  xmm6 = xmm6 + a2 * b2;
3456  xmm7 = xmm7 + a3 * b2;
3457  xmm8 = xmm8 + a4 * b2;
3458  }
3459  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3460  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) + xmm2 * factor );
3461  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) + xmm3 * factor );
3462  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) + xmm4 * factor );
3463  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
3464  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) + xmm6 * factor );
3465  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) + xmm7 * factor );
3466  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) + xmm8 * factor );
3467  }
// Leftover single column when N is odd
3468  if( j < N ) {
3469  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3470  for( size_t k=0UL; k<K; ++k ) {
3471  const IntrinsicType b1( set( B(k,j) ) );
3472  xmm1 = xmm1 + A.load(i ,k) * b1;
3473  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
3474  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
3475  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
3476  }
3477  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3478  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) + xmm2 * factor );
3479  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) + xmm3 * factor );
3480  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) + xmm4 * factor );
3481  }
3482  }
// Stage 3: two vectors per block of rows, two columns per iteration (xmm1-2: column j, xmm3-4: column j+1)
3483  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3484  size_t j( 0UL );
3485  for( ; (j+2UL) <= N; j+=2UL ) {
3486  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3487  for( size_t k=0UL; k<K; ++k ) {
3488  const IntrinsicType a1( A.load(i ,k) );
3489  const IntrinsicType a2( A.load(i+IT::size,k) );
3490  const IntrinsicType b1( set( B(k,j ) ) );
3491  const IntrinsicType b2( set( B(k,j+1UL) ) );
3492  xmm1 = xmm1 + a1 * b1;
3493  xmm2 = xmm2 + a2 * b1;
3494  xmm3 = xmm3 + a1 * b2;
3495  xmm4 = xmm4 + a2 * b2;
3496  }
3497  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
3498  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) + xmm2 * factor );
3499  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
3500  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) + xmm4 * factor );
3501  }
// Leftover single column when N is odd
3502  if( j < N ) {
3503  IntrinsicType xmm1, xmm2;
3504  for( size_t k=0UL; k<K; ++k ) {
3505  const IntrinsicType b1( set( B(k,j) ) );
3506  xmm1 = xmm1 + A.load(i ,k) * b1;
3507  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
3508  }
3509  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
3510  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) + xmm2 * factor );
3511  }
3512  }
// Stage 4: one remaining vector of rows, two columns per iteration
3513  if( i < M ) {
3514  size_t j( 0UL );
3515  for( ; (j+2UL) <= N; j+=2UL ) {
3516  IntrinsicType xmm1, xmm2;
3517  for( size_t k=0UL; k<K; ++k ) {
3518  const IntrinsicType a1( A.load(i,k) );
3519  xmm1 = xmm1 + a1 * set( B(k,j ) );
3520  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
3521  }
3522  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
3523  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
3524  }
// Leftover single column when N is odd
3525  if( j < N ) {
3526  IntrinsicType xmm1;
3527  for( size_t k=0UL; k<K; ++k ) {
3528  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
3529  }
3530  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
3531  }
3532  }
3533  }
3534  //**********************************************************************************************
3535 
3536  //**BLAS-based addition assignment to dense matrices (default)**********************************
3550  template< typename MT3 // Type of the left-hand side target matrix
3551  , typename MT4 // Type of the left-hand side matrix operand
3552  , typename MT5 // Type of the right-hand side matrix operand
3553  , typename ST2 > // Type of the scalar value
3554  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3555  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3556  {
3557  selectDefaultAddAssignKernel( C, A, B, scalar );
3558  }
3559  //**********************************************************************************************
3560 
3561  //**BLAS-based addition assignment to dense matrices (single precision)*************************
3562 #if BLAZE_BLAS_MODE
3563 
3576  template< typename MT3 // Type of the left-hand side target matrix
3577  , typename MT4 // Type of the left-hand side matrix operand
3578  , typename MT5 // Type of the right-hand side matrix operand
3579  , typename ST2 > // Type of the scalar value
3580  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3581  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3582  {
3583  using boost::numeric_cast;
3584 
3588 
3589  const int M ( numeric_cast<int>( A.rows() ) );
3590  const int N ( numeric_cast<int>( B.columns() ) );
3591  const int K ( numeric_cast<int>( A.columns() ) );
3592  const int lda( numeric_cast<int>( A.spacing() ) );
3593  const int ldb( numeric_cast<int>( B.spacing() ) );
3594  const int ldc( numeric_cast<int>( C.spacing() ) );
3595 
3596  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3597  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3598  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3599  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
3600  }
3601 #endif
3602  //**********************************************************************************************
3603 
3604  //**BLAS-based addition assignment to dense matrices (double precision)*************************
3605 #if BLAZE_BLAS_MODE
3606 
3619  template< typename MT3 // Type of the left-hand side target matrix
3620  , typename MT4 // Type of the left-hand side matrix operand
3621  , typename MT5 // Type of the right-hand side matrix operand
3622  , typename ST2 > // Type of the scalar value
3623  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
3624  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3625  {
3626  using boost::numeric_cast;
3627 
3631 
3632  const int M ( numeric_cast<int>( A.rows() ) );
3633  const int N ( numeric_cast<int>( B.columns() ) );
3634  const int K ( numeric_cast<int>( A.columns() ) );
3635  const int lda( numeric_cast<int>( A.spacing() ) );
3636  const int ldb( numeric_cast<int>( B.spacing() ) );
3637  const int ldc( numeric_cast<int>( C.spacing() ) );
3638 
3639  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3640  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3641  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3642  M, N, K, scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
3643  }
3644 #endif
3645  //**********************************************************************************************
3646 
3647  //**BLAS-based addition assignment to dense matrices (single precision complex)*****************
3648 #if BLAZE_BLAS_MODE
3649 
3662  template< typename MT3 // Type of the left-hand side target matrix
3663  , typename MT4 // Type of the left-hand side matrix operand
3664  , typename MT5 // Type of the right-hand side matrix operand
3665  , typename ST2 > // Type of the scalar value
3666  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3667  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3668  {
3669  using boost::numeric_cast;
3670 
3674  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
3675  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
3676  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
3677 
3678  const int M ( numeric_cast<int>( A.rows() ) );
3679  const int N ( numeric_cast<int>( B.columns() ) );
3680  const int K ( numeric_cast<int>( A.columns() ) );
3681  const int lda( numeric_cast<int>( A.spacing() ) );
3682  const int ldb( numeric_cast<int>( B.spacing() ) );
3683  const int ldc( numeric_cast<int>( C.spacing() ) );
3684  const complex<float> alpha( scalar );
3685  const complex<float> beta ( 1.0F, 0.0F );
3686 
3687  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3688  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3689  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3690  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3691  }
3692 #endif
3693  //**********************************************************************************************
3694 
3695  //**BLAS-based addition assignment to dense matrices (double precision complex)*****************
3696 #if BLAZE_BLAS_MODE
3697 
3710  template< typename MT3 // Type of the left-hand side target matrix
3711  , typename MT4 // Type of the left-hand side matrix operand
3712  , typename MT5 // Type of the right-hand side matrix operand
3713  , typename ST2 > // Type of the scalar value
3714  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3715  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3716  {
3717  using boost::numeric_cast;
3718 
3722  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
3723  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
3724  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
3725 
3726  const int M ( numeric_cast<int>( A.rows() ) );
3727  const int N ( numeric_cast<int>( B.columns() ) );
3728  const int K ( numeric_cast<int>( A.columns() ) );
3729  const int lda( numeric_cast<int>( A.spacing() ) );
3730  const int ldb( numeric_cast<int>( B.spacing() ) );
3731  const int ldc( numeric_cast<int>( C.spacing() ) );
3732  const complex<double> alpha( scalar );
3733  const complex<double> beta ( 1.0, 0.0 );
3734 
3735  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
3736  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
3737  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
3738  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
3739  }
3740 #endif
3741  //**********************************************************************************************
3742 
3743  //**Addition assignment to sparse matrices******************************************************
3744  // No special implementation for the addition assignment to sparse matrices.
3745  //**********************************************************************************************
3746 
3747  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a scaled matrix-matrix multiplication expression to a dense matrix.
//
// lhs : the target dense matrix (any storage order SO).
// rhs : the scaled multiplication expression; its operands are read via rhs.matrix_ and its
//       scalar via rhs.scalar_.
//
// The operands are evaluated into A and B and the work is forwarded to either the default
// kernel or the BLAS kernel, depending on the size of the target matrix.
3759  template< typename MT3 // Type of the target dense matrix
3760  , bool SO > // Storage order of the target dense matrix
3761  friend inline void subAssign( DenseMatrix<MT3,SO>& lhs, const DMatScalarMultExpr& rhs )
3762  {
3764 
3765  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3766  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3767 
3768  typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
3769  typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
3770 
      // Nothing to subtract if the target is empty or the inner dimension is zero.
3771  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
3772  return;
3773  }
3774 
3775  LT A( left ); // Evaluation of the left-hand side dense matrix operand
3776  RT B( right ); // Evaluation of the right-hand side dense matrix operand
3777 
3778  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3779  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
3780  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
3781  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
3782  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3783  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
3784 
      // Targets with fewer elements than TDMATDMATMULT_THRESHOLD use the default kernel,
      // all others are handed to the BLAS kernel selection.
3785  if( (~lhs).rows() * (~lhs).columns() < TDMATDMATMULT_THRESHOLD )
3786  DMatScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3787  else
3788  DMatScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
3789  }
3790  //**********************************************************************************************
3791 
3792  //**Default subtraction assignment to dense matrices********************************************
// Default (non-vectorized) subtraction assignment kernel.
//
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not apply. The scaled product
// A * B * scalar is first evaluated into a temporary of type ResultType, which is then
// subtracted from C via subAssign().
//
// C      : target matrix.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor applied to the product.
3806  template< typename MT3 // Type of the left-hand side target matrix
3807  , typename MT4 // Type of the left-hand side matrix operand
3808  , typename MT5 // Type of the right-hand side matrix operand
3809  , typename ST2 > // Type of the scalar value
3810  static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3811  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
3812  {
3813  const ResultType tmp( A * B * scalar );
3814  subAssign( C, tmp );
3815  }
3816  //**********************************************************************************************
3817 
3818  //**Vectorized default subtraction assignment to row-major dense matrices***********************
// Vectorized default subtraction assignment kernel for row-major target matrices.
//
// Computes C -= ( A * B ) * scalar with intrinsic vectors. Each vector covers IT::size
// consecutive columns of one row of B and C; the elements of A are broadcast via set().
// The columns are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s). In the
// 4-, 2- and 1-vector blocks two rows of C are computed per pass, with a single-row remainder.
//
// C      : row-major target matrix.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor, broadcast once into 'factor'.
//
// NOTE(review): the xmm accumulators are default-constructed and never explicitly zeroed
// here - presumably IntrinsicType's default constructor zero-initializes; confirm.
// NOTE(review): the loop bounds only require the first element of the last vector to be
// below N, so loads/stores may reach past column N - presumably the rows of B and C are
// padded to a multiple of IT::size; confirm.
3832  template< typename MT3 // Type of the left-hand side target matrix
3833  , typename MT4 // Type of the left-hand side matrix operand
3834  , typename MT5 // Type of the right-hand side matrix operand
3835  , typename ST2 > // Type of the scalar value
3836  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3837  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
3838  {
3839  typedef IntrinsicTrait<ElementType> IT;
3840 
3841  const size_t M( A.rows() );
3842  const size_t N( B.columns() );
3843  const size_t K( A.columns() );
3844 
3845  const IntrinsicType factor( set( scalar ) );
3846 
3847  size_t j( 0UL );
3848 
      // Column blocks of 8 intrinsic vectors, one row of C per pass.
3849  for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3850  for( size_t i=0UL; i<M; ++i ) {
3851  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3852  for( size_t k=0UL; k<K; ++k ) {
3853  const IntrinsicType a1( set( A(i,k) ) );
3854  xmm1 = xmm1 + a1 * B.load(k,j );
3855  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3856  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3857  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3858  xmm5 = xmm5 + a1 * B.load(k,j+IT::size*4UL);
3859  xmm6 = xmm6 + a1 * B.load(k,j+IT::size*5UL);
3860  xmm7 = xmm7 + a1 * B.load(k,j+IT::size*6UL);
3861  xmm8 = xmm8 + a1 * B.load(k,j+IT::size*7UL);
3862  }
3863  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
3864  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
3865  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
3866  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
3867  (~C).store( i, j+IT::size*4UL, (~C).load(i,j+IT::size*4UL) - xmm5 * factor );
3868  (~C).store( i, j+IT::size*5UL, (~C).load(i,j+IT::size*5UL) - xmm6 * factor );
3869  (~C).store( i, j+IT::size*6UL, (~C).load(i,j+IT::size*6UL) - xmm7 * factor );
3870  (~C).store( i, j+IT::size*7UL, (~C).load(i,j+IT::size*7UL) - xmm8 * factor );
3871  }
3872  }
      // Column blocks of 4 intrinsic vectors, two rows of C per pass (plus one remainder row).
3873  for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3874  size_t i( 0UL );
3875  for( ; (i+2UL) <= M; i+=2UL ) {
3876  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3877  for( size_t k=0UL; k<K; ++k ) {
3878  const IntrinsicType a1( set( A(i ,k) ) );
3879  const IntrinsicType a2( set( A(i+1UL,k) ) );
3880  const IntrinsicType b1( B.load(k,j ) );
3881  const IntrinsicType b2( B.load(k,j+IT::size ) );
3882  const IntrinsicType b3( B.load(k,j+IT::size*2UL) );
3883  const IntrinsicType b4( B.load(k,j+IT::size*3UL) );
3884  xmm1 = xmm1 + a1 * b1;
3885  xmm2 = xmm2 + a1 * b2;
3886  xmm3 = xmm3 + a1 * b3;
3887  xmm4 = xmm4 + a1 * b4;
3888  xmm5 = xmm5 + a2 * b1;
3889  xmm6 = xmm6 + a2 * b2;
3890  xmm7 = xmm7 + a2 * b3;
3891  xmm8 = xmm8 + a2 * b4;
3892  }
3893  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
3894  (~C).store( i , j+IT::size , (~C).load(i ,j+IT::size ) - xmm2 * factor );
3895  (~C).store( i , j+IT::size*2UL, (~C).load(i ,j+IT::size*2UL) - xmm3 * factor );
3896  (~C).store( i , j+IT::size*3UL, (~C).load(i ,j+IT::size*3UL) - xmm4 * factor );
3897  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
3898  (~C).store( i+1UL, j+IT::size , (~C).load(i+1UL,j+IT::size ) - xmm6 * factor );
3899  (~C).store( i+1UL, j+IT::size*2UL, (~C).load(i+1UL,j+IT::size*2UL) - xmm7 * factor );
3900  (~C).store( i+1UL, j+IT::size*3UL, (~C).load(i+1UL,j+IT::size*3UL) - xmm8 * factor );
3901  }
3902  if( i < M ) {
3903  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3904  for( size_t k=0UL; k<K; ++k ) {
3905  const IntrinsicType a1( set( A(i,k) ) );
3906  xmm1 = xmm1 + a1 * B.load(k,j );
3907  xmm2 = xmm2 + a1 * B.load(k,j+IT::size );
3908  xmm3 = xmm3 + a1 * B.load(k,j+IT::size*2UL);
3909  xmm4 = xmm4 + a1 * B.load(k,j+IT::size*3UL);
3910  }
3911  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
3912  (~C).store( i, j+IT::size , (~C).load(i,j+IT::size ) - xmm2 * factor );
3913  (~C).store( i, j+IT::size*2UL, (~C).load(i,j+IT::size*2UL) - xmm3 * factor );
3914  (~C).store( i, j+IT::size*3UL, (~C).load(i,j+IT::size*3UL) - xmm4 * factor );
3915  }
3916  }
      // Column blocks of 2 intrinsic vectors, two rows of C per pass (plus one remainder row).
3917  for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3918  size_t i( 0UL );
3919  for( ; (i+2UL) <= M; i+=2UL ) {
3920  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3921  for( size_t k=0UL; k<K; ++k ) {
3922  const IntrinsicType a1( set( A(i ,k) ) );
3923  const IntrinsicType a2( set( A(i+1UL,k) ) );
3924  const IntrinsicType b1( B.load(k,j ) );
3925  const IntrinsicType b2( B.load(k,j+IT::size) );
3926  xmm1 = xmm1 + a1 * b1;
3927  xmm2 = xmm2 + a1 * b2;
3928  xmm3 = xmm3 + a2 * b1;
3929  xmm4 = xmm4 + a2 * b2;
3930  }
3931  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
3932  (~C).store( i , j+IT::size, (~C).load(i ,j+IT::size) - xmm2 * factor );
3933  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
3934  (~C).store( i+1UL, j+IT::size, (~C).load(i+1UL,j+IT::size) - xmm4 * factor );
3935  }
3936  if( i < M ) {
3937  IntrinsicType xmm1, xmm2;
3938  for( size_t k=0UL; k<K; ++k ) {
3939  const IntrinsicType a1( set( A(i,k) ) );
3940  xmm1 = xmm1 + a1 * B.load(k,j );
3941  xmm2 = xmm2 + a1 * B.load(k,j+IT::size);
3942  }
3943  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
3944  (~C).store( i, j+IT::size, (~C).load(i,j+IT::size) - xmm2 * factor );
3945  }
3946  }
      // Final single intrinsic vector of columns, two rows of C per pass (plus one remainder row).
3947  if( j < N ) {
3948  size_t i( 0UL );
3949  for( ; (i+2UL) <= M; i+=2UL ) {
3950  IntrinsicType xmm1, xmm2;
3951  for( size_t k=0UL; k<K; ++k ) {
3952  const IntrinsicType b1( B.load(k,j) );
3953  xmm1 = xmm1 + set( A(i ,k) ) * b1;
3954  xmm2 = xmm2 + set( A(i+1UL,k) ) * b1;
3955  }
3956  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
3957  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
3958  }
3959  if( i < M ) {
3960  IntrinsicType xmm1;
3961  for( size_t k=0UL; k<K; ++k ) {
3962  xmm1 = xmm1 + set( A(i,k) ) * B.load(k,j);
3963  }
3964  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
3965  }
3966  }
3967  }
3968  //**********************************************************************************************
3969 
3970  //**Vectorized default subtraction assignment to column-major dense matrices********************
// Vectorized default subtraction assignment kernel for column-major target matrices.
//
// Computes C -= ( A * B ) * scalar with intrinsic vectors. Each vector covers IT::size
// consecutive rows of one column of A and C; the elements of B are broadcast via set().
// The rows are processed in blocks of 8, 4, 2 and finally 1 intrinsic vector(s). In the
// 4-, 2- and 1-vector blocks two columns of C are computed per pass, with a single-column
// remainder.
//
// C      : column-major target matrix.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor, broadcast once into 'factor'.
//
// NOTE(review): the xmm accumulators are default-constructed and never explicitly zeroed
// here - presumably IntrinsicType's default constructor zero-initializes; confirm.
// NOTE(review): the loop bounds only require the first element of the last vector to be
// below M, so loads/stores may reach past row M - presumably the columns of A and C are
// padded to a multiple of IT::size; confirm.
3984  template< typename MT3 // Type of the left-hand side target matrix
3985  , typename MT4 // Type of the left-hand side matrix operand
3986  , typename MT5 // Type of the right-hand side matrix operand
3987  , typename ST2 > // Type of the scalar value
3988  static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
3989  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
3990  {
3991  typedef IntrinsicTrait<ElementType> IT;
3992 
3993  const size_t M( A.rows() );
3994  const size_t N( B.columns() );
3995  const size_t K( A.columns() );
3996 
3997  const IntrinsicType factor( set( scalar ) );
3998 
3999  size_t i( 0UL );
4000 
      // Row blocks of 8 intrinsic vectors, one column of C per pass.
4001  for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
4002  for( size_t j=0UL; j<N; ++j ) {
4003  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4004  for( size_t k=0UL; k<K; ++k ) {
4005  const IntrinsicType b1( set( B(k,j) ) );
4006  xmm1 = xmm1 + A.load(i ,k) * b1;
4007  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4008  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4009  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4010  xmm5 = xmm5 + A.load(i+IT::size*4UL,k) * b1;
4011  xmm6 = xmm6 + A.load(i+IT::size*5UL,k) * b1;
4012  xmm7 = xmm7 + A.load(i+IT::size*6UL,k) * b1;
4013  xmm8 = xmm8 + A.load(i+IT::size*7UL,k) * b1;
4014  }
4015  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4016  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4017  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4018  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
4019  (~C).store( i+IT::size*4UL, j, (~C).load(i+IT::size*4UL,j) - xmm5 * factor );
4020  (~C).store( i+IT::size*5UL, j, (~C).load(i+IT::size*5UL,j) - xmm6 * factor );
4021  (~C).store( i+IT::size*6UL, j, (~C).load(i+IT::size*6UL,j) - xmm7 * factor );
4022  (~C).store( i+IT::size*7UL, j, (~C).load(i+IT::size*7UL,j) - xmm8 * factor );
4023  }
4024  }
      // Row blocks of 4 intrinsic vectors, two columns of C per pass (plus one remainder column).
4025  for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
4026  size_t j( 0UL );
4027  for( ; (j+2UL) <= N; j+=2UL ) {
4028  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4029  for( size_t k=0UL; k<K; ++k ) {
4030  const IntrinsicType a1( A.load(i ,k) );
4031  const IntrinsicType a2( A.load(i+IT::size ,k) );
4032  const IntrinsicType a3( A.load(i+IT::size*2UL,k) );
4033  const IntrinsicType a4( A.load(i+IT::size*3UL,k) );
4034  const IntrinsicType b1( set( B(k,j ) ) );
4035  const IntrinsicType b2( set( B(k,j+1UL) ) );
4036  xmm1 = xmm1 + a1 * b1;
4037  xmm2 = xmm2 + a2 * b1;
4038  xmm3 = xmm3 + a3 * b1;
4039  xmm4 = xmm4 + a4 * b1;
4040  xmm5 = xmm5 + a1 * b2;
4041  xmm6 = xmm6 + a2 * b2;
4042  xmm7 = xmm7 + a3 * b2;
4043  xmm8 = xmm8 + a4 * b2;
4044  }
4045  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4046  (~C).store( i+IT::size , j , (~C).load(i+IT::size ,j ) - xmm2 * factor );
4047  (~C).store( i+IT::size*2UL, j , (~C).load(i+IT::size*2UL,j ) - xmm3 * factor );
4048  (~C).store( i+IT::size*3UL, j , (~C).load(i+IT::size*3UL,j ) - xmm4 * factor );
4049  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
4050  (~C).store( i+IT::size , j+1UL, (~C).load(i+IT::size ,j+1UL) - xmm6 * factor );
4051  (~C).store( i+IT::size*2UL, j+1UL, (~C).load(i+IT::size*2UL,j+1UL) - xmm7 * factor );
4052  (~C).store( i+IT::size*3UL, j+1UL, (~C).load(i+IT::size*3UL,j+1UL) - xmm8 * factor );
4053  }
4054  if( j < N ) {
4055  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4056  for( size_t k=0UL; k<K; ++k ) {
4057  const IntrinsicType b1( set( B(k,j) ) );
4058  xmm1 = xmm1 + A.load(i ,k) * b1;
4059  xmm2 = xmm2 + A.load(i+IT::size ,k) * b1;
4060  xmm3 = xmm3 + A.load(i+IT::size*2UL,k) * b1;
4061  xmm4 = xmm4 + A.load(i+IT::size*3UL,k) * b1;
4062  }
4063  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4064  (~C).store( i+IT::size , j, (~C).load(i+IT::size ,j) - xmm2 * factor );
4065  (~C).store( i+IT::size*2UL, j, (~C).load(i+IT::size*2UL,j) - xmm3 * factor );
4066  (~C).store( i+IT::size*3UL, j, (~C).load(i+IT::size*3UL,j) - xmm4 * factor );
4067  }
4068  }
      // Row blocks of 2 intrinsic vectors, two columns of C per pass (plus one remainder column).
4069  for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
4070  size_t j( 0UL );
4071  for( ; (j+2UL) <= N; j+=2UL ) {
4072  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4073  for( size_t k=0UL; k<K; ++k ) {
4074  const IntrinsicType a1( A.load(i ,k) );
4075  const IntrinsicType a2( A.load(i+IT::size,k) );
4076  const IntrinsicType b1( set( B(k,j ) ) );
4077  const IntrinsicType b2( set( B(k,j+1UL) ) );
4078  xmm1 = xmm1 + a1 * b1;
4079  xmm2 = xmm2 + a2 * b1;
4080  xmm3 = xmm3 + a1 * b2;
4081  xmm4 = xmm4 + a2 * b2;
4082  }
4083  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
4084  (~C).store( i+IT::size, j , (~C).load(i+IT::size,j ) - xmm2 * factor );
4085  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
4086  (~C).store( i+IT::size, j+1UL, (~C).load(i+IT::size,j+1UL) - xmm4 * factor );
4087  }
4088  if( j < N ) {
4089  IntrinsicType xmm1, xmm2;
4090  for( size_t k=0UL; k<K; ++k ) {
4091  const IntrinsicType b1( set( B(k,j) ) );
4092  xmm1 = xmm1 + A.load(i ,k) * b1;
4093  xmm2 = xmm2 + A.load(i+IT::size,k) * b1;
4094  }
4095  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
4096  (~C).store( i+IT::size, j, (~C).load(i+IT::size,j) - xmm2 * factor );
4097  }
4098  }
      // Final single intrinsic vector of rows, two columns of C per pass (plus one remainder column).
4099  if( i < M ) {
4100  size_t j( 0UL );
4101  for( ; (j+2UL) <= N; j+=2UL ) {
4102  IntrinsicType xmm1, xmm2;
4103  for( size_t k=0UL; k<K; ++k ) {
4104  const IntrinsicType a1( A.load(i,k) );
4105  xmm1 = xmm1 + a1 * set( B(k,j ) );
4106  xmm2 = xmm2 + a1 * set( B(k,j+1UL) );
4107  }
4108  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
4109  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
4110  }
4111  if( j < N ) {
4112  IntrinsicType xmm1;
4113  for( size_t k=0UL; k<K; ++k ) {
4114  xmm1 = xmm1 + A.load(i,k) * set( B(k,j) );
4115  }
4116  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
4117  }
4118  }
4119  }
4120  //**********************************************************************************************
4121 
4122  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS-based subtraction assignment kernel - default fallback.
//
// Selected when UseDefaultKernel<MT3,MT4,MT5,ST2> applies. No BLAS routine is called in this
// case; the call is simply forwarded to selectDefaultSubAssignKernel() with the same arguments.
4136  template< typename MT3 // Type of the left-hand side target matrix
4137  , typename MT4 // Type of the left-hand side matrix operand
4138  , typename MT5 // Type of the right-hand side matrix operand
4139  , typename ST2 > // Type of the scalar value
4140  static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4141  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4142  {
4143  selectDefaultSubAssignKernel( C, A, B, scalar );
4144  }
4145  //**********************************************************************************************
4146 
4147  //**BLAS-based subtraction assignment to dense matrices (single precision)**********************
4148 #if BLAZE_BLAS_MODE
4149 
// BLAS-based subtraction assignment kernel for single precision operands (cblas_sgemm).
//
// Selected when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> applies. The subtraction is expressed
// as a GEMM update with alpha = -scalar and beta = 1, i.e. C = -scalar * A * B + C.
//
// C      : target matrix; its storage order selects the CBLAS layout.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor of the product.
4162  template< typename MT3 // Type of the left-hand side target matrix
4163  , typename MT4 // Type of the left-hand side matrix operand
4164  , typename MT5 // Type of the right-hand side matrix operand
4165  , typename ST2 > // Type of the scalar value
4166  static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4167  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4168  {
4169  using boost::numeric_cast;
4170 
4174 
      // CBLAS takes 'int' dimensions; numeric_cast performs range-checked conversions.
4175  const int M ( numeric_cast<int>( A.rows() ) );
4176  const int N ( numeric_cast<int>( B.columns() ) );
4177  const int K ( numeric_cast<int>( A.columns() ) );
4178  const int lda( numeric_cast<int>( A.spacing() ) );
4179  const int ldb( numeric_cast<int>( B.spacing() ) );
4180  const int ldc( numeric_cast<int>( C.spacing() ) );
4181 
      // The layout follows C. With a row-major C the operand A is flagged as transposed,
      // with a column-major C the operand B is flagged as transposed.
      // NOTE(review): presumably A is stored column-major and B row-major - confirm.
4182  cblas_sgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4183  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4184  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4185  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0F, C.data(), ldc );
4186  }
4187 #endif
4188  //**********************************************************************************************
4189 
4190  //**BLAS-based subtraction assignment to dense matrices (double precision)**********************
4191 #if BLAZE_BLAS_MODE
4192 
// BLAS-based subtraction assignment kernel for double precision operands (cblas_dgemm).
//
// Selected when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> applies. The subtraction is expressed
// as a GEMM update with alpha = -scalar and beta = 1, i.e. C = -scalar * A * B + C.
//
// C      : target matrix; its storage order selects the CBLAS layout.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor of the product.
4205  template< typename MT3 // Type of the left-hand side target matrix
4206  , typename MT4 // Type of the left-hand side matrix operand
4207  , typename MT5 // Type of the right-hand side matrix operand
4208  , typename ST2 > // Type of the scalar value
4209  static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
4210  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4211  {
4212  using boost::numeric_cast;
4213 
4217 
      // CBLAS takes 'int' dimensions; numeric_cast performs range-checked conversions.
4218  const int M ( numeric_cast<int>( A.rows() ) );
4219  const int N ( numeric_cast<int>( B.columns() ) );
4220  const int K ( numeric_cast<int>( A.columns() ) );
4221  const int lda( numeric_cast<int>( A.spacing() ) );
4222  const int ldb( numeric_cast<int>( B.spacing() ) );
4223  const int ldc( numeric_cast<int>( C.spacing() ) );
4224 
      // The layout follows C. With a row-major C the operand A is flagged as transposed,
      // with a column-major C the operand B is flagged as transposed.
      // NOTE(review): presumably A is stored column-major and B row-major - confirm.
4225  cblas_dgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4226  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4227  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4228  M, N, K, -scalar, A.data(), lda, B.data(), ldb, 1.0, C.data(), ldc );
4229  }
4230 #endif
4231  //**********************************************************************************************
4232 
4233  //**BLAS-based subtraction assignment to dense matrices (single precision complex)**************
4234 #if BLAZE_BLAS_MODE
4235 
// BLAS-based subtraction assignment kernel for single precision complex operands (cblas_cgemm).
//
// Selected when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> applies. The subtraction is
// expressed as a GEMM update with alpha = -scalar and beta = (1,0), i.e. C = -scalar * A * B + C.
//
// C      : target matrix; its storage order selects the CBLAS layout.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor of the product; converted to complex<float>.
4248  template< typename MT3 // Type of the left-hand side target matrix
4249  , typename MT4 // Type of the left-hand side matrix operand
4250  , typename MT5 // Type of the right-hand side matrix operand
4251  , typename ST2 > // Type of the scalar value
4252  static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4253  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4254  {
4255  using boost::numeric_cast;
4256 
      // The value_type of all three element types must be float.
4260  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT3::ElementType::value_type );
4261  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT4::ElementType::value_type );
4262  BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE ( typename MT5::ElementType::value_type );
4263 
      // CBLAS takes 'int' dimensions; numeric_cast performs range-checked conversions.
4264  const int M ( numeric_cast<int>( A.rows() ) );
4265  const int N ( numeric_cast<int>( B.columns() ) );
4266  const int K ( numeric_cast<int>( A.columns() ) );
4267  const int lda( numeric_cast<int>( A.spacing() ) );
4268  const int ldb( numeric_cast<int>( B.spacing() ) );
4269  const int ldc( numeric_cast<int>( C.spacing() ) );
      // The complex GEMM routine receives alpha and beta by address.
4270  const complex<float> alpha( -scalar );
4271  const complex<float> beta ( 1.0F, 0.0F );
4272 
      // The layout follows C. With a row-major C the operand A is flagged as transposed,
      // with a column-major C the operand B is flagged as transposed.
      // NOTE(review): presumably A is stored column-major and B row-major - confirm.
4273  cblas_cgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4274  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4275  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4276  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4277  }
4278 #endif
4279  //**********************************************************************************************
4280 
4281  //**BLAS-based subtraction assignment to dense matrices (double precision complex)**************
4282 #if BLAZE_BLAS_MODE
4283 
// BLAS-based subtraction assignment kernel for double precision complex operands (cblas_zgemm).
//
// Selected when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> applies. The subtraction is
// expressed as a GEMM update with alpha = -scalar and beta = (1,0), i.e. C = -scalar * A * B + C.
//
// C      : target matrix; its storage order selects the CBLAS layout.
// A, B   : left-hand and right-hand side matrix operands.
// scalar : scaling factor of the product; converted to complex<double>.
4296  template< typename MT3 // Type of the left-hand side target matrix
4297  , typename MT4 // Type of the left-hand side matrix operand
4298  , typename MT5 // Type of the right-hand side matrix operand
4299  , typename ST2 > // Type of the scalar value
4300  static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
4301  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4302  {
4303  using boost::numeric_cast;
4304 
      // The value_type of all three element types must be double.
4308  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT3::ElementType::value_type );
4309  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT4::ElementType::value_type );
4310  BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE ( typename MT5::ElementType::value_type );
4311 
      // CBLAS takes 'int' dimensions; numeric_cast performs range-checked conversions.
4312  const int M ( numeric_cast<int>( A.rows() ) );
4313  const int N ( numeric_cast<int>( B.columns() ) );
4314  const int K ( numeric_cast<int>( A.columns() ) );
4315  const int lda( numeric_cast<int>( A.spacing() ) );
4316  const int ldb( numeric_cast<int>( B.spacing() ) );
4317  const int ldc( numeric_cast<int>( C.spacing() ) );
      // The complex GEMM routine receives alpha and beta by address.
4318  const complex<double> alpha( -scalar );
4319  const complex<double> beta ( 1.0, 0.0 );
4320 
      // The layout follows C. With a row-major C the operand A is flagged as transposed,
      // with a column-major C the operand B is flagged as transposed.
      // NOTE(review): presumably A is stored column-major and B row-major - confirm.
4321  cblas_zgemm( ( IsRowMajorMatrix<MT3>::value )?( CblasRowMajor ):( CblasColMajor ),
4322  ( IsRowMajorMatrix<MT3>::value )?( CblasTrans ):( CblasNoTrans ),
4323  ( IsRowMajorMatrix<MT3>::value )?( CblasNoTrans ):( CblasTrans ),
4324  M, N, K, &alpha, A.data(), lda, B.data(), ldb, &beta, C.data(), ldc );
4325  }
4326 #endif
4327  //**********************************************************************************************
4328 
4329  //**Subtraction assignment to sparse matrices***************************************************
4330  // No special implementation for the subtraction assignment to sparse matrices.
4331  //**********************************************************************************************
4332 
4333  //**Multiplication assignment to dense matrices*************************************************
4334  // No special implementation for the multiplication assignment to dense matrices.
4335  //**********************************************************************************************
4336 
4337  //**Multiplication assignment to sparse matrices************************************************
4338  // No special implementation for the multiplication assignment to sparse matrices.
4339  //**********************************************************************************************
4340 
4341  //**Compile time checks*************************************************************************
4350  //**********************************************************************************************
4351 };
4353 //*************************************************************************************************
4354 
4355 
4356 
4357 
4358 //=================================================================================================
4359 //
4360 // GLOBAL BINARY ARITHMETIC OPERATORS
4361 //
4362 //=================================================================================================
4363 
4364 //*************************************************************************************************
// Multiplication operator for the multiplication of a column-major dense matrix and a
// row-major dense matrix.
//
// lhs : the left-hand side (column-major) dense matrix.
// rhs : the right-hand side (row-major) dense matrix.
//
// Returns a TDMatDMatMultExpr expression object wrapping both operands; the product is not
// evaluated here. Throws std::invalid_argument ("Matrix sizes do not match") if the number of
// columns of lhs differs from the number of rows of rhs.
//
// NOTE(review): the declarator line (4396) was missing from this listing, leaving the return
// type directly followed by the function body. It is restored here with the storage orders
// required by TDMatDMatMultExpr (column-major left operand, row-major right operand) - confirm
// against the original header.
4393 template< typename T1 // Type of the left-hand side dense matrix
4394  , typename T2 > // Type of the right-hand side dense matrix
4395 inline const TDMatDMatMultExpr<T1,T2>
4396  operator*( const DenseMatrix<T1,true>& lhs, const DenseMatrix<T2,false>& rhs )
4397 {
4399 
     // The inner dimensions must agree for the product to be defined.
4400  if( (~lhs).columns() != (~rhs).rows() )
4401  throw std::invalid_argument( "Matrix sizes do not match" );
4402 
4403  return TDMatDMatMultExpr<T1,T2>( ~lhs, ~rhs );
4404 }
4405 //*************************************************************************************************
4406 
4407 
4408 
4409 
4410 //=================================================================================================
4411 //
4412 // EXPRESSION TRAIT SPECIALIZATIONS
4413 //
4414 //=================================================================================================
4415 
4416 //*************************************************************************************************
// Specialization of TDMatDVecMultExprTrait for a TDMatDMatMultExpr multiplied by a vector.
//
// If MT1 is a column-major dense matrix, MT2 is a row-major dense matrix and VT is a dense
// column vector, Type is the type resulting from MT1 * ( MT2 * VT ), i.e. the vector is
// multiplied with the right-hand side matrix first. Otherwise Type is INVALID_TYPE.
4418 template< typename MT1, typename MT2, typename VT >
4419 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4420 {
4421  public:
4422  //**********************************************************************************************
4423  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4424  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4425  IsDenseVector<VT>::value && IsColumnVector<VT>::value
4426  , typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
4427  , INVALID_TYPE >::Type Type;
4428  //**********************************************************************************************
4429 };
4431 //*************************************************************************************************
4432 
4433 
4434 //*************************************************************************************************
// Specialization of TDMatSVecMultExprTrait for a TDMatDMatMultExpr multiplied by a sparse vector.
//
// If MT1 is a column-major dense matrix, MT2 is a row-major dense matrix and VT is a sparse
// column vector, Type is the type resulting from MT1 * ( MT2 * VT ); the outer product is
// looked up through TDMatDVecMultExprTrait. Otherwise Type is INVALID_TYPE.
4436 template< typename MT1, typename MT2, typename VT >
4437 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2>, VT >
4438 {
4439  public:
4440  //**********************************************************************************************
4441  typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4442  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
4443  IsSparseVector<VT>::value && IsColumnVector<VT>::value
4444  , typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
4445  , INVALID_TYPE >::Type Type;
4446  //**********************************************************************************************
4447 };
4449 //*************************************************************************************************
4450 
4451 
4452 //*************************************************************************************************
// Specialization of TDVecTDMatMultExprTrait for a dense vector multiplied by a TDMatDMatMultExpr.
//
// If VT is a dense row vector, MT1 is a column-major dense matrix and MT2 is a row-major
// dense matrix, Type is the type resulting from ( VT * MT1 ) * MT2, i.e. the vector is
// multiplied with the left-hand side matrix first. Otherwise Type is INVALID_TYPE.
4454 template< typename VT, typename MT1, typename MT2 >
4455 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4456 {
4457  public:
4458  //**********************************************************************************************
4459  typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
4460  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4461  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4462  , typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4463  , INVALID_TYPE >::Type Type;
4464  //**********************************************************************************************
4465 };
4467 //*************************************************************************************************
4468 
4469 
4470 //*************************************************************************************************
// Specialization of TSVecTDMatMultExprTrait for a sparse vector multiplied by a TDMatDMatMultExpr.
//
// If VT is a sparse row vector, MT1 is a column-major dense matrix and MT2 is a row-major
// dense matrix, Type is the type resulting from ( VT * MT1 ) * MT2; the outer product is
// looked up through TDVecDMatMultExprTrait. Otherwise Type is INVALID_TYPE.
4472 template< typename VT, typename MT1, typename MT2 >
4473 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2> >
4474 {
4475  public:
4476  //**********************************************************************************************
4477  typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
4478  IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
4479  IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
4480  , typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
4481  , INVALID_TYPE >::Type Type;
4482  //**********************************************************************************************
4483 };
4485 //*************************************************************************************************
4486 
4487 
4488 //*************************************************************************************************
// Specialization of SubmatrixExprTrait for TDMatDMatMultExpr.
//
// Type is the multiplication type (MultExprTrait) of the submatrix types of the two
// const-qualified operands MT1 and MT2.
4490 template< typename MT1, typename MT2 >
4491 struct SubmatrixExprTrait< TDMatDMatMultExpr<MT1,MT2> >
4492 {
4493  public:
4494  //**********************************************************************************************
4495  typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1>::Type
4496  , typename SubmatrixExprTrait<const MT2>::Type >::Type Type;
4497  //**********************************************************************************************
4498 };
4500 //*************************************************************************************************
4501 
4502 
4503 //*************************************************************************************************
// Specialization of RowExprTrait for TDMatDMatMultExpr.
//
// Type is the multiplication type (MultExprTrait) of the row type of the const-qualified
// left-hand side operand MT1 and the complete right-hand side operand MT2.
4505 template< typename MT1, typename MT2 >
4506 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2> >
4507 {
4508  public:
4509  //**********************************************************************************************
4510  typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
4511  //**********************************************************************************************
4512 };
4514 //*************************************************************************************************
4515 
4516 
4517 //*************************************************************************************************
// Specialization of ColumnExprTrait for TDMatDMatMultExpr.
//
// Type is the multiplication type (MultExprTrait) of the complete left-hand side operand MT1
// and the column type of the const-qualified right-hand side operand MT2.
4519 template< typename MT1, typename MT2 >
4520 struct ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2> >
4521 {
4522  public:
4523  //**********************************************************************************************
4524  typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
4525  //**********************************************************************************************
4526 };
4528 //*************************************************************************************************
4529 
4530 } // namespace blaze
4531 
4532 #endif
Data type constraint.
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4512
EnableIf< IsIntegral< T >, Load< T, sizeof(T)> >::Type::Type load(const T *address)
Loads a vector of integral values.
Definition: Load.h:222
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:307
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:3703
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:358
SelectType< IsComputation< MT1 >::value, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:235
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:196
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
Header file for the IsColumnMajorMatrix type trait.
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2375
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:248
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:121
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:246
Compile time check for double precision floating point types. This type trait tests whether or not the...
Definition: IsDouble.h:75
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:327
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:223
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
const size_t TDMATDMATMULT_THRESHOLD
Column-major dense matrix/row-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:153
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:121
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:120
Constraint on the data type.
Header file for the MultExprTrait class template.
SelectType< IsComputation< MT2 >::value, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:238
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:122
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:123
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:252
Header file for the multiplication trait.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the TDMatSVecMultExprTrait class template.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:224
Header file for the DenseMatrix base class.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:232
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:219
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Constraints on the storage order of matrix types.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2373
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:229
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:225
Header file for the IsNumeric type trait.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
System settings for the BLAS mode.
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:648
Header file for run time assertion macros.
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:239
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:222
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:351
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:317
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:247
Header file for the TDVecDMatMultExprTrait class template.
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:221
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2370
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the complex data type.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:297
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:226
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:359
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:267
Compile time check for single precision floating point types. This type trait tests whether or not the...
Definition: IsFloat.h:75
Constraint on the data type.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:220
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Operand matrix_
The dense matrix containing the submatrix.
Definition: DenseSubmatrix.h:2792
Header file for the TDVecTDMatMultExprTrait class template.
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
void store(float *address, const sse_float_t &value)
Aligned store of a vector of 'float' values.
Definition: Store.h:242
Header file for the IsExpression type trait class.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:339
Header file for the FunctionTrace class.