// Include guard of this header, followed by the start of the dense-rhs overload of
// smpAssign_backend(): each loop iteration assigns one rectangular tile of rhs to the
// matching tile of lhs via submatrix views.
// NOTE(review): parts of this definition (remaining template parameters, braces, the
// definition of `n`) are not visible in this extract.
35 #ifndef _BLAZE_MATH_SMP_OPENMP_DENSEMATRIX_H_ 36 #define _BLAZE_MATH_SMP_OPENMP_DENSEMATRIX_H_ 95 template<
typename MT1
99 void smpAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const DenseMatrix<MT2,SO2>& rhs )
105 typedef ElementType_<MT1> ET1;
106 typedef ElementType_<MT2> ET2;
107 typedef SubmatrixExprTrait_<MT1,aligned> AlignedTarget;
108 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// SIMD is considered only if both matrix types enable it and their element types are
// SIMD-combinable; SIMDSIZE is taken from the element type of the target matrix.
110 constexpr
bool simdEnabled( MT1::simdEnabled && MT2::simdEnabled && IsSIMDCombinable<ET1,ET2>::value );
111 constexpr
size_t SIMDSIZE( SIMDTrait< ElementType_<MT1> >::
size );
// Alignment of both operands, queried once and used below to select aligned/unaligned views.
113 const bool lhsAligned( (~lhs).isAligned() );
114 const bool rhsAligned( (~rhs).isAligned() );
// Number of threads reported by OpenMP and the thread mapping derived from it:
// threadmap.first divides the rows, threadmap.second divides the columns.
116 const int threads( omp_get_num_threads() );
117 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: rows / threadmap.first, rounded up. With SIMD enabled, a non-zero remainder
// with respect to SIMDSIZE is rounded up to the next multiple of SIMDSIZE.
// NOTE(review): the bit mask assumes SIMDSIZE is a power of two -- TODO confirm.
119 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
120 const size_t equalShare1( (~rhs).
rows() / threadmap.first + addon1 );
121 const size_t rest1 ( equalShare1 & ( SIMDSIZE - 1UL ) );
122 const size_t rowsPerThread( ( simdEnabled && rest1 )?( equalShare1 - rest1 + SIMDSIZE ):( equalShare1 ) );
// Columns per tile: same computation as for the rows, using threadmap.second.
124 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
125 const size_t equalShare2( (~rhs).
columns() / threadmap.second + addon2 );
126 const size_t rest2 ( equalShare2 & ( SIMDSIZE - 1UL ) );
127 const size_t colsPerThread( ( simdEnabled && rest2 )?( equalShare2 - rest2 + SIMDSIZE ):( equalShare2 ) );
// One iteration per tile index, distributed by an OpenMP work-sharing loop.
129 #pragma omp for schedule(dynamic,1) nowait 130 for(
int i=0; i<threads; ++i )
// Top-left corner of tile i: i / threadmap.second picks the tile row,
// i % threadmap.second picks the tile column.
132 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
133 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Tile height, clipped at the last row of rhs.
// NOTE(review): the column count `n` used below is not defined in this extract.
138 const size_t m(
min( rowsPerThread, (~rhs).
rows() -
row ) );
// Both operands aligned: aligned views on both sides.
141 if( simdEnabled && lhsAligned && rhsAligned ) {
142 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
143 assign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Only lhs aligned: aligned target, unaligned source view.
145 else if( simdEnabled && lhsAligned ) {
146 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
147 assign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Only rhs aligned: unaligned target, aligned source view.
149 else if( simdEnabled && rhsAligned ) {
150 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
151 assign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Remaining case: unaligned views on both sides.
154 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
155 assign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Sparse-rhs overload of smpAssign_backend(): each loop iteration assigns one rectangular
// tile of the sparse rhs to the matching tile of the dense lhs, always through unaligned views.
// NOTE(review): parts of this definition (remaining template parameters, braces, the
// definition of `n`) are not visible in this extract.
179 template<
typename MT1
183 void smpAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const SparseMatrix<MT2,SO2>& rhs )
189 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// Number of threads reported by OpenMP and the thread mapping derived from it:
// threadmap.first divides the rows, threadmap.second divides the columns.
191 const size_t threads( omp_get_num_threads() );
192 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: rows / threadmap.first, rounded up (no SIMD rounding in this overload).
194 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
195 const size_t rowsPerThread( (~rhs).
rows() / threadmap.first + addon1 );
// Columns per tile: columns / threadmap.second, rounded up.
197 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
198 const size_t colsPerThread( (~rhs).
columns() / threadmap.second + addon2 );
// One iteration per tile index, distributed by an OpenMP work-sharing loop.
200 #pragma omp for schedule(dynamic,1) nowait 201 for(
size_t i=0; i<threads; ++i )
// Top-left corner of tile i: i / threadmap.second picks the tile row,
// i % threadmap.second picks the tile column.
203 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
204 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Tile height, clipped at the last row of lhs (the dense overload clips against rhs instead).
// NOTE(review): the column count `n` used below is not defined in this extract.
209 const size_t m(
min( rowsPerThread, (~lhs).
rows() -
row ) );
// Assign the tile through unaligned views on both operands.
212 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
213 assign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// smpAssign() overload selected when MT1 is a dense matrix and at least one of MT1/MT2 is
// not SMP-assignable: forwards directly to assign() on the unwrapped operands.
// NOTE(review): the remaining template parameters and the braces are not visible in this extract.
238 template<
typename MT1
242 inline EnableIf_< And< IsDenseMatrix<MT1>
243 , Or< Not< IsSMPAssignable<MT1> >
244 , Not< IsSMPAssignable<MT2> > > > >
245 smpAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
252 assign( ~lhs, ~rhs );
// smpAssign() overload selected when MT1 is a dense matrix and both MT1 and MT2 are
// SMP-assignable. Two paths are visible: a plain assign() call, and a call to
// smpAssign_backend() under an OpenMP parallel directive.
// NOTE(review): the condition that chooses between the two paths is not visible in this extract.
276 template<
typename MT1
280 inline EnableIf_< And< IsDenseMatrix<MT1>, IsSMPAssignable<MT1>, IsSMPAssignable<MT2> > >
281 smpAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
// Path 1: direct assignment of the unwrapped operands.
294 assign( ~lhs, ~rhs );
// Path 2: the backend is invoked inside a parallel region with lhs and rhs shared.
297 #pragma omp parallel shared( lhs, rhs ) 298 smpAssign_backend( ~lhs, ~rhs );
// Dense-rhs overload of smpAddAssign_backend(): each loop iteration add-assigns one
// rectangular tile of rhs to the matching tile of lhs via submatrix views.
// NOTE(review): parts of this definition (remaining template parameters, braces, the
// definition of `n`) are not visible in this extract.
330 template<
typename MT1
334 void smpAddAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const DenseMatrix<MT2,SO2>& rhs )
340 typedef ElementType_<MT1> ET1;
341 typedef ElementType_<MT2> ET2;
342 typedef SubmatrixExprTrait_<MT1,aligned> AlignedTarget;
343 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// SIMD is considered only if both matrix types enable it and their element types are
// SIMD-combinable; SIMDSIZE is taken from the element type of the target matrix.
345 constexpr
bool simdEnabled( MT1::simdEnabled && MT2::simdEnabled && IsSIMDCombinable<ET1,ET2>::value );
346 constexpr
size_t SIMDSIZE( SIMDTrait< ElementType_<MT1> >::
size );
// Alignment of both operands, queried once and used below to select aligned/unaligned views.
348 const bool lhsAligned( (~lhs).isAligned() );
349 const bool rhsAligned( (~rhs).isAligned() );
// Number of threads reported by OpenMP and the thread mapping derived from it:
// threadmap.first divides the rows, threadmap.second divides the columns.
351 const int threads( omp_get_num_threads() );
352 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: rows / threadmap.first, rounded up. With SIMD enabled, a non-zero remainder
// with respect to SIMDSIZE is rounded up to the next multiple of SIMDSIZE.
// NOTE(review): the bit mask assumes SIMDSIZE is a power of two -- TODO confirm.
354 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
355 const size_t equalShare1( (~rhs).
rows() / threadmap.first + addon1 );
356 const size_t rest1 ( equalShare1 & ( SIMDSIZE - 1UL ) );
357 const size_t rowsPerThread( ( simdEnabled && rest1 )?( equalShare1 - rest1 + SIMDSIZE ):( equalShare1 ) );
// Columns per tile: same computation as for the rows, using threadmap.second.
359 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
360 const size_t equalShare2( (~rhs).
columns() / threadmap.second + addon2 );
361 const size_t rest2 ( equalShare2 & ( SIMDSIZE - 1UL ) );
362 const size_t colsPerThread( ( simdEnabled && rest2 )?( equalShare2 - rest2 + SIMDSIZE ):( equalShare2 ) );
// One iteration per tile index, distributed by an OpenMP work-sharing loop.
364 #pragma omp for schedule(dynamic,1) nowait 365 for(
int i=0; i<threads; ++i )
// Top-left corner of tile i: i / threadmap.second picks the tile row,
// i % threadmap.second picks the tile column.
367 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
368 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Tile height, clipped at the last row of rhs.
// NOTE(review): the column count `n` used below is not defined in this extract.
373 const size_t m(
min( rowsPerThread, (~rhs).
rows() -
row ) );
// Both operands aligned: aligned views on both sides.
376 if( simdEnabled && lhsAligned && rhsAligned ) {
377 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
378 addAssign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Only lhs aligned: aligned target, unaligned source view.
380 else if( simdEnabled && lhsAligned ) {
381 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
382 addAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Only rhs aligned: unaligned target, aligned source view.
384 else if( simdEnabled && rhsAligned ) {
385 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
386 addAssign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Remaining case: unaligned views on both sides.
389 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
390 addAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Sparse-rhs overload of smpAddAssign_backend(): each loop iteration add-assigns one
// rectangular tile of the sparse rhs to the matching tile of the dense lhs, always through
// unaligned views.
// NOTE(review): parts of this definition (remaining template parameters, braces, the
// definition of `n`) are not visible in this extract.
414 template<
typename MT1
418 void smpAddAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const SparseMatrix<MT2,SO2>& rhs )
424 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// Number of threads reported by OpenMP and the thread mapping derived from it:
// threadmap.first divides the rows, threadmap.second divides the columns.
426 const size_t threads( omp_get_num_threads() );
427 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: rows / threadmap.first, rounded up (no SIMD rounding in this overload).
429 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
430 const size_t rowsPerThread( (~rhs).
rows() / threadmap.first + addon1 );
// Columns per tile: columns / threadmap.second, rounded up.
432 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
433 const size_t colsPerThread( (~rhs).
columns() / threadmap.second + addon2 );
// One iteration per tile index, distributed by an OpenMP work-sharing loop.
435 #pragma omp for schedule(dynamic,1) nowait 436 for(
size_t i=0; i<threads; ++i )
// Top-left corner of tile i: i / threadmap.second picks the tile row,
// i % threadmap.second picks the tile column.
438 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
439 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Tile height, clipped at the last row of lhs (the dense overload clips against rhs instead).
// NOTE(review): the column count `n` used below is not defined in this extract.
444 const size_t m(
min( rowsPerThread, (~lhs).
rows() -
row ) );
// Add-assign the tile through unaligned views on both operands.
447 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
448 addAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// smpAddAssign() overload selected when MT1 is a dense matrix and at least one of MT1/MT2 is
// not SMP-assignable: forwards directly to addAssign() on the unwrapped operands.
// NOTE(review): the remaining template parameters and the braces are not visible in this extract.
473 template<
typename MT1
477 inline EnableIf_< And< IsDenseMatrix<MT1>
478 , Or< Not< IsSMPAssignable<MT1> >
479 , Not< IsSMPAssignable<MT2> > > > >
480 smpAddAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
487 addAssign( ~lhs, ~rhs );
// smpAddAssign() overload selected when MT1 is a dense matrix and both MT1 and MT2 are
// SMP-assignable. Two paths are visible: a plain addAssign() call, and a call to
// smpAddAssign_backend() under an OpenMP parallel directive.
// NOTE(review): the condition that chooses between the two paths is not visible in this extract.
511 template<
typename MT1
515 inline EnableIf_< And< IsDenseMatrix<MT1>, IsSMPAssignable<MT1>, IsSMPAssignable<MT2> > >
516 smpAddAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
// Path 1: direct addition assignment of the unwrapped operands.
529 addAssign( ~lhs, ~rhs );
// Path 2: the backend is invoked inside a parallel region with lhs and rhs shared.
532 #pragma omp parallel shared( lhs, rhs ) 533 smpAddAssign_backend( ~lhs, ~rhs );
// Dense-rhs overload of smpSubAssign_backend(): each loop iteration sub-assigns one
// rectangular tile of rhs to the matching tile of lhs via submatrix views.
// NOTE(review): parts of this definition (remaining template parameters, braces, the
// definition of `n`) are not visible in this extract.
565 template<
typename MT1
569 void smpSubAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const DenseMatrix<MT2,SO2>& rhs )
575 typedef ElementType_<MT1> ET1;
576 typedef ElementType_<MT2> ET2;
577 typedef SubmatrixExprTrait_<MT1,aligned> AlignedTarget;
578 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// SIMD is considered only if both matrix types enable it and their element types are
// SIMD-combinable; SIMDSIZE is taken from the element type of the target matrix.
580 constexpr
bool simdEnabled( MT1::simdEnabled && MT2::simdEnabled && IsSIMDCombinable<ET1,ET2>::value );
581 constexpr
size_t SIMDSIZE( SIMDTrait< ElementType_<MT1> >::
size );
// Alignment of both operands, queried once and used below to select aligned/unaligned views.
583 const bool lhsAligned( (~lhs).isAligned() );
584 const bool rhsAligned( (~rhs).isAligned() );
// Number of threads reported by OpenMP and the thread mapping derived from it:
// threadmap.first divides the rows, threadmap.second divides the columns.
586 const int threads( omp_get_num_threads() );
587 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: rows / threadmap.first, rounded up. With SIMD enabled, a non-zero remainder
// with respect to SIMDSIZE is rounded up to the next multiple of SIMDSIZE.
// NOTE(review): the bit mask assumes SIMDSIZE is a power of two -- TODO confirm.
589 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
590 const size_t equalShare1( (~rhs).
rows() / threadmap.first + addon1 );
591 const size_t rest1 ( equalShare1 & ( SIMDSIZE - 1UL ) );
592 const size_t rowsPerThread( ( simdEnabled && rest1 )?( equalShare1 - rest1 + SIMDSIZE ):( equalShare1 ) );
// Columns per tile: same computation as for the rows, using threadmap.second.
594 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
595 const size_t equalShare2( (~rhs).
columns() / threadmap.second + addon2 );
596 const size_t rest2 ( equalShare2 & ( SIMDSIZE - 1UL ) );
597 const size_t colsPerThread( ( simdEnabled && rest2 )?( equalShare2 - rest2 + SIMDSIZE ):( equalShare2 ) );
// One iteration per tile index, distributed by an OpenMP work-sharing loop.
599 #pragma omp for schedule(dynamic,1) nowait 600 for(
int i=0; i<threads; ++i )
// Top-left corner of tile i: i / threadmap.second picks the tile row,
// i % threadmap.second picks the tile column.
602 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
603 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Tile height, clipped at the last row of rhs.
// NOTE(review): the column count `n` used below is not defined in this extract.
608 const size_t m(
min( rowsPerThread, (~rhs).
rows() -
row ) );
// Both operands aligned: aligned views on both sides.
611 if( simdEnabled && lhsAligned && rhsAligned ) {
612 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
613 subAssign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Only lhs aligned: aligned target, unaligned source view.
615 else if( simdEnabled && lhsAligned ) {
616 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
617 subAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Only rhs aligned: unaligned target, aligned source view.
619 else if( simdEnabled && rhsAligned ) {
620 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
621 subAssign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Remaining case: unaligned views on both sides.
624 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
625 subAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Sparse-rhs overload of smpSubAssign_backend(): each loop iteration sub-assigns one
// rectangular tile of the sparse rhs to the matching tile of the dense lhs, always through
// unaligned views.
// NOTE(review): parts of this definition (remaining template parameters, braces, the
// definition of `n`) are not visible in this extract.
650 template<
typename MT1
654 void smpSubAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const SparseMatrix<MT2,SO2>& rhs )
660 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// Number of threads reported by OpenMP and the thread mapping derived from it:
// threadmap.first divides the rows, threadmap.second divides the columns.
662 const size_t threads( omp_get_num_threads() );
663 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: rows / threadmap.first, rounded up (no SIMD rounding in this overload).
665 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
666 const size_t rowsPerThread( (~rhs).
rows() / threadmap.first + addon1 );
// Columns per tile: columns / threadmap.second, rounded up.
668 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
669 const size_t colsPerThread( (~rhs).
columns() / threadmap.second + addon2 );
// One iteration per tile index, distributed by an OpenMP work-sharing loop.
671 #pragma omp for schedule(dynamic,1) nowait 672 for(
size_t i=0; i<threads; ++i )
// Top-left corner of tile i: i / threadmap.second picks the tile row,
// i % threadmap.second picks the tile column.
674 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
675 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Tile height, clipped at the last row of lhs (the dense overload clips against rhs instead).
// NOTE(review): the column count `n` used below is not defined in this extract.
680 const size_t m(
min( rowsPerThread, (~lhs).
rows() -
row ) );
// Sub-assign the tile through unaligned views on both operands.
683 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
684 subAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// smpSubAssign() overload selected when MT1 is a dense matrix and at least one of MT1/MT2 is
// not SMP-assignable: forwards directly to subAssign() on the unwrapped operands.
// NOTE(review): the remaining template parameters and the braces are not visible in this extract.
709 template<
typename MT1
713 inline EnableIf_< And< IsDenseMatrix<MT1>
714 , Or< Not< IsSMPAssignable<MT1> >
715 , Not< IsSMPAssignable<MT2> > > > >
716 smpSubAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
723 subAssign( ~lhs, ~rhs );
// smpSubAssign() overload selected when MT1 is a dense matrix and both MT1 and MT2 are
// SMP-assignable. Two paths are visible: a plain subAssign() call, and a call to
// smpSubAssign_backend() under an OpenMP parallel directive.
// NOTE(review): the condition that chooses between the two paths is not visible in this extract.
747 template<
typename MT1
751 inline EnableIf_< And< IsDenseMatrix<MT1>, IsSMPAssignable<MT1>, IsSMPAssignable<MT2> > >
752 smpSubAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
// Path 1: direct subtraction assignment of the unwrapped operands.
765 subAssign( ~lhs, ~rhs );
// Path 2: the backend is invoked inside a parallel region with lhs and rhs shared.
768 #pragma omp parallel shared( lhs, rhs ) 769 smpSubAssign_backend( ~lhs, ~rhs );
// smpMultAssign() for dense-matrix targets: enabled whenever MT1 is a dense matrix and
// forwards directly to multAssign() on the unwrapped operands; no OpenMP directive appears
// in the visible part of this function.
// NOTE(review): the remaining template parameters and the braces are not visible in this extract.
801 template<
typename MT1
805 inline EnableIf_< IsDenseMatrix<MT1> >
806 smpMultAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
813 multAssign( ~lhs, ~rhs );
Header file for the implementation of the Submatrix view.
Header file for auxiliary alias declarations.
Header file for mathematical functions.
Header file for the alignment flag values.
Header file for basic type definitions.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:160
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:261
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1755
Header file for the SIMD trait.
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the SparseMatrix base class.
Header file for the SMP thread mapping functionality.
Header file for the matrix storage order types.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SMP_ASSIGNABLE(T)
Constraint on the data type. In case the given data type T is SMP-assignable (can be assigned by multi...
Definition: SMPAssignable.h:81
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
System settings for the shared-memory parallelization.
Header file for the IsSMPAssignable type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
Header file for the DenseMatrix base class.
Header file for the Not class template.
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:336
Header file for the serial section implementation.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:128
Header file for the parallel section implementation.
Header file for the IsDenseMatrix type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:128
Header file for the EnableIf class template.
#define BLAZE_PARALLEL_SECTION
Section for the debugging of the shared-memory parallelization. During the shared-memory parallel (SMP...
Definition: ParallelSection.h:246
bool isSerialSectionActive()
Returns whether a serial section is active or not.
Definition: SerialSection.h:213
Header file for the IsSIMDCombinable type trait.
Header file for the SubmatrixExprTrait class template.
Header file for run time assertion macros.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:93
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:320
bool isParallelSectionActive()
Returns whether a parallel section is active or not.
Definition: ParallelSection.h:213
#define BLAZE_OPENMP_PARALLEL_MODE
Compilation switch for the OpenMP parallelization. This compilation switch enables/disables the OpenMP...
Definition: SMP.h:67
#define BLAZE_STATIC_ASSERT(expr)
Compile time assertion macro. In case of an invalid compile time expression, a compilation error is cr...
Definition: StaticAssert.h:112
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the data type.
Header file for the function trace functionality.