35 #ifndef _BLAZE_MATH_SMP_OPENMP_DENSEMATRIX_H_
36 #define _BLAZE_MATH_SMP_OPENMP_DENSEMATRIX_H_
// Backend of the OpenMP-parallel assignment of a dense matrix (rhs) to a dense matrix (lhs).
// The operands are cut into a grid of tiles and every iteration of the work-sharing loop
// assigns one tile through submatrix views.
// NOTE(review): this listing is an extract; the remaining template parameters, the braces
// and the definition of 'n' are not visible here.
96 template<
typename MT1
100 void smpAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const DenseMatrix<MT2,SO2>& rhs )
// Element types of both operands and the aligned/unaligned submatrix view types of the target.
106 typedef ElementType_<MT1> ET1;
107 typedef ElementType_<MT2> ET2;
108 typedef SubmatrixExprTrait_<MT1,aligned> AlignedTarget;
109 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// SIMDSIZE is the 'size' value reported by SIMDTrait for the element type of the target.
111 enum :
size_t { SIMDSIZE = SIMDTrait< ElementType_<MT1> >
::size };
// SIMD is only considered when both matrix types enable it and the element types are SIMD-combinable.
113 const bool simdEnabled( MT1::simdEnabled && MT2::simdEnabled && AreSIMDCombinable<ET1,ET2>::value );
114 const bool lhsAligned ( (~lhs).isAligned() );
115 const bool rhsAligned ( (~rhs).isAligned() );
// Thread count as reported by OpenMP and the resulting tile grid: 'threadmap.first' divides
// the rows, 'threadmap.second' divides the columns (see their use below).
117 const int threads( omp_get_num_threads() );
118 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: ceiling of rows / threadmap.first ...
120 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
121 const size_t equalShare1( (~rhs).
rows() / threadmap.first + addon1 );
// ... and, with SIMD enabled, increased to the next multiple of SIMDSIZE.
// The bit mask yields the remainder only if SIMDSIZE is a power of two -- TODO confirm.
122 const size_t rest1 ( equalShare1 & ( SIMDSIZE - 1UL ) );
123 const size_t rowsPerThread( ( simdEnabled && rest1 )?( equalShare1 - rest1 + SIMDSIZE ):( equalShare1 ) );
// Columns per tile: same computation with threadmap.second.
125 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
126 const size_t equalShare2( (~rhs).
columns() / threadmap.second + addon2 );
127 const size_t rest2 ( equalShare2 & ( SIMDSIZE - 1UL ) );
128 const size_t colsPerThread( ( simdEnabled && rest2 )?( equalShare2 - rest2 + SIMDSIZE ):( equalShare2 ) );
// One iteration per counted thread; iterations are handed out dynamically in chunks of one
// and the loop has no implicit barrier at its end (nowait).
130 #pragma omp for schedule(dynamic,1) nowait
131 for(
int i=0; i<threads; ++i )
// Top-left corner of tile i within the threadmap.second-wide tile grid.
133 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
134 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Row extent of the tile, limited to the rows remaining below 'row'.
// NOTE(review): 'n' (used below) is presumably the analogous column extent; its definition
// is not part of this extract.
139 const size_t m(
min( rowsPerThread, (~rhs).
rows() -
row ) );
// Both operands aligned: aligned views on both sides.
142 if( simdEnabled && lhsAligned && rhsAligned ) {
143 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
144 assign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Only the target aligned: aligned target view, unaligned source view.
146 else if( simdEnabled && lhsAligned ) {
147 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
148 assign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Only the source aligned: unaligned target view, aligned source view.
150 else if( simdEnabled && rhsAligned ) {
151 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
152 assign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Unaligned views on both sides (presumably the final 'else' branch; its introducing line
// is not visible in this extract).
155 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
156 assign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Backend of the OpenMP-parallel assignment of a sparse matrix (rhs) to a dense matrix (lhs).
// The operands are cut into a grid of tiles and every loop iteration assigns one tile
// through unaligned submatrix views; no SIMD padding of the tile sizes is performed here.
// NOTE(review): this listing is an extract; the remaining template parameters, the braces
// and the definition of 'n' are not visible here.
180 template<
typename MT1
184 void smpAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const SparseMatrix<MT2,SO2>& rhs )
190 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// NOTE(review): 'threads' is a size_t here while the loop counter 'i' below is an int, so
// 'i<threads' compares signed with unsigned; the dense backend declares 'const int threads'.
192 const size_t threads( omp_get_num_threads() );
193 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: ceiling of rows / threadmap.first.
195 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
196 const size_t rowsPerThread( (~rhs).
rows() / threadmap.first + addon1 );
// Columns per tile: ceiling of columns / threadmap.second.
198 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
199 const size_t colsPerThread( (~rhs).
columns() / threadmap.second + addon2 );
// One iteration per counted thread; iterations are handed out dynamically in chunks of one
// and the loop has no implicit barrier at its end (nowait).
201 #pragma omp for schedule(dynamic,1) nowait
202 for(
int i=0; i<threads; ++i )
// Top-left corner of tile i within the threadmap.second-wide tile grid.
204 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
205 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Row extent of the tile, limited to the rows of the target remaining below 'row'.
// NOTE(review): 'n' (used below) is presumably the analogous column extent; its definition
// is not part of this extract.
210 const size_t m(
min( rowsPerThread, (~lhs).
rows() -
row ) );
// Assign the tile of the sparse operand to the matching unaligned view of the target.
213 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
214 assign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// smpAssign overload enabled when the target type MT1 is a dense matrix and at least one of
// MT1/MT2 is not SMP-assignable. It forwards directly to assign().
// NOTE(review): extract only; the remaining template parameters, the braces and any lines
// between the signature and the assign() call are not visible here.
239 template<
typename MT1
243 inline EnableIf_< And< IsDenseMatrix<MT1>
244 , Or< Not< IsSMPAssignable<MT1> >
245 , Not< IsSMPAssignable<MT2> > > > >
246 smpAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
253 assign( ~lhs, ~rhs );
// smpAssign overload enabled when the target type MT1 is a dense matrix and both MT1 and MT2
// are SMP-assignable. Two paths are visible: a plain assign() call and an OpenMP parallel
// region that runs smpAssign_backend().
// NOTE(review): the condition that selects between the two paths, the remaining template
// parameters and the braces are not visible in this extract.
277 template<
typename MT1
281 inline EnableIf_< And< IsDenseMatrix<MT1>, IsSMPAssignable<MT1>, IsSMPAssignable<MT2> > >
282 smpAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
// Non-parallel path.
295 assign( ~lhs, ~rhs );
// Parallel path: open a parallel region with both operands shared and let the backend
// distribute the work inside it.
298 #pragma omp parallel shared( lhs, rhs )
299 smpAssign_backend( ~lhs, ~rhs );
// Backend of the OpenMP-parallel addition assignment of a dense matrix (rhs) to a dense
// matrix (lhs). The operands are cut into a grid of tiles and every iteration of the
// work-sharing loop add-assigns one tile through submatrix views.
// NOTE(review): this listing is an extract; the remaining template parameters, the braces
// and the definition of 'n' are not visible here.
331 template<
typename MT1
335 void smpAddAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const DenseMatrix<MT2,SO2>& rhs )
// Element types of both operands and the aligned/unaligned submatrix view types of the target.
341 typedef ElementType_<MT1> ET1;
342 typedef ElementType_<MT2> ET2;
343 typedef SubmatrixExprTrait_<MT1,aligned> AlignedTarget;
344 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// SIMDSIZE is the 'size' value reported by SIMDTrait for the element type of the target.
346 enum :
size_t { SIMDSIZE = SIMDTrait< ElementType_<MT1> >
::size };
// SIMD is only considered when both matrix types enable it and the element types are identical
// (the plain assignment backend in this file tests AreSIMDCombinable instead).
348 const bool simdEnabled( MT1::simdEnabled && MT2::simdEnabled && IsSame<ET1,ET2>::value );
349 const bool lhsAligned ( (~lhs).isAligned() );
350 const bool rhsAligned ( (~rhs).isAligned() );
// Thread count as reported by OpenMP and the resulting tile grid: 'threadmap.first' divides
// the rows, 'threadmap.second' divides the columns (see their use below).
352 const int threads( omp_get_num_threads() );
353 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: ceiling of rows / threadmap.first ...
355 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
356 const size_t equalShare1( (~rhs).
rows() / threadmap.first + addon1 );
// ... and, with SIMD enabled, increased to the next multiple of SIMDSIZE.
// The bit mask yields the remainder only if SIMDSIZE is a power of two -- TODO confirm.
357 const size_t rest1 ( equalShare1 & ( SIMDSIZE - 1UL ) );
358 const size_t rowsPerThread( ( simdEnabled && rest1 )?( equalShare1 - rest1 + SIMDSIZE ):( equalShare1 ) );
// Columns per tile: same computation with threadmap.second.
360 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
361 const size_t equalShare2( (~rhs).
columns() / threadmap.second + addon2 );
362 const size_t rest2 ( equalShare2 & ( SIMDSIZE - 1UL ) );
363 const size_t colsPerThread( ( simdEnabled && rest2 )?( equalShare2 - rest2 + SIMDSIZE ):( equalShare2 ) );
// One iteration per counted thread; iterations are handed out dynamically in chunks of one
// and the loop has no implicit barrier at its end (nowait).
365 #pragma omp for schedule(dynamic,1) nowait
366 for(
int i=0; i<threads; ++i )
// Top-left corner of tile i within the threadmap.second-wide tile grid.
368 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
369 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Row extent of the tile, limited to the rows remaining below 'row'.
// NOTE(review): 'n' (used below) is presumably the analogous column extent; its definition
// is not part of this extract.
374 const size_t m(
min( rowsPerThread, (~rhs).
rows() -
row ) );
// Both operands aligned: aligned views on both sides.
377 if( simdEnabled && lhsAligned && rhsAligned ) {
378 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
379 addAssign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Only the target aligned: aligned target view, unaligned source view.
381 else if( simdEnabled && lhsAligned ) {
382 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
383 addAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Only the source aligned: unaligned target view, aligned source view.
385 else if( simdEnabled && rhsAligned ) {
386 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
387 addAssign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Unaligned views on both sides (presumably the final 'else' branch; its introducing line
// is not visible in this extract).
390 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
391 addAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Backend of the OpenMP-parallel addition assignment of a sparse matrix (rhs) to a dense
// matrix (lhs). The operands are cut into a grid of tiles and every loop iteration
// add-assigns one tile through unaligned submatrix views; no SIMD padding of the tile
// sizes is performed here.
// NOTE(review): this listing is an extract; the remaining template parameters, the braces
// and the definition of 'n' are not visible here.
415 template<
typename MT1
419 void smpAddAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const SparseMatrix<MT2,SO2>& rhs )
425 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// NOTE(review): 'threads' is a size_t here while the loop counter 'i' below is an int, so
// 'i<threads' compares signed with unsigned; the dense backend declares 'const int threads'.
427 const size_t threads( omp_get_num_threads() );
428 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: ceiling of rows / threadmap.first.
430 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
431 const size_t rowsPerThread( (~rhs).
rows() / threadmap.first + addon1 );
// Columns per tile: ceiling of columns / threadmap.second.
433 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
434 const size_t colsPerThread( (~rhs).
columns() / threadmap.second + addon2 );
// One iteration per counted thread; iterations are handed out dynamically in chunks of one
// and the loop has no implicit barrier at its end (nowait).
436 #pragma omp for schedule(dynamic,1) nowait
437 for(
int i=0; i<threads; ++i )
// Top-left corner of tile i within the threadmap.second-wide tile grid.
439 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
440 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Row extent of the tile, limited to the rows of the target remaining below 'row'.
// NOTE(review): 'n' (used below) is presumably the analogous column extent; its definition
// is not part of this extract.
445 const size_t m(
min( rowsPerThread, (~lhs).
rows() -
row ) );
// Add the tile of the sparse operand to the matching unaligned view of the target.
448 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
449 addAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// smpAddAssign overload enabled when the target type MT1 is a dense matrix and at least one
// of MT1/MT2 is not SMP-assignable. It forwards directly to addAssign().
// NOTE(review): extract only; the remaining template parameters, the braces and any lines
// between the signature and the addAssign() call are not visible here.
474 template<
typename MT1
478 inline EnableIf_< And< IsDenseMatrix<MT1>
479 , Or< Not< IsSMPAssignable<MT1> >
480 , Not< IsSMPAssignable<MT2> > > > >
481 smpAddAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
488 addAssign( ~lhs, ~rhs );
// smpAddAssign overload enabled when the target type MT1 is a dense matrix and both MT1 and
// MT2 are SMP-assignable. Two paths are visible: a plain addAssign() call and an OpenMP
// parallel region that runs smpAddAssign_backend().
// NOTE(review): the condition that selects between the two paths, the remaining template
// parameters and the braces are not visible in this extract.
512 template<
typename MT1
516 inline EnableIf_< And< IsDenseMatrix<MT1>, IsSMPAssignable<MT1>, IsSMPAssignable<MT2> > >
517 smpAddAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
// Non-parallel path.
530 addAssign( ~lhs, ~rhs );
// Parallel path: open a parallel region with both operands shared and let the backend
// distribute the work inside it.
533 #pragma omp parallel shared( lhs, rhs )
534 smpAddAssign_backend( ~lhs, ~rhs );
// Backend of the OpenMP-parallel subtraction assignment of a dense matrix (rhs) to a dense
// matrix (lhs). The operands are cut into a grid of tiles and every iteration of the
// work-sharing loop sub-assigns one tile through submatrix views.
// NOTE(review): this listing is an extract; the remaining template parameters, the braces
// and the definition of 'n' are not visible here.
566 template<
typename MT1
570 void smpSubAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const DenseMatrix<MT2,SO2>& rhs )
// Element types of both operands and the aligned/unaligned submatrix view types of the target.
576 typedef ElementType_<MT1> ET1;
577 typedef ElementType_<MT2> ET2;
578 typedef SubmatrixExprTrait_<MT1,aligned> AlignedTarget;
579 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// SIMDSIZE is the 'size' value reported by SIMDTrait for the element type of the target.
581 enum :
size_t { SIMDSIZE = SIMDTrait< ElementType_<MT1> >
::size };
// SIMD is only considered when both matrix types enable it and the element types are identical
// (the plain assignment backend in this file tests AreSIMDCombinable instead).
583 const bool simdEnabled( MT1::simdEnabled && MT2::simdEnabled && IsSame<ET1,ET2>::value );
584 const bool lhsAligned ( (~lhs).isAligned() );
585 const bool rhsAligned ( (~rhs).isAligned() );
// Thread count as reported by OpenMP and the resulting tile grid: 'threadmap.first' divides
// the rows, 'threadmap.second' divides the columns (see their use below).
587 const int threads( omp_get_num_threads() );
588 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: ceiling of rows / threadmap.first ...
590 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
591 const size_t equalShare1( (~rhs).
rows() / threadmap.first + addon1 );
// ... and, with SIMD enabled, increased to the next multiple of SIMDSIZE.
// The bit mask yields the remainder only if SIMDSIZE is a power of two -- TODO confirm.
592 const size_t rest1 ( equalShare1 & ( SIMDSIZE - 1UL ) );
593 const size_t rowsPerThread( ( simdEnabled && rest1 )?( equalShare1 - rest1 + SIMDSIZE ):( equalShare1 ) );
// Columns per tile: same computation with threadmap.second.
595 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
596 const size_t equalShare2( (~rhs).
columns() / threadmap.second + addon2 );
597 const size_t rest2 ( equalShare2 & ( SIMDSIZE - 1UL ) );
598 const size_t colsPerThread( ( simdEnabled && rest2 )?( equalShare2 - rest2 + SIMDSIZE ):( equalShare2 ) );
// One iteration per counted thread; iterations are handed out dynamically in chunks of one
// and the loop has no implicit barrier at its end (nowait).
600 #pragma omp for schedule(dynamic,1) nowait
601 for(
int i=0; i<threads; ++i )
// Top-left corner of tile i within the threadmap.second-wide tile grid.
603 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
604 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Row extent of the tile, limited to the rows remaining below 'row'.
// NOTE(review): 'n' (used below) is presumably the analogous column extent; its definition
// is not part of this extract.
609 const size_t m(
min( rowsPerThread, (~rhs).
rows() -
row ) );
// Both operands aligned: aligned views on both sides.
612 if( simdEnabled && lhsAligned && rhsAligned ) {
613 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
614 subAssign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Only the target aligned: aligned target view, unaligned source view.
616 else if( simdEnabled && lhsAligned ) {
617 AlignedTarget target( submatrix<aligned>( ~lhs,
row,
column, m, n ) );
618 subAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Only the source aligned: unaligned target view, aligned source view.
620 else if( simdEnabled && rhsAligned ) {
621 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
622 subAssign( target, submatrix<aligned>( ~rhs,
row,
column, m, n ) );
// Unaligned views on both sides (presumably the final 'else' branch; its introducing line
// is not visible in this extract).
625 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
626 subAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// Backend of the OpenMP-parallel subtraction assignment of a sparse matrix (rhs) to a dense
// matrix (lhs). The operands are cut into a grid of tiles and every loop iteration
// sub-assigns one tile through unaligned submatrix views; no SIMD padding of the tile
// sizes is performed here.
// NOTE(review): this listing is an extract; the remaining template parameters, the braces
// and the definition of 'n' are not visible here.
651 template<
typename MT1
655 void smpSubAssign_backend( DenseMatrix<MT1,SO1>& lhs,
const SparseMatrix<MT2,SO2>& rhs )
661 typedef SubmatrixExprTrait_<MT1,unaligned> UnalignedTarget;
// NOTE(review): 'threads' is a size_t here while the loop counter 'i' below is an int, so
// 'i<threads' compares signed with unsigned; the dense backend declares 'const int threads'.
663 const size_t threads( omp_get_num_threads() );
664 const ThreadMapping threadmap( createThreadMapping( threads, ~rhs ) );
// Rows per tile: ceiling of rows / threadmap.first.
666 const size_t addon1 ( ( ( (~rhs).
rows() % threadmap.first ) != 0UL )? 1UL : 0UL );
667 const size_t rowsPerThread( (~rhs).
rows() / threadmap.first + addon1 );
// Columns per tile: ceiling of columns / threadmap.second.
669 const size_t addon2 ( ( ( (~rhs).
columns() % threadmap.second ) != 0UL )? 1UL : 0UL );
670 const size_t colsPerThread( (~rhs).
columns() / threadmap.second + addon2 );
// One iteration per counted thread; iterations are handed out dynamically in chunks of one
// and the loop has no implicit barrier at its end (nowait).
672 #pragma omp for schedule(dynamic,1) nowait
673 for(
int i=0; i<threads; ++i )
// Top-left corner of tile i within the threadmap.second-wide tile grid.
675 const size_t row ( ( i / threadmap.second ) * rowsPerThread );
676 const size_t column( ( i % threadmap.second ) * colsPerThread );
// Row extent of the tile, limited to the rows of the target remaining below 'row'.
// NOTE(review): 'n' (used below) is presumably the analogous column extent; its definition
// is not part of this extract.
681 const size_t m(
min( rowsPerThread, (~lhs).
rows() -
row ) );
// Subtract the tile of the sparse operand from the matching unaligned view of the target.
684 UnalignedTarget target( submatrix<unaligned>( ~lhs,
row,
column, m, n ) );
685 subAssign( target, submatrix<unaligned>( ~rhs,
row,
column, m, n ) );
// smpSubAssign overload enabled when the target type MT1 is a dense matrix and at least one
// of MT1/MT2 is not SMP-assignable. It forwards directly to subAssign().
// NOTE(review): extract only; the remaining template parameters, the braces and any lines
// between the signature and the subAssign() call are not visible here.
710 template<
typename MT1
714 inline EnableIf_< And< IsDenseMatrix<MT1>
715 , Or< Not< IsSMPAssignable<MT1> >
716 , Not< IsSMPAssignable<MT2> > > > >
717 smpSubAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
724 subAssign( ~lhs, ~rhs );
// smpSubAssign overload enabled when the target type MT1 is a dense matrix and both MT1 and
// MT2 are SMP-assignable. Two paths are visible: a plain subAssign() call and an OpenMP
// parallel region that runs smpSubAssign_backend().
// NOTE(review): the condition that selects between the two paths, the remaining template
// parameters and the braces are not visible in this extract.
748 template<
typename MT1
752 inline EnableIf_< And< IsDenseMatrix<MT1>, IsSMPAssignable<MT1>, IsSMPAssignable<MT2> > >
753 smpSubAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
// Non-parallel path.
766 subAssign( ~lhs, ~rhs );
// Parallel path: open a parallel region with both operands shared and let the backend
// distribute the work inside it.
769 #pragma omp parallel shared( lhs, rhs )
770 smpSubAssign_backend( ~lhs, ~rhs );
// smpMultAssign for dense matrix targets (enabled whenever MT1 is a dense matrix).
// No parallel path is visible: the call is forwarded to multAssign().
// NOTE(review): extract only; the remaining template parameters, the braces and any lines
// between the signature and the multAssign() call are not visible here.
802 template<
typename MT1
806 inline EnableIf_< IsDenseMatrix<MT1> >
807 smpMultAssign( Matrix<MT1,SO1>& lhs,
const Matrix<MT2,SO2>& rhs )
814 multAssign( ~lhs, ~rhs );
Header file for the implementation of the Submatrix view.
Header file for auxiliary alias declarations.
Header file for mathematical functions.
Header file for the alignment flag values.
Header file for basic type definitions.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:160
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
Header file for the IsSame and IsStrictlySame type traits.
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Header file for the SIMD trait.
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the SparseMatrix base class.
Header file for the SMP thread mapping functionality.
Header file for the matrix storage order types.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SMP_ASSIGNABLE(T)
Constraint on the data type. In case the given data type T is SMP-assignable (can be assigned by multi...
Definition: SMPAssignable.h:81
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
System settings for the shared-memory parallelization.
Header file for the IsSMPAssignable type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
Header file for the DenseMatrix base class.
Header file for the Not class template.
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:330
Header file for the serial section implementation.
Header file for the parallel section implementation.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
#define BLAZE_PARALLEL_SECTION
Section for the debugging of the shared-memory parallelization. During the shared-memory parallel (SMP...
Definition: ParallelSection.h:246
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
bool isSerialSectionActive()
Returns whether a serial section is active or not.
Definition: SerialSection.h:213
Header file for the SubmatrixExprTrait class template.
Header file for run time assertion macros.
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:314
Header file for the AreSIMDCombinable type trait.
bool isParallelSectionActive()
Returns whether a parallel section is active or not.
Definition: ParallelSection.h:213
#define BLAZE_OPENMP_PARALLEL_MODE
Compilation switch for the OpenMP parallelization. This compilation switch enables/disables the OpenMP...
Definition: SMP.h:67
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
#define BLAZE_STATIC_ASSERT(expr)
Compile time assertion macro. In case of an invalid compile time expression, a compilation error is cr...
Definition: StaticAssert.h:112
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the data type.
Header file for the FunctionTrace class.