- edited description
CloverLeaf_OpenACC
Issue #8
new
Get the src
- ssh -Y swan
- cd /lus/scratch/$USER/
- git clone https://github.com/Warwick-PCAV/CloverLeaf_OpenACC_KERNELS.git
- cd CloverLeaf_OpenACC_KERNELS/
Compile
cp ~piccinal/CloverLeaf/CloverLeaf_OpenACC_KERNELS/CloverLeaf_OpenACC_KERNELS/Makefile Makefile
CCE
- module load PrgEnv-cray # cce/832
- module load craype-accel-nvidia35
- module load perftools/6.2.0
- make clean
make \
MPI_COMPILER=ftn \
C_MPI_COMPILER=cc \
COMPILER=CRAY \
NV_ARCH=KEPLER \
CFLAGS_CRAY=-em \
FLAGS_CRAY="-em -h acc_model=fast_addr:no_deep_copy:auto_async_all"
PGI
module swap PrgEnv-cray PrgEnv-pgi
module swap craype/2.05 craype/2.2.0
module swap pgi /apps/daint/scorep/mf/pgi/1470
module swap cray-mpich/6.2.2 cray-mpich/7.0.3
module load craype-accel-nvidia35
module rm libsci_acc
make clean
make \
MPI_COMPILER=ftn \
C_MPI_COMPILER=cc \
COMPILER=PGI \
NV_ARCH=KEPLER \
CFLAGS_PGI="-mp=nonuma -acc -ta=nvidia:cc35" \
FLAGS_PGI="-mp=nonuma -acc -ta=nvidia:cc35"
- grep end_time clover.in
end_time=200.0
Run
- qsub -I -V -l mppwidth=1 -q gpu_nodes
- cd /lus/scratch/p01991/CloverLeaf_OpenACC_KERNELS/
- aprun -n1 ./clover_leaf+pat
Step 5001 time 200.0000000 control sound timestep 4.00E-02
1, 1 x 5.00E-01 y 5.00E-01
Test problem 1 is within 99.892471051684851 % of the expected solution
This test is considered NOT PASSED (OK)
Wall clock 50.109227895736694
First step overhead 3.726959228515625E-3
Comments (18)
-
reporter -
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter -
Can CrayPat tell me how much time is spent on the GPU, or at least the CPU/GPU ratio? => Do not mix openacc and cuda when running pat_build; then sum the % in openacc.
-
How to compile OpenACC with CCE (conflict between GNU/scorep --cuda & CCE/scorep --mpi) ?
-
-
reporter - changed title to CloverLeaf_OpenACC
-
reporter - edited description
-
reporter - edited description
-
reporter SCOREP/1.3
PGI
Compile
make \
MPI_COMPILER="scorep --mpp=mpi --thread=omp --cuda ftn" \
C_MPI_COMPILER="scorep --mpp=mpi --thread=omp --cuda cc" \
COMPILER=PGI \
NV_ARCH=KEPLER \
CFLAGS_PGI="-mp=nonuma -acc -ta=nvidia:cc35" \
FLAGS_PGI="-mp=nonuma -acc -ta=nvidia:cc35"
Run
salloc -N4
export SCOREP_ENABLE_PROFILING=false
export SCOREP_ENABLE_TRACING=true
export SCOREP_CUDA_ENABLE=yes,flushatexit
export SCOREP_TOTAL_MEMORY=1G
export OMP_NUM_THREADS=4 ## not needed, there is no openmp
module swap PrgEnv-cray PrgEnv-pgi
module swap craype/2.05 craype/2.2.0
module swap pgi /apps/daint/scorep/mf/pgi/1470
module swap cray-mpich/6.2.2 cray-mpich/7.0.3
module load craype-accel-nvidia35
module rm libsci_acc
module load scorep/1.3
aprun -n4 -N1 -d1 PGI.SANTIS.MPIACC
-
Ignoring: [Score-P] src/adapters/cuda/scorep_cupti4_activity.c:594: Warning: [CUPTI Activity] Reached maximum CUDA buffer size for context 9508288
-
Ignoring: [Score-P] src/adapters/cuda/scorep_cupti4_activity.c:253: Warning: [CUPTI Activity] Destroying buffer which is currently in use (8599504, 0, 1) ===> set flushatexit
Analyse
vampir83 scorep-*/traces.otf2
-
-
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter craypat/620
- pat_report *.xf
pat_build -g mpi,omp,io,oacc clover_leaf (OK)
Table 1: Profile by Function Group and Function

Time% | Time | Imb. | Imb. | Calls |Group
| | Time | Time% | | Function
| | | | | Thread=HIDE

100.0% | 34.427764 | -- | -- | 84263863.0 |Total
|------------------------------------------------------------------------
| 85.4% | 29.386012 | -- | -- | 1886589.0 |USER
||-----------------------------------------------------------------------
|| 5.4% | 1.859823 | -- | -- | 115033.0 |clover_exchange_message$clover_module_.ACC_SYNC_WAIT@li.504
|| 2.0% | 0.674976 | -- | -- | 45006.0 |advec_mom_kernel$advec_mom_kernel_mod_
|| 1.6% | 0.566926 | -- | -- | 1.0 |hydro_cycle$hydro_cycle_module_.ACC_DATA_REGION@li.133
|| 1.4% | 0.491311 | -- | -- | 15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.143
|| 1.3% | 0.456587 | -- | -- | 15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.154
|| 1.3% | 0.452998 | -- | -- | 15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.248
|| 1.3% | 0.450229 | -- | -- | 15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.270
|| 1.3% | 0.449144 | -- | -- | 15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.176
|| 1.3% | 0.448298 | -- | -- | 15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.259
|| 1.3% | 0.447358 | -- | -- | 15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.165
|| 1.3% | 0.445770 | -- | -- | 15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.237
|| 1.2% | 0.413044 | -- | -- | 30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_DATA_REGION@li.86
|| 1.1% | 0.386894 | -- | -- | 10505.0 |ideal_gas_kernel$ideal_gas_kernel_module_.ACC_KERNEL@li.48
|| 1.1% | 0.369279 | -- | -- | 30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_ASYNC_COPY@li.798
|| 1.0% | 0.354926 | -- | -- | 10002.0 |calc_dt$calc_dt_module_.ACC_KERNEL@li.50
|| 1.0% | 0.349611 | -- | -- | 30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_ASYNC_COPY@li.86
|| 1.0% | 0.339757 | -- | -- | 20002.0 |advec_cell_kernel$advec_cell_kernel_module_
|| 1.0% | 0.332413 | -- | -- | 10002.0 |advec_mom_kernel$advec_mom_kernel_mod_.ACC_KERNEL@li.231
||=======================================================================
| 14.3% | 4.938361 | -- | -- | 82112158.0 |ETC
||-----------------------------------------------------------------------
|| 11.3% | 3.907472 | 1.953664 | 100.0% | 73537807.0 |==LO_MEMORY== libcuda.so.1
|| 1.2% | 0.398469 | -- | -- | 7361292.0 |cuptiGetTimestamp
|========================================================================
% of OpenACC
grep ACC_ in |awk '{s=s+$1}END{print s}'
- 23.9%
Meaning of LO_MEMORY
pat_help FAQ "Processing Data with pat_report" (.=quit ,=back ^=up /=top ~=search) => 1
==LO_MEMORY== is used for any address that can not be mapped to a function address. These are typically below the lowest address for any global symbol in the symbol table. There are several reasons why this can happen:
1. The instrumented program was rebuilt after the data files were created but before the report was generated.
2. The -i option of pat_report was used to designate a program that is not the same as the one that created the data files.
3. An optimized frame contains an unexpected value at the offset normally reserved for a return address.
4. The program executes code that is not represented by a global symbol in the symbol table.
5. The program executes code that is dynamically loaded at addresses above the end of the static text segment.
pat_build -g mpi,omp,io,cuda clover_leaf (WRONG)
- aprun -n2 -N1 ./clover_leaf+pat
Table 1: Profile by Function Group and Function

Time% | Time | Imb. | Imb. | Calls |Group
| | Time | Time% | | Function
| | | | | PE=HIDE
| | | | | Thread=HIDE

100.0% | 61.949031 | -- | -- | 146317173.5 |Total
|------------------------------------------------------------------------
| 82.0% | 50.775036 | -- | -- | 142439872.5 |ETC
||-----------------------------------------------------------------------
|| 24.6% | 15.221305 | 0.074233 | 1.0% | 39678861.0 |__cray_acc_hw_start_kernel
|| 19.3% | 11.975088 | 0.035921 | 0.6% | 29338028.0 |_ZL12end_acc_timeP11CUstream_stPmP17cray_acc_location.part.3
|| 8.5% | 5.248636 | 0.004887 | 0.2% | 15844897.0 |__cray_acc_hw_start_async_tracking
|| 7.5% | 4.656549 | 0.009852 | 0.4% | 16421220.0 |_ZL17complete_trackingP17cray_acc_location.part.2
|| 7.0% | 4.306423 | 0.006287 | 0.3% | 15507789.0 |__cray_acc_hw_end_async_tracking
|| 6.3% | 3.874195 | 0.008674 | 0.4% | 11729831.0 |start_acc_time
|| 1.8% | 1.134196 | 0.003694 | 0.6% | 3240918.0 |__cray_acc_hw_synchronize
|| 1.7% | 1.081438 | 0.009005 | 1.7% | 2707153.0 |__cray_acc_hw_copy_acc_to_host
|| 1.7% | 1.034765 | 0.001789 | 0.3% | 2677813.0 |__cray_acc_hw_copy_host_to_acc
|| 1.5% | 0.947850 | 0.001436 | 0.3% | 2530965.0 |__cray_acc_hw_wait
||=======================================================================
| 14.6% | 9.025409 | -- | -- | 3382018.0 |USER
||-----------------------------------------------------------------------
|| 1.3% | 0.826430 | 0.007207 | 1.7% | 345099.0 |clover_exchange_message$clover_module_.ACC_ASYNC_COPY@li.553
|| 1.3% | 0.808651 | 0.007495 | 1.8% | 345099.0 |clover_exchange_message$clover_module_.ACC_ASYNC_COPY@li.504
|| 1.3% | 0.798432 | 0.000642 | 0.2% | 230066.0 |clover_exchange_message$clover_module_.ACC_SYNC_WAIT@li.504
|| 1.2% | 0.769508 | 0.128373 | 28.6% | 1.0 |hydro_cycle$hydro_cycle_module_.ACC_DATA_REGION@li.133
||=======================================================================
| 2.5% | 1.530344 | -- | -- | 477658.0 |MPI
||-----------------------------------------------------------------------
| 1.5% | 0.952014 | 0.209916 | 36.1% | 230066.0 | mpi_waitall_
|========================================================================
pat_build -g mpi,omp,io,oacc,cuda clover_leaf (WRONG)
- aprun -n1 ./clover_leaf+pat
- pat_report *.xf
Table 1: Profile by Function Group and Function

Time% | Time | Imb. | Imb. | Calls |Group
| | Time | Time% | | Function
| | | | | Thread=HIDE

100.0% | 38.938440 | -- | -- | 87413436.0 |Total
|------------------------------------------------------------------------
| 85.8% | 33.411758 | -- | -- | 85261731.0 |ETC
||-----------------------------------------------------------------------
|| 34.5% | 13.450064 | -- | -- | 34141422.0 |__cray_acc_hw_start_kernel
|| 26.8% | 10.446550 | -- | -- | 24553294.0 |_ZL12end_acc_timeP11CUstream_stPmP17cray_acc_location.part.3
|| 9.4% | 3.649009 | -- | -- | 10112879.0 |start_acc_time
|| 3.6% | 1.417962 | -- | -- | 3885636.0 |__cray_acc_hw_start_async_tracking
|| 3.3% | 1.288070 | -- | -- | 3997656.0 |_ZL17complete_trackingP17cray_acc_location.part.2
|| 2.9% | 1.114773 | -- | -- | 3775584.0 |__cray_acc_hw_end_async_tracking
|| 2.8% | 1.077196 | -- | -- | 3240918.0 |__cray_acc_hw_synchronize
||=======================================================================
| 13.9% | 5.424821 | -- | -- | 1886589.0 |USER
||-----------------------------------------------------------------------
|| 1.4% | 0.551133 | -- | -- | 1.0 |hydro_cycle$hydro_cycle_module_.ACC_DATA_REGION@li.133
|| 1.1% | 0.441128 | -- | -- | 30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_DATA_REGION@li.86
|========================================================================
pat_build -w clover_leaf
- aprun -n2 -N1 ./clover_leaf+pat
Table 1: Profile by Function Group and Function

Time% | Time | Imb. | Imb. | Calls |Group
| | Time | Time% | | Function
| | | | | PE=HIDE
| | | | | Thread=HIDE

100.0% | 34.540779 | -- | -- | 3382025.0 |Total
|------------------------------------------------------------------------
| 99.0% | 34.206815 | -- | -- | 3382018.0 |USER
||-----------------------------------------------------------------------
|| 11.1% | 3.846683 | 0.006851 | 0.4% | 230066.0 |clover_exchange_message$clover_module_.ACC_SYNC_WAIT@li.504
|| 8.0% | 2.748113 | 0.005144 | 0.4% | 115033.0 |clover_exchange_message$clover_module_.ACC_KERNEL@li.553
|| 7.5% | 2.573926 | 0.011230 | 0.9% | 115033.0 |clover_exchange_message$clover_module_.ACC_KERNEL@li.504
|| 7.1% | 2.448125 | 0.012814 | 1.0% | 345099.0 |clover_exchange_message$clover_module_.ACC_ASYNC_COPY@li.553
|| 6.7% | 2.297045 | 0.002926 | 0.3% | 345099.0 |clover_exchange_message$clover_module_.ACC_ASYNC_COPY@li.504
|| 5.0% | 1.744293 | 0.009789 | 1.1% | 1.0 |hydro_cycle$hydro_cycle_module_.ACC_DATA_REGION@li.133
|| 1.5% | 0.511979 | 0.000053 | 0.0% | 45006.0 |advec_mom_kernel$advec_mom_kernel_mod_
|| 1.0% | 0.342088 | 0.001191 | 0.7% | 15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.143
|| 1.0% | 0.340171 | 0.000349 | 0.2% | 30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_DATA_REGION@li.86
|| 1.0% | 0.339613 | 0.001111 | 0.7% | 115033.0 |clover_exchange_message$clover_module_.ACC_DATA_REGION@li.553
|========================================================================
- Log in to comment