
Issue #8 new
jg piccinali repo owner created an issue

Get the src


cp ~piccinal/CloverLeaf/CloverLeaf_OpenACC_KERNELS/CloverLeaf_OpenACC_KERNELS/Makefile Makefile


  • module load PrgEnv-cray # cce/832
  • module load craype-accel-nvidia35
  • module load perftools/6.2.0
  • make clean
make \
FLAGS_CRAY="-em -h acc_model=fast_addr:no_deep_copy:auto_async_all"


module swap PrgEnv-cray  PrgEnv-pgi
module swap craype/2.05 craype/2.2.0
module swap pgi /apps/daint/scorep/mf/pgi/1470
module swap cray-mpich/6.2.2 cray-mpich/7.0.3
module load  craype-accel-nvidia35
module rm    libsci_acc
make clean
make \
CFLAGS_PGI="-mp=nonuma -acc -ta=nvidia:cc35" \
FLAGS_PGI="-mp=nonuma -acc -ta=nvidia:cc35"
  • grep end_time


  • qsub -I -V -l mppwidth=1 -q gpu_nodes
  • cd /lus/scratch/p01991/CloverLeaf_OpenACC_KERNELS/
  • aprun -n1 ./clover_leaf+pat
 Step    5001 time 200.0000000 control    sound    timestep   4.00E-02
       1,       1 x  5.00E-01 y  5.00E-01
 Test problem 1 is within 99.892471051684851 % of the expected solution
 This test is considered NOT PASSED    (OK)
 Wall clock  50.109227895736694
 First step overhead 3.726959228515625E-3

Comments (18)

  1. jg piccinali reporter
    • Can CrayPat tell me how much time is spent on the GPU, or at least the CPU/GPU ratio? => Do not mix OpenACC and CUDA tracing when running pat_build; then sum the Time% of the OpenACC entries.

    • How to compile OpenACC with CCE (conflict between GNU/scorep --cuda & CCE/scorep --mpi) ?

  2. jg piccinali reporter




    make \
    MPI_COMPILER="scorep --mpp=mpi --thread=omp --cuda ftn" \
    C_MPI_COMPILER="scorep --mpp=mpi --thread=omp --cuda cc" \
    CFLAGS_PGI="-mp=nonuma -acc -ta=nvidia:cc35" \
    FLAGS_PGI="-mp=nonuma -acc -ta=nvidia:cc35"


    salloc -N4
    export SCOREP_CUDA_ENABLE=yes,flushatexit
    export OMP_NUM_THREADS=4  ## not needed, there is no openmp
    module swap PrgEnv-cray  PrgEnv-pgi
    module swap craype/2.05 craype/2.2.0
    module swap pgi /apps/daint/scorep/mf/pgi/1470
    module swap cray-mpich/6.2.2 cray-mpich/7.0.3
    module load  craype-accel-nvidia35
    module rm    libsci_acc
    module load scorep/1.3
    aprun -n4 -N1 -d1 PGI.SANTIS.MPIACC
    • Ignoring: [Score-P] src/adapters/cuda/scorep_cupti4_activity.c:594: Warning: [CUPTI Activity] Reached maximum CUDA buffer size for context 9508288

    • Ignoring: [Score-P] src/adapters/cuda/scorep_cupti4_activity.c:253: Warning: [CUPTI Activity] Destroying buffer which is currently in use (8599504, 0, 1) ===> set flushatexit


    vampir83 scorep-*/traces.otf2

    250914-13-54-17.png 250914-13-53-05.png

  3. jg piccinali reporter


    • pat_report *.xf

    pat_build -g mpi,omp,io,oacc clover_leaf (OK)

    Table 1:  Profile by Function Group and Function                                                                                                                                
      Time% |      Time |     Imb. |   Imb. |      Calls |Group
            |           |     Time |  Time% |            | Function
            |           |          |        |            |  Thread=HIDE
     100.0% | 34.427764 |       -- |     -- | 84263863.0 |Total
    |  85.4% | 29.386012 |       -- |     -- |  1886589.0 |USER
    ||   5.4% |  1.859823 |       -- |     -- |   115033.0 |clover_exchange_message$clover_module_.ACC_SYNC_WAIT@li.504
    ||   2.0% |  0.674976 |       -- |     -- |    45006.0 |advec_mom_kernel$advec_mom_kernel_mod_
    ||   1.6% |  0.566926 |       -- |     -- |        1.0 |hydro_cycle$hydro_cycle_module_.ACC_DATA_REGION@li.133
    ||   1.4% |  0.491311 |       -- |     -- |    15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.143
    ||   1.3% |  0.456587 |       -- |     -- |    15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.154
    ||   1.3% |  0.452998 |       -- |     -- |    15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.248
    ||   1.3% |  0.450229 |       -- |     -- |    15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.270
    ||   1.3% |  0.449144 |       -- |     -- |    15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.176
    ||   1.3% |  0.448298 |       -- |     -- |    15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.259
    ||   1.3% |  0.447358 |       -- |     -- |    15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.165
    ||   1.3% |  0.445770 |       -- |     -- |    15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.237
    ||   1.2% |  0.413044 |       -- |     -- |    30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_DATA_REGION@li.86
    ||   1.1% |  0.386894 |       -- |     -- |    10505.0 |ideal_gas_kernel$ideal_gas_kernel_module_.ACC_KERNEL@li.48
    ||   1.1% |  0.369279 |       -- |     -- |    30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_ASYNC_COPY@li.798
    ||   1.0% |  0.354926 |       -- |     -- |    10002.0 |calc_dt$calc_dt_module_.ACC_KERNEL@li.50
    ||   1.0% |  0.349611 |       -- |     -- |    30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_ASYNC_COPY@li.86
    ||   1.0% |  0.339757 |       -- |     -- |    20002.0 |advec_cell_kernel$advec_cell_kernel_module_
    ||   1.0% |  0.332413 |       -- |     -- |    10002.0 |advec_mom_kernel$advec_mom_kernel_mod_.ACC_KERNEL@li.231
    |  14.3% |  4.938361 |       -- |     -- | 82112158.0 |ETC
    ||  11.3% |  3.907472 | 1.953664 | 100.0% | 73537807.0 |==LO_MEMORY==
    ||   1.2% |  0.398469 |       -- |     -- |  7361292.0 |cuptiGetTimestamp
    Percentage of time spent in OpenACC regions (sum of the Time% column over the ACC_ rows):
    grep ACC_ in |awk '{s=s+$1}END{print s}'
    • 23.9%
    Meaning of ==LO_MEMORY==:
    pat_help FAQ "Processing Data with pat_report"
    (.=quit ,=back ^=up /=top ~=search) => 1
         ==LO_MEMORY== is used for any address that can not be mapped to a
         function address.  These are typically below the lowest address for
         any global symbol in the symbol table.
         There are several reasons why this can happen:
         1. The instrumented program was rebuilt after the data files were
         created but before the report was generated.
         2. The -i option of pat_report was used to designate a program that
         is not the same as the one that created the data files.
         3. An optimized frame contains an unexpected value at the offset
         normally reserved for a return address.
         4. The program executes code that is not represented by a global
         symbol in the symbol table.
         5. The program executes code that is dynamically loaded at addresses
         above the end of the static text segment.

    pat_build -g mpi,omp,io,cuda clover_leaf (WRONG)

    • aprun -n2 -N1 ./clover_leaf+pat
    Table 1:  Profile by Function Group and Function
      Time% |      Time |     Imb. |  Imb. |       Calls |Group
            |           |     Time | Time% |             | Function
            |           |          |       |             |  PE=HIDE
            |           |          |       |             |   Thread=HIDE
     100.0% | 61.949031 |       -- |    -- | 146317173.5 |Total
    |  82.0% | 50.775036 |       -- |    -- | 142439872.5 |ETC
    ||  24.6% | 15.221305 | 0.074233 |  1.0% |  39678861.0 |__cray_acc_hw_start_kernel
    ||  19.3% | 11.975088 | 0.035921 |  0.6% |  29338028.0 |_ZL12end_acc_timeP11CUstream_stPmP17cray_acc_location.part.3
    ||   8.5% |  5.248636 | 0.004887 |  0.2% |  15844897.0 |__cray_acc_hw_start_async_tracking
    ||   7.5% |  4.656549 | 0.009852 |  0.4% |  16421220.0 |_ZL17complete_trackingP17cray_acc_location.part.2
    ||   7.0% |  4.306423 | 0.006287 |  0.3% |  15507789.0 |__cray_acc_hw_end_async_tracking
    ||   6.3% |  3.874195 | 0.008674 |  0.4% |  11729831.0 |start_acc_time
    ||   1.8% |  1.134196 | 0.003694 |  0.6% |   3240918.0 |__cray_acc_hw_synchronize
    ||   1.7% |  1.081438 | 0.009005 |  1.7% |   2707153.0 |__cray_acc_hw_copy_acc_to_host
    ||   1.7% |  1.034765 | 0.001789 |  0.3% |   2677813.0 |__cray_acc_hw_copy_host_to_acc
    ||   1.5% |  0.947850 | 0.001436 |  0.3% |   2530965.0 |__cray_acc_hw_wait
    |  14.6% |  9.025409 |       -- |    -- |   3382018.0 |USER
    ||   1.3% |  0.826430 | 0.007207 |  1.7% |    345099.0 |clover_exchange_message$clover_module_.ACC_ASYNC_COPY@li.553
    ||   1.3% |  0.808651 | 0.007495 |  1.8% |    345099.0 |clover_exchange_message$clover_module_.ACC_ASYNC_COPY@li.504
    ||   1.3% |  0.798432 | 0.000642 |  0.2% |    230066.0 |clover_exchange_message$clover_module_.ACC_SYNC_WAIT@li.504
    ||   1.2% |  0.769508 | 0.128373 | 28.6% |         1.0 |hydro_cycle$hydro_cycle_module_.ACC_DATA_REGION@li.133
    |   2.5% |  1.530344 |       -- |    -- |    477658.0 |MPI
    |   1.5% |  0.952014 | 0.209916 | 36.1% |    230066.0 | mpi_waitall_

    pat_build -g mpi,omp,io,oacc,cuda clover_leaf (WRONG)

    • aprun -n1 ./clover_leaf+pat
    • pat_report *.xf
    Table 1:  Profile by Function Group and Function
      Time% |      Time | Imb. |  Imb. |      Calls |Group
            |           | Time | Time% |            | Function
            |           |      |       |            |  Thread=HIDE
     100.0% | 38.938440 |   -- |    -- | 87413436.0 |Total
    |  85.8% | 33.411758 |   -- |    -- | 85261731.0 |ETC
    ||  34.5% | 13.450064 |   -- |    -- | 34141422.0 |__cray_acc_hw_start_kernel
    ||  26.8% | 10.446550 |   -- |    -- | 24553294.0 |_ZL12end_acc_timeP11CUstream_stPmP17cray_acc_location.part.3
    ||   9.4% |  3.649009 |   -- |    -- | 10112879.0 |start_acc_time
    ||   3.6% |  1.417962 |   -- |    -- |  3885636.0 |__cray_acc_hw_start_async_tracking
    ||   3.3% |  1.288070 |   -- |    -- |  3997656.0 |_ZL17complete_trackingP17cray_acc_location.part.2
    ||   2.9% |  1.114773 |   -- |    -- |  3775584.0 |__cray_acc_hw_end_async_tracking
    ||   2.8% |  1.077196 |   -- |    -- |  3240918.0 |__cray_acc_hw_synchronize
    |  13.9% |  5.424821 |   -- |    -- |  1886589.0 |USER
    ||   1.4% |  0.551133 |   -- |    -- |        1.0 |hydro_cycle$hydro_cycle_module_.ACC_DATA_REGION@li.133
    ||   1.1% |  0.441128 |   -- |    -- |    30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_DATA_REGION@li.86

    pat_build -w clover_leaf

    • aprun -n2 -N1 ./clover_leaf+pat
    Table 1:  Profile by Function Group and Function
      Time% |      Time |     Imb. |  Imb. |     Calls |Group
            |           |     Time | Time% |           | Function
            |           |          |       |           |  PE=HIDE
            |           |          |       |           |   Thread=HIDE
     100.0% | 34.540779 |       -- |    -- | 3382025.0 |Total
    |  99.0% | 34.206815 |       -- |    -- | 3382018.0 |USER
    ||  11.1% |  3.846683 | 0.006851 |  0.4% |  230066.0 |clover_exchange_message$clover_module_.ACC_SYNC_WAIT@li.504
    ||   8.0% |  2.748113 | 0.005144 |  0.4% |  115033.0 |clover_exchange_message$clover_module_.ACC_KERNEL@li.553
    ||   7.5% |  2.573926 | 0.011230 |  0.9% |  115033.0 |clover_exchange_message$clover_module_.ACC_KERNEL@li.504
    ||   7.1% |  2.448125 | 0.012814 |  1.0% |  345099.0 |clover_exchange_message$clover_module_.ACC_ASYNC_COPY@li.553
    ||   6.7% |  2.297045 | 0.002926 |  0.3% |  345099.0 |clover_exchange_message$clover_module_.ACC_ASYNC_COPY@li.504
    ||   5.0% |  1.744293 | 0.009789 |  1.1% |       1.0 |hydro_cycle$hydro_cycle_module_.ACC_DATA_REGION@li.133
    ||   1.5% |  0.511979 | 0.000053 |  0.0% |   45006.0 |advec_mom_kernel$advec_mom_kernel_mod_
    ||   1.0% |  0.342088 | 0.001191 |  0.7% |   15004.0 |update_halo_kernel$update_halo_kernel_module_.ACC_KERNEL@li.143
    ||   1.0% |  0.340171 | 0.000349 |  0.2% |   30007.0 |update_halo_kernel$update_halo_kernel_module_.ACC_DATA_REGION@li.86
    ||   1.0% |  0.339613 | 0.001111 |  0.7% |  115033.0 |clover_exchange_message$clover_module_.ACC_DATA_REGION@li.553
  4. Log in to comment