- edited description
CRAY_ACC_DEBUG API
Issue #46
new
Description
- man /opt/cray/cce/default/man/man7/intro_openacc.7
C API
// /opt/cray/cce/default/craylibs/x86-64/include/openacc.h
#include <openacc.h>
int cray_acc_debug_orig=0;
int cray_acc_debug_zero=0;
cray_acc_debug_orig = cray_acc_get_debug_global_level();
cray_acc_set_debug_global_level(cray_acc_debug_zero);
...
cray_acc_set_debug_global_level(cray_acc_debug_orig);
#pragma acc parallel loop copyin(a) pcopyin(b[0:N-1]) pcopyout(c[0:N-1])
for( i = 0; i < N; ++i ) {
c[i] += a*b[i];
}
cray_acc_set_debug_global_level(cray_acc_debug_zero);
Fortran API
! /opt/cray/cce/default/craylibs/x86-64/include/openacc.h
integer :: cray_acc_debug_orig
cray_acc_debug_orig = cray_acc_get_debug_global_level()
call cray_acc_set_debug_global_level(0)
...
#ifdef _CRAY_ACC_DEBUG
call cray_acc_set_debug_global_level(cray_acc_debug_orig)
#endif
!$acc kernels
do j = 2, NM-1
do i = 2, NN-1
A(i,j) = Anew(i,j)
end do
end do
!$acc end kernels
...
call cray_acc_set_debug_global_level(0)
Get the source:
- git clone EuroHack15.git
- cd examples/qwiklab/cray_acc_debug
Setup:
- module load craype-accel-nvidia35
- module list
Currently Loaded Modulefiles:
modules/3.2.10.3
nodestat/2.2-1.0502.53712.3.109.ari
sdb/1.0-1.0502.55976.5.27.ari
alps/5.2.1-2.0502.9041.11.6.ari
lustre-cray_ari_s/2.5_3.0.101_0.31.1_1.0502.8394.10.1-1.0502.17198.8.51
udreg/2.3.2-1.0502.9275.1.12.ari
ugni/5.0-1.0502.9685.4.24.ari
gni-headers/3.0-1.0502.9684.5.2.ari
dmapp/7.0.1-1.0502.9501.5.219.ari
xpmem/0.1-2.0502.55507.3.2.ari
hss-llm/7.2.0
Base-opts/1.0.2-1.0502.53325.1.2.ari
craype-network-aries
craype/2.4.0
cce/8.3.12
cray-libsci/13.0.4
pmi/5.0.7-1.0000.10678.155.25.ari
rca/1.0.0-2.0502.53711.3.127.ari
atp/1.8.2
PrgEnv-cray/5.2.40
craype-sandybridge
slurm
cray-mpich/7.2.2
ddt/5.0
cray-libsci_acc/3.1.1
cudatoolkit/6.5.14-1.0502.9613.6.1
craype-accel-nvidia35
Compile:
C
- cc -hacc -D_CRAY_ACC_DEBUG mpiacc_c_simple.c
Fortran
- ftn -D_CRAY_ACC_DEBUG -hacc task3.F90
Run:
C
- export CRAY_ACC_DEBUG=1   # or 2 or 3; higher levels print more detail (see the sample output below)
- aprun -n1 a.out 200
CRAY_ACC_DEBUG=1
ACC: Transfer 2 items (to acc 1592 bytes, to host 0 bytes) from mpiacc_c_simple.c:48
ACC: Execute kernel main$ck_L48_2 async(auto) from mpiacc_c_simple.c:48
ACC: Wait async(auto) from mpiacc_c_simple.c:50
ACC: Transfer 2 items (to acc 0 bytes, to host 1592 bytes) from mpiacc_c_simple.c:50
CRAY_ACC_DEBUG=2
ACC: Start transfer 2 items from mpiacc_c_simple.c:48
ACC: allocate, copy to acc 'b' (1592 bytes)
ACC: allocate 'c' (1592 bytes)
ACC: End transfer (to acc 1592 bytes, to host 0 bytes)
ACC: Execute kernel main$ck_L48_2 blocks:2 threads:128 async(auto) from mpiacc_c_simple.c:48
ACC: Wait async(auto) from mpiacc_c_simple.c:50
ACC: Start transfer 2 items from mpiacc_c_simple.c:50
ACC: free 'b' (1592 bytes)
ACC: copy to host, free 'c' (1592 bytes)
ACC: End transfer (to acc 0 bytes, to host 1592 bytes)
CRAY_ACC_DEBUG=3
ACC: Start wait async(auto) from mpiacc_c_simple.c:50
ACC: async_info: 0x2aaaad5b0280
ACC: Freeing delayed free for async(auto)
ACC: End wait
ACC:
ACC: Start transfer 2 items from mpiacc_c_simple.c:50
ACC: flags:
ACC:
ACC: Trans 1
ACC: Simple transfer of 'b' (1592 bytes)
ACC: host ptr 7fffffff59a0
ACC: acc ptr b04260000
ACC: flags: FREE REL_PRESENT REG_PRESENT
ACC: last release acc b04260000 from present table index 1 (ref_count 1)
ACC: last release of conditional present (acc b04260000, base b04260000)
ACC: remove acc b04260000 from present table index 1
ACC: new acc ptr 0
ACC:
ACC: Trans 2
ACC: Simple transfer of 'c' (1592 bytes)
ACC: host ptr 7fffffff5350
ACC: acc ptr b04260800
ACC: flags: COPY_ACC_TO_HOST FREE REL_PRESENT REG_PRESENT
ACC: last release acc b04260800 from present table index 0 (ref_count 1)
ACC: last release of conditional present (acc b04260800, base b04260800)
ACC: copy acc to host (b04260800 to 7fffffff5350)
ACC: split copy acc to host (7fffffff5350 to b04260800) size = 1592
ACC: remove acc b04260800 from present table index 0
ACC: new acc ptr 0
ACC:
ACC: End transfer (to acc 0 bytes, to host 1592 bytes)
Fortran
- export CRAY_ACC_DEBUG=1   # or 2 or 3; higher levels print more detail (see the sample output below)
- aprun -n1 a.out
CRAY_ACC_DEBUG=1
ACC: Execute kernel jacobi_acc_kernels_datacopy_$ck_L44_5 async(auto) from task3.F90:44
CRAY_ACC_DEBUG=2
ACC: Execute kernel jacobi_acc_kernels_datacopy_$ck_L44_5 blocks:1022 threads:128 async(auto) from task3.F90:44
CRAY_ACC_DEBUG=3
ACC: Start kernel jacobi_acc_kernels_datacopy_$ck_L44_5 async(auto) from task3.F90:44
ACC: flags: CACHE_MOD CACHE_FUNC AUTO_ASYNC
ACC: mod cache: 0xe09580
ACC: kernel cache: 0xe09540
ACC: async info: 0x2aaaad06d280
ACC: arguments: NVIDIA argument info
ACC: param size: 16
ACC: param pointer: 0x7fffffff5ec0
ACC: blocks: 1022
ACC: threads: 128
ACC: event id: 0
ACC: using cached func
ACC: kernel information
ACC: num registers : 10
ACC: max theads per block : 1024
ACC: shared size : 0 bytes
ACC: const size : 0 bytes
ACC: local size : 0 bytes
ACC:
ACC: launching kernel new
ACC: End kernel
Comments (1)
-
reporter - Log in to comment