Opencl-training
Issue #28
new
Goal
OpenCL tests
Setup
module swap PrgEnv-cray PrgEnv-gnu
module load craype-accel-nvidia35
Santis/Daint
Currently Loaded Modulefiles:
1) modules/3.2.10.3
2) nodestat/2.2-1.0502.53712.3.109.ari
3) sdb/1.0-1.0502.55976.5.27.ari
4) alps/5.2.1-2.0502.9041.11.6.ari
5) lustre-cray_ari_s/2.5_3.0.101_0.31.1_1.0502.8394.10.1-1.0502.17198.8.51
6) udreg/2.3.2-1.0502.9275.1.12.ari
7) ugni/5.0-1.0502.9685.4.24.ari
8) gni-headers/3.0-1.0502.9684.5.2.ari
9) dmapp/7.0.1-1.0502.9501.5.219.ari
10) xpmem/0.1-2.0502.55507.3.2.ari
11) hss-llm/7.2.0
12) Base-opts/1.0.2-1.0502.53325.1.2.ari
13) craype-network-aries
14) craype/2.3.0
15) craype-sandybridge
16) slurm
17) cray-mpich/7.2.0
18) ddt/4.3rc7
19) gcc/4.8.2
20) totalview-support/1.1.4
21) totalview/8.11.0
22) cray-libsci/13.0.3
23) pmi/5.0.6-1.0000.10439.140.2.ari
24) atp/1.8.1
25) PrgEnv-gnu/5.2.40
26) cray-libsci_acc/3.1.1
27) cudatoolkit/6.5.14-1.0502.9613.6.1
28) craype-accel-nvidia35
Compile
make -f makefile.daint OBJ=01_device_query.o
make -f makefile.daint OBJ=02_create_context.o
make -f makefile.daint OBJ=03_kernel_load_and_exec.o
make -f makefile.daint OBJ="clutil.o 04_matrix_multiply.o" N=04
make -f makefile.daint OBJ="clutil.o 05_dot_product.o" N=05
make -f makefile.daint OBJ="clutil.o 06_matrix_multiply_timing.o" N=06
make -f makefile.daint OBJ="clutil.o 07_convolution.o" N=07
make -f makefile.daint OBJ=08_cpp.o
make -f makefile.daint OBJ=09_memcpy.o
make -f makefile.daint # 10_mpi.cpp
make -f makefile.daint OBJ=osu_bwidth.o N=00
Run test1
- aprun -n1 ./SANTIS.GNU.01
***************************************************
Number of platforms: 2
-----------
Platform 0
-----------
Vendor: NVIDIA Corporation
Profile: FULL_PROFILE
Version: OpenCL 1.1 CUDA 6.5.48
Name: NVIDIA CUDA
Extensions: cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing
cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll
cl_nv_copy_opts
Number of devices: 1
Device 0
Type: GPU
Name: Tesla K20X
Version: OpenCL 1.1 CUDA
Vendor: NVIDIA Corporation
Profile: FULL_PROFILE
Compute units: 14
Max work item dim: 3
Work item sizes: 1024 1024 64
Max clock freq: 732 MHz
Global memory: 6039339008 bytes
Local memory: 49151 bytes
Maximum size of allocatable object: 1509834752 bytes
===================================================
-----------
Platform 1
-----------
Vendor: Intel(R) Corporation
Profile: FULL_PROFILE
Version: OpenCL 1.2 LINUX
Name: Intel(R) OpenCL
Extensions: cl_khr_fp64 cl_khr_icd cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_intel_printf
cl_ext_device_fission cl_intel_exec_by_local_thread
Number of devices: 1
Device 0
Type: CPU
Name: Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz
Version: OpenCL 1.2 (Build 67279)
Vendor: Intel(R) Corporation
Profile: FULL_PROFILE
Compute units: 16
Max work item dim: 3
Work item sizes: 1024 1024 1024
Max clock freq: 2600 MHz
Global memory: 33785425920 bytes
Local memory: 32768 bytes
Maximum size of allocatable object: 8446356480 bytes
===================================================
Run test2
- aprun -n1 ./SANTIS.GNU.02 "NVIDIA CUDA" default 0
OpenCL context created
OpenCL context info:
reference count: 1
device id reference: 0x66b2c0
platform: NVIDIA CUDA
OpenCL context released
- aprun -n1 ./SANTIS.GNU.02 "Intel(R) OpenCL" default 0
OpenCL context created
OpenCL context info:
reference count: 1
device id reference: 0x670be8
platform: Intel(R) OpenCL
OpenCL context released
Comments (5)
-
reporter -
reporter - edited description
-
reporter - edited description
-
reporter Run test0
for streams in $( seq 0 1 ); do
  for ipc in $( seq 0 1 ); do
    for smp in $( seq 0 1 ); do
      echo -n "export MV2_USE_CUDA=1;"
      echo -n "export MV2_CUDA_NONBLOCKING_STREAMS=${streams};"
      echo -n "export MV2_CUDA_IPC=${ipc};"
      echo -n "export MV2_CUDA_SMP_IPC=${smp};"
      echo aprun -n2 -N1 ./SANTIS.GNU.00
    done
  done
done
- export MV2_USE_CUDA=1;export MV2_CUDA_NONBLOCKING_STREAMS=0;
- export MV2_CUDA_IPC=0;export MV2_CUDA_SMP_IPC=0;aprun -n2 -N1 ./SANTIS.GNU.00
# Using PAGEABLE host memory!
# OSU MPI Bandwidth Test
# Size Bandwidth (MB/s)
1 1.44
2 2.88
4 5.74
8 11.56
16 23.02
32 45.99
64 91.80
128 178.86
256 357.90
512 695.54
1024 1076.98
2048 1684.30
4096 2329.33
8192 5008.02
16384 8559.84
32768 9175.98
65536 9520.61
131072 9701.08
262144 9796.64
524288 9845.64
1048576 9868.96
2097152 9876.50
4194304 9832.50
- export MV2_USE_CUDA=1;export MV2_CUDA_NONBLOCKING_STREAMS=0;
- export MV2_CUDA_IPC=0;export MV2_CUDA_SMP_IPC=1;aprun -n2 -N1 ./SANTIS.GNU.00
# Using PAGEABLE host memory!
# OSU MPI Bandwidth Test
# Size Bandwidth (MB/s)
1 1.44
2 2.89
4 5.59
8 11.29
16 23.06
32 46.08
64 92.06
128 181.72
256 359.05
512 697.80
1024 1076.85
2048 1686.21
4096 2351.70
8192 4970.89
16384 8545.70
32768 9167.85
65536 9464.53
131072 9700.15
262144 9789.75
524288 9832.36
1048576 9866.69
2097152 9879.95
4194304 9847.03
- export MV2_USE_CUDA=1;export MV2_CUDA_NONBLOCKING_STREAMS=0;
- export MV2_CUDA_IPC=1;export MV2_CUDA_SMP_IPC=0;aprun -n2 -N1 ./SANTIS.GNU.00
# Using PAGEABLE host memory!
# OSU MPI Bandwidth Test
# Size Bandwidth (MB/s)
1 1.45
2 2.90
4 5.78
8 11.62
16 22.32
32 46.31
64 92.44
128 182.49
256 360.81
512 700.47
1024 1081.43
2048 1691.45
4096 2355.27
8192 5027.60
16384 8559.84
32768 9172.15
65536 9514.18
131072 9704.56
262144 9798.34
524288 9846.19
1048576 9869.32
2097152 9874.07
4194304 9847.52
- export MV2_USE_CUDA=1;export MV2_CUDA_NONBLOCKING_STREAMS=0;
- export MV2_CUDA_IPC=1;export MV2_CUDA_SMP_IPC=1;aprun -n2 -N1 ./SANTIS.GNU.00
# Using PAGEABLE host memory!
# OSU MPI Bandwidth Test
# Size Bandwidth (MB/s)
1 1.44
2 2.89
4 5.71
8 11.57
16 23.06
32 43.99
64 92.17
128 182.01
256 359.54
512 698.69
1024 1079.35
2048 1686.88
4096 2352.15
8192 5014.65
16384 8549.03
32768 9174.06
65536 9512.12
131072 9697.21
262144 9795.48
524288 9844.78
1048576 9859.02
2097152 9877.74
4194304 9848.53
- export MV2_USE_CUDA=1;export MV2_CUDA_NONBLOCKING_STREAMS=1;
- export MV2_CUDA_IPC=0;export MV2_CUDA_SMP_IPC=0;aprun -n2 -N1 ./SANTIS.GNU.00
# Using PAGEABLE host memory!
# OSU MPI Bandwidth Test
# Size Bandwidth (MB/s)
1 1.44
2 2.82
4 5.64
8 11.34
16 23.12
32 46.22
64 92.34
128 182.61
256 360.64
512 700.93
1024 1080.58
2048 1688.65
4096 2347.68
8192 5021.86
16384 8559.84
32768 9167.85
65536 9516.23
131072 9699.48
262144 9798.07
524288 9835.80
1048576 9867.71
2097152 9880.68
4194304 9846.51
- export MV2_USE_CUDA=1;export MV2_CUDA_NONBLOCKING_STREAMS=1;
- export MV2_CUDA_IPC=0;export MV2_CUDA_SMP_IPC=1;aprun -n2 -N1 ./SANTIS.GNU.00
# Using PAGEABLE host memory!
# OSU MPI Bandwidth Test
# Size Bandwidth (MB/s)
1 1.41
2 2.89
4 5.79
8 11.19
16 23.11
32 46.20
64 92.17
128 182.16
256 359.67
512 699.86
1024 1079.14
2048 1688.39
4096 2352.35
8192 5027.26
16384 8556.51
32768 9171.67
65536 9497.99
131072 9702.29
262144 9800.59
524288 9845.36
1048576 9868.73
2097152 9873.23
4194304 9848.95
- export MV2_USE_CUDA=1;export MV2_CUDA_NONBLOCKING_STREAMS=1;
- export MV2_CUDA_IPC=1;export MV2_CUDA_SMP_IPC=0;aprun -n2 -N1 ./SANTIS.GNU.00
# Using PAGEABLE host memory!
# OSU MPI Bandwidth Test
# Size Bandwidth (MB/s)
1 1.44
2 2.89
4 5.77
8 11.60
16 23.09
32 46.11
64 92.19
128 181.92
256 359.69
512 698.08
1024 1077.91
2048 1686.88
4096 2345.37
8192 5025.19
16384 8556.51
32768 9169.76
65536 9515.20
131072 9701.62
262144 9797.73
524288 9837.24
1048576 9858.66
2097152 9877.70
4194304 9847.52
- export MV2_USE_CUDA=1;export MV2_CUDA_NONBLOCKING_STREAMS=1;
- export MV2_CUDA_IPC=1;export MV2_CUDA_SMP_IPC=1;aprun -n2 -N1 ./SANTIS.GNU.00
# Using PAGEABLE host memory!
# OSU MPI Bandwidth Test
# Size Bandwidth (MB/s)
1 1.44
2 2.89
4 5.79
8 11.59
16 23.12
32 46.25
64 92.30
128 182.16
256 360.09
512 700.90
1024 1071.23
2048 1688.85
4096 2352.95
8192 5012.25
16384 8555.68
32768 9169.76
65536 9517.26
131072 9698.94
262144 9780.98
524288 9845.05
1048576 9868.73
2097152 9880.83
4194304 9846.40
-
reporter perftools/6.2.3
Setup
module swap PrgEnv-cray PrgEnv-gnu
module load craype-accel-nvidia35
Compile
- see above
- pat_build -w -t /opt/cray/perftools/6.2.3/share/traces/TraceOpenCL SANTIS.GNU.xx
Run
- aprun ... SANTIS.GNU.xx+pat
Test1
Table 1: Profile by Function Group and Function
Time% | Time | Imb. | Imb. | Calls |Group
| | Time | Time% | | Function
100.0% | 0.091913 | -- | -- | 40.0 |Total
|----------------------------------------------------------
| 99.8% | 0.091688 | -- | -- | 39.0 |OPENCL
||---------------------------------------------------------
|| 99.4% | 0.091373 | -- | -- | 17.0 |clGetPlatformIDs
Test2
100.0% | 0.414027 | -- | -- | 31.0 |Total
|----------------------------------------------------------
| 99.9% | 0.413796 | -- | -- | 30.0 |OPENCL
||---------------------------------------------------------
|| 44.2% | 0.183049 | -- | -- | 1.0 |clReleaseContext
|| 34.6% | 0.143298 | -- | -- | 7.0 |clCreateContext
|| 21.1% | 0.087440 | -- | -- | 17.0 |clGetPlatformIDs
Test3
100.0% | 0.417150 | -- | -- | 41.0 |Total
|----------------------------------------------------------
| 98.4% | 0.410299 | -- | -- | 40.0 |OPENCL
||---------------------------------------------------------
|| 34.1% | 0.142086 | -- | -- | 7.0 |clCreateContext
|| 30.6% | 0.127487 | -- | -- | 1.0 |clReleaseContext
|| 21.0% | 0.087524 | -- | -- | 17.0 |clGetPlatformIDs
|| 12.7% | 0.053091 | -- | -- | 1.0 |clBuildProgram
||=========================================================
| 1.6% | 0.006851 | -- | -- | 1.0 |USER
||---------------------------------------------------------
|| 1.6% | 0.006851 | -- | -- | 1.0 |main
Test4
100.0% | 0.415522 | -- | -- | 47.0 |Total
|----------------------------------------------------------
| 98.1% | 0.407656 | -- | -- | 46.0 |OPENCL
||---------------------------------------------------------
|| 34.1% | 0.141779 | -- | -- | 7.0 |clCreateContext
|| 31.2% | 0.129546 | -- | -- | 1.0 |clReleaseContext
|| 20.8% | 0.086464 | -- | -- | 17.0 |clGetPlatformIDs
|| 12.0% | 0.049725 | -- | -- | 1.0 |clBuildProgram
||=========================================================
| 1.9% | 0.007866 | -- | -- | 1.0 |USER
||---------------------------------------------------------
|| 1.9% | 0.007866 | -- | -- | 1.0 |main
Test5
100.0% | 0.419554 | -- | -- | 46.0 |Total
|----------------------------------------------------------
| 95.7% | 0.401306 | -- | -- | 45.0 |OPENCL
||---------------------------------------------------------
|| 34.0% | 0.142451 | -- | -- | 7.0 |clCreateContext
|| 31.1% | 0.130537 | -- | -- | 1.0 |clReleaseContext
|| 21.0% | 0.088301 | -- | -- | 17.0 |clGetPlatformIDs
|| 9.5% | 0.039872 | -- | -- | 1.0 |clBuildProgram
||=========================================================
| 4.3% | 0.018248 | -- | -- | 1.0 |USER
||---------------------------------------------------------
|| 4.3% | 0.018248 | -- | -- | 1.0 |main
Test6
100.0% | 0.359751 | -- | -- | 52.0 |Total
|----------------------------------------------------------
| 77.2% | 0.277844 | -- | -- | 51.0 |OPENCL
||---------------------------------------------------------
|| 39.2% | 0.140895 | -- | -- | 7.0 |clCreateContext
|| 23.8% | 0.085636 | -- | -- | 17.0 |clGetPlatformIDs
|| 13.9% | 0.050183 | -- | -- | 1.0 |clBuildProgram
||=========================================================
| 22.8% | 0.081907 | -- | -- | 1.0 |USER
||---------------------------------------------------------
|| 22.8% | 0.081907 | -- | -- | 1.0 |main
Test7
pat[WARNING][0]: 5 spawned threads did not complete successfully
100.0% | 0.297737 | -- | -- | 53.0 |Total
|----------------------------------------------------------
| 82.8% | 0.246573 | -- | -- | 52.0 |OPENCL
||---------------------------------------------------------
|| 47.2% | 0.140398 | -- | -- | 7.0 |clCreateContext
|| 29.3% | 0.087153 | -- | -- | 17.0 |clGetPlatformIDs
|| 6.1% | 0.018153 | -- | -- | 1.0 |clBuildProgram
||=========================================================
| 17.2% | 0.051164 | -- | -- | 1.0 |USER
||---------------------------------------------------------
|| 17.2% | 0.051164 | -- | -- | 1.0 |main
Test8
100.0% | 0.420327 | -- | -- | 44.0 |Total
|----------------------------------------------------------
| 98.7% | 0.415022 | -- | -- | 43.0 |OPENCL
||---------------------------------------------------------
|| 34.1% | 0.143129 | -- | -- | 7.0 |clCreateContext
|| 29.2% | 0.122776 | -- | -- | 1.0 |clReleaseEvent
|| 21.4% | 0.089938 | -- | -- | 17.0 |clGetPlatformIDs
|| 14.0% | 0.058892 | -- | -- | 1.0 |clBuildProgram
||=========================================================
| 1.3% | 0.005305 | -- | -- | 1.0 |USER
||---------------------------------------------------------
|| 1.3% | 0.005305 | -- | -- | 1.0 |main
Test9
100.0% | 0.702952 | -- | -- | 51.0 |Total
|--------------------------------------------------------------
| 80.8% | 0.567870 | -- | -- | 50.0 |OPENCL
||-------------------------------------------------------------
|| 19.7% | 0.138408 | -- | -- | 4.0 |clCreateBuffer
|| 19.3% | 0.135523 | -- | -- | 7.0 |clCreateContext
|| 15.0% | 0.105484 | -- | -- | 1.0 |clReleaseContext
|| 12.3% | 0.086211 | -- | -- | 17.0 |clGetPlatformIDs
|| 7.0% | 0.048991 | -- | -- | 1.0 |clEnqueueReadBuffer
|| 4.0% | 0.028251 | -- | -- | 1.0 |clEnqueueCopyBuffer
|| 3.2% | 0.022789 | -- | -- | 1.0 |clEnqueueWriteBuffer
||=============================================================
| 19.2% | 0.135083 | -- | -- | 1.0 |USER
||-------------------------------------------------------------
|| 19.2% | 0.135083 | -- | -- | 1.0 |main
Test10
- pat_build -w -g mpi -t /opt/cray/perftools/6.2.3/share/traces/TraceOpenCL SANTIS.GNU.00
-
How to use -g opencl?
-
export MV2_USE_CUDA=1;export MV2_CUDA_NONBLOCKING_STREAMS=0;
- export MV2_CUDA_IPC=0;export MV2_CUDA_SMP_IPC=0;
- aprun -n2 -N1 ./SANTIS.GNU.00+pat
100.0% | 1.305849 | -- | -- | 5.0 |Total
|----------------------------------------------------------------
| 99.4% | 1.298129 | 0.000006 | 0.0% | 1.0 |USER
||---------------------------------------------------------------
|| 99.4% | 1.298129 | 0.000006 | 0.0% | 1.0 |main
||===============================================================
| 0.6% | 0.007718 | -- | -- | 2.0 |MPI_SYNC
||---------------------------------------------------------------
|| 0.6% | 0.007700 | 0.007682 | 99.8% | 1.0 |MPI_Init(sync)
|| 0.0% | 0.000018 | 0.000007 | 35.5% | 1.0 |MPI_Finalize(sync)
||===============================================================
| 0.0% | 0.000002 | -- | -- | 2.0 |MPI
||---------------------------------------------------------------
|| 0.0% | 0.000001 | 0.000000 | 6.1% | 1.0 |MPI_Finalize
|| 0.0% | 0.000001 | 0.000000 | 34.2% | 1.0 |MPI_Init
|================================================================
Processing step 2 of 3
=================== Observations and suggestions ===================
Number of accelerators used: 0 of 2
- How to spend more time on the GPU?
- Log in to comment
Run test3
Run test4
Run test5
Run test6
Run test7
Run test8
Run test9