BigDFT (OpenCL)
Issue: which tool can we use to get a performance report?
Compile and run BigDFT on Piz Daint
Get the code
ssh daint
wget https://launchpad.net/bigdft/1.7/1.7.5/+download/bigdft-1.7.5.tar.xz
cd /scratch/daint/piccinal/BIGDFT/GNU+OCL/bigdft-1.7.5
setup:
module swap PrgEnv-cray PrgEnv-gnu
module load craype-accel-nvidia35
module swap cray-mpich cray-mpich/7.1.1
module load fftw
Currently Loaded Modulefiles:
1) modules/3.2.6.7
2) nodestat/2.2-1.0501.47138.1.78.ari
3) sdb/1.0-1.0501.48084.4.48.ari
4) alps/5.1.1-2.0501.8713.1.1.ari
5) MySQL/5.0.64-1.0000.7096.23.2
6) lustre-cray_ari_s/2.4_3.0.80_0.5.1_1.0501.7664.13.1-1.0501.14952.18.1
7) udreg/2.3.2-1.0501.7914.1.13.ari
8) ugni/5.0-1.0501.8253.10.22.ari
9) gni-headers/3.0-1.0501.8317.12.1.ari
10) dmapp/7.0.1-1.0501.8315.8.4.ari
11) xpmem/0.1-2.0501.48424.3.3.ari
12) hss-llm/7.1.0
13) Base-opts/1.0.2-1.0501.47945.4.2.ari
14) craype-network-aries
15) craype/2.2.1
16) craype-sandybridge
17) slurm
18) cray-mpich/7.1.1
19) ddt/4.3rc7
20) gcc/4.8.2
21) totalview-support/1.1.4
22) totalview/8.11.0
23) cray-libsci/13.0.1
24) pmi/5.0.5-1.0000.10300.134.8.ari
25) atp/1.7.5
26) PrgEnv-gnu/5.1.29
27) cray-libsci_acc/3.0.2
28) cudatoolkit/5.5.20-1.0501.7945.8.2
29) craype-accel-nvidia35
30) fftw/3.3.4.0
compile:
export FC=ftn
export F77=ftn
export CC=cc
export CXX=CC
export CFLAGS="-O2"
export FCFLAGS="-fopenmp"
export FFLAGS="-fopenmp"
export LDFLAGS=
export LIBS="-L/opt/cray/nvidia/default/lib64 -L/opt/nvidia/cudatoolkit/default/"
export FCLIBS=" "
./configure \
--with-blas=no \
--with-lapack=no \
--with-ext-linalg=' ' \
--enable-opencl \
--enable-cuda-gpu \
--with-cuda-path=/opt/nvidia/cudatoolkit/default \
--with-ocl-path=/opt/nvidia/cudatoolkit/default \
--prefix=/scratch/daint/piccinal/BIGDFT/GNU+OCL/175
make
make install
run:
cd /scratch/daint/piccinal/BIGDFT/GNU+OCL/175/bin/PbSeSurface
sbatch runme.slurm.daint
CPU only
grep accel: input.yaml
# accel: NO
grep real o_bigdft.0176.1.8.1.daint.oclgpu
# 162sec
OPENCL
grep accel: input.yaml
# accel: OCLGPU
grep real o_bigdft.0176.1.8.1.daint.cpu
# 223sec
CUDA version
The CUDA version is no longer maintained and works only for periodic boundary conditions (BC) at the Gamma point: http://bigdft.org/Wiki/index.php?title=Acceleration_example_on_different_platforms
Comments (9)
-
reporter -
reporter NVIDIA Command Line Profiler
DAINT
module swap PrgEnv-cray PrgEnv-gnu
module load craype-accel-nvidia35
Compilation
make -f makefile.daint
CC -I/opt/cray/mpt/7.0.4/gni/mpich2-gnu/48/include -I/opt/nvidia/cudatoolkit/include -w -c 10_mpi.cpp
CC -I/opt/cray/mpt/7.0.4/gni/mpich2-gnu/48/include -I/opt/nvidia/cudatoolkit/include \
-L/opt/cray/nvidia/default/lib64 -lOpenCL \
-L/opt/cray/mpt/7.0.4/gni/mpich2-gnu/48/lib -lmpich 10_mpi.o -o DAINT.GNU
Runtime
export COMPUTE_PROFILE=1
export COMPUTE_PROFILE_LOG=o_$HOSTNAME.%p.csv
export COMPUTE_PROFILE_CSV=1
export COMPUTE_PROFILE_CONFIG=myconfig
Doc
- doc/5.5.20-1.0501.7945.8.2/doc/html/profiler-users-guide/index.html#import-csv-session
When using the command-line profiler to create a CSV file for import into the Visual Profiler, the following requirement must be met: COMPUTE_PROFILE_CSV must be 1 to generate CSV formatted output. COMPUTE_PROFILE_CONFIG must point to a file that contains gpustarttimestamp and streamid configuration parameters. The configuration file may also contain other configuration parameters, including events.
- /apps/cuda/doc/5.5.20-1.0501.7945.8.2/doc/html/cuda-toolkit-release-notes
Profiler data gets flushed to a file only at synchronization calls like cudaDeviceSynchronize() and cudaStreamSynchronize() or when the profiler buffer gets full. If an app terminates without these sync calls then profiler data may be lost. Similarly for OpenCL apps the OpenCL resources like the contexts, events should be freed before the app terminates.
- doc/5.5.20-1.0501.7945.8.2/doc/html/cuda-runtime-api/group__CUDART__DEVICE.html
cudaError_t cudaDeviceReset(void) Explicitly destroys and cleans up all resources associated with the current device in the current process. Any subsequent API call to this device will reinitialize the device.
cat myconfig
timestamp gpustarttimestamp gridsize
aprun -n2 -N1 ./DAINT.GNU 0 gpu 0 1024
# Found device [Tesla K20X]
# Found device [Tesla K20X]
# GPU-to-GPU transfer took 1.35319e-08 seconds
[0]: PASSED
[1]: PASSED
cat o_daint01.29074.csv
# OPENCL_PROFILE_LOG_VERSION 2.0
# OPENCL_DEVICE 0 Tesla K20X
# OPENCL_CONTEXT 1
# OPENCL_PROFILE_CSV 1
# TIMESTAMPFACTOR fffff61319a012c8
timestamp,method,gputime,cputime,ndrangesizeX,ndrangesizeY,occupancy
34935.000,memcpyHtoDasync,2.272,12.000
34956.000,arrayset,8.096,12.000,1024,1,0.250
35209.000,memcpyDtoHasync,4.352,5.000
36835.000,memcpyHtoDasync,3.776,6.000
56315.000,sum,8.096,6.000,1024,1,0.250
56561.000,memcpyDtoHasync,4.384,5.000
nvvp => import => error: CSV data missing start timestamp
DOM
Compilation
make -f makefile.daint CXX=mpicxx
mpicxx -I/apps/dom/mvapich2/2.0/gcc/4.8.2/include -I/usr/local/cuda-6.5/include -w -c 10_mpi.cpp
mpicxx -I/apps/dom/mvapich2/2.0/gcc/4.8.2/include -I/usr/local/cuda-6.5/include -w \
-L/usr/local/cuda-6.5/lib64 -lOpenCL \
-L/apps/dom/mvapich2/2.0/gcc/4.8.2/lib -lmpich 10_mpi.o -o DOM.LOGIN.CSCS.CH.
Runtime
salloc -N2
srun -n2 --ntasks-per-node=1 ./DOM.LOGIN.CSCS.CH. 0 gpu 0 1024
librdmacm: Fatal: unable to open RDMA device
In: PMI_Abort(1, Fatal error in MPI_Init: Other MPI error, error stack:
MPIR_Init_thread(483).......:
MPID_Init(367)..............: channel initialization failed
MPIDI_CH3_Init(362).........:
MPIDI_CH3I_RDMA_init(170)...:
rdma_setup_startup_ring(389): cannot open hca device)
NVPROF
module swap PrgEnv-cray PrgEnv-gnu
module swap cray-mpich cray-mpich/7.0.4
module load craype-accel-nvidia35
Runtime
unset COMPUTE_PROFILE
export PMI_NO_FORK=1
nvprof --version
Release version 5.5 (15)
aprun -n1 nvprof -s ./a.out
Warning: No CUDA application was profiled, exiting
-
reporter TAU
cd /scratch/daint/asheyko/BigDFT_launchpad/calc_test/lucamar
export PATH=/apps/daint/sandbox/jgp/tau/2.24/gnu_482/craycnl/bin:$PATH
aprun -n1 tau_exec -T serial -opencl bigdft
pprof -s profile.0.0.0
FUNCTION SUMMARY (mean):
---------------------------------------------------------------------------------------
%Time    Exclusive    Inclusive       #Call      #Subrs  Inclusive Name
              msec   total msec                          usec/call
---------------------------------------------------------------------------------------
100.0        3,870        4,307           1     25400.5    4307771 .TAU application
  3.6          156          156         5.5           0      28411 cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C
  1.5           66           66         5.5           0      12067 cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *) C, void *) C
  1.4           59           59         307           0        192 cl_int clEnqueueReadBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *) C
-
reporter Other tools
Gremedy
http://www.gremedy.com/downloadLinux.php
version 5.8.1 on Todi => ./gDEBugger-bin: /usr/lib64/libstdc++.so.6: version GLIBCXX_3.4.11 not found
LTPV
LTPV is a light and generic profiler for High Performance Computing applications. It can be used as an OpenCL profiler.
-
reporter SCOREP/VAMPIR
PE
module swap PrgEnv-cray PrgEnv-gnu
module swap gcc gcc/4.8.2
module load craype-accel-nvidia35
module load fftw
module load scorep/1.4
Compilation
- Fix the MPI version first!
scorep --mpp=mpi --opencl CC ...
SCOREP="scorep --nocompiler --opencl --static"
scorep --mpp=mpi --nocompiler --opencl --static cc "$@"
../configure FC="$SCOREP gfortran" F77="$SCOREP gfortran" CC="$SCOREP gcc" CXX="$SCOREP g++" ...
Runtime
export SCOREP_ENABLE_PROFILING=false
export SCOREP_ENABLE_TRACING=true
export SCOREP_OPENCL_ENABLE=yes
aprun -n2 -N1 -d1 -j1 ./DAINT.GNU.scorep 0 gpu 0 1024
Analyse
vampir83 scorep-20150116_1417_159737160397806/traces.otf2
- Fix the MPI version first!
-
reporter - changed title to BigDFT (OpenCL)
-
reporter - edited description
-
reporter - edited description
-
reporter Compile and run BigDFT on swan (IvyBridge)
Get the code
ssh swan
wget https://launchpad.net/bigdft/1.7/1.7.5/+download/bigdft-1.7.5.tar.xz
cd /lus/scratch/p01991/822356/GNU+GPUivyb+PT622/bigdft-1.7.5
setup (IVYB)
module swap PrgEnv-cray PrgEnv-gnu
module swap craype-haswell craype-ivybridge
module swap gcc gcc/4.8.2
module load craype-accel-nvidia35
module load fftw
# module load perftools-lite/6.2.2 # after the configure step !
echo $CRAY_ACCEL_TARGET
# nvidia35
Currently Loaded Modulefiles:
1) modules/3.2.10.2
2) nodestat/2.2-1.0502.53712.3.109.ari
3) sdb/1.0-1.0502.55976.5.27.ari
4) alps/5.2.1-2.0502.9041.11.6.ari
5) lustre-cray_ari_s/2.5_3.0.101_0.31.1_1.0502.8394.10.1-1.0502.17198.8.51
6) udreg/2.3.2-1.0502.9275.1.12.ari
7) ugni/5.0-1.0502.9685.4.24.ari
8) gni-headers/3.0-1.0502.9684.5.2.ari
9) dmapp/7.0.1-1.0502.9501.5.219.ari
10) xpmem/0.1-2.0502.55507.3.2.ari
11) hss-llm/7.2.0
12) Base-opts/1.0.2-1.0502.53325.1.2.ari
13) craype-network-aries
14) craype/2.2.1
15) cray-mpich/7.1.2
16) craype-ivybridge
17) moab/8.0.0
18) torque/5.0.0
19) gcc/4.8.2
20) totalview-support/1.2.0.4
21) totalview/8.15.0
22) cray-libsci/13.0.3
23) pmi/5.0.6-1.0000.10439.140.2.ari
24) atp/1.7.5
25) PrgEnv-gnu/5.2.40
26) cray-libsci_acc/3.0.2
27) cudatoolkit/5.5.22-1.0502.7944.3.1
28) craype-accel-nvidia35
29) fftw/3.3.4.1
compile
export FC=ftn
export F77=ftn
export CC=cc
export CXX=CC
# Add -fopenmp to the above if needed (KO)
export FCLIBS=" "
./configure \
--with-blas=no \
--with-lapack=no \
--with-ext-linalg=' ' \
--enable-opencl \
--with-ocl-path=/opt/nvidia/cudatoolkit/default \
--enable-cuda-gpu \
--with-cuda-path=/opt/nvidia/cudatoolkit/default \
--prefix=/lus/scratch/p01991/822356/GNU+GPUivyb+PT622/175ptl622
# --with-openmp \
make -j16
make install
run (MPI)+OCL: DAINT OK
swan
qsub -I -V -l walltime=00:05:00 -q gpu_nodes -l nodes=4
cd /lus/scratch/p01991/822356/GNU+GPUivyb+PT622/175ptl622/bin/PbSeSurface
/usr/bin/time -p aprun -n4 -N1 ../bigdft &> o
Poisson Kernel Initialization:
MPI tasks : 4
CUBLAS ERROR: The CUDA Runtime initialization failed!
grep accel: input.yaml
# ==> accel: OCLGPU
grep real o
# ==> real
daint
grep accel: input.yaml
# ==> accel: OCLGPU
cd /scratch/daint/piccinal/BIGDFT/GNU+GPU+PT622/175/bin/PbSeSurface
aprun -n 22 -N 1 -d 1 -j 1 ../bigdft
# real 260.11
- Log in to comment
Cray Perftools
Compilation/Instrumentation