- edited description
BigDFT (MPI/OMP)
Issue #23
new
Issue: which tool can we use to get a performance report?
Compile and run BigDFT on swan
Get the code
ssh swan
wget https://launchpad.net/bigdft/1.7/1.7.5/+download/bigdft-1.7.5.tar.xz
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/bigdft-1.7.5
setup (Haswell 12core nodes)
module swap PrgEnv-cray PrgEnv-gnu
module load craype-accel-nvidia35
module load fftw
# module load perftools-lite/6.2.2 # after the configure step !
Currently Loaded Modulefiles:
modules/3.2.10.2
nodestat/2.2-1.0502.53712.3.109.ari
sdb/1.0-1.0502.55976.5.27.ari
alps/5.2.1-2.0502.9041.11.6.ari
lustre-cray_ari_s/2.5_3.0.101_0.31.1_1.0502.8394.10.1-1.0502.17198.8.51
udreg/2.3.2-1.0502.9275.1.12.ari
ugni/5.0-1.0502.9685.4.24.ari
gni-headers/3.0-1.0502.9684.5.2.ari
dmapp/7.0.1-1.0502.9501.5.219.ari
xpmem/0.1-2.0502.55507.3.2.ari
hss-llm/7.2.0
Base-opts/1.0.2-1.0502.53325.1.2.ari
craype-network-aries
craype/2.2.1
cray-mpich/7.1.2
craype-haswell
moab/8.0.0
torque/5.0.0
gcc/4.9.2
totalview-support/1.2.0.4
totalview/8.15.0
cray-libsci/13.0.3
pmi/5.0.6-1.0000.10439.140.2.ari
atp/1.7.5
PrgEnv-gnu/5.2.40
fftw/3.3.4.1
compile
export FC=ftn
export F77=ftn
export CC=cc
export CXX=CC
# Add -fopenmp to the above if needed
export FCLIBS=" "
# export CFLAGS="-O2 -g"
# export LDFLAGS=
./configure \
--with-blas=no \
--with-lapack=no \
--with-ext-linalg=' ' \
--prefix=/lus/scratch/p01991/822356/GNU+CPU+PT622/175notool+omp \
--with-openmp \
# --prefix=/lus/scratch/p01991/822356/GNU+CPU+PT622/175notool
# --prefix=/lus/scratch/p01991/822356/GNU+CPU+PT622/175ptl622
# --prefix=/lus/scratch/p01991/822356/GNU+CPU+PT622/175notool+omp
make -j16
make install
run (MPI): OK
qsub -I -V -l walltime=00:15:00 -q hsw12 -l nodes=8
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/175notool/bin/PbSeSurface
/usr/bin/time -p aprun -n96 -N12 ../bigdft &> o
grep accel: input.yaml # ==> accel: NO
grep real o # ==> real 82.14
run (MPI)+PTL622: OK
qsub -I -V -l walltime=00:15:00 -q hsw12 -l nodes=8
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/175ptl622/bin/PbSeSurface
/usr/bin/time -p aprun -n96 -N12 ../bigdft &> o
grep real o # ==> real 95.71
run (MPI+OPENMP): OK
qsub -I -V -l walltime=00:15:00 -q hsw12 -l nodes=8
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/175notool+omp/bin/PbSeSurface
export OMP_NUM_THREADS=4
/usr/bin/time -p aprun -n24 -N3 -d4 ../bigdft &> o
grep accel: input.yaml # ==> accel: NO
grep " OpenMP parallelization" o # ==> Yes
grep real o # ==> real 101.96 # compiled with -g
run (MPI+OPENMP)+PTL622: NOT ok
qsub -I -V -l walltime=00:15:00 -q hsw12 -l nodes=8
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/175ptl622+omp/bin/PbSeSurface
export OMP_NUM_THREADS=4
/usr/bin/time -p aprun -n24 -N3 -d4 ../bigdft &> o
grep accel: input.yaml # ==> accel: NO
grep " OpenMP parallelization" o # ==> Yes
grep real o # ==> real 101.96 # compiled with -g
export LIBS="-L/opt/cray/nvidia/default/lib64 -L/opt/nvidia/cudatoolkit/default/"
BigDFT 1.7.6 on santis
MPI only 4 nodes, gnu
@santis01
export FC=ftn
export F77=ftn
export CC=cc
export CXX=CC
export FCLIBS=" "
module swap PrgEnv-cray PrgEnv-gnu
module load craype-accel-nvidia35
module rm cray-libsci_acc/3.0.2
module swap cudatoolkit/5.5.22-1.0502.7944.3.1 cudatoolkit/6.5.12
module load cray-libsci
./configure --with-blas=no --with-lapack=no --with-ext-linalg=' ' --prefix=/scratch/santis/perettig/bigdft/bin/176/gnu/ --enable-opencl --enable-cuda-gpu --with-cuda-path=/opt/nvidia/cudatoolkit/default --with-ocl-path="/opt/nvidia/cudatoolkit/default/" 'LIBS=-L/opt/cray/nvidia/default/lib64 -L/opt/nvidia/cudatoolkit/default/lib64' 'CFLAGS=-O2' 'FCLIBS= ' 'FCFLAGS=-O2 -fopenmp'
aprun -n32 -N8 ==> real 199.05 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-mpionly-4-nodes-gnu.out
MPI+OMP 4 nodes, gnu
export OMP_NUM_THREADS=8
aprun -n4 -d8 -N1 ==> real 370.82 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-omp-4-nodes-gnu.out
MPI+OCL 4 nodes, gnu
aprun -n4 -N1 ==> real 557.92 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-ocl-4-nodes-gnu.out
MPI + OCL + OMP 4 nodes, gnu
export OMP_NUM_THREADS=8
aprun -n4 -d8 -N1 ==> real 290.03 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-ocl-omp-4-nodes-gnu.out
MPI only 4 nodes, intel
@santis01
module swap PrgEnv-cray PrgEnv-intel
module swap intel intel/15.0.1.133
module unload cray-libsci
module load cudatoolkit
export FC=ftn;
export F77=ftn;
export CC=cc;
export CXX=CC;
export CFLAGS="";
export FCFLAGS="-I/opt/intel/15.0.1.133/composer_xe_2015.1.133/mkl/include -openmp -g";
export FFLAGS="";
export LDFLAGS="";
export LIBS="-L/opt/cray/nvidia/default/lib64 -L/opt/nvidia/cudatoolkit/default/lib64";
export LC_CTYPE="en_US.UTF-8"
#configure
./configure --prefix="/scratch/santis/perettig/bigdft/bin/176/intel/" --without-blas --without-lapack --with-ext-linalg="-L/opt/intel/15.0.1.133/composer_xe_2015.1.133/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_intel_thread -lpthread -lm" --enable-opencl --enable-cuda-gpu --with-cuda-path="/opt/nvidia/cudatoolkit/default/" --with-ocl-path="/opt/nvidia/cudatoolkit/default/" --with-openmp
MPI only 4 nodes, intel
aprun -n32 -N8 ==> real 195.12 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-mpionly-4-nodes-intel.out
MPI+OMP 4 nodes, intel
export OMP_NUM_THREADS=8
aprun -n4 -d8 -N1 ==> real 1212.20 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-omp-4-nodes-intel.out
MPI + OCL 4 nodes, intel
aprun -n4 -N1 ==> real 487.30 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-ocl-4-nodes.out
MPI + OCL + OMP 4 nodes, intel
export OMP_NUM_THREADS=8
aprun -n4 -d8 -N1 ==> real 792.85 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-ocl-omp-4-nodes-intel.out
| Santis 1.7.6 | MPI | MPI+OMP | MPI+OCL | MPI+OCL+OMP |
|---|---|---|---|---|
| GNU | 199.05 | 370.82 | 557.92 | 290.03 |
| INTEL | 195.12 | 1212.20 | 487.30 | 792.85 |
Comments (17)
-
reporter -
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
reporter Issue: perftools fails to profile mpi+openmp
Compile and run BigDFT on swan (IvyBridge)
Get the code
ssh swan
wget https://launchpad.net/bigdft/1.7/1.7.5/+download/bigdft-1.7.5.tar.xz
cd /lus/scratch/p01991/822356/GNU+CPUivyb+PT622/bigdft-1.7.5
setup (IVYB)
module swap PrgEnv-cray PrgEnv-gnu
module rm craype-accel-nvidia35
module swap craype-haswell craype-ivybridge
module load fftw
# module load perftools-lite/6.2.2 # after the configure step !
Currently Loaded Modulefiles:
1) modules/3.2.10.2 2) nodestat/2.2-1.0502.53712.3.109.ari 3) sdb/1.0-1.0502.55976.5.27.ari 4) alps/5.2.1-2.0502.9041.11.6.ari 5) lustre-cray_ari_s/2.5_3.0.101_0.31.1_1.0502.8394.10.1-1.0502.17198.8.51 6) udreg/2.3.2-1.0502.9275.1.12.ari 7) ugni/5.0-1.0502.9685.4.24.ari 8) gni-headers/3.0-1.0502.9684.5.2.ari 9) dmapp/7.0.1-1.0502.9501.5.219.ari 10) xpmem/0.1-2.0502.55507.3.2.ari 11) hss-llm/7.2.0 12) Base-opts/1.0.2-1.0502.53325.1.2.ari 13) craype-network-aries 14) craype/2.2.1 15) cray-mpich/7.1.2 16) craype-ivybridge 17) moab/8.0.0 18) torque/5.0.0 19) gcc/4.9.2 20) totalview-support/1.2.0.4 21) totalview/8.15.0 22) cray-libsci/13.0.3 23) pmi/5.0.6-1.0000.10439.140.2.ari 24) atp/1.7.5 25) PrgEnv-gnu/5.2.40 26) fftw/3.3.4.1
compile
export FC=ftn
export F77=ftn
export CC=cc
export CXX=CC
# Add -fopenmp to the above if needed
export FCLIBS=" "
./configure \
--with-blas=no \
--with-lapack=no \
--with-ext-linalg=' ' \
--prefix=/lus/scratch/p01991/822356/GNU+CPUivyb+PT622/175notool
# --with-openmp \
make -j16
make install
run (MPI): OK
qsub -I -V -l walltime=00:05:00 -q ivb12 -l nodes=4
cd /lus/scratch/p01991/822356/GNU+CPUivyb+PT622/175notool/bin/PbSeSurface
/usr/bin/time -p aprun -n96 -N24 ../bigdft &> o
grep accel: input.yaml # ==> accel: NO
grep real o # ==> real 96.09
run (MPI)+PTL622: OK
qsub -I -V -l walltime=00:05:00 -q gpu_nodes -l nodes=4
cd /lus/scratch/p01991/822356/GNU+CPUivyb+PT622/175ptl622/bin/PbSeSurface
/usr/bin/time -p aprun -n48 -N12 ../bigdft &> o
grep real o # ==> real 155.13
- cat input.yaml
dft:
  hgrids: [0.6, 0.6, 0.6]
  rmult: [8.0, 12.0]
  ixc: PBE
  nspin: 1
  mpol: 0
  gnrm_cv: 1.e-5
  nrepmax: 1
  itermax: 2
  ncong: 2
  idsx: 0
  disablesym: Yes
mix:
  iscf: 17
  itrpmax: 1
  rpnrm_cv: 1.E-11
  norbsempty: 1000
  tel: 1.E-003
  alphamix: 0.95
  alphadiis: 1.d0
perf:
  accel: NO # OCLGPU # NO
  projrad: 8.000
  psp_onfly: No
  verbosity: 1
kpt:
  method: MPgrid
  ngkpt: [2,1,2]
-
reporter Compile and run BigDFT on daint
Get the code
ssh daint
wget https://launchpad.net/bigdft/1.7/1.7.5/+download/bigdft-1.7.5.tar.xz
cd /scratch/daint/piccinal/BIGDFT/GNU+CPU+PT622/bigdft-1.7.5/
setup
module swap PrgEnv-cray PrgEnv-gnu
module load craype-accel-nvidia35
module swap cray-mpich cray-mpich/7.1.1
module load fftw
# module load perftools-lite/6.2.2 # after the configure step !
# module load scorep/1.4 # after the configure step !
Currently Loaded Modulefiles:
1) modules/3.2.6.7 2) nodestat/2.2-1.0501.47138.1.78.ari 3) sdb/1.0-1.0501.48084.4.48.ari 4) alps/5.1.1-2.0501.8713.1.1.ari 5) MySQL/5.0.64-1.0000.7096.23.2 6) lustre-cray_ari_s/2.4_3.0.80_0.5.1_1.0501.7664.13.1-1.0501.14952.18.1 7) udreg/2.3.2-1.0501.7914.1.13.ari 8) ugni/5.0-1.0501.8253.10.22.ari 9) gni-headers/3.0-1.0501.8317.12.1.ari 10) dmapp/7.0.1-1.0501.8315.8.4.ari 11) xpmem/0.1-2.0501.48424.3.3.ari 12) hss-llm/7.1.0 13) Base-opts/1.0.2-1.0501.47945.4.2.ari 14) craype-network-aries 15) craype/2.2.1 16) craype-sandybridge 17) slurm 18) cray-mpich/7.1.1 19) ddt/4.3rc7 20) linux/jg 21) gcc/4.8.2 22) totalview-support/1.1.4 23) totalview/8.11.0 24) cray-libsci/13.0.1 25) pmi/5.0.5-1.0000.10300.134.8.ari 26) atp/1.7.5 27) PrgEnv-gnu/5.1.29 28) fftw/3.3.4.0 29) cray-libsci_acc/3.0.2 30) cudatoolkit/5.5.20-1.0501.7945.8.2 31) craype-accel-nvidia35
compile (MPI only)
export FC=/apps/daint/sandbox/jgp/bigdft/scorep/ftn.sh export F77=/apps/daint/sandbox/jgp/bigdft/scorep/ftn.sh export CC=/apps/daint/sandbox/jgp/bigdft/scorep/cc.sh export CXX=/apps/daint/sandbox/jgp/bigdft/scorep/CC.sh # Add -fopenmp to the above if needed export FCLIBS=" " # export CFLAGS="-O2 -g" # export LDFLAGS= ./configure \ --with-blas=no \ --with-lapack=no \ --with-ext-linalg=' ' \ --prefix=/scratch/daint/piccinal/BIGDFT/GNU+CPU+PT622/175+sc14-mpionly sbatch.sh daint 15 ../bigdft `8 22` 8 1 "" "" "" -Ausup #no --with-openmp \ make -j4 make install
compile (with scorep/1.4)
- cat /apps/daint/sandbox/jgp/bigdft/scorep/cc.sh
scorep --mpp=mpi cc "$@"
run (MPI)+scorep/1.4: profiling NOT ok
cd /scratch/daint/piccinal/BIGDFT/GNU+CPU+PT622/175+sc14-mpionly/bin/PbSeSurface
grep accel: input.yaml # ==> accel: NO
export SCOREP_ENABLE_PROFILING=true
export SCOREP_ENABLE_TRACING=false
export SCOREP_TOTAL_MEMORY=50000000
# sbatch.sh daint 15 ../bigdft `8 22` 8 1 "" "" "" -Ausup
/usr/bin/time -p aprun -n 176 -N 8 -d 1 -j 1 ../bigdft
Error: [Score-P] src/measurement/SCOREP_Memory.c:145: Error: No free memory page available: Out of memory.
Error: Please increase SCOREP_TOTAL_MEMORY=16384000 and try again.
Warning: [Score-P] src/measurement/profiling/scorep_profile_collapse.c:74: Warning: [Score-P] callpath depth limitation of 30 exceeded.
Warning: Reached callpath depth was 343
Score-P callpath depth limitation of 30 exceeded.
/apps/daint/sandbox/jgp/scorep/src/DAINT/BIGDFTPATCH/scorep_profile_collapse.c
run (MPI)+ptl/622: profiling OK
cd /scratch/daint/piccinal/BIGDFT/GNU+CPU+PT622/175+ptl622-mpionly/bin/PbSeSurface
grep accel: input.yaml # ==> accel: NO
# sbatch.sh daint 15 ../bigdft `8 22` 8 1 "" "" "" -Ausup
/usr/bin/time -p aprun -n 176 -N 8 -d 1 -j 1 ../bigdft
- cat /apps/daint/sandbox/jgp/bigdft/scorep/cc.sh
-
- edited description
-
- edited description
-
- edited description
-
- edited description
-
- edited description
-
- edited description
-
- edited description
-
- edited description
-
- edited description
- Log in to comment