BigDFT (MPI/OMP)

Issue #23 new
jg piccinali repo owner created an issue

Issue: what tool can we use to get a performance report?

Compile and run BigDFT on swan

Get the code

ssh swan
wget https://launchpad.net/bigdft/1.7/1.7.5/+download/bigdft-1.7.5.tar.xz
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/bigdft-1.7.5

setup (Haswell 12core nodes)

module swap PrgEnv-cray PrgEnv-gnu
module load craype-accel-nvidia35
module load fftw
# module load perftools-lite/6.2.2      # after the configure step !

Currently Loaded Modulefiles:

modules/3.2.10.2
nodestat/2.2-1.0502.53712.3.109.ari
sdb/1.0-1.0502.55976.5.27.ari
alps/5.2.1-2.0502.9041.11.6.ari
lustre-cray_ari_s/2.5_3.0.101_0.31.1_1.0502.8394.10.1-1.0502.17198.8.51
udreg/2.3.2-1.0502.9275.1.12.ari
ugni/5.0-1.0502.9685.4.24.ari
gni-headers/3.0-1.0502.9684.5.2.ari
dmapp/7.0.1-1.0502.9501.5.219.ari
xpmem/0.1-2.0502.55507.3.2.ari
hss-llm/7.2.0
Base-opts/1.0.2-1.0502.53325.1.2.ari
craype-network-aries
craype/2.2.1
cray-mpich/7.1.2
craype-haswell
moab/8.0.0
torque/5.0.0
gcc/4.9.2
totalview-support/1.2.0.4
totalview/8.15.0
cray-libsci/13.0.3
pmi/5.0.6-1.0000.10439.140.2.ari
atp/1.7.5
PrgEnv-gnu/5.2.40
fftw/3.3.4.1

compile

export FC=ftn
export F77=ftn
export CC=cc
export CXX=CC
# Add -fopenmp to the above if needed
export FCLIBS=" "

# export CFLAGS="-O2 -g"
# export LDFLAGS=


./configure \
--with-blas=no \
--with-lapack=no \
--with-ext-linalg=' ' \
--prefix=/lus/scratch/p01991/822356/GNU+CPU+PT622/175notool+omp \
--with-openmp \

# --prefix=/lus/scratch/p01991/822356/GNU+CPU+PT622/175notool
# --prefix=/lus/scratch/p01991/822356/GNU+CPU+PT622/175ptl622
# --prefix=/lus/scratch/p01991/822356/GNU+CPU+PT622/175notool+omp

make -j16
make install

run (MPI): OK

qsub -I -V  -l walltime=00:15:00   -q hsw12    -l nodes=8
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/175notool/bin/PbSeSurface
/usr/bin/time -p aprun -n96 -N12 ../bigdft &> o
grep accel: input.yaml           # ==> accel: NO
grep real o            # ==> real 82.14

run (MPI)+PTL622: OK

qsub -I -V  -l walltime=00:15:00   -q hsw12    -l nodes=8
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/175ptl622/bin/PbSeSurface
/usr/bin/time -p aprun -n96 -N12 ../bigdft &> o
grep real o            # ==> real 95.71

eff.png

run (MPI+OPENMP): OK

qsub -I -V  -l walltime=00:15:00   -q hsw12    -l nodes=8
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/175notool+omp/bin/PbSeSurface
export OMP_NUM_THREADS=4
/usr/bin/time -p aprun -n24 -N3 -d4 ../bigdft &> o
grep accel: input.yaml           # ==> accel: NO
grep " OpenMP parallelization" o   # ==> Yes
grep real o            # ==> real 101.96   # compiled with -g

run (MPI+OPENMP)+PTL622: NOT ok

qsub -I -V  -l walltime=00:15:00   -q hsw12    -l nodes=8
cd /lus/scratch/p01991/822356/GNU+CPU+PT622/175ptl622+omp/bin/PbSeSurface
export OMP_NUM_THREADS=4
/usr/bin/time -p aprun -n24 -N3 -d4 ../bigdft &> o
grep accel: input.yaml           # ==> accel: NO
grep " OpenMP parallelization" o   # ==> Yes
grep real o            # ==> real 101.96   # compiled with -g

eff.png eff2.png

export LIBS="-L/opt/cray/nvidia/default/lib64 -L/opt/nvidia/cudatoolkit/default/"

BigDFT 1.7.6 on santis

MPI only 4 nodes, gnu

@santis01

export FC=ftn
export F77=ftn
export CC=cc
export CXX=CC
export FCLIBS=" "

module swap PrgEnv-cray PrgEnv-gnu
module load craype-accel-nvidia35
module rm cray-libsci_acc/3.0.2
module swap cudatoolkit/5.5.22-1.0502.7944.3.1 cudatoolkit/6.5.12
module load cray-libsci

./configure --with-blas=no --with-lapack=no --with-ext-linalg=' ' --prefix=/scratch/santis/perettig/bigdft/bin/176/gnu/ --enable-opencl --enable-cuda-gpu --with-cuda-path=/opt/nvidia/cudatoolkit/default --with-ocl-path="/opt/nvidia/cudatoolkit/default/" 'LIBS=-L/opt/cray/nvidia/default/lib64 -L/opt/nvidia/cudatoolkit/default/lib64' 'CFLAGS=-O2' 'FCLIBS= ' 'FCFLAGS=-O2 -fopenmp'
aprun -n32 -N8  ==> real 199.05 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-mpionly-4-nodes-gnu.out

MPI+OMP 4 nodes, gnu

export OMP_NUM_THREADS=8
aprun -n4 -d8 -N1  ==> real 370.82 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-omp-4-nodes-gnu.out

MPI+OCL 4 nodes, gnu

aprun -n4 -N1  ==> real 557.92 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-ocl-4-nodes-gnu.out

MPI + OCL + OMP 4 nodes, gnu

export OMP_NUM_THREADS=8
aprun -n4 -d8 -N1  ==> real 290.03 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-ocl-omp-4-nodes-gnu.out

MPI only 4 nodes, intel

@santis01

module swap PrgEnv-cray PrgEnv-intel
module swap intel intel/15.0.1.133
module unload cray-libsci
module load cudatoolkit

export FC=ftn;
export F77=ftn;
export CC=cc;
export CXX=CC;
export CFLAGS="";
export FCFLAGS="-I/opt/intel/15.0.1.133/composer_xe_2015.1.133/mkl/include -openmp -g";
export FFLAGS="";
export LDFLAGS="";
export LIBS="-L/opt/cray/nvidia/default/lib64 -L/opt/nvidia/cudatoolkit/default/lib64";
export LC_CTYPE="en_US.UTF-8"

#configure
./configure --prefix="/scratch/santis/perettig/bigdft/bin/176/intel/" --without-blas --without-lapack --with-ext-linalg="-L/opt/intel/15.0.1.133/composer_xe_2015.1.133/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_intel_thread -lpthread -lm" --enable-opencl --enable-cuda-gpu --with-cuda-path="/opt/nvidia/cudatoolkit/default/" --with-ocl-path="/opt/nvidia/cudatoolkit/default/" --with-openmp

MPI only 4 nodes, intel

aprun -n32 -N8  ==> real 195.12 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-mpionly-4-nodes-intel.out

MPI+OMP 4 nodes, intel

export OMP_NUM_THREADS=8
aprun -n4 -d8 -N1  ==> real 1212.20 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-omp-4-nodes-intel.out

MPI + OCL 4 nodes, intel

aprun -n4 -N1  ==> real 487.30 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-ocl-4-nodes.out

MPI + OCL + OMP 4 nodes, intel

export OMP_NUM_THREADS=8
aprun -n4 -d8 -N1  ==> real  792.85 (PbSeSurface)
/apps/santis/sandbox/gpp/bigdft/bigdft-1.7.6-ocl-omp-4-nodes-intel.out

| Santis 1.7.6 | MPI    | MPI+OMP | MPI+OCL | MPI+OCL+OMP |
|--------------|--------|---------|---------|-------------|
| GNU          | 199.05 | 370.82  | 557.92  | 290.03      |
| INTEL        | 195.12 | 1212.20 | 487.30  | 792.85      |

Comments (17)

  1. jg piccinali reporter

    Issue: perftools fails to profile mpi+openmp

    Compile and run BigDFT on swan (IvyBridge)

    Get the code

    ssh swan
    wget https://launchpad.net/bigdft/1.7/1.7.5/+download/bigdft-1.7.5.tar.xz
    cd /lus/scratch/p01991/822356/GNU+CPUivyb+PT622/bigdft-1.7.5
    

    setup (IVYB)

    module swap PrgEnv-cray PrgEnv-gnu
    module rm craype-accel-nvidia35
    module swap craype-haswell craype-ivybridge
    module load fftw
    # module load perftools-lite/6.2.2      # after the configure step !
    

    Currently Loaded Modulefiles:

      1) modules/3.2.10.2
      2) nodestat/2.2-1.0502.53712.3.109.ari
      3) sdb/1.0-1.0502.55976.5.27.ari
      4) alps/5.2.1-2.0502.9041.11.6.ari
      5) lustre-cray_ari_s/2.5_3.0.101_0.31.1_1.0502.8394.10.1-1.0502.17198.8.51
      6) udreg/2.3.2-1.0502.9275.1.12.ari
      7) ugni/5.0-1.0502.9685.4.24.ari
      8) gni-headers/3.0-1.0502.9684.5.2.ari
      9) dmapp/7.0.1-1.0502.9501.5.219.ari
     10) xpmem/0.1-2.0502.55507.3.2.ari
     11) hss-llm/7.2.0
     12) Base-opts/1.0.2-1.0502.53325.1.2.ari
     13) craype-network-aries
     14) craype/2.2.1
     15) cray-mpich/7.1.2
     16) craype-ivybridge
     17) moab/8.0.0
     18) torque/5.0.0
     19) gcc/4.9.2
     20) totalview-support/1.2.0.4
     21) totalview/8.15.0
     22) cray-libsci/13.0.3
     23) pmi/5.0.6-1.0000.10439.140.2.ari
     24) atp/1.7.5
     25) PrgEnv-gnu/5.2.40
     26) fftw/3.3.4.1
    

    compile

    export FC=ftn
    export F77=ftn
    export CC=cc
    export CXX=CC
    # Add -fopenmp to the above if needed
    export FCLIBS=" "
    
    ./configure \
    --with-blas=no \
    --with-lapack=no \
    --with-ext-linalg=' ' \
    --prefix=/lus/scratch/p01991/822356/GNU+CPUivyb+PT622/175notool
    
    # --with-openmp \
    
    make -j16
    make install
    

    run (MPI): OK

    qsub -I -V  -l walltime=00:05:00   -q ivb12    -l nodes=4
    cd /lus/scratch/p01991/822356/GNU+CPUivyb+PT622/175notool/bin/PbSeSurface
    /usr/bin/time -p aprun -n96 -N24 ../bigdft &> o
    
    grep accel: input.yaml           # ==> accel: NO
    grep real o            # ==> real 96.09
    

    run (MPI)+PTL622: OK

    qsub -I -V  -l walltime=00:05:00   -q gpu_nodes    -l nodes=4
    cd /lus/scratch/p01991/822356/GNU+CPUivyb+PT622/175ptl622/bin/PbSeSurface
    /usr/bin/time -p aprun -n48 -N12 ../bigdft &> o
    
    grep real o            # ==> real 155.13
    

    eff.png

    • cat input.yaml
     dft:
       hgrids: [0.6, 0.6, 0.6]
       rmult: [8.0, 12.0]
       ixc: PBE
       nspin: 1 
       mpol: 0
       gnrm_cv: 1.e-5
       nrepmax: 1
       itermax: 2
       ncong: 2
       idsx: 0
       disablesym: Yes
     mix:
       iscf: 17
       itrpmax: 1
       rpnrm_cv: 1.E-11
       norbsempty: 1000
       tel: 1.E-003
       alphamix: 0.95
       alphadiis: 1.d0
     perf:
       accel: NO # OCLGPU # NO
       projrad: 8.000
       psp_onfly: No
       verbosity: 1
     kpt:
       method: MPgrid
       ngkpt: [2,1,2]
    
  2. jg piccinali reporter

    Compile and run BigDFT on daint

    Get the code

    ssh daint
    wget https://launchpad.net/bigdft/1.7/1.7.5/+download/bigdft-1.7.5.tar.xz
    cd /scratch/daint/piccinal/BIGDFT/GNU+CPU+PT622/bigdft-1.7.5/
    

    setup

    module swap PrgEnv-cray PrgEnv-gnu
    module load craype-accel-nvidia35
    module swap cray-mpich cray-mpich/7.1.1
    module load fftw
    # module load perftools-lite/6.2.2      # after the configure step !
    # module load scorep/1.4                  # after the configure step !
    

    Currently Loaded Modulefiles:

      1) modules/3.2.6.7
      2) nodestat/2.2-1.0501.47138.1.78.ari
      3) sdb/1.0-1.0501.48084.4.48.ari
      4) alps/5.1.1-2.0501.8713.1.1.ari
      5) MySQL/5.0.64-1.0000.7096.23.2
      6) lustre-cray_ari_s/2.4_3.0.80_0.5.1_1.0501.7664.13.1-1.0501.14952.18.1
      7) udreg/2.3.2-1.0501.7914.1.13.ari
      8) ugni/5.0-1.0501.8253.10.22.ari
      9) gni-headers/3.0-1.0501.8317.12.1.ari
     10) dmapp/7.0.1-1.0501.8315.8.4.ari
     11) xpmem/0.1-2.0501.48424.3.3.ari
     12) hss-llm/7.1.0
     13) Base-opts/1.0.2-1.0501.47945.4.2.ari
     14) craype-network-aries
     15) craype/2.2.1
     16) craype-sandybridge
     17) slurm
     18) cray-mpich/7.1.1
     19) ddt/4.3rc7
     20) linux/jg
     21) gcc/4.8.2
     22) totalview-support/1.1.4
     23) totalview/8.11.0
     24) cray-libsci/13.0.1
     25) pmi/5.0.5-1.0000.10300.134.8.ari
     26) atp/1.7.5
     27) PrgEnv-gnu/5.1.29
     28) fftw/3.3.4.0
     29) cray-libsci_acc/3.0.2
     30) cudatoolkit/5.5.20-1.0501.7945.8.2
     31) craype-accel-nvidia35
    

    compile (MPI only)

    export FC=/apps/daint/sandbox/jgp/bigdft/scorep/ftn.sh
    export F77=/apps/daint/sandbox/jgp/bigdft/scorep/ftn.sh
    export CC=/apps/daint/sandbox/jgp/bigdft/scorep/cc.sh
    export CXX=/apps/daint/sandbox/jgp/bigdft/scorep/CC.sh
    # Add -fopenmp to the above if needed
    export FCLIBS=" "
    
    # export CFLAGS="-O2 -g"
    # export LDFLAGS=
    
    ./configure \
    --with-blas=no \
    --with-lapack=no \
    --with-ext-linalg=' ' \
    --prefix=/scratch/daint/piccinal/BIGDFT/GNU+CPU+PT622/175+sc14-mpionly
    sbatch.sh daint 15 ../bigdft `8 22` 8 1 "" "" ""  -Ausup
    #no --with-openmp \
    
    make -j4
    make install
    

    compile (with scorep/1.4)

    • cat /apps/daint/sandbox/jgp/bigdft/scorep/cc.sh

      scorep --mpp=mpi cc "$@"

    run (MPI)+scorep/1.4: profiling NOT ok

    cd /scratch/daint/piccinal/BIGDFT/GNU+CPU+PT622/175+sc14-mpionly/bin/PbSeSurface
    grep accel: input.yaml           # ==> accel: NO
    
    export SCOREP_ENABLE_PROFILING=true
    export SCOREP_ENABLE_TRACING=false
    export SCOREP_TOTAL_MEMORY=50000000
    
    # sbatch.sh daint 15 ../bigdft `8 22` 8 1 "" "" ""  -Ausup
    /usr/bin/time -p  aprun -n 176 -N 8 -d 1 -j 1  ../bigdft  
    
    Error: [Score-P] src/measurement/SCOREP_Memory.c:145: 
    Error: No free memory page available: Out of memory. 
    Error: Please increase SCOREP_TOTAL_MEMORY=16384000 and try again.
    
    Warning: [Score-P] src/measurement/profiling/scorep_profile_collapse.c:74: 
    Warning: [Score-P] callpath depth limitation of 30 exceeded.                                                                             
    Warning: Reached callpath depth was 343
    
    /apps/daint/sandbox/jgp/scorep/src/DAINT/BIGDFTPATCH/scorep_profile_collapse.c
    

    eff.png

    run (MPI)+ptl/622: profiling OK

    cd /scratch/daint/piccinal/BIGDFT/GNU+CPU+PT622/175+ptl622-mpionly/bin/PbSeSurface
    grep accel: input.yaml           # ==> accel: NO
    
    # sbatch.sh daint 15 ../bigdft `8 22` 8 1 "" "" ""  -Ausup
    /usr/bin/time -p  aprun -n 176 -N 8 -d 1 -j 1  ../bigdft  
    

    eff.png

  3. Log in to comment