MPI - MPI deep dive - Hello world! extended and job submission¶
Objectives
Figure out the version of the MPI library in use
See an extended version of the “Hello world!” program that also prints the core ID (on the node) and the processor (node) name where the MPI processes are running
Explore job submission on a cluster
Instructor note
0 min teaching
10 min exercises or demo
MPI - version test¶
Run the version test to figure out the version of the MPI library and of the header in use:
#include <stdio.h>
#include <mpi.h>

/*
 * MPI version test.
 *
 * Prints two version pairs on one line: the one returned at run time by
 * MPI_Get_version() and the MPI_VERSION / MPI_SUBVERSION macros that were
 * visible through mpi.h when this file was compiled.
 */
int main(int argc, char *argv[])
{
    int major;   /* filled in by MPI_Get_version() */
    int minor;   /* filled in by MPI_Get_version() */

    MPI_Init(&argc, &argv);

    /* Run-time query of the library. */
    MPI_Get_version(&major, &minor);

    /* Run-time values first, compile-time header macros second. */
    printf("Version: Library: %d.%d, mpi.h: %d.%d\n",
           major, minor, MPI_VERSION, MPI_SUBVERSION);

    MPI_Finalize();
    return 0;
}
! MPI version test (Fortran, mpi_f08 bindings).
!
! Prints the version pair returned at run time by MPI_Get_version, the
! MPI_VERSION / MPI_SUBVERSION constants provided by the mpi_f08 module, and
! the two module constants MPI_ASYNC_PROTECTS_NONBLOCKING and
! MPI_SUBARRAYS_SUPPORTED.
program version
  use mpi_f08
  implicit none

  integer :: lib_version, lib_subversion   ! set by MPI_Get_version

  call MPI_Init()

  ! Run-time query of the library.
  call MPI_Get_version(lib_version, lib_subversion)

  ! List-directed output; one line per item.
  print *, 'Version: Library: ', lib_version, lib_subversion
  print *, 'Version: mpi_f08 module: ', MPI_VERSION, MPI_SUBVERSION
  print *, 'MPI_ASYNC_PROTECTS_NONBLOCKING: ', MPI_ASYNC_PROTECTS_NONBLOCKING
  print *, 'MPI_SUBARRAYS_SUPPORTED : ', MPI_SUBARRAYS_SUPPORTED

  call MPI_Finalize()
end program version
from mpi4py import MPI
# Run-time query: MPI.Get_version() returns a (version, subversion) pair.
major, minor = MPI.Get_version()

# MPI.VERSION / MPI.SUBVERSION are the constants exposed by mpi4py; the output
# labels them "mpi.h" to mirror the C version of this test.
print(
    f"Version: Library: {major}.{minor} "
    f"mpi.h: {MPI.VERSION}.{MPI.SUBVERSION}"
)
Compile the version test:
mpicc version_test.c -o version_test
mpif90 version_test.f90 -o version_test
Run the version test:
mpirun -np 1 ./version_test
mpirun -np 1 python3 ./version_test.py
Extended version of the MPI - Hello world!¶
This is an extended version of the “Hello world!” program that also prints the core ID (on the node) and the processor (node) name where the MPI processes are running.
#include <stdio.h>
#include <sched.h>   /* sched_getcpu() */
#include <mpi.h>     /* MPI header */

/*
 * Extended "Hello world!".
 *
 * Every MPI process prints its rank, the size of MPI_COMM_WORLD, the value
 * returned by sched_getcpu() (the core ID on the node) and the processor/node
 * name reported by MPI. Rank 0 additionally prints a short header that
 * includes the communicator name.
 *
 * Must be compiled with -D_GNU_SOURCE so that <sched.h> declares sched_getcpu().
 */
int main(int argc, char *argv[])
{
    int  rank = 0;                            /* defaults before MPI fills them in */
    int  size = 1;
    int  core_id;
    int  comm_name_length;                    /* length returned by MPI_Comm_get_name */
    int  node_name_length;                    /* length returned by MPI_Get_processor_name */
    char comm_name[MPI_MAX_OBJECT_NAME];
    char node_name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);

    /* Who am I, and how many of us are there? */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Names of the communicator and of the processor/node. */
    MPI_Comm_get_name(MPI_COMM_WORLD, comm_name, &comm_name_length);
    MPI_Get_processor_name(node_name, &node_name_length);

    /* Core this process is currently running on. */
    core_id = sched_getcpu();

    /* Header: printed once, by rank 0 only. */
    if (rank == 0) {
        printf("\n"
               "he-mpi - MPI program to print core_id & node_name (cb)\n"
               "Hello world! from %4i MPI processes in %s\n",
               size, comm_name);
    }

    /* One line per MPI process. */
    printf("MPI process %4i / %4i ON core %4i of node %s\n",
           rank, size, core_id, node_name);

    MPI_Finalize();
    return 0;
}
For Fortran a little C helper file is needed:
#include <sched.h>

/*
 * Helper for the Fortran version of he-mpi: returns the value of
 * sched_getcpu() so that Fortran code can obtain the core ID.
 *
 * NOTE(review): the trailing underscore presumably matches the external name
 * the Fortran compiler generates for `find_core_id` - confirm for the
 * compiler in use.
 *
 * Must be compiled with -D_GNU_SOURCE so that <sched.h> declares sched_getcpu().
 */
int find_core_id_ ()
{
    return sched_getcpu();
}
! Extended "Hello world!" (Fortran, mpi_f08 bindings).
!
! Every MPI process prints its rank, the size of MPI_COMM_WORLD, the core ID
! returned by the external C helper find_core_id, and the processor/node name
! reported by MPI. Rank 0 additionally prints a short header that includes the
! communicator name.
!
! The names are printed using the lengths returned by MPI, so they are neither
! truncated to a fixed width nor padded with trailing blanks.
program hello
  use mpi_f08                                        ! MPI module
  implicit none

  integer :: ierror                                  ! optional with mpi_f08
  integer :: rank, size                              ! MPI rank / communicator size
  integer :: core_id                                 ! core ID on the node
  integer, external :: find_core_id                  ! C helper (linked separately)
  integer :: comm_name_length                        ! length of communicator name
  character(len=MPI_MAX_OBJECT_NAME)    :: comm_name ! communicator name
  integer :: node_name_length                        ! length of processor/node name
  character(len=MPI_MAX_PROCESSOR_NAME) :: node_name ! processor/node name

  rank = 0                                           ! defaults before MPI fills them in
  size = 1

  call MPI_Init(ierror)
  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierror)
  call MPI_Comm_size(MPI_COMM_WORLD, size, ierror)
  call MPI_Comm_get_name(MPI_COMM_WORLD, comm_name, comm_name_length, ierror)
  call MPI_Get_processor_name(node_name, node_name_length, ierror)

  core_id = find_core_id()

  ! Header: printed once, by rank 0 only.
  if (rank == 0) then
    write(*,"('')")
    write(*,"('he-mpi - MPI program to print core_id & node_name (cb)')")
    write(*,"('Hello world! from ',i4,' MPI processes in ',a)") &
      size, comm_name(1:comm_name_length)
  end if

  ! One line per MPI process.
  write(*,"('MPI process ',i4,' / ',i4,' ON core ',i4,' of node ',a)") &
    rank, size, core_id, node_name(1:node_name_length)

  call MPI_Finalize(ierror)
end program hello
import os
import numpy as np
from mpi4py import MPI
# Extended "Hello world!": every MPI process reports its rank, the communicator
# size, its CPU affinity set and the processor/node name; rank 0 also prints a
# short header with the communicator name.

comm_world = MPI.COMM_WORLD              # communicator
rank_world = comm_world.Get_rank()       # rank of this process in the communicator
size_world = comm_world.Get_size()       # number of processes in the communicator
comm_name = comm_world.Get_name()        # communicator name
node_name = MPI.Get_processor_name()     # processor/node name

# Set of processing units this process may run on; printed as {core_id}.
core_id = os.sched_getaffinity(0)

# Debugging aids:
# print(os.uname()[1])                   # node name
# print(os.sched_getaffinity(0))         # info about affinity/binding/pinning

# Header: printed once, by rank 0 only.
if rank_world == 0:
    print()
    print("he-mpi - MPI program to print core_id & node_name (cb)")
    print("Hello world! from {:4} MPI processes in {}".format(size_world, comm_name))

# One line per MPI process.
print(
    "MPI process {:4} / {:4} ON core {} of node {}".format(
        rank_world, size_world, core_id, node_name
    )
)
Compile the extended version (note the additional option -D_GNU_SOURCE (needed for sched.h)):
mpicc he-mpi.c -o he-mpi -D_GNU_SOURCE
gcc -c help_fortran_find_core_id.c -D_GNU_SOURCE
mpif90 he-mpi.f90 help_fortran_find_core_id.o -o he-mpi
Run the extended version (note additional options and sorting of the output):
mpirun -np 4 --map-by core --rank-by slot --bind-to hwthread --report-bindings ./he-mpi | sort -k3n,3 | cut -c 1-54  # sort numerically on field 3 (the rank)
mpirun -np 4 --map-by core --rank-by slot --bind-to hwthread --report-bindings python3 ./he-mpi.py | sort -k3n,3  # sort numerically on field 3 (the rank)
Resources available (within the notebook) (How many and which cores are available, how many MPI processes can be started):
import os
print(os.uname()[1], os.sched_getaffinity(0))
Expected output on the VSC-5 cluster in the MPI courses will be something similar to:
n3503-022 {0, 1, 2, 3, 128, 129, 130, 131}
nodename { 4 physical cores (out of 0 - 127) + their hardware hyperthreads (+128) }
Note, on VSC-5 there are 128 physical cores per node where first all hwt 0 are numbered consecutively (0 - 127) and then all hwt 1 (128 - 255). Therefore, e.g., processing unit number 0 (hwt 0) and processing unit number 128 (hwt 1) are both on the same physical core.
On VSC-5 for each physical core:
(processing unit number of hwt 1) = (processing unit number of hwt 0) + 128
Note
A note on pinning: mpirun --bind-to core OR hwthread?
First of all, when running MPI programs you should always take care of a good pinning! Pinning is related to the performance that you will get in the end. The recommendation is to always take care of a proper pinning and not rely on any defaults. Without pinning, it might happen that some MPI processes share the same hardware resources (are put on the same physical core) or that the MPI processes are moved around and even across NUMA domains by the operating system - both will slow down execution.
The recommendation with pure MPI applications is to pin the MPI processes at least to physical cores and to use all physical cores available on a node if there are no other constraints. (Other constraints could be, e.g., memory-bandwidth bound applications, not enough memory on the node, or very poor scalability of the application. The first two could benefit from having a lower number of MPI processes per node, i.e., not using all physical cores of the node, but with an equal spread across the NUMA domains of the node. If the code does not scale up to the number of available cores (nowadays the nodes in HPC systems have a quite high core count) the only way out is to use only a part of the node (on VSC-5 you can request a part of a node if instead of --nodes (-N) in your job scripts you use --ntasks (-n) together with --mem with a rule of thumb of 4G per core).)
Depending on the applications’ internal communication patterns, some applications might benefit from a specific way of pinning (e.g., consecutive pinning if the MPI processes communicate mainly with their neighbors) while other applications do not have any special needs as long as the MPI processes are pinned.
With --bind-to hwthread the MPI processes are pinned (bound) to the hwthread (hwt 0 and “B.”), see above.
With --bind-to core the MPI processes are pinned (bound) to physical cores (hwt 0-1 and “BB”), i.e., an MPI process can move between the two hardware hyperthreads available on the physical core; it could even use both hardware hyperthreads simultaneously by other means of parallelization (e.g., shared-memory parallelization with OpenMP).
It does not really matter on which hardware hyperthread (hwt) of the core an MPI process is running. With --bind-to core you will see in the output of he-mpi that some MPI processes are running on hwt 0 and some on hwt 1; that’s just fine.
Job submission¶
We are going to use the he-mpi.c from above and will run it on two nodes of VSC-5.
Job script using the Open-MPI library¶
%%writefile job_openmpi.sh
#!/bin/bash
#SBATCH --job-name openmpi # job name
#SBATCH --nodes 2 # number of nodes (exclusive)
#SBATCH --clusters vsc5 # submit to VSC-5
#SBATCH --qos zen3_0512 # use a qos
#SBATCH --partition zen3_0512 # use partition that fits to the qos
#SBATCH --ntasks-per-node 128 # SLURM_NTASKS_PER_NODE [1 mpi/core]
#SBATCH --reservation jh_training_mpi # reservation DURING COURSE ONLY
#SBATCH --time 0-00:05:00 # max runtime (5 min during course)

# Slurm job script (Open MPI, C version): compiles he-mpi.c and runs it with
# $SLURM_NTASKS MPI processes on the allocated nodes.

module purge # recommended to be done in all jobs !!!!!
# module load <modules> # load only modules actually needed by job
module load openmpi/4.1.6-gcc-12.2.0-exh7lqk # Open-MPI (including gcc)

# Report the job environment in the job output.
echo
echo 'Hello from node: '$HOSTNAME
echo 'Number of nodes: '$SLURM_JOB_NUM_NODES
echo 'Tasks per node: '$SLURM_TASKS_PER_NODE
echo 'Partition used: '$SLURM_JOB_PARTITION
echo 'QOS used: '$SLURM_JOB_QOS
echo 'Using the nodes: '$SLURM_JOB_NODELIST
echo

# <do_my_work>
echo 'Compiling he-mpi & running it on all available cores & sort for output ordering'
mpicc he-mpi.c -o he-mpi -D_GNU_SOURCE # could/should be done outside of job

# sort -k3n,3: numeric sort on the 3rd field (the MPI rank), POSIX key syntax.
# cut -c 1-54: keep only the first 54 characters of every line.
mpirun -np $SLURM_NTASKS --map-by core --rank-by slot --bind-to hwthread ./he-mpi | sort -k3n,3 | cut -c 1-54
%%writefile job_openmpi.sh
#!/bin/bash
#SBATCH --job-name openmpi # job name
#SBATCH --nodes 2 # number of nodes (exclusive)
#SBATCH --clusters vsc5 # submit to VSC-5
#SBATCH --qos zen3_0512 # use a qos
#SBATCH --partition zen3_0512 # use partition that fits to the qos
#SBATCH --ntasks-per-node 128 # SLURM_NTASKS_PER_NODE [1 mpi/core]
#SBATCH --reservation jh_training_mpi # reservation DURING COURSE ONLY
#SBATCH --time 0-00:05:00 # max runtime (5 min during course)

# Slurm job script (Open MPI, Fortran version): compiles the C helper and
# he-mpi.f90, then runs he-mpi with $SLURM_NTASKS MPI processes.

module purge # recommended to be done in all jobs !!!!!
# module load <modules> # load only modules actually needed by job
module load openmpi/4.1.6-gcc-12.2.0-exh7lqk # Open-MPI (including gcc)

# Report the job environment in the job output.
echo
echo 'Hello from node: '$HOSTNAME
echo 'Number of nodes: '$SLURM_JOB_NUM_NODES
echo 'Tasks per node: '$SLURM_TASKS_PER_NODE
echo 'Partition used: '$SLURM_JOB_PARTITION
echo 'QOS used: '$SLURM_JOB_QOS
echo 'Using the nodes: '$SLURM_JOB_NODELIST
echo

# <do_my_work>
echo 'Compiling he-mpi & running it on all available cores & sort for output ordering'
# normally compilation could/should be done outside of job
gcc -c help_fortran_find_core_id.c -D_GNU_SOURCE # C helper providing find_core_id
mpif90 he-mpi.f90 help_fortran_find_core_id.o -o he-mpi

# sort -k3n,3: numeric sort on the 3rd field (the MPI rank), POSIX key syntax.
# cut -c 1-54: keep only the first 54 characters of every line.
mpirun -np $SLURM_NTASKS --map-by core --rank-by slot --bind-to hwthread ./he-mpi | sort -k3n,3 | cut -c 1-54
%%writefile job_openmpi.sh
#!/bin/bash
#SBATCH --job-name openmpi # job name
#SBATCH --nodes 2 # number of nodes (exclusive)
#SBATCH --clusters vsc5 # submit to VSC-5
#SBATCH --qos zen3_0512 # use a qos
#SBATCH --partition zen3_0512 # use partition that fits to the qos
#SBATCH --ntasks-per-node 128 # SLURM_NTASKS_PER_NODE [1 mpi/core]
#SBATCH --reservation jh_training_mpi # reservation DURING COURSE ONLY
#SBATCH --time 0-00:05:00 # max runtime (5 min during course)

# Slurm job script (Open MPI, Python version): activates the conda environment
# and runs he-mpi.py with $SLURM_NTASKS MPI processes.

module purge # recommended to be done in all jobs !!!!!
module load miniconda3 # activate the conda openmpi environment
eval "$(conda shell.bash hook)" # conda init; needed for activate
conda activate /opt/sw/jupyterhub/envs/conda/vsc5/jupyterhub-vsc-v3
# module load <modules> # load only modules actually needed by job
module load openmpi/4.1.6-gcc-12.2.0-exh7lqk # Open-MPI
export LD_LIBRARY_PATH="$LIBRARY_PATH:$LD_LIBRARY_PATH" # needed for Python

# Report the job environment in the job output.
echo
echo 'Hello from node: '$HOSTNAME
echo 'Number of nodes: '$SLURM_JOB_NUM_NODES
echo 'Tasks per node: '$SLURM_TASKS_PER_NODE
echo 'Partition used: '$SLURM_JOB_PARTITION
echo 'QOS used: '$SLURM_JOB_QOS
echo 'Using the nodes: '$SLURM_JOB_NODELIST
echo

# <do_my_work>
echo 'Running he-mpi on all available cores & sort for output ordering'

# sort -k3n,3: numeric sort on the 3rd field (the MPI rank), POSIX key syntax.
mpirun -np $SLURM_NTASKS --map-by core --rank-by slot --bind-to hwthread python3 ./he-mpi.py | sort -k3n,3
Submit the job script with sbatch
sbatch job_openmpi.sh
Note, on the JupyterHub, we have to get rid of the JupyterHub environment before we can submit a job
source ../tooling/unload_jupyter_env.sh; sbatch job_openmpi.sh
Check the job status as often as needed (job is queued (PD), running (R), or done (empty list))
squeue --me
Click on the slurm-JOB_ID.out file in the left menu (if you did not submit a job you can look at the provided job output).
Job script using the Intel-MPI library¶
%%writefile job_intelmpi.sh
#!/bin/bash
#SBATCH --job-name intelmpi # job name
#SBATCH --nodes 2 # number of nodes (exclusive)
#SBATCH --clusters vsc5 # submit to VSC-5
#SBATCH --qos zen3_0512 # use a qos
#SBATCH --partition zen3_0512 # use partition that fits to the qos
#SBATCH --ntasks-per-node 128 # SLURM_NTASKS_PER_NODE [1 mpi/core]
#SBATCH --reservation jh_training_mpi # reservation DURING COURSE ONLY
#SBATCH --time 0-00:05:00 # max runtime (5 min during course)

# Slurm job script (Intel MPI, C version): compiles he-mpi.c to he-mpi-intel
# and runs it with $SLURM_NTASKS MPI processes.

module purge # recommended to be done in all jobs !!!!!
# module load <modules> # load only modules actually needed by job
module load intel intel-mpi # VERY OLD Intel compiler and Intel-MPI

# Report the job environment in the job output.
echo
echo 'Hello from node: '$HOSTNAME
echo 'Number of nodes: '$SLURM_JOB_NUM_NODES
echo 'Tasks per node: '$SLURM_TASKS_PER_NODE
echo 'Partition used: '$SLURM_JOB_PARTITION
echo 'QOS used: '$SLURM_JOB_QOS
echo 'Using the nodes: '$SLURM_JOB_NODELIST
echo

# <do_my_work>
echo 'Compiling he-mpi & running it on all available cores & sort for output ordering'
mpiicc he-mpi.c -o he-mpi-intel -D_GNU_SOURCE # could/should be done outside of job

# Intel MPI pinning settings: pinning on, processor list 0-127.
# NOTE(review): the two *_RESPECT_* switches presumably make Intel MPI ignore
# the cpuset/placement handed over by Slurm - confirm in the Intel MPI docs.
export I_MPI_PIN=1
export I_MPI_PIN_RESPECT_CPUSET=0
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export I_MPI_PIN_PROCESSOR_LIST=0-127

# sort -k3n,3: numeric sort on the 3rd field (the MPI rank), POSIX key syntax.
# cut -c 1-54: keep only the first 54 characters of every line.
mpirun -np $SLURM_NTASKS ./he-mpi-intel | sort -k3n,3 | cut -c 1-54
%%writefile job_intelmpi.sh
#!/bin/bash
#SBATCH --job-name intelmpi # job name
#SBATCH --nodes 2 # number of nodes (exclusive)
#SBATCH --clusters vsc5 # submit to VSC-5
#SBATCH --qos zen3_0512 # use a qos
#SBATCH --partition zen3_0512 # use partition that fits to the qos
#SBATCH --ntasks-per-node 128 # SLURM_NTASKS_PER_NODE [1 mpi/core]
#SBATCH --reservation jh_training_mpi # reservation DURING COURSE ONLY
#SBATCH --time 0-00:05:00 # max runtime (5 min during course)

# Slurm job script (Intel MPI, Fortran version): compiles the C helper and
# he-mpi.f90 to he-mpi-intel, then runs it with $SLURM_NTASKS MPI processes.

module purge # recommended to be done in all jobs !!!!!
# module load <modules> # load only modules actually needed by job
module load intel intel-mpi # VERY OLD Intel compiler and Intel-MPI

# Report the job environment in the job output.
echo
echo 'Hello from node: '$HOSTNAME
echo 'Number of nodes: '$SLURM_JOB_NUM_NODES
echo 'Tasks per node: '$SLURM_TASKS_PER_NODE
echo 'Partition used: '$SLURM_JOB_PARTITION
echo 'QOS used: '$SLURM_JOB_QOS
echo 'Using the nodes: '$SLURM_JOB_NODELIST
echo

# <do_my_work>
echo 'Compiling he-mpi & running it on all available cores & sort for output ordering'
# normally compilation could/should be done outside of job
icc -c help_fortran_find_core_id.c -D_GNU_SOURCE # C helper providing find_core_id
mpiifort he-mpi.f90 help_fortran_find_core_id.o -o he-mpi-intel

# Intel MPI pinning settings: pinning on, processor list 0-127.
# NOTE(review): the two *_RESPECT_* switches presumably make Intel MPI ignore
# the cpuset/placement handed over by Slurm - confirm in the Intel MPI docs.
export I_MPI_PIN=1
export I_MPI_PIN_RESPECT_CPUSET=0
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export I_MPI_PIN_PROCESSOR_LIST=0-127

# sort -k3n,3: numeric sort on the 3rd field (the MPI rank), POSIX key syntax.
# cut -c 1-54: keep only the first 54 characters of every line.
mpirun -np $SLURM_NTASKS ./he-mpi-intel | sort -k3n,3 | cut -c 1-54
%%writefile job_intelmpi.sh
#!/bin/bash
#SBATCH --job-name intelmpi # job name
#SBATCH --nodes 2 # number of nodes (exclusive)
#SBATCH --clusters vsc5 # submit to VSC-5
#SBATCH --qos zen3_0512 # use a qos
#SBATCH --partition zen3_0512 # use partition that fits to the qos
#SBATCH --ntasks-per-node 128 # SLURM_NTASKS_PER_NODE [1 mpi/core]
#SBATCH --reservation jh_training_mpi # reservation DURING COURSE ONLY
#SBATCH --time 0-00:05:00 # max runtime (5 min during course)

# Slurm job script (Intel MPI, Python version): activates the conda
# environment and runs he-mpi.py with $SLURM_NTASKS MPI processes.

module purge # recommended to be done in all jobs !!!!!
module load miniconda3 # conda; needed to activate the environment below
eval "$(conda shell.bash hook)" # conda init; needed for activate
conda activate /home/fs70824/trainee00/gitlab_sync_courses/conda-source-env-intel
# module load <modules> # load only modules actually needed by job
module load intel intel-mpi # VERY OLD Intel compiler and Intel-MPI

# Report the job environment in the job output.
echo
echo 'Hello from node: '$HOSTNAME
echo 'Number of nodes: '$SLURM_JOB_NUM_NODES
echo 'Tasks per node: '$SLURM_TASKS_PER_NODE
echo 'Partition used: '$SLURM_JOB_PARTITION
echo 'QOS used: '$SLURM_JOB_QOS
echo 'Using the nodes: '$SLURM_JOB_NODELIST
echo

# <do_my_work>
echo 'Running he-mpi on all available cores & sort for output ordering'

# Intel MPI pinning settings: pinning on, processor list 0-127.
# NOTE(review): the two *_RESPECT_* switches presumably make Intel MPI ignore
# the cpuset/placement handed over by Slurm - confirm in the Intel MPI docs.
export I_MPI_PIN=1
export I_MPI_PIN_RESPECT_CPUSET=0
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export I_MPI_PIN_PROCESSOR_LIST=0-127

# sort -k3n,3: numeric sort on the 3rd field (the MPI rank), POSIX key syntax.
mpirun -np $SLURM_NTASKS python3 ./he-mpi.py | sort -k3n,3
Submit the job script with sbatch
sbatch job_intelmpi.sh
Note, on the JupyterHub, we have to get rid of the JupyterHub environment before we can submit a job
source ../tooling/unload_jupyter_env.sh; sbatch job_intelmpi.sh
Check the job status as often as needed (job is queued (PD), running (R), or done (empty list))
squeue --me
Click on the slurm-JOB_ID.out file in the left menu (if you did not submit a job you can look at the provided job output).
Keypoints
Version of the MPI library in use
Extended version of the “Hello world!” with information about the hardware resources
Job submission on a cluster