"docs/XcodeGuide.md" did not exist on "642acbd61235dc68f606237193cf7e7c4a61af67"
Commit 7e8d5453 authored by one's avatar one
Browse files

[rocHPL] Test single-node and multi-node using a single script

parent d49f25a8
#!/usr/bin/env bash
# Author: Noel Chalmers
# set -x #echo on
# #################################################
# helper functions
# #################################################
# Print the usage summary for this launcher to stdout.
# Takes no arguments and does not exit; callers decide what to do afterwards.
function display_help()
{
  echo "rocHPL MPI run helper script"
  echo "./mpirun_rochpl "
  echo " [-P] Specific MPI grid size: the number of "
  echo " rows in MPI grid. "
  echo " [-Q] Specific MPI grid size: the number of "
  echo " columns in MPI grid. "
  echo " [-p] Specific node-local MPI grid size: the number "
  echo " of rows in node-local MPI grid. Must evenly "
  echo " divide P. "
  echo " [-q] Specific node-local MPI grid size: the number "
  echo " of columns in node-local MPI grid. Must evenly"
  echo " divide Q. "
  echo " [-N] Specific matrix size: the number of "
  echo " rows/columns in global matrix. "
  echo " [--NB] Specific panel size: the number of "
  echo " rows/columns in panels. "
  echo " [--it] Iterations: the number of times to run each "
  echo " problem size. "
  echo " [-f] Specific split fraction: the percentage to "
  echo " split the trailing submatrix. "
  echo " [-i] Input file. When set, all other command "
  echo " line parameters are ignored, and problem "
  echo " parameters are read from input file. "
  # --devices is accepted by the option parser and forwarded unchanged
  echo " [--devices] Value forwarded to run_rochpl as --devices. "
  echo " [-H|--hosts] Comma-separated list of nodes to run on. "
  echo " [--tcp-iface] TCP interface to use for communication. "
  echo " [--port] SSH port to use for remote connections. "
  echo " [-h|--help] prints this help message "
  echo " [--version] Print rocHPL version number. "
}
# This function is helpful for dockerfiles that do not have sudo installed, but the default user is root
# true is a system command that completes successfully, function returns success
# prereq: ${ID} must be defined before calling
# Check that the distribution named by ${ID} is one this script supports.
# Prereq:  ${ID} must be set before calling (it comes from /etc/os-release).
# Returns: 0 when the distribution is supported.
# Exits:   2 when ${ID} is unset or names an unsupported distribution.
supported_distro( )
{
  # ${ID+foo} expands to "foo" only when ID is set (even if empty)
  if [ -z "${ID+foo}" ]; then
    printf "supported_distro(): \$ID must be set\n"
    exit 2
  fi

  case "${ID}" in
    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
      true
      ;;
    *)
      # Keep this message in step with the pattern list above
      printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora, SLES, TencentOS, Kylin and Rocky\n"
      exit 2
      ;;
  esac
}
# #################################################
# Pre-requisites check
# #################################################
# Exit code 0: all is well
# Exit code 1: problems with getopt
# Exit code 2: problems with supported platforms
# getopt is required for argument parsing; bail out early when it is missing
if ! type getopt > /dev/null; then
  echo "This script uses getopt to parse arguments; try installing the util-linux package";
  exit 1
fi

# /etc/os-release supplies ${ID}, which identifies the running distribution
if [[ ! -e "/etc/os-release" ]]; then
  echo "This script depends on the /etc/os-release file"
  exit 2
fi
. /etc/os-release

# Exits the script when the detected distribution is not supported
supported_distro
# Count the "Device ID" lines printed by hy-smi to learn how many GPUs each
# node has; when nothing is counted, assume 8 GPUs per node.
ngpu_per_node=$(hy-smi --showid 2>/dev/null | grep -ic "Device ID")
if [[ -n "${ngpu_per_node}" && "${ngpu_per_node}" -ne 0 ]]; then
  echo "Detected ${ngpu_per_node} GPUs per node."
else
  echo "Failed to get the number of GPUs per node via hy-smi. Defaulting to 8."
  ngpu_per_node=8
fi
# #################################################
# global variables
# #################################################
# Grab options from CMake config
# (the @...@ placeholders are presumably substituted when this template is
# configured by the build system)
# NOTE(review): rochpl_bin is not referenced again in this launcher script.
rochpl_bin=@CMAKE_INSTALL_PREFIX@/bin/rochpl
mpi_bin=@MPIEXEC_EXECUTABLE@
rochpl_runscript=$(dirname "$0")/run_rochpl #assume run_rochpl is in the same location
# Defaults for the command-line options; each is overwritten by the matching
# flag in the parsing loop below and forwarded to run_rochpl.
# P/Q: MPI grid rows/columns. p/q: node-local grid rows/columns.
P=1
Q=1
p=-1
q=-1
# N: global matrix size. NB: panel size. it: iterations. frac: split fraction.
N=45312
NB=384
it=1
frac=0.3
filename=HPL.dat
# inputfile is set to true by -i; cmdrun is set to true by -N or --NB
inputfile=false
cmdrun=false
# Bundled third-party libraries live in ../tpl relative to the resolved
# location of this script; OpenMPI and UCX are put on the library path.
tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
ompi_prefix=$tpl_dir/openmpi
ompi_lib_dir=$tpl_dir/openmpi/lib
ucx_lib_dir=$tpl_dir/ucx/lib
export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH
export OPAL_PREFIX=$ompi_prefix
# devices: forwarded as --devices when non-empty (set by --devices)
devices=
# nodes: comma-separated host list (-H|--hosts); empty selects single-node mode
nodes=
# Defaults for multi-node runs; override with --tcp-iface and --port
tcp_iface=p14p2
ssh_port=3333
# #################################################
# MPI Args
# #################################################
# Physical core count: cores per socket (4th field of lscpu's "Core(s)" line)
# multiplied by the socket count (2nd field of lscpu's "Socket" line).
num_cpu_cores=$(lscpu | awk '/Core\(s\)/ {print $4}')
num_cpu_sockets=$(lscpu | awk '/Socket/ {print $2}')
total_cpu_cores=$(( ${num_cpu_cores} * ${num_cpu_sockets} ))
# #################################################
# Parameter parsing
# #################################################
# `getopt -T` exits with status 4 only for the modern getopt that can handle
# whitespace and long parameters; refuse to continue with anything else.
getopt -T
if [[ $? -ne 4 ]]; then
  echo "Need a new version of getopt"
  exit 1
fi

if ! GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,it:,help,version,devices:,hosts:,port:,tcp-iface:, --options hP:Q:p:q:N:i:f:H: -- "$@"); then
  echo "getopt invocation failed; could not parse the command line";
  exit 1
fi

# Replace the positional parameters with getopt's normalized output
eval set -- "${GETOPT_PARSE}"

# Consume options until getopt's "--" terminator is reached
while :; do
  case "${1}" in
    -h|--help)
      display_help
      exit 0
      ;;
    --version)
      ${mpi_bin} --allow-run-as-root -np 1 ${rochpl_runscript} --version
      exit 0
      ;;
    -P) P=${2}; shift 2 ;;
    -Q) Q=${2}; shift 2 ;;
    -p) p=${2}; shift 2 ;;
    -q) q=${2}; shift 2 ;;
    -N)
      # An explicit problem size selects command-line mode
      N=${2}
      cmdrun=true
      shift 2
      ;;
    --NB)
      NB=${2}
      cmdrun=true
      shift 2
      ;;
    --it) it=${2}; shift 2 ;;
    -f) frac=${2}; shift 2 ;;
    -i)
      # An explicit input file selects input-file mode
      filename=${2}
      inputfile=true
      shift 2
      ;;
    --devices) devices=${2}; shift 2 ;;
    -H|--hosts) nodes=${2}; shift 2 ;;
    --tcp-iface) tcp_iface=${2}; shift 2 ;;
    --port) ssh_port=${2}; shift 2 ;;
    --)
      shift
      break
      ;;
    *)
      echo "Unexpected command line parameter received; aborting";
      exit 1
      ;;
  esac
done
# If neither an input file nor a problem size was given on the command line,
# default to running with the default input file.
if [[ "${inputfile}" == false ]] && [[ "${cmdrun}" == false ]]; then
  inputfile=true
fi

# Total number of MPI ranks is the product of the grid dimensions
np=$(( $P * $Q ))
if [[ "$np" -lt 1 ]]; then
  echo "Invalid MPI grid parameters; aborting";
  exit 1
fi

# Assemble the argument string handed to run_rochpl: grid options first, then
# either the input file or the explicit problem size, then the shared tail.
rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q}"
if [[ "${inputfile}" == true ]]; then
  rochpl_args+=" -i ${filename}"
else
  rochpl_args+=" -N ${N} --NB ${NB}"
fi
rochpl_args+=" -f ${frac} --it ${it}"

if [[ -n "${devices}" ]]; then
  rochpl_args+=" --devices=${devices}"
fi
# Launch rocHPL through mpirun.
#   - Without --hosts: run all ranks on the local node.
#   - With --hosts:    rsync the build/tpl directories (and the input file in
#                      input-file mode) to every other listed node, then start
#                      a multi-node run with ${ngpu_per_node} slots per host.
# ${rochpl_args} is left unquoted on purpose so it splits into separate words.
if [[ -z "${nodes}" ]]; then
  echo "No compute nodes specified. Running in single-node mode."
  ${mpi_bin} --allow-run-as-root \
    --bind-to none \
    --mca pml ucx \
    --mca osc ucx \
    --mca btl ^vader,tcp,openib,uct \
    --mca coll ^hcoll \
    -x UCX_TLS=self,sm,rocm \
    -x UCX_RNDV_SCHEME=put_zcopy \
    -x UCX_MEMTYPE_CACHE=n \
    -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
    -np "${np}" \
    "${rochpl_runscript}" ${rochpl_args}
else
  echo "Running in multi-node mode. Using nodes: ${nodes}"
  echo "Using TCP interface: ${tcp_iface}"
  echo "Using SSH port: ${ssh_port}"

  # Set rank counts for hosts: "a,b" becomes "a:<ngpu>,b:<ngpu>"
  IFS=',' read -ra node_array <<< "${nodes}"
  hosts_string=""
  for node in "${node_array[@]}"; do
    hosts_string+="${node}:${ngpu_per_node},"
  done
  hosts_string="${hosts_string%,}"
  echo "MPI hosts: ${hosts_string}"

  # Every listed node except the one we are running on needs a copy
  current_node=$(hostname)
  copyto_hosts=()
  for node in "${node_array[@]}"; do
    if [[ "${node}" != "${current_node}" ]]; then
      copyto_hosts+=("${node}")
    fi
  done

  # Copy files using rsync only if there are other nodes to copy to
  if [[ ${#copyto_hosts[@]} -gt 0 ]]; then
    echo "Copying files to other nodes in parallel: ${copyto_hosts[*]}"

    # The input file is only shipped when running in input-file mode
    rsync_sources=(build tpl)
    if [[ "${inputfile}" != false ]]; then
      rsync_sources+=("${filename}")
    fi

    rsync_pids=()
    for node in "${copyto_hosts[@]}"; do
      rsync -az -e "ssh -p ${ssh_port}" "${rsync_sources[@]}" "${node}:/workspace/" &
      rsync_pids+=("$!")
    done

    # A bare `wait` always returns 0, which would hide a failed transfer.
    # Wait on each PID so its exit status is actually inspected.
    rsync_failed=false
    for idx in "${!rsync_pids[@]}"; do
      if ! wait "${rsync_pids[${idx}]}"; then
        echo "Failed to copy files to ${copyto_hosts[${idx}]}"
        rsync_failed=true
      fi
    done
    if [[ "${rsync_failed}" == true ]]; then
      echo "File synchronization failed; aborting"
      exit 1
    fi
    echo "Files synchronized successfully."
  fi

  # Multi-node run
  ${mpi_bin} --allow-run-as-root \
    --prefix "${ompi_prefix}" \
    --map-by "ppr:${ngpu_per_node}:node" --bind-to none \
    --mca pml ucx \
    --mca osc ucx \
    --mca btl ^openib \
    --mca btl_tcp_if_include "${tcp_iface}" \
    --mca plm_rsh_args "-p ${ssh_port}" \
    --mca coll_hcoll_enable 0 \
    -x UCX_TLS=self,sm,rocm,rc \
    -x UCX_RNDV_SCHEME=put_zcopy \
    -x UCX_RNDV_FRAG_MEM_TYPE=rocm \
    -x UCX_MEMTYPE_CACHE=n \
    -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
    -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
    -np "${np}" \
    -H "${hosts_string}" \
    "${rochpl_runscript}" ${rochpl_args}
fi
\ No newline at end of file
diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in
index 155f502..a0e8a41 100755
--- a/scripts/mpirun_rochpl.in
+++ b/scripts/mpirun_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -101,6 +101,13 @@ filename=HPL.dat
inputfile=false
cmdrun=false
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ompi_prefix=$tpl_dir/openmpi
+ompi_lib_dir=$tpl_dir/openmpi/lib
+ucx_lib_dir=$tpl_dir/ucx/lib
+
+export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH
+export OPAL_PREFIX=$ompi_prefix
devices=
# #################################################
@@ -111,21 +118,6 @@ num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}')
num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
-#Default MPI options
-mpi_args=
-
-#Check if using OpenMPI
-if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then
- mpi_args+=" --map-by node --rank-by slot --bind-to none "
-
- #Check if this is OpenMPI+UCX
- ompi_info=$(dirname ${mpi_bin})/ompi_info
- if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then
- # ucx-specific args
- mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
- fi
-fi
-
# #################################################
# Parameter parsing
# #################################################
@@ -153,7 +145,7 @@ while true; do
exit 0
;;
--version)
- ${mpi_bin} -np 1 ${mpi_args} ${rochpl_runscript} --version
+ ${mpi_bin} --allow-run-as-root -np 1 ${rochpl_runscript} --version
exit 0
;;
-P)
@@ -218,5 +210,25 @@ if [ ! -z "${devices}" ]; then
rochpl_args+=" --devices=${devices}"
fi
+echo "Copying files..."
+rsync -az -e 'ssh -p 3333' build tpl ${filename} node02:/workspace/
+
#run
-${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
+${mpi_bin} --allow-run-as-root \
+ --prefix ${ompi_prefix} \
+ --map-by ppr:8:node --bind-to none \
+ --mca pml ucx \
+ --mca osc ucx \
+ --mca btl ^openib \
+ --mca btl_tcp_if_include p14p2 \
+ --mca plm_rsh_args "-p 3333" \
+ --mca coll_hcoll_enable 0 \
+ -x UCX_TLS=self,sm,rocm,rc \
+ -x UCX_RNDV_SCHEME=put_zcopy \
+ -x UCX_RNDV_FRAG_MEM_TYPE=rocm \
+ -x UCX_MEMTYPE_CACHE=n \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+ -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
+ -np 16 \
+ -H node01:8,node02:8 \
+ ${rochpl_runscript} ${rochpl_args}
diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in
index 1522e5d..68c1958 100755
--- a/scripts/run_rochpl.in
+++ b/scripts/run_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -104,7 +104,9 @@ cmdrun=false
devices=
-export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ucx_lib_dir=$tpl_dir/ucx/lib
+export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$ucx_lib_dir:${rocm_dir}/lib:$LD_LIBRARY_PATH
# #################################################
# Parameter parsing
@@ -274,7 +276,7 @@ myq=$((rank/p))
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
#construct list of devices and their numa affinities
-devicelist=$(${rocm_dir}/bin/rocm-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
#count the cpus per core
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
@@ -361,9 +363,23 @@ export OMP_NUM_THREADS=${omp_num_threads}
export OMP_PLACES=${omp_places}
export OMP_PROC_BIND=true
+# Hard-coded IB mapping for now
+declare -a IB_MAP=(
+ "mlx5_1:1" # GPU 0 -> NUMA 3
+ "mlx5_2:1" # GPU 1 -> NUMA 3
+ "mlx5_3:1" # GPU 2 -> NUMA 0
+ "mlx5_4:1" # GPU 3 -> NUMA 0
+ "mlx5_7:1" # GPU 4 -> NUMA 7
+ "mlx5_8:1" # GPU 5 -> NUMA 7
+ "mlx5_9:1" # GPU 6 -> NUMA 4
+ "mlx5_10:1" # GPU 7 -> NUMA 4
+)
+
+myib=${IB_MAP[$rank]}
+export UCX_NET_DEVICES=${myib}
if [[ $globalRank -lt $size ]]; then
- echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places"
+ echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
fi
rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}"
@@ -374,4 +390,5 @@ else
fi
#run
-${rochpl_bin} ${rochpl_args}
+#${rochpl_bin} ${rochpl_args}
+numactl -N ${mynuma} -m ${mynuma} ${rochpl_bin} ${rochpl_args}
diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in
index 155f502..72b25c1 100755
--- a/scripts/mpirun_rochpl.in
+++ b/scripts/mpirun_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -101,6 +101,13 @@ filename=HPL.dat
inputfile=false
cmdrun=false
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ompi_prefix=$tpl_dir/openmpi
+ompi_lib_dir=$tpl_dir/openmpi/lib
+ucx_lib_dir=$tpl_dir/ucx/lib
+
+export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH
+export OPAL_PREFIX=$ompi_prefix
devices=
# #################################################
@@ -111,21 +118,6 @@ num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}')
num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
-#Default MPI options
-mpi_args=
-
-#Check if using OpenMPI
-if [[ $(${mpi_bin} --version | grep "open-mpi") ]]; then
- mpi_args+=" --map-by node --rank-by slot --bind-to none "
-
- #Check if this is OpenMPI+UCX
- ompi_info=$(dirname ${mpi_bin})/ompi_info
- if [[ $(${ompi_info} | grep "MCA pml: ucx") ]]; then
- # ucx-specific args
- mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
- fi
-fi
-
# #################################################
# Parameter parsing
# #################################################
@@ -153,7 +145,7 @@ while true; do
exit 0
;;
--version)
- ${mpi_bin} -np 1 ${mpi_args} ${rochpl_runscript} --version
+ ${mpi_bin} --allow-run-as-root -np 1 ${rochpl_runscript} --version
exit 0
;;
-P)
@@ -218,5 +210,33 @@ if [ ! -z "${devices}" ]; then
rochpl_args+=" --devices=${devices}"
fi
+echo "Copying files..."
+if [[ "${inputfile}" == false ]]; then
+ rsync -az -e 'ssh -p 3333' build tpl node02:/workspace/
+ rsync -az -e 'ssh -p 3333' build tpl node03:/workspace/
+ rsync -az -e 'ssh -p 3333' build tpl node04:/workspace/
+else
+ rsync -az -e 'ssh -p 3333' build tpl ${filename} node02:/workspace/
+ rsync -az -e 'ssh -p 3333' build tpl ${filename} node03:/workspace/
+ rsync -az -e 'ssh -p 3333' build tpl ${filename} node04:/workspace/
+fi
+
#run
-${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
+${mpi_bin} --allow-run-as-root \
+ --prefix ${ompi_prefix} \
+ --map-by ppr:8:node --bind-to none \
+ --mca pml ucx \
+ --mca osc ucx \
+ --mca btl ^openib \
+ --mca btl_tcp_if_include p14p2 \
+ --mca plm_rsh_args "-p 3333" \
+ --mca coll_hcoll_enable 0 \
+ -x UCX_TLS=self,sm,rocm,rc \
+ -x UCX_RNDV_SCHEME=put_zcopy \
+ -x UCX_RNDV_FRAG_MEM_TYPE=rocm \
+ -x UCX_MEMTYPE_CACHE=n \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+ -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
+ -np 32 \
+ -H node01:8,node02:8,node03:8,node04:8 \
+ ${rochpl_runscript} ${rochpl_args}
diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in
index 1522e5d..68c1958 100755
--- a/scripts/run_rochpl.in
+++ b/scripts/run_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( )
fi
case "${ID}" in
- debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos)
+ debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
true
;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -104,7 +104,9 @@ cmdrun=false
devices=
-export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
+ucx_lib_dir=$tpl_dir/ucx/lib
+export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$ucx_lib_dir:${rocm_dir}/lib:$LD_LIBRARY_PATH
# #################################################
# Parameter parsing
@@ -274,7 +276,7 @@ myq=$((rank/p))
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
#construct list of devices and their numa affinities
-devicelist=$(${rocm_dir}/bin/rocm-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
#count the cpus per core
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
@@ -361,9 +363,23 @@ export OMP_NUM_THREADS=${omp_num_threads}
export OMP_PLACES=${omp_places}
export OMP_PROC_BIND=true
+# Hard-coded IB mapping for now
+declare -a IB_MAP=(
+ "mlx5_1:1" # GPU 0 -> NUMA 3
+ "mlx5_2:1" # GPU 1 -> NUMA 3
+ "mlx5_3:1" # GPU 2 -> NUMA 0
+ "mlx5_4:1" # GPU 3 -> NUMA 0
+ "mlx5_7:1" # GPU 4 -> NUMA 7
+ "mlx5_8:1" # GPU 5 -> NUMA 7
+ "mlx5_9:1" # GPU 6 -> NUMA 4
+ "mlx5_10:1" # GPU 7 -> NUMA 4
+)
+
+myib=${IB_MAP[$rank]}
+export UCX_NET_DEVICES=${myib}
if [[ $globalRank -lt $size ]]; then
- echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places"
+ echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
fi
rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}"
@@ -374,4 +390,5 @@ else
fi
#run
-${rochpl_bin} ${rochpl_args}
+#${rochpl_bin} ${rochpl_args}
+numactl -N ${mynuma} -m ${mynuma} ${rochpl_bin} ${rochpl_args}
diff --git a/scripts/wrapper.sh b/scripts/wrapper.sh
new file mode 100755
index 0000000..e11d6dc
--- /dev/null
+++ b/scripts/wrapper.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+tpl_dir=/workspace/tpl
+ompi_prefix=$tpl_dir/openmpi
+ompi_lib_dir=$tpl_dir/openmpi/lib
+ucx_lib_dir=$tpl_dir/ucx/lib
+
+export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${ompi_lib_dir}:${ucx_lib_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH
+export OPAL_PREFIX=$ompi_prefix
+export PATH=${ompi_lib_dir}/../bin:${ucx_lib_dir}/../bin:${PATH}
+
+exec mpirun --allow-run-as-root \
+ --prefix ${ompi_prefix} \
+ -np 16 \
+ -H node01:8,node02:8 \
+ --mca plm_rsh_args "-p 3333" \
+ -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
+ -x UCX_NET_DEVICES=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10 \
+ "$@"
diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in
index 155f502..df6bd9b 100755 index 155f502..0f15d38 100755
--- a/scripts/mpirun_rochpl.in --- a/scripts/mpirun_rochpl.in
+++ b/scripts/mpirun_rochpl.in +++ b/scripts/mpirun_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( ) @@ -31,6 +31,9 @@ function display_help()
echo " [-i] Input file. When set, all other commnand "
echo " line parameters are ignored, and problem "
echo " parameters are read from input file. "
+ echo " [-H|--hosts] Comma-separated list of nodes to run on. "
+ echo " [--tcp-iface] TCP interface to use for communication. "
+ echo " [--port] SSH port to use for remote connections. "
echo " [-h|--help] prints this help message "
echo " [--version] Print rocHPL version number. "
}
@@ -46,7 +49,7 @@ supported_distro( )
fi fi
case "${ID}" in case "${ID}" in
...@@ -11,7 +21,23 @@ index 155f502..df6bd9b 100755 ...@@ -11,7 +21,23 @@ index 155f502..df6bd9b 100755
true true
;; ;;
*) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n" *) printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora and SLES\n"
@@ -101,6 +101,13 @@ filename=HPL.dat @@ -80,6 +83,15 @@ fi
# The following function exits script if an unsupported distro is detected
supported_distro
+# Detect the number of GPUs per node
+ngpu_per_node=$(hy-smi --showid 2>/dev/null | grep -ic "Device ID")
+if [[ -z "${ngpu_per_node}" || "${ngpu_per_node}" -eq 0 ]]; then
+ echo "Failed to get the number of GPUs per node via hy-smi. Defaulting to 8."
+ ngpu_per_node=8
+else
+ echo "Detected ${ngpu_per_node} GPUs per node."
+fi
+
# #################################################
# global variables
# #################################################
@@ -101,7 +113,17 @@ filename=HPL.dat
inputfile=false inputfile=false
cmdrun=false cmdrun=false
...@@ -23,9 +49,13 @@ index 155f502..df6bd9b 100755 ...@@ -23,9 +49,13 @@ index 155f502..df6bd9b 100755
+export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$ompi_lib_dir:$ucx_lib_dir:$LD_LIBRARY_PATH
+export OPAL_PREFIX=$ompi_prefix +export OPAL_PREFIX=$ompi_prefix
devices= devices=
+nodes=
+tcp_iface=p14p2
+ssh_port=3333
# ################################################# # #################################################
@@ -111,21 +118,6 @@ num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}') # MPI Args
@@ -111,21 +133,6 @@ num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}')
num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}') num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
...@@ -47,7 +77,16 @@ index 155f502..df6bd9b 100755 ...@@ -47,7 +77,16 @@ index 155f502..df6bd9b 100755
# ################################################# # #################################################
# Parameter parsing # Parameter parsing
# ################################################# # #################################################
@@ -153,7 +145,7 @@ while true; do @@ -133,7 +140,7 @@ fi
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
- GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,it:,help,version,devices:, --options hP:Q:p:q:N:i:f: -- "$@")
+ GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,it:,help,version,devices:,hosts:,port:,tcp-iface:, --options hP:Q:p:q:N:i:f:H: -- "$@")
else
echo "Need a new version of getopt"
exit 1
@@ -153,7 +160,7 @@ while true; do
exit 0 exit 0
;; ;;
--version) --version)
...@@ -56,25 +95,105 @@ index 155f502..df6bd9b 100755 ...@@ -56,25 +95,105 @@ index 155f502..df6bd9b 100755
exit 0 exit 0
;; ;;
-P) -P)
@@ -219,4 +211,15 @@ if [ ! -z "${devices}" ]; then @@ -189,6 +196,15 @@ while true; do
--devices)
devices=${2}
shift 2 ;;
+ -H|--hosts)
+ nodes=${2}
+ shift 2 ;;
+ --tcp-iface)
+ tcp_iface=${2}
+ shift 2 ;;
+ --port)
+ ssh_port=${2}
+ shift 2 ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
@@ -218,5 +234,77 @@ if [ ! -z "${devices}" ]; then
rochpl_args+=" --devices=${devices}"
fi fi
#run -#run
-${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args} -${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args}
+${mpi_bin} --allow-run-as-root \ +# Run single-node test if --hosts is not set
+ --bind-to none \ +if [ -z "${nodes}" ]; then
+ --mca pml ucx \ + echo "No compute nodes specified. Running in single-node mode."
+ --mca osc ucx \ +
+ --mca btl ^vader,tcp,openib,uct \ + ${mpi_bin} --allow-run-as-root \
+ --mca coll ^hcoll \ + --bind-to none \
+ -x UCX_TLS=self,sm,rocm \ + --mca pml ucx \
+ -x UCX_RNDV_SCHEME=put_zcopy \ + --mca osc ucx \
+ -x UCX_MEMTYPE_CACHE=n \ + --mca btl ^vader,tcp,openib,uct \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \ + --mca coll ^hcoll \
+ -np ${np} \ + -x UCX_TLS=self,sm,rocm \
+ ${rochpl_runscript} ${rochpl_args} + -x UCX_RNDV_SCHEME=put_zcopy \
+ -x UCX_MEMTYPE_CACHE=n \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+ -np ${np} \
+ ${rochpl_runscript} ${rochpl_args}
+else
+ echo "Running in multi-node mode. Using nodes: ${nodes}"
+ echo "Using TCP interface: ${tcp_iface}"
+ echo "Using SSH port: ${ssh_port}"
+
+ # Set rank counts for hosts
+ IFS=',' read -ra node_array <<< "${nodes}"
+ hosts_string=""
+ for node in "${node_array[@]}"; do
+ hosts_string+="${node}:${ngpu_per_node},"
+ done
+ hosts_string="${hosts_string%,}"
+
+ echo "MPI hosts: ${hosts_string}"
+
+ # Copy files to other nodes
+ current_node=$(hostname)
+ copyto_hosts=()
+ for node in "${node_array[@]}"; do
+ if [[ "${node}" != "${current_node}" ]]; then
+ copyto_hosts+=("${node}")
+ fi
+ done
+
+ # Copy files using rsync only if there are other nodes to copy to
+ if [ ${#copyto_hosts[@]} -gt 0 ]; then
+ echo "Copying files to other nodes in parallel: ${copyto_hosts[@]}"
+ for node in "${copyto_hosts[@]}"; do
+ if [[ "${inputfile}" == false ]]; then
+ rsync -az -e "ssh -p ${ssh_port}" build tpl "${node}:/workspace/" &
+ else
+ rsync -az -e "ssh -p ${ssh_port}" build tpl ${filename} "${node}:/workspace/" &
+ fi
+ done
+ wait
+ echo "Files synchronized successfully."
+ fi
+
+ # Multi-node run
+ ${mpi_bin} --allow-run-as-root \
+ --prefix ${ompi_prefix} \
+ --map-by ppr:${ngpu_per_node}:node --bind-to none \
+ --mca pml ucx \
+ --mca osc ucx \
+ --mca btl ^openib \
+ --mca btl_tcp_if_include ${tcp_iface} \
+ --mca plm_rsh_args "-p ${ssh_port}" \
+ --mca coll_hcoll_enable 0 \
+ -x UCX_TLS=self,sm,rocm,rc \
+ -x UCX_RNDV_SCHEME=put_zcopy \
+ -x UCX_RNDV_FRAG_MEM_TYPE=rocm \
+ -x UCX_MEMTYPE_CACHE=n \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+ -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
+ -np ${np} \
+ -H ${hosts_string} \
+ ${rochpl_runscript} ${rochpl_args}
+fi
\ No newline at end of file
diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in
index 1522e5d..3f840a7 100755 index 1522e5d..68c1958 100755
--- a/scripts/run_rochpl.in --- a/scripts/run_rochpl.in
+++ b/scripts/run_rochpl.in +++ b/scripts/run_rochpl.in
@@ -46,7 +46,7 @@ supported_distro( ) @@ -46,7 +46,7 @@ supported_distro( )
...@@ -106,19 +225,35 @@ index 1522e5d..3f840a7 100755 ...@@ -106,19 +225,35 @@ index 1522e5d..3f840a7 100755
#count the cpus per core #count the cpus per core
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*") threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
@@ -363,7 +365,7 @@ export OMP_PROC_BIND=true @@ -361,9 +363,23 @@ export OMP_NUM_THREADS=${omp_num_threads}
export OMP_PLACES=${omp_places}
export OMP_PROC_BIND=true
+# Hard-coded IB mapping for now
+declare -a IB_MAP=(
+ "mlx5_1:1" # GPU 0 -> NUMA 3
+ "mlx5_2:1" # GPU 1 -> NUMA 3
+ "mlx5_3:1" # GPU 2 -> NUMA 0
+ "mlx5_4:1" # GPU 3 -> NUMA 0
+ "mlx5_7:1" # GPU 4 -> NUMA 7
+ "mlx5_8:1" # GPU 5 -> NUMA 7
+ "mlx5_9:1" # GPU 6 -> NUMA 4
+ "mlx5_10:1" # GPU 7 -> NUMA 4
+)
+
+myib=${IB_MAP[$rank]}
+export UCX_NET_DEVICES=${myib}
if [[ $globalRank -lt $size ]]; then if [[ $globalRank -lt $size ]]; then
- echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places" - echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, CPU Cores: $omp_num_threads - $places"
+ echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, CPU Cores: $omp_num_threads - $places" + echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
fi fi
rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}" rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac} -it ${it}"
@@ -374,4 +376,5 @@ else @@ -374,4 +390,5 @@ else
fi fi
#run #run
-${rochpl_bin} ${rochpl_args} -${rochpl_bin} ${rochpl_args}
+#${rochpl_bin} ${rochpl_args} +#${rochpl_bin} ${rochpl_args}
+numactl --cpunodebind=${mynuma} --membind=${mynuma} ${rochpl_bin} ${rochpl_args} +numactl -N ${mynuma} -m ${mynuma} ${rochpl_bin} ${rochpl_args}
#!/usr/bin/env bash
# Author: Noel Chalmers
# set -x #echo on
# #################################################
# helper functions
# #################################################
# Print the usage summary for this run helper to stdout.
# Takes no arguments and does not exit; callers decide what to do afterwards.
function display_help()
{
  echo "rocHPL run helper script"
  echo "./run_rochpl "
  echo " [-P] Specific MPI grid size: the number of "
  echo " rows in MPI grid. "
  echo " [-Q] Specific MPI grid size: the number of "
  echo " columns in MPI grid. "
  echo " [-p] Specific node-local MPI grid size: the number "
  echo " of rows in node-local MPI grid. Must evenly "
  echo " divide P. "
  echo " [-q] Specific node-local MPI grid size: the number "
  echo " of columns in node-local MPI grid. Must evenly"
  echo " divide Q. "
  echo " [-N] Specific matrix size: the number of "
  echo " rows/columns in global matrix. "
  echo " [--NB] Specific panel size: the number of "
  echo " rows/columns in panels. "
  echo " [--it] Iterations: the number of times to run each "
  echo " problem size. "
  echo " [-f] Specific split fraction: the percentage to "
  echo " split the trailing submatrix. "
  echo " [-i] Input file. When set, all other command "
  echo " line parameters are ignored, and problem "
  echo " parameters are read from input file. "
  echo " [-h|--help] prints this help message "
  echo " [--version] Print rocHPL version number. "
}
# This function is helpful for dockerfiles that do not have sudo installed, but the default user is root
# true is a system command that completes successfully, function returns success
# prereq: ${ID} must be defined before calling
# Check that the distribution named by ${ID} is one this script supports.
# Prereq:  ${ID} must be set before calling (it comes from /etc/os-release).
# Returns: 0 when the distribution is supported.
# Exits:   2 when ${ID} is unset or names an unsupported distribution.
supported_distro( )
{
  # ${ID+foo} expands to "foo" only when ID is set (even if empty)
  if [ -z "${ID+foo}" ]; then
    printf "supported_distro(): \$ID must be set\n"
    exit 2
  fi

  case "${ID}" in
    debian|linuxmint|ubuntu|centos|rhel|fedora|sles|tencentos|kylin|rocky)
      true
      ;;
    *)
      # Keep this message in step with the pattern list above
      printf "This script is currently supported on Debian, Linuxmint, Ubuntu, CentOS, RHEL, Fedora, SLES, TencentOS, Kylin and Rocky\n"
      exit 2
      ;;
  esac
}
# #################################################
# Pre-requisites check
# #################################################
# Exit code 0: all is well
# Exit code 1: problems with getopt
# Exit code 2: problems with supported platforms
# getopt is required for argument parsing; bail out early when it is missing
if ! type getopt > /dev/null; then
  echo "This script uses getopt to parse arguments; try installing the util-linux package";
  exit 1
fi

# /etc/os-release supplies ${ID}, which identifies the running distribution
if [[ ! -e "/etc/os-release" ]]; then
  echo "This script depends on the /etc/os-release file"
  exit 2
fi
. /etc/os-release

# Exits the script when the detected distribution is not supported
supported_distro
# #################################################
# global variables
# #################################################
# Grab options from CMake config
# (the @...@ placeholders are presumably substituted when this template is
# configured by the build system)
rochpl_bin=@CMAKE_INSTALL_PREFIX@/bin/rochpl
rocm_dir=@ROCM_PATH@
rocblas_dir=@ROCBLAS_LIB_PATH@
blas_dir=@HPL_BLAS_DIR@
# Defaults for the command-line options; each is overwritten by the matching
# flag in the parsing loop below.
# P/Q: MPI grid rows/columns. p/q: node-local grid rows/columns; a value
# below 1 means "not specified" and is filled in after parsing.
P=1
Q=1
p=-1
q=-1
# N: global matrix size. NB: panel size. it: iterations. frac: split fraction.
N=45312
NB=384
it=1
frac=0.3
filename=HPL.dat
# inputfile is set to true by -i; cmdrun is set to true by -N or --NB
inputfile=false
cmdrun=false
# devices: set by --devices; empty by default
devices=
# The bundled UCX libraries live in ../tpl/ucx/lib relative to the resolved
# location of this script and are placed on the library search path together
# with the rocBLAS, BLAS and ROCm library directories.
tpl_dir=$(dirname "$(readlink -f "$0")")/../tpl
ucx_lib_dir=$tpl_dir/ucx/lib
export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$ucx_lib_dir:${rocm_dir}/lib:$LD_LIBRARY_PATH
# #################################################
# Parameter parsing
# #################################################
# `getopt -T` exits with status 4 only for the modern getopt that can handle
# whitespace and long parameters; refuse to continue with anything else.
getopt -T
if [[ $? -ne 4 ]]; then
  echo "Need a new version of getopt"
  exit 1
fi

if ! GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,it:,help,version,devices:, --options hP:Q:p:q:N:i:f: -- "$@"); then
  echo "getopt invocation failed; could not parse the command line";
  exit 1
fi

# Replace the positional parameters with getopt's normalized output
eval set -- "${GETOPT_PARSE}"

# Consume options until getopt's "--" terminator is reached
while :; do
  case "${1}" in
    -h|--help)
      display_help
      exit 0
      ;;
    --version)
      ${rochpl_bin} --version
      exit 0
      ;;
    -P) P=${2}; shift 2 ;;
    -Q) Q=${2}; shift 2 ;;
    -p) p=${2}; shift 2 ;;
    -q) q=${2}; shift 2 ;;
    -N)
      # An explicit problem size selects command-line mode
      N=${2}
      cmdrun=true
      shift 2
      ;;
    --NB)
      NB=${2}
      cmdrun=true
      shift 2
      ;;
    --it) it=${2}; shift 2 ;;
    -f) frac=${2}; shift 2 ;;
    -i)
      # An explicit input file selects input-file mode
      filename=${2}
      inputfile=true
      shift 2
      ;;
    --devices) devices=${2}; shift 2 ;;
    --)
      shift
      break
      ;;
    *)
      echo "Unexpected command line parameter received; aborting";
      exit 1
      ;;
  esac
done
# If neither an input file (-i) nor problem parameters (-N/--NB) were given,
# default to running with the default input file
if [[ "${inputfile}" == false && "${cmdrun}" == false ]]; then
  inputfile=true
fi

# P and Q must each be a positive integer. Validating only the product P*Q
# would accept two negative values (e.g. -P -2 -Q -3 gives np=6).
if [[ ! "${P}" =~ ^[0-9]+$ || ! "${Q}" =~ ^[0-9]+$ ]]; then
  echo "Invalid MPI grid parameters; aborting"
  exit 1
fi
# Force base 10 so that a leading zero (e.g. "08") is not parsed as octal
P=$((10#${P}))
Q=$((10#${Q}))
if [[ "${P}" -lt 1 || "${Q}" -lt 1 ]]; then
  echo "Invalid MPI grid parameters; aborting"
  exit 1
fi

# Total number of MPI processes in the grid
np=$((P * Q))
#######################################
# Now figure out the CPU core mappings
#######################################
# Get local process numbering from the environment of whichever launcher
# started this process. The launcher variables may legitimately be absent, so
# unset-variable checking is switched off while probing them.
#   globalRank/globalSize: rank and size over the whole job
#   rank/size:             rank and size among the processes on this node
set +u
if [[ -n ${OMPI_COMM_WORLD_LOCAL_RANK+x} ]]; then
  # Open MPI
  globalRank=$OMPI_COMM_WORLD_RANK
  globalSize=$OMPI_COMM_WORLD_SIZE
  rank=$OMPI_COMM_WORLD_LOCAL_RANK
  size=$OMPI_COMM_WORLD_LOCAL_SIZE
elif [[ -n ${SLURM_LOCALID+x} ]]; then
  # Slurm
  globalRank=$SLURM_PROCID
  globalSize=$SLURM_NTASKS
  rank=$SLURM_LOCALID
  #Slurm can return a string like "2(x2),1". Get the first number
  size=$(echo "${SLURM_TASKS_PER_NODE}" | sed -r 's/^([^.]+).*$/\1/; s/^[^0-9]*([0-9]+).*$/\1/')
elif [[ -n ${FLUX_TASK_LOCAL_ID+x} ]]; then
  # Flux: the per-node size is not provided directly, derive it
  globalRank=$FLUX_TASK_RANK
  globalSize=$FLUX_JOB_SIZE
  nnodes=$FLUX_JOB_NNODES
  rank=$FLUX_TASK_LOCAL_ID
  size=$((globalSize/nnodes))
elif [[ "${np:-1}" -eq 1 ]]; then
  # No recognized launcher, but a 1x1 grid (np = P*Q = 1) needs only this
  # single process, so it can run standalone.
  globalRank=0
  globalSize=1
  rank=0
  size=1
else
  # Without launcher information the rank variables would stay unset and the
  # script would stop later with an obscure "unbound variable" error.
  echo "Unable to determine the local MPI rank (no Open MPI, Slurm or Flux environment found); aborting"
  exit 1
fi
set -u
# Determine the node-local process grid (p x q) for the ${size} ranks on this
# node. A dimension below 1 means it was not given on the command line.
local_grid_error="Invalid MPI grid parameters; Unable to form node-local grid; aborting"
if [[ "$p" -ge 1 && "$q" -ge 1 ]]; then
  # Both p and q were specified: together they must cover every local rank
  if [[ $size -ne $((p*q)) ]]; then
    echo "${local_grid_error}"
    exit 1
  fi
elif [[ "$p" -ge 1 ]]; then
  # Only p was specified: q follows from the local rank count
  if [[ $((size % p)) -gt 0 ]]; then
    echo "${local_grid_error}"
    exit 1
  fi
  q=$(( size/p ))
elif [[ "$q" -ge 1 ]]; then
  # Only q was specified: p follows from the local rank count
  if [[ $((size % q)) -gt 0 ]]; then
    echo "${local_grid_error}"
    exit 1
  fi
  p=$(( size/q ))
else
  # Nothing was specified: take Q columns, capped at the local rank count
  q=$(( (Q<=size) ? Q : size))
  if [[ $((size % q)) -gt 0 ]]; then
    echo "${local_grid_error}"
    exit 1
  fi
  p=$(( size/q ))
fi

# The P rows of the global grid must be evenly divided among nodes
if [[ $((P % p)) -gt 0 ]]; then
  echo "Invalid MPI grid parameters; Must have the same number of P rows on every node; aborting"
  exit 1
fi

# The Q columns of the global grid must be evenly divided among nodes
if [[ $((Q % q)) -gt 0 ]]; then
  echo "Invalid MPI grid parameters; Must have the same number of Q columns on every node; aborting"
  exit 1
fi

# Coordinates of this rank in the node-local grid: the row index cycles
# fastest, the column index advances once every p ranks
myp=$((rank % p))
myq=$((rank / p))
#######################################
# Count the logical CPUs that sit on physical core 0, i.e. the number of
# hardware threads per core.
# Arguments: $1 - tab-separated "CPU<TAB>CORE<TAB>NODE" table, one CPU per line
# Outputs:   the count on stdout (0 when no line has CORE equal to 0)
#######################################
count_threads_per_core() {
  # Compare the CORE field itself; a grep pattern with literal whitespace
  # around the 0 silently matches nothing when the separator is a tab.
  awk -F '\t' 'NF >= 2 && $2 != "" && $2 == 0 { n++ } END { print n + 0 }' <<< "${1}"
}

#######################################
# Count NUMA domains as (highest NODE id + 1).
# Arguments: $1 - tab-separated "CPU<TAB>CORE<TAB>NODE" table, one CPU per line
# Outputs:   the count on stdout (1 when the NODE column is empty)
#######################################
count_numa_domains() {
  # Scan every line: the CPU with the highest core id is not guaranteed to
  # live in the highest-numbered NUMA node.
  awk -F '\t' 'NF >= 3 && $3 + 0 > max { max = $3 + 0 } END { print max + 1 }' <<< "${1}"
}

#construct a list of all cpus, sorted by core
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
#construct list of devices and their numa affinities
# (tail -n +2 drops the first output line, presumably the CSV header)
devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
#count the cpus per core
threads_per_core=$(count_threads_per_core "${cpulist}")
#remove the extra cpus on each core to make a list of just physical cores, then sort by numa domain
corelist=$(echo "$cpulist" | awk -v tpc="${threads_per_core}" '(NR-1)%tpc==0' | sort -k 3 -g -s)
#count numa domains
n_numa=$(count_numa_domains "${cpulist}")
#######################################
# Parse the list of physical cores into arrays and count the cores in each
# NUMA domain.
# Globals:   n_numa   (read)  number of NUMA domains
#            corelist (read)  one "CPU CORE NODE" line per physical core
#            numa_core_counts (written) cores per NUMA domain
#            numa_proc_counts (written) zero-initialized, filled in later
#            cpus             (written) CPU ids in corelist order
#######################################
build_core_tables() {
  local i node
  local -a fields
  numa_core_counts=()
  numa_proc_counts=()
  for ((i = 0; i < n_numa; i++)); do
    numa_core_counts+=(0)
    numa_proc_counts+=(0)
  done
  cpus=()
  while read -r -a fields; do
    # Skip blank lines (an empty corelist still yields one empty line)
    ((${#fields[@]} > 0)) || continue
    # An empty NODE column is treated as NUMA domain 0
    node=${fields[2]:-0}
    cpus+=("${fields[0]}")
    numa_core_counts[node]=$((${numa_core_counts[node]:-0} + 1))
  # The here-string must be quoted: bash releases before 4.4 word-split an
  # unquoted here-string and collapse all lines into a single one.
  done <<< "${corelist}"
}
build_core_tables
#######################################
# Compute, for every NUMA domain, the index of its first core in the list of
# cores grouped by NUMA domain: an exclusive prefix sum of the per-domain
# core counts.
# Globals:   n_numa, numa_core_counts (read); numa_core_offsets (written)
#######################################
compute_numa_core_offsets() {
  local i
  numa_core_offsets=(0)
  for ((i = 1; i < n_numa; i++)); do
    # Domain i starts after all cores of domain i-1, so add the count of the
    # PREVIOUS domain. Adding the count of domain i itself is only correct
    # when every domain has the same number of cores.
    numa_core_offsets+=($((numa_core_offsets[i - 1] + numa_core_counts[i - 1])))
  done
}
compute_numa_core_offsets
#######################################
# Parse the device to NUMA mapping: the second column of every devicelist
# line is stored at the position of that line.
# Globals:   devicelist (read); device_to_numa (written)
#######################################
parse_device_numa() {
  local -a fields
  device_to_numa=()
  while read -r -a fields; do
    # Ignore blank or truncated lines that carry no NUMA column
    ((${#fields[@]} > 1)) || continue
    device_to_numa+=("${fields[1]}")
  # The here-string must be quoted: bash releases before 4.4 word-split an
  # unquoted here-string and collapse all lines into a single one.
  done <<< "${devicelist}"
}
parse_device_numa
# Assign a device to every node-local rank, round-robin over the available
# devices: either the user-supplied --devices list or all detected devices.
rank_to_device=()
if [[ -n "${devices}" ]]; then
  IFS=',' read -r -a device_array <<< "$devices"
  n_devices=${#device_array[@]}
else
  n_devices=$(echo "${devicelist}" | grep -c "card")
fi

# Without this check an empty device list ends in a division by zero below
if [[ "${n_devices}" -lt 1 ]]; then
  echo "No GPU devices found; aborting"
  exit 1
fi

if [[ -n "${devices}" ]]; then
  for ((i = 0; i < size; i++)); do
    rank_to_device+=("${device_array[i % n_devices]}")
  done
  # Only an explicit device list restricts which device this process may see
  export ROCR_VISIBLE_DEVICES=${rank_to_device[rank]}
else
  for ((i = 0; i < size; i++)); do
    rank_to_device+=($((i % n_devices)))
  done
fi

# Device and NUMA domain of this rank
mygpu=${rank_to_device[rank]}
mynuma=${device_to_numa[mygpu]:-}
if [[ -z "${mynuma}" ]]; then
  # e.g. a --devices entry that does not correspond to a detected device
  echo "No NUMA affinity known for GPU ${mygpu}; aborting"
  exit 1
fi
# NUMA domain of the device assigned to each node-local rank
rank_to_numa=()
for ((r = 0; r < size; r++)); do
  dev=${rank_to_device[$((r % n_devices))]}
  rank_to_numa+=(${device_to_numa[${dev}]})
done

# Number of node-local ranks that share each NUMA domain
for ((r = 0; r < size; r++)); do
  numa=${rank_to_numa[$r]}
  ((numa_proc_counts[numa]++))
done

# The cores of a NUMA domain are divided equally among the ranks bound to it
omp_num_threads=$((numa_core_counts[mynuma] / numa_proc_counts[mynuma]))

# Start at the first core of this rank's NUMA domain, then skip one share of
# cores for every lower-numbered local rank living in the same domain
core_offset=${numa_core_offsets[mynuma]}
for ((r = 0; r < rank; r++)); do
  numa=${rank_to_numa[$r]}
  if [[ $numa -eq $mynuma ]]; then
    core_offset=$((core_offset + omp_num_threads))
  fi
done
# One OpenMP place per core of this rank, e.g. "{4},{5},{6}"
omp_places="{${cpus[core_offset]}}"
for ((c = 1; c < omp_num_threads; c++)); do
  omp_places+=",{${cpus[core_offset + c]}}"
done

# Compact "first-last" form of the same cores, used only in the binding report
places="{${cpus[core_offset]}"
if ((omp_num_threads > 1)); then
  places+="-${cpus[core_offset + omp_num_threads - 1]}"
fi
places+="}"

# Export OpenMP config
export OMP_NUM_THREADS="${omp_num_threads}"
export OMP_PLACES="${omp_places}"
export OMP_PROC_BIND=true
# Hard-coded IB mapping for now, indexed by GPU
declare -a IB_MAP=(
  "mlx5_1:1"  # GPU 0 -> NUMA 3
  "mlx5_2:1"  # GPU 1 -> NUMA 3
  "mlx5_3:1"  # GPU 2 -> NUMA 0
  "mlx5_4:1"  # GPU 3 -> NUMA 0
  "mlx5_7:1"  # GPU 4 -> NUMA 7
  "mlx5_8:1"  # GPU 5 -> NUMA 7
  "mlx5_9:1"  # GPU 6 -> NUMA 4
  "mlx5_10:1" # GPU 7 -> NUMA 4
)

#######################################
# Look up the IB device paired with a GPU.
# Globals:   IB_MAP (read)
# Arguments: $1 - GPU index
# Outputs:   the IB device on stdout; nothing when the index is not a
#            non-negative integer or has no entry in IB_MAP
#######################################
ib_device_for_gpu() {
  local gpu=$1
  if [[ "${gpu}" =~ ^[0-9]+$ ]] && ((10#${gpu} < ${#IB_MAP[@]})); then
    printf '%s\n' "${IB_MAP[10#${gpu}]}"
  fi
}

# The table is keyed by GPU, so look it up with this rank's GPU rather than
# its local rank. Both agree for the default round-robin assignment, but not
# when --devices selects other GPUs or when the ranks outnumber the GPUs.
myib=$(ib_device_for_gpu "${mygpu}")
if [[ -n "${myib}" ]]; then
  export UCX_NET_DEVICES=${myib}
else
  echo "Warning: no IB device mapped to GPU ${mygpu}; leaving UCX_NET_DEVICES unchanged" >&2
fi

# Only ranks whose global rank is below the node-local size report their binding
if [[ $globalRank -lt $size ]]; then
  echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] GPU: $mygpu, NUMA: $mynuma, IB: ${myib:-auto}, CPU Cores: $omp_num_threads - $places"
fi
#######################################
# Assemble the rochpl command line as an array so that every value stays a
# single argument (e.g. an input file name containing spaces).
# Globals:   P, Q, p, q, frac, it, inputfile, filename, N, NB (read)
#            rochpl_args (written)
#######################################
build_rochpl_args() {
  rochpl_args=(-P "${P}" -Q "${Q}" -p "${p}" -q "${q}" -f "${frac}" -it "${it}")
  if [[ "${inputfile}" == true ]]; then
    # Problem parameters come from the input file
    rochpl_args+=(-i "${filename}")
  else
    # Problem parameters come from the command line
    rochpl_args+=(-N "${N}" -NB "${NB}")
  fi
}
build_rochpl_args

#run
# Unpinned alternative: "${rochpl_bin}" "${rochpl_args[@]}"
# numactl -N / -m restrict the CPUs and the memory allocation to this rank's
# NUMA domain
numactl -N "${mynuma}" -m "${mynuma}" "${rochpl_bin}" "${rochpl_args[@]}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment