Commit 11052102 authored by one's avatar one
Browse files

[rocHPCG] Add patches and scripts

parent 03a29671
## rocHPCG Patch Files
- `rochpcg-install.patch`
- `rochpcg-scripts-bw.patch`
## Examples
```bash
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 4 --npy 4 --npz 2 \
-H node01,node02,node03,node04 \
--tcp-iface p14p2 \
--ssh-port 3333
```
\ No newline at end of file
#!/bin/bash
set -e
# =================================================
# Helper functions
# =================================================
# Print the usage text for this wrapper, including the current default
# values of all tunables (expanded from the globals at call time).
help() {
cat << USAGE
rocHPCG MPI run helper script
Usage: ${0##*/} [OPTIONS]
OPTIONS:
-h, --help Show this help message and exit
--npx Number of processes in x dimension of process grid (default: ${npx})
--npy Number of processes in y dimension of process grid (default: ${npy})
--npz Number of processes in z dimension of process grid (default: ${npz})
--nx Problem size in x dimension (default: ${nx})
--ny Problem size in y dimension (default: ${ny})
--nz Problem size in z dimension (default: ${nz})
--rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
--tol Residual tolerance, skip reference verification if set (default: ${tol})
--pz Partition boundary in z process dimension (default: 0, uniform grid)
--zl Local nz value for processes with z rank < pz (default: equal to ${nz})
--zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
-H, --hosts Comma-separated list of nodes to run on
--tcp-iface TCP interface to use for communication (default: ${tcp_iface})
--ssh-port SSH port to use for remote connections (default: ${ssh_port})
USAGE
}
# =================================================
# Global variables
# =================================================
# Process-grid and problem-size defaults; every one of these can be
# overridden from the command line.
npx=1; npy=1; npz=1
nx=560; ny=280; nz=280
runtime=60; tol=1; pz=0
# Local z-sizes track the global nz until --nz/--zl/--zu override them.
zl=${nz}; zu=${nz}
# Multi-node settings; an empty 'nodes' selects single-node mode.
nodes=
tcp_iface=p14p2
ssh_port=3333
# Per-rank launcher plus the bundled OpenMPI/UCX installs under ./deps.
deps_dir="${PWD}/deps"
rochpcg_runscript="${PWD}/run_rochpcg"
ompi_prefix="${deps_dir}/openmpi"
mpi_bin="${ompi_prefix}/bin/mpirun"
ompi_lib_dir="${ompi_prefix}/lib"
ompi_lib64_dir="${ompi_prefix}/lib64"
ucx_lib_dir="${deps_dir}/ucx/lib"
ucx_lib64_dir="${deps_dir}/ucx/lib64"
# Put the bundled MPI first on PATH and its libraries first on the loader
# path; OPAL_PREFIX points Open MPI at its relocated install tree.
export PATH="${ompi_prefix}/bin${PATH:+:${PATH}}"
export LD_LIBRARY_PATH="${ompi_lib_dir}:${ompi_lib64_dir}:${ucx_lib_dir}:${ucx_lib64_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export OPAL_PREFIX="${ompi_prefix}"
# Detect the number of GPUs per node.
# BUG FIX: `grep -c` exits non-zero when the count is 0 (or when hy-smi is
# missing and the pipe is empty), which made the assignment itself fail and
# killed the whole script under `set -e` before the fallback below could
# run. `|| true` neutralizes that status so the 0/empty case falls through
# to the default of 8.
ngpu_per_node=$(hy-smi --showid 2>/dev/null | grep -ic "Device ID") || true
if [[ -z "${ngpu_per_node}" || "${ngpu_per_node}" -eq 0 ]]; then
echo "Failed to get the number of GPUs per node via hy-smi. Defaulting to 8."
ngpu_per_node=8
else
echo "Detected ${ngpu_per_node} GPUs per node."
fi
# =================================================
# Parameter parsing
# =================================================
# Normalize the command line with util-linux getopt so long options work;
# getopt exits non-zero on an unknown option, in which case we abort.
# NOTE(review): requires GNU/util-linux getopt — the BSD/macOS getopt does
# not support --longoptions.
GETOPT_PARSE=$(getopt --name "${0}" --options hH: --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu:,hosts:,tcp-iface:,ssh-port: -- "$@") \
|| { echo "getopt invocation failed; could not parse the command line"; exit 1; }
# Replace the positional parameters with the normalized option list.
eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
-h|--help) help; exit 0 ;;
--npx) npx=${2}; shift 2 ;;
--npy) npy=${2}; shift 2 ;;
--npz) npz=${2}; shift 2 ;;
--nx) nx=${2}; shift 2 ;;
--ny) ny=${2}; shift 2 ;;
# --nz also resets zl/zu, which default to nz; later --zl/--zu options
# can still override them because getopt preserves argument order.
--nz)
nz=${2}
zl=${nz}
zu=${nz}
shift 2 ;;
--rt) runtime=${2}; shift 2 ;;
--tol) tol=${2}; shift 2 ;;
--pz) pz=${2}; shift 2 ;;
--zl) zl=${2}; shift 2 ;;
--zu) zu=${2}; shift 2 ;;
-H|--hosts) nodes=${2}; shift 2 ;;
--tcp-iface) tcp_iface=${2}; shift 2 ;;
--ssh-port) ssh_port=${2}; shift 2 ;;
# "--" marks the end of the normalized options.
--) shift ; break ;;
# getopt already rejected unknown options, so reaching here means the
# normalized list is malformed; fail defensively.
*) echo "Unexpected command line parameter received; aborting";
exit 1
;;
esac
done
# Assemble the argument string forwarded to the per-rank run script.
# Kept as one flat, space-separated string because it is expanded unquoted
# (intentional word splitting) at the launch site.
rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz} --nx=${nx} --ny=${ny} --nz=${nz} --rt=${runtime} --tol=${tol} --pz=${pz} --zl=${zl} --zu=${zu}"
# Total MPI ranks = product of the process-grid dimensions.
np=$(( ${npx} * ${npy} * ${npz} ))
# =================================================
# Run rochpcg script
# =================================================
# Run single-node test if --hosts is not set
if [ -z "${nodes}" ]; then
echo "No compute nodes specified. Running in single-node mode."
# Single-node launch: UCX for pml/osc, shared-memory + ROCm transports
# only, hcoll collectives disabled. HSA_FORCE_FINE_GRAIN_PCIE presumably
# enables fine-grained GPU memory for MPI — TODO confirm on target ROCm.
${mpi_bin} --allow-run-as-root \
--bind-to none \
--mca pml ucx \
--mca osc ucx \
--mca btl ^vader,tcp,openib,uct \
--mca coll ^hcoll \
-x UCX_TLS=self,sm,rocm \
-x UCX_RNDV_SCHEME=put_zcopy \
-x UCX_MEMTYPE_CACHE=y \
-x HSA_FORCE_FINE_GRAIN_PCIE=1 \
-np ${np} \
${rochpcg_runscript} ${rochpcg_args}
else
echo "Running in multi-node mode. Using nodes: ${nodes}"
echo "Using TCP interface: ${tcp_iface}"
echo "Using SSH port: ${ssh_port}"
# Set rank counts for hosts
# Build "host:slots" entries, assuming every node exposes ngpu_per_node GPUs.
IFS=',' read -ra node_array <<< "${nodes}"
hosts_string=""
for node in "${node_array[@]}"; do
hosts_string+="${node}:${ngpu_per_node},"
done
hosts_string="${hosts_string%,}"
echo "MPI hosts: ${hosts_string}"
# Copy files to other nodes
# Every listed node except the local one needs the build artifacts.
current_node=$(hostname)
copyto_hosts=()
for node in "${node_array[@]}"; do
if [[ "${node}" != "${current_node}" ]]; then
copyto_hosts+=("${node}")
fi
done
# Copy files using rsync only if there are other nodes to copy to
# NOTE(review): the destination /workspace/ is hard-coded and assumed to be
# the working directory on every remote node — confirm before reuse.
if [ ${#copyto_hosts[@]} -gt 0 ]; then
echo "Copying files to other nodes in parallel: ${copyto_hosts[@]}"
for node in "${copyto_hosts[@]}"; do
rsync -az -e "ssh -p ${ssh_port}" build deps ${rochpcg_runscript} "${node}:/workspace/" &
done
wait
echo "Files synchronized successfully."
fi
# Multi-node run
# Differences vs single-node: adds the rc (RDMA) UCX transport, pins the
# TCP control channel to ${tcp_iface}, passes the custom SSH port to the
# rsh launcher, and exports PATH/LD_LIBRARY_PATH/OPAL_PREFIX so remote
# ranks find the bundled OpenMPI.
${mpi_bin} --allow-run-as-root \
--prefix ${ompi_prefix} \
--map-by ppr:${ngpu_per_node}:node --bind-to none \
--mca pml ucx \
--mca osc ucx \
--mca btl ^openib \
--mca btl_tcp_if_include ${tcp_iface} \
--mca plm_rsh_args "-p ${ssh_port}" \
--mca coll_hcoll_enable 0 \
-x UCX_TLS=self,sm,rocm,rc \
-x UCX_RNDV_SCHEME=put_zcopy \
-x UCX_RNDV_FRAG_MEM_TYPE=rocm \
-x UCX_MEMTYPE_CACHE=n \
-x UCX_LOG_LEVEL=fatal \
-x HSA_FORCE_FINE_GRAIN_PCIE=1 \
-x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
-np ${np} \
-H ${hosts_string} \
${rochpcg_runscript} ${rochpcg_args}
fi
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 1300bd7..83490ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,7 @@ tags
# build-in-source directory
build
+deps
# doc directory
docBin
diff --git a/install.sh b/install.sh
index e2c3a80..8922489 100755
--- a/install.sh
+++ b/install.sh
@@ -17,7 +17,7 @@ function display_help()
echo " [-g|--debug] -DCMAKE_BUILD_TYPE=Debug (default: Release)"
echo " [-t|--test] build single GPU test"
echo " [--with-rocm=<dir>] Path to ROCm install (default: /opt/rocm)"
- echo " [--with-mpi=<dir>] Path to external MPI install (Default: clone+build OpenMPI v4.1.0 in deps/)"
+ echo " [--with-mpi=<dir>] Path to external MPI install (Default: clone+build OpenMPI in deps/)"
echo " [--gpu-aware-mpi] MPI library supports GPU-aware communication (Default: false)"
echo " [--with-openmp] compile with OpenMP support (default: enabled)"
echo " [--with-memmgmt] compile with smart memory management (default: enabled)"
@@ -186,22 +186,76 @@ install_packages( )
# Clone and build OpenMPI+UCX in rochpcg/openmpi
install_openmpi( )
{
- if [ ! -d "./deps/ucx" ]; then
- mkdir -p deps && cd deps
- git clone --branch v1.13.1 https://github.com/openucx/ucx.git ucx
- cd ucx; ./autogen.sh; ./autogen.sh #why do we have to run this twice?
- mkdir build; cd build
- ../contrib/configure-opt --prefix=${PWD}/../ --with-rocm=${with_rocm} --without-knem --without-cuda --without-java
- make -j$(nproc); make install; cd ../../..
+ local install_dir=${PWD}/deps
+ local ucx_prefix=${install_dir}/ucx
+ local ompi_prefix=${install_dir}/openmpi
+
+ local ucx_lib_folder=${ucx_prefix}/lib
+ local ucx_lib64_folder=${ucx_prefix}/lib64
+ local ompi_lib_folder=${ompi_prefix}/lib
+ local ompi_lib64_folder=${ompi_prefix}/lib64
+
+ local ucx_version=1.20.0
+ local ucx_src=${install_dir}/ucx-${ucx_version}
+ local ucx_tarball=ucx-${ucx_version}.tar.gz
+ local ompi_version=5.0.9
+ local ompi_src=${install_dir}/openmpi-${ompi_version}
+ local ompi_tarball=openmpi-${ompi_version}.tar.gz
+
+ # Create the tpl directory
+ mkdir -p ${install_dir} && cd ${install_dir}
+
+ # Download UCX on demand
+ rm -rf ${ucx_src}
+ if [ ! -f "${ucx_tarball}" ]; then
+ wget https://github.com/openucx/ucx/releases/download/v${ucx_version}/${ucx_tarball}
+ fi
+ tar -zxf ${ucx_tarball}
+ # Download OpenMPI on demand
+ rm -rf ${ompi_src}
+ if [ ! -f "${ompi_tarball}" ]; then
+ wget https://download.open-mpi.org/release/open-mpi/v${ompi_version%.*}/${ompi_tarball}
+ fi
+ tar -zxf ${ompi_tarball}
+
+
+ # Build UCX on demand
+ if [ ! -f "${ucx_lib_folder}/libucm.so" ] && [ ! -f "${ucx_lib64_folder}/libucm.so" ]; then
+ cd ${ucx_src}
+ ./contrib/configure-release --prefix=${ucx_prefix} \
+ --enable-optimizations --enable-tuning \
+ --enable-cma --enable-mt \
+ --with-mlx5 --with-rc --with-ud --with-dc --with-dm --with-ib_hw_tm \
+ --with-verbs=/usr/include --with-rdmacm=/usr \
+ --with-rocm=${with_rocm} \
+ --without-knem --without-cuda --without-java
+ make -j$(nproc)
+ make install
fi
- if [ ! -d "./deps/openmpi" ]; then
- mkdir -p deps && cd deps
- git clone --branch v4.1.4 https://github.com/open-mpi/ompi.git openmpi
- cd openmpi; ./autogen.pl; mkdir build; cd build
- ../configure --prefix=${PWD}/../ --with-ucx=${PWD}/../../ucx --without-verbs
- make -j$(nproc); make install; cd ../../..
+ export LD_LIBRARY_PATH="${ucx_lib_folder}:${ucx_lib64_folder}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+ export LIBRARY_PATH="${ucx_lib_folder}:${ucx_lib64_folder}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
+ export CPATH="${ucx_prefix}/include${CPATH:+:${CPATH}}"
+
+ # Build OpenMPI on demand
+ if [ ! -f "${ompi_lib_folder}/libmpi.so" ] && [ ! -f "${ompi_lib64_folder}/libmpi.so" ]; then
+ cd ${ompi_src}
+ ./configure --prefix=${ompi_prefix} \
+ --with-ucx=${ucx_prefix} \
+ --with-rocm=${with_rocm} \
+ --enable-builtin-atomics \
+ --enable-wrapper-rpath \
+ --enable-mca-no-build=btl-uct
+ make -j$(nproc)
+ make install
fi
+
+ export LD_LIBRARY_PATH="${ompi_lib_folder}:${ompi_lib64_folder}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+ export LIBRARY_PATH="${ompi_lib_folder}:${ompi_lib64_folder}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
+ export CPATH="${ompi_prefix}/include${CPATH:+:${CPATH}}"
+ export OPAL_PREFIX=${ompi_prefix}
+
+ cd ${install_dir}/..
}
# #################################################
@@ -396,7 +450,7 @@ pushd .
fi
# Build library with AMD toolchain because of existense of device kernels
- ${cmake_executable} ${cmake_common_options} \
+ ${cmake_executable} --fresh ${cmake_common_options} \
-DCPACK_SET_DESTDIR=OFF \
-DCMAKE_INSTALL_PREFIX=${install_prefix} \
-DCPACK_PACKAGING_INSTALL_PREFIX=${with_rocm} \
diff --git a/mpirun_rochpcg b/mpirun_rochpcg
new file mode 100755
index 0000000..3dccc72
--- /dev/null
+++ b/mpirun_rochpcg
@@ -0,0 +1,193 @@
+#!/bin/bash
+set -e
+
+# =================================================
+# Helper functions
+# =================================================
+help() {
+ cat << EOF
+rocHPCG MPI run helper script
+Usage: $(basename "$0") [OPTIONS]
+
+OPTIONS:
+ -h, --help Show this help message and exit
+ --npx Number of processes in x dimension of process grid (default: ${npx})
+ --npy Number of processes in y dimension of process grid (default: ${npy})
+ --npz Number of processes in z dimension of process grid (default: ${npz})
+ --nx Problem size in x dimension (default: ${nx})
+ --ny Problem size in y dimension (default: ${ny})
+ --nz Problem size in z dimension (default: ${nz})
+ --rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
+ --tol Residual tolerance, skip reference verification if set (default: ${tol})
+ --pz Partition boundary in z process dimension (default: 0, uniform grid)
+ --zl Local nz value for processes with z rank < pz (default: equal to ${nz})
+ --zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
+
+ -H, --hosts Comma-separated list of nodes to run on
+ --tcp-iface TCP interface to use for communication (default: ${tcp_iface})
+ --ssh-port SSH port to use for remote connections (default: ${ssh_port})
+EOF
+}
+
+# =================================================
+# Global variables
+# =================================================
+npx=1
+npy=1
+npz=1
+nx=560
+ny=280
+nz=280
+runtime=60
+tol=1
+pz=0
+zl=${nz}
+zu=${nz}
+
+nodes=
+tcp_iface=p14p2
+ssh_port=3333
+
+rochpcg_runscript="${PWD}/run_rochpcg"
+mpi_bin="${PWD}/deps/openmpi/bin/mpirun"
+ompi_prefix="${PWD}/deps/openmpi"
+ompi_lib_dir="${PWD}/deps/openmpi/lib"
+ompi_lib64_dir="${PWD}/deps/openmpi/lib64"
+ucx_lib_dir="${PWD}/deps/ucx/lib"
+ucx_lib64_dir="${PWD}/deps/ucx/lib64"
+
+export PATH="${ompi_prefix}/bin${PATH:+:${PATH}}"
+export LD_LIBRARY_PATH="${ompi_lib_dir}:${ompi_lib64_dir}:${ucx_lib_dir}:${ucx_lib64_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+export OPAL_PREFIX="${ompi_prefix}"
+
+# Detect the number of GPUs per node
+ngpu_per_node=$(hy-smi --showid 2>/dev/null | grep -ic "Device ID")
+if [[ -z "${ngpu_per_node}" || "${ngpu_per_node}" -eq 0 ]]; then
+ echo "Failed to get the number of GPUs per node via hy-smi. Defaulting to 8."
+ ngpu_per_node=8
+else
+ echo "Detected ${ngpu_per_node} GPUs per node."
+fi
+
+# =================================================
+# Parameter parsing
+# =================================================
+GETOPT_PARSE=$(getopt --name "${0}" --options hH: --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu:,hosts:,tcp-iface:,ssh-port: -- "$@") \
+ || { echo "getopt invocation failed; could not parse the command line"; exit 1; }
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+ case "${1}" in
+ -h|--help) help; exit 0 ;;
+ --npx) npx=${2}; shift 2 ;;
+ --npy) npy=${2}; shift 2 ;;
+ --npz) npz=${2}; shift 2 ;;
+ --nx) nx=${2}; shift 2 ;;
+ --ny) ny=${2}; shift 2 ;;
+ --nz)
+ nz=${2}
+ zl=${nz}
+ zu=${nz}
+ shift 2 ;;
+ --rt) runtime=${2}; shift 2 ;;
+ --tol) tol=${2}; shift 2 ;;
+ --pz) pz=${2}; shift 2 ;;
+ --zl) zl=${2}; shift 2 ;;
+ --zu) zu=${2}; shift 2 ;;
+ -H|--hosts) nodes=${2}; shift 2 ;;
+ --tcp-iface) tcp_iface=${2}; shift 2 ;;
+ --ssh-port) ssh_port=${2}; shift 2 ;;
+ --) shift ; break ;;
+ *) echo "Unexpected command line parameter received; aborting";
+ exit 1
+ ;;
+ esac
+done
+
+# Build rochpcg arguments
+rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz}"
+rochpcg_args+=" --nx=${nx} --ny=${ny} --nz=${nz}"
+rochpcg_args+=" --rt=${runtime}"
+rochpcg_args+=" --tol=${tol}"
+rochpcg_args+=" --pz=${pz}"
+rochpcg_args+=" --zl=${zl}"
+rochpcg_args+=" --zu=${zu}"
+
+# Calculate total number of processes
+np=$((${npx}*${npy}*${npz}))
+
+# =================================================
+# Run rochpcg script
+# =================================================
+# Run single-node test if --hosts is not set
+if [ -z "${nodes}" ]; then
+ echo "No compute nodes specified. Running in single-node mode."
+
+ ${mpi_bin} --allow-run-as-root \
+ --bind-to none \
+ --mca pml ucx \
+ --mca osc ucx \
+ --mca btl ^vader,tcp,openib,uct \
+ --mca coll ^hcoll \
+ -x UCX_TLS=self,sm,rocm \
+ -x UCX_RNDV_SCHEME=put_zcopy \
+ -x UCX_MEMTYPE_CACHE=y \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+ -np ${np} \
+ ${rochpcg_runscript} ${rochpcg_args}
+else
+ echo "Running in multi-node mode. Using nodes: ${nodes}"
+ echo "Using TCP interface: ${tcp_iface}"
+ echo "Using SSH port: ${ssh_port}"
+
+ # Set rank counts for hosts
+ IFS=',' read -ra node_array <<< "${nodes}"
+ hosts_string=""
+ for node in "${node_array[@]}"; do
+ hosts_string+="${node}:${ngpu_per_node},"
+ done
+ hosts_string="${hosts_string%,}"
+
+ echo "MPI hosts: ${hosts_string}"
+
+ # Copy files to other nodes
+ current_node=$(hostname)
+ copyto_hosts=()
+ for node in "${node_array[@]}"; do
+ if [[ "${node}" != "${current_node}" ]]; then
+ copyto_hosts+=("${node}")
+ fi
+ done
+
+ # Copy files using rsync only if there are other nodes to copy to
+ if [ ${#copyto_hosts[@]} -gt 0 ]; then
+ echo "Copying files to other nodes in parallel: ${copyto_hosts[@]}"
+ for node in "${copyto_hosts[@]}"; do
+ rsync -az -e "ssh -p ${ssh_port}" build deps ${rochpcg_runscript} "${node}:/workspace/" &
+ done
+ wait
+ echo "Files synchronized successfully."
+ fi
+
+ # Multi-node run
+ ${mpi_bin} --allow-run-as-root \
+ --prefix ${ompi_prefix} \
+ --map-by ppr:${ngpu_per_node}:node --bind-to none \
+ --mca pml ucx \
+ --mca osc ucx \
+ --mca btl ^openib \
+ --mca btl_tcp_if_include ${tcp_iface} \
+ --mca plm_rsh_args "-p ${ssh_port}" \
+ --mca coll_hcoll_enable 0 \
+ -x UCX_TLS=self,sm,rocm,rc \
+ -x UCX_RNDV_SCHEME=put_zcopy \
+ -x UCX_RNDV_FRAG_MEM_TYPE=rocm \
+ -x UCX_MEMTYPE_CACHE=n \
+ -x UCX_LOG_LEVEL=fatal \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+ -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
+ -np ${np} \
+ -H ${hosts_string} \
+ ${rochpcg_runscript} ${rochpcg_args}
+fi
\ No newline at end of file
diff --git a/run_rochpcg b/run_rochpcg
new file mode 100755
index 0000000..0f806fe
--- /dev/null
+++ b/run_rochpcg
@@ -0,0 +1,195 @@
+#!/bin/bash
+
+# =================================================
+# Helper functions
+# =================================================
+help() {
+ cat << EOF
+rocHPCG helper script
+Usage: $(basename "$0") [OPTIONS]
+
+OPTIONS:
+ -h, --help Show this help message and exit
+ --npx Number of processes in x dimension of process grid (default: ${npx})
+ --npy Number of processes in y dimension of process grid (default: ${npy})
+ --npz Number of processes in z dimension of process grid (default: ${npz})
+ --nx Problem size in x dimension (default: ${nx})
+ --ny Problem size in y dimension (default: ${ny})
+ --nz Problem size in z dimension (default: ${nz})
+ --rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
+ --tol Residual tolerance, skip reference verification if set (default: ${tol})
+ --pz Partition boundary in z process dimension (default: 0, uniform grid)
+ --zl Local nz value for processes with z rank < pz (default: equal to ${nz})
+ --zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
+EOF
+}
+
+# =================================================
+# Global variables
+# =================================================
+npx=1
+npy=1
+npz=1
+nx=560
+ny=280
+nz=280
+runtime=60
+tol=1
+pz=0
+zl=${nz}
+zu=${nz}
+
+rochpcg_bin="${PWD}/build/release/rochpcg-install/bin/rochpcg"
+
+# =================================================
+# Parameter parsing
+# =================================================
+GETOPT_PARSE=$(getopt --name "${0}" --options h --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu: -- "$@") \
+ || { echo "getopt invocation failed; could not parse the command line"; exit 1; }
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+ case "${1}" in
+ -h|--help) help; exit 0 ;;
+ --npx) npx=${2}; shift 2 ;;
+ --npy) npy=${2}; shift 2 ;;
+ --npz) npz=${2}; shift 2 ;;
+ --nx) nx=${2}; shift 2 ;;
+ --ny) ny=${2}; shift 2 ;;
+ --nz)
+ nz=${2}
+ zl=${nz}
+ zu=${nz}
+ shift 2 ;;
+ --rt) runtime=${2}; shift 2 ;;
+ --tol) tol=${2}; shift 2 ;;
+ --pz) pz=${2}; shift 2 ;;
+ --zl) zl=${2}; shift 2 ;;
+ --zu) zu=${2}; shift 2 ;;
+ --) shift ; break ;;
+ *) echo "Unexpected command line parameter received; aborting";
+ exit 1
+ ;;
+ esac
+done
+
+# Build rochpcg arguments
+rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz}"
+rochpcg_args+=" --nx=${nx} --ny=${ny} --nz=${nz}"
+rochpcg_args+=" --rt=${runtime}"
+rochpcg_args+=" --tol=${tol}"
+rochpcg_args+=" --pz=${pz}"
+rochpcg_args+=" --zl=${zl}"
+rochpcg_args+=" --zu=${zu}"
+
+# =================================================
+# Affinity setup
+# =================================================
+globalRank=$OMPI_COMM_WORLD_RANK
+globalSize=$OMPI_COMM_WORLD_SIZE
+rank=$OMPI_COMM_WORLD_LOCAL_RANK
+size=$OMPI_COMM_WORLD_LOCAL_SIZE
+
+#construct a list of all cpus, sorted by core
+cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
+
+#construct list of devices and their numa affinities
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+
+#count the cpus per core
+threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
+
+#remove the extra cpus on each core to make a list of just physical cores, then sort by numa domain
+corelist=$(echo "$cpulist" | awk -v tpc=${threads_per_core} '(NR-1)%tpc==0' | sort -k 3 -g -s)
+
+#count numa domains
+line=($(echo "$cpulist" | tail -n 1))
+n_numa=$((line[2]+1))
+
+numa_core_counts=()
+numa_proc_counts=()
+for i in $(seq 1 ${n_numa}); do numa_core_counts+=(0); numa_proc_counts+=(0); done
+
+#parse the list of cpus to array and count cpus in each numa
+cpus=()
+while read -a line; do
+ cpus+=(${line[0]})
+ ((numa_core_counts[${line[2]}]++)) || true
+done <<< "${corelist}"
+
+numa_core_offsets=(0)
+for i in $(seq 1 $((n_numa-1))); do numa_core_offsets+=($((numa_core_offsets[$((i-1))] + numa_core_counts[$i]))); done
+
+#parse device to numa mapping
+device_to_numa=()
+while read -a line; do
+ device_to_numa+=(${line[1]})
+done <<< "${devicelist}"
+
+rank_to_device=()
+n_devices=$(echo "${devicelist}" | grep -c "card")
+for i in $(seq 0 $((size-1))); do
+ rank_to_device+=($((i%n_devices)))
+done
+
+mygpu=${rank_to_device[rank]}
+mynuma=${device_to_numa[mygpu]}
+
+rank_to_numa=()
+for i in $(seq 0 $((size-1))); do
+ rank_to_numa+=(${device_to_numa[${rank_to_device[$((i%n_devices))]}]})
+done
+
+for i in $(seq 0 $((size-1))); do
+ numa=${rank_to_numa[$i]}
+ ((numa_proc_counts[numa]++)) || true
+done
+
+omp_num_threads=$((numa_core_counts[mynuma]/numa_proc_counts[mynuma]))
+
+core_offset=${numa_core_offsets[mynuma]}
+for i in $(seq 0 $((rank-1))); do
+ numa=${rank_to_numa[$i]}
+ if [[ $numa -eq $mynuma ]]; then
+ core_offset=$((core_offset + omp_num_threads))
+ fi
+done
+
+omp_places="{${cpus[core_offset]}}"
+for c in $(seq 1 $((omp_num_threads-1))); do
+ omp_places+=",{${cpus[core_offset+c]}}"
+done
+
+if [[ $omp_num_threads -gt 1 ]]; then
+ places="{${cpus[core_offset]}-${cpus[core_offset+$((omp_num_threads-1))]}}"
+else
+ places="{${cpus[core_offset]}}"
+fi
+
+# Export OpenMP config
+export OMP_NUM_THREADS=${omp_num_threads}
+export OMP_PLACES=${omp_places}
+export OMP_PROC_BIND=true
+
+# Hard-coded IB mapping for now
+declare -a IB_MAP=(
+ "mlx5_1:1" # GPU 0 -> NUMA 3
+ "mlx5_2:1" # GPU 1 -> NUMA 3
+ "mlx5_3:1" # GPU 2 -> NUMA 0
+ "mlx5_4:1" # GPU 3 -> NUMA 0
+ "mlx5_7:1" # GPU 4 -> NUMA 7
+ "mlx5_8:1" # GPU 5 -> NUMA 7
+ "mlx5_9:1" # GPU 6 -> NUMA 4
+ "mlx5_10:1" # GPU 7 -> NUMA 4
+)
+
+myib=${IB_MAP[$rank]}
+export UCX_NET_DEVICES=${myib}
+
+if [[ $globalRank -lt $size ]]; then
+ echo "Node Binding: Process $rank [(nx,ny,nz)=(${nx},${ny},${nz})] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
+fi
+
+# Run
+numactl -N ${mynuma} -m ${mynuma} ${rochpcg_bin} ${rochpcg_args}
\ No newline at end of file
#!/bin/bash
set -e
# Example invocations of the mpirun_rochpcg helper for various node/GPU
# counts. Each example assumes 8 GPUs per node, so the process grid must
# satisfy npx * npy * npz == total GPUs used.
nx=560
ny=280
nz=280
rt=60
# 4 nodes, 32 GPUs (4*4*2 = 32 ranks)
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 4 --npy 4 --npz 2 \
-H node01,node02,node03,node04 \
--tcp-iface p14p2 \
--ssh-port 3333
# 2 nodes, 16 GPUs (4*2*2 = 16 ranks)
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 4 --npy 2 --npz 2 \
-H node01,node02 \
--tcp-iface p14p2 \
--ssh-port 3333
# 1 node, 8 GPUs (2*2*2 = 8 ranks)
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 2 --npy 2 --npz 2
# 1 node, 4 GPUs (2*2*1 = 4 ranks)
# BUG FIX: this example previously used --npz 2, which launches 8 ranks
# and duplicated the 8-GPU example above.
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 2 --npy 2 --npz 1
# 1 node, 2 GPUs (2*1*1 = 2 ranks)
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 2 --npy 1 --npz 1
# 1 node, 1 GPU
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 1 --npy 1 --npz 1
#!/bin/bash
# =================================================
# Helper functions
# =================================================
# Print the usage text for the per-rank launcher, including the current
# default values of all tunables (expanded from the globals at call time).
help() {
cat << USAGE
rocHPCG helper script
Usage: ${0##*/} [OPTIONS]
OPTIONS:
-h, --help Show this help message and exit
--npx Number of processes in x dimension of process grid (default: ${npx})
--npy Number of processes in y dimension of process grid (default: ${npy})
--npz Number of processes in z dimension of process grid (default: ${npz})
--nx Problem size in x dimension (default: ${nx})
--ny Problem size in y dimension (default: ${ny})
--nz Problem size in z dimension (default: ${nz})
--rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
--tol Residual tolerance, skip reference verification if set (default: ${tol})
--pz Partition boundary in z process dimension (default: 0, uniform grid)
--zl Local nz value for processes with z rank < pz (default: equal to ${nz})
--zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
USAGE
}
# =================================================
# Global variables
# =================================================
# Process-grid and problem-size defaults; every one of these can be
# overridden from the command line.
npx=1; npy=1; npz=1
nx=560; ny=280; nz=280
runtime=60; tol=1; pz=0
# Local z-sizes track the global nz until --nz/--zl/--zu override them.
zl=${nz}; zu=${nz}
# rocHPCG binary as installed by ./install.sh into the release build tree.
rochpcg_bin="${PWD}/build/release/rochpcg-install/bin/rochpcg"
# =================================================
# Parameter parsing
# =================================================
# Normalize the command line with util-linux getopt so long options work;
# getopt exits non-zero on an unknown option, in which case we abort.
# NOTE(review): requires GNU/util-linux getopt — the BSD/macOS getopt does
# not support --longoptions.
GETOPT_PARSE=$(getopt --name "${0}" --options h --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu: -- "$@") \
|| { echo "getopt invocation failed; could not parse the command line"; exit 1; }
# Replace the positional parameters with the normalized option list.
eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
-h|--help) help; exit 0 ;;
--npx) npx=${2}; shift 2 ;;
--npy) npy=${2}; shift 2 ;;
--npz) npz=${2}; shift 2 ;;
--nx) nx=${2}; shift 2 ;;
--ny) ny=${2}; shift 2 ;;
# --nz also resets zl/zu, which default to nz; later --zl/--zu options
# can still override them because getopt preserves argument order.
--nz)
nz=${2}
zl=${nz}
zu=${nz}
shift 2 ;;
--rt) runtime=${2}; shift 2 ;;
--tol) tol=${2}; shift 2 ;;
--pz) pz=${2}; shift 2 ;;
--zl) zl=${2}; shift 2 ;;
--zu) zu=${2}; shift 2 ;;
# "--" marks the end of the normalized options.
--) shift ; break ;;
# getopt already rejected unknown options, so reaching here means the
# normalized list is malformed; fail defensively.
*) echo "Unexpected command line parameter received; aborting";
exit 1
;;
esac
done
# Assemble the argument string handed to the rocHPCG binary. Kept as one
# flat, space-separated string because it is expanded unquoted
# (intentional word splitting) at the invocation site.
rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz} --nx=${nx} --ny=${ny} --nz=${nz} --rt=${runtime} --tol=${tol} --pz=${pz} --zl=${zl} --zu=${zu}"
# =================================================
# Affinity setup
# =================================================
# Derive per-rank CPU-core, NUMA-domain, and GPU assignments from the
# OpenMPI rank environment plus lscpu/hy-smi topology output.
globalRank=$OMPI_COMM_WORLD_RANK
globalSize=$OMPI_COMM_WORLD_SIZE
rank=$OMPI_COMM_WORLD_LOCAL_RANK
size=$OMPI_COMM_WORLD_LOCAL_SIZE
# Construct a list of all cpus (CPU, CORE, NODE columns), sorted by core.
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
# Device -> NUMA affinity table.
# NOTE(review): assumes `hy-smi --csv --showtoponuma` emits one header line
# followed by "card...,numa" rows — confirm the format on target hardware.
devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
# Count the cpus (hardware threads) sharing core 0 to get threads/core.
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
# Keep one cpu per physical core, then sort by NUMA domain.
corelist=$(echo "$cpulist" | awk -v tpc=${threads_per_core} '(NR-1)%tpc==0' | sort -k 3 -g -s)
# Count NUMA domains from the highest NODE id in the (node-sorted) list.
line=($(echo "$cpulist" | tail -n 1))
n_numa=$((line[2]+1))
numa_core_counts=()
numa_proc_counts=()
for i in $(seq 1 ${n_numa}); do numa_core_counts+=(0); numa_proc_counts+=(0); done
# Parse the physical-core list into an array and count cores per NUMA.
cpus=()
while read -a line; do
cpus+=(${line[0]})
((numa_core_counts[${line[2]}]++)) || true
done <<< "${corelist}"
# Starting index (into cpus[]) of each NUMA domain's cores.
# NOTE(review): this prefix sum adds counts[i] rather than counts[i-1];
# it only yields correct offsets when all domains have equal core counts —
# verify on machines with asymmetric NUMA domains.
numa_core_offsets=(0)
for i in $(seq 1 $((n_numa-1))); do numa_core_offsets+=($((numa_core_offsets[$((i-1))] + numa_core_counts[$i]))); done
# Parse the device -> NUMA mapping (second column of devicelist).
device_to_numa=()
while read -a line; do
device_to_numa+=(${line[1]})
done <<< "${devicelist}"
# Round-robin local ranks onto the detected devices.
# NOTE(review): if hy-smi found no "card" rows, n_devices is 0 and the
# modulo below divides by zero — there is no guard for that case.
rank_to_device=()
n_devices=$(echo "${devicelist}" | grep -c "card")
for i in $(seq 0 $((size-1))); do
rank_to_device+=($((i%n_devices)))
done
mygpu=${rank_to_device[rank]}
mynuma=${device_to_numa[mygpu]}
# NUMA domain of every local rank (via its assigned device).
rank_to_numa=()
for i in $(seq 0 $((size-1))); do
rank_to_numa+=(${device_to_numa[${rank_to_device[$((i%n_devices))]}]})
done
# How many local ranks land on each NUMA domain.
for i in $(seq 0 $((size-1))); do
numa=${rank_to_numa[$i]}
((numa_proc_counts[numa]++)) || true
done
# Evenly split this domain's cores among the ranks sharing it.
omp_num_threads=$((numa_core_counts[mynuma]/numa_proc_counts[mynuma]))
# Skip past the core slices of lower-numbered ranks on the same domain.
core_offset=${numa_core_offsets[mynuma]}
for i in $(seq 0 $((rank-1))); do
numa=${rank_to_numa[$i]}
if [[ $numa -eq $mynuma ]]; then
core_offset=$((core_offset + omp_num_threads))
fi
done
# Explicit OMP_PLACES list, one place per assigned cpu.
omp_places="{${cpus[core_offset]}}"
for c in $(seq 1 $((omp_num_threads-1))); do
omp_places+=",{${cpus[core_offset+c]}}"
done
# Human-readable core-range string used only for the log line below.
if [[ $omp_num_threads -gt 1 ]]; then
places="{${cpus[core_offset]}-${cpus[core_offset+$((omp_num_threads-1))]}}"
else
places="{${cpus[core_offset]}}"
fi
# Export OpenMP config
# Pin this rank's OpenMP threads to the cores selected above.
export OMP_NUM_THREADS=${omp_num_threads}
export OMP_PLACES=${omp_places}
export OMP_PROC_BIND=true
# Hard-coded IB mapping for now
# NOTE(review): this GPU -> HCA table is specific to one 8-GPU node layout;
# verify against the actual NIC topology before running on new hardware.
declare -a IB_MAP=(
"mlx5_1:1" # GPU 0 -> NUMA 3
"mlx5_2:1" # GPU 1 -> NUMA 3
"mlx5_3:1" # GPU 2 -> NUMA 0
"mlx5_4:1" # GPU 3 -> NUMA 0
"mlx5_7:1" # GPU 4 -> NUMA 7
"mlx5_8:1" # GPU 5 -> NUMA 7
"mlx5_9:1" # GPU 6 -> NUMA 4
"mlx5_10:1" # GPU 7 -> NUMA 4
)
# Select the HCA by local rank (assumes at most 8 ranks per node) and pin
# UCX to it.
myib=${IB_MAP[$rank]}
export UCX_NET_DEVICES=${myib}
# Print the binding summary only from ranks on the first node
# (globalRank < local size holds only there).
if [[ $globalRank -lt $size ]]; then
echo "Node Binding: Process $rank [(nx,ny,nz)=(${nx},${ny},${nz})] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
fi
# Run
# Bind CPU execution and memory allocation to this rank's NUMA domain.
numactl -N ${mynuma} -m ${mynuma} ${rochpcg_bin} ${rochpcg_args}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment