Commit 11052102 authored by one's avatar one
Browse files

[rocHPCG] Add patches and scripts

parent 03a29671
## rocHPCG Patch Files
- `rochpcg-install.patch`
- `rochpcg-scripts-bw.patch`
## Examples
```bash
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 4 --npy 4 --npz 2 \
-H node01,node02,node03,node04 \
--tcp-iface p14p2 \
--ssh-port 3333
```
\ No newline at end of file
#!/bin/bash
set -e
# =================================================
# Helper functions
# =================================================
# Print the usage text for this wrapper, including the current default
# values of all tunables (expanded from the globals at call time).
help() {
cat << USAGE
rocHPCG MPI run helper script
Usage: ${0##*/} [OPTIONS]
OPTIONS:
-h, --help Show this help message and exit
--npx Number of processes in x dimension of process grid (default: ${npx})
--npy Number of processes in y dimension of process grid (default: ${npy})
--npz Number of processes in z dimension of process grid (default: ${npz})
--nx Problem size in x dimension (default: ${nx})
--ny Problem size in y dimension (default: ${ny})
--nz Problem size in z dimension (default: ${nz})
--rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
--tol Residual tolerance, skip reference verification if set (default: ${tol})
--pz Partition boundary in z process dimension (default: 0, uniform grid)
--zl Local nz value for processes with z rank < pz (default: equal to ${nz})
--zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
-H, --hosts Comma-separated list of nodes to run on
--tcp-iface TCP interface to use for communication (default: ${tcp_iface})
--ssh-port SSH port to use for remote connections (default: ${ssh_port})
USAGE
}
# =================================================
# Global variables
# =================================================
# Process-grid and problem-size defaults; every one of these can be
# overridden from the command line.
npx=1; npy=1; npz=1
nx=560; ny=280; nz=280
runtime=60; tol=1; pz=0
# Local z-sizes track the global nz until --nz/--zl/--zu override them.
zl=${nz}; zu=${nz}
# Multi-node settings; an empty 'nodes' selects single-node mode.
nodes=
tcp_iface=p14p2
ssh_port=3333
# Per-rank launcher plus the bundled OpenMPI/UCX installs under ./deps.
deps_dir="${PWD}/deps"
rochpcg_runscript="${PWD}/run_rochpcg"
ompi_prefix="${deps_dir}/openmpi"
mpi_bin="${ompi_prefix}/bin/mpirun"
ompi_lib_dir="${ompi_prefix}/lib"
ompi_lib64_dir="${ompi_prefix}/lib64"
ucx_lib_dir="${deps_dir}/ucx/lib"
ucx_lib64_dir="${deps_dir}/ucx/lib64"
# Put the bundled MPI first on PATH and its libraries first on the loader
# path; OPAL_PREFIX points Open MPI at its relocated install tree.
export PATH="${ompi_prefix}/bin${PATH:+:${PATH}}"
export LD_LIBRARY_PATH="${ompi_lib_dir}:${ompi_lib64_dir}:${ucx_lib_dir}:${ucx_lib64_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export OPAL_PREFIX="${ompi_prefix}"
# Detect the number of GPUs per node.
# BUG FIX: `grep -c` exits non-zero when the count is 0 (or when hy-smi is
# missing and the pipe is empty), which made the assignment itself fail and
# killed the whole script under `set -e` before the fallback below could
# run. `|| true` neutralizes that status so the 0/empty case falls through
# to the default of 8.
ngpu_per_node=$(hy-smi --showid 2>/dev/null | grep -ic "Device ID") || true
if [[ -z "${ngpu_per_node}" || "${ngpu_per_node}" -eq 0 ]]; then
echo "Failed to get the number of GPUs per node via hy-smi. Defaulting to 8."
ngpu_per_node=8
else
echo "Detected ${ngpu_per_node} GPUs per node."
fi
# =================================================
# Parameter parsing
# =================================================
# Normalize the command line with util-linux getopt so long options work;
# getopt exits non-zero on an unknown option, in which case we abort.
# NOTE(review): requires GNU/util-linux getopt — the BSD/macOS getopt does
# not support --longoptions.
GETOPT_PARSE=$(getopt --name "${0}" --options hH: --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu:,hosts:,tcp-iface:,ssh-port: -- "$@") \
|| { echo "getopt invocation failed; could not parse the command line"; exit 1; }
# Replace the positional parameters with the normalized option list.
eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
-h|--help) help; exit 0 ;;
--npx) npx=${2}; shift 2 ;;
--npy) npy=${2}; shift 2 ;;
--npz) npz=${2}; shift 2 ;;
--nx) nx=${2}; shift 2 ;;
--ny) ny=${2}; shift 2 ;;
# --nz also resets zl/zu, which default to nz; later --zl/--zu options
# can still override them because getopt preserves argument order.
--nz)
nz=${2}
zl=${nz}
zu=${nz}
shift 2 ;;
--rt) runtime=${2}; shift 2 ;;
--tol) tol=${2}; shift 2 ;;
--pz) pz=${2}; shift 2 ;;
--zl) zl=${2}; shift 2 ;;
--zu) zu=${2}; shift 2 ;;
-H|--hosts) nodes=${2}; shift 2 ;;
--tcp-iface) tcp_iface=${2}; shift 2 ;;
--ssh-port) ssh_port=${2}; shift 2 ;;
# "--" marks the end of the normalized options.
--) shift ; break ;;
# getopt already rejected unknown options, so reaching here means the
# normalized list is malformed; fail defensively.
*) echo "Unexpected command line parameter received; aborting";
exit 1
;;
esac
done
# Assemble the argument string forwarded to the per-rank run script.
# Kept as one flat, space-separated string because it is expanded unquoted
# (intentional word splitting) at the launch site.
rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz} --nx=${nx} --ny=${ny} --nz=${nz} --rt=${runtime} --tol=${tol} --pz=${pz} --zl=${zl} --zu=${zu}"
# Total MPI ranks = product of the process-grid dimensions.
np=$(( ${npx} * ${npy} * ${npz} ))
# =================================================
# Run rochpcg script
# =================================================
# Run single-node test if --hosts is not set
if [ -z "${nodes}" ]; then
echo "No compute nodes specified. Running in single-node mode."
# Single-node launch: UCX for pml/osc, shared-memory + ROCm transports
# only, hcoll collectives disabled. HSA_FORCE_FINE_GRAIN_PCIE presumably
# enables fine-grained GPU memory for MPI — TODO confirm on target ROCm.
${mpi_bin} --allow-run-as-root \
--bind-to none \
--mca pml ucx \
--mca osc ucx \
--mca btl ^vader,tcp,openib,uct \
--mca coll ^hcoll \
-x UCX_TLS=self,sm,rocm \
-x UCX_RNDV_SCHEME=put_zcopy \
-x UCX_MEMTYPE_CACHE=y \
-x HSA_FORCE_FINE_GRAIN_PCIE=1 \
-np ${np} \
${rochpcg_runscript} ${rochpcg_args}
else
echo "Running in multi-node mode. Using nodes: ${nodes}"
echo "Using TCP interface: ${tcp_iface}"
echo "Using SSH port: ${ssh_port}"
# Set rank counts for hosts
# Build "host:slots" entries, assuming every node exposes ngpu_per_node GPUs.
IFS=',' read -ra node_array <<< "${nodes}"
hosts_string=""
for node in "${node_array[@]}"; do
hosts_string+="${node}:${ngpu_per_node},"
done
hosts_string="${hosts_string%,}"
echo "MPI hosts: ${hosts_string}"
# Copy files to other nodes
# Every listed node except the local one needs the build artifacts.
current_node=$(hostname)
copyto_hosts=()
for node in "${node_array[@]}"; do
if [[ "${node}" != "${current_node}" ]]; then
copyto_hosts+=("${node}")
fi
done
# Copy files using rsync only if there are other nodes to copy to
# NOTE(review): the destination /workspace/ is hard-coded and assumed to be
# the working directory on every remote node — confirm before reuse.
if [ ${#copyto_hosts[@]} -gt 0 ]; then
echo "Copying files to other nodes in parallel: ${copyto_hosts[@]}"
for node in "${copyto_hosts[@]}"; do
rsync -az -e "ssh -p ${ssh_port}" build deps ${rochpcg_runscript} "${node}:/workspace/" &
done
wait
echo "Files synchronized successfully."
fi
# Multi-node run
# Differences vs single-node: adds the rc (RDMA) UCX transport, pins the
# TCP control channel to ${tcp_iface}, passes the custom SSH port to the
# rsh launcher, and exports PATH/LD_LIBRARY_PATH/OPAL_PREFIX so remote
# ranks find the bundled OpenMPI.
${mpi_bin} --allow-run-as-root \
--prefix ${ompi_prefix} \
--map-by ppr:${ngpu_per_node}:node --bind-to none \
--mca pml ucx \
--mca osc ucx \
--mca btl ^openib \
--mca btl_tcp_if_include ${tcp_iface} \
--mca plm_rsh_args "-p ${ssh_port}" \
--mca coll_hcoll_enable 0 \
-x UCX_TLS=self,sm,rocm,rc \
-x UCX_RNDV_SCHEME=put_zcopy \
-x UCX_RNDV_FRAG_MEM_TYPE=rocm \
-x UCX_MEMTYPE_CACHE=n \
-x UCX_LOG_LEVEL=fatal \
-x HSA_FORCE_FINE_GRAIN_PCIE=1 \
-x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
-np ${np} \
-H ${hosts_string} \
${rochpcg_runscript} ${rochpcg_args}
fi
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 1300bd7..83490ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,7 @@ tags
# build-in-source directory
build
+deps
# doc directory
docBin
diff --git a/install.sh b/install.sh
index e2c3a80..8922489 100755
--- a/install.sh
+++ b/install.sh
@@ -17,7 +17,7 @@ function display_help()
echo " [-g|--debug] -DCMAKE_BUILD_TYPE=Debug (default: Release)"
echo " [-t|--test] build single GPU test"
echo " [--with-rocm=<dir>] Path to ROCm install (default: /opt/rocm)"
- echo " [--with-mpi=<dir>] Path to external MPI install (Default: clone+build OpenMPI v4.1.0 in deps/)"
+ echo " [--with-mpi=<dir>] Path to external MPI install (Default: clone+build OpenMPI in deps/)"
echo " [--gpu-aware-mpi] MPI library supports GPU-aware communication (Default: false)"
echo " [--with-openmp] compile with OpenMP support (default: enabled)"
echo " [--with-memmgmt] compile with smart memory management (default: enabled)"
@@ -186,22 +186,76 @@ install_packages( )
# Clone and build OpenMPI+UCX in rochpcg/openmpi
install_openmpi( )
{
- if [ ! -d "./deps/ucx" ]; then
- mkdir -p deps && cd deps
- git clone --branch v1.13.1 https://github.com/openucx/ucx.git ucx
- cd ucx; ./autogen.sh; ./autogen.sh #why do we have to run this twice?
- mkdir build; cd build
- ../contrib/configure-opt --prefix=${PWD}/../ --with-rocm=${with_rocm} --without-knem --without-cuda --without-java
- make -j$(nproc); make install; cd ../../..
+ local install_dir=${PWD}/deps
+ local ucx_prefix=${install_dir}/ucx
+ local ompi_prefix=${install_dir}/openmpi
+
+ local ucx_lib_folder=${ucx_prefix}/lib
+ local ucx_lib64_folder=${ucx_prefix}/lib64
+ local ompi_lib_folder=${ompi_prefix}/lib
+ local ompi_lib64_folder=${ompi_prefix}/lib64
+
+ local ucx_version=1.20.0
+ local ucx_src=${install_dir}/ucx-${ucx_version}
+ local ucx_tarball=ucx-${ucx_version}.tar.gz
+ local ompi_version=5.0.9
+ local ompi_src=${install_dir}/openmpi-${ompi_version}
+ local ompi_tarball=openmpi-${ompi_version}.tar.gz
+
+ # Create the tpl directory
+ mkdir -p ${install_dir} && cd ${install_dir}
+
+ # Download UCX on demand
+ rm -rf ${ucx_src}
+ if [ ! -f "${ucx_tarball}" ]; then
+ wget https://github.com/openucx/ucx/releases/download/v${ucx_version}/${ucx_tarball}
+ fi
+ tar -zxf ${ucx_tarball}
+ # Download OpenMPI on demand
+ rm -rf ${ompi_src}
+ if [ ! -f "${ompi_tarball}" ]; then
+ wget https://download.open-mpi.org/release/open-mpi/v${ompi_version%.*}/${ompi_tarball}
+ fi
+ tar -zxf ${ompi_tarball}
+
+
+ # Build UCX on demand
+ if [ ! -f "${ucx_lib_folder}/libucm.so" ] && [ ! -f "${ucx_lib64_folder}/libucm.so" ]; then
+ cd ${ucx_src}
+ ./contrib/configure-release --prefix=${ucx_prefix} \
+ --enable-optimizations --enable-tuning \
+ --enable-cma --enable-mt \
+ --with-mlx5 --with-rc --with-ud --with-dc --with-dm --with-ib_hw_tm \
+ --with-verbs=/usr/include --with-rdmacm=/usr \
+ --with-rocm=${with_rocm} \
+ --without-knem --without-cuda --without-java
+ make -j$(nproc)
+ make install
fi
- if [ ! -d "./deps/openmpi" ]; then
- mkdir -p deps && cd deps
- git clone --branch v4.1.4 https://github.com/open-mpi/ompi.git openmpi
- cd openmpi; ./autogen.pl; mkdir build; cd build
- ../configure --prefix=${PWD}/../ --with-ucx=${PWD}/../../ucx --without-verbs
- make -j$(nproc); make install; cd ../../..
+ export LD_LIBRARY_PATH="${ucx_lib_folder}:${ucx_lib64_folder}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+ export LIBRARY_PATH="${ucx_lib_folder}:${ucx_lib64_folder}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
+ export CPATH="${ucx_prefix}/include${CPATH:+:${CPATH}}"
+
+ # Build OpenMPI on demand
+ if [ ! -f "${ompi_lib_folder}/libmpi.so" ] && [ ! -f "${ompi_lib64_folder}/libmpi.so" ]; then
+ cd ${ompi_src}
+ ./configure --prefix=${ompi_prefix} \
+ --with-ucx=${ucx_prefix} \
+ --with-rocm=${with_rocm} \
+ --enable-builtin-atomics \
+ --enable-wrapper-rpath \
+ --enable-mca-no-build=btl-uct
+ make -j$(nproc)
+ make install
fi
+
+ export LD_LIBRARY_PATH="${ompi_lib_folder}:${ompi_lib64_folder}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+ export LIBRARY_PATH="${ompi_lib_folder}:${ompi_lib64_folder}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
+ export CPATH="${ompi_prefix}/include${CPATH:+:${CPATH}}"
+ export OPAL_PREFIX=${ompi_prefix}
+
+ cd ${install_dir}/..
}
# #################################################
@@ -396,7 +450,7 @@ pushd .
fi
# Build library with AMD toolchain because of existense of device kernels
- ${cmake_executable} ${cmake_common_options} \
+ ${cmake_executable} --fresh ${cmake_common_options} \
-DCPACK_SET_DESTDIR=OFF \
-DCMAKE_INSTALL_PREFIX=${install_prefix} \
-DCPACK_PACKAGING_INSTALL_PREFIX=${with_rocm} \
diff --git a/mpirun_rochpcg b/mpirun_rochpcg
new file mode 100755
index 0000000..3dccc72
--- /dev/null
+++ b/mpirun_rochpcg
@@ -0,0 +1,193 @@
+#!/bin/bash
+set -e
+
+# =================================================
+# Helper functions
+# =================================================
+help() {
+ cat << EOF
+rocHPCG MPI run helper script
+Usage: $(basename "$0") [OPTIONS]
+
+OPTIONS:
+ -h, --help Show this help message and exit
+ --npx Number of processes in x dimension of process grid (default: ${npx})
+ --npy Number of processes in y dimension of process grid (default: ${npy})
+ --npz Number of processes in z dimension of process grid (default: ${npz})
+ --nx Problem size in x dimension (default: ${nx})
+ --ny Problem size in y dimension (default: ${ny})
+ --nz Problem size in z dimension (default: ${nz})
+ --rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
+ --tol Residual tolerance, skip reference verification if set (default: ${tol})
+ --pz Partition boundary in z process dimension (default: 0, uniform grid)
+ --zl Local nz value for processes with z rank < pz (default: equal to ${nz})
+ --zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
+
+ -H, --hosts Comma-separated list of nodes to run on
+ --tcp-iface TCP interface to use for communication (default: ${tcp_iface})
+ --ssh-port SSH port to use for remote connections (default: ${ssh_port})
+EOF
+}
+
+# =================================================
+# Global variables
+# =================================================
+npx=1
+npy=1
+npz=1
+nx=560
+ny=280
+nz=280
+runtime=60
+tol=1
+pz=0
+zl=${nz}
+zu=${nz}
+
+nodes=
+tcp_iface=p14p2
+ssh_port=3333
+
+rochpcg_runscript="${PWD}/run_rochpcg"
+mpi_bin="${PWD}/deps/openmpi/bin/mpirun"
+ompi_prefix="${PWD}/deps/openmpi"
+ompi_lib_dir="${PWD}/deps/openmpi/lib"
+ompi_lib64_dir="${PWD}/deps/openmpi/lib64"
+ucx_lib_dir="${PWD}/deps/ucx/lib"
+ucx_lib64_dir="${PWD}/deps/ucx/lib64"
+
+export PATH="${ompi_prefix}/bin${PATH:+:${PATH}}"
+export LD_LIBRARY_PATH="${ompi_lib_dir}:${ompi_lib64_dir}:${ucx_lib_dir}:${ucx_lib64_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+export OPAL_PREFIX="${ompi_prefix}"
+
+# Detect the number of GPUs per node
+ngpu_per_node=$(hy-smi --showid 2>/dev/null | grep -ic "Device ID")
+if [[ -z "${ngpu_per_node}" || "${ngpu_per_node}" -eq 0 ]]; then
+ echo "Failed to get the number of GPUs per node via hy-smi. Defaulting to 8."
+ ngpu_per_node=8
+else
+ echo "Detected ${ngpu_per_node} GPUs per node."
+fi
+
+# =================================================
+# Parameter parsing
+# =================================================
+GETOPT_PARSE=$(getopt --name "${0}" --options hH: --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu:,hosts:,tcp-iface:,ssh-port: -- "$@") \
+ || { echo "getopt invocation failed; could not parse the command line"; exit 1; }
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+ case "${1}" in
+ -h|--help) help; exit 0 ;;
+ --npx) npx=${2}; shift 2 ;;
+ --npy) npy=${2}; shift 2 ;;
+ --npz) npz=${2}; shift 2 ;;
+ --nx) nx=${2}; shift 2 ;;
+ --ny) ny=${2}; shift 2 ;;
+ --nz)
+ nz=${2}
+ zl=${nz}
+ zu=${nz}
+ shift 2 ;;
+ --rt) runtime=${2}; shift 2 ;;
+ --tol) tol=${2}; shift 2 ;;
+ --pz) pz=${2}; shift 2 ;;
+ --zl) zl=${2}; shift 2 ;;
+ --zu) zu=${2}; shift 2 ;;
+ -H|--hosts) nodes=${2}; shift 2 ;;
+ --tcp-iface) tcp_iface=${2}; shift 2 ;;
+ --ssh-port) ssh_port=${2}; shift 2 ;;
+ --) shift ; break ;;
+ *) echo "Unexpected command line parameter received; aborting";
+ exit 1
+ ;;
+ esac
+done
+
+# Build rochpcg arguments
+rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz}"
+rochpcg_args+=" --nx=${nx} --ny=${ny} --nz=${nz}"
+rochpcg_args+=" --rt=${runtime}"
+rochpcg_args+=" --tol=${tol}"
+rochpcg_args+=" --pz=${pz}"
+rochpcg_args+=" --zl=${zl}"
+rochpcg_args+=" --zu=${zu}"
+
+# Calculate total number of processes
+np=$((${npx}*${npy}*${npz}))
+
+# =================================================
+# Run rochpcg script
+# =================================================
+# Run single-node test if --hosts is not set
+if [ -z "${nodes}" ]; then
+ echo "No compute nodes specified. Running in single-node mode."
+
+ ${mpi_bin} --allow-run-as-root \
+ --bind-to none \
+ --mca pml ucx \
+ --mca osc ucx \
+ --mca btl ^vader,tcp,openib,uct \
+ --mca coll ^hcoll \
+ -x UCX_TLS=self,sm,rocm \
+ -x UCX_RNDV_SCHEME=put_zcopy \
+ -x UCX_MEMTYPE_CACHE=y \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+ -np ${np} \
+ ${rochpcg_runscript} ${rochpcg_args}
+else
+ echo "Running in multi-node mode. Using nodes: ${nodes}"
+ echo "Using TCP interface: ${tcp_iface}"
+ echo "Using SSH port: ${ssh_port}"
+
+ # Set rank counts for hosts
+ IFS=',' read -ra node_array <<< "${nodes}"
+ hosts_string=""
+ for node in "${node_array[@]}"; do
+ hosts_string+="${node}:${ngpu_per_node},"
+ done
+ hosts_string="${hosts_string%,}"
+
+ echo "MPI hosts: ${hosts_string}"
+
+ # Copy files to other nodes
+ current_node=$(hostname)
+ copyto_hosts=()
+ for node in "${node_array[@]}"; do
+ if [[ "${node}" != "${current_node}" ]]; then
+ copyto_hosts+=("${node}")
+ fi
+ done
+
+ # Copy files using rsync only if there are other nodes to copy to
+ if [ ${#copyto_hosts[@]} -gt 0 ]; then
+ echo "Copying files to other nodes in parallel: ${copyto_hosts[@]}"
+ for node in "${copyto_hosts[@]}"; do
+ rsync -az -e "ssh -p ${ssh_port}" build deps ${rochpcg_runscript} "${node}:/workspace/" &
+ done
+ wait
+ echo "Files synchronized successfully."
+ fi
+
+ # Multi-node run
+ ${mpi_bin} --allow-run-as-root \
+ --prefix ${ompi_prefix} \
+ --map-by ppr:${ngpu_per_node}:node --bind-to none \
+ --mca pml ucx \
+ --mca osc ucx \
+ --mca btl ^openib \
+ --mca btl_tcp_if_include ${tcp_iface} \
+ --mca plm_rsh_args "-p ${ssh_port}" \
+ --mca coll_hcoll_enable 0 \
+ -x UCX_TLS=self,sm,rocm,rc \
+ -x UCX_RNDV_SCHEME=put_zcopy \
+ -x UCX_RNDV_FRAG_MEM_TYPE=rocm \
+ -x UCX_MEMTYPE_CACHE=n \
+ -x UCX_LOG_LEVEL=fatal \
+ -x HSA_FORCE_FINE_GRAIN_PCIE=1 \
+ -x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX \
+ -np ${np} \
+ -H ${hosts_string} \
+ ${rochpcg_runscript} ${rochpcg_args}
+fi
\ No newline at end of file
diff --git a/run_rochpcg b/run_rochpcg
new file mode 100755
index 0000000..0f806fe
--- /dev/null
+++ b/run_rochpcg
@@ -0,0 +1,195 @@
+#!/bin/bash
+
+# =================================================
+# Helper functions
+# =================================================
+help() {
+ cat << EOF
+rocHPCG helper script
+Usage: $(basename "$0") [OPTIONS]
+
+OPTIONS:
+ -h, --help Show this help message and exit
+ --npx Number of processes in x dimension of process grid (default: ${npx})
+ --npy Number of processes in y dimension of process grid (default: ${npy})
+ --npz Number of processes in z dimension of process grid (default: ${npz})
+ --nx Problem size in x dimension (default: ${nx})
+ --ny Problem size in y dimension (default: ${ny})
+ --nz Problem size in z dimension (default: ${nz})
+ --rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
+ --tol Residual tolerance, skip reference verification if set (default: ${tol})
+ --pz Partition boundary in z process dimension (default: 0, uniform grid)
+ --zl Local nz value for processes with z rank < pz (default: equal to ${nz})
+ --zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
+EOF
+}
+
+# =================================================
+# Global variables
+# =================================================
+npx=1
+npy=1
+npz=1
+nx=560
+ny=280
+nz=280
+runtime=60
+tol=1
+pz=0
+zl=${nz}
+zu=${nz}
+
+rochpcg_bin="${PWD}/build/release/rochpcg-install/bin/rochpcg"
+
+# =================================================
+# Parameter parsing
+# =================================================
+GETOPT_PARSE=$(getopt --name "${0}" --options h --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu: -- "$@") \
+ || { echo "getopt invocation failed; could not parse the command line"; exit 1; }
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+ case "${1}" in
+ -h|--help) help; exit 0 ;;
+ --npx) npx=${2}; shift 2 ;;
+ --npy) npy=${2}; shift 2 ;;
+ --npz) npz=${2}; shift 2 ;;
+ --nx) nx=${2}; shift 2 ;;
+ --ny) ny=${2}; shift 2 ;;
+ --nz)
+ nz=${2}
+ zl=${nz}
+ zu=${nz}
+ shift 2 ;;
+ --rt) runtime=${2}; shift 2 ;;
+ --tol) tol=${2}; shift 2 ;;
+ --pz) pz=${2}; shift 2 ;;
+ --zl) zl=${2}; shift 2 ;;
+ --zu) zu=${2}; shift 2 ;;
+ --) shift ; break ;;
+ *) echo "Unexpected command line parameter received; aborting";
+ exit 1
+ ;;
+ esac
+done
+
+# Build rochpcg arguments
+rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz}"
+rochpcg_args+=" --nx=${nx} --ny=${ny} --nz=${nz}"
+rochpcg_args+=" --rt=${runtime}"
+rochpcg_args+=" --tol=${tol}"
+rochpcg_args+=" --pz=${pz}"
+rochpcg_args+=" --zl=${zl}"
+rochpcg_args+=" --zu=${zu}"
+
+# =================================================
+# Affinity setup
+# =================================================
+globalRank=$OMPI_COMM_WORLD_RANK
+globalSize=$OMPI_COMM_WORLD_SIZE
+rank=$OMPI_COMM_WORLD_LOCAL_RANK
+size=$OMPI_COMM_WORLD_LOCAL_SIZE
+
+#construct a list of all cpus, sorted by core
+cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
+
+#construct list of devices and their numa affinities
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+
+#count the cpus per core
+threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
+
+#remove the extra cpus on each core to make a list of just physical cores, then sort by numa domain
+corelist=$(echo "$cpulist" | awk -v tpc=${threads_per_core} '(NR-1)%tpc==0' | sort -k 3 -g -s)
+
+#count numa domains
+line=($(echo "$cpulist" | tail -n 1))
+n_numa=$((line[2]+1))
+
+numa_core_counts=()
+numa_proc_counts=()
+for i in $(seq 1 ${n_numa}); do numa_core_counts+=(0); numa_proc_counts+=(0); done
+
+#parse the list of cpus to array and count cpus in each numa
+cpus=()
+while read -a line; do
+ cpus+=(${line[0]})
+ ((numa_core_counts[${line[2]}]++)) || true
+done <<< "${corelist}"
+
+numa_core_offsets=(0)
+for i in $(seq 1 $((n_numa-1))); do numa_core_offsets+=($((numa_core_offsets[$((i-1))] + numa_core_counts[$i]))); done
+
+#parse device to numa mapping
+device_to_numa=()
+while read -a line; do
+ device_to_numa+=(${line[1]})
+done <<< "${devicelist}"
+
+rank_to_device=()
+n_devices=$(echo "${devicelist}" | grep -c "card")
+for i in $(seq 0 $((size-1))); do
+ rank_to_device+=($((i%n_devices)))
+done
+
+mygpu=${rank_to_device[rank]}
+mynuma=${device_to_numa[mygpu]}
+
+rank_to_numa=()
+for i in $(seq 0 $((size-1))); do
+ rank_to_numa+=(${device_to_numa[${rank_to_device[$((i%n_devices))]}]})
+done
+
+for i in $(seq 0 $((size-1))); do
+ numa=${rank_to_numa[$i]}
+ ((numa_proc_counts[numa]++)) || true
+done
+
+omp_num_threads=$((numa_core_counts[mynuma]/numa_proc_counts[mynuma]))
+
+core_offset=${numa_core_offsets[mynuma]}
+for i in $(seq 0 $((rank-1))); do
+ numa=${rank_to_numa[$i]}
+ if [[ $numa -eq $mynuma ]]; then
+ core_offset=$((core_offset + omp_num_threads))
+ fi
+done
+
+omp_places="{${cpus[core_offset]}}"
+for c in $(seq 1 $((omp_num_threads-1))); do
+ omp_places+=",{${cpus[core_offset+c]}}"
+done
+
+if [[ $omp_num_threads -gt 1 ]]; then
+ places="{${cpus[core_offset]}-${cpus[core_offset+$((omp_num_threads-1))]}}"
+else
+ places="{${cpus[core_offset]}}"
+fi
+
+# Export OpenMP config
+export OMP_NUM_THREADS=${omp_num_threads}
+export OMP_PLACES=${omp_places}
+export OMP_PROC_BIND=true
+
+# Hard-coded IB mapping for now
+declare -a IB_MAP=(
+ "mlx5_1:1" # GPU 0 -> NUMA 3
+ "mlx5_2:1" # GPU 1 -> NUMA 3
+ "mlx5_3:1" # GPU 2 -> NUMA 0
+ "mlx5_4:1" # GPU 3 -> NUMA 0
+ "mlx5_7:1" # GPU 4 -> NUMA 7
+ "mlx5_8:1" # GPU 5 -> NUMA 7
+ "mlx5_9:1" # GPU 6 -> NUMA 4
+ "mlx5_10:1" # GPU 7 -> NUMA 4
+)
+
+myib=${IB_MAP[$rank]}
+export UCX_NET_DEVICES=${myib}
+
+if [[ $globalRank -lt $size ]]; then
+ echo "Node Binding: Process $rank [(nx,ny,nz)=(${nx},${ny},${nz})] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
+fi
+
+# Run
+numactl -N ${mynuma} -m ${mynuma} ${rochpcg_bin} ${rochpcg_args}
\ No newline at end of file
#!/bin/bash
set -e
# Example invocations of the mpirun_rochpcg helper for various node/GPU
# counts. Each example assumes 8 GPUs per node, so the process grid must
# satisfy npx * npy * npz == total GPUs used.
nx=560
ny=280
nz=280
rt=60
# 4 nodes, 32 GPUs (4*4*2 = 32 ranks)
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 4 --npy 4 --npz 2 \
-H node01,node02,node03,node04 \
--tcp-iface p14p2 \
--ssh-port 3333
# 2 nodes, 16 GPUs (4*2*2 = 16 ranks)
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 4 --npy 2 --npz 2 \
-H node01,node02 \
--tcp-iface p14p2 \
--ssh-port 3333
# 1 node, 8 GPUs (2*2*2 = 8 ranks)
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 2 --npy 2 --npz 2
# 1 node, 4 GPUs (2*2*1 = 4 ranks)
# BUG FIX: this example previously used --npz 2, which launches 8 ranks
# and duplicated the 8-GPU example above.
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 2 --npy 2 --npz 1
# 1 node, 2 GPUs (2*1*1 = 2 ranks)
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 2 --npy 1 --npz 1
# 1 node, 1 GPU
./mpirun_rochpcg --nx ${nx} --ny ${ny} --nz ${nz} --rt ${rt} \
--npx 1 --npy 1 --npz 1
#!/bin/bash
# =================================================
# Helper functions
# =================================================
# Print the usage text for the per-rank launcher, including the current
# default values of all tunables (expanded from the globals at call time).
help() {
cat << USAGE
rocHPCG helper script
Usage: ${0##*/} [OPTIONS]
OPTIONS:
-h, --help Show this help message and exit
--npx Number of processes in x dimension of process grid (default: ${npx})
--npy Number of processes in y dimension of process grid (default: ${npy})
--npz Number of processes in z dimension of process grid (default: ${npz})
--nx Problem size in x dimension (default: ${nx})
--ny Problem size in y dimension (default: ${ny})
--nz Problem size in z dimension (default: ${nz})
--rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
--tol Residual tolerance, skip reference verification if set (default: ${tol})
--pz Partition boundary in z process dimension (default: 0, uniform grid)
--zl Local nz value for processes with z rank < pz (default: equal to ${nz})
--zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
USAGE
}
# =================================================
# Global variables
# =================================================
# Process-grid and problem-size defaults; every one of these can be
# overridden from the command line.
npx=1; npy=1; npz=1
nx=560; ny=280; nz=280
runtime=60; tol=1; pz=0
# Local z-sizes track the global nz until --nz/--zl/--zu override them.
zl=${nz}; zu=${nz}
# rocHPCG binary as installed by ./install.sh into the release build tree.
rochpcg_bin="${PWD}/build/release/rochpcg-install/bin/rochpcg"
# =================================================
# Parameter parsing
# =================================================
# Normalize the command line with util-linux getopt so long options work;
# getopt exits non-zero on an unknown option, in which case we abort.
# NOTE(review): requires GNU/util-linux getopt — the BSD/macOS getopt does
# not support --longoptions.
GETOPT_PARSE=$(getopt --name "${0}" --options h --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu: -- "$@") \
|| { echo "getopt invocation failed; could not parse the command line"; exit 1; }
# Replace the positional parameters with the normalized option list.
eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
-h|--help) help; exit 0 ;;
--npx) npx=${2}; shift 2 ;;
--npy) npy=${2}; shift 2 ;;
--npz) npz=${2}; shift 2 ;;
--nx) nx=${2}; shift 2 ;;
--ny) ny=${2}; shift 2 ;;
# --nz also resets zl/zu, which default to nz; later --zl/--zu options
# can still override them because getopt preserves argument order.
--nz)
nz=${2}
zl=${nz}
zu=${nz}
shift 2 ;;
--rt) runtime=${2}; shift 2 ;;
--tol) tol=${2}; shift 2 ;;
--pz) pz=${2}; shift 2 ;;
--zl) zl=${2}; shift 2 ;;
--zu) zu=${2}; shift 2 ;;
# "--" marks the end of the normalized options.
--) shift ; break ;;
# getopt already rejected unknown options, so reaching here means the
# normalized list is malformed; fail defensively.
*) echo "Unexpected command line parameter received; aborting";
exit 1
;;
esac
done
# Assemble the argument string handed to the rocHPCG binary. Kept as one
# flat, space-separated string because it is expanded unquoted
# (intentional word splitting) at the invocation site.
rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz} --nx=${nx} --ny=${ny} --nz=${nz} --rt=${runtime} --tol=${tol} --pz=${pz} --zl=${zl} --zu=${zu}"
# =================================================
# Affinity setup
# =================================================
# Derive per-rank CPU-core, NUMA-domain, and GPU assignments from the
# OpenMPI rank environment plus lscpu/hy-smi topology output.
globalRank=$OMPI_COMM_WORLD_RANK
globalSize=$OMPI_COMM_WORLD_SIZE
rank=$OMPI_COMM_WORLD_LOCAL_RANK
size=$OMPI_COMM_WORLD_LOCAL_SIZE
# Construct a list of all cpus (CPU, CORE, NODE columns), sorted by core.
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
# Device -> NUMA affinity table.
# NOTE(review): assumes `hy-smi --csv --showtoponuma` emits one header line
# followed by "card...,numa" rows — confirm the format on target hardware.
devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
# Count the cpus (hardware threads) sharing core 0 to get threads/core.
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
# Keep one cpu per physical core, then sort by NUMA domain.
corelist=$(echo "$cpulist" | awk -v tpc=${threads_per_core} '(NR-1)%tpc==0' | sort -k 3 -g -s)
# Count NUMA domains from the highest NODE id in the (node-sorted) list.
line=($(echo "$cpulist" | tail -n 1))
n_numa=$((line[2]+1))
numa_core_counts=()
numa_proc_counts=()
for i in $(seq 1 ${n_numa}); do numa_core_counts+=(0); numa_proc_counts+=(0); done
# Parse the physical-core list into an array and count cores per NUMA.
cpus=()
while read -a line; do
cpus+=(${line[0]})
((numa_core_counts[${line[2]}]++)) || true
done <<< "${corelist}"
# Starting index (into cpus[]) of each NUMA domain's cores.
# NOTE(review): this prefix sum adds counts[i] rather than counts[i-1];
# it only yields correct offsets when all domains have equal core counts —
# verify on machines with asymmetric NUMA domains.
numa_core_offsets=(0)
for i in $(seq 1 $((n_numa-1))); do numa_core_offsets+=($((numa_core_offsets[$((i-1))] + numa_core_counts[$i]))); done
# Parse the device -> NUMA mapping (second column of devicelist).
device_to_numa=()
while read -a line; do
device_to_numa+=(${line[1]})
done <<< "${devicelist}"
# Round-robin local ranks onto the detected devices.
# NOTE(review): if hy-smi found no "card" rows, n_devices is 0 and the
# modulo below divides by zero — there is no guard for that case.
rank_to_device=()
n_devices=$(echo "${devicelist}" | grep -c "card")
for i in $(seq 0 $((size-1))); do
rank_to_device+=($((i%n_devices)))
done
mygpu=${rank_to_device[rank]}
mynuma=${device_to_numa[mygpu]}
# NUMA domain of every local rank (via its assigned device).
rank_to_numa=()
for i in $(seq 0 $((size-1))); do
rank_to_numa+=(${device_to_numa[${rank_to_device[$((i%n_devices))]}]})
done
# How many local ranks land on each NUMA domain.
for i in $(seq 0 $((size-1))); do
numa=${rank_to_numa[$i]}
((numa_proc_counts[numa]++)) || true
done
# Evenly split this domain's cores among the ranks sharing it.
omp_num_threads=$((numa_core_counts[mynuma]/numa_proc_counts[mynuma]))
# Skip past the core slices of lower-numbered ranks on the same domain.
core_offset=${numa_core_offsets[mynuma]}
for i in $(seq 0 $((rank-1))); do
numa=${rank_to_numa[$i]}
if [[ $numa -eq $mynuma ]]; then
core_offset=$((core_offset + omp_num_threads))
fi
done
# Explicit OMP_PLACES list, one place per assigned cpu.
omp_places="{${cpus[core_offset]}}"
for c in $(seq 1 $((omp_num_threads-1))); do
omp_places+=",{${cpus[core_offset+c]}}"
done
# Human-readable core-range string used only for the log line below.
if [[ $omp_num_threads -gt 1 ]]; then
places="{${cpus[core_offset]}-${cpus[core_offset+$((omp_num_threads-1))]}}"
else
places="{${cpus[core_offset]}}"
fi
# Export OpenMP config
# Pin this rank's OpenMP threads to the cores selected above.
export OMP_NUM_THREADS=${omp_num_threads}
export OMP_PLACES=${omp_places}
export OMP_PROC_BIND=true
# Hard-coded IB mapping for now
# NOTE(review): this GPU -> HCA table is specific to one 8-GPU node layout;
# verify against the actual NIC topology before running on new hardware.
declare -a IB_MAP=(
"mlx5_1:1" # GPU 0 -> NUMA 3
"mlx5_2:1" # GPU 1 -> NUMA 3
"mlx5_3:1" # GPU 2 -> NUMA 0
"mlx5_4:1" # GPU 3 -> NUMA 0
"mlx5_7:1" # GPU 4 -> NUMA 7
"mlx5_8:1" # GPU 5 -> NUMA 7
"mlx5_9:1" # GPU 6 -> NUMA 4
"mlx5_10:1" # GPU 7 -> NUMA 4
)
# Select the HCA by local rank (assumes at most 8 ranks per node) and pin
# UCX to it.
myib=${IB_MAP[$rank]}
export UCX_NET_DEVICES=${myib}
# Print the binding summary only from ranks on the first node
# (globalRank < local size holds only there).
if [[ $globalRank -lt $size ]]; then
echo "Node Binding: Process $rank [(nx,ny,nz)=(${nx},${ny},${nz})] GPU: $mygpu, NUMA: $mynuma, IB: $myib, CPU Cores: $omp_num_threads - $places"
fi
# Run
# Bind CPU execution and memory allocation to this rank's NUMA domain.
numactl -N ${mynuma} -m ${mynuma} ${rochpcg_bin} ${rochpcg_args}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment