MicroBenchmark: rocHPCG

e4c2bd4c · one · 211e63c7 · e4c2bd4c · e4c2bd4c · e4c2bd4c
Commit e4c2bd4c authored Mar 27, 2026 by one
8 changed files
--- a/.gitmodules
+++ b/.gitmodules
@@ -36,3 +36,6 @@
 [submodule "third_party/BabelStream"]
 	path = third_party/BabelStream
 	url = https://github.com/UoB-HPC/BabelStream.git
+[submodule "third_party/rocHPCG"]
+	path = third_party/rocHPCG
+	url = https://github.com/ROCm/rocHPCG.git
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -6,6 +6,7 @@
 from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmark, MicroBenchmarkWithInvoke
 from superbench.benchmarks.micro_benchmarks.gemm_flops_performance_base import GemmFlopsBenchmark
 from superbench.benchmarks.micro_benchmarks.memory_bw_performance_base import MemBwBenchmark
+from superbench.benchmarks.micro_benchmarks.gpu_hpcg_performance_base import GpuHpcgBenchmark

 from superbench.benchmarks.micro_benchmarks.computation_communication_overlap import ComputationCommunicationOverlap
 from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
@@ -14,6 +15,7 @@ from superbench.benchmarks.micro_benchmarks.cublaslt_function import CublasLtBen
 from superbench.benchmarks.micro_benchmarks.rocm_hipblaslt_function import RocmHipBlasLtBenchmark
 from superbench.benchmarks.micro_benchmarks.dtk_hipblaslt_function import DtkHipBlasLtBenchmark
 from superbench.benchmarks.micro_benchmarks.dtk_gemm_flops_performance import DtkGemmFlopsBenchmark
+from superbench.benchmarks.micro_benchmarks.dtk_hpcg_performance import DtkHpcgBenchmark
 from superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance import CudaGemmFlopsBenchmark
 from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
 from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
@@ -63,6 +65,7 @@ __all__ = [
    'GemmFlopsBenchmark',
    'GpuBurnBenchmark',
    'GpuCopyBwBenchmark',
+    'GpuHpcgBenchmark',
    'GpuStreamBenchmark',
    'IBBenchmark',
    'IBLoopbackBenchmark',
@@ -80,5 +83,6 @@ __all__ = [
    'DirectXGPUCopyBw',
    'DirectXGPUMemBw',
    'DirectXGPUCoreFlops',
+    'DtkHpcgBenchmark',
    'NvBandwidthBenchmark',
 ]
--- a/superbench/benchmarks/micro_benchmarks/dtk_hpcg_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/dtk_hpcg_performance.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the DTK HPCG benchmark."""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks import GpuHpcgBenchmark
+
+
+class DtkHpcgBenchmark(GpuHpcgBenchmark):
+    """HPCG micro-benchmark for the DTK platform, launched through ``run_rochpcg``."""
+    def __init__(self, name, parameters=''):
+        """Create the benchmark and select the executable name to invoke.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters=parameters)
+
+        # Assigned after the base constructor has run, so this is the value the instance keeps.
+        self._bin_name = 'run_rochpcg'
+
+
+BenchmarkRegistry.register_benchmark(
+    'gpu-hpcg',
+    DtkHpcgBenchmark,
+    platform=Platform.DTK,
+)
--- a/superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the GPU HPCG benchmark base class."""
+
+import os
+
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
+    """Common base for HPCG benchmarks executed on GPUs.
+
+    The command line is built from ``self._bin_name``, which this class reads but never assigns.
+    """
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+    def add_parser_arguments(self):
+        """Register the HPCG process-grid, problem-size and run-control options."""
+        super().add_parser_arguments()
+
+        # One row per option: (flag, value type, extra add_argument keywords, help text).
+        # --zl/--zu are registered without an explicit default; _preprocess() substitutes --nz for them.
+        option_table = (
+            ('--npx', int, {'default': 1}, 'Number of MPI processes in x dimension.'),
+            ('--npy', int, {'default': 1}, 'Number of MPI processes in y dimension.'),
+            ('--npz', int, {'default': 1}, 'Number of MPI processes in z dimension.'),
+            ('--nx', int, {'default': 560}, 'Local problem size in x dimension.'),
+            ('--ny', int, {'default': 280}, 'Local problem size in y dimension.'),
+            ('--nz', int, {'default': 280}, 'Local problem size in z dimension.'),
+            ('--rt', int, {'default': 60}, 'Benchmark runtime in seconds.'),
+            ('--tol', float, {'default': 1.0}, 'Residual tolerance; reference verification is skipped if set.'),
+            ('--pz', int, {'default': 0}, 'Partition boundary in z process dimension.'),
+            ('--zl', int, {}, 'Local nz value for processes with z rank < pz.'),
+            ('--zu', int, {}, 'Local nz value for processes with z rank >= pz.'),
+        )
+        for flag, value_type, extra_kwargs, help_text in option_table:
+            self._parser.add_argument(flag, type=value_type, required=False, help=help_text, **extra_kwargs)
+
+    def _preprocess(self):
+        """Run the parent preprocessing, then assemble the single HPCG command to execute.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        args = self._args
+        # Options that were not supplied fall back to the uniform local nz.
+        lower_nz = args.nz if args.zl is None else args.zl
+        upper_nz = args.nz if args.zu is None else args.zu
+
+        # Emitted in this order as '--<key>=<value>' after the executable path.
+        option_values = (
+            ('npx', args.npx),
+            ('npy', args.npy),
+            ('npz', args.npz),
+            ('nx', args.nx),
+            ('ny', args.ny),
+            ('nz', args.nz),
+            ('rt', args.rt),
+            ('tol', args.tol),
+            ('pz', args.pz),
+            ('zl', lower_nz),
+            ('zu', upper_nz),
+        )
+        executable = os.path.join(args.bin_dir, self._bin_name)
+        pieces = [f'{executable}'] + [f'--{key}={value}' for key, value in option_values]
+        self._commands = [' '.join(pieces)]
+
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Store the unparsed benchmark output; no metrics are extracted from it here.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            bool: Always True for now.
+        """
+        raw_key = f'raw_output_{cmd_idx}'
+        self._result.add_raw_data(raw_key, raw_output, self._args.log_raw_data)
+        return True
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -91,6 +91,8 @@ class SuperBenchRunner():
                            'btl_tcp_if_exclude': 'lo,docker0',
                            'coll_hcoll_enable': 0,
                        }
+                    if 'bind_to' not in mode:
+                        self._sb_benchmarks[name].modes[idx].bind_to = 'numa'
                    for key in ['PATH', 'LD_LIBRARY_PATH', 'SB_MICRO_PATH', 'SB_WORKSPACE']:
                        self._sb_benchmarks[name].modes[idx].env.setdefault(key, None)
                    if 'pattern' in mode:
@@ -182,13 +184,14 @@ class SuperBenchRunner():
                '-tag-output '    # tag mpi output with [jobid,rank]<stdout/stderr> prefix
                '-allow-run-as-root '    # allow mpirun to run when executed by root user
                '{host_list} '    # use prepared hostfile or specify nodes and launch {proc_num} processes on each node
-                '-bind-to numa '    # bind processes to numa
+                '-bind-to {bind_to} '    # bind processes according to mode config
                '{mca_list} {env_list} {command}'
            ).format(
                trace=trace_command,
                host_list=f'-host localhost:{mode.proc_num}' if 'node_num' in mode and mode.node_num == 1 else
                f'-hostfile hostfile -map-by ppr:{mode.proc_num}:node' if 'host_list' not in mode else '-host ' +
                ','.join(f'{host}:{mode.proc_num}' for host in mode.host_list),
+                bind_to=mode.bind_to,
                mca_list=' '.join(f'-mca {k} {v}' for k, v in mode.mca.items()),
                env_list=' '.join(
                    f'-x {k}={str(v).format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)}'

--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -16,7 +16,7 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")

 NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)

-.PHONY: all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
+.PHONY: all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm rocm_hpcg

 # Build targets.
 all: cuda rocm
@@ -24,7 +24,7 @@ all: cuda rocm
 cuda_with_msccl: cuda cuda_msccl
 cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
-dtk: common rocm_perftest rocm_rccl_tests rocm_babelstream_hip megatron_deepspeed apex_rocm rocm_megatron_lm
+dtk: common rocm_perftest rocm_rccl_tests rocm_babelstream_hip megatron_deepspeed apex_rocm rocm_megatron_lm rocm_hpcg
 cpu: common cpu_perftest
 common: fio cpu_stream

@@ -192,6 +192,19 @@ ifneq (,$(wildcard BabelStream/CMakeLists.txt))
 	cp -v ./BabelStream/build/hip-stream $(SB_MICRO_PATH)/bin/
 endif

+# Build rocHPCG (applying the run_rochpcg helper patch when the script is absent) and stage the binary and helper script in $(SB_MICRO_PATH)/bin for the SuperBench DTK image.
+rocm_hpcg: sb_micro_path
+ifneq (,$(wildcard rocHPCG/install.sh))
+	cd ./rocHPCG && \
+	if [ ! -f ./run_rochpcg ]; then \
+		git apply ../rochpcg-scripts-bw.patch; \
+	fi && \
+	./install.sh --with-rocm=$(ROCM_PATH) --with-mpi=$(MPI_HOME) --gpu-aware-mpi=on
+	cp -v ./rocHPCG/build/release/rochpcg-install/bin/rochpcg $(SB_MICRO_PATH)/bin/
+	cp -v ./rocHPCG/run_rochpcg $(SB_MICRO_PATH)/bin/
+	chmod +x $(SB_MICRO_PATH)/bin/rochpcg $(SB_MICRO_PATH)/bin/run_rochpcg
+endif
+
 # Build GPCNET from commit c56fd9.
 gpcnet: sb_micro_path
 	bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"

--- a/rocHPCG @ 8f2795d5
+++ b/rocHPCG @ 8f2795d5
+Subproject commit 8f2795d564c69f3f4c3404d34f5b78a88e9e7dcc
--- a/third_party/rochpcg-scripts-bw.patch
+++ b/third_party/rochpcg-scripts-bw.patch
+diff --git a/run_rochpcg b/run_rochpcg
+new file mode 100755
+index 0000000..0f806fe
+--- /dev/null
+++ b/run_rochpcg
+@@ -0,0 +1,180 @@
+#!/bin/bash
+
+# =================================================
+# Helper functions (the help text expands the option variables at the time help() is called)
+# =================================================
+help() {
+    cat << EOF
+rocHPCG helper script
+Usage: $(basename "$0") [OPTIONS]
+
+OPTIONS:
+    -h, --help    Show this help message and exit
+    --npx         Number of processes in x dimension of process grid (default: ${npx})
+    --npy         Number of processes in y dimension of process grid (default: ${npy})
+    --npz         Number of processes in z dimension of process grid (default: ${npz})
+    --nx          Problem size in x dimension (default: ${nx})
+    --ny          Problem size in y dimension (default: ${ny})
+    --nz          Problem size in z dimension (default: ${nz})
+    --rt          Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
+    --tol         Residual tolerance, skip reference verification if set (default: ${tol})
+    --pz          Partition boundary in z process dimension (default: 0, uniform grid)
+    --zl          Local nz value for processes with z rank < pz (default: equal to ${nz})
+    --zu          Local nz value for processes with z rank >= pz (default: equal to ${nz})
+EOF
+}
+
+# =================================================
+# Global variables: option defaults (overridden by the command-line parsing below) and the rochpcg binary path
+# =================================================
+npx=1
+npy=1
+npz=1
+nx=560
+ny=280
+nz=280
+runtime=60
+tol=1
+pz=0
+zl=${nz}
+zu=${nz}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+rochpcg_bin="${SCRIPT_DIR}/rochpcg"
+
+if [[ ! -x "${rochpcg_bin}" ]]; then
+  echo "Cannot find rochpcg binary at ${rochpcg_bin}"
+  exit 1
+fi
+
+# =================================================
+# Parameter parsing (an explicit --zl/--zu is kept even when --nz appears later on the command line)
+# =================================================
+GETOPT_PARSE=$(getopt --name "${0}" --options h --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu: -- "$@") \
+  || { echo "getopt invocation failed; could not parse the command line"; exit 1; }
+
+eval set -- "${GETOPT_PARSE}"
+zl_set=""; zu_set=""    # become non-empty once --zl / --zu has been given explicitly
+while true; do
+  case "${1}" in
+    -h|--help) help; exit 0 ;;
+    --npx) npx=${2}; shift 2 ;;
+    --npy) npy=${2}; shift 2 ;;
+    --npz) npz=${2}; shift 2 ;;
+    --nx) nx=${2}; shift 2 ;;
+    --ny) ny=${2}; shift 2 ;;
+    --nz)
+        nz=${2}
+        [[ -n "${zl_set}" ]] || zl=${nz}
+        [[ -n "${zu_set}" ]] || zu=${nz}
+        shift 2 ;;
+    --rt) runtime=${2}; shift 2 ;;
+    --tol) tol=${2}; shift 2 ;;
+    --pz) pz=${2}; shift 2 ;;
+    --zl) zl=${2}; zl_set=1; shift 2 ;;
+    --zu) zu=${2}; zu_set=1; shift 2 ;;
+    --) shift ; break ;;
+    *)  echo "Unexpected command line parameter received; aborting";
+        exit 1
+        ;;
+  esac
+done
+
+# Build the rochpcg argument string (expanded unquoted at launch so it splits into separate arguments)
+rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz}"
+rochpcg_args+=" --nx=${nx} --ny=${ny} --nz=${nz}"
+rochpcg_args+=" --rt=${runtime}"
+rochpcg_args+=" --tol=${tol}"
+rochpcg_args+=" --pz=${pz}"
+rochpcg_args+=" --zl=${zl}"
+rochpcg_args+=" --zu=${zu}"
+
+# =================================================
+# Affinity setup: pick a GPU per local rank and derive the CPU cores / OpenMP places on that GPU's NUMA node
+# =================================================
+globalRank=${OMPI_COMM_WORLD_RANK:-0}
+rank=${OMPI_COMM_WORLD_LOCAL_RANK:-0}
+size=${OMPI_COMM_WORLD_LOCAL_SIZE:-1}
+
+# Tab-separated "CPU CORE NODE" rows for every logical cpu, sorted by core id
+cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
+
+# Tab-separated hy-smi topology rows (header dropped); column 2 is read below as the device's NUMA node
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+
+# Count the cpus per core: rows whose CORE column is 0 ($'\t' keeps literal tab characters out of the script)
+threads_per_core=$(echo "${cpulist}" | grep -c $'\t0\t')
+
+# Keep one cpu per physical core, then sort by NUMA domain
+corelist=$(echo "$cpulist" | awk -v tpc=${threads_per_core} '(NR-1)%tpc==0' | sort -k 3 -g -s)
+
+# Count NUMA domains: NODE value of the last row of the core-sorted list, plus one
+line=($(echo "$cpulist" | tail -n 1))
+n_numa=$((line[2]+1))
+
+numa_core_counts=()
+numa_proc_counts=()
+for i in $(seq 1 ${n_numa}); do numa_core_counts+=(0); numa_proc_counts+=(0); done
+
+# Collect the cpu id of each physical core (NUMA-sorted) and count the cores in each NUMA domain
+cpus=()
+while read -a line; do
+  cpus+=(${line[0]})
+  ((numa_core_counts[${line[2]}]++)) || true
+done <<< "${corelist}"
+
+numa_core_offsets=(0)    # prefix sum: index in cpus[] of NUMA i = offset of NUMA i-1 + core count of NUMA i-1
+for i in $(seq 1 $((n_numa-1))); do numa_core_offsets+=($((numa_core_offsets[$((i-1))] + numa_core_counts[$((i-1))]))); done
+
+# Parse the device -> NUMA node mapping
+device_to_numa=()
+while read -a line; do
+  device_to_numa+=(${line[1]})
+done <<< "${devicelist}"
+
+rank_to_device=()
+n_devices=$(echo "${devicelist}" | grep -c "card")
+for i in $(seq 0 $((size-1))); do
+  rank_to_device+=($((i%n_devices)))
+done
+
+mygpu=${rank_to_device[rank]}
+mynuma=${device_to_numa[mygpu]}
+
+rank_to_numa=()
+for i in $(seq 0 $((size-1))); do
+  rank_to_numa+=(${device_to_numa[${rank_to_device[$((i%n_devices))]}]})
+done
+
+for i in $(seq 0 $((size-1))); do
+  numa=${rank_to_numa[$i]}
+  ((numa_proc_counts[numa]++)) || true
+done
+
+omp_num_threads=$((numa_core_counts[mynuma]/numa_proc_counts[mynuma]))
+
+core_offset=${numa_core_offsets[mynuma]}
+for i in $(seq 0 $((rank-1))); do
+  numa=${rank_to_numa[$i]}
+  if [[ $numa -eq $mynuma ]]; then
+    core_offset=$((core_offset + omp_num_threads))
+  fi
+done
+
+omp_places="{${cpus[core_offset]}}"
+for c in $(seq 1 $((omp_num_threads-1))); do
+  omp_places+=",{${cpus[core_offset+c]}}"
+done
+
+if [[ $omp_num_threads -gt 1 ]]; then
+  places="{${cpus[core_offset]}-${cpus[core_offset+$((omp_num_threads-1))]}}"
+else
+  places="{${cpus[core_offset]}}"
+fi
+
+# Export OpenMP config
+export OMP_NUM_THREADS=${omp_num_threads}
+export OMP_PLACES=${omp_places}
+export OMP_PROC_BIND=true
+
+if [[ $globalRank -lt $size ]]; then
+  echo "Node Binding: Process $rank [(nx,ny,nz)=(${nx},${ny},${nz})] GPU: $mygpu, NUMA: $mynuma, CPU Cores: $omp_num_threads - $places"
+fi
+
+# Run rochpcg with CPU and memory bound to the NUMA node found for the selected GPU
+numactl -N ${mynuma} -m ${mynuma} ${rochpcg_bin} ${rochpcg_args}
+\ No newline at end of file