Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
e4c2bd4c
Commit
e4c2bd4c
authored
Mar 27, 2026
by
one
Browse files
MicroBenchmark: rocHPCG
parent
211e63c7
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
387 additions
and
3 deletions
+387
-3
.gitmodules
.gitmodules
+3
-0
superbench/benchmarks/micro_benchmarks/__init__.py
superbench/benchmarks/micro_benchmarks/__init__.py
+4
-0
superbench/benchmarks/micro_benchmarks/dtk_hpcg_performance.py
...bench/benchmarks/micro_benchmarks/dtk_hpcg_performance.py
+24
-0
superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
.../benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
+144
-0
superbench/runner/runner.py
superbench/runner/runner.py
+4
-1
third_party/Makefile
third_party/Makefile
+15
-2
third_party/rocHPCG
third_party/rocHPCG
+1
-0
third_party/rochpcg-scripts-bw.patch
third_party/rochpcg-scripts-bw.patch
+192
-0
No files found.
.gitmodules
View file @
e4c2bd4c
...
...
@@ -36,3 +36,6 @@
[submodule "third_party/BabelStream"]
path = third_party/BabelStream
url = https://github.com/UoB-HPC/BabelStream.git
[submodule "third_party/rocHPCG"]
path = third_party/rocHPCG
url = https://github.com/ROCm/rocHPCG.git
superbench/benchmarks/micro_benchmarks/__init__.py
View file @
e4c2bd4c
...
...
@@ -6,6 +6,7 @@
from
superbench.benchmarks.micro_benchmarks.micro_base
import
MicroBenchmark
,
MicroBenchmarkWithInvoke
from
superbench.benchmarks.micro_benchmarks.gemm_flops_performance_base
import
GemmFlopsBenchmark
from
superbench.benchmarks.micro_benchmarks.memory_bw_performance_base
import
MemBwBenchmark
from
superbench.benchmarks.micro_benchmarks.gpu_hpcg_performance_base
import
GpuHpcgBenchmark
from
superbench.benchmarks.micro_benchmarks.computation_communication_overlap
import
ComputationCommunicationOverlap
from
superbench.benchmarks.micro_benchmarks.cublas_function
import
CublasBenchmark
...
...
@@ -14,6 +15,7 @@
from
superbench.benchmarks.micro_benchmarks.rocm_hipblaslt_function
import
RocmHipBlasLtBenchmark
from
superbench.benchmarks.micro_benchmarks.dtk_hipblaslt_function
import
DtkHipBlasLtBenchmark
from
superbench.benchmarks.micro_benchmarks.dtk_gemm_flops_performance
import
DtkGemmFlopsBenchmark
from
superbench.benchmarks.micro_benchmarks.dtk_hpcg_performance
import
DtkHpcgBenchmark
from
superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance
import
CudaGemmFlopsBenchmark
from
superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance
import
CudaMemBwBenchmark
from
superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance
import
CudaNcclBwBenchmark
...
...
@@ -63,6 +65,7 @@
'GemmFlopsBenchmark'
,
'GpuBurnBenchmark'
,
'GpuCopyBwBenchmark'
,
'GpuHpcgBenchmark'
,
'GpuStreamBenchmark'
,
'IBBenchmark'
,
'IBLoopbackBenchmark'
,
...
...
@@ -80,5 +83,6 @@
'DirectXGPUCopyBw'
,
'DirectXGPUMemBw'
,
'DirectXGPUCoreFlops'
,
'DtkHpcgBenchmark'
,
'NvBandwidthBenchmark'
,
]
superbench/benchmarks/micro_benchmarks/dtk_hpcg_performance.py
0 → 100644
View file @
e4c2bd4c
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the DTK HPCG benchmark."""
from
superbench.benchmarks
import
BenchmarkRegistry
,
Platform
from
superbench.benchmarks.micro_benchmarks
import
GpuHpcgBenchmark
class DtkHpcgBenchmark(GpuHpcgBenchmark):
    """HPCG micro-benchmark for the DTK platform.

    All argument handling and command construction is inherited from
    GpuHpcgBenchmark; this class only selects the executable to launch.
    """
    def __init__(self, name, parameters=''):
        """Initialise the benchmark and pick the launcher binary.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters=parameters)

        # Executable name that the base class joins with the binary directory
        # when it assembles the command line.
        self._bin_name = 'run_rochpcg'
# Expose DtkHpcgBenchmark under the benchmark name 'gpu-hpcg' for the DTK platform.
BenchmarkRegistry.register_benchmark('gpu-hpcg', DtkHpcgBenchmark, platform=Platform.DTK)
superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
0 → 100644
View file @
e4c2bd4c
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the GPU HPCG benchmark base class."""
import
os
from
superbench.benchmarks.micro_benchmarks
import
MicroBenchmarkWithInvoke
class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
    """Shared base for GPU HPCG micro-benchmarks.

    Declares the HPCG command-line options, turns the parsed values into a
    single launcher command and stores the launcher's raw output. This class
    does not assign ``self._bin_name``; it only reads it in ``_preprocess``.
    """
    def __init__(self, name, parameters=''):
        """Initialise the benchmark.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

    def add_parser_arguments(self):
        """Register the HPCG options on the benchmark's argument parser."""
        super().add_parser_arguments()

        # One entry per option, in registration order. '--zl' and '--zu'
        # deliberately carry no 'default' key so that an omitted value stays
        # None and can be replaced by nz later on.
        option_specs = (
            ('--npx', {'type': int, 'default': 1, 'help': 'Number of MPI processes in x dimension.'}),
            ('--npy', {'type': int, 'default': 1, 'help': 'Number of MPI processes in y dimension.'}),
            ('--npz', {'type': int, 'default': 1, 'help': 'Number of MPI processes in z dimension.'}),
            ('--nx', {'type': int, 'default': 560, 'help': 'Local problem size in x dimension.'}),
            ('--ny', {'type': int, 'default': 280, 'help': 'Local problem size in y dimension.'}),
            ('--nz', {'type': int, 'default': 280, 'help': 'Local problem size in z dimension.'}),
            ('--rt', {'type': int, 'default': 60, 'help': 'Benchmark runtime in seconds.'}),
            (
                '--tol', {
                    'type': float,
                    'default': 1.0,
                    'help': 'Residual tolerance; reference verification is skipped if set.',
                }
            ),
            ('--pz', {'type': int, 'default': 0, 'help': 'Partition boundary in z process dimension.'}),
            ('--zl', {'type': int, 'help': 'Local nz value for processes with z rank < pz.'}),
            ('--zu', {'type': int, 'help': 'Local nz value for processes with z rank >= pz.'}),
        )
        for flag, spec in option_specs:
            self._parser.add_argument(flag, required=False, **spec)

    def _preprocess(self):
        """Build the launcher command from the parsed arguments.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        args = self._args
        # An unset --zl / --zu falls back to the local nz value.
        lower_nz = args.nz if args.zl is None else args.zl
        upper_nz = args.nz if args.zu is None else args.zu

        # Option order matches the order in which the options are registered.
        option_values = (
            ('npx', args.npx),
            ('npy', args.npy),
            ('npz', args.npz),
            ('nx', args.nx),
            ('ny', args.ny),
            ('nz', args.nz),
            ('rt', args.rt),
            ('tol', args.tol),
            ('pz', args.pz),
            ('zl', lower_nz),
            ('zu', upper_nz),
        )
        executable = os.path.join(args.bin_dir, self._bin_name)
        pieces = [f'{executable}']
        pieces.extend(f'--{key}={value}' for key, value in option_values)

        self._commands = [' '.join(pieces)]
        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Store the raw output of one command; no metrics are extracted here.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            bool: Always True.
        """
        raw_key = f'raw_output_{cmd_idx}'
        self._result.add_raw_data(raw_key, raw_output, self._args.log_raw_data)
        return True
superbench/runner/runner.py
View file @
e4c2bd4c
...
...
@@ -91,6 +91,8 @@ def __validate_sb_config(self): # noqa: C901
'btl_tcp_if_exclude'
:
'lo,docker0'
,
'coll_hcoll_enable'
:
0
,
}
if
'bind_to'
not
in
mode
:
self
.
_sb_benchmarks
[
name
].
modes
[
idx
].
bind_to
=
'numa'
for
key
in
[
'PATH'
,
'LD_LIBRARY_PATH'
,
'SB_MICRO_PATH'
,
'SB_WORKSPACE'
]:
self
.
_sb_benchmarks
[
name
].
modes
[
idx
].
env
.
setdefault
(
key
,
None
)
if
'pattern'
in
mode
:
...
...
@@ -182,13 +184,14 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
'-tag-output '
# tag mpi output with [jobid,rank]<stdout/stderr> prefix
'-allow-run-as-root '
# allow mpirun to run when executed by root user
'{host_list} '
# use prepared hostfile or specify nodes and launch {proc_num} processes on each node
'-bind-to
numa
'
# bind processes
to numa
'-bind-to
{bind_to}
'
# bind processes
according to mode config
'{mca_list} {env_list} {command}'
).
format
(
trace
=
trace_command
,
host_list
=
f
'-host localhost:
{
mode
.
proc_num
}
'
if
'node_num'
in
mode
and
mode
.
node_num
==
1
else
f
'-hostfile hostfile -map-by ppr:
{
mode
.
proc_num
}
:node'
if
'host_list'
not
in
mode
else
'-host '
+
','
.
join
(
f
'
{
host
}
:
{
mode
.
proc_num
}
'
for
host
in
mode
.
host_list
),
bind_to
=
mode
.
bind_to
,
mca_list
=
' '
.
join
(
f
'-mca
{
k
}
{
v
}
'
for
k
,
v
in
mode
.
mca
.
items
()),
env_list
=
' '
.
join
(
f
'-x
{
k
}
=
{
str
(
v
).
format
(
proc_rank
=
mode
.
proc_rank
,
proc_num
=
mode
.
proc_num
)
}
'
...
...
third_party/Makefile
View file @
e4c2bd4c
...
...
@@ -16,7 +16,7 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
NUM_MAKE_JOBS
?=
$(
shell
nproc
--ignore
=
2
)
.PHONY
:
all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
.PHONY
:
all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
rocm_hpcg
# Build targets.
all
:
cuda rocm
...
...
@@ -24,7 +24,7 @@ all: cuda rocm
cuda_with_msccl
:
cuda cuda_msccl
cuda
:
common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm
:
common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
dtk
:
common rocm_perftest rocm_rccl_tests rocm_babelstream_hip megatron_deepspeed apex_rocm rocm_megatron_lm
dtk
:
common rocm_perftest rocm_rccl_tests rocm_babelstream_hip megatron_deepspeed apex_rocm rocm_megatron_lm
rocm_hpcg
cpu
:
common cpu_perftest
common
:
fio cpu_stream
...
...
@@ -192,6 +192,19 @@ ifneq (,$(wildcard BabelStream/CMakeLists.txt))
cp
-v
./BabelStream/build/hip-stream
$(SB_MICRO_PATH)
/bin/
endif
# Build rocHPCG and stage helper scripts for SuperBench DTK image.
# The recipe is only defined when rocHPCG/install.sh exists; otherwise the
# target has no commands.
rocm_hpcg: sb_micro_path
ifneq (,$(wildcard rocHPCG/install.sh))
# rochpcg-scripts-bw.patch adds run_rochpcg as a new file, so the presence of
# that file is used to avoid applying the patch a second time.
	cd ./rocHPCG && \
	if [ ! -f ./run_rochpcg ]; then \
		git apply ../rochpcg-scripts-bw.patch; \
	fi && \
	./install.sh --with-rocm=$(ROCM_PATH) --with-mpi=$(MPI_HOME) --gpu-aware-mpi=on
# Copy the built binary and the launcher script into $(SB_MICRO_PATH)/bin and
# make both executable.
	cp -v ./rocHPCG/build/release/rochpcg-install/bin/rochpcg $(SB_MICRO_PATH)/bin/
	cp -v ./rocHPCG/run_rochpcg $(SB_MICRO_PATH)/bin/
	chmod +x $(SB_MICRO_PATH)/bin/rochpcg $(SB_MICRO_PATH)/bin/run_rochpcg
endif
# Build GPCNET from commit c56fd9.
gpcnet
:
sb_micro_path
bash
-c
"source
${HPCX_HOME}
/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"
...
...
rocHPCG
@
8f2795d5
Subproject commit 8f2795d564c69f3f4c3404d34f5b78a88e9e7dcc
third_party/rochpcg-scripts-bw.patch
0 → 100644
View file @
e4c2bd4c
diff --git a/run_rochpcg b/run_rochpcg
new file mode 100755
index 0000000..0f806fe
--- /dev/null
+++ b/run_rochpcg
@@ -0,0 +1,185 @@
+#!/bin/bash
+
+# =================================================
+# Helper functions
+# =================================================
+help() {
+ cat << EOF
+rocHPCG helper script
+Usage: $(basename "$0") [OPTIONS]
+
+OPTIONS:
+ -h, --help Show this help message and exit
+ --npx Number of processes in x dimension of process grid (default: ${npx})
+ --npy Number of processes in y dimension of process grid (default: ${npy})
+ --npz Number of processes in z dimension of process grid (default: ${npz})
+ --nx Problem size in x dimension (default: ${nx})
+ --ny Problem size in y dimension (default: ${ny})
+ --nz Problem size in z dimension (default: ${nz})
+ --rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
+ --tol Residual tolerance, skip reference verification if set (default: ${tol})
+ --pz Partition boundary in z process dimension (default: 0, uniform grid)
+ --zl Local nz value for processes with z rank < pz (default: equal to ${nz})
+ --zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
+EOF
+}
+
+# =================================================
+# Global variables
+# =================================================
+npx=1
+npy=1
+npz=1
+nx=560
+ny=280
+nz=280
+runtime=60
+tol=1
+pz=0
+zl=${nz}
+zu=${nz}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+rochpcg_bin="${SCRIPT_DIR}/rochpcg"
+
+if [[ ! -x "${rochpcg_bin}" ]]; then
+ echo "Cannot find rochpcg binary at ${rochpcg_bin}"
+ exit 1
+fi
+
+# =================================================
+# Parameter parsing
+# =================================================
+GETOPT_PARSE=$(getopt --name "${0}" --options h --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu: -- "$@") \
+ || { echo "getopt invocation failed; could not parse the command line"; exit 1; }
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+ case "${1}" in
+ -h|--help) help; exit 0 ;;
+ --npx) npx=${2}; shift 2 ;;
+ --npy) npy=${2}; shift 2 ;;
+ --npz) npz=${2}; shift 2 ;;
+ --nx) nx=${2}; shift 2 ;;
+ --ny) ny=${2}; shift 2 ;;
+ --nz)
+ nz=${2}
+ zl=${nz}
+ zu=${nz}
+ shift 2 ;;
+ --rt) runtime=${2}; shift 2 ;;
+ --tol) tol=${2}; shift 2 ;;
+ --pz) pz=${2}; shift 2 ;;
+ --zl) zl=${2}; shift 2 ;;
+ --zu) zu=${2}; shift 2 ;;
+ --) shift ; break ;;
+ *) echo "Unexpected command line parameter received; aborting";
+ exit 1
+ ;;
+ esac
+done
+
+# Build rochpcg arguments
+rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz}"
+rochpcg_args+=" --nx=${nx} --ny=${ny} --nz=${nz}"
+rochpcg_args+=" --rt=${runtime}"
+rochpcg_args+=" --tol=${tol}"
+rochpcg_args+=" --pz=${pz}"
+rochpcg_args+=" --zl=${zl}"
+rochpcg_args+=" --zu=${zu}"
+
+# =================================================
+# Affinity setup
+# =================================================
+globalRank=${OMPI_COMM_WORLD_RANK:-0}
+rank=${OMPI_COMM_WORLD_LOCAL_RANK:-0}
+size=${OMPI_COMM_WORLD_LOCAL_SIZE:-1}
+
+#construct a list of all cpus, sorted by core
+cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
+
+#construct list of devices and their numa affinities
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+
+#count the cpus per core
+threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
+
+#remove the extra cpus on each core to make a list of just physical cores, then sort by numa domain
+corelist=$(echo "$cpulist" | awk -v tpc=${threads_per_core} '(NR-1)%tpc==0' | sort -k 3 -g -s)
+
+#count numa domains
+line=($(echo "$cpulist" | tail -n 1))
+n_numa=$((line[2]+1))
+
+numa_core_counts=()
+numa_proc_counts=()
+for i in $(seq 1 ${n_numa}); do numa_core_counts+=(0); numa_proc_counts+=(0); done
+
+#parse the list of cpus to array and count cpus in each numa
+cpus=()
+while read -a line; do
+ cpus+=(${line[0]})
+ ((numa_core_counts[${line[2]}]++)) || true
+done <<< "${corelist}"
+
+numa_core_offsets=(0) # offsets[i] = index in cpus[] of the first core of NUMA i = cores in NUMA 0..i-1
+for i in $(seq 1 $((n_numa-1))); do numa_core_offsets+=($((numa_core_offsets[$((i-1))] + numa_core_counts[$((i-1))]))); done
+
+#parse device to numa mapping
+device_to_numa=()
+while read -a line; do
+ device_to_numa+=(${line[1]})
+done <<< "${devicelist}"
+
+rank_to_device=()
+n_devices=$(echo "${devicelist}" | grep -c "card")
+for i in $(seq 0 $((size-1))); do
+ rank_to_device+=($((i%n_devices)))
+done
+
+mygpu=${rank_to_device[rank]}
+mynuma=${device_to_numa[mygpu]}
+
+rank_to_numa=()
+for i in $(seq 0 $((size-1))); do
+ rank_to_numa+=(${device_to_numa[${rank_to_device[$((i%n_devices))]}]})
+done
+
+for i in $(seq 0 $((size-1))); do
+ numa=${rank_to_numa[$i]}
+ ((numa_proc_counts[numa]++)) || true
+done
+
+omp_num_threads=$((numa_core_counts[mynuma]/numa_proc_counts[mynuma]))
+
+core_offset=${numa_core_offsets[mynuma]}
+for i in $(seq 0 $((rank-1))); do
+ numa=${rank_to_numa[$i]}
+ if [[ $numa -eq $mynuma ]]; then
+ core_offset=$((core_offset + omp_num_threads))
+ fi
+done
+
+omp_places="{${cpus[core_offset]}}"
+for c in $(seq 1 $((omp_num_threads-1))); do
+ omp_places+=",{${cpus[core_offset+c]}}"
+done
+
+if [[ $omp_num_threads -gt 1 ]]; then
+ places="{${cpus[core_offset]}-${cpus[core_offset+$((omp_num_threads-1))]}}"
+else
+ places="{${cpus[core_offset]}}"
+fi
+
+# Export OpenMP config
+export OMP_NUM_THREADS=${omp_num_threads}
+export OMP_PLACES=${omp_places}
+export OMP_PROC_BIND=true
+
+if [[ $globalRank -lt $size ]]; then
+ echo "Node Binding: Process $rank [(nx,ny,nz)=(${nx},${ny},${nz})] GPU: $mygpu, NUMA: $mynuma, CPU Cores: $omp_num_threads - $places"
+fi
+
+# Run
+numactl -N ${mynuma} -m ${mynuma} ${rochpcg_bin} ${rochpcg_args}
\
No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment