Commit e4c2bd4c authored by one
Browse files

MicroBenchmark: rocHPCG

parent 211e63c7
......@@ -36,3 +36,6 @@
[submodule "third_party/BabelStream"]
path = third_party/BabelStream
url = https://github.com/UoB-HPC/BabelStream.git
[submodule "third_party/rocHPCG"]
path = third_party/rocHPCG
url = https://github.com/ROCm/rocHPCG.git
......@@ -6,6 +6,7 @@
from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmark, MicroBenchmarkWithInvoke
from superbench.benchmarks.micro_benchmarks.gemm_flops_performance_base import GemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.memory_bw_performance_base import MemBwBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_hpcg_performance_base import GpuHpcgBenchmark
from superbench.benchmarks.micro_benchmarks.computation_communication_overlap import ComputationCommunicationOverlap
from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
......@@ -14,6 +15,7 @@
from superbench.benchmarks.micro_benchmarks.rocm_hipblaslt_function import RocmHipBlasLtBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_hipblaslt_function import DtkHipBlasLtBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_gemm_flops_performance import DtkGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_hpcg_performance import DtkHpcgBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance import CudaGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
......@@ -63,6 +65,7 @@
'GemmFlopsBenchmark',
'GpuBurnBenchmark',
'GpuCopyBwBenchmark',
'GpuHpcgBenchmark',
'GpuStreamBenchmark',
'IBBenchmark',
'IBLoopbackBenchmark',
......@@ -80,5 +83,6 @@
'DirectXGPUCopyBw',
'DirectXGPUMemBw',
'DirectXGPUCoreFlops',
'DtkHpcgBenchmark',
'NvBandwidthBenchmark',
]
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the DTK HPCG benchmark."""
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.benchmarks.micro_benchmarks import GpuHpcgBenchmark
class DtkHpcgBenchmark(GpuHpcgBenchmark):
    """HPCG micro-benchmark for the DTK platform, launched through ``run_rochpcg``."""
    def __init__(self, name, parameters=''):
        """Initialize the benchmark and select the launcher executable.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        # File name of the launcher; the base class joins it with the binary directory.
        self._bin_name = 'run_rochpcg'
# Expose DtkHpcgBenchmark under the benchmark name 'gpu-hpcg' for the DTK platform.
BenchmarkRegistry.register_benchmark('gpu-hpcg', DtkHpcgBenchmark, platform=Platform.DTK)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the GPU HPCG benchmark base class."""
import os
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
    """Shared base class for GPU HPCG micro-benchmarks.

    The class registers the HPCG command-line options and turns them into a single
    launcher command. ``self._bin_name`` is read when the command is built, so a
    concrete benchmark has to set it before ``_preprocess()`` runs.
    """
    def __init__(self, name, parameters=''):
        """Initialize the benchmark.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

    def add_parser_arguments(self):
        """Register the HPCG options on the benchmark argument parser."""
        super().add_parser_arguments()

        # One row per option: (flag, value type, extra add_argument kwargs, help text).
        # --zl and --zu carry no default on purpose; they fall back to --nz in _preprocess().
        option_specs = (
            ('--npx', int, {'default': 1}, 'Number of MPI processes in x dimension.'),
            ('--npy', int, {'default': 1}, 'Number of MPI processes in y dimension.'),
            ('--npz', int, {'default': 1}, 'Number of MPI processes in z dimension.'),
            ('--nx', int, {'default': 560}, 'Local problem size in x dimension.'),
            ('--ny', int, {'default': 280}, 'Local problem size in y dimension.'),
            ('--nz', int, {'default': 280}, 'Local problem size in z dimension.'),
            ('--rt', int, {'default': 60}, 'Benchmark runtime in seconds.'),
            ('--tol', float, {'default': 1.0}, 'Residual tolerance; reference verification is skipped if set.'),
            ('--pz', int, {'default': 0}, 'Partition boundary in z process dimension.'),
            ('--zl', int, {}, 'Local nz value for processes with z rank < pz.'),
            ('--zu', int, {}, 'Local nz value for processes with z rank >= pz.'),
        )
        for flag, value_type, extra, help_text in option_specs:
            self._parser.add_argument(flag, type=value_type, required=False, help=help_text, **extra)

    def _preprocess(self):
        """Build the launcher command line from the parsed arguments.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        args = self._args
        launcher = os.path.join(args.bin_dir, self._bin_name)

        # An unset --zl / --zu means "same as --nz".
        lower_nz = args.nz if args.zl is None else args.zl
        upper_nz = args.nz if args.zu is None else args.zu

        options = [
            ('npx', args.npx),
            ('npy', args.npy),
            ('npz', args.npz),
            ('nx', args.nx),
            ('ny', args.ny),
            ('nz', args.nz),
            ('rt', args.rt),
            ('tol', args.tol),
            ('pz', args.pz),
            ('zl', lower_nz),
            ('zu', upper_nz),
        ]
        self._commands = [launcher + ''.join(f' --{key}={value}' for key, value in options)]
        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Store the raw output of a command; no metrics are extracted here.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            bool: Always True for now.
        """
        raw_key = f'raw_output_{cmd_idx!s}'
        self._result.add_raw_data(raw_key, raw_output, self._args.log_raw_data)
        return True
......@@ -91,6 +91,8 @@ def __validate_sb_config(self): # noqa: C901
'btl_tcp_if_exclude': 'lo,docker0',
'coll_hcoll_enable': 0,
}
if 'bind_to' not in mode:
self._sb_benchmarks[name].modes[idx].bind_to = 'numa'
for key in ['PATH', 'LD_LIBRARY_PATH', 'SB_MICRO_PATH', 'SB_WORKSPACE']:
self._sb_benchmarks[name].modes[idx].env.setdefault(key, None)
if 'pattern' in mode:
......@@ -182,13 +184,14 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
'-tag-output ' # tag mpi output with [jobid,rank]<stdout/stderr> prefix
'-allow-run-as-root ' # allow mpirun to run when executed by root user
'{host_list} ' # use prepared hostfile or specify nodes and launch {proc_num} processes on each node
'-bind-to numa ' # bind processes to numa
'-bind-to {bind_to} ' # bind processes according to mode config
'{mca_list} {env_list} {command}'
).format(
trace=trace_command,
host_list=f'-host localhost:{mode.proc_num}' if 'node_num' in mode and mode.node_num == 1 else
f'-hostfile hostfile -map-by ppr:{mode.proc_num}:node' if 'host_list' not in mode else '-host ' +
','.join(f'{host}:{mode.proc_num}' for host in mode.host_list),
bind_to=mode.bind_to,
mca_list=' '.join(f'-mca {k} {v}' for k, v in mode.mca.items()),
env_list=' '.join(
f'-x {k}={str(v).format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)}'
......
......@@ -16,7 +16,7 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
.PHONY: all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
.PHONY: all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm rocm_hpcg
# Build targets.
all: cuda rocm
......@@ -24,7 +24,7 @@ all: cuda rocm
cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
dtk: common rocm_perftest rocm_rccl_tests rocm_babelstream_hip megatron_deepspeed apex_rocm rocm_megatron_lm
dtk: common rocm_perftest rocm_rccl_tests rocm_babelstream_hip megatron_deepspeed apex_rocm rocm_megatron_lm rocm_hpcg
cpu: common cpu_perftest
common: fio cpu_stream
......@@ -192,6 +192,19 @@ ifneq (,$(wildcard BabelStream/CMakeLists.txt))
cp -v ./BabelStream/build/hip-stream $(SB_MICRO_PATH)/bin/
endif
# Build rocHPCG and stage helper scripts for SuperBench DTK image.
# The recipe is only defined when rocHPCG/install.sh exists; otherwise the target
# has an empty recipe and only its sb_micro_path prerequisite runs.
# rochpcg-scripts-bw.patch is applied only while ./run_rochpcg is absent from the
# checkout, so running the target again does not re-apply the patch.
# Both the rochpcg binary and the run_rochpcg helper end up, executable, in
# $(SB_MICRO_PATH)/bin.
rocm_hpcg: sb_micro_path
ifneq (,$(wildcard rocHPCG/install.sh))
cd ./rocHPCG && \
if [ ! -f ./run_rochpcg ]; then \
git apply ../rochpcg-scripts-bw.patch; \
fi && \
./install.sh --with-rocm=$(ROCM_PATH) --with-mpi=$(MPI_HOME) --gpu-aware-mpi=on
cp -v ./rocHPCG/build/release/rochpcg-install/bin/rochpcg $(SB_MICRO_PATH)/bin/
cp -v ./rocHPCG/run_rochpcg $(SB_MICRO_PATH)/bin/
chmod +x $(SB_MICRO_PATH)/bin/rochpcg $(SB_MICRO_PATH)/bin/run_rochpcg
endif
# Build GPCNET from commit c56fd9.
gpcnet: sb_micro_path
bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"
......
Subproject commit 8f2795d564c69f3f4c3404d34f5b78a88e9e7dcc
diff --git a/run_rochpcg b/run_rochpcg
new file mode 100755
index 0000000..0f806fe
--- /dev/null
+++ b/run_rochpcg
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+# =================================================
+# Helper functions: help() prints the usage text; ${...} defaults are expanded when it runs
+# =================================================
+help() {
+ cat << EOF
+rocHPCG helper script
+Usage: $(basename "$0") [OPTIONS]
+
+OPTIONS:
+ -h, --help Show this help message and exit
+ --npx Number of processes in x dimension of process grid (default: ${npx})
+ --npy Number of processes in y dimension of process grid (default: ${npy})
+ --npz Number of processes in z dimension of process grid (default: ${npz})
+ --nx Problem size in x dimension (default: ${nx})
+ --ny Problem size in y dimension (default: ${ny})
+ --nz Problem size in z dimension (default: ${nz})
+ --rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
+ --tol Residual tolerance, skip reference verification if set (default: ${tol})
+ --pz Partition boundary in z process dimension (default: 0, uniform grid)
+ --zl Local nz value for processes with z rank < pz (default: equal to ${nz})
+ --zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
+EOF
+}
+
+# =================================================
+# Global variables
+# =================================================
+npx=1
+npy=1
+npz=1
+nx=560
+ny=280
+nz=280
+runtime=60
+tol=1
+pz=0
+zl=${nz}
+zu=${nz}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+rochpcg_bin="${SCRIPT_DIR}/rochpcg"
+
+if [[ ! -x "${rochpcg_bin}" ]]; then
+ echo "Cannot find rochpcg binary at ${rochpcg_bin}"
+ exit 1
+fi
+
+# =================================================
+# Parameter parsing
+# =================================================
+GETOPT_PARSE=$(getopt --name "${0}" --options h --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu: -- "$@") \
+ || { echo "getopt invocation failed; could not parse the command line"; exit 1; }
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+ case "${1}" in
+ -h|--help) help; exit 0 ;;
+ --npx) npx=${2}; shift 2 ;;
+ --npy) npy=${2}; shift 2 ;;
+ --npz) npz=${2}; shift 2 ;;
+ --nx) nx=${2}; shift 2 ;;
+ --ny) ny=${2}; shift 2 ;;
+ --nz)
+ nz=${2}
+ zl=${nz}
+ zu=${nz}
+ shift 2 ;;
+ --rt) runtime=${2}; shift 2 ;;
+ --tol) tol=${2}; shift 2 ;;
+ --pz) pz=${2}; shift 2 ;;
+ --zl) zl=${2}; shift 2 ;;
+ --zu) zu=${2}; shift 2 ;;
+ --) shift ; break ;;
+ *) echo "Unexpected command line parameter received; aborting";
+ exit 1
+ ;;
+ esac
+done
+
+# Build rochpcg arguments
+rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz}"
+rochpcg_args+=" --nx=${nx} --ny=${ny} --nz=${nz}"
+rochpcg_args+=" --rt=${runtime}"
+rochpcg_args+=" --tol=${tol}"
+rochpcg_args+=" --pz=${pz}"
+rochpcg_args+=" --zl=${zl}"
+rochpcg_args+=" --zu=${zu}"
+
+# =================================================
+# Affinity setup
+# =================================================
+globalRank=${OMPI_COMM_WORLD_RANK:-0}
+rank=${OMPI_COMM_WORLD_LOCAL_RANK:-0}
+size=${OMPI_COMM_WORLD_LOCAL_SIZE:-1}
+
+#construct a list of all cpus, sorted by core
+cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
+
+#construct list of devices and their numa affinities
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+
+#count the cpus per core
+threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
+
+#remove the extra cpus on each core to make a list of just physical cores, then sort by numa domain
+corelist=$(echo "$cpulist" | awk -v tpc=${threads_per_core} '(NR-1)%tpc==0' | sort -k 3 -g -s)
+
+#count numa domains
+line=($(echo "$cpulist" | tail -n 1))
+n_numa=$((line[2]+1))
+
+numa_core_counts=()
+numa_proc_counts=()
+for i in $(seq 1 ${n_numa}); do numa_core_counts+=(0); numa_proc_counts+=(0); done
+
+#parse the list of cpus to array and count cpus in each numa
+cpus=()
+while read -a line; do
+ cpus+=(${line[0]})
+ ((numa_core_counts[${line[2]}]++)) || true
+done <<< "${corelist}"
+
+numa_core_offsets=(0)
+for i in $(seq 1 $((n_numa-1))); do numa_core_offsets+=($((numa_core_offsets[$((i-1))] + numa_core_counts[$i]))); done
+
+#parse device to numa mapping
+device_to_numa=()
+while read -a line; do
+ device_to_numa+=(${line[1]})
+done <<< "${devicelist}"
+
+rank_to_device=()
+n_devices=$(echo "${devicelist}" | grep -c "card")
+for i in $(seq 0 $((size-1))); do
+ rank_to_device+=($((i%n_devices)))
+done
+
+mygpu=${rank_to_device[rank]}
+mynuma=${device_to_numa[mygpu]}
+
+rank_to_numa=()
+for i in $(seq 0 $((size-1))); do
+ rank_to_numa+=(${device_to_numa[${rank_to_device[$((i%n_devices))]}]})
+done
+
+for i in $(seq 0 $((size-1))); do
+ numa=${rank_to_numa[$i]}
+ ((numa_proc_counts[numa]++)) || true
+done
+
+omp_num_threads=$((numa_core_counts[mynuma]/numa_proc_counts[mynuma]))
+
+core_offset=${numa_core_offsets[mynuma]}
+for i in $(seq 0 $((rank-1))); do
+ numa=${rank_to_numa[$i]}
+ if [[ $numa -eq $mynuma ]]; then
+ core_offset=$((core_offset + omp_num_threads))
+ fi
+done
+
+omp_places="{${cpus[core_offset]}}"
+for c in $(seq 1 $((omp_num_threads-1))); do
+ omp_places+=",{${cpus[core_offset+c]}}"
+done
+
+if [[ $omp_num_threads -gt 1 ]]; then
+ places="{${cpus[core_offset]}-${cpus[core_offset+$((omp_num_threads-1))]}}"
+else
+ places="{${cpus[core_offset]}}"
+fi
+
+# Export OpenMP config
+export OMP_NUM_THREADS=${omp_num_threads}
+export OMP_PLACES=${omp_places}
+export OMP_PROC_BIND=true
+
+if [[ $globalRank -lt $size ]]; then
+ echo "Node Binding: Process $rank [(nx,ny,nz)=(${nx},${ny},${nz})] GPU: $mygpu, NUMA: $mynuma, CPU Cores: $omp_num_threads - $places"
+fi
+
+# Run (binary path quoted so install dirs with spaces work; arguments are word-split on purpose)
+numactl -N ${mynuma} -m ${mynuma} "${rochpcg_bin}" ${rochpcg_args}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment