Commit b623c7e9 authored by one's avatar one
Browse files

Convert rochpcg script patch into shell script

parent a10c3e15
...@@ -196,12 +196,9 @@ endif ...@@ -196,12 +196,9 @@ endif
rocm_hpcg: sb_micro_path rocm_hpcg: sb_micro_path
ifneq (,$(wildcard rocHPCG/install.sh)) ifneq (,$(wildcard rocHPCG/install.sh))
cd ./rocHPCG && \ cd ./rocHPCG && \
if [ ! -f ./run_rochpcg ]; then \
git apply ../rochpcg-scripts-bw.patch; \
fi && \
./install.sh --with-rocm=$(ROCM_PATH) --with-mpi=$(MPI_HOME) --gpu-aware-mpi=on ./install.sh --with-rocm=$(ROCM_PATH) --with-mpi=$(MPI_HOME) --gpu-aware-mpi=on
cp -v ./rocHPCG/build/release/rochpcg-install/bin/rochpcg $(SB_MICRO_PATH)/bin/ cp -v ./rocHPCG/build/release/rochpcg-install/bin/rochpcg $(SB_MICRO_PATH)/bin/
cp -v ./rocHPCG/run_rochpcg $(SB_MICRO_PATH)/bin/ cp -v ./run_rochpcg.sh $(SB_MICRO_PATH)/bin/run_rochpcg
chmod +x $(SB_MICRO_PATH)/bin/rochpcg $(SB_MICRO_PATH)/bin/run_rochpcg chmod +x $(SB_MICRO_PATH)/bin/rochpcg $(SB_MICRO_PATH)/bin/run_rochpcg
endif endif
......
diff --git a/run_rochpcg b/run_rochpcg
new file mode 100755
index 0000000..0f806fe
--- /dev/null
+++ b/run_rochpcg
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+# =================================================
+# Helper functions
+# =================================================
+help() {
+ cat << EOF
+rocHPCG helper script
+Usage: $(basename "$0") [OPTIONS]
+
+OPTIONS:
+ -h, --help Show this help message and exit
+ --npx Number of processes in x dimension of process grid (default: ${npx})
+ --npy Number of processes in y dimension of process grid (default: ${npy})
+ --npz Number of processes in z dimension of process grid (default: ${npz})
+ --nx Problem size in x dimension (default: ${nx})
+ --ny Problem size in y dimension (default: ${ny})
+ --nz Problem size in z dimension (default: ${nz})
+ --rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
+ --tol Residual tolerance, skip reference verification if set (default: ${tol})
+ --pz Partition boundary in z process dimension (default: 0, uniform grid)
+ --zl Local nz value for processes with z rank < pz (default: equal to ${nz})
+ --zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
+EOF
+}
+
+# =================================================
+# Global variables
+# =================================================
+npx=1
+npy=1
+npz=1
+nx=560
+ny=280
+nz=280
+runtime=60
+tol=1
+pz=0
+zl=${nz}
+zu=${nz}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+rochpcg_bin="${SCRIPT_DIR}/rochpcg"
+
+if [[ ! -x "${rochpcg_bin}" ]]; then
+ echo "Cannot find rochpcg binary at ${rochpcg_bin}"
+ exit 1
+fi
+
+# =================================================
+# Parameter parsing
+# =================================================
+GETOPT_PARSE=$(getopt --name "${0}" --options h --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu: -- "$@") \
+ || { echo "getopt invocation failed; could not parse the command line"; exit 1; }
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+ case "${1}" in
+ -h|--help) help; exit 0 ;;
+ --npx) npx=${2}; shift 2 ;;
+ --npy) npy=${2}; shift 2 ;;
+ --npz) npz=${2}; shift 2 ;;
+ --nx) nx=${2}; shift 2 ;;
+ --ny) ny=${2}; shift 2 ;;
+ --nz)
+ nz=${2}
+ zl=${nz}
+ zu=${nz}
+ shift 2 ;;
+ --rt) runtime=${2}; shift 2 ;;
+ --tol) tol=${2}; shift 2 ;;
+ --pz) pz=${2}; shift 2 ;;
+ --zl) zl=${2}; shift 2 ;;
+ --zu) zu=${2}; shift 2 ;;
+ --) shift ; break ;;
+ *) echo "Unexpected command line parameter received; aborting";
+ exit 1
+ ;;
+ esac
+done
+
+# Build rochpcg arguments
+rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz}"
+rochpcg_args+=" --nx=${nx} --ny=${ny} --nz=${nz}"
+rochpcg_args+=" --rt=${runtime}"
+rochpcg_args+=" --tol=${tol}"
+rochpcg_args+=" --pz=${pz}"
+rochpcg_args+=" --zl=${zl}"
+rochpcg_args+=" --zu=${zu}"
+
+# =================================================
+# Affinity setup
+# =================================================
+globalRank=${OMPI_COMM_WORLD_RANK:-0}
+rank=${OMPI_COMM_WORLD_LOCAL_RANK:-0}
+size=${OMPI_COMM_WORLD_LOCAL_SIZE:-1}
+
+#construct a list of all cpus, sorted by core
+cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
+
+#construct list of devices and their numa affinities
+devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
+
+#count the cpus per core
+threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
+
+#remove the extra cpus on each core to make a list of just physical cores, then sort by numa domain
+corelist=$(echo "$cpulist" | awk -v tpc=${threads_per_core} '(NR-1)%tpc==0' | sort -k 3 -g -s)
+
+#count numa domains
+line=($(echo "$cpulist" | tail -n 1))
+n_numa=$((line[2]+1))
+
+numa_core_counts=()
+numa_proc_counts=()
+for i in $(seq 1 ${n_numa}); do numa_core_counts+=(0); numa_proc_counts+=(0); done
+
+#parse the list of cpus to array and count cpus in each numa
+cpus=()
+while read -a line; do
+ cpus+=(${line[0]})
+ ((numa_core_counts[${line[2]}]++)) || true
+done <<< "${corelist}"
+
+numa_core_offsets=(0)
+for i in $(seq 1 $((n_numa-1))); do numa_core_offsets+=($((numa_core_offsets[$((i-1))] + numa_core_counts[$i]))); done
+
+#parse device to numa mapping
+device_to_numa=()
+while read -a line; do
+ device_to_numa+=(${line[1]})
+done <<< "${devicelist}"
+
+rank_to_device=()
+n_devices=$(echo "${devicelist}" | grep -c "card")
+for i in $(seq 0 $((size-1))); do
+ rank_to_device+=($((i%n_devices)))
+done
+
+mygpu=${rank_to_device[rank]}
+mynuma=${device_to_numa[mygpu]}
+
+rank_to_numa=()
+for i in $(seq 0 $((size-1))); do
+ rank_to_numa+=(${device_to_numa[${rank_to_device[$((i%n_devices))]}]})
+done
+
+for i in $(seq 0 $((size-1))); do
+ numa=${rank_to_numa[$i]}
+ ((numa_proc_counts[numa]++)) || true
+done
+
+omp_num_threads=$((numa_core_counts[mynuma]/numa_proc_counts[mynuma]))
+
+core_offset=${numa_core_offsets[mynuma]}
+for i in $(seq 0 $((rank-1))); do
+ numa=${rank_to_numa[$i]}
+ if [[ $numa -eq $mynuma ]]; then
+ core_offset=$((core_offset + omp_num_threads))
+ fi
+done
+
+omp_places="{${cpus[core_offset]}}"
+for c in $(seq 1 $((omp_num_threads-1))); do
+ omp_places+=",{${cpus[core_offset+c]}}"
+done
+
+if [[ $omp_num_threads -gt 1 ]]; then
+ places="{${cpus[core_offset]}-${cpus[core_offset+$((omp_num_threads-1))]}}"
+else
+ places="{${cpus[core_offset]}}"
+fi
+
+# Export OpenMP config
+export OMP_NUM_THREADS=${omp_num_threads}
+export OMP_PLACES=${omp_places}
+export OMP_PROC_BIND=true
+
+if [[ $globalRank -lt $size ]]; then
+ echo "Node Binding: Process $rank [(nx,ny,nz)=(${nx},${ny},${nz})] GPU: $mygpu, NUMA: $mynuma, CPU Cores: $omp_num_threads - $places"
+fi
+
+# Run
+numactl -N ${mynuma} -m ${mynuma} ${rochpcg_bin} ${rochpcg_args}
\ No newline at end of file
#!/bin/bash
# =================================================
# Helper functions
# =================================================
help() {
cat << EOF
rocHPCG helper script
Usage: $(basename "$0") [OPTIONS]
OPTIONS:
-h, --help Show this help message and exit
--npx Number of processes in x dimension of process grid (default: ${npx})
--npy Number of processes in y dimension of process grid (default: ${npy})
--npz Number of processes in z dimension of process grid (default: ${npz})
--nx Problem size in x dimension (default: ${nx})
--ny Problem size in y dimension (default: ${ny})
--nz Problem size in z dimension (default: ${nz})
--rt Benchmarking time in seconds (> 1800s for official runs) (default: ${runtime})
--tol Residual tolerance, skip reference verification if set (default: ${tol})
--pz Partition boundary in z process dimension (default: 0, uniform grid)
--zl Local nz value for processes with z rank < pz (default: equal to ${nz})
--zu Local nz value for processes with z rank >= pz (default: equal to ${nz})
EOF
}
# =================================================
# Global variables
# =================================================
npx=1
npy=1
npz=1
nx=560
ny=280
nz=280
runtime=60
tol=1
pz=0
zl=${nz}
zu=${nz}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
rochpcg_bin="${SCRIPT_DIR}/rochpcg"
if [[ ! -x "${rochpcg_bin}" ]]; then
echo "Cannot find rochpcg binary at ${rochpcg_bin}"
exit 1
fi
# =================================================
# Parameter parsing
# =================================================
GETOPT_PARSE=$(getopt --name "${0}" --options h --longoptions help,npx:,npy:,npz:,nx:,ny:,nz:,rt:,tol:,pz:,zl:,zu: -- "$@") \
|| { echo "getopt invocation failed; could not parse the command line"; exit 1; }
eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
-h|--help) help; exit 0 ;;
--npx) npx=${2}; shift 2 ;;
--npy) npy=${2}; shift 2 ;;
--npz) npz=${2}; shift 2 ;;
--nx) nx=${2}; shift 2 ;;
--ny) ny=${2}; shift 2 ;;
--nz)
nz=${2}
zl=${nz}
zu=${nz}
shift 2 ;;
--rt) runtime=${2}; shift 2 ;;
--tol) tol=${2}; shift 2 ;;
--pz) pz=${2}; shift 2 ;;
--zl) zl=${2}; shift 2 ;;
--zu) zu=${2}; shift 2 ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
;;
esac
done
# Build rochpcg arguments
rochpcg_args="--npx=${npx} --npy=${npy} --npz=${npz}"
rochpcg_args+=" --nx=${nx} --ny=${ny} --nz=${nz}"
rochpcg_args+=" --rt=${runtime}"
rochpcg_args+=" --tol=${tol}"
rochpcg_args+=" --pz=${pz}"
rochpcg_args+=" --zl=${zl}"
rochpcg_args+=" --zu=${zu}"
# =================================================
# Affinity setup
# =================================================
globalRank=${OMPI_COMM_WORLD_RANK:-0}
rank=${OMPI_COMM_WORLD_LOCAL_RANK:-0}
size=${OMPI_COMM_WORLD_LOCAL_SIZE:-1}
#construct a list of all cpus, sorted by core
cpulist=$(lscpu --parse=CPU,CORE,NODE | awk '!/#/' | tr ',' "\t" | sort -k 2 -g -s)
#construct list of devices and their numa affinities
devicelist=$(hy-smi --csv --showtoponuma | tail -n +2 | tr ',' "\t")
#count the cpus per core
threads_per_core=$(echo "${cpulist}" | grep -c ".* 0 .*")
#remove the extra cpus on each core to make a list of just physical cores, then sort by numa domain
corelist=$(echo "$cpulist" | awk -v tpc=${threads_per_core} '(NR-1)%tpc==0' | sort -k 3 -g -s)
#count numa domains
line=($(echo "$cpulist" | tail -n 1))
n_numa=$((line[2]+1))
numa_core_counts=()
numa_proc_counts=()
for i in $(seq 1 ${n_numa}); do numa_core_counts+=(0); numa_proc_counts+=(0); done
#parse the list of cpus to array and count cpus in each numa
cpus=()
while read -a line; do
cpus+=(${line[0]})
((numa_core_counts[${line[2]}]++)) || true
done <<< "${corelist}"
numa_core_offsets=(0)
for i in $(seq 1 $((n_numa-1))); do numa_core_offsets+=($((numa_core_offsets[$((i-1))] + numa_core_counts[$i]))); done
#parse device to numa mapping
device_to_numa=()
while read -a line; do
device_to_numa+=(${line[1]})
done <<< "${devicelist}"
rank_to_device=()
n_devices=$(echo "${devicelist}" | grep -c "card")
for i in $(seq 0 $((size-1))); do
rank_to_device+=($((i%n_devices)))
done
mygpu=${rank_to_device[rank]}
mynuma=${device_to_numa[mygpu]}
rank_to_numa=()
for i in $(seq 0 $((size-1))); do
rank_to_numa+=(${device_to_numa[${rank_to_device[$((i%n_devices))]}]})
done
for i in $(seq 0 $((size-1))); do
numa=${rank_to_numa[$i]}
((numa_proc_counts[numa]++)) || true
done
omp_num_threads=$((numa_core_counts[mynuma]/numa_proc_counts[mynuma]))
core_offset=${numa_core_offsets[mynuma]}
for i in $(seq 0 $((rank-1))); do
numa=${rank_to_numa[$i]}
if [[ $numa -eq $mynuma ]]; then
core_offset=$((core_offset + omp_num_threads))
fi
done
omp_places="{${cpus[core_offset]}}"
for c in $(seq 1 $((omp_num_threads-1))); do
omp_places+=",{${cpus[core_offset+c]}}"
done
if [[ $omp_num_threads -gt 1 ]]; then
places="{${cpus[core_offset]}-${cpus[core_offset+$((omp_num_threads-1))]}}"
else
places="{${cpus[core_offset]}}"
fi
# Export OpenMP config
export OMP_NUM_THREADS=${omp_num_threads}
export OMP_PLACES=${omp_places}
export OMP_PROC_BIND=true
if [[ $globalRank -lt $size ]]; then
echo "Node Binding: Process $rank [(nx,ny,nz)=(${nx},${ny},${nz})] GPU: $mygpu, NUMA: $mynuma, CPU Cores: $omp_num_threads - $places"
fi
# Run
numactl -N ${mynuma} -m ${mynuma} ${rochpcg_bin} ${rochpcg_args}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment