Commit 4c47ac03 authored by jerrrrry's avatar jerrrrry
Browse files

Upload New File

parent 3ab48f5d
#!/bin/bash
# Strict error handling: abort on an unhandled command failure, and make a
# pipeline fail when any of its stages fails (not only the last one).
set -e
set -o pipefail

# Directory that receives one log file per check; created up front so every
# later `tee` has somewhere to write.
log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"

echo "==================== 开始系统环境检查 ===================="
# Basic check helper.
#######################################
# Runs one command and mirrors its combined stdout/stderr to the console and
# to "$log_dir/<name>.log".
# Globals:   log_dir (read)
# Arguments: $1 - check name (also the log file stem)
#            $2.. - command and its arguments, executed as-is
# Outputs:   "[RUN] <name>", the command output, and a "[WARN]" line on failure
# Returns:   0 if the pipeline succeeded, 1 otherwise
#######################################
run_test() {
  local check_name=$1
  shift
  local log_file="$log_dir/${check_name}.log"

  echo "[RUN] $check_name"
  # Whether a failure of the command itself (rather than of tee) is detected
  # depends on the caller having enabled `pipefail`.
  if ! "$@" 2>&1 | tee "$log_file"; then
    echo "[WARN] $check_name 检查失败" | tee -a "$log_file"
    return 1
  fi
}
#######################################
# Runs a shell command string (which may itself contain pipes) through
# `bash -c`, mirroring its combined stdout/stderr to the console and to
# "$log_dir/<name>.log".
# Globals:   log_dir (read)
# Arguments: $1 - check name (also the log file stem)
#            $2 - command string handed verbatim to `bash -c`
# Outputs:   "[RUN] <name>", the command output, and a "[WARN]" line on failure
# Returns:   0 if the pipeline succeeded, 1 otherwise
#######################################
run_pipe_test() {
  local check_name=$1
  local pipeline_cmd=$2
  local log_file="$log_dir/${check_name}.log"

  echo "[RUN] $check_name"
  if bash -c "$pipeline_cmd" 2>&1 | tee "$log_file"; then
    return 0
  fi

  echo "[WARN] $check_name 检查失败" | tee -a "$log_file"
  return 1
}
# Baseline system checks.
#
# Every check is best-effort. The script runs under `set -e`, so an unguarded
# `run_test` that returns 1 (for example because a tool such as ibstat is not
# installed) would abort the whole script and skip every later check. Each
# call is therefore guarded, and the names of failed checks are collected so
# a single summary can be printed at the end of this section.
failed_checks=()

run_test rocm_bandwidth_test rocm-bandwidth-test || failed_checks+=(rocm_bandwidth_test)
run_test hy_smi hy-smi || failed_checks+=(hy_smi)
run_test hy_smi_config hy-smi -c || failed_checks+=(hy_smi_config)
run_test pip_list pip list || failed_checks+=(pip_list)
run_test cpu_info lscpu || failed_checks+=(cpu_info)
run_test cpu_cores nproc || failed_checks+=(cpu_cores)
run_test memory_usage free -h || failed_checks+=(memory_usage)
run_test disk_usage df -h || failed_checks+=(disk_usage)
run_test hardware_info lshw -short || failed_checks+=(hardware_info)
run_test network_interfaces ip a || failed_checks+=(network_interfaces)
run_test ibstat ibstat || failed_checks+=(ibstat)
run_test ibdev2netdev ibdev2netdev || failed_checks+=(ibdev2netdev)
# grep exits non-zero when no line matches, so "no ACS control lines found"
# is reported as a failed check here rather than killing the script.
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct" || failed_checks+=(ACS_stat)
run_test rocm_info rocminfo || failed_checks+=(rocm_info)

if (( ${#failed_checks[@]} > 0 )); then
  echo "[WARN] 以下检查失败: ${failed_checks[*]}"
fi
echo "==================== RCCL-TEST ===================="
# Build and run rccl-tests inside the log directory so the checkout lives
# next to the logs. Use $log_dir rather than repeating the literal path.
cd "$log_dir"
if command -v git &>/dev/null && command -v make &>/dev/null; then
  if [[ ! -d rccl-tests ]]; then
    git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || {
      echo "[ERROR] rccl-tests克隆失败" | tee "$log_dir/rccl_clone_fail.log"
      exit 1
    }
  fi
  cd rccl-tests || exit 1
  # Sourced in the current shell on purpose: whatever the DTK environment
  # file sets up stays in effect for the rest of the script.
  # shellcheck source=/dev/null
  source /opt/dtk/env.sh
  if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
    CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
    # Each benchmark is best-effort. With errexit + pipefail an unguarded
    # failing run (e.g. fewer than 8 devices present) would silently abort
    # the script before the 4-device run and the DCU check. run_test writes
    # to the same "$log_dir/<name>.log" files and records a [WARN] on failure.
    run_test all_reduce_perf_8 ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 || true
    run_test all_reduce_perf_4 ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 || true
  else
    echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
  fi
  # Return by absolute path instead of relying on `cd ..`.
  cd "$log_dir"
else
  echo "[WARN] 缺少git或make,跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
fi
echo "==================== DCU-ENV-CHECK ===================="
# The checkout is kept next to the logs; make the working directory explicit
# instead of depending on where the previous section left the shell.
cd "$log_dir"
if [[ ! -d dcu_env_check ]]; then
  git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
    echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
    exit 1
  }
fi
# Use an explicit `if` rather than `cd X && { ...; cd ..; } || { ... }`.
# In that form errexit is ignored inside the middle group and the group's
# status is that of the trailing `cd ..`, so a failing system_check.sh never
# reached the error branch. The subshell keeps the caller's working directory
# unchanged and, with pipefail, yields the status of system_check.sh itself.
if ! (
  cd dcu_env_check \
    && bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"
); then
  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
  exit 1
fi
# Final summary: announce completion, say where the logs are, and list them.
printf '%s\n' "==================== 检查完成 ===================="
printf '%s\n' "所有日志已保存至: $log_dir"
ls -lh "$log_dir"
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment