#!/bin/bash
#
# System environment check.
#
# Runs a series of probes (rocm-bandwidth-test, hy-smi, pip, lscpu, ibstat,
# lspci, rocminfo, ...), builds and runs rccl-tests, then runs the
# dcu_env_check suite. The combined stdout/stderr of every step is saved as
# one log file per step under $log_dir.
#
# Exit status:
#   0  every required check succeeded
#   1  a setup step failed (clone / cd / DCU check run), or at least one
#      required check failed (reported after all checks have been run)

# Strict error handling. `-u` is intentionally not enabled: the script sources
# /opt/dtk/env.sh, which may reference unset variables (not verified here).
set -eo pipefail

log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"

# Names of required checks that failed; reported at the very end so that one
# missing tool does not prevent the remaining logs from being collected.
failed_checks=()

echo "==================== 开始系统环境检查 ===================="

#######################################
# Run a command and capture its combined output in "$log_dir/<name>.log".
# Globals:   log_dir (read)
# Arguments: $1   - check name (also the log file stem)
#            $2.. - command and its arguments
# Outputs:   command output on stdout (via tee) plus a [WARN] line on failure
# Returns:   0 on success, 1 if the command (or tee) failed
#######################################
run_test() {
  local name=$1
  shift
  echo "[RUN] $name"
  # pipefail makes the pipeline fail when the command itself fails,
  # not only when tee does.
  if ! "$@" 2>&1 | tee "$log_dir/${name}.log"; then
    echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
    return 1
  fi
}

#######################################
# Same as run_test, but takes a shell snippet (e.g. containing a pipe) that is
# executed with `bash -c`. Only call with trusted, hard-coded strings.
# Globals:   log_dir (read)
# Arguments: $1 - check name (also the log file stem)
#            $2 - command line to hand to `bash -c`
# Outputs:   command output on stdout (via tee) plus a [WARN] line on failure
# Returns:   0 on success, 1 if the snippet (or tee) failed
#######################################
run_pipe_test() {
  local name=$1
  local cmd=$2
  echo "[RUN] $name"
  if ! bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log"; then
    echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
    return 1
  fi
}

#######################################
# Run a required check through run_test; on failure remember its name instead
# of letting `set -e` abort the whole script.
# Globals:   failed_checks (appended)
# Arguments: same as run_test
# Returns:   always 0
#######################################
check() {
  run_test "$@" || failed_checks+=("$1")
}

# Basic system checks
check rocm_bandwidth_test rocm-bandwidth-test
check hy_smi hy-smi
check hy_smi_config hy-smi -c
check pip_list pip list
check cpu_info lscpu
check cpu_cores nproc
check memory_usage free -h
check disk_usage df -h
# Optional: a failure here is deliberately not counted.
run_test hardware_info lshw -short || true
check network_interfaces ip a
check ibstat ibstat
check ibdev2netdev ibdev2netdev
# grep exits 1 when nothing matches, so this is recorded rather than fatal.
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct" || failed_checks+=("ACS_stat")
# Optional: a failure here is deliberately not counted.
run_test rocm_info rocminfo || true

echo "==================== RCCL-TEST ===================="
cd "$log_dir"
if command -v git &>/dev/null && command -v make &>/dev/null; then
  if [[ ! -d rccl-tests ]]; then
    git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
  fi
  cd rccl-tests || exit 1
  # Sourced in the current shell on purpose: the environment it sets up stays
  # in effect for the rest of the script.
  # shellcheck source=/dev/null
  source /opt/dtk/env.sh
  if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
    CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
    # Each run is recorded independently so that a failing 8-GPU run does not
    # prevent the 4-GPU run from being attempted.
    check all_reduce_perf_8 ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8
    check all_reduce_perf_4 ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4
  else
    echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
  fi
  cd "$log_dir"
else
  echo "[WARN] 缺少git或make,跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
fi

echo "==================== DCU-ENV-CHECK ===================="
if [[ ! -d dcu_env_check ]]; then
  if ! git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git; then
    echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
    exit 1
  fi
fi

# Explicit `if` instead of `cd && { ...; } || { ...; }`: in the latter the
# status of the group is that of its last command, so a failing
# system_check.sh was never reported. The subshell keeps the cwd unchanged.
if ! (cd dcu_env_check && bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"); then
  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
  exit 1
fi

echo "==================== 检查完成 ===================="
echo "所有日志已保存至: $log_dir"
ls -lh "$log_dir"

# Preserve a non-zero exit status when any required check failed, but only
# after every log has been collected.
if ((${#failed_checks[@]} > 0)); then
  echo "[WARN] 以下检查失败: ${failed_checks[*]}"
  exit 1
fi