run_envcheck.sh 2.51 KB
Newer Older
jerrrrry's avatar
jerrrrry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/bash
set -eo pipefail  # strict error handling: exit on an unhandled failure; a pipeline fails if any stage fails

# Directory that receives every log file written by this script; created up
# front (with parents) so the tee calls further down can write into it.
log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"

# Opening banner for the whole run.
echo "==================== 开始系统环境检查 ===================="

#######################################
# Run one check command and capture its combined output in a log file.
# Globals:   log_dir (read) - directory that receives "<name>.log"
# Arguments: $1   - check name, used in messages and as the log file stem
#            $2.. - command and its arguments, executed as given
# Outputs:   prints "[RUN] <name>", then the command's stdout+stderr, to
#            stdout; the command output is also written to
#            "$log_dir/<name>.log". On failure a "[WARN]" line is printed
#            and appended to the same log.
# Returns:   0 on success, 1 if the command (or tee) failed,
#            2 if called without a name and a command
#######################################
run_test() {
  if [[ $# -lt 2 ]]; then
    echo "usage: run_test NAME COMMAND [ARGS...]" >&2
    return 2
  fi

  local name=$1
  shift
  local log_file="$log_dir/${name}.log"

  echo "[RUN] $name"

  # pipefail is switched on inside a subshell so the command's own exit
  # status decides the outcome even when the calling shell has pipefail off;
  # otherwise the pipeline would report tee's status and hide the failure.
  # The command already ran in a subshell as a pipeline stage, so this does
  # not change what it can affect in the caller.
  if ! ( set -o pipefail; "$@" 2>&1 | tee "$log_file" ); then
    echo "[WARN] $name 检查失败" | tee -a "$log_file"
    return 1
  fi
}

#######################################
# Run a check given as a single shell command string (so it may contain
# pipes or other shell syntax) and capture its combined output in a log file.
# Globals:   log_dir (read) - directory that receives "<name>.log"
# Arguments: $1 - check name, used in messages and as the log file stem
#            $2 - command line, executed via `bash -c`
# Outputs:   prints "[RUN] <name>", then the command's stdout+stderr, to
#            stdout; the command output is also written to
#            "$log_dir/<name>.log". On failure a "[WARN]" line is printed
#            and appended to the same log.
# Returns:   0 on success, 1 if the command line (or tee) failed,
#            2 if called without a name and a command string
#######################################
run_pipe_test() {
  if [[ $# -lt 2 ]]; then
    echo "usage: run_pipe_test NAME 'COMMAND LINE'" >&2
    return 2
  fi

  local name=$1
  local cmd=$2
  local log_file="$log_dir/${name}.log"

  echo "[RUN] $name"

  # pipefail is enabled only for this outer pipeline (in a subshell) so the
  # exit status of `bash -c` is not masked by tee when the calling shell has
  # pipefail off. The inner command string still runs with bash defaults,
  # i.e. its status is that of its own last stage.
  if ! ( set -o pipefail; bash -c "$cmd" 2>&1 | tee "$log_file" ); then
    echo "[WARN] $name 检查失败" | tee -a "$log_file"
    return 1
  fi
}

# Basic system checks. Each call logs its output to "$log_dir/<name>.log".
# Because `set -e` is active and the helpers return 1 on failure, a failing
# check stops the whole script at that line unless the call is followed by
# `|| true`.
run_test rocm_bandwidth_test rocm-bandwidth-test
run_test hy_smi hy-smi
run_test hy_smi_config hy-smi -c
run_test pip_list pip list
run_test cpu_info lscpu
run_test cpu_cores nproc
run_test memory_usage free -h
run_test disk_usage df -h
run_test hardware_info lshw -short || true  # failure tolerated: script continues
run_test network_interfaces ip a
run_test ibstat ibstat
run_test ibdev2netdev ibdev2netdev
# NOTE(review): grep exits non-zero when no line matches, so a system whose
# lspci output has no "acsct" lines is reported as a failed check and, with
# no `|| true` here, ends the script — confirm this is intended.
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct"
run_test rocm_info rocminfo || true  # failure tolerated: script continues

echo "==================== RCCL-TEST ===================="
# Work inside the log directory so the rccl-tests checkout lives next to the
# logs. Uses $log_dir rather than repeating the literal path, so the two
# cannot drift apart if the directory is ever changed.
cd "$log_dir"

# The benchmark is built from source, which needs both git and make;
# without them the section is skipped and a marker log is written.
if command -v git &>/dev/null && command -v make &>/dev/null; then
  # Shallow-clone the sources only when no checkout exists yet.
  if [ ! -d rccl-tests ]; then
    git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || {
      # Leave a marker log on clone failure, as the DCU section does.
      echo "[ERROR] RCCL测试代码克隆失败" | tee "$log_dir/rccl_clone_fail.log"
      exit 1
    }
  fi

  cd rccl-tests || exit 1
  # Load the DTK environment into this shell before building.
  source /opt/dtk/env.sh

  if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
       CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
    # Two all_reduce_perf runs, logged separately. Presumably -g is the GPU
    # count (8, then 4), matching the log names — confirm against rccl-tests.
    # NOTE(review): with `set -eo pipefail` a failing run ends the script
    # here, so the second run and later sections are skipped — confirm.
    ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/all_reduce_perf_8.log"
    ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/all_reduce_perf_4.log"
  else
    # A build failure is recorded but does not stop the script.
    echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
  fi
  # Return to the log directory for the sections that follow.
  cd ..
else
  echo "[WARN] 缺少git或make,跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
fi

echo "==================== DCU-ENV-CHECK ===================="
# Clone the DCU environment check suite into the current directory unless a
# checkout is already present; a clone failure is logged and is fatal.
if [ ! -d dcu_env_check ]; then
  git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
    echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
    exit 1
  }
fi

# Explicit if-statements replace `cd X && { run | tee; cd ..; } || { error; }`.
# In that form the group's status was that of `cd ..`, and `set -e` is
# suppressed inside such a list, so a failing system_check.sh never reached
# the error handler below.
if ! cd dcu_env_check; then
  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
  exit 1
fi

# Run the suite, mirroring its output into the log. pipefail is set in the
# subshell so the status reflects system_check.sh rather than tee.
dcu_status=0
( set -o pipefail; bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log" ) || dcu_status=$?

# Leave the checkout again before reporting, as the original did.
cd ..

if [ "$dcu_status" -ne 0 ]; then
  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
  exit 1
fi

# Closing banner and the log location, then a size-annotated listing of the
# log directory.
printf '%s\n' \
  "==================== 检查完成 ====================" \
  "所有日志已保存至: $log_dir"
ls -lh "$log_dir"