run_envcheck.sh 2.5 KB
Newer Older
jerrrrry's avatar
jerrrrry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash
set -eo pipefail  # strict error handling: exit on unhandled failure; a pipeline fails if any stage fails

# Directory that receives the log files written by this script.
log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"

# Banner marking the start of the system environment check.
echo "==================== 开始系统环境检查 ===================="

# Basic check helper.
# Runs one command, echoing its combined stdout/stderr to the terminal and
# to "$log_dir/<name>.log".
# Globals:   log_dir (read)
# Arguments: $1 - check name, also used as the log file basename
#            $2.. - command to execute together with its arguments
# Returns:   0 when the command|tee pipeline succeeds; 1 after appending a
#            warning line to the same log when it fails
run_test() {
  local label=$1
  shift
  local logfile="$log_dir/${label}.log"

  echo "[RUN] $label"
  if ! "$@" 2>&1 | tee "$logfile"; then
    echo "[WARN] $label 检查失败" | tee -a "$logfile"
    return 1
  fi
}

# Variant of the check helper for commands given as a single shell string
# (for example one that contains its own pipes); the string is executed by
# a child `bash -c`.
# Globals:   log_dir (read)
# Arguments: $1 - check name, also used as the log file basename
#            $2 - command line handed to `bash -c`
# Returns:   0 when the bash|tee pipeline succeeds; 1 after appending a
#            warning line to the same log when it fails
run_pipe_test() {
  local label=$1
  local script=$2
  local logfile="$log_dir/${label}.log"

  echo "[RUN] $label"
  if bash -c "$script" 2>&1 | tee "$logfile"; then
    return 0
  fi

  echo "[WARN] $label 检查失败" | tee -a "$logfile"
  return 1
}

# Basic system checks.
# The script runs under `set -e`, so a check that returns non-zero aborts
# the whole run unless its call is followed by `|| true` (best-effort).
run_test rocm_bandwidth_test rocm-bandwidth-test
run_test hy_smi hy-smi
run_test hy_smi_config hy-smi -c
run_test pip_list pip list
run_test cpu_info lscpu
run_test cpu_cores nproc
run_test memory_usage free -h
run_test disk_usage df -h
run_test hardware_info lshw -short || true
run_test network_interfaces ip a
run_test ibstat ibstat
run_test ibdev2netdev ibdev2netdev
# grep exits 1 when no line matches. An empty result is a legitimate
# finding here, not a reason to abort the remaining checks, so this call
# is best-effort as well.
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct" || true
run_test rocm_info rocminfo || true

echo "==================== RCCL-TEST ===================="
# Work inside the log directory so the checkout sits next to the logs.
cd "$log_dir"

# Build and run the RCCL tests only when both git and make are available;
# otherwise record that the section was skipped.
if command -v git &>/dev/null && command -v make &>/dev/null; then
  if [ ! -d rccl-tests ]; then
    # The repository is named "rccl-test", so a plain `git clone` would
    # create "rccl-test". Name the target directory explicitly so it matches
    # the directory tested above and entered below.
    git clone http://developer.sourcefind.cn/codes/jerrrrry/rccl-test.git rccl-tests || exit 1
  fi

  cd rccl-tests || exit 1
  # NOTE(review): presumably sets up the DTK toolchain environment needed by
  # the build below — confirm against /opt/dtk/env.sh.
  source /opt/dtk/env.sh

  if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
       CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
    # Two all_reduce_perf runs (-g 8, then -g 4), each logged separately.
    ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/all_reduce_perf_8.log"
    ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/all_reduce_perf_4.log"
  else
    echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
  fi
  cd ..
else
  echo "[WARN] 缺少git或make,跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
fi

echo "==================== DCU-ENV-CHECK ===================="
# Fetch the DCU environment check scripts unless a checkout already exists
# in the current directory.
if [ ! -d dcu_env_check ]; then
  git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
    echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
    exit 1
  }
fi

if ! cd dcu_env_check; then
  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
  exit 1
fi

# Capture the pipeline status explicitly. The earlier
# `cd … && { …; cd ..; } || { … }` form reported only the status of `cd ..`,
# so a failing system_check.sh went unnoticed. With `pipefail` (enabled at
# the top of the script) the status reflects system_check.sh, not just tee.
dcu_status=0
bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log" || dcu_status=$?
cd ..

if [ "$dcu_status" -ne 0 ]; then
  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
  exit 1
fi

# Final summary: closing banner, where the logs were saved, and a listing of
# the log directory.
{
  echo "==================== 检查完成 ===================="
  echo "所有日志已保存至: $log_dir"
}
ls -lh "$log_dir"