#!/bin/bash
# run_envcheck.sh
# Runs a series of system / DCU environment checks and stores the output of
# each check as a separate log file under $log_dir.
set -eo pipefail  # strict error handling: stop on unhandled failures, fail a pipeline if any stage fails

# Directory that receives one log file per check (created if missing).
log_dir="/workspace/test/env_check_outputs/"
mkdir -p "$log_dir"

echo "==================== 开始系统环境检查 ===================="

#######################################
# Run a single check command, show its output and save it to a log file.
# A failing check is reported but never terminates the script.
# Globals:
#   log_dir (read) - directory that receives "<label>.log"
# Arguments:
#   $1  - short identifier of the check (accepted for call-site readability,
#         not used by the function)
#   $2  - label; printed in the banner and used as the log file name
#   $3+ - the command and its arguments, executed as-is (no shell parsing)
# Outputs:
#   Banner and combined stdout/stderr of the command on stdout; the command
#   output is also written to "$log_dir/<label>.log".
# Returns:
#   0 if the command (and tee) succeeded, 1 otherwise.
#######################################
run_test() {
  local chinese_name=$2
  shift 2
  local log_file="$log_dir/${chinese_name}.log"

  echo "[RUN] $chinese_name"
  # pipefail is enabled inside a subshell so the command's own exit status is
  # seen here even when the calling shell has not set it; without pipefail the
  # pipeline would only report tee's status and failures would go unnoticed.
  if ! ( set -o pipefail; "$@" 2>&1 | tee "$log_file" ); then
    echo "[WARN] $chinese_name 检查失败" | tee -a "$log_file"
    return 1  # non-zero status for the caller; the script itself keeps running
  fi
  return 0
}

#######################################
# Run a check given as a shell command line (pipes, ';', redirections are
# allowed), show its output and save it to a log file.
# Globals:
#   log_dir (read) - directory that receives "<label>.log"
# Arguments:
#   $1 - short identifier of the check (accepted for call-site readability,
#        not used by the function)
#   $2 - label; printed in the banner and used as the log file name
#   $3 - command line handed to `bash -c`; must be a trusted string
# Outputs:
#   Banner and combined stdout/stderr of the command line on stdout; the
#   command output is also written to "$log_dir/<label>.log".
# Returns:
#   0 if the command line (and tee) succeeded, 1 otherwise. The status of the
#   command line is whatever `bash -c` reports, i.e. that of its last command.
#######################################
run_pipe_test() {
  local chinese_name=$2
  local cmd=$3
  local log_file="$log_dir/${chinese_name}.log"

  echo "[RUN] $chinese_name"
  # pipefail is enabled inside a subshell so a failure of `bash -c` is seen
  # here even when the calling shell has not set it; otherwise only tee's
  # status would be tested.
  if ! ( set -o pipefail; bash -c "$cmd" 2>&1 | tee "$log_file" ); then
    echo "[WARN] $chinese_name 检查失败" | tee -a "$log_file"
    return 1
  fi
  return 0
}

#######################################
# Run a group of check steps under a section banner; a failing step is logged
# and the remaining steps still run.
# Globals:
#   log_dir (read) - failed steps are appended to "$log_dir/error.log"
# Arguments:
#   $1  - section title printed in the banner
#   $2+ - one shell command line per step, evaluated in order
# Outputs:
#   Banner, whatever the steps print, and a "[WARN]" line per failed step.
# Notes:
#   Steps are run with `eval` in the current shell so that quoted arguments
#   inside a step are parsed correctly. Consequently a step that calls `exit`
#   terminates the whole script, and a step that calls `cd` changes the
#   working directory for later steps. Only pass trusted, hard-coded strings.
#######################################
safe_run() {
  local section=$1
  shift
  local cmd  # keep the loop variable from overwriting a caller's/global "cmd"

  echo "==================== $section ===================="
  for cmd in "$@"; do
    if ! eval "$cmd"; then
      echo "[WARN] 命令执行失败: $cmd" | tee -a "$log_dir/error.log"
    fi
  done
}

# ------------------------- 1. Basic system information -------------------------
# One entry per step; safe_run evaluates the entries in order.
basic_checks=(
  'run_test uname "01_系统内核信息" uname -a'
  'run_test os_release "02_操作系统版本" cat /etc/os-release'
  'run_test locale "03_系统语言环境" locale'
)
safe_run "1.系统基础信息检查" "${basic_checks[@]}"

# ------------------------- 2. CPU & memory -------------------------
# Steps that need a shell pipeline go through run_pipe_test; the rest use
# run_test. The trailing "|| true" on the NUMA step keeps safe_run from
# recording a failure for it.
cpu_mem_checks=(
  'run_test cpu_info "04_CPU详细信息" lscpu'
  'run_test cpu_cores "05_CPU核心数" nproc'
  'run_pipe_test cpu_freq "06_CPU频率" "cat /proc/cpuinfo | grep \"MHz\" | sort -u"'
  'run_test memory_usage "07_内存使用情况" free -h'
  'run_test vm_stat "08_系统整体CPU和内存使用情况" vmstat 1 10'
  'run_test numa_nodes "09_NUMA节点信息" numactl --hardware || true'
  'run_pipe_test cpu_usage "10_CPU利用率检查" "mpstat -P ALL 1 5"'
  'run_pipe_test cpu_top_usage "11_CPU占用最高进程检查" "ps -eo pid,%cpu,cmd --sort=-%cpu | head -n 10"'
)
safe_run "2.CPU_内存检查" "${cpu_mem_checks[@]}"

# ------------------------- 3. Storage devices -------------------------
# The mount listing is formatted with `column -t`, so it must go through
# run_pipe_test: written as `run_test ... mount | column -t` the pipe would
# apply to run_test itself, leaving the log unformatted and feeding the
# "[RUN]" banner through column as well.
safe_run "3.存储设备检查" \
  'run_test disk_usage "12_磁盘使用情况" df -hT' \
  'run_pipe_test mount_info "13_挂载信息" "mount | column -t"' \
  'run_test block_devices "14_块设备信息" lsblk -o NAME,SIZE,TYPE,MOUNTPOINT'

# ------------------------- 4. Network -------------------------
# One entry per step; safe_run evaluates the entries in order.
network_checks=(
  'run_test netstat "15_网络连接状态" ss -tulnp'
  'run_test network_interfaces "16_网络接口信息" ip -br a'
  'run_test routing_table "17_路由表信息" ip route'
  'run_test arp_table "18_ARP表信息" ip neigh'
  'run_test ibdev2netdev "19_InfiniBand设备映射" ibdev2netdev'
  'run_test topo "20_网卡-dcu-topo"   lspci -vt '
)
safe_run "4.网络检查" "${network_checks[@]}"


# ------------------------- 5. DCU, kernel & driver -------------------------
# One entry per step; safe_run evaluates the entries in order.
dcu_driver_checks=(
  'run_test hy_smi "21_DCU设备状态" hy-smi'
  'run_test clock_level "22_DCU时钟级别" hy-smi -g'
  'run_test driverversion "23_DCU驱动版本" hy-smi --showdriverversion'
  'run_test rocminfo "24_ROCM信息" rocminfo'
  'run_test kernel_modules "25_已加载内核模块" lsmod'
  'run_test kernel_version "26_内核版本" uname -r'
)
safe_run "5.DCU_内核_驱动检查" "${dcu_driver_checks[@]}"

# ------------------------- 6. Software stack -------------------------
# Installed Python packages and the glibc version reported by ldd.
software_checks=(
  'run_test pip_list "27_Python包列表" pip list'
  'run_test glibc_version "28_GLIBC版本" ldd --version'
)
safe_run "6.软件栈检查" "${software_checks[@]}"

# ------------------------- 7. Other hardware state -------------------------
# The longer entries are small shell scripts (run via run_pipe_test) that
# filter `lspci -vvv` / `dmesg` output with grep. The trailing "|| true" on
# the lshw step keeps safe_run from recording a failure for it.
hardware_checks=(
  'run_test lspci "29_PCI设备列表" lspci'
  'run_test iostat "30_IO统计信息" iostat'
  'run_test hardware_info "31_硬件摘要信息" lshw -short || true'
  'run_pipe_test ACS_stat "32_ACS状态检查" "lspci -vvv | grep -i acsct"'
  'run_test dmesg "33_内核日志" dmesg'
  'run_pipe_test pcie_topology "34_PCIe拓扑结构" "echo \"====== PCIe 桥接器 ======\"; lspci -vvv | grep -E \"PCI bridge|Root port\" -A 20 | grep -E \"Device|Vendor|LnkSta:|LnkCap:|Secondary bus\"; echo \"\"; echo \"====== PCIe 带宽汇总 ======\"; lspci -vvv | grep \"LnkSta:\" | sort | uniq -c"'
  'run_pipe_test storage_details "35_存储控制器详情" "echo \"====== 存储控制器 ======\"; lspci -vvv | grep -E \"NVMe|SATA|RAID|Storage controller\" -A 30 | grep -E \"Device|Vendor|Kernel driver|LnkSta:|Speed|Width|MSI|Bar Memory\""'
  'run_pipe_test nic_details "36_网卡详细信息" "echo \"====== 网卡详细信息 ======\"; lspci -vvv | grep -E \"Ethernet controller|Network controller|InfiniBand\" -A 50 | grep -E \"Device|Vendor|Subsystem|Kernel driver|Kernel modules|LnkSta:|LnkCap:|NUMA node|Speed|Width\""'
  'run_pipe_test iommu_stat "37_IOMMU状态" "dmesg | grep IOMMU"'
  'run_pipe_test SELinux_stat "38_SELinux状态" "dmesg | grep SELinux"'
)
safe_run "7.其他硬件状态检查" "${hardware_checks[@]}"

# ------------------------- 8. Bandwidth checks -------------------------
# Load the DTK environment if it is present. The file is only sourced when
# readable so that a missing DTK installation is logged instead of aborting
# the whole run under `set -e`; the bandwidth steps below then fail and are
# reported individually.
if [ -r /opt/dtk/env.sh ]; then
  # shellcheck disable=SC1091
  source /opt/dtk/env.sh
else
  echo "[WARN] 未找到 /opt/dtk/env.sh,跳过 DTK 环境加载" | tee -a "$log_dir/error.log"
fi

#######################################
# Unpack rccl-tests.zip, build it and run the all_reduce benchmark on 8 and
# 4 devices, writing each result to its own log file.
# Meant to be invoked in a subshell: it changes the working directory and
# reports fatal problems with `return 1` rather than `exit`, so a failure
# only fails this step and later sections still run.
# Globals:
#   log_dir (read)
# Returns:
#   1 if the tools directory or the unpacked sources cannot be entered or the
#   archive cannot be unpacked; 0 otherwise (including a failed build, which
#   is logged to its own file).
#######################################
run_rccl_tests() {
  cd /workspace/test/env_check_tools/ || {
    echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"
    return 1
  }

  if [ ! -f "rccl-tests.zip" ]; then
    echo "[WARN] 未找到 rccl-tests.zip,跳过 RCCL 测试" | tee "$log_dir/46_RCCL测试跳过.log"
    return 0
  fi

  echo "[INFO] 发现 rccl-tests.zip,开始解压..."
  unzip -o rccl-tests.zip -d rccl-tests || {
    echo "[ERROR] rccl-tests.zip 解压失败" | tee "$log_dir/42_RCCL测试解压失败.log"
    return 1
  }

  cd rccl-tests/rccl-tests || {
    echo "[ERROR] 无法进入rccl-tests目录"
    return 1
  }

  if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
      CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
    # Benchmark failures are tolerated; whatever was printed stays in the log.
    ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/43_RCCL_all_reduce_8卡测试.log" || true
    ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/44_RCCL_all_reduce_4卡测试.log" || true
  else
    echo "[ERROR] RCCL编译失败" | tee "$log_dir/45_RCCL编译失败.log"
  fi
}

# The RCCL step runs inside ( ... ) so its directory changes stay local.
safe_run "8.带宽检查" \
  'run_test D2D-a_test "39_D2D单向带宽测试" /opt/dtk/bin/BandwidthTest  -a -s 512MB ' \
  'run_test D2D-A_test "40_D2D双向带宽测试" /opt/dtk/bin/BandwidthTest  -A -s 512MB ' \
  'run_test D2H-H2D_test "41_D2H和H2D带宽测试" /opt/dtk/bin/BandwidthTest  -t 3  ' \
  '( run_rccl_tests )'

# ------------------------- 9. DCU environment check -------------------------
#######################################
# Unpack dcu_env_check.zip, run its system_check.sh and copy the generated
# system_info* files into the log directory.
# Meant to be invoked in a subshell: it changes the working directory and
# reports fatal problems with `return 1` rather than `exit`, so a failure
# only fails this step and the final summary is still printed.
# Globals:
#   log_dir (read)
# Returns:
#   1 if the tools directory cannot be entered or the archive cannot be
#   unpacked; 0 otherwise.
#######################################
run_dcu_env_check() {
  cd /workspace/test/env_check_tools/ || {
    echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"
    return 1
  }

  if [ ! -f "dcu_env_check.zip" ]; then
    echo "[WARN] 未找到 dcu_env_check.zip,跳过 DCU 环境检查" | tee "$log_dir/50_DCU环境检查跳过.log"
    return 0
  fi

  echo "[INFO] 发现 dcu_env_check.zip,开始解压..."
  unzip -o dcu_env_check.zip -d dcu_env_check || {
    echo "[ERROR] dcu_env_check.zip 解压失败" | tee "$log_dir/47_DCU环境检查解压失败.log"
    return 1
  }

  # Best effort: a chmod failure must not stop the check itself.
  chmod +x dcu_env_check/dcu_env_check-main/tools/* || true

  # Plain if/else so the error branch runs only when the directory cannot be
  # entered, never because of a later command's status.
  if cd dcu_env_check/dcu_env_check-main; then
    bash system_check.sh 2>&1 | tee "$log_dir/48_DCU环境检查结果.log" || true
    cp system_info* "$log_dir" || true
  else
    echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/49_DCU环境检查执行失败.log"
  fi
}

# Runs inside ( ... ) so the directory changes stay local to this step.
safe_run "9.DCU环境检查" \
  '( run_dcu_env_check )'

# Final summary: completion banner, log location, and a listing of the logs.
printf '%s\n' \
  "==================== 检查完成 ====================" \
  "所有日志已保存至: $log_dir"
ls -lh "$log_dir"