#!/bin/bash
#
# System environment check.
#
# Runs a fixed list of diagnostic commands (system, CPU/memory, storage,
# network, DCU/driver, software stack, PCIe, bandwidth, RCCL, DCU environment
# tool) and stores the output of each one in its own log file under $log_dir.
# A failing check is reported as a warning and the remaining checks still run.
#
# Usage: bash <this script>   (takes no arguments)

# Strict error handling. Every check runs inside an `if`, where errexit is
# ignored, so a failing check does not abort the script. pipefail is required
# by run_test/run_pipe_test: without it `cmd | tee` would report tee's status.
set -eo pipefail

log_dir="/workspace/test/env_check_outputs/"
tools_dir="/workspace/test/env_check_tools/"
mkdir -p "$log_dir"

echo "==================== 开始系统环境检查 ===================="

#######################################
# Run a single command and log its combined stdout/stderr.
# Globals:   log_dir (read)
# Arguments: $1   - short identifier of the check (accepted, not used)
#            $2   - display name; also the base name of the log file
#            $3.. - the command and its arguments
# Outputs:   command output on stdout and in "$log_dir/<$2>.log"
# Returns:   0 if the command (and tee) succeeded, 1 otherwise
#######################################
run_test() {
  local chinese_name=$2
  shift 2

  echo "[RUN] $chinese_name"
  if ! "$@" 2>&1 | tee "$log_dir/${chinese_name}.log"; then
    echo "[WARN] $chinese_name 检查失败" | tee -a "$log_dir/${chinese_name}.log"
    return 1 # non-zero status, but the script keeps running
  fi
  return 0
}

#######################################
# Same as run_test, but the check is a shell snippet (pipes, `;`, ...) that is
# executed by a child `bash -c`. The child does not inherit pipefail, so the
# status of a pipeline inside the snippet is that of its last stage.
# Globals:   log_dir (read)
# Arguments: $1 - short identifier of the check (accepted, not used)
#            $2 - display name; also the base name of the log file
#            $3 - shell snippet to execute
# Outputs:   snippet output on stdout and in "$log_dir/<$2>.log"
# Returns:   0 if the snippet (and tee) succeeded, 1 otherwise
#######################################
run_pipe_test() {
  local chinese_name=$2
  local cmd=$3

  echo "[RUN] $chinese_name"
  if ! bash -c "$cmd" 2>&1 | tee "$log_dir/${chinese_name}.log"; then
    echo "[WARN] $chinese_name 检查失败" | tee -a "$log_dir/${chinese_name}.log"
    return 1
  fi
  return 0
}

#######################################
# Print a section banner and run each given command string in turn; a failing
# command is recorded in "$log_dir/error.log" and the loop continues.
# The strings are evaluated with `eval` in the current shell, so they must be
# trusted literals and must not call `exit` (that would end the whole script).
# Globals:   log_dir (read)
# Arguments: $1   - section title
#            $2.. - command strings to evaluate
#######################################
safe_run() {
  local section=$1
  local cmd
  shift
  echo "==================== $section ===================="

  for cmd in "$@"; do
    # eval lets a single string carry its own quoting and `||` fallbacks.
    if ! eval "$cmd"; then
      echo "[WARN] 命令执行失败: $cmd" | tee -a "$log_dir/error.log"
    fi
  done
}

#######################################
# Unpack rccl-tests.zip from $tools_dir, build it, and run all_reduce_perf
# with 8 and with 4 devices.
# The body is a subshell: directory changes stay local and a failure can only
# end this function, never the calling script.
# Globals:   log_dir, tools_dir (read)
# Returns:   0 on success or when the archive is absent (tests skipped),
#            1 if a directory cannot be entered, unzip fails or the build fails
#######################################
run_rccl_tests() (
  cd "$tools_dir" || {
    echo "[ERROR] 无法进入${tools_dir}目录"
    return 1
  }

  if [[ ! -f "rccl-tests.zip" ]]; then
    echo "[WARN] 未找到 rccl-tests.zip,跳过 RCCL 测试" | tee "$log_dir/46_RCCL测试跳过.log"
    return 0
  fi

  echo "[INFO] 发现 rccl-tests.zip,开始解压..."
  unzip -o rccl-tests.zip -d rccl-tests || {
    echo "[ERROR] rccl-tests.zip 解压失败" | tee "$log_dir/42_RCCL测试解压失败.log"
    return 1
  }
  cd rccl-tests/rccl-tests || {
    echo "[ERROR] 无法进入rccl-tests目录"
    return 1
  }

  if ! make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
    CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
    echo "[ERROR] RCCL编译失败" | tee "$log_dir/45_RCCL编译失败.log"
    return 1
  fi

  # Best effort: a failing benchmark run must not prevent the next one.
  ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 \
    | tee "$log_dir/43_RCCL_all_reduce_8卡测试.log" || true
  ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 \
    | tee "$log_dir/44_RCCL_all_reduce_4卡测试.log" || true
)

#######################################
# Unpack dcu_env_check.zip from $tools_dir, run its system_check.sh and copy
# the system_info* files it leaves behind into $log_dir.
# The body is a subshell for the same reasons as run_rccl_tests.
# Globals:   log_dir, tools_dir (read)
# Returns:   0 on success or when the archive is absent (check skipped),
#            1 if a directory cannot be entered or unzip fails
#######################################
run_dcu_env_check() (
  cd "$tools_dir" || {
    echo "[ERROR] 无法进入${tools_dir}目录"
    return 1
  }

  if [[ ! -f "dcu_env_check.zip" ]]; then
    echo "[WARN] 未找到 dcu_env_check.zip,跳过 DCU 环境检查" | tee "$log_dir/50_DCU环境检查跳过.log"
    return 0
  fi

  echo "[INFO] 发现 dcu_env_check.zip,开始解压..."
  unzip -o dcu_env_check.zip -d dcu_env_check || {
    echo "[ERROR] dcu_env_check.zip 解压失败" | tee "$log_dir/47_DCU环境检查解压失败.log"
    return 1
  }
  # Best effort, as before: a chmod failure does not stop the check.
  chmod +x dcu_env_check/dcu_env_check-main/tools/* || true

  if ! cd dcu_env_check/dcu_env_check-main; then
    echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/49_DCU环境检查执行失败.log"
    return 1
  fi
  bash system_check.sh 2>&1 | tee "$log_dir/48_DCU环境检查结果.log" || true
  cp system_info* "$log_dir" || true
)

# ------------------------- 1. Basic system checks -------------------------
safe_run "1.系统基础信息检查" \
  'run_test uname "01_系统内核信息" uname -a' \
  'run_test os_release "02_操作系统版本" cat /etc/os-release' \
  'run_test locale "03_系统语言环境" locale'

# ------------------------- 2. CPU & memory checks -------------------------
safe_run "2.CPU_内存检查" \
  'run_test cpu_info "04_CPU详细信息" lscpu' \
  'run_test cpu_cores "05_CPU核心数" nproc' \
  'run_pipe_test cpu_freq "06_CPU频率" "cat /proc/cpuinfo | grep \"MHz\" | sort -u"' \
  'run_test memory_usage "07_内存使用情况" free -h' \
  'run_test vm_stat "08_系统整体CPU和内存使用情况" vmstat 1 10' \
  'run_test numa_nodes "09_NUMA节点信息" numactl --hardware || true' \
  'run_pipe_test cpu_usage "10_CPU利用率检查" "mpstat -P ALL 1 5"' \
  'run_pipe_test cpu_top_usage "11_CPU占用最高进程检查" "ps -eo pid,%cpu,cmd --sort=-%cpu | head -n 10"'

# ------------------------- 3. Storage device checks -------------------------
# Check 13 uses run_pipe_test so that the `column -t` formatting is part of
# the logged command instead of being applied to run_test's own output.
safe_run "3.存储设备检查" \
  'run_test disk_usage "12_磁盘使用情况" df -hT' \
  'run_pipe_test mount_info "13_挂载信息" "mount | column -t"' \
  'run_test block_devices "14_块设备信息" lsblk -o NAME,SIZE,TYPE,MOUNTPOINT'

# ------------------------- 4. Network checks -------------------------
safe_run "4.网络检查" \
  'run_test netstat "15_网络连接状态" ss -tulnp' \
  'run_test network_interfaces "16_网络接口信息" ip -br a' \
  'run_test routing_table "17_路由表信息" ip route' \
  'run_test arp_table "18_ARP表信息" ip neigh' \
  'run_test ibdev2netdev "19_InfiniBand设备映射" ibdev2netdev' \
  'run_test topo "20_网卡-dcu-topo" lspci -vt '

# ------------------------- 5. DCU, kernel & driver checks -------------------------
safe_run "5.DCU_内核_驱动检查" \
  'run_test hy_smi "21_DCU设备状态" hy-smi' \
  'run_test clock_level "22_DCU时钟级别" hy-smi -g' \
  'run_test driverversion "23_DCU驱动版本" hy-smi --showdriverversion' \
  'run_test rocminfo "24_ROCM信息" rocminfo' \
  'run_test kernel_modules "25_已加载内核模块" lsmod' \
  'run_test kernel_version "26_内核版本" uname -r'

# ------------------------- 6. Software stack checks -------------------------
safe_run "6.软件栈检查" \
  'run_test pip_list "27_Python包列表" pip list' \
  'run_test glibc_version "28_GLIBC版本" ldd --version'

# ------------------------- 7. Other hardware status checks -------------------------
safe_run "7.其他硬件状态检查" \
  'run_test lspci "29_PCI设备列表" lspci' \
  'run_test iostat "30_IO统计信息" iostat' \
  'run_test hardware_info "31_硬件摘要信息" lshw -short || true' \
  'run_pipe_test ACS_stat "32_ACS状态检查" "lspci -vvv | grep -i acsct"' \
  'run_test dmesg "33_内核日志" dmesg' \
  'run_pipe_test pcie_topology "34_PCIe拓扑结构" "echo \"====== PCIe 桥接器 ======\"; lspci -vvv | grep -E \"PCI bridge|Root port\" -A 20 | grep -E \"Device|Vendor|LnkSta:|LnkCap:|Secondary bus\"; echo \"\"; echo \"====== PCIe 带宽汇总 ======\"; lspci -vvv | grep \"LnkSta:\" | sort | uniq -c"' \
  'run_pipe_test storage_details "35_存储控制器详情" "echo \"====== 存储控制器 ======\"; lspci -vvv | grep -E \"NVMe|SATA|RAID|Storage controller\" -A 30 | grep -E \"Device|Vendor|Kernel driver|LnkSta:|Speed|Width|MSI|Bar Memory\""' \
  'run_pipe_test nic_details "36_网卡详细信息" "echo \"====== 网卡详细信息 ======\"; lspci -vvv | grep -E \"Ethernet controller|Network controller|InfiniBand\" -A 50 | grep -E \"Device|Vendor|Subsystem|Kernel driver|Kernel modules|LnkSta:|LnkCap:|NUMA node|Speed|Width\""' \
  'run_pipe_test iommu_stat "37_IOMMU状态" "dmesg | grep IOMMU"' \
  'run_pipe_test SELinux_stat "38_SELinux状态" "dmesg | grep SELinux"'

# ------------------------- 8. Bandwidth checks -------------------------
# Load the DTK environment script. It is guarded so that a missing file, or a
# failing command inside it, produces a warning instead of ending the script
# under `set -e` before the remaining sections have run.
if [[ -r /opt/dtk/env.sh ]]; then
  # shellcheck disable=SC1091
  source /opt/dtk/env.sh \
    || echo "[WARN] 加载 /opt/dtk/env.sh 失败" | tee -a "$log_dir/error.log"
else
  echo "[WARN] 未找到 /opt/dtk/env.sh,跳过 DTK 环境加载" | tee -a "$log_dir/error.log"
fi

safe_run "8.带宽检查" \
  'run_test D2D-a_test "39_D2D单向带宽测试" /opt/dtk/bin/BandwidthTest -a -s 512MB ' \
  'run_test D2D-A_test "40_D2D双向带宽测试" /opt/dtk/bin/BandwidthTest -A -s 512MB ' \
  'run_test D2H-H2D_test "41_D2H和H2D带宽测试" /opt/dtk/bin/BandwidthTest -t 3 ' \
  'run_rccl_tests'

# ------------------------- 9. DCU environment check -------------------------
safe_run "9.DCU环境检查" \
  'run_dcu_env_check'

echo "==================== 检查完成 ===================="
echo "所有日志已保存至: $log_dir"
ls -lh "$log_dir"