#!/bin/bash
# Strict error handling: abort on unhandled failures, and make a pipeline
# fail when any of its stages fails.
set -e
set -o pipefail

# Every check writes its log file into this directory.
log_dir="/workspace/test/env_check_outputs/"
mkdir -p "$log_dir"

printf '%s\n' "==================== 开始系统环境检查 ===================="

#######################################
# Run one command as a named check, mirroring its output into a log file.
# A failing check is reported but never terminates the script.
# Globals:   log_dir (read) - directory receiving "<display name>.log"
# Arguments: $1  - short identifier (accepted for call-site compatibility,
#                  not used by the function)
#            $2  - display name; also the base name of the log file
#            $3+ - the command and its arguments
# Outputs:   "[RUN] ..." banner plus the command's stdout/stderr on stdout
# Returns:   0 if the command and the log write succeeded, 1 otherwise
#######################################
run_test() {
  local chinese_name=$2
  shift 2
  local log_file="$log_dir/${chinese_name}.log"
  local -a stage_status=()

  echo "[RUN] $chinese_name"

  # Inspect PIPESTATUS instead of the pipeline's own status so that a failing
  # command is detected even when `set -o pipefail` is not active. The group
  # sits on the left of `||` so `set -e` cannot abort before the statuses are
  # captured.
  {
    "$@" 2>&1 | tee "$log_file"
    stage_status=("${PIPESTATUS[@]}")
  } || true

  if (( ${stage_status[0]:-1} != 0 || ${stage_status[1]:-0} != 0 )); then
    echo "[WARN] $chinese_name 检查失败" | tee -a "$log_file"
    return 1  # non-zero status, but the script keeps running
  fi
  return 0
}

#######################################
# Run a shell snippet (pipelines, `;` lists, ...) as a named check by handing
# it to `bash -c`, mirroring its output into a log file.
# Globals:   log_dir (read) - directory receiving "<display name>.log"
# Arguments: $1 - short identifier (accepted for call-site compatibility,
#                 not used by the function)
#            $2 - display name; also the base name of the log file
#            $3 - shell code to execute, passed as a single string
# Outputs:   "[RUN] ..." banner plus the snippet's stdout/stderr on stdout
# Returns:   0 if the snippet and the log write succeeded, 1 otherwise
#######################################
run_pipe_test() {
  local chinese_name=$2
  local cmd=$3
  local log_file="$log_dir/${chinese_name}.log"
  local -a stage_status=()

  echo "[RUN] $chinese_name"

  # Inspect PIPESTATUS instead of the pipeline's own status so that a failing
  # snippet is detected even when `set -o pipefail` is not active. The group
  # sits on the left of `||` so `set -e` cannot abort before the statuses are
  # captured.
  {
    bash -c "$cmd" 2>&1 | tee "$log_file"
    stage_status=("${PIPESTATUS[@]}")
  } || true

  if (( ${stage_status[0]:-1} != 0 || ${stage_status[1]:-0} != 0 )); then
    echo "[WARN] $chinese_name 检查失败" | tee -a "$log_file"
    return 1
  fi
  return 0
}

#######################################
# Run a titled section of checks so that no failure inside the section can
# terminate the calling script.
#
# Each argument after the title is a string of shell code that is eval'd in
# order. The whole section runs in one subshell: a `cd` performed by one
# command is still visible to the following commands of the same section,
# but an `exit` inside a command only ends this section, and working
# directory or variable changes do not leak back to the caller.
#
# NOTE: the strings are passed to `eval`; only pass trusted, literal code.
#
# Globals:   log_dir (read) - failures are appended to "$log_dir/error.log"
# Arguments: $1  - section title printed in the banner
#            $2+ - shell code strings to evaluate
# Outputs:   banner, command output and "[WARN] ..." lines on stdout
# Returns:   always 0
#######################################
safe_run() {
  local section=$1
  shift
  local cmd
  local rc=0

  echo "==================== $section ===================="
  (
    for cmd in "$@"; do
      # eval so that quoting inside the command string is honoured
      if ! eval "$cmd"; then
        echo "[WARN] 命令执行失败: $cmd" | tee -a "$log_dir/error.log"
      fi
    done
  ) || rc=$?

  # A non-zero subshell status means one of the commands called `exit`.
  if (( rc != 0 )); then
    echo "[WARN] $section 被提前终止 (exit $rc)，已跳过该部分剩余命令" \
      | tee -a "$log_dir/error.log"
  fi
  return 0
}

# ------------------------- 1. Basic system checks -------------------------
# Kernel identification, OS release file and locale settings.
basic_system_checks=(
  'run_test uname "01_系统内核信息" uname -a'
  'run_test os_release "02_操作系统版本" cat /etc/os-release'
  'run_test locale "03_系统语言环境" locale'
)
safe_run "1.系统基础信息检查" "${basic_system_checks[@]}"

# ------------------------- 2. CPU & memory checks -------------------------
# The numactl entry ends in `|| true`, so its failure is not recorded in
# error.log by safe_run.
cpu_memory_checks=(
  'run_test cpu_info "04_CPU详细信息" lscpu'
  'run_test cpu_cores "05_CPU核心数" nproc'
  'run_pipe_test cpu_freq "06_CPU频率" "cat /proc/cpuinfo | grep \"MHz\" | sort -u"'
  'run_test memory_usage "07_内存使用情况" free -h'
  'run_test vm_stat "08_系统整体CPU和内存使用情况" vmstat 1 10'
  'run_test numa_nodes "09_NUMA节点信息" numactl --hardware || true'
  'run_pipe_test cpu_usage "10_CPU利用率检查" "mpstat -P ALL 1 5"'
  'run_pipe_test cpu_top_usage "11_CPU占用最高进程检查" "ps -eo pid,%cpu,cmd --sort=-%cpu | head -n 10"'
)
safe_run "2.CPU_内存检查" "${cpu_memory_checks[@]}"

# ------------------------- 3. Storage device checks -------------------------
# The mount listing is a pipeline, so it goes through run_pipe_test: that way
# the column-formatted output is what lands in the log, and the "[RUN]" banner
# is not fed through `column`.
safe_run "3.存储设备检查" \
  'run_test disk_usage "12_磁盘使用情况" df -hT' \
  'run_pipe_test mount_info "13_挂载信息" "mount | column -t"' \
  'run_test block_devices "14_块设备信息" lsblk -o NAME,SIZE,TYPE,MOUNTPOINT'

# ------------------------- 4. Network checks -------------------------
# Listening sockets, interfaces, routes, neighbours, the ibdev2netdev
# listing and the PCI tree.
network_checks=(
  'run_test netstat "15_网络连接状态" ss -tulnp'
  'run_test network_interfaces "16_网络接口信息" ip -br a'
  'run_test routing_table "17_路由表信息" ip route'
  'run_test arp_table "18_ARP表信息" ip neigh'
  'run_test ibdev2netdev "19_InfiniBand设备映射" ibdev2netdev'
  'run_test topo "20_网卡-dcu-topo"   lspci -vt '
)
safe_run "4.网络检查" "${network_checks[@]}"


# ------------------------- 5. DCU, kernel & driver checks -------------------------
# hy-smi / rocminfo queries followed by the loaded modules and kernel release.
dcu_kernel_driver_checks=(
  'run_test hy_smi "21_DCU设备状态" hy-smi'
  'run_test clock_level "22_DCU时钟级别" hy-smi -g'
  'run_test driverversion "23_DCU驱动版本" hy-smi --showdriverversion'
  'run_test rocminfo "24_ROCM信息" rocminfo'
  'run_test kernel_modules "25_已加载内核模块" lsmod'
  'run_test kernel_version "26_内核版本" uname -r'
)
safe_run "5.DCU_内核_驱动检查" "${dcu_kernel_driver_checks[@]}"

# ------------------------- 6. Software stack checks -------------------------
# Installed Python packages and the glibc version reported by ldd.
software_stack_checks=(
  'run_test pip_list "27_Python包列表" pip list'
  'run_test glibc_version "28_GLIBC版本" ldd --version'
)
safe_run "6.软件栈检查" "${software_stack_checks[@]}"

# ------------------------- 7. Other hardware status checks -------------------------
# PCI inventory, I/O statistics, hardware summary, kernel log, and several
# filtered views of `lspci -vvv` / `dmesg`. The lshw entry ends in `|| true`,
# so its failure is not recorded in error.log by safe_run.
other_hardware_checks=(
  'run_test lspci "29_PCI设备列表" lspci'
  'run_test iostat "30_IO统计信息" iostat'
  'run_test hardware_info "31_硬件摘要信息" lshw -short || true'
  'run_pipe_test ACS_stat "32_ACS状态检查" "lspci -vvv | grep -i acsct"'
  'run_test dmesg "33_内核日志" dmesg'
  'run_pipe_test pcie_topology "34_PCIe拓扑结构" "echo \"====== PCIe 桥接器 ======\"; lspci -vvv | grep -E \"PCI bridge|Root port\" -A 20 | grep -E \"Device|Vendor|LnkSta:|LnkCap:|Secondary bus\"; echo \"\"; echo \"====== PCIe 带宽汇总 ======\"; lspci -vvv | grep \"LnkSta:\" | sort | uniq -c"'
  'run_pipe_test storage_details "35_存储控制器详情" "echo \"====== 存储控制器 ======\"; lspci -vvv | grep -E \"NVMe|SATA|RAID|Storage controller\" -A 30 | grep -E \"Device|Vendor|Kernel driver|LnkSta:|Speed|Width|MSI|Bar Memory\""'
  'run_pipe_test nic_details "36_网卡详细信息" "echo \"====== 网卡详细信息 ======\"; lspci -vvv | grep -E \"Ethernet controller|Network controller|InfiniBand\" -A 50 | grep -E \"Device|Vendor|Subsystem|Kernel driver|Kernel modules|LnkSta:|LnkCap:|NUMA node|Speed|Width\""'
  'run_pipe_test iommu_stat "37_IOMMU状态" "dmesg | grep IOMMU"'
  'run_pipe_test SELinux_stat "38_SELinux状态" "dmesg | grep SELinux"'
)
safe_run "7.其他硬件状态检查" "${other_hardware_checks[@]}"

# ------------------------- 8. Bandwidth checks -------------------------
# Load the DTK environment on a best-effort basis: under `set -e` an unguarded
# `source` of a missing or failing file would abort the whole script and skip
# every remaining section.
if [[ -f /opt/dtk/env.sh ]]; then
  # shellcheck source=/dev/null
  source /opt/dtk/env.sh \
    || echo "[WARN] 加载 /opt/dtk/env.sh 失败" | tee -a "$log_dir/error.log"
else
  echo "[WARN] 未找到 /opt/dtk/env.sh，跳过 DTK 环境加载" | tee -a "$log_dir/error.log"
fi

# The RCCL steps (enter tools directory, unzip, build, run) form a single
# command wrapped in a subshell, so that its `exit 1` calls abandon only the
# RCCL steps instead of terminating the script, and its `cd` calls do not
# change the working directory of later sections.
safe_run "8.带宽检查" \
  'run_test D2D-a_test "39_D2D单向带宽测试" /opt/dtk/bin/BandwidthTest  -a -s 512MB ' \
  'run_test D2D-A_test "40_D2D双向带宽测试" /opt/dtk/bin/BandwidthTest  -A -s 512MB ' \
  'run_test D2H-H2D_test "41_D2H和H2D带宽测试" /opt/dtk/bin/BandwidthTest  -t 3  ' \
  '(
     cd /workspace/test/env_check_tools/ || { echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"; exit 1; }

     if [ -f "rccl-tests.zip" ]; then
       echo "[INFO] 发现 rccl-tests.zip，开始解压..."
       unzip -o rccl-tests.zip -d rccl-tests || {
         echo "[ERROR] rccl-tests.zip 解压失败" | tee "$log_dir/42_RCCL测试解压失败.log"
         exit 1
       }

       cd rccl-tests/rccl-tests || { echo "[ERROR] 无法进入rccl-tests目录"; exit 1; }

       if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
            CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
         ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/43_RCCL_all_reduce_8卡测试.log" || true
         ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/44_RCCL_all_reduce_4卡测试.log" || true
       else
         echo "[ERROR] RCCL编译失败" | tee "$log_dir/45_RCCL编译失败.log"
       fi
     else
       echo "[WARN] 未找到 rccl-tests.zip，跳过 RCCL 测试" | tee "$log_dir/46_RCCL测试跳过.log"
     fi
   )'

# ------------------------- 9. DCU environment check -------------------------
# All steps run as a single command inside a subshell, so that `exit 1` on a
# missing directory or failed unzip abandons only this check (the final
# summary still runs) and the `cd` calls stay local to it. The former
# `cd ... && { ...; } || { ...; }` construct is an explicit if/else, so the
# error branch runs only when entering the directory fails.
safe_run "9.DCU环境检查" \
  '(
     cd /workspace/test/env_check_tools/ || { echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"; exit 1; }

     if [ -f "dcu_env_check.zip" ]; then
       echo "[INFO] 发现 dcu_env_check.zip，开始解压..."
       unzip -o dcu_env_check.zip -d dcu_env_check || {
         echo "[ERROR] dcu_env_check.zip 解压失败" | tee "$log_dir/47_DCU环境检查解压失败.log"
         exit 1
       }

       chmod +x dcu_env_check/dcu_env_check-main/tools/*

       if cd dcu_env_check/dcu_env_check-main; then
         bash system_check.sh 2>&1 | tee "$log_dir/48_DCU环境检查结果.log" || true
         cp system_info* "$log_dir" || true
       else
         echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/49_DCU环境检查执行失败.log"
       fi
     else
       echo "[WARN] 未找到 dcu_env_check.zip，跳过 DCU 环境检查" | tee "$log_dir/50_DCU环境检查跳过.log"
     fi
   )'

# Final summary: completion banner, log location, and a listing of the logs.
printf '%s\n' \
  "==================== 检查完成 ====================" \
  "所有日志已保存至: $log_dir"
ls -lh "$log_dir"