#!/usr/bin/bash
#
# Collect PCIe information and sanity-check PCIe BAR (Region) addresses.
#
# This file only defines functions. It references names that are not defined
# here and are presumably supplied by the sourcing script (verify against the
# caller):
#   log        - logging helper, called with a single message argument
#   hline      - prints a separator line
#   DEVICE_ID  - PCI device ID used as the `lspci -d :<id>` filter
#   OUTPUT_DIR - directory that receives the generated log files

#######################################
# Print the "Region N:" (BAR) lines of a device.
# Arguments: $1 - lspci slot selector (value for `lspci -s`)
# Outputs:   matching lines of `lspci -vv` on stdout
#######################################
parse_regions() {
  lspci -vv -s "$1" | awk '/Region [0-9]+:/'
}

#######################################
# Print the PCI topology as a verbose tree (`lspci -vt`).
#######################################
get_pcie_topo() {
  lspci -vt
}

#######################################
# Print the lines of the device's verbose lspci output that mention ACS.
# Arguments: $1 - lspci slot selector
#######################################
show_acs() {
  lspci -vvs "$1" | grep ACS
}

#######################################
# Print the current link width and speed of a device, taken from the
# "LnkSta:" line of `lspci -vv`.
# Arguments: $1 - lspci slot selector
# Outputs:   one summary line on stdout
#######################################
show_link_status() {
  local info cur_speed cur_width

  # Declaration and assignment are kept separate so `local` does not hide
  # the exit status of the command substitution.
  info=$(lspci -vv -s "$1")
  cur_speed=$(grep -ioP 'LnkSta:\s+Speed\s\K[\d.]+' <<< "$info")
  cur_width=$(grep -ioP 'LnkSta.*Width\sx\K\d+' <<< "$info")

  echo "当前状态 : x${cur_width} @ ${cur_speed}GT/s "
}

#######################################
# Print the 4th whitespace-separated field of every `lspci -vv` line that
# contains "BusMaster" (on the "Control:" line that is the BusMaster+/- flag).
# Arguments: $1 - lspci slot selector
#######################################
show_busmaster() {
  lspci -vv -s "$1" | awk '/BusMaster/ {print $4}'
}

#######################################
# Write a per-device report (BAR mapping, link status, ACS, BusMaster) for
# every device matching DEVICE_ID.
# Globals:   DEVICE_ID (read)
# Outputs:   report on stdout
#######################################
collect_pcie_logs() {
  local dev id name

  log "收集PCIe系统信息"
  lspci -D -d ":${DEVICE_ID}" | while read -r dev; do
    # `lspci -D` lines start with the 12-character domain:bus:dev.fn address;
    # the rest of the line is the device description.
    id=${dev:0:12}
    name=${dev:12}

    echo
    echo "$(hline)"
    echo "设备 ${id}"
    echo "型号 : ${name}"
    echo "$(hline)"

    echo
    echo "BAR 内存映射:"
    parse_regions "$id" | sed 's/^/ /'

    echo
    echo "PCIe 链路状态:"
    show_link_status "$id" | sed 's/^/ /'

    echo
    echo "PCIe ACS设置"
    show_acs "$id" | sed 's/^/ /'

    echo
    echo "BusMaster设置"
    show_busmaster "$id" | sed 's/^/ /'
  done
}

#######################################
# Collect all PCIe logs into OUTPUT_DIR:
#   pcie_info.log - per-device report from collect_pcie_logs
#   pcie_vt.log   - topology tree
#   pcie_more.log - full `lspci -vvv` dump
# Globals:   OUTPUT_DIR (read)
#######################################
get_pcie_info() {
  log "收集PCIe系统信息"
  collect_pcie_logs > "${OUTPUT_DIR}/pcie_info.log"
  get_pcie_topo > "${OUTPUT_DIR}/pcie_vt.log" 2>&1
  lspci -vvv > "${OUTPUT_DIR}/pcie_more.log" 2>&1
  log "PCIe 信息收集完毕"
}

#######################################
# Analyze one lspci "Region N: Memory at <addr> ..." line and report
# problems with the BAR address.
# Globals:   DEVICE_ID (read, only interpolated into a hint message)
# Arguments: $1 - a single Region line from `lspci -vv`
# Outputs:   the input line followed by a diagnosis on stdout
# Returns:   0 - address looks fine
#            1 - BAR address is unassigned
#            2 - address has more than 11 hex digits (beyond 44 bits)
#            3 - no address could be extracted from the line
#            4 - Region 0 address lies in 0xf000000000-0xffffffffff
#######################################
analyze_regions() {
  local address=""
  local dev_id=""
  # Regexes are kept in variables so they can be used unquoted with =~.
  local -r addr_re='Memory at[[:space:]]+([^[:space:]]+)'
  local -r dev_re='Device ([0-9a-f:]+)'

  printf '%s\n' "$1"

  # Extract the token that follows "Memory at". Matching by keyword instead
  # of by field position also handles lines such as
  # "Region 0: [virtual] Memory at ...".
  if [[ "$1" =~ $addr_re ]]; then
    address=${BASH_REMATCH[1]}
  fi

  # BAR address checks. lspci prints an unallocated BAR as "<unassigned>"
  # (with angle brackets); the bare word is accepted too.
  if [[ "$address" == "<unassigned>" || "$address" == "unassigned" ]]; then
    echo "[ERROR CODE 1] PCIe BAR地址未分配"
    echo "可能原因:"
    echo " 1. DCU卡未正确插入或供电不足"
    echo " 2. 主板PCIe插槽故障"
    echo " 3. 驱动未正确加载"
    echo "解决方案:"
    echo " 1. 检查DCU卡物理连接状态"
    echo " 2. 执行 'lspci -vvv | grep -A 10 \"$DEVICE_ID\"' 确认设备识别状态"
    echo " 3. 检查dmesg日志中的PCIe相关错误"
    return 1
  elif (( ${#address} > 11 )); then
    # 11 hex digits == 44 bits; anything longer is out of range.
    echo "[ERROR CODE 2] PCIe BAR地址超出44bit范围 (当前地址: 0x${address})"
    echo "可能影响:"
    echo " - 可能导致DMA操作失败"
    echo " - 系统内存空间不足"
    echo "解决方案:"
    echo " 1. 进入BIOS设置"
    echo " 2. 找到 'MMIO High Base' 选项"
    echo " 3. 设置为小于16T的值 (如 8T)"
    echo " 4. 保存设置并重启系统"
    return 2
  elif [[ -z "$address" ]]; then
    echo "[ERROR CODE 3] 无法获取有效BAR地址"
    echo "可能原因:"
    echo " - 内核未启用PCIe地址重分配功能"
    echo "解决方案:"
    echo " 1. 检查当前内核启动参数:"
    # NOTE(review): the Linux kernel parameter is spelled "pci=realloc";
    # confirm whether "pcie=realloc" here and below is intentional.
    grep -q "pcie=realloc" /proc/cmdline || echo " 当前配置: $(cat /proc/cmdline)"
    echo " 2. 在GRUB配置中添加 'pcie=realloc' 参数"
    echo " 3. 更新GRUB配置并重启:"
    echo " - Debian/Ubuntu: update-grub"
    echo " - RHEL/CentOS: grub2-mkconfig -o /boot/grub2/grub.cfg"
    return 3
  fi

  # Check whether the Region 0 address lies in the reserved window
  # 0xf000000000-0xffffffffff, i.e. exactly 10 hex digits starting with "f".
  if [[ "$1" == *"Region 0"* && "$address" =~ ^f[0-9a-f]{9}$ ]]; then
    # The device id is only available when the input line carries a
    # "Device <id>" token; otherwise it stays empty.
    if [[ "$1" =~ $dev_re ]]; then
      dev_id=${BASH_REMATCH[1]}
    fi
    echo "[ERROR CODE 4] PCIe BAR地址冲突 - 卡 ${dev_id} (地址: 0x${address})"
    echo "问题描述:"
    echo " - Region 0地址位于保留区间 (0xf000000000-0xffffffffff)"
    echo " - 可能与系统保留内存区域冲突"
    echo "解决方案:"
    echo " 1. 进入BIOS设置"
    echo " 2. 找到 'PCIe Memory Mapped IO' 或 'MMIO Configuration'"
    echo " 3. 调整MMIO分配范围,避开 0xf000000000-0xffffffffff"
    echo " 4. 如无相关选项,可能需要更新BIOS版本"
    return 4
  fi

  echo "PCIe 状态正常"
  return 0
}

#######################################
# Run analyze_regions on every "Region 0" and "Region 5" line of a saved
# lspci log.
# Arguments: $1 - path to the log file
# Outputs:   diagnosis for each matching line on stdout
# Returns:   status of the last analyze_regions call for "Region 5" lines
#            (0 when there are none). Exits the shell with status 1 when
#            the file does not exist.
#######################################
pcie_check() {
  local line

  if [[ ! -f "$1" ]]; then
    echo "file not exists" >&2
    exit 1
  fi

  # `read` without IFS= deliberately trims the leading indentation that
  # lspci puts in front of Region lines.
  echo "Region 0地址测试"
  grep -- "Region 0" "$1" | while read -r line; do
    analyze_regions "$line"
  done

  echo "Region 5 地址测试"
  grep -- "Region 5" "$1" | while read -r line; do
    analyze_regions "$line"
  done
}