#!/usr/bin/bash

# 收集pcie信息
# Print the BAR lines ("Region N: ...") from one device's verbose lspci dump.
# Arguments:
#   $1 - device selector handed to `lspci -s`
# Outputs:
#   Every line of `lspci -vv` that contains "Region <digits>:", unmodified.
parse_regions() {
    local slot=$1

    lspci -vv -s "$slot" \
        | awk '/Region [0-9]+:/ { print }'
}

# Dump the PCI bus hierarchy.
# Outputs:
#   Whatever `lspci -vt` prints (tree view with verbose device names).
get_pcie_topo() {
    # -t draws the bus tree, -v adds the device names to each node.
    lspci -vt
}

# Print the lines of a device's verbose lspci dump that mention ACS.
# Arguments:
#   $1 - device selector handed to `lspci -s`
# Returns:
#   Exit status of grep (non-zero when no line contains "ACS").
show_acs() {
    local slot=$1

    lspci -vvs "$slot" \
        | grep ACS
}

# Print the currently negotiated PCIe link width and speed of one device.
# Arguments:
#   $1 - device selector handed to `lspci -s`
# Outputs:
#   One line: "当前状态  : x<width> @ <speed>GT/s " (fields are empty when
#   the LnkSta line is absent from the lspci output).
show_link_status() {
    local info cur_speed cur_width

    # Declared separately from the assignment so `local` does not hide the
    # command status. A failed lookup is tolerated on purpose: the status
    # line is still printed, just with empty fields.
    info=$(lspci -vv -s "$1") || true

    # Only the negotiated (LnkSta) values are reported, so the capability
    # (LnkCap) lookups that were never printed are not performed.
    cur_speed=$(grep -ioP 'LnkSta:\s+Speed\s\K[\d.]+' <<< "$info") || true
    cur_width=$(grep -ioP 'LnkSta.*Width\sx\K\d+' <<< "$info") || true

    echo "当前状态  : x${cur_width} @ ${cur_speed}GT/s "
}
# Print the bus-master flag of one device.
# Arguments:
#   $1 - device selector handed to `lspci -s`
# Outputs:
#   The 4th whitespace-separated field of every lspci line containing
#   "BusMaster".
show_busmaster() {
    local slot=$1

    lspci -vv -s "$slot" \
        | grep BusMaster \
        | awk '{ print $4 }'
}
# Print a per-device PCIe report for every device matching DEVICE_ID.
# Globals:
#   DEVICE_ID (read) - device ID used in the `lspci -d :<id>` filter
# Outputs:
#   For each device: a header with its address and description, followed by
#   the BAR lines, link status, ACS lines and bus-master flag, each indented
#   by two spaces.
# Relies on log, hline, parse_regions, show_link_status, show_acs and
# show_busmaster being defined by the time this function runs.
collect_pcie_logs() {
    local dev id name

    log "收集PCIe系统信息"
    lspci -D -d ":${DEVICE_ID}" | while read -r dev; do
        # The address is everything up to the first blank. Cutting a fixed
        # 12 characters would truncate addresses whose domain is wider than
        # four hex digits.
        id=${dev%% *}
        # Remainder of the line, leading blank included, as before.
        name=${dev:${#id}}
        echo
        echo "$(hline)"
        echo "设备 ${id}"
        echo "型号      : ${name}"
        echo "$(hline)"
        echo
        echo "BAR 内存映射:"
        parse_regions "$id" | sed 's/^/  /'
        echo
        echo "PCIe 链路状态:"
        show_link_status "$id" | sed 's/^/  /'
        echo
        echo "PCIe ACS设置"
        show_acs "$id" | sed 's/^/  /'
        echo
        echo "BusMaster设置"
        show_busmaster "$id" | sed 's/^/  /'
    done
}

# Write the PCIe report, the bus tree and a full verbose lspci dump to files.
# Globals:
#   OUTPUT_DIR (read) - directory receiving pcie_info.log, pcie_vt.log and
#                       pcie_more.log; it is not created here
# Relies on log, collect_pcie_logs and get_pcie_topo being defined by the
# time this function runs.
get_pcie_info() {
    log "收集PCIe系统信息"
    # Redirection targets are quoted so a directory containing whitespace
    # does not produce an "ambiguous redirect" error.
    collect_pcie_logs > "${OUTPUT_DIR}/pcie_info.log"
    get_pcie_topo > "${OUTPUT_DIR}/pcie_vt.log" 2>&1
    lspci -vvv > "${OUTPUT_DIR}/pcie_more.log" 2>&1
    log "PCIe 信息收集完毕"
}

# Diagnose one "Region N:" line taken from `lspci -vv` output.
# Arguments:
#   $1 - a single Region line, e.g.
#        "Region 0: Memory at f8000000 (64-bit, prefetchable) [size=16M]"
# Globals:
#   DEVICE_ID (read) - only interpolated into the hint printed for code 1
# Outputs:
#   Echoes the input line, then either a diagnostic with suggested remedies
#   or "PCIe 状态正常".
# Returns:
#   0 - nothing wrong detected
#   1 - BAR address is unassigned
#   2 - BAR address has more than 11 hex digits (beyond 44 bits)
#   3 - BAR address is reported as <ignored>
#   4 - Region 0 lies inside 0xf000000000-0xffffffffff
analyze_regions() {
    local address dev_id
    echo "$1"

    # The address is the 5th field: "Region" "N:" "Memory" "at" "<addr>".
    address=$(awk '/Memory at/ { print $5; exit }' <<< "$1")

    # BAR address state checks.
    # lspci prints the placeholder with angle brackets ("<unassigned>"); the
    # bare spelling is still accepted for already-processed input.
    if [[ "$address" == "unassigned" || "$address" == "<unassigned>" ]]; then
        echo "[ERROR CODE 1] PCIe BAR地址未分配"
        echo "可能原因:"
        echo "  1. DCU卡未正确插入或供电不足"
        echo "  2. 主板PCIe插槽故障"
        echo "  3. 驱动未正确加载"
        echo "解决方案:"
        echo "  1. 检查DCU卡物理连接状态"
        echo "  2. 执行 'lspci -vvv | grep -A 10 \"$DEVICE_ID\"' 确认设备识别状态"
        echo "  3. 检查dmesg日志中的PCIe相关错误"
        return 1
    # 11 hex digits cover 44 bits; anything longer is out of range.
    elif (( ${#address} > 11 )); then
        echo "[ERROR CODE 2] PCIe BAR地址超出44bit范围 (当前地址: 0x${address})"
        echo "可能影响:"
        echo "  - 可能导致DMA操作失败"
        echo "  - 系统内存空间不足"
        echo "解决方案:"
        echo "  1. 进入BIOS设置"
        echo "  2. 找到 'MMIO High Base' 选项"
        echo "  3. 设置为小于16T的值 (如 8T)"
        echo "  4. 保存设置并重启系统"
        return 2
    elif [[ "$address" == "<ignored>" ]]; then
        echo "[ERROR CODE 3] 无法获取有效BAR地址"
        echo "可能原因:"
        echo "  - 内核未启用PCIe地址重分配功能"
        echo "解决方案:"
        echo "  1. 检查当前内核启动参数:"
        # NOTE(review): mainline kernel documentation spells this option
        # "pci=realloc"; confirm "pcie=realloc" is what the target kernel
        # expects before relying on this check.
        grep -q "pcie=realloc" /proc/cmdline || echo "   当前配置: $(cat /proc/cmdline)"
        echo "  2. 在GRUB配置中添加 'pcie=realloc' 参数"
        echo "  3. 更新GRUB配置并重启:"
        echo "    - Debian/Ubuntu: update-grub"
        echo "    - RHEL/CentOS: grub2-mkconfig -o /boot/grub2/grub.cfg"
        return 3
    fi

    # Check whether Region 0 sits in the reserved window
    # 0xf000000000-0xffffffffff, i.e. exactly ten hex digits starting with
    # "f". A 12-digit pattern here could never be reached, because such
    # addresses are already rejected by the 44-bit length check.
    if [[ "$1" == *"Region 0"* ]]; then
        if [[ "$address" =~ ^f[0-9a-f]{9}$ ]]; then
            # Empty unless the line itself carries a "Device <id>" token.
            dev_id=$(grep -oP 'Device \K[0-9a-f:]+' <<< "$1") || true
            echo "[ERROR CODE 4] PCIe BAR地址冲突 - 卡 ${dev_id} (地址: 0x${address})"
            echo "问题描述:"
            echo "  - Region 0地址位于保留区间 (0xf000000000-0xffffffffff)"
            echo "  - 可能与系统保留内存区域冲突"
            echo "解决方案:"
            echo "  1. 进入BIOS设置"
            echo "  2. 找到 'PCIe Memory Mapped IO' 或 'MMIO Configuration'"
            echo "  3. 调整MMIO分配范围,避开 0xf000000000-0xffffffffff"
            echo "  4. 如无相关选项,可能需要更新BIOS版本"
            return 4
        fi
    fi

    echo "PCIe 状态正常"
    return 0
}

# Run analyze_regions over every "Region 0" and "Region 5" line of a saved
# lspci dump.
# Arguments:
#   $1 - path to the file to scan
# Outputs:
#   A heading per region group followed by the analyze_regions output for
#   each matching line.
# Exits:
#   Terminates the shell with status 1 when $1 is not a regular file.
# The status returned by analyze_regions is not propagated.
pcie_check() {
    if [ ! -f "$1" ]; then
        echo "file not exists" >&2
        exit 1
    fi

    # The path is quoted so files whose name contains whitespace are read.
    echo "Region 0地址测试"
    grep -- "Region 0" "$1" | while read -r line; do
        analyze_regions "$line"
    done

    echo "Region 5 地址测试"
    grep -- "Region 5" "$1" | while read -r line; do
        analyze_regions "$line"
    done
}