#!/usr/bin/env bash
# DCU系统诊断工具 - 优化版本

#set -o errexit   # 遇到错误时退出
#set -o nounset   # 遇到未定义变量时报错
#set -o pipefail  # 管道中任何一个命令失败时整个管道失败

# 加载工具库
source ./tools/utils.sh
source ./tools/pkg_check.sh
source ./tools/dcuopn_check.sh
source ./tools/sys_info.sh
source ./tools/sys_log.sh
source ./tools/pcie_check.sh
source ./tools/kernel_check.sh
source ./tools/sme_check.sh
source ./tools/log_analyze.sh

# 默认配置
# Default configuration values, used when the matching command-line option is
# not given. The default output directory name embeds the current date and
# time down to the second.
readonly DEFAULT_OUTPUT_DIR="system_info_$(date +%Y%m%d_%H%M%S)"
readonly DEFAULT_KEYWORD="hydcu|hycu"
readonly DEFAULT_LOG_AGE=24         # hours
readonly DEFAULT_LOG_SIZE_LIMIT=10  # MB

# Helper scripts that are run as separate commands. The paths are relative to
# the current working directory.
readonly DRIVER_LOAD_CHECK="./tools/driver_load_check.sh"
readonly BOARD_CHECK="./tools/board_check.sh"
readonly PCIE_SPEED_CHECK="./tools/pcie_speed_check.sh"

# Global state. The option-related variables are filled in by parse_arguments
# and completed by set_defaults.
QUIET_MODE=0
DEBUG_MODE=0
# Initialised explicitly so that a CUSTOM_OUTPUT_DIR inherited from the
# environment cannot silently redirect the output; only -o sets it.
CUSTOM_OUTPUT_DIR=""
OUTPUT_DIR=""
KEYWORD=""
LOG_AGE=""
LOG_SIZE_LIMIT=""
# NOTE(review): DEVICE_NAME and DEVICE_ID are not referenced by the code
# visible in this file; presumably the sourced tool libraries use them.
DEVICE_NAME=""
DEVICE_ID=""

# 显示帮助信息
# Print the usage text to stdout.
# Globals: DEFAULT_OUTPUT_DIR, DEFAULT_KEYWORD, DEFAULT_LOG_AGE,
#          DEFAULT_LOG_SIZE_LIMIT (read; interpolated into the text).
# The here-document delimiter is unquoted on purpose so that $0 and the
# default values are expanded when the text is printed.
show_help() {
    cat << EOF
用法: $0 [选项]
DCU系统诊断脚本 - 收集系统信息并检测驱动问题

选项:
  -o DIR      指定输出目录 (默认: ${DEFAULT_OUTPUT_DIR})
  -k KEYWORD  设置检测关键字 (默认: ${DEFAULT_KEYWORD})
  -t HOURS    收集日志的时间范围(小时) (默认: ${DEFAULT_LOG_AGE})
  -s SIZE     日志文件大小限制(MB) (默认: ${DEFAULT_LOG_SIZE_LIMIT})
  -q          静默模式(仅输出错误)
  -d          调试模式
  -h          显示此帮助信息

示例:
  $0 -o /tmp/logs -k buserr -t 48
  $0 -q -s 20    # 静默模式,日志大小限制20MB
EOF
}

# 验证数字输入
# Check that a value is a non-negative decimal integer no smaller than a
# minimum; print an error to stderr and exit the script with status 1 if not.
# Arguments:
#   $1 - value to validate
#   $2 - option name used in the error message (e.g. "-t")
#   $3 - minimum accepted value (default: 0)
validate_numeric_input() {
    local value="$1"
    local param_name="$2"
    local min_value="${3:-0}"
    # Leading zeros are skipped and at most 18 significant digits are kept, so
    # the number always fits in a signed 64-bit integer. Without the cap an
    # over-long value makes the numeric comparison error out, and that error
    # is indistinguishable from "not smaller than the minimum".
    local numeric_re='^0*([0-9]{1,18})$'

    # 10# forces base 10 so the captured digits are never read as octal.
    if ! [[ "$value" =~ $numeric_re ]] || (( 10#${BASH_REMATCH[1]} < min_value )); then
        echo "错误: 参数 '$param_name' 的值无效。必须为大于等于 ${min_value} 的整数。" >&2
        exit 1
    fi
}

# 检查必需的工具是否存在
# Verify that every external command in the fixed list below can be resolved
# with `command -v`. If any are missing, list them all on stderr and exit the
# script with status 1.
check_required_tools() {
    local -a required=(dmidecode tar journalctl dmesg lscpu free ip lspci)
    local -a missing_tools=()
    # Declared local so the loop variable does not leak into global scope.
    local tool

    for tool in "${required[@]}"; do
        command -v "$tool" > /dev/null 2>&1 || missing_tools+=("$tool")
    done

    if (( ${#missing_tools[@]} > 0 )); then
        echo "错误: 缺少必需的工具: ${missing_tools[*]}" >&2
        exit 1
    fi
}

# 检查工具脚本是否存在
# Verify that every helper script and tool library exists as a regular file.
# A missing file is fatal (error on stderr, exit status 1). A file without the
# execute bit gets `chmod +x`; the outcome is reported on stderr.
# Globals: DRIVER_LOAD_CHECK, BOARD_CHECK, PCIE_SPEED_CHECK (read).
check_tool_scripts() {
    local -a scripts=(
        "$DRIVER_LOAD_CHECK" "$BOARD_CHECK" "$PCIE_SPEED_CHECK"
        "./tools/utils.sh" "./tools/pkg_check.sh" "./tools/dcuopn_check.sh"
        "./tools/sys_info.sh" "./tools/sys_log.sh" "./tools/pcie_check.sh"
        "./tools/kernel_check.sh" "./tools/sme_check.sh" "./tools/log_analyze.sh"
    )
    # Declared local so the loop variable does not leak into global scope.
    local script

    for script in "${scripts[@]}"; do
        if [ ! -f "$script" ]; then
            echo "错误: 工具脚本不存在: $script" >&2
            exit 1
        fi
        if [ ! -x "$script" ]; then
            # Only claim the permission was fixed when chmod succeeded. A
            # failure is a warning rather than fatal, because the libraries in
            # this list are loaded with `source` and do not need the bit.
            if chmod +x "$script"; then
                echo "已修复工具脚本权限: $script" >&2
            else
                echo "警告: 无法修复工具脚本权限: $script" >&2
            fi
        fi
    done
}

# 解析命令行参数
# Parse the command-line options passed as "$@".
# Globals written: CUSTOM_OUTPUT_DIR (-o), KEYWORD (-k), LOG_AGE (-t),
#                  LOG_SIZE_LIMIT (-s), QUIET_MODE (-q), DEBUG_MODE (-d).
# -d also turns on `set -x` tracing; -h prints the help text and exits 0.
# An unknown option or a missing option argument prints a message plus the
# help text and exits with status 1. Extra positional arguments are ignored
# with a warning on stderr.
parse_arguments() {
    local opt
    # Local OPTIND so parsing always starts at the first argument, whatever
    # earlier getopts calls did.
    local OPTIND=1 OPTARG

    # The leading ':' selects getopts' silent mode. Only then is a missing
    # argument reported as ':' with the option letter in OPTARG, and only then
    # does '?' carry the offending letter in OPTARG.
    while getopts ":o:k:t:s:qdh" opt; do
        case "$opt" in
            o) CUSTOM_OUTPUT_DIR="$OPTARG" ;;
            k) KEYWORD="$OPTARG" ;;
            t) LOG_AGE="$OPTARG" ;;
            s) LOG_SIZE_LIMIT="$OPTARG" ;;
            q) QUIET_MODE=1 ;;
            d) DEBUG_MODE=1; set -x ;;
            h) show_help; exit 0 ;;
            \?) echo "无效选项: -$OPTARG" >&2; show_help; exit 1 ;;
            :) echo "选项 -$OPTARG 需要参数" >&2; show_help; exit 1 ;;
        esac
    done
    shift $((OPTIND - 1))

    # Anything left after the options is not used by this script.
    if [ $# -gt 0 ]; then
        echo "警告: 忽略额外的参数: $*" >&2
    fi
}

# 设置默认值
# Resolve the effective settings: any option still unset or empty falls back
# to its DEFAULT_* value, then the two numeric settings are validated.
# Globals read:    CUSTOM_OUTPUT_DIR, DEFAULT_OUTPUT_DIR, DEFAULT_KEYWORD,
#                  DEFAULT_LOG_AGE, DEFAULT_LOG_SIZE_LIMIT
# Globals written: OUTPUT_DIR, KEYWORD, LOG_AGE, LOG_SIZE_LIMIT
set_defaults() {
    if [ -n "$CUSTOM_OUTPUT_DIR" ]; then
        OUTPUT_DIR="$CUSTOM_OUTPUT_DIR"
    else
        OUTPUT_DIR="$DEFAULT_OUTPUT_DIR"
    fi

    # ':=' assigns only when the variable is unset or empty.
    : "${KEYWORD:=$DEFAULT_KEYWORD}"
    : "${LOG_AGE:=$DEFAULT_LOG_AGE}"
    : "${LOG_SIZE_LIMIT:=$DEFAULT_LOG_SIZE_LIMIT}"

    # Both values are validated with a minimum of 1.
    validate_numeric_input "$LOG_AGE" "-t" 1
    validate_numeric_input "$LOG_SIZE_LIMIT" "-s" 1
}

# 初始化输出目录
# Create the output directory named by OUTPUT_DIR.
# The path must not exist yet in any form; an existing path or a failed
# mkdir prints an error to stderr and exits the script with status 1.
init_output_dir() {
    # Guard: refuse to reuse an existing path.
    [ ! -e "$OUTPUT_DIR" ] || {
        echo "错误: 输出目录已存在: $OUTPUT_DIR" >&2
        exit 1
    }

    # -p also creates any missing parent directories.
    mkdir -p "$OUTPUT_DIR" && return

    echo "错误: 无法创建输出目录: $OUTPUT_DIR" >&2
    exit 1
}

# 包装函数用于执行命令并记录日志
# Run a command string, capturing its stdout and stderr in a file.
# Arguments:
#   $1 - human-readable description, passed to head_normal
#   $2 - command string (interpreted by the shell, may contain arguments)
#   $3 - file that receives the combined output
# Globals: DEBUG_MODE (read; 1 additionally logs the command and target file)
# Returns: 0 on success, otherwise the command's exit status (a warning is
#          written with log).
# SECURITY: $2 is evaluated as shell code. Every caller visible in this file
# passes a fixed string; never pass untrusted input here.
run_and_log() {
    local description="$1"
    local command="$2"
    local output_file="$3"
    local first_word rest
    local exit_code=0

    head_normal "$description"

    if [ "${DEBUG_MODE:-0}" -eq 1 ]; then
        log "执行命令: $command"
        log "输出到: $output_file"
    fi

    # A fresh `bash -c` only sees shell functions that were exported, so a
    # command whose first word is a function defined in this shell (for
    # example one provided by a sourced library) would fail there with
    # "command not found". Such commands run in a subshell of the current
    # shell instead, which keeps the same isolation from the caller's state.
    read -r first_word rest <<< "$command"
    if declare -F -- "$first_word" > /dev/null; then
        ( eval "$command" ) > "$output_file" 2>&1 || exit_code=$?
    else
        # bash -c executes the command string, so arguments are supported.
        bash -c "$command" > "$output_file" 2>&1 || exit_code=$?
    fi

    if [ "$exit_code" -ne 0 ]; then
        # 124 is the status timeout(1) reports when the limit is hit. This
        # function applies no timeout itself, so it only occurs when the
        # command string does.
        if [ "$exit_code" -eq 124 ]; then
            log "警告: 命令执行超时: $command"
        else
            log "警告: 命令执行失败 (退出码: $exit_code): $command"
        fi
        return "$exit_code"
    fi

    return 0
}

# 尝试执行 /opt/hyhal/bin/drvdiag -c 收集驱动日志
# Collect driver logs by running `drvdiag -c` through run_and_log; the output
# goes to $OUTPUT_DIR/drvdiag.log.
# Arguments:
#   $1 - path of the drvdiag binary (optional, default /opt/hyhal/bin/drvdiag)
# Globals: OUTPUT_DIR (read)
# Returns: 1 with a hint on stderr when the binary is missing or not
#          executable, otherwise the status of run_and_log.
run_drvdiag() {
    local drv_path="${1:-/opt/hyhal/bin/drvdiag}"
    local drv_arg='-c'
    local out_file="$OUTPUT_DIR/drvdiag.log"
    local drv_command

    if [ ! -x "$drv_path" ]; then
        echo "没有装驱动,请先安装驱动" >&2
        return 1
    fi

    # run_and_log takes a command *string*, so the path is shell-quoted with
    # %q to survive spaces or other special characters. For the default path
    # this yields exactly "/opt/hyhal/bin/drvdiag -c".
    printf -v drv_command '%q %s' "$drv_path" "$drv_arg"
    run_and_log "收集驱动日志 (drvdiag -c)" "$drv_command" "$out_file"
}

# 收集系统信息
# Run the information-gathering steps in a fixed order: get_dcu, pkg_check,
# collect_system_info, collect_logs, get_pcie_info.
# None of these helpers is defined in this file; presumably they come from
# the sourced ./tools/*.sh libraries.
# NOTE(review): main also calls get_dcu before calling this function, so it
# runs twice per invocation; confirm whether that is intended.
collect_system_information() {
    log "开始收集系统信息..."

    get_dcu
    pkg_check
    collect_system_info
    collect_logs
    get_pcie_info
}

# 分析日志信息
# Run the three analysis steps through run_and_log, each writing its output
# to its own file under OUTPUT_DIR.
# Globals: OUTPUT_DIR (read)
# Returns: the status of the last run_and_log call.
# NOTE(review): pcie_check, sme_check and kernel_check are passed as command
# strings. They are not defined in this file and presumably are functions
# from the sourced tool libraries, so run_and_log must execute them somewhere
# those definitions are visible.
analyze_logs() {
    log "开始分析日志信息..."

    run_and_log "分析PCIe信息" "pcie_check" "$OUTPUT_DIR/pcie_analysis.log"
    run_and_log "分析SME信息" "sme_check" "$OUTPUT_DIR/sme_analysis.log"
    run_and_log "分析驱动安装位置" "kernel_check" "$OUTPUT_DIR/kernel_analysis.log"
}

# 运行附加检查
# Run the extra checks: drvdiag log collection, driver load status, board
# information and, on every product except "X785-H30", the PCIe speed check.
# Globals: DRIVER_LOAD_CHECK, BOARD_CHECK, PCIE_SPEED_CHECK, OUTPUT_DIR (read)
run_additional_checks() {
    # Declared local so the product name does not leak into global scope.
    local product_name

    log "运行附加检查..."

    # Try drvdiag first; a missing driver or a failed run is only logged.
    run_drvdiag || log "跳过 drvdiag,驱动未安装或执行失败"

    run_and_log "检查驱动加载状态" "$DRIVER_LOAD_CHECK" "$OUTPUT_DIR/driver_status.log"
    run_and_log "检查硬件板卡信息" "$BOARD_CHECK" "$OUTPUT_DIR/board_check.log"

    # The PCIe speed check is skipped on X785-H30, and also when dmidecode
    # cannot report the product name.
    if product_name=$(dmidecode -s system-product-name 2>/dev/null); then
        if [ "$product_name" != "X785-H30" ]; then
            run_and_log "检查PCIe速度" "$PCIE_SPEED_CHECK" "$OUTPUT_DIR/pcie_speed_check.log"
        fi
    else
        log "警告: 无法获取产品名称,跳过PCIe速度检查"
    fi
}

# 打包结果
# Pack the output directory into "<OUTPUT_DIR>.tar.gz".
# On success the directory is removed and the archive path and size are
# logged. On failure the collected data is kept, any partial archive is
# removed, tar's messages are logged and the script exits with status 1.
# Globals: OUTPUT_DIR (read)
package_results() {
    log "打包诊断数据..."

    local tar_file="${OUTPUT_DIR}.tar.gz"
    local parent_dir base_name tar_err
    parent_dir="$(dirname "$OUTPUT_DIR")"
    base_name="$(basename "$OUTPUT_DIR")"

    # -C plus the base name keeps only the directory name inside the archive,
    # not the full path. tar's diagnostics are captured so a failure can be
    # explained instead of being silently discarded.
    if tar_err="$(tar -czf "$tar_file" -C "$parent_dir" "$base_name" 2>&1)"; then
        # ':?' aborts rather than running rm -rf on an empty path.
        rm -rf -- "${OUTPUT_DIR:?}"
        log "诊断完成! 文件已保存为: ${tar_file}"
        log "文件大小: $(du -h "$tar_file" | cut -f1)"
    else
        # An incomplete archive would look like a valid result; drop it.
        rm -f -- "$tar_file"
        log "错误: 打包失败,原始数据保存在: $OUTPUT_DIR"
        if [ -n "$tar_err" ]; then
            log "tar: $tar_err"
        fi
        exit 1
    fi
}

# 主工作流程
# Top-level workflow: print the start banner with the effective settings,
# call get_dcu, check the required tools and helper scripts, collect
# information, analyse logs, run the additional checks and package the result.
# Globals: OUTPUT_DIR, KEYWORD, LOG_AGE, LOG_SIZE_LIMIT (read for the banner)
main() {
    local banner_line

    # Start banner, framed by two separator lines.
    hline
    for banner_line in \
        "🚀 DCU系统诊断工具启动" \
        "📁 输出目录: $OUTPUT_DIR" \
        "🔍 关键字: $KEYWORD" \
        "⏰ 日志时间范围: ${LOG_AGE}小时" \
        "📊 日志大小限制: ${LOG_SIZE_LIMIT}MB"; do
        log "$banner_line"
    done
    hline

    get_dcu

    # Environment checks; either one exits the script on failure.
    check_required_tools
    check_tool_scripts

    # Collection phase.
    collect_system_information

    # Analysis phase, introduced by its own framed heading.
    echo
    hline
    log "📊 开始日志分析"
    hline
    analyze_logs

    # Additional checks, then packaging.
    run_additional_checks
    package_results
}

# 清理函数(在信号中断时调用)
# Signal handler: remove the output directory if it exists, then exit the
# script with status 1.
# Globals: OUTPUT_DIR (read)
cleanup() {
    # Nothing to remove when no output directory was set or created.
    if [[ -z "$OUTPUT_DIR" || ! -d "$OUTPUT_DIR" ]]; then
        exit 1
    fi

    log "正在清理临时文件..."
    rm -rf "$OUTPUT_DIR"
    exit 1
}

# 设置信号处理
# On SIGINT or SIGTERM run cleanup, which removes the output directory (if it
# exists) and exits with status 1.
trap cleanup INT TERM

# Program entry: parse the options, resolve defaults, create the output
# directory, then run the diagnostic workflow.
parse_arguments "$@"
set_defaults
init_output_dir
main

exit 0