Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dcu_env_check
Commits
138039a2
Commit
138039a2
authored
Mar 23, 2026
by
liumg
Browse files
变更格式
parent
5ec6d6c1
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
696 additions
and
169 deletions
+696
-169
system_check.sh
system_check.sh
+278
-84
tools/dcuopn_check.sh
tools/dcuopn_check.sh
+59
-9
tools/driver_load_check.sh
tools/driver_load_check.sh
+29
-10
tools/hcu_info.sh
tools/hcu_info.sh
+0
-0
tools/hydcutune
tools/hydcutune
+0
-0
tools/pcie_check.sh
tools/pcie_check.sh
+48
-10
tools/pkg_check.sh
tools/pkg_check.sh
+52
-15
tools/sys_log.sh
tools/sys_log.sh
+126
-28
tools/utils.sh
tools/utils.sh
+104
-13
No files found.
system_check.sh
View file @
138039a2
#!/usr/bin/env bash
# DCU system diagnostic tool.
#
# Strict-mode switches are kept disabled, as in the original script.
#set -x
#set -o errexit   # exit on the first failing command
#set -o nounset   # treat unset variables as errors
#set -o pipefail  # a pipeline fails if any stage fails

# Load each helper library exactly once. Sourcing a library a second time
# reruns any top-level statements it contains and redefines its functions.
source ./tools/utils.sh
source ./tools/pkg_check.sh
source ./tools/dcuopn_check.sh
source ./tools/sys_info.sh
source ./tools/sys_log.sh
source ./tools/pcie_check.sh
source ./tools/kernel_check.sh
source ./tools/sme_check.sh
source ./tools/log_analyze.sh

# Default configuration (immutable once defined).
readonly DEFAULT_OUTPUT_DIR="system_info_$(date +%Y%m%d_%H%M%S)"
readonly DEFAULT_KEYWORD="hydcu|hycu"
readonly DEFAULT_LOG_AGE=24         # hours
readonly DEFAULT_LOG_SIZE_LIMIT=10  # MB

# Standalone check scripts, addressed relative to the working directory.
readonly DRIVER_LOAD_CHECK="./tools/driver_load_check.sh"
readonly BOARD_CHECK="./tools/board_check.sh"
readonly PCIE_SPEED_CHECK="./tools/pcie_speed_check.sh"

# Mutable global state, filled in by option parsing and device detection.
QUIET_MODE=0
DEBUG_MODE=0
OUTPUT_DIR=""
KEYWORD=""
LOG_AGE=""
LOG_SIZE_LIMIT=""
DEVICE_NAME=""
DEVICE_ID=""
# Print the usage text to stdout.
# Globals (read): DEFAULT_OUTPUT_DIR, DEFAULT_KEYWORD, DEFAULT_LOG_AGE,
#                 DEFAULT_LOG_SIZE_LIMIT
# The help text is emitted exactly once, from a single here-document; the
# delimiter is unquoted on purpose so that $0 and the defaults are expanded.
show_help() {
  cat <<EOF
用法: $0 [选项]
DCU系统诊断脚本 - 收集系统信息并检测驱动问题

选项:
  -o DIR      指定输出目录 (默认: ${DEFAULT_OUTPUT_DIR})
  -k KEYWORD  设置检测关键字 (默认: ${DEFAULT_KEYWORD})
  -t HOURS    收集日志的时间范围(小时) (默认: ${DEFAULT_LOG_AGE})
  -s SIZE     日志文件大小限制(MB) (默认: ${DEFAULT_LOG_SIZE_LIMIT})
  -q          静默模式(仅输出错误)
  -d          调试模式
  -h          显示此帮助信息

示例:
  $0 -o /tmp/logs -k buserr -t 48
  $0 -q -s 20    # 静默模式,日志大小限制20MB
EOF
}
# Validate that an option value is a non-negative decimal integer that is
# at least a given minimum.
# Arguments:
#   $1 - value to validate
#   $2 - option name used in the error message (e.g. "-t")
#   $3 - minimum accepted value (default 0)
# Exits the shell with status 1 and a message on stderr when invalid.
validate_numeric_input() {
  local value="$1"
  local param_name="$2"
  local min_value="${3:-0}"
  local valid=1
  local digits

  if [[ ! "$value" =~ ^[0-9]+$ ]]; then
    valid=0
  else
    # Strip leading zeros so "08" is read as decimal 8, not as bad octal.
    digits="${value#"${value%%[!0]*}"}"
    if (( ${#digits} > 18 )); then
      # More than 18 digits can overflow 64-bit arithmetic; the old
      # `[ -lt ]` test errored out here and let the value through.
      valid=0
    elif (( 10#${digits:-0} < 10#${min_value} )); then
      valid=0
    fi
  fi

  if (( ! valid )); then
    echo "错误: 参数 '$param_name' 的值无效。必须为大于等于 ${min_value} 的整数。" >&2
    exit 1
  fi
}
# Command-line options are parsed by parse_arguments(), which is invoked
# from the entry point at the bottom of this script. The former top-level
# getopts loop duplicated that logic, ran before the helpers were defined,
# and left OPTIND advanced for the later parse, so it has been removed.
# Verify that the external commands the script depends on are installed.
# Arguments: optional list of command names to check; when omitted the
#            built-in list of required tools is used.
# Exits the shell with status 1, listing every missing command on stderr,
# if at least one is not found.
check_required_tools() {
  local -a tools=("$@")
  local -a missing_tools=()
  local tool

  if (( ${#tools[@]} == 0 )); then
    tools=("dmidecode" "tar" "journalctl" "dmesg" "lscpu" "free" "ip" "lspci")
  fi

  for tool in "${tools[@]}"; do
    if ! command -v "$tool" > /dev/null 2>&1; then
      missing_tools+=("$tool")
    fi
  done

  if (( ${#missing_tools[@]} > 0 )); then
    echo "错误: 缺少必需的工具: ${missing_tools[*]}" >&2
    exit 1
  fi
}
# Verify that the helper scripts exist and are executable.
# Arguments: optional list of script paths; when omitted the standard set
#            of check scripts and libraries under ./tools is used.
# Globals (read): DRIVER_LOAD_CHECK, BOARD_CHECK, PCIE_SPEED_CHECK
# A missing script is fatal (exit 1). A script without the execute bit is
# repaired with chmod; the "fixed" notice is printed only if chmod actually
# succeeded, otherwise a warning is printed and checking continues.
check_tool_scripts() {
  local -a scripts=("$@")
  local script

  if (( ${#scripts[@]} == 0 )); then
    scripts=(
      "$DRIVER_LOAD_CHECK"
      "$BOARD_CHECK"
      "$PCIE_SPEED_CHECK"
      "./tools/utils.sh"
      "./tools/pkg_check.sh"
      "./tools/dcuopn_check.sh"
      "./tools/sys_info.sh"
      "./tools/sys_log.sh"
      "./tools/pcie_check.sh"
      "./tools/kernel_check.sh"
      "./tools/sme_check.sh"
      "./tools/log_analyze.sh"
    )
  fi

  for script in "${scripts[@]}"; do
    if [[ ! -f "$script" ]]; then
      echo "错误: 工具脚本不存在: $script" >&2
      exit 1
    fi
    if [[ ! -x "$script" ]]; then
      if chmod +x -- "$script"; then
        echo "已修复工具脚本权限: $script" >&2
      else
        echo "警告: 无法修复工具脚本权限: $script" >&2
      fi
    fi
  done
}
# Parse the command-line options.
# Arguments: the script's positional parameters ("$@").
# Globals (written): CUSTOM_OUTPUT_DIR, KEYWORD, LOG_AGE, LOG_SIZE_LIMIT,
#                    QUIET_MODE, DEBUG_MODE
# Exits 0 after -h; exits 1 (after printing the usage text) on an unknown
# option or a missing option argument.
parse_arguments() {
  # OPTIND is made local and reset so parsing always starts at the first
  # argument, even if getopts was already used elsewhere in this shell.
  local opt
  local OPTIND=1

  # The leading ':' selects getopts' silent error mode; without it the ':'
  # arm below can never be reached and OPTARG is empty in the '?' arm.
  while getopts ":o:k:t:s:qdh" opt; do
    case "$opt" in
      o) CUSTOM_OUTPUT_DIR="$OPTARG" ;;
      k) KEYWORD="$OPTARG" ;;
      t) LOG_AGE="$OPTARG" ;;
      s) LOG_SIZE_LIMIT="$OPTARG" ;;
      q) QUIET_MODE=1 ;;
      d)
        DEBUG_MODE=1
        set -x
        ;;
      h)
        show_help
        exit 0
        ;;
      \?)
        echo "无效选项: -$OPTARG" >&2
        show_help
        exit 1
        ;;
      :)
        echo "选项 -$OPTARG 需要参数" >&2
        show_help
        exit 1
        ;;
    esac
  done
  shift $((OPTIND - 1))

  # Anything left over is not an option; report it and carry on.
  if [[ $# -gt 0 ]]; then
    echo "警告: 忽略额外的参数: $*" >&2
  fi
}
# Fill in every setting the user did not supply and validate the numeric
# ones. Defaults are resolved only here; the former top-level
# `: ${VAR:=...}` statements duplicated this and expanded unquoted.
# Globals (read):    CUSTOM_OUTPUT_DIR, DEFAULT_OUTPUT_DIR, DEFAULT_KEYWORD,
#                    DEFAULT_LOG_AGE, DEFAULT_LOG_SIZE_LIMIT
# Globals (written): OUTPUT_DIR, KEYWORD, LOG_AGE, LOG_SIZE_LIMIT
# validate_numeric_input terminates the script when -t or -s is invalid.
set_defaults() {
  OUTPUT_DIR="${CUSTOM_OUTPUT_DIR:-$DEFAULT_OUTPUT_DIR}"
  KEYWORD="${KEYWORD:-$DEFAULT_KEYWORD}"
  LOG_AGE="${LOG_AGE:-$DEFAULT_LOG_AGE}"
  LOG_SIZE_LIMIT="${LOG_SIZE_LIMIT:-$DEFAULT_LOG_SIZE_LIMIT}"

  # Both numeric settings must be integers >= 1.
  validate_numeric_input "$LOG_AGE" "-t" 1
  validate_numeric_input "$LOG_SIZE_LIMIT" "-s" 1
}
# Create the output directory, refusing to reuse an existing path.
# Globals (read): OUTPUT_DIR
# Exits the shell with status 1 if the path already exists or cannot be
# created.
init_output_dir() {
  local parent_dir

  if [[ -e "$OUTPUT_DIR" ]]; then
    echo "错误: 输出目录已存在: $OUTPUT_DIR" >&2
    exit 1
  fi

  # Parents are created with -p, but the leaf uses plain mkdir so creation
  # fails if something else created the path after the check above.
  parent_dir=$(dirname -- "$OUTPUT_DIR")
  if ! mkdir -p -- "$parent_dir" || ! mkdir -- "$OUTPUT_DIR"; then
    echo "错误: 无法创建输出目录: $OUTPUT_DIR" >&2
    exit 1
  fi
}
# Run a command string, capturing stdout and stderr in a log file.
# Arguments:
#   $1 - human-readable description, shown as a section title
#   $2 - command string (may include arguments)
#   $3 - file that receives the combined output
# Globals (read): DEBUG_MODE
# Returns: the command's exit status.
run_and_log() {
  local description="$1"
  local command="$2"
  local output_file="$3"
  local exit_code=0

  head_normal "$description"

  if [[ "${DEBUG_MODE:-0}" -eq 1 ]]; then
    log "执行命令: $command"
    log "输出到: $output_file"
  fi

  # Evaluate in a subshell of the current shell rather than a fresh
  # `bash -c`: a new bash process cannot see functions defined by the
  # sourced helper libraries, so function names would fail with 127.
  # The subshell still keeps `exit`/`cd` in the command from leaking out.
  ( eval "$command" ) > "$output_file" 2>&1 || exit_code=$?

  if (( exit_code == 124 )); then
    # 124 is the status timeout(1) uses when the time limit is hit.
    log "警告: 命令执行超时: $command"
  elif (( exit_code != 0 )); then
    log "警告: 命令执行失败 (退出码: $exit_code): $command"
  fi
  return "$exit_code"
}
# The output directory is created by init_output_dir(), which is called
# from the entry point after the options have been parsed. Creating it
# here as well made init_output_dir() abort with "output directory already
# exists", so the top-level mkdir has been removed.
# Collect driver diagnostics by running `drvdiag -c`.
# Globals (read): OUTPUT_DIR; DRVDIAG_BIN optionally overrides the binary
#                 path (default /opt/hyhal/bin/drvdiag).
# Output: $OUTPUT_DIR/drvdiag.log
# Returns: 1 if the drvdiag binary is not executable (driver not
#          installed), otherwise the status reported by run_and_log.
run_drvdiag() {
  local drv_path="${DRVDIAG_BIN:-/opt/hyhal/bin/drvdiag}"
  local drv_arg='-c'
  local out_file="$OUTPUT_DIR/drvdiag.log"

  if [[ ! -x "$drv_path" ]]; then
    echo "没有装驱动,请先安装驱动" >&2
    return 1
  fi

  run_and_log "收集驱动日志 (drvdiag -c)" "$drv_path $drv_arg" "$out_file"
}
# Gather device, package, system, log and PCIe information by calling the
# collectors provided by the sourced helper libraries, each exactly once.
collect_system_information() {
  log "开始收集系统信息..."
  get_dcu
  pkg_check
  collect_system_info
  collect_logs
  #analyze_errors
  get_pcie_info
}
# Run one analysis step in the current shell and keep its output.
# Arguments:
#   $1 - section title
#   $2 - file that receives the step's combined stdout/stderr
#   $3... - function or command to run, with its arguments
# Globals (read): QUIET_MODE
# Returns: the step's exit status.
_analyze_logs_step() {
  local title="$1"
  local out_file="$2"
  shift 2
  local rc=0

  head_normal "$title"
  "$@" > "$out_file" 2>&1 || rc=$?
  # Show the findings on the console too unless running quietly.
  if [[ "${QUIET_MODE:-0}" -eq 0 ]]; then
    cat -- "$out_file"
  fi
  return "$rc"
}

# Analyse the collected data, running each analysis once with the input
# file it needs. The analysers are shell functions from the sourced
# libraries, so they are invoked directly rather than via run_and_log.
# The driver-load, board and PCIe-speed checks are not repeated here;
# run_additional_checks performs them.
# Globals (read): OUTPUT_DIR
# Outputs: pcie_analysis.log, sme_analysis.log, kernel_analysis.log under
#          OUTPUT_DIR.
# Returns: 0 if every step succeeded, otherwise the status of the last
#          step that returned non-zero.
analyze_logs() {
  local status=0

  log "开始分析日志信息..."

  _analyze_logs_step "分析PCIe信息" "$OUTPUT_DIR/pcie_analysis.log" \
    pcie_check "$OUTPUT_DIR/pcie_info.log" || status=$?
  _analyze_logs_step "分析SME信息" "$OUTPUT_DIR/sme_analysis.log" \
    sme_check "$OUTPUT_DIR/dmesg.log" || status=$?
  _analyze_logs_step "分析驱动安装位置" "$OUTPUT_DIR/kernel_analysis.log" \
    kernel_check || status=$?

  return "$status"
}
# Run the standalone check scripts and store their output.
# Globals (read): OUTPUT_DIR, DRIVER_LOAD_CHECK, BOARD_CHECK,
#                 PCIE_SPEED_CHECK
run_additional_checks() {
  # Kept local so the probe result does not leak into the global scope.
  local product_name

  log "运行附加检查..."

  # Try drvdiag first; a missing driver is reported but is not fatal.
  run_drvdiag || log "跳过 drvdiag,驱动未安装或执行失败"

  run_and_log "检查驱动加载状态" "$DRIVER_LOAD_CHECK" "$OUTPUT_DIR/driver_status.log"
  run_and_log "检查硬件板卡信息" "$BOARD_CHECK" "$OUTPUT_DIR/board_check.log"

  # The PCIe speed check is skipped on the X785-H30 product, and also when
  # the product name cannot be read at all.
  if product_name=$(dmidecode -s system-product-name 2> /dev/null); then
    if [[ "$product_name" != "X785-H30" ]]; then
      run_and_log "检查PCIe速度" "$PCIE_SPEED_CHECK" "$OUTPUT_DIR/pcie_speed_check.log"
    fi
  else
    log "警告: 无法获取产品名称,跳过PCIe速度检查"
  fi
}
# Pack the output directory into "<OUTPUT_DIR>.tar.gz".
# Globals (read): OUTPUT_DIR
# The collected directory is deleted only after tar succeeded; on failure
# the raw data is kept and the script exits with status 1.
package_results() {
  local tar_file="${OUTPUT_DIR}.tar.gz"
  local parent_dir
  local base_name

  log "打包诊断数据..."

  # Archive relative to the parent so the tarball holds only the directory
  # name, not the full path leading to it.
  parent_dir=$(dirname -- "$OUTPUT_DIR")
  base_name=$(basename -- "$OUTPUT_DIR")

  if tar -czf "$tar_file" -C "$parent_dir" "$base_name" 2> /dev/null; then
    # ${var:?} aborts instead of running `rm -rf` on an empty path.
    rm -rf -- "${OUTPUT_DIR:?}"
    log "诊断完成! 文件已保存为: ${tar_file}"
    log "文件大小: $(du -h "$tar_file" | cut -f1)"
  else
    log "错误: 打包失败,原始数据保存在: $OUTPUT_DIR"
    exit 1
  fi
}
# Main workflow: print the effective settings, verify prerequisites,
# collect data, analyse it, run the extra checks and package the result.
# Device detection (get_dcu) is not invoked here because
# collect_system_information already runs it.
# Globals (read): OUTPUT_DIR, KEYWORD, LOG_AGE, LOG_SIZE_LIMIT
# Returns: the status of package_results.
main() {
  # Start-up banner with the effective settings.
  hline
  log "🚀 DCU系统诊断工具启动"
  log "📁 输出目录: $OUTPUT_DIR"
  log "🔍 关键字: $KEYWORD"
  log "⏰ 日志时间范围: ${LOG_AGE}小时"
  log "📊 日志大小限制: ${LOG_SIZE_LIMIT}MB"
  hline

  # Environment and helper-script checks.
  check_required_tools
  check_tool_scripts

  # Data collection.
  collect_system_information

  # Log analysis.
  echo
  hline
  log "📊 开始日志分析"
  hline
  analyze_logs

  # Additional standalone checks.
  run_additional_checks

  # Packaging.
  package_results
}
# Signal handler: discard the partially filled output directory, then
# terminate the script with status 1.
# Globals (read): OUTPUT_DIR
cleanup() {
  if [[ -n "$OUTPUT_DIR" && -d "$OUTPUT_DIR" ]]; then
    log "正在清理临时文件..."
    rm -rf "$OUTPUT_DIR"
  fi

  exit 1
}
# Remove partial output if the run is interrupted.
trap cleanup INT TERM

# Entry point: parse options, resolve defaults, create the output
# directory, run the workflow and exit with the workflow's status.
parse_arguments "$@"
set_defaults
init_output_dir
main
exit $?
\ No newline at end of file
tools/dcuopn_check.sh
View file @
138039a2
#!/usr/bin/bash
# Device table: model name -> PCI device ID.
declare -A devices_id=(
  ["Z100"]="54b7"
  ["Z100L"]="55b7"
  ["K100"]="62b7"
  ["K100-AI"]="6210"
  ["K100-AI-ECO"]="6211"
  ["BW1000"]="6320"
  ["BW"]="6320"
  ["BW1100"]="6430"
)

# Model names in priority order. Two names ("BW1000" and "BW") share ID
# 6320; iterating the associative array directly would let hash order pick
# the winner, so the reverse map is built from this list and the first name
# listed for an ID is kept.
# NOTE(review): "BW1000" is assumed to be the preferred label for 6320.
declare -a _devices_name_order=(
  "Z100" "Z100L" "K100" "K100-AI" "K100-AI-ECO" "BW1000" "BW" "BW1100"
)

# Reverse table: PCI device ID -> model name.
declare -A devices=()
for name in "${_devices_name_order[@]}"; do
  id="${devices_id[$name]}"
  if [[ -z "${devices[$id]:-}" ]]; then
    devices["$id"]="$name"
  fi
done
unset name id
# Detect DCU devices via lspci and report their models.
# Globals (read):    devices (device ID -> model name)
# Globals (written): dcu_list (detected device IDs, in lspci order),
#                    DEVICE_NAME / DEVICE_ID (first recognised device, or
#                    empty when none is recognised)
# Outputs: one line per device plus a per-model summary on stdout;
#          unrecognised IDs are reported on stderr.
# Returns: 1 if lspci is unavailable, otherwise 0.
get_dcu() {
  # lspci is required to enumerate the devices.
  if ! command -v lspci &> /dev/null; then
    echo "错误: lspci 命令未找到,请先安装 pciutils 包" >&2
    return 1
  fi

  # Take the LAST "[vvvv:dddd]" pair of each display/co-processor line.
  # Counting bracket fields breaks as soon as a vendor or device name
  # itself contains brackets.
  local id_re='.*\[[0-9a-fA-F]{4}:([0-9a-fA-F]{4})\]'
  local line
  dcu_list=()
  while IFS= read -r line; do
    if [[ "$line" =~ $id_re ]]; then
      dcu_list+=("${BASH_REMATCH[1],,}")
    fi
  done < <(lspci -nn | grep -i -E "display|co-processor")

  DEVICE_NAME=""
  DEVICE_ID=""

  # Nothing detected.
  if (( ${#dcu_list[@]} == 0 )); then
    echo "未检测到任何DCU设备"
    return 0
  fi

  echo "=== 检测到的DCU设备 ==="

  local -A device_count=()
  local dcu_num=0
  local index current_id model
  for index in "${!dcu_list[@]}"; do
    current_id="${dcu_list[$index]}"
    model="${devices[$current_id]:-}"
    if [[ -n "$model" ]]; then
      echo "DCU #${index}: ID ${current_id} → ${model}"
      dcu_num=$(( dcu_num + 1 ))
      # Remember the first recognised device for the global variables.
      if [[ -z "$DEVICE_ID" ]]; then
        DEVICE_NAME="$model"
        DEVICE_ID="$current_id"
      fi
    else
      echo "DCU #${index}: 未知设备ID: $current_id" >&2
      model="unknown"
    fi
    device_count["$model"]=$(( ${device_count["$model"]:-0} + 1 ))
  done

  # Summary: total of recognised cards, then the count per model.
  echo "=========================="
  echo "总计: ${dcu_num}张DCU设备"
  local dev_name
  for dev_name in "${!device_count[@]}"; do
    if [[ "$dev_name" != "unknown" ]]; then
      echo " - ${dev_name}: ${device_count[$dev_name]}张"
    fi
  done
  if (( ${device_count["unknown"]:-0} > 0 )); then
    echo " - 未知设备: ${device_count["unknown"]}张"
  fi
  return 0
}
get_dcu
tools/driver_load_check.sh
View file @
138039a2
#!/bin/bash
#huangjun@hygon.cn
#v0.2

# Regex matching the PCI vendor:device ID of the DCU devices handled here.
DEVICE_ID="1d94:(5|6)[0-9a-z]{3,3}"

# Device ID -> name used to build the file names this script looks for.
# Declared associative BEFORE any element is assigned: assigning pn[...]
# first turns pn into an indexed array and the later `declare -A` fails.
declare -A pn=(
  ["55b7"]="zifang"
  ["54b7"]="zifang"
  ["62b7"]="kongming"
  ["6210"]="zhongda"
  ["6211"]="zhongda"
  ["6320"]="bowen"
  ["6430"]="bowen"
)

# Full vendor:device ID of the first matching device (empty if none).
dev_id=$(lspci -nn | grep -oE "$DEVICE_ID" | head -n 1)

# Device part of the ID, e.g. 55b7.
dev_series=${dev_id#*:}

# Resolve the name. The lookup is skipped for an empty key because an
# empty associative-array subscript is an error in bash.
devname=""
if [[ -n "$dev_series" ]]; then
  devname=${pn[$dev_series]:-}
fi

if [ -z "$devname" ]; then
  echo "未识别的设备型号: $dev_id"
  echo "支持的设备型号: 54b7(zifang), 55b7(zifang), 62b7(kongming), 6210(zhongda), 6211(zhongda), 6320(bowen), 6430(bowen)"
  exit 1
fi

echo "===THIS SCRIPT JUST FOR 5.16.21 5.2 V1.10 and later==="
function
ko_is_loaded
()
...
...
@@ -71,8 +90,8 @@ function _find_ucode_in_path()
local
tc
=
0
local
cnt
=
0
au
=
"
$(
find
$p
-name
${
devname
}
_
$
u
.bin 2> /dev/null
)
"
tc
=
$(
find
$p
-name
${
devname
}
_
$
u
.bin 2> /dev/null |
wc
-l
)
au
=
"
$(
find
$p
-name
"
${
devname
}
_
$
{
u
}
.bin
"
2> /dev/null
)
"
tc
=
$(
find
$p
-name
"
${
devname
}
_
$
{
u
}
.bin
"
2> /dev/null |
wc
-l
)
cnt
=
$((
$cnt
+
$tc
))
echo
$au
return
$cnt
...
...
@@ -123,7 +142,7 @@ function check_ucode()
function
check_ko
()
{
local
kos
=
"hy
d
cu.ko hy
d
cu-sched.ko hydrm_ttm_helper.ko hy-extra.ko hykcl.ko hyttm.ko"
local
kos
=
"hycu.ko hycu-sched.ko hydrm_ttm_helper.ko hy-extra.ko hykcl.ko hyttm.ko"
local dir
=
"/opt/hyhal/dkms/"
local
ret
=
...
...
@@ -172,10 +191,10 @@ check_vfio_pci
#2
check_ko
#3
check_ucode
#
check_ucode
#4
check_system_cap
#5
check_cuser_if_video
#
check_cuser_if_video
echo
"驱动检查结束,没有发现明显问题"
tools/hcu_info.sh
100644 → 100755
View file @
138039a2
File mode changed from 100644 to 100755
tools/hydcutune
100644 → 100755
View file @
138039a2
File mode changed from 100644 to 100755
tools/pcie_check.sh
View file @
138039a2
...
...
@@ -68,22 +68,60 @@ analyze_regions() {
# 提取Region关键参数
address
=
$(
echo
"
$1
"
|
awk
'/Memory at/ {print $5}'
)
#
判定逻辑实现
#
PCIe BAR地址状态检查
if
[[
"
$address
"
==
"unassigned"
]]
;
then
echo
"[ERROR] Bar地址未分配,需要检查卡的状态(物理连接或供电异常)"
echo
"建议操作:执行'lspci -vvv'确认设备识别状态}"
echo
"[ERROR CODE 1] PCIe BAR地址未分配"
echo
"可能原因:"
echo
" 1. DCU卡未正确插入或供电不足"
echo
" 2. 主板PCIe插槽故障"
echo
" 3. 驱动未正确加载"
echo
"解决方案:"
echo
" 1. 检查DCU卡物理连接状态"
echo
" 2. 执行 'lspci -vvv | grep -A 10
\"
$DEVICE_ID
\"
' 确认设备识别状态"
echo
" 3. 检查dmesg日志中的PCIe相关错误"
return
1
elif
[[
`
echo
$address
|
wc
-c
`
-gt
12
]]
;
then
echo
"[WARNING] Bar地址超出44bit(当前地址:0x
${
address
}
)"
echo
"解决方案:调整BIOS的MMIO High Base < 16T}"
echo
"[ERROR CODE 2] PCIe BAR地址超出44bit范围 (当前地址: 0x
${
address
}
)"
echo
"可能影响:"
echo
" - 可能导致DMA操作失败"
echo
" - 系统内存空间不足"
echo
"解决方案:"
echo
" 1. 进入BIOS设置"
echo
" 2. 找到 'MMIO High Base' 选项"
echo
" 3. 设置为小于16T的值 (如 8T)"
echo
" 4. 保存设置并重启系统"
return
2
fi
if
[[
"
$address
"
==
"<ignored>"
]]
;
then
echo
"[ERROR] 获取不到bar地址"
echo
"修复建议:检查/proc/cmdline是否包含'pcie=realloc'配置"
grep
-q
"pcie=realloc"
/proc/cmdline
||
echo
" 当前配置:
$(
cat
/proc/cmdline
)
"
elif
[[
"
$address
"
==
"<ignored>"
]]
;
then
echo
"[ERROR CODE 3] 无法获取有效BAR地址"
echo
"可能原因:"
echo
" - 内核未启用PCIe地址重分配功能"
echo
"解决方案:"
echo
" 1. 检查当前内核启动参数:"
grep
-q
"pcie=realloc"
/proc/cmdline
||
echo
" 当前配置:
$(
cat
/proc/cmdline
)
"
echo
" 2. 在GRUB配置中添加 'pcie=realloc' 参数"
echo
" 3. 更新GRUB配置并重启:"
echo
" - Debian/Ubuntu: update-grub"
echo
" - RHEL/CentOS: grub2-mkconfig -o /boot/grub2/grub.cfg"
return
3
fi
# 检查Region 0地址是否在保留区间 (0xf000000000-0xffffffffff)
if
[[
"
$1
"
==
*
"Region 0"
*
]]
;
then
if
[[
"
$address
"
=
~ ^f[0-9a-f]
{
11
}
$
]]
;
then
local
dev_id
=
$(
echo
"
$1
"
|
grep
-oP
'Device \K[0-9a-f:]+'
)
echo
"[ERROR CODE 4] PCIe BAR地址冲突 - 卡
${
dev_id
}
(地址: 0x
${
address
}
)"
echo
"问题描述:"
echo
" - Region 0地址位于保留区间 (0xf000000000-0xffffffffff)"
echo
" - 可能与系统保留内存区域冲突"
echo
"解决方案:"
echo
" 1. 进入BIOS设置"
echo
" 2. 找到 'PCIe Memory Mapped IO' 或 'MMIO Configuration'"
echo
" 3. 调整MMIO分配范围,避开 0xf000000000-0xffffffffff"
echo
" 4. 如无相关选项,可能需要更新BIOS版本"
return
4
fi
fi
echo
"PCIe 状态正常"
return
0
}
...
...
tools/pkg_check.sh
View file @
138039a2
#!/bin/bash

# Check that the commands the diagnostic scripts rely on are installed and
# print the package-manager command that installs whatever is missing.
# Outputs: one status line per command, then a summary, on stdout.
# Returns: 0 if every command is present; 1 if something is missing or no
#          supported package manager (apt-get, yum, dnf) is found.
pkg_check() {
  # The three arrays are parallel: cmd[i] is provided by pkgs_*[i].
  local -a pkgs_debian=("dmidecode" "lshw" "pciutils" "numactl-dev" "ipmitool" "mlocate")
  local -a pkgs_centos=("dmidecode" "lshw" "pciutils" "numactl-devel" "ipmitool" "mlocate")
  local -a cmd=("dmidecode" "lshw" "lspci" "numactl" "ipmitool" "locate")
  local -a missing_pkgs=()
  local -a missing_cmds=()
  local -a distro_pkgs=()
  local package_manager=""
  local i

  # Pick the package list that matches the available package manager.
  if command -v apt-get &> /dev/null; then
    package_manager="apt-get"
    distro_pkgs=("${pkgs_debian[@]}")
  elif command -v yum &> /dev/null; then
    package_manager="yum"
    distro_pkgs=("${pkgs_centos[@]}")
  elif command -v dnf &> /dev/null; then
    package_manager="dnf"
    distro_pkgs=("${pkgs_centos[@]}")
  else
    echo "错误: 未检测到支持的包管理器 (apt-get, yum, dnf)"
    return 1
  fi

  echo "=== 检查系统命令依赖 ==="

  # Check every command and collect all that are missing, instead of
  # stopping at the first one.
  for (( i = 0; i < ${#cmd[@]}; i++ )); do
    if ! command -v "${cmd[i]}" &> /dev/null; then
      missing_cmds+=("${cmd[i]}")
      missing_pkgs+=("${distro_pkgs[i]}")
      echo "[缺失] ${cmd[i]} → 需要安装: ${distro_pkgs[i]}"
    else
      echo "[已安装] ${cmd[i]}"
    fi
  done

  # Report the result.
  if (( ${#missing_pkgs[@]} == 0 )); then
    echo "所有依赖命令均已安装 ✓"
    return 0
  fi

  echo "============================="
  echo "缺失的命令: ${missing_cmds[*]}"
  echo "需要安装的包: ${missing_pkgs[*]}"
  echo ""
  echo "安装命令:"
  echo "  $package_manager install -y ${missing_pkgs[*]}"
  echo ""
  echo "请安装上述包后重新运行脚本"
  return 1
}
pkg_check
\ No newline at end of file
tools/sys_log.sh
View file @
138039a2
#!/usr/bin/env bash

# Copy a log file unless it is larger than a size limit.
# Arguments:
#   $1 - source log file
#   $2 - destination file; on every failure path a one-line explanation is
#        written there instead of the log
#   $3 - size limit in MB
# Returns: 0 copied, 1 source missing/unreadable or copy failed,
#          2 skipped because the file exceeds the limit.
copy_log_with_limit() {
  local src="$1"
  local dest="$2"
  local size_limit_mb="$3"
  local size_bytes
  local file_size

  if [[ ! -f "$src" ]]; then
    echo "日志文件不存在: $src" > "$dest"
    return 1
  fi

  # Check read permission.
  if [[ ! -r "$src" ]]; then
    echo "无权限读取日志文件: $src" > "$dest"
    return 1
  fi

  # Size in MB, rounded up, from the byte count. An unreadable or empty
  # result counts as 0 so the numeric comparison below cannot fail.
  size_bytes=$(wc -c < "$src" 2> /dev/null) || size_bytes=0
  size_bytes=${size_bytes//[!0-9]/}
  file_size=$(( (10#${size_bytes:-0} + 1048575) / 1048576 ))

  # The limit is enforced only when it is a plain decimal number.
  if [[ "$size_limit_mb" =~ ^[0-9]+$ ]] && (( file_size > 10#$size_limit_mb )); then
    log_warning "跳过大文件: $src (${file_size}MB > ${size_limit_mb}MB)"
    echo "[日志文件超过大小限制未采集 - ${file_size}MB > ${size_limit_mb}MB]" > "$dest"
    return 2
  fi

  # Plain copy; ownership and mode are not preserved.
  if cp -- "$src" "$dest" 2> /dev/null; then
    log "成功复制日志: $src -> $dest (${file_size}MB)"
    return 0
  fi

  echo "复制日志文件失败: $src" > "$dest"
  return 1
}
# Collect the log files of one application into $OUTPUT_DIR/app_logs.
# Arguments: $1 - application name, used to build the search patterns
# Globals (read): OUTPUT_DIR, LOG_SIZE_LIMIT
# Each matching file is copied as "<app>_<basename>" through
# copy_log_with_limit, so the size limit applies.
# Returns: 1 if the target directory cannot be created, otherwise 0.
collect_app_logs() {
  local app_name="$1"
  local app_log_dir="${OUTPUT_DIR}/app_logs"
  local -a log_patterns=(
    "/var/log/${app_name}/*.log"
    "/var/log/${app_name}.log"
    "/opt/${app_name}/logs/*.log"
  )
  local pattern
  local log_file

  mkdir -p -- "$app_log_dir" || return 1

  for pattern in "${log_patterns[@]}"; do
    # $pattern is left unquoted on purpose so the shell expands the glob.
    # An unmatched pattern stays literal and is dropped by the -f test.
    for log_file in $pattern; do
      if [[ -f "$log_file" ]]; then
        copy_log_with_limit "$log_file" \
          "${app_log_dir}/${app_name}_${log_file##*/}" "$LOG_SIZE_LIMIT"
      fi
    done
  done
  return 0
}
# Collect system, kernel and application logs into $OUTPUT_DIR/logs.
# Globals (read): OUTPUT_DIR, LOG_AGE (hours), LOG_SIZE_LIMIT (MB)
# Every file copy goes through copy_log_with_limit. journalctl is used
# only when neither syslog nor messages was collected.
collect_logs() {
  log "收集系统日志(最近 ${LOG_AGE} 小时, 大小限制: ${LOG_SIZE_LIMIT}MB)..."

  local syslog_dir="${OUTPUT_DIR}/logs"
  local log_file
  mkdir -p "$syslog_dir"

  # Common system log files.
  local -a system_logs=(
    "/var/log/syslog"
    "/var/log/messages"
    "/var/log/kern.log"
    "/var/log/dmesg"
    "/var/log/boot.log"
  )
  for log_file in "${system_logs[@]}"; do
    if [[ -f "$log_file" ]]; then
      copy_log_with_limit "$log_file" "${syslog_dir}/${log_file##*/}" "$LOG_SIZE_LIMIT"
    fi
  done

  # Fall back to journalctl when no classic syslog file was collected.
  if [[ ! -f "${syslog_dir}/syslog" && ! -f "${syslog_dir}/messages" ]]; then
    log "使用journalctl收集系统日志..."
    if command -v journalctl > /dev/null 2>&1; then
      if journalctl --since "${LOG_AGE} hours ago" > "${syslog_dir}/journalctl.log" 2> /dev/null; then
        log_success "journalctl日志收集成功"
      else
        log_warning "journalctl日志收集失败"
        echo "无法获取journalctl日志" > "${syslog_dir}/journalctl.log"
      fi
    else
      log_warning "journalctl命令不存在"
      echo "journalctl命令不可用" > "${syslog_dir}/journalctl.log"
    fi
  fi

  # Kernel ring buffer.
  log "收集dmesg日志..."
  if command -v dmesg > /dev/null 2>&1; then
    if dmesg -T > "${syslog_dir}/dmesg.log" 2>&1; then
      log_success "dmesg日志收集成功"
    else
      # Retry without -T in case human-readable timestamps are unsupported.
      dmesg > "${syslog_dir}/dmesg.log" 2>&1 || {
        log_warning "dmesg日志收集失败"
        echo "无法获取dmesg日志" > "${syslog_dir}/dmesg.log"
      }
    fi
  else
    log_warning "dmesg命令不存在"
    echo "dmesg命令不可用" > "${syslog_dir}/dmesg.log"
  fi
  # Keep a copy at the top level of the output directory as well, for
  # consumers that read "$OUTPUT_DIR/dmesg.log".
  cp -- "${syslog_dir}/dmesg.log" "${OUTPUT_DIR}/dmesg.log" 2> /dev/null

  # Application logs (DCU related; "nvidia" is kept for compatibility).
  collect_app_logs "dcu"
  collect_app_logs "hydcu"
  collect_app_logs "nvidia"

  # Other logs that are often useful.
  local -a other_logs=(
    "/var/log/secure"
    "/var/log/auth.log"
    "/var/log/yum.log"
    "/var/log/apt/history.log"
  )
  for log_file in "${other_logs[@]}"; do
    if [[ -f "$log_file" ]]; then
      copy_log_with_limit "$log_file" "${syslog_dir}/${log_file##*/}" "$LOG_SIZE_LIMIT"
    fi
  done

  log_success "系统日志收集完成"
}
\ No newline at end of file
tools/utils.sh
View file @
138039a2
#!/usr/bin/env bash

# ANSI colour escape sequences used by the logging helpers.
# Each constant is defined only if it is not set yet, so sourcing this
# file more than once does not fail on a readonly re-assignment.
[[ -n "${RED+x}" ]] || readonly RED='\033[0;31m'
[[ -n "${GREEN+x}" ]] || readonly GREEN='\033[0;32m'
[[ -n "${YELLOW+x}" ]] || readonly YELLOW='\033[1;33m'
[[ -n "${BLUE+x}" ]] || readonly BLUE='\033[0;34m'
[[ -n "${NC+x}" ]] || readonly NC='\033[0m'  # No Color
# Coloured logging helpers.
# Globals (read): QUIET_MODE (treated as 0 when unset), BLUE, GREEN,
#                 YELLOW, RED, NC
# The message is printed with %s, so backslashes in it (paths, regexes)
# are shown literally; only the colour codes are escape-expanded.

# Informational message on stdout; suppressed in quiet mode.
log() {
  if [[ "${QUIET_MODE:-0}" -eq 0 ]]; then
    printf '%b[INFO]%b %s\n' "${BLUE:-}" "${NC:-}" "$*"
  fi
}

# Success message on stdout; suppressed in quiet mode.
log_success() {
  if [[ "${QUIET_MODE:-0}" -eq 0 ]]; then
    printf '%b[SUCCESS]%b %s\n' "${GREEN:-}" "${NC:-}" "$*"
  fi
}

# Warning on stderr; always shown.
log_warning() {
  printf '%b[WARNING]%b %s\n' "${YELLOW:-}" "${NC:-}" "$*" >&2
}

# Error on stderr; always shown.
log_error() {
  printf '%b[ERROR]%b %s\n' "${RED:-}" "${NC:-}" "$*" >&2
}
# Print a command, run it once and report whether it succeeded.
# Arguments: $1 - command string, evaluated in the current shell
# Globals (read): BLUE, NC
# Returns: the command's exit status.
echoAndRun() {
  local cmd="$1"
  local exit_code=0

  hline
  echo -e "${BLUE}[执行命令]${NC} $cmd"
  echo

  # The command is evaluated exactly once.
  eval "$cmd" || exit_code=$?
  echo

  if (( exit_code == 0 )); then
    log_success "命令执行成功"
  else
    log_warning "命令执行失败,退出码: $exit_code"
  fi
  return "$exit_code"
}
# Print a horizontal separator: 60 '=' characters and a newline.
hline() {
  local line
  # Build 60 spaces, then turn every space into '='.
  printf -v line '%*s' 60 ''
  printf '%s\n' "${line// /=}"
}
# Print a section title: a blank line, then the title framed by two
# separator lines.
# Arguments: $1 - title text (printed literally)
# Globals (read): BLUE, NC
head_normal() {
  echo
  hline
  printf '%b%s%b\n' "${BLUE:-}" "$1" "${NC:-}"
  hline
}
# Report whether a command name can be resolved by the shell.
# Arguments: $1 - command name
# Returns: 0 if `command -v` finds it, 1 otherwise. Prints nothing.
command_exists() {
  if command -v "$1" > /dev/null 2>&1; then
    return 0
  fi
  return 1
}
# Run a command string; on failure log a warning instead of aborting.
# Arguments:
#   $1 - command string, evaluated in the current shell
#   $2 - description used in the log lines (default: 执行命令)
# Returns: 0 if the command succeeded, 1 otherwise.
safe_run() {
  local cmd="$1"
  local description="${2:-执行命令}"

  log "$description: $cmd"

  if eval "$cmd"; then
    return 0
  fi

  log_warning "$description 失败,继续执行..."
  return 1
}
# Progress indicators.
# Globals (read): QUIET_MODE (treated as 0 when unset), BLUE, GREEN, RED, NC
# Messages are printed with %s, so backslashes in them stay literal.

# Show an "in progress" marker without a trailing newline, so that
# progress_done / progress_failed can overwrite the same line.
# Suppressed in quiet mode.
progress() {
  local message="$1"
  if [[ "${QUIET_MODE:-0}" -eq 0 ]]; then
    printf '%b[...]%b %s' "${BLUE:-}" "${NC:-}" "$message"
  fi
}

# Replace the current line with a success marker. Suppressed in quiet mode.
progress_done() {
  if [[ "${QUIET_MODE:-0}" -eq 0 ]]; then
    printf '\r%b[✓]%b %s\n' "${GREEN:-}" "${NC:-}" "$1"
  fi
}

# Replace the current line with a failure marker, on stderr. Always shown.
progress_failed() {
  printf '\r%b[✗]%b %s\n' "${RED:-}" "${NC:-}" "$1" >&2
}
# Check that a path is an existing regular file.
# Arguments: $1 - path to check
# Returns: 0 if it is a regular file; otherwise logs an error and
#          returns 1.
check_file() {
  local file="$1"

  [ -f "$file" ] && return 0

  log_error "文件不存在: $file"
  return 1
}
# Check that a path is an existing directory.
# Arguments: $1 - path to check
# Returns: 0 if it is a directory; otherwise logs an error and returns 1.
check_dir() {
  local dir="$1"

  if [[ ! -d "$dir" ]]; then
    log_error "目录不存在: $dir"
    return 1
  fi
  return 0
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment