Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
DcuProfileTools
Commits
49486f80
Commit
49486f80
authored
Mar 06, 2026
by
chengshunyan
Browse files
add init
parents
Changes
43
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
784 additions
and
0 deletions
+784
-0
env_check/dcu_env_check-main/README.md
env_check/dcu_env_check-main/README.md
+16
-0
env_check/dcu_env_check-main/system_check.sh
env_check/dcu_env_check-main/system_check.sh
+113
-0
env_check/dcu_env_check-main/tools/board_check.sh
env_check/dcu_env_check-main/tools/board_check.sh
+39
-0
env_check/dcu_env_check-main/tools/dcuopn_check.sh
env_check/dcu_env_check-main/tools/dcuopn_check.sh
+43
-0
env_check/dcu_env_check-main/tools/driver_install.check.sh
env_check/dcu_env_check-main/tools/driver_install.check.sh
+12
-0
env_check/dcu_env_check-main/tools/driver_load_check.sh
env_check/dcu_env_check-main/tools/driver_load_check.sh
+181
-0
env_check/dcu_env_check-main/tools/hydcutune
env_check/dcu_env_check-main/tools/hydcutune
+0
-0
env_check/dcu_env_check-main/tools/hyflash
env_check/dcu_env_check-main/tools/hyflash
+0
-0
env_check/dcu_env_check-main/tools/kernel_check.sh
env_check/dcu_env_check-main/tools/kernel_check.sh
+16
-0
env_check/dcu_env_check-main/tools/log_analyze.sh
env_check/dcu_env_check-main/tools/log_analyze.sh
+24
-0
env_check/dcu_env_check-main/tools/pcie_check.sh
env_check/dcu_env_check-main/tools/pcie_check.sh
+104
-0
env_check/dcu_env_check-main/tools/pcie_speed_check.sh
env_check/dcu_env_check-main/tools/pcie_speed_check.sh
+32
-0
env_check/dcu_env_check-main/tools/pkg_check.sh
env_check/dcu_env_check-main/tools/pkg_check.sh
+22
-0
env_check/dcu_env_check-main/tools/sme_check.sh
env_check/dcu_env_check-main/tools/sme_check.sh
+16
-0
env_check/dcu_env_check-main/tools/sys_info.sh
env_check/dcu_env_check-main/tools/sys_info.sh
+33
-0
env_check/dcu_env_check-main/tools/sys_log.sh
env_check/dcu_env_check-main/tools/sys_log.sh
+43
-0
env_check/dcu_env_check-main/tools/utils.sh
env_check/dcu_env_check-main/tools/utils.sh
+26
-0
env_check/dcu_env_check-main/tools/vbios_check.sh
env_check/dcu_env_check-main/tools/vbios_check.sh
+6
-0
env_check/dcu_env_check-main/tools/vfio_ffff_error.sh
env_check/dcu_env_check-main/tools/vfio_ffff_error.sh
+12
-0
env_check/utils.sh
env_check/utils.sh
+46
-0
No files found.
env_check/dcu_env_check-main/README.md
0 → 100644
View file @
49486f80
# dcu_env_check
目前支持的功能分为日志获取和日志分析两部分,
1) 日志获取
*
获取硬件信息
*
获取系统信息
*
获取OS日志
2) 日志故障分析
*
驱动内核不匹配问题分析
*
PCIe bar地址故障分析
*
sme 故障分析
*
PCIe 链路故障分析
*
驱动模块检查分析
*
PCIe speed故障分析
*
主板原因导致的掉卡问题
env_check/dcu_env_check-main/system_check.sh
0 → 100755
View file @
49486f80
#!/bin/bash
#set -x
.
./tools/utils.sh
.
./tools/pkg_check.sh
.
./tools/dcuopn_check.sh
.
./tools/sys_info.sh
.
./tools/sys_log.sh
.
./tools/pcie_check.sh
.
./tools/kernel_check.sh
.
./tools/sme_check.sh
.
./tools/log_analyze.sh
# 默认参数配置
DEFAULT_OUTPUT_DIR
=
"system_info_
$(
date
+%Y%m%d_%H%M%S
)
"
DEFAULT_KEYWORD
=
"hydcu"
DEFAULT_LOG_AGE
=
24
# 小时
DEFAULT_LOG_SIZE_LIMIT
=
10
# 单位:MB
QUIET_MODE
=
0
DEBUG_MODE
=
0
DEVICE_NAME
=
""
DEVICE_ID
=
""
# 显示帮助信息
show_help
()
{
echo
"Usage:
$0
[OPTIONS]"
echo
"系统诊断脚本 - 收集系统信息并检测驱动问题"
echo
echo
"选项:"
echo
" -o DIR 指定输出目录 (默认: 自动生成)"
echo
" -k KEYWORD 设置检测关键字 (默认:
$DEFAULT_KEYWORD
)"
echo
" -t HOURS 收集日志的时间范围(小时) (默认: 24)"
echo
" -s SIZE 日志文件大小限制(MB) (默认: 10)"
echo
" -q 静默模式(仅输出错误)"
echo
" -d 调试模式"
echo
" -h 显示此帮助信息"
echo
echo
"示例:"
echo
"
$0
-o /tmp/logs -k buserr -t 48"
}
# 解析参数
while
getopts
"o:k:t:s:qdh"
opt
;
do
case
$opt
in
o
)
CUSTOM_OUTPUT_DIR
=
"
$OPTARG
"
;;
k
)
KEYWORD
=
"
$OPTARG
"
;;
t
)
LOG_AGE
=
"
$OPTARG
"
;;
s
)
LOG_SIZE_LIMIT
=
"
$OPTARG
"
;;
q
)
QUIET_MODE
=
1
;;
d
)
DEBUG_MODE
=
1
;
set
-x
;;
h
)
show_help
;
exit
0
;;
\?
)
echo
"无效选项: -
$OPTARG
"
>
&2
;
exit
1
;;
:
)
echo
"选项 -
$OPTARG
需要参数"
>
&2
;
exit
1
;;
esac
done
# 设置默认值
:
${
OUTPUT_DIR
:
=
${
CUSTOM_OUTPUT_DIR
:-
$DEFAULT_OUTPUT_DIR
}}
:
${
KEYWORD
:
=
$DEFAULT_KEYWORD
}
:
${
LOG_AGE
:
=
$DEFAULT_LOG_AGE
}
:
${
LOG_SIZE_LIMIT
:
=
$DEFAULT_LOG_SIZE_LIMIT
}
# 初始化目录
mkdir
-p
"
$OUTPUT_DIR
"
||
exit
1
# 主流程
main
()
{
hline
echo
-e
'################ 日志收集 ##################'
hline
get_dcu
pkg_check
collect_system_info
collect_logs
#analyze_errors
get_pcie_info
echo
-e
'\n###### 日志分析 #######'
hline
head_normal
"分析pcie信息"
pcie_check
$OUTPUT_DIR
/pcie_info.log
head_normal
"分析sme信息"
sme_check
$OUTPUT_DIR
/dmesg.log
head_normal
"分析驱动安装位置"
kernel_check
./tools/driver_load_check.sh
>
$OUTPUT_DIR
/driver_status.log
./tools/board_check.sh
>
$OUTPUT_DIR
/board_check.log
product_name
=
`
dmidecode
-s
system-product-name
`
if
[
"
$product_name
"
!=
"X785-H30"
]
;
then
./tools/pcie_speed_check.sh
>
$OUTPUT_DIR
/pcie_speek_check.log
fi
local
status
=
$?
#打包结果
log
"打包诊断数据..."
tar
-czf
"
${
OUTPUT_DIR
}
.tar.gz"
"
$OUTPUT_DIR
"
2>/dev/null
rm
-rf
"
$OUTPUT_DIR
"
log
"诊断文件已保存为:
${
OUTPUT_DIR
}
.tar.gz"
return
$status
}
# 执行主程序
main
exit
$?
env_check/dcu_env_check-main/tools/board_check.sh
0 → 100755
View file @
49486f80
#!/bin/bash
# 检查是否具有root权限
if
[
"
$EUID
"
-ne
0
]
;
then
echo
"错误:该脚本需要root权限运行(请使用sudo执行)"
>
&2
exit
1
fi
# 检查dmidecode命令是否存在
if
!
command
-v
dmidecode &> /dev/null
;
then
echo
"错误:未找到dmidecode命令,请先安装dmidecode工具"
>
&2
exit
1
fi
# 获取主板序列号
baseboard_SN
=
$(
dmidecode
-t
2 |
grep
-i
"Serial Number"
|
awk
'{print $3}'
)
# 检查是否成功获取序列号
if
[
-z
"
$baseboard_SN
"
]
;
then
echo
"错误:无法获取主板序列号"
>
&2
exit
1
fi
# 型号判断逻辑
case
$baseboard_SN
in
*
AS
*
)
echo
"检测到主板型号:[
${
baseboard_SN
}
] 太老,满负载情况会出现掉卡"
>
&2
exit
1
;;
*
BH
*
)
echo
"检测到主板型号:[
${
board_model
}
] 符合要求"
exit
0
;;
*
)
echo
"未知主板型号,需要进一步查看"
exit
2
;;
esac
env_check/dcu_env_check-main/tools/dcuopn_check.sh
0 → 100755
View file @
49486f80
#!/usr/bin/bash
declare
-A
devices_id
=(
[
"Z100"
]=
"54b7"
[
"Z100L"
]=
"55b7"
[
"K100"
]=
"62b7"
[
"K100-AI"
]=
"6210"
[
"K100-AI-ECO"
]=
"6211"
[
"BW1000"
]=
"6320"
)
# 构建反向映射表(设备ID → 设备名称)
declare
-A
devices
for
name
in
"
${
!devices_id[@]
}
"
;
do
id
=
"
${
devices_id
[
$name
]
}
"
devices[
"
${
id
}
"
]
+
=
"
$name
"
done
get_dcu
()
{
# 获取设备ID列表
mapfile
-t
dcu_list < <
(
lspci
-nn
|
grep
-i
-E
"display|co-processor"
|
awk
-F
'[][]'
'{print $4}'
|
awk
-F
":"
'{print $2}'
)
local
index
=
0
local
dcu_num
=
0
local
total
=
${#
dcu_list
[@]
}
while
[
$index
-lt
$total
]
;
do
current_id
=
"
${
dcu_list
[
$index
]
}
"
if
[
-n
"
${
devices
[
$current_id
]
}
"
]
;
then
echo
"dcu #
$dcu_num
型号为:
${
devices
[
$current_id
]
}
"
((
dcu_num++
))
else
echo
"未知设备ID:
$current_id
"
>
&2
fi
((
index++
))
done
echo
"总计:
$dcu_num
张
${
devices
[
$current_id
]
}
DCU 设备"
DEVICE_NAME
=
${
devices
[
$current_id
]
}
DEVICE_ID
=
$current_id
# echo $DEVICE_NAME $DEVICE_ID
}
env_check/dcu_env_check-main/tools/driver_install.check.sh
0 → 100755
View file @
49486f80
#!/bin/bash
#获取/tmp的可用空间大小
available_tmp
=
$(
df
-k
/tmp |
awk
'NR==2 {print $4}'
)
available_tmp_gb
=
$((
available_tmp
/
1024
/
1024
))
Error_size
=
4
if
[
$available_tmp_gb
-lt
$Error_size
]
;
then
echo
"/tmp size is too smail, should be large than
$Error_size
GB!!"
exit
1
fi
env_check/dcu_env_check-main/tools/driver_load_check.sh
0 → 100755
View file @
49486f80
#!/bin/bash
#huangjun@hygon.cn
#v0.2
DEVICE_ID
=
"1d94:(5|6)[0-9a-z]{3,3}"
pn[
"5"
]=
"zifang"
pn[
"6"
]=
"kongming"
dev
=(
$(
lspci
-nn
|
grep
-oE
"1d94:(5|6)[0-9a-z]{3,3}"
|
awk
-F
:
'{print $2}'
|
grep
-o
[
56]
)
)
devname
=
${
pn
[
${
dev
[0]
}
]
}
echo
"===THIS SCRIPT JUST FOR 5.16.21 5.2 V1.10 and later==="
function
ko_is_loaded
()
{
local
st
=
$(
lsmod |
grep
"
\<
$1
\>
"
)
local
ret
=
yes
if
[
"
$st
"
=
""
]
;
then
ret
=
"no"
fi
echo
$ret
}
function
have_mod
()
{
echo
"
$(
modinfo
$1
)
"
}
function
check_iommu
()
{
if
[
"
$(
ko_is_loaded iommu_v2
)
"
=
"yes"
]
;
then
echo
"use iommu_v2, ready"
return
0
fi
if
[
"
$(
ko_is_loaded amd_iommu_v2
)
"
=
"yes"
]
;
then
echo
"use amd_iommu_v2, ready"
return
0
fi
if
[
"
$(
have_mod iommu_v2
)
"
!=
""
]
;
then
echo
"have iommu_v2 in disk, but not loaded"
return
-1
fi
if
[
"
$(
have_mod amd_iommu_v2
)
"
!=
""
]
;
then
echo
"have amd_iommu_v2 in disk, but not loaded"
return
-2
fi
echo
"no iommu driver on this system"
return
-3
}
function
check_vfio_pci
()
{
if
[
"
$(
ko_is_loaded vfio-pci
)
"
=
"yes"
]
;
then
echo
"Some device have attach to VM"
echo
"pls check it"
fi
}
function
_have_read_perm
()
{
[[
-r
$1
]]
&&
echo
"yes"
}
function
_find_ucode_in_path
()
{
local
u
=
$2
local
p
=
$1
local
au
=
""
local
tc
=
0
local
cnt
=
0
au
=
"
$(
find
$p
-name
${
devname
}
_
$u
.bin 2> /dev/null
)
"
tc
=
$(
find
$p
-name
${
devname
}
_
$u
.bin 2> /dev/null |
wc
-l
)
cnt
=
$((
$cnt
+
$tc
))
echo
$au
return
$cnt
}
function
check_ucode
()
{
local
ucodes
=
"sdma sdma1 mec mec2 rlc smu"
local
v
=
$(
uname
-r
)
local
paths
=
"/lib/firmware/updates/
$v
/lib/firmware/updates/ /lib/firmware/
$v
"
local
cnt
=
0
local
au
=
""
local
e
=
local
u
local
p
local
rp
for
u
in
$ucodes
;
do
for
p
in
$paths
;
do
au
=
"
$au
$(
_find_ucode_in_path
$p
$u
)
"
cnt
=
$((
$cnt
+
$?
))
done
if
[[
$cnt
-gt
1
]]
;
then
echo
"our firmware is local:[/lib/firmware/
$v
]"
echo
"pls rmove the other firmware."
echo
"all:[
$au
]"
e
=
"yes"
fi
if
[
"
$cnt
"
=
"0"
]
;
then
echo
"no
${
devname
}
_
$u
.bin found! pls reinstall driver."
e
=
"yes"
fi
for
p
in
$au
;
do
local
r
=
$(
_have_read_perm
$p
)
if
[
"
$r
"
!=
"yes"
]
;
then
echo
"no read perm on firmware:
$p
"
e
=
"yes"
fi
done
au
=
""
cnt
=
0
done
if
[
"
$e
"
=
"yes"
]
;
then
exit
-1
fi
echo
"firmware, ready"
}
function
check_ko
()
{
local
kos
=
"hydcu.ko hydcu-sched.ko hydrm_ttm_helper.ko hy-extra.ko hykcl.ko hyttm.ko"
local dir
=
"/opt/hyhal/dkms/"
local
ret
=
for
k
in
$kos
;
do
local
r
=
$(
_have_read_perm
$dir
/
$k
)
if
[
"
$r
"
!=
"yes"
]
;
then
ret
=
"
$ret
$k
"
fi
done
if
[
"
$ret
"
!=
""
]
;
then
echo
"no driver installed or loss read perm"
echo
"pls check[
$kos
] in
$dir
"
exit
-1
fi
echo
"dcu ko, ready"
}
function
check_cuser_if_video
()
{
local
r
=
$(
cat
/etc/group |
grep
video |
grep
$USER
)
if
[
"
$r
"
=
""
]
;
then
echo
"you should add user:
$USER
to video group. sudo usermod -aG video
$USER
"
exit
-1
fi
echo
"user group, ready"
}
function
check_system_cap
()
{
if
[
-r
/sys/fs/selinux/enforce
]
&&
[
"
$(
semodule
-l
|
grep
hydcu
)
"
=
""
]
;
then
echo
"system service no cap to load module"
echo
"pls install driver again"
exit
-1
fi
echo
"system service policy, ready"
}
#0
if
[
"
$(
ko_is_loaded hydcu
)
"
=
"yes"
]
;
then
echo
"driver loaded"
exit
0
fi
#1
check_iommu
check_vfio_pci
#2
check_ko
#3
check_ucode
#4
check_system_cap
#5
check_cuser_if_video
echo
"驱动检查结束,没有发现明显问题"
env_check/dcu_env_check-main/tools/hydcutune
0 → 100755
View file @
49486f80
File added
env_check/dcu_env_check-main/tools/hyflash
0 → 100644
View file @
49486f80
File added
env_check/dcu_env_check-main/tools/kernel_check.sh
0 → 100755
View file @
49486f80
#!/usr/bin/bash
# 驱动安装位置定位
kernel_check
()
{
# 当前kernel版本
kernel_version
=
`
uname
-r
`
# 驱动安装到的kernel
drive_in_kernel
=
`
find /lib/ |
grep
-E
"hydcu|hycu"
|
head
-n
1 |
awk
-F
"/"
'{print $4}'
`
if
[
"
$kernel_version
"
=
"
$drive_in_kernel
"
]
;
then
echo
"驱动安装在当前kernel版本下, 符合正常情况。"
else
echo
"你的内核可能有所变更,检查下环境是否是多个内核"
fi
}
\ No newline at end of file
env_check/dcu_env_check-main/tools/log_analyze.sh
0 → 100755
View file @
49486f80
#!/usr/bin/bash
# 分析错误信息
analyze_errors
()
{
log
"分析关键字
$1
相关错误..."
local
error_pattern
=
"
$1
"
local
error_flags
=
"fail|error|uncorrect|warn|exception"
# 在dmesg和系统日志中搜索
grep
-iE
"
$error_pattern
.*(
$error_flags
)|(
$error_flags
).*
$error_pattern
"
\
"
$OUTPUT_DIR
/dmesg.log"
"
$OUTPUT_DIR
/system.log"
>
"
$OUTPUT_DIR
/driver_issues.log"
if
[
-s
"
$OUTPUT_DIR
/driver_issues.log"
]
;
then
log
"发现潜在问题:"
[
$QUIET_MODE
-eq
0
]
&&
cat
"
$OUTPUT_DIR
/driver_issues.log"
return
1
else
log
"未发现相关错误信息"
rm
-f
"
$OUTPUT_DIR
/driver_issues.log"
return
0
fi
}
env_check/dcu_env_check-main/tools/pcie_check.sh
0 → 100755
View file @
49486f80
#!/usr/bin/bash
# 收集pcie信息
parse_regions
()
{
lspci
-vv
-s
"
$1
"
|
awk
'/Region [0-9]+:/'
}
get_pcie_topo
()
{
lspci
-vt
}
show_acs
()
{
lspci
-vvs
"
$1
"
|
grep
ACS
}
show_link_status
()
{
local
info
=
$(
lspci
-vv
-s
"
$1
"
)
declare
-A
lnk
=(
[
cur_speed]
=
$(
grep
-ioP
'LnkSta:\s+Speed\s\K[\d.]+'
<<<
"
$info
"
)
[
max_speed]
=
$(
grep
-ioP
'LnkCap:\s+Speed\s\K[\d.]+'
<<<
"
$info
"
)
[
cur_width]
=
$(
grep
-ioP
'LnkSta.*Width\sx\K\d+'
<<<
"
$info
"
)
[
max_width]
=
$(
grep
-ioP
'LnkCap.*Width\sx\K\d+'
<<<
"
$info
"
)
)
echo
"当前状态 : x
${
lnk
[cur_width]
}
@
${
lnk
[cur_speed]
}
GT/s "
}
show_busmaster
()
{
lspci
-vv
-s
"
$1
"
|
grep
BusMaster |
awk
'{print $4}'
}
collect_pcie_logs
()
{
log
"收集PCIe系统信息"
lspci
-D
-d
:
$DEVICE_ID
|
while
read
-r
dev
;
do
id
=
${
dev
:0:12
}
name
=
${
dev
:12
}
echo
echo
"
$(
hline
)
"
echo
"设备
${
id
}
"
echo
"型号 :
${
name
}
"
echo
"
$(
hline
)
"
echo
echo
"BAR 内存映射:"
parse_regions
"
$id
"
|
sed
's/^/ /'
echo
echo
"PCIe 链路状态:"
show_link_status
"
$id
"
|
sed
's/^/ /'
echo
echo
"PCIe ACS设置"
show_acs
"
$id
"
|
sed
's/^/ /'
echo
echo
"BusMaster设置"
show_busmaster
"
$id
"
|
sed
's/^/ /'
done
}
get_pcie_info
()
{
log
"收集PCIe系统信息"
collect_pcie_logs
>
$OUTPUT_DIR
/pcie_info.log
get_pcie_topo
>
$OUTPUT_DIR
/pcie_vt.log 2>&1
lspci
-vvv
>
$OUTPUT_DIR
/pcie_more.log 2>&1
log
"PCIe 信息收集完毕"
}
analyze_regions
()
{
local
address
echo
"
$1
"
# 提取Region关键参数
address
=
$(
echo
"
$1
"
|
awk
'/Memory at/ {print $5}'
)
# 判定逻辑实现
if
[[
"
$address
"
==
"unassigned"
]]
;
then
echo
"[ERROR] Bar地址未分配,需要检查卡的状态(物理连接或供电异常)"
echo
"建议操作:执行'lspci -vvv'确认设备识别状态}"
return
1
elif
[[
`
echo
$address
|
wc
-c
`
-gt
12
]]
;
then
echo
"[WARNING] Bar地址超出44bit(当前地址:0x
${
address
}
)"
echo
"解决方案:调整BIOS的MMIO High Base < 16T}"
return
2
fi
if
[[
"
$address
"
==
"<ignored>"
]]
;
then
echo
"[ERROR] 获取不到bar地址"
echo
"修复建议:检查/proc/cmdline是否包含'pcie=realloc'配置"
grep
-q
"pcie=realloc"
/proc/cmdline
||
echo
" 当前配置:
$(
cat
/proc/cmdline
)
"
return
3
fi
echo
"PCIe 状态正常"
return
0
}
pcie_check
()
{
if
[
!
-f
"
$1
"
]
;
then
echo
"file not exists"
>
&2
exit
1
fi
echo
"Region 0地址测试"
grep
"Region 0"
$1
|
while
read
-r
line
;
do
analyze_regions
"
$line
"
done
echo
"Region 5 地址测试"
grep
"Region 5"
$1
|
while
read
-r
line
;
do
analyze_regions
"
$line
"
done
}
env_check/dcu_env_check-main/tools/pcie_speed_check.sh
0 → 100755
View file @
49486f80
#!/bin/bash
# 检查是否具有root权限
if
[
"
$EUID
"
-ne
0
]
;
then
echo
"错误:该脚本需要root权限运行(请使用sudo执行)"
>
&2
exit
1
fi
# 获取主板序列号
speed
=
$(
./tools/hydcutune
-pciestatus
|
grep
-i
speed |
awk
'{print $5'
}
)
# 检查是否成功获取序列号
if
[
-z
"
$speed
"
]
;
then
echo
"没有获取到当前pcie 速率"
>
&2
exit
1
fi
# 型号判断逻辑
case
$speed
in
Gen1|Gen2|Gen3
)
echo
"当前PCIe 速率偏低,需要检查vbios或者使用hydcutune修复"
>
&2
exit
1
;;
Gen4|Gen5
)
echo
"PCIe速率正常"
exit
0
;;
*
)
echo
"未检测到PCIe速率"
exit
2
;;
esac
env_check/dcu_env_check-main/tools/pkg_check.sh
0 → 100755
View file @
49486f80
#!/bin/bash
function
pkg_check
()
{
local
pkgs_debian
=(
dmidecode lshw pciutils numactl-devel ipmitool locate
)
local
pkgs_centos
=(
dmidecode lshw pciutils numactl-dev ipmitool mlocate
)
local
cmd
=(
dmidecode lshw lspci numactl ipmitool locate
)
for
((
i
=
0
;
i<
${#
cmd
[@]
}
;
i++
))
;
do
if
!
command
-v
${
cmd
[i]
}
&>/dev/null
;
then
if
command
-v
apt-get &>/dev/null
;
then
echo
"没有
${
cmd
[i]
}
命令,请先安装
${
pkgs_debian
[i]
}
"
exit
0
# apt-get install -y ${pkgs_debian[i]}
elif
command
-v
yum &>/dev/null
;
then
echo
"没有
${
cmd
[i]
}
命令,请先安装
${
pkgs_centos
[i]
}
"
# yum install -y ${pkgs_centos[i]}
exit
0
fi
fi
done
}
\ No newline at end of file
env_check/dcu_env_check-main/tools/sme_check.sh
0 → 100755
View file @
49486f80
#!/usr/bin/bash
sme_check
()
{
if
[
!
-f
"
$1
"
]
;
then
echo
"file not exists"
>
&2
exit
1
fi
grep
-i
sme
$1
>
$OUTPUT_DIR
/sme.log
if
[
-s
"
$$
OUTPUT_DIR/sme.log"
]
;
then
echo
"非CSV场景,需要BIOS关闭SME设置"
return
1
else
echo
"OS SME目前没有打开, 非CSV场景下,该状态正常"
fi
}
env_check/dcu_env_check-main/tools/sys_info.sh
0 → 100755
View file @
49486f80
#!/usr/bin/bash
collect_system_info
()
{
log
"收集CPU信息..."
echoAndRun
"lscpu"
>
"
$OUTPUT_DIR
/cpuinfo.txt"
2>&1
log
"收集内存信息..."
echoAndRun
"free -h"
>
"
$OUTPUT_DIR
/meminfo.txt"
2>&1
echoAndRun
"dmidecode -t memory"
>>
"
$OUTPUT_DIR
/meminfo.txt"
2>&1
log
"收集网络信息..."
echoAndRun
"ip a"
>
"
$OUTPUT_DIR
/network.txt"
2>&1
echoAndRun
"lspci -nn | grep -i -E
\"
eth|mellanox
\"
"
>>
"
$OUTPUT_DIR
/network.txt"
2>&1
log
"收集系统版本..."
echoAndRun
"cat /etc/os-release"
>
"
$OUTPUT_DIR
/os_info.txt"
2>&1
echoAndRun
"uname -a"
>>
"
$OUTPUT_DIR
/os_info.txt"
2>&1
echoAndRun
"cat /proc/cmdline"
>>
"
$OUTPUT_DIR
/os_info.txt"
2>&1
echoAndRun
"numactl -H"
>>
"
$OUTPUT_DIR
/os_info.txt"
2>&1
echoAndRun
"rpm -qf
$(
which ldd
)
"
>>
"
$OUTPUT_DIR
/os_info.txt"
2>&1
echoAndRun
"ldd --version"
>>
"
$OUTPUT_DIR
/os_info.txt"
2>&1
echoAndRun
"strings
$(
find /usr/
-name
libc.so.6
)
| grep ^GLIBC_"
>>
"
$OUTPUT_DIR
/os_info.txt"
2>&1
echoAndRun
"strings
$(
find /usr
-name
libstdc++.so.6
)
| grep GLIBCXX"
>>
"
$OUTPUT_DIR
/os_info.txt"
2>&1
# echoAndRun "rpm -qi $(rpm -qf $(which ldd))" >> "$OUTPUT_DIR/os_info.txt" 2>&1
log
"收集服务器信息..."
echoAndRun
"ipmitool fru"
>
"
$OUTPUT_DIR
/hardware.txt"
2>&1
echoAndRun
"ipmitool mc info"
>>
"
$OUTPUT_DIR
/hardware.txt"
2>&1
echoAndRun
"dmidecode -s system-product-name"
>>
"
$OUTPUT_DIR
/hardware.txt"
2>&1
echoAndRun
"dmidecode -t bios"
>>
"
$OUTPUT_DIR
/hardware.txt"
2>&1
}
\ No newline at end of file
env_check/dcu_env_check-main/tools/sys_log.sh
0 → 100755
View file @
49486f80
#!/usr/bin/bash
# 带大小限制的日志复制函数
copy_log_with_limit
()
{
local
src
=
$1
local
dest
=
$2
local
size_limit_mb
=
$3
if
[
-f
"
$src
"
]
;
then
file_size
=
$(
du
-m
"
$src
"
|
cut
-f1
)
if
[
$file_size
-gt
$size_limit_mb
]
;
then
log
"跳过大文件:
$src
(
${
file_size
}
MB >
${
size_limit_mb
}
MB)"
echo
"[日志文件超过大小限制未采集]"
>
"
$dest
"
return
fi
cp
"
$src
"
"
$dest
"
2>/dev/null
||
echo
"无权限读取日志"
>
"
$dest
"
else
echo
"日志文件不存在"
>
"
$dest
"
fi
}
# 收集系统日志
collect_logs
()
{
log
"收集系统日志(最近
${
LOG_AGE
}
小时)..."
# 识别系统日志位置
local
syslog_path
[
-f
/var/log/syslog
]
&&
syslog_path
=
/var/log/syslog
[
-f
/var/log/messages
]
&&
syslog_path
=
/var/log/messages
if
[
-n
"
$syslog_path
"
]
;
then
copy_log_with_limit
"
$syslog_path
"
"
$OUTPUT_DIR
/system.log"
$LOG_SIZE_LIMIT
else
log
"收集journalctl日志..."
journalctl
--since
"
${
LOG_AGE
}
hours ago"
>
"
$OUTPUT_DIR
/system.log"
2>/dev/null
||
\
echo
"无法获取系统日志"
>
"
$OUTPUT_DIR
/system.log"
fi
log
"收集dmesg日志..."
dmesg
-T
>
"
$OUTPUT_DIR
/dmesg.log"
2>&1
}
\ No newline at end of file
env_check/dcu_env_check-main/tools/utils.sh
0 → 100755
View file @
49486f80
#!/usr/bin/bash
function
echoAndRun
(){
hline
echo
echo
"[root@dcu ~]# "
$1
;
eval
$1
;
echo
;
}
# 日志函数
function
log
()
{
[
$QUIET_MODE
-eq
0
]
&&
echo
"
$@
"
}
function
hline
()
{
printf
"%0.s="
{
1..60
}
echo
}
## 标准化提示信息格式
function
head_normal
()
{
echo
-e
"
\n
############
$1
############"
}
\ No newline at end of file
env_check/dcu_env_check-main/tools/vbios_check.sh
0 → 100755
View file @
49486f80
#!/bin/bash
dir
=
opn_info
=
version_info
=
env_check/dcu_env_check-main/tools/vfio_ffff_error.sh
0 → 100755
View file @
49486f80
#!/bin/bash
# 获取VFIO相关错误日志
ERROR_LOGS
=
$(
dmesg |
grep
-i
"vfio_iommu_type1_attach_group: No interrupt remapping support"
)
# 分析错误类型
if
[
-n
"
$ERROR_LOGS
"
]
;
then
echo
$ERROR_LOGS
echo
"vfio option error, do: echo
\"
options vfio_iommu_type1 allow_unsafe_interrupts=1
\"
>/etc/modprobe.d/vfio.conf, and restart vfio module"
else
echo
"no find errors"
fi
env_check/utils.sh
0 → 100755
View file @
49486f80
#!/usr/bin/bash
function
echoAndRun
(){
hline
echo
echo
"[root@dcu ~]# "
$1
;
eval
$1
;
echo
;
}
# 日志函数
function
log
()
{
[
$QUIET_MODE
-eq
0
]
&&
echo
"
$@
"
}
function
hline
()
{
printf
"%0.s="
{
1..60
}
echo
}
## 标准化提示信息格式
function
head_normal
()
{
echo
-e
"
\n
############
$1
############"
}
## 必要依赖检查及安装
function
pkg_check
()
{
local
pkgs_debian
=(
dmidecode lshw pciutils numactl-devel ipmitool locate
)
local
pkgs_centos
=(
dmidecode lshw pciutils numactl-dev ipmitool mlocate
)
local
cmd
=(
dmidecode lshw lspci numactl ipmitool locate
)
for
((
i
=
0
;
i<
${#
cmd
[@]
}
;
i++
))
;
do
if
!
command
-v
${
cmd
[i]
}
&>/dev/null
;
then
if
command
-v
apt-get &>/dev/null
;
then
echo
"没有
${
cmd
[i]
}
命令,请先安装
${
pkgs_debian
[i]
}
"
exit
0
# apt-get install -y ${pkgs_debian[i]}
elif
command
-v
yum &>/dev/null
;
then
echo
"没有
${
cmd
[i]
}
命令,请先安装
${
pkgs_centos
[i]
}
"
# yum install -y ${pkgs_centos[i]}
exit
0
fi
fi
done
}
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment