Commit 138039a2 authored by liumg's avatar liumg
Browse files

变更格式

parent 5ec6d6c1
#!/bin/bash
#set -x
. ./tools/utils.sh
. ./tools/pkg_check.sh
. ./tools/dcuopn_check.sh
. ./tools/sys_info.sh
. ./tools/sys_log.sh
. ./tools/pcie_check.sh
. ./tools/kernel_check.sh
. ./tools/sme_check.sh
. ./tools/log_analyze.sh
# 默认参数配置
DEFAULT_OUTPUT_DIR="system_info_$(date +%Y%m%d_%H%M%S)"
DEFAULT_KEYWORD="hydcu"
DEFAULT_LOG_AGE=24 # 小时
DEFAULT_LOG_SIZE_LIMIT=10 # 单位:MB
#!/usr/bin/env bash
# DCU系统诊断工具 - 优化版本
#set -o errexit # 遇到错误时退出
#set -o nounset # 遇到未定义变量时报错
#set -o pipefail # 管道中任何一个命令失败时整个管道失败
# 加载工具库
source ./tools/utils.sh
source ./tools/pkg_check.sh
source ./tools/dcuopn_check.sh
source ./tools/sys_info.sh
source ./tools/sys_log.sh
source ./tools/pcie_check.sh
source ./tools/kernel_check.sh
source ./tools/sme_check.sh
source ./tools/log_analyze.sh
# 默认配置
readonly DEFAULT_OUTPUT_DIR="system_info_$(date +%Y%m%d_%H%M%S)"
readonly DEFAULT_KEYWORD="hydcu|hycu"
readonly DEFAULT_LOG_AGE=24 # 小时
readonly DEFAULT_LOG_SIZE_LIMIT=10 # MB
# 工具路径
readonly DRIVER_LOAD_CHECK="./tools/driver_load_check.sh"
readonly BOARD_CHECK="./tools/board_check.sh"
readonly PCIE_SPEED_CHECK="./tools/pcie_speed_check.sh"
# 全局变量
QUIET_MODE=0
DEBUG_MODE=0
OUTPUT_DIR=""
KEYWORD=""
LOG_AGE=""
LOG_SIZE_LIMIT=""
DEVICE_NAME=""
DEVICE_ID=""
# Print the usage text for this script to stdout.
# Globals (read): DEFAULT_OUTPUT_DIR, DEFAULT_KEYWORD, DEFAULT_LOG_AGE,
#                 DEFAULT_LOG_SIZE_LIMIT - shown as the option defaults.
# The text is emitted exactly once from a single here-document, so the
# defaults shown always follow the DEFAULT_* values instead of a second,
# hard-coded copy of the help.
show_help() {
  cat << EOF
用法: $0 [选项]
DCU系统诊断脚本 - 收集系统信息并检测驱动问题
选项:
-o DIR 指定输出目录 (默认: ${DEFAULT_OUTPUT_DIR})
-k KEYWORD 设置检测关键字 (默认: ${DEFAULT_KEYWORD})
-t HOURS 收集日志的时间范围(小时) (默认: ${DEFAULT_LOG_AGE})
-s SIZE 日志文件大小限制(MB) (默认: ${DEFAULT_LOG_SIZE_LIMIT})
-q 静默模式(仅输出错误)
-d 调试模式
-h 显示此帮助信息
示例:
$0 -o /tmp/logs -k buserr -t 48
$0 -q -s 20 # 静默模式,日志大小限制20MB
EOF
}
# Validate that an option value is a non-negative integer of at least a
# given minimum.
# Arguments: $1 - value to check
#            $2 - option name used in the error message
#            $3 - minimum accepted value (defaults to 0)
# Exits the shell with status 1 (message on stderr) when the value is rejected.
validate_numeric_input() {
  local candidate="$1"
  local option_label="$2"
  local floor="${3:-0}"

  # Accept: digits only, and not below the floor.
  if [[ "$candidate" =~ ^[0-9]+$ ]] && ! [ "$candidate" -lt "$floor" ]; then
    return 0
  fi

  echo "错误: 参数 '$option_label' 的值无效。必须为大于等于 ${floor} 的整数。" >&2
  exit 1
}
# Parse command-line options at top level (outside any function).
# -o/-k/-t/-s take an argument; -q, -d and -h are flags. -d also turns on
# shell tracing immediately, and -h prints the help and exits 0.
# NOTE(review): the optstring has no leading ':' so getopts runs in its
# non-silent mode. In that mode bash reports a missing argument as '?', which
# makes the ':' arm below unreachable, and leaves OPTARG unset in the '?' arm,
# so the message prints "-" without the offending letter - confirm intent.
while getopts "o:k:t:s:qdh" opt; do
case $opt in
o) CUSTOM_OUTPUT_DIR="$OPTARG" ;;
k) KEYWORD="$OPTARG" ;;
t) LOG_AGE="$OPTARG" ;;
s) LOG_SIZE_LIMIT="$OPTARG" ;;
q) QUIET_MODE=1 ;;
d) DEBUG_MODE=1; set -x ;;
h) show_help; exit 0 ;;
\?) echo "无效选项: -$OPTARG" >&2; exit 1 ;;
:) echo "选项 -$OPTARG 需要参数" >&2; exit 1 ;;
esac
done
# Verify that every external command the diagnostic relies on is available.
# Outputs: on failure, one line on stderr listing all missing commands.
# Exits the shell with status 1 when at least one command is missing.
check_required_tools() {
  local -a tools=("dmidecode" "tar" "journalctl" "dmesg" "lscpu" "free" "ip" "lspci")
  local -a missing_tools=()
  # Keep the loop variable local so the caller's globals are left untouched.
  local tool

  for tool in "${tools[@]}"; do
    if ! command -v "$tool" &> /dev/null; then
      missing_tools+=("$tool")
    fi
  done

  if [ "${#missing_tools[@]}" -gt 0 ]; then
    echo "错误: 缺少必需的工具: ${missing_tools[*]}" >&2
    exit 1
  fi
}
# Verify that all helper scripts exist and make them executable if needed.
# Globals (read): DRIVER_LOAD_CHECK, BOARD_CHECK, PCIE_SPEED_CHECK.
# Outputs: status messages on stderr.
# Exits the shell with status 1 when a script file is missing. A failed
# chmod is reported as a warning rather than as a successful repair.
check_tool_scripts() {
  local -a scripts=(
    "$DRIVER_LOAD_CHECK" "$BOARD_CHECK" "$PCIE_SPEED_CHECK"
    "./tools/utils.sh" "./tools/pkg_check.sh" "./tools/dcuopn_check.sh"
    "./tools/sys_info.sh" "./tools/sys_log.sh" "./tools/pcie_check.sh"
    "./tools/kernel_check.sh" "./tools/sme_check.sh" "./tools/log_analyze.sh"
  )
  # Keep the loop variable local so the caller's globals are left untouched.
  local script

  for script in "${scripts[@]}"; do
    if [ ! -f "$script" ]; then
      echo "错误: 工具脚本不存在: $script" >&2
      exit 1
    fi
    if [ ! -x "$script" ]; then
      # Only claim the permission was repaired when chmod really succeeded.
      if chmod +x "$script"; then
        echo "已修复工具脚本权限: $script" >&2
      else
        echo "警告: 无法修复工具脚本权限: $script" >&2
      fi
    fi
  done
}
# Parse the command-line options passed as this function's arguments.
# Globals (written): CUSTOM_OUTPUT_DIR, KEYWORD, LOG_AGE, LOG_SIZE_LIMIT,
#                    QUIET_MODE, DEBUG_MODE.
# Outputs: diagnostics on stderr; help text via show_help.
# Exits 0 after -h, and 1 on an unknown option or a missing option argument.
parse_arguments() {
  local opt
  # A local OPTIND restarts getopts at the first argument on every call.
  local OPTIND=1

  # The leading ':' selects silent mode: getopts then reports a missing
  # argument as ':' and stores the offending option letter in OPTARG, which
  # is what the two error arms below rely on.
  while getopts ":o:k:t:s:qdh" opt; do
    case "$opt" in
      o) CUSTOM_OUTPUT_DIR="$OPTARG" ;;
      k) KEYWORD="$OPTARG" ;;
      t) LOG_AGE="$OPTARG" ;;
      s) LOG_SIZE_LIMIT="$OPTARG" ;;
      q) QUIET_MODE=1 ;;
      d) DEBUG_MODE=1; set -x ;;
      h) show_help; exit 0 ;;
      \?) echo "无效选项: -$OPTARG" >&2; show_help; exit 1 ;;
      :) echo "选项 -$OPTARG 需要参数" >&2; show_help; exit 1 ;;
    esac
  done
  shift $((OPTIND - 1))

  # Positional arguments are not used; report them instead of silently
  # dropping them.
  if [ $# -gt 0 ]; then
    echo "警告: 忽略额外的参数: $*" >&2
  fi
}
# Apply defaults for every setting that was not supplied on the command line.
# ':' is the no-op builtin; each ${VAR:=default} expansion assigns the default
# when VAR is unset or empty. OUTPUT_DIR prefers CUSTOM_OUTPUT_DIR (set by -o)
# and otherwise falls back to DEFAULT_OUTPUT_DIR.
: ${OUTPUT_DIR:=${CUSTOM_OUTPUT_DIR:-$DEFAULT_OUTPUT_DIR}}
: ${KEYWORD:=$DEFAULT_KEYWORD}
: ${LOG_AGE:=$DEFAULT_LOG_AGE}
: ${LOG_SIZE_LIMIT:=$DEFAULT_LOG_SIZE_LIMIT}
# Fill in every setting that was not given on the command line, then check
# the numeric ones.
# Globals (read):    CUSTOM_OUTPUT_DIR and the DEFAULT_* values
# Globals (written): OUTPUT_DIR, KEYWORD, LOG_AGE, LOG_SIZE_LIMIT
set_defaults() {
  # An explicitly requested directory wins over the generated default.
  OUTPUT_DIR="$DEFAULT_OUTPUT_DIR"
  if [ -n "$CUSTOM_OUTPUT_DIR" ]; then
    OUTPUT_DIR="$CUSTOM_OUTPUT_DIR"
  fi

  # Unset or empty values fall back to their defaults.
  : "${KEYWORD:=$DEFAULT_KEYWORD}"
  : "${LOG_AGE:=$DEFAULT_LOG_AGE}"
  : "${LOG_SIZE_LIMIT:=$DEFAULT_LOG_SIZE_LIMIT}"

  # Both numeric options must be integers of at least 1.
  validate_numeric_input "$LOG_AGE" "-t" 1
  validate_numeric_input "$LOG_SIZE_LIMIT" "-s" 1
}
# Create the output directory, refusing to reuse an existing path.
# Globals (read): OUTPUT_DIR
# Exits the shell with status 1 (message on stderr) when the path already
# exists or cannot be created.
init_output_dir() {
  [[ ! -e "$OUTPUT_DIR" ]] || {
    echo "错误: 输出目录已存在: $OUTPUT_DIR" >&2
    exit 1
  }

  mkdir -p "$OUTPUT_DIR" || {
    echo "错误: 无法创建输出目录: $OUTPUT_DIR" >&2
    exit 1
  }
}
# Run a command string in a child bash and capture its output in a file.
# Arguments: $1 - heading shown before the command runs
#            $2 - command string, executed with `bash -c`
#            $3 - file receiving the command's stdout and stderr
# Globals (read): DEBUG_MODE
# Returns: 0 on success, otherwise the command's exit status (after logging
#          a warning; status 124 is reported as a timeout).
run_and_log() {
  local title="$1"
  local cmdline="$2"
  local sink="$3"
  local rc=0

  head_normal "$title"
  if [ $DEBUG_MODE -eq 1 ]; then
    log "执行命令: $cmdline"
    log "输出到: $sink"
  fi

  # A child bash is used so that command strings carrying arguments work.
  bash -c "$cmdline" > "$sink" 2>&1 || rc=$?

  case "$rc" in
    0)
      return 0
      ;;
    124)
      log "警告: 命令执行超时: $cmdline"
      ;;
    *)
      log "警告: 命令执行失败 (退出码: $rc): $cmdline"
      ;;
  esac
  return $rc
}
# 初始化目录
mkdir -p "$OUTPUT_DIR" || exit 1
# Collect driver logs by running `/opt/hyhal/bin/drvdiag -c`.
# Globals (read): OUTPUT_DIR - the output is stored in $OUTPUT_DIR/drvdiag.log
# Returns: 1 (message on stderr) when drvdiag is not an executable file,
#          otherwise the status of run_and_log.
run_drvdiag() {
  local drv_path="/opt/hyhal/bin/drvdiag"
  local drv_arg='-c'
  local out_file="$OUTPUT_DIR/drvdiag.log"

  if [ ! -x "$drv_path" ]; then
    echo "没有装驱动,请先安装驱动" >&2
    return 1
  fi

  # The call belongs directly to this function; it must not sit inside a
  # nested function definition, or it would never run.
  run_and_log "收集驱动日志 (drvdiag -c)" "$drv_path $drv_arg" "$out_file"
}
hline
echo -e '################ 日志收集 ##################'
hline
# Run every data-collection step in order: device detection, package check,
# system information, logs, and PCIe information.
# Returns: the status of the last step (get_pcie_info).
collect_system_information() {
  log "开始收集系统信息..."
  get_dcu
  pkg_check
  collect_system_info
  collect_logs
  # analyze_errors is intentionally left disabled, as in the original.
  # PCIe information is gathered once; a second identical call only repeated
  # the same work.
  get_pcie_info
}
# Analyse the collected data and run the driver, board and PCIe helper scripts.
# Globals (read): OUTPUT_DIR - holds the collected logs and receives the
#                 analysis output files.
# NOTE(review): the body appears to contain two generations of the same
# analysis: direct calls first, run_and_log wrappers last. As written,
# pcie_check, sme_check and kernel_check each run twice - confirm which set
# is intended.
analyze_logs() {
log "开始分析日志信息..."
echo -e '\n###### 日志分析 #######'
hline
head_normal "分析pcie信息"
# Direct calls: each check is handed the collected log file to inspect.
pcie_check $OUTPUT_DIR/pcie_info.log
head_normal "分析sme信息"
sme_check $OUTPUT_DIR/dmesg.log
head_normal "分析驱动安装位置"
kernel_check
# Helper scripts: stdout is captured into per-check files under OUTPUT_DIR.
./tools/driver_load_check.sh > $OUTPUT_DIR/driver_status.log
./tools/board_check.sh > $OUTPUT_DIR/board_check.log
# NOTE(review): product_name is not declared local, so it is set globally.
product_name=`dmidecode -s system-product-name`
# The PCIe speed check is skipped on the X785-H30 product.
if [ "$product_name" != "X785-H30" ]; then
# NOTE(review): the file name is spelled "speek" here, while
# run_additional_checks writes pcie_speed_check.log - confirm.
./tools/pcie_speed_check.sh > $OUTPUT_DIR/pcie_speek_check.log
fi
# Captures the status of the preceding `if`; the value is not used again in
# this function.
local status=$?
# NOTE(review): run_and_log executes its command string with `bash -c`. If
# pcie_check, sme_check and kernel_check are shell functions from the sourced
# tools/*.sh files, they would not be visible in that child shell, and no
# log-file argument is passed here - confirm.
run_and_log "分析PCIe信息" "pcie_check" "$OUTPUT_DIR/pcie_analysis.log"
run_and_log "分析SME信息" "sme_check" "$OUTPUT_DIR/sme_analysis.log"
run_and_log "分析驱动安装位置" "kernel_check" "$OUTPUT_DIR/kernel_analysis.log"
}
# Run the optional checks: drvdiag log collection, driver load status, board
# information and, except on the X785-H30 product, the PCIe speed check.
# Globals (read): DRIVER_LOAD_CHECK, BOARD_CHECK, PCIE_SPEED_CHECK, OUTPUT_DIR
# Returns: the status of the last check that ran.
run_additional_checks() {
  # Local so that the product name does not leak into the global scope.
  local product_name

  log "运行附加检查..."

  # Try drvdiag first; a missing driver is reported but does not stop the run.
  run_drvdiag || log "跳过 drvdiag,驱动未安装或执行失败"

  run_and_log "检查驱动加载状态" "$DRIVER_LOAD_CHECK" "$OUTPUT_DIR/driver_status.log"
  run_and_log "检查硬件板卡信息" "$BOARD_CHECK" "$OUTPUT_DIR/board_check.log"

  # The PCIe speed check is skipped on the X785-H30 product and whenever the
  # product name cannot be read.
  if product_name=$(dmidecode -s system-product-name 2>/dev/null); then
    if [ "$product_name" != "X785-H30" ]; then
      run_and_log "检查PCIe速度" "$PCIE_SPEED_CHECK" "$OUTPUT_DIR/pcie_speed_check.log"
    fi
  else
    log "警告: 无法获取产品名称,跳过PCIe速度检查"
  fi
}
# Pack the output directory into "<OUTPUT_DIR>.tar.gz" and remove the
# directory once the archive has been written.
# Globals (read): OUTPUT_DIR
# Exits the shell with status 1 when the archive cannot be created; the
# collected data is then left in place and no partial archive is kept.
package_results() {
  local tar_file="${OUTPUT_DIR}.tar.gz"
  local parent_dir
  local base_name

  log "打包诊断数据..."
  parent_dir=$(dirname "$OUTPUT_DIR")
  base_name=$(basename "$OUTPUT_DIR")

  # Archive relative to the parent directory so the tarball holds only the
  # final path component, not the full path.
  if tar -czf "$tar_file" -C "$parent_dir" "$base_name" 2>/dev/null; then
    # Delete the raw data only after a successful archive; ':?' stops an
    # empty OUTPUT_DIR from ever reaching rm -rf.
    rm -rf -- "${OUTPUT_DIR:?}"
    log "诊断完成! 文件已保存为: ${tar_file}"
    log "文件大小: $(du -h "$tar_file" | cut -f1)"
  else
    rm -f -- "$tar_file"
    log "错误: 打包失败,原始数据保存在: $OUTPUT_DIR"
    exit 1
  fi
}
# Main workflow: print the start banner, detect devices, verify the
# environment, collect data, analyse it, run the extra checks and package
# the results.
# Globals (read): OUTPUT_DIR, KEYWORD, LOG_AGE, LOG_SIZE_LIMIT
main() {
  local banner_line

  # Start banner listing the effective settings.
  hline
  for banner_line in \
    "🚀 DCU系统诊断工具启动" \
    "📁 输出目录: $OUTPUT_DIR" \
    "🔍 关键字: $KEYWORD" \
    "⏰ 日志时间范围: ${LOG_AGE}小时" \
    "📊 日志大小限制: ${LOG_SIZE_LIMIT}MB"; do
    log "$banner_line"
  done
  hline

  get_dcu

  # Environment and helper-script checks.
  check_required_tools
  check_tool_scripts

  # Data collection.
  collect_system_information

  # Log analysis, introduced by its own heading.
  echo
  hline
  log "📊 开始日志分析"
  hline
  analyze_logs

  # Extra checks, then packaging.
  run_additional_checks
  package_results
}
# Signal handler: discard the partially filled output directory, then exit.
# Globals (read): OUTPUT_DIR
# Always exits the shell with status 1.
cleanup() {
  # Nothing to remove when no output directory was created.
  if [[ -z "$OUTPUT_DIR" || ! -d "$OUTPUT_DIR" ]]; then
    exit 1
  fi

  log "正在清理临时文件..."
  rm -rf "$OUTPUT_DIR"
  exit 1
}
# Install signal handling: run cleanup when the script receives INT or TERM.
trap cleanup INT TERM
# Run the program: parse the options, fill in defaults, create the output
# directory, then execute the main workflow.
parse_arguments "$@"
set_defaults
init_output_dir
main
# Hand main's exit status back to the caller.
exit $?
# NOTE(review): unreachable - the preceding exit always ends the script.
exit 0
\ No newline at end of file
#!/usr/bin/bash
# Device model name -> PCI device ID.
declare -A devices_id=(
  [Z100]=54b7
  [Z100L]=55b7
  [K100]=62b7
  [K100-AI]=6210
  [K100-AI-ECO]=6211
  [BW1000]=6320
  [BW]=6320
  [BW1100]=6430
)

# Reverse lookup table: PCI device ID -> model name.
# NOTE(review): BW1000 and BW share ID 6320, so whichever of the two is
# visited last wins; associative-array iteration order is not specified.
declare -A devices
for name in "${!devices_id[@]}"; do
  id=${devices_id[$name]}
  devices[$id]=$name
done
# Detect DCU cards through lspci and print a per-card list plus a summary.
# Globals (read):    devices - associative array, PCI device ID -> model name
# Globals (written): dcu_list - device IDs of all display/co-processor devices
#                    DEVICE_NAME, DEVICE_ID - first recognised card, or empty
# Outputs: the report on stdout; unrecognised device IDs on stderr.
# Returns: 1 when lspci is unavailable, otherwise 0.
get_dcu() {
  if ! command -v lspci &> /dev/null; then
    echo "错误: lspci 命令未找到,请先安装 pciutils 包" >&2
    return 1
  fi

  # Take the device part of the last "[vendor:device]" field of every
  # display or co-processor entry.
  mapfile -t dcu_list < <(lspci -nn | grep -i -E "display|co-processor" | awk -F'[][]' '{print $4}' | awk -F ":" '{print $2}')

  local total=${#dcu_list[@]}
  local dcu_num=0
  local current_id dev dev_name
  local first_name="" first_id=""
  local -a detected_devices=()
  local -A device_count=()

  if [ "$total" -eq 0 ]; then
    echo "未检测到任何DCU设备"
    DEVICE_NAME=""
    DEVICE_ID=""
    return 0
  fi

  echo "=== 检测到的DCU设备 ==="
  for current_id in "${dcu_list[@]}"; do
    # An empty ID (a line that could not be parsed) must never be used as an
    # associative-array subscript; it is treated as an unknown device.
    if [ -n "$current_id" ] && [ -n "${devices[$current_id]:-}" ]; then
      echo "DCU #$dcu_num: ID ${current_id}${devices[$current_id]}"
      detected_devices+=("${devices[$current_id]}")
      # Remember the first recognised card for DEVICE_NAME / DEVICE_ID.
      if [ -z "$first_id" ]; then
        first_name="${devices[$current_id]}"
        first_id="$current_id"
      fi
      dcu_num=$((dcu_num + 1))
    else
      echo "DCU #$dcu_num: 未知设备ID: $current_id" >&2
      detected_devices+=("unknown")
    fi
  done

  # Summary
  echo "=========================="
  if [ "$dcu_num" -eq 0 ]; then
    echo "总计: 0张DCU设备"
  else
    # Count the cards per model name.
    for dev in "${detected_devices[@]}"; do
      device_count[$dev]=$(( ${device_count[$dev]:-0} + 1 ))
    done
    echo "总计: ${dcu_num}张DCU设备"
    for dev_name in "${!device_count[@]}"; do
      if [ "$dev_name" != "unknown" ]; then
        echo " - ${dev_name}: ${device_count[$dev_name]}张"
      fi
    done
    if [ "${device_count[unknown]:-0}" -gt 0 ]; then
      echo " - 未知设备: ${device_count[unknown]}张"
    fi
  fi

  # Publish the first recognised device, not merely the first list entry,
  # which may be an unknown device.
  if [ "$dcu_num" -gt 0 ]; then
    DEVICE_NAME="$first_name"
    DEVICE_ID="$first_id"
  else
    DEVICE_NAME=""
    DEVICE_ID=""
  fi
}
get_dcu
#!/bin/bash
#huangjun@hygon.cn
#v0.2
# Device-ID regular expression - matches DCU devices (vendor 1d94, device IDs
# starting with 5 or 6).
DEVICE_ID="1d94:(5|6)[0-9a-z]{3,3}"
# NOTE(review): the next four statements look like an older lookup keyed on
# the first digit of the device ID. They assign pn[...] before the
# `declare -A pn` below, which makes pn an indexed array; bash refuses to
# convert an existing indexed array to an associative one, so that declare
# would then fail - confirm whether these lines should still be here.
pn["5"]="zifang"
pn["6"]="kongming"
dev=($(lspci -nn | grep -oE "1d94:(5|6)[0-9a-z]{3,3}" | awk -F: '{print $2}' | grep -o [56]))
devname=${pn[${dev[0]}]}
# Device-name map - keyed by the device series number.
declare -A pn
pn["55b7"]="zifang" # Zifang
pn["54b7"]="zifang" # Zifang
pn["62b7"]="kongming" # Kongming
pn["6210"]="zhongda" # Zhongda
pn["6211"]="zhongda" # Zhongda
pn["6320"]="bowen" # Bowen
pn["6430"]="bowen" # Bowen
# Extract the full device ID ("vendor:device") of the first matching device.
dev_id=$(lspci -nn | grep -oE "1d94:(5|6)[0-9a-z]{3,3}" | head -n 1)
# Keep only the series part after the colon (e.g. 55b7).
dev_series=${dev_id#*:}
# Resolve the device name from the series number.
devname=${pn[$dev_series]}
# Abort when the series is not in the map (or no device was found at all).
if [ -z "$devname" ]; then
echo "未识别的设备型号: $dev_id"
# NOTE(review): this list omits 54b7 and 6211, which the map above accepts.
echo "支持的设备型号: 55b7(zifang), 62b7(kongming), 6210(zhongda), 6320(bowen), 6430(bowen)"
exit 1
fi
echo "===THIS SCRIPT JUST FOR 5.16.21 5.2 V1.10 and later==="
function ko_is_loaded()
......@@ -71,8 +90,8 @@ function _find_ucode_in_path()
local tc=0
local cnt=0
au="$(find $p -name ${devname}_$u.bin 2> /dev/null)"
tc=$(find $p -name ${devname}_$u.bin 2> /dev/null | wc -l)
au="$(find $p -name "${devname}_${u}.bin" 2> /dev/null)"
tc=$(find $p -name "${devname}_${u}.bin" 2> /dev/null | wc -l)
cnt=$(($cnt + $tc))
echo $au
return $cnt
......@@ -123,7 +142,7 @@ function check_ucode()
function check_ko()
{
local kos="hydcu.ko hydcu-sched.ko hydrm_ttm_helper.ko hy-extra.ko hykcl.ko hyttm.ko"
local kos="hycu.ko hycu-sched.ko hydrm_ttm_helper.ko hy-extra.ko hykcl.ko hyttm.ko"
local dir="/opt/hyhal/dkms/"
local ret=
......@@ -172,10 +191,10 @@ check_vfio_pci
#2
check_ko
#3
check_ucode
# check_ucode
#4
check_system_cap
#5
check_cuser_if_video
# check_cuser_if_video
echo "驱动检查结束,没有发现明显问题"
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -68,22 +68,60 @@ analyze_regions() {
# 提取Region关键参数
address=$(echo "$1" | awk '/Memory at/ {print $5}')
# 判定逻辑实现
# PCIe BAR地址状态检查
if [[ "$address" == "unassigned" ]]; then
echo "[ERROR] Bar地址未分配,需要检查卡的状态(物理连接或供电异常)"
echo "建议操作:执行'lspci -vvv'确认设备识别状态}"
echo "[ERROR CODE 1] PCIe BAR地址未分配"
echo "可能原因:"
echo " 1. DCU卡未正确插入或供电不足"
echo " 2. 主板PCIe插槽故障"
echo " 3. 驱动未正确加载"
echo "解决方案:"
echo " 1. 检查DCU卡物理连接状态"
echo " 2. 执行 'lspci -vvv | grep -A 10 \"$DEVICE_ID\"' 确认设备识别状态"
echo " 3. 检查dmesg日志中的PCIe相关错误"
return 1
elif [[ `echo $address | wc -c` -gt 12 ]]; then
echo "[WARNING] Bar地址超出44bit(当前地址:0x${address})"
echo "解决方案:调整BIOS的MMIO High Base < 16T}"
echo "[ERROR CODE 2] PCIe BAR地址超出44bit范围 (当前地址: 0x${address})"
echo "可能影响:"
echo " - 可能导致DMA操作失败"
echo " - 系统内存空间不足"
echo "解决方案:"
echo " 1. 进入BIOS设置"
echo " 2. 找到 'MMIO High Base' 选项"
echo " 3. 设置为小于16T的值 (如 8T)"
echo " 4. 保存设置并重启系统"
return 2
fi
if [[ "$address" == "<ignored>" ]]; then
echo "[ERROR] 获取不到bar地址"
echo "修复建议:检查/proc/cmdline是否包含'pcie=realloc'配置"
grep -q "pcie=realloc" /proc/cmdline || echo " 当前配置:$(cat /proc/cmdline)"
elif [[ "$address" == "<ignored>" ]]; then
echo "[ERROR CODE 3] 无法获取有效BAR地址"
echo "可能原因:"
echo " - 内核未启用PCIe地址重分配功能"
echo "解决方案:"
echo " 1. 检查当前内核启动参数:"
grep -q "pcie=realloc" /proc/cmdline || echo " 当前配置: $(cat /proc/cmdline)"
echo " 2. 在GRUB配置中添加 'pcie=realloc' 参数"
echo " 3. 更新GRUB配置并重启:"
echo " - Debian/Ubuntu: update-grub"
echo " - RHEL/CentOS: grub2-mkconfig -o /boot/grub2/grub.cfg"
return 3
fi
# 检查Region 0地址是否在保留区间 (0xf000000000-0xffffffffff)
if [[ "$1" == *"Region 0"* ]]; then
if [[ "$address" =~ ^f[0-9a-f]{11}$ ]]; then
local dev_id=$(echo "$1" | grep -oP 'Device \K[0-9a-f:]+')
echo "[ERROR CODE 4] PCIe BAR地址冲突 - 卡 ${dev_id} (地址: 0x${address})"
echo "问题描述:"
echo " - Region 0地址位于保留区间 (0xf000000000-0xffffffffff)"
echo " - 可能与系统保留内存区域冲突"
echo "解决方案:"
echo " 1. 进入BIOS设置"
echo " 2. 找到 'PCIe Memory Mapped IO' 或 'MMIO Configuration'"
echo " 3. 调整MMIO分配范围,避开 0xf000000000-0xffffffffff"
echo " 4. 如无相关选项,可能需要更新BIOS版本"
return 4
fi
fi
echo "PCIe 状态正常"
return 0
}
......
#!/bin/bash
# Check that the commands needed by the diagnostic are installed and, if not,
# print which distribution packages provide them.
# Outputs: one status line per command plus a summary, all on stdout.
# Returns: 0 when every command is available; 1 when something is missing or
#          no supported package manager (apt-get, yum, dnf) is found.
#          The function never exits the calling shell.
pkg_check() {
  # The three arrays are index-aligned: cmd[i] is provided by pkgs_*[i].
  local -a pkgs_debian=("dmidecode" "lshw" "pciutils" "numactl-dev" "ipmitool" "mlocate")
  local -a pkgs_centos=("dmidecode" "lshw" "pciutils" "numactl-devel" "ipmitool" "mlocate")
  local -a cmd=("dmidecode" "lshw" "lspci" "numactl" "ipmitool" "locate")
  local -a missing_pkgs=()
  local -a missing_cmds=()
  local -a distro_pkgs=()
  local package_manager=""
  local i

  # Detect the package manager and pick the matching package names.
  if command -v apt-get &>/dev/null; then
    package_manager="apt-get"
    distro_pkgs=("${pkgs_debian[@]}")
  elif command -v yum &>/dev/null; then
    package_manager="yum"
    distro_pkgs=("${pkgs_centos[@]}")
  elif command -v dnf &>/dev/null; then
    package_manager="dnf"
    distro_pkgs=("${pkgs_centos[@]}")
  else
    echo "错误: 未检测到支持的包管理器 (apt-get, yum, dnf)"
    return 1
  fi

  echo "=== 检查系统命令依赖 ==="
  # Check every command and remember what is missing, instead of stopping at
  # the first gap.
  for ((i = 0; i < ${#cmd[@]}; i++)); do
    if ! command -v "${cmd[i]}" &>/dev/null; then
      missing_cmds+=("${cmd[i]}")
      missing_pkgs+=("${distro_pkgs[i]}")
      echo "[缺失] ${cmd[i]} → 需要安装: ${distro_pkgs[i]}"
    else
      echo "[已安装] ${cmd[i]}"
    fi
  done

  # Report the missing packages, if any.
  if [ "${#missing_pkgs[@]}" -eq 0 ]; then
    echo "所有依赖命令均已安装 ✓"
    return 0
  fi

  echo "============================="
  echo "缺失的命令: ${missing_cmds[*]}"
  echo "需要安装的包: ${missing_pkgs[*]}"
  echo ""
  echo "安装命令:"
  echo " $package_manager install -y ${missing_pkgs[*]}"
  echo ""
  echo "请安装上述包后重新运行脚本"
  return 1
}
pkg_check
\ No newline at end of file
#!/usr/bin/bash
#!/usr/bin/env bash
# Copy a log file unless it is missing, unreadable or larger than a limit.
# When the file is not copied, a one-line explanation is written to the
# destination instead.
# Arguments: $1 - source log file
#            $2 - destination file
#            $3 - size limit in MB
# Returns: 0 copied; 1 source missing, unreadable or copy failed;
#          2 skipped because the file exceeds the limit.
copy_log_with_limit() {
  local src="$1"
  local dest="$2"
  local size_limit_mb="$3"
  local file_size

  if [ ! -f "$src" ]; then
    echo "日志文件不存在: $src" > "$dest"
    return 1
  fi

  # Check read permission.
  if [ ! -r "$src" ]; then
    echo "无权限读取日志文件: $src" > "$dest"
    return 1
  fi

  # Size in MB as reported by du. The fallback to 0 is applied afterwards,
  # because an `|| echo 0` inside the pipeline only guards `cut`, which
  # succeeds even when du prints nothing.
  file_size=$(du -m "$src" 2>/dev/null | cut -f1)
  file_size=${file_size:-0}

  if [ "$file_size" -gt "$size_limit_mb" ]; then
    log_warning "跳过大文件: $src (${file_size}MB > ${size_limit_mb}MB)"
    echo "[日志文件超过大小限制未采集 - ${file_size}MB > ${size_limit_mb}MB]" > "$dest"
    return 2
  fi

  if cp "$src" "$dest" 2>/dev/null; then
    log "成功复制日志: $src -> $dest (${file_size}MB)"
    return 0
  fi

  echo "复制日志文件失败: $src" > "$dest"
  return 1
}
# Collect the log files of one application into "$OUTPUT_DIR/app_logs".
# Arguments: $1 - application name, used in the search paths and as a prefix
#                 of the copied file names
# Globals (read): OUTPUT_DIR, LOG_SIZE_LIMIT
collect_app_logs() {
  local app_name="$1"
  local -a log_patterns=(
    "/var/log/${app_name}/*.log"
    "/var/log/${app_name}.log"
    "/opt/${app_name}/logs/*.log"
  )
  local app_log_dir="${OUTPUT_DIR}/app_logs"
  # Loop variables are local so they do not overwrite the caller's globals.
  local pattern log_file base_name

  mkdir -p "$app_log_dir"

  for pattern in "${log_patterns[@]}"; do
    # Deliberately unquoted: the pattern must undergo glob expansion. An
    # unmatched pattern stays literal and is filtered out by the -f test.
    for log_file in $pattern; do
      if [ -f "$log_file" ]; then
        base_name=$(basename "$log_file")
        copy_log_with_limit "$log_file" "${app_log_dir}/${app_name}_${base_name}" "$LOG_SIZE_LIMIT"
      fi
    done
  done
}
# Helper: copy each existing file from a list into a directory, keeping the
# file name and honouring LOG_SIZE_LIMIT.
# Arguments: $1 - destination directory; remaining arguments - source files
_copy_existing_logs() {
  local dest_dir="$1"
  shift
  local log_file

  for log_file in "$@"; do
    if [ -f "$log_file" ]; then
      copy_log_with_limit "$log_file" "${dest_dir}/${log_file##*/}" "$LOG_SIZE_LIMIT"
    fi
  done
}

# Collect system logs, the kernel ring buffer and application logs into
# "$OUTPUT_DIR/logs" (application logs go where collect_app_logs puts them).
# Globals (read): OUTPUT_DIR, LOG_AGE (hours), LOG_SIZE_LIMIT (MB)
# Returns: 1 when the log directory cannot be created, otherwise 0.
collect_logs() {
  log "收集系统日志(最近${LOG_AGE}小时,大小限制: ${LOG_SIZE_LIMIT}MB)..."

  local syslog_dir="${OUTPUT_DIR}/logs"
  mkdir -p "$syslog_dir" || return 1

  # Common system log files.
  local -a system_logs=(
    "/var/log/syslog"
    "/var/log/messages"
    "/var/log/kern.log"
    "/var/log/dmesg"
    "/var/log/boot.log"
  )
  _copy_existing_logs "$syslog_dir" "${system_logs[@]}"

  # Fall back to journalctl when neither syslog nor messages was collected.
  if [ ! -f "${syslog_dir}/syslog" ] && [ ! -f "${syslog_dir}/messages" ]; then
    log "使用journalctl收集系统日志..."
    if command -v journalctl >/dev/null 2>&1; then
      if journalctl --since "${LOG_AGE} hours ago" > "${syslog_dir}/journalctl.log" 2>/dev/null; then
        log_success "journalctl日志收集成功"
      else
        log_warning "journalctl日志收集失败"
        echo "无法获取journalctl日志" > "${syslog_dir}/journalctl.log"
      fi
    else
      log_warning "journalctl命令不存在"
      echo "journalctl命令不可用" > "${syslog_dir}/journalctl.log"
    fi
  fi

  # Kernel ring buffer.
  log "收集dmesg日志..."
  if command -v dmesg >/dev/null 2>&1; then
    if dmesg -T > "${syslog_dir}/dmesg.log" 2>&1; then
      log_success "dmesg日志收集成功"
    else
      # Retry without -T in case this dmesg does not support it.
      dmesg > "${syslog_dir}/dmesg.log" 2>&1 || {
        log_warning "dmesg日志收集失败"
        echo "无法获取dmesg日志" > "${syslog_dir}/dmesg.log"
      }
    fi
  else
    log_warning "dmesg命令不存在"
    echo "dmesg命令不可用" > "${syslog_dir}/dmesg.log"
  fi

  # Application logs (DCU related; nvidia kept for compatibility).
  collect_app_logs "dcu"
  collect_app_logs "hydcu"
  collect_app_logs "nvidia"

  # Other useful logs.
  local -a other_logs=(
    "/var/log/secure"
    "/var/log/auth.log"
    "/var/log/yum.log"
    "/var/log/apt/history.log"
  )
  _copy_existing_logs "$syslog_dir" "${other_logs[@]}"

  log_success "系统日志收集完成"
  return 0
}
\ No newline at end of file
#!/usr/bin/bash
#!/usr/bin/env bash
function echoAndRun(){
# 颜色定义
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
# Coloured logging helpers.
# Globals (read): BLUE, GREEN, YELLOW, RED, NC - colour escape sequences
#                 QUIET_MODE - informational output is suppressed unless it
#                 is 0; an unset QUIET_MODE is treated as 0, so the helpers
#                 also work when this file is used on its own.

# Print an informational message on stdout (suppressed in quiet mode).
log() {
  if [ "${QUIET_MODE:-0}" -eq 0 ]; then
    echo -e "${BLUE}[INFO]${NC} $*"
  fi
}

# Print a success message on stdout (suppressed in quiet mode).
log_success() {
  if [ "${QUIET_MODE:-0}" -eq 0 ]; then
    echo -e "${GREEN}[SUCCESS]${NC} $*"
  fi
}

# Print a warning on stderr; never suppressed.
log_warning() {
  echo -e "${YELLOW}[WARNING]${NC} $*" >&2
}

# Print an error on stderr; never suppressed.
log_error() {
  echo -e "${RED}[ERROR]${NC} $*" >&2
}
# Show a command, run it once in the current shell, and report the result.
# Arguments: $1 - command string, evaluated with eval
# Globals (read): BLUE, NC
# Returns: the exit status of the evaluated command.
echoAndRun() {
  local cmd="$1"
  local exit_code

  hline
  echo
  echo -e "${BLUE}[执行命令]${NC} $cmd"
  echo
  # Evaluate exactly once; running the command a second time would repeat
  # its side effects.
  eval "$cmd"
  exit_code=$?
  echo
  if [ "$exit_code" -eq 0 ]; then
    log_success "命令执行成功"
  else
    log_warning "命令执行失败,退出码: $exit_code"
  fi
  return "$exit_code"
}
# 日志函数
function log() {
[ $QUIET_MODE -eq 0 ] && echo "$@"
# Print a horizontal rule: 60 '=' characters followed by a newline.
hline() {
  local bar
  # Build 60 blanks, then turn each blank into '='.
  printf -v bar '%*s' 60 ''
  echo "${bar// /=}"
}
function hline() {
printf "%0.s=" {1..60}
# Print a section heading: a blank line, a rule, the title in blue, a rule.
# Arguments: $1 - title text
# Globals (read): BLUE, NC
head_normal() {
  local title="$1"

  echo
  hline
  echo -e "${BLUE}${title}${NC}"
  hline
}
# Report whether a command name resolves in the current shell.
# Arguments: $1 - command name
# Returns: the status of `command -v` for that name; prints nothing.
command_exists() {
  local candidate="$1"
  command -v "$candidate" &> /dev/null
  return $?
}
# Run a command string, turning a failure into a warning so that the caller
# can carry on.
# Arguments: $1 - command string, evaluated with eval
#            $2 - description used in the log lines (defaults to "执行命令")
# Returns: 0 when the command succeeds, 1 otherwise.
safe_run() {
  local command_line="$1"
  local label="${2:-执行命令}"

  log "${label}: ${command_line}"
  if eval "$command_line"; then
    return 0
  fi

  log_warning "${label} 失败,继续执行..."
  return 1
}
# Progress indicators.
# Globals (read): BLUE, GREEN, RED, NC - colour escape sequences
#                 QUIET_MODE - progress and success output is suppressed
#                 unless it is 0; an unset QUIET_MODE is treated as 0.

# Print an "in progress" marker without a trailing newline.
# Arguments: $1 - message
progress() {
  local message="$1"
  if [ "${QUIET_MODE:-0}" -eq 0 ]; then
    echo -n -e "${BLUE}[...]${NC} $message"
  fi
}

# Overwrite the progress line with a success marker.
# Arguments: $1 - message
progress_done() {
  if [ "${QUIET_MODE:-0}" -eq 0 ]; then
    echo -e "\r${GREEN}[✓]${NC} $1"
  fi
}

# Overwrite the progress line with a failure marker on stderr; never
# suppressed.
# Arguments: $1 - message
progress_failed() {
  echo -e "\r${RED}[✗]${NC} $1" >&2
}
# Check that a path is an existing regular file.
# Arguments: $1 - path to test
# Returns: 0 when it is; otherwise logs an error and returns 1.
check_file() {
  local path="$1"

  [[ -f "$path" ]] && return 0

  log_error "文件不存在: $path"
  return 1
}
## 标准化提示信息格式
function head_normal() {
echo -e "\n############$1############"
# Check that a path is an existing directory.
# Arguments: $1 - path to test
# Returns: 0 when it is; otherwise logs an error and returns 1.
check_dir() {
  local path="$1"

  [[ -d "$path" ]] && return 0

  log_error "目录不存在: $path"
  return 1
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment