#!/bin/bash
#
# Installs DCGM-DCU from its self-extracting .run package and shuts down any
# previously running instance (stray process or systemd unit).
# Must be run as root.

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
INSTALL_DIR="/opt"
DCGM_FILE="dcgm-dcu-v2.1.0.run"
DCGM_URL="https://download.sourcefind.cn:65024/file/5/Kubernetes插件/dcgm-dcu/dcgm-dcu-v2.1.0.run"
DCGM_PORT=16081
SERVICE_NAME="dcgm-dcu"
SERVICE_USER="root" # account the service runs as; change if needed

# ANSI colour escape sequences (stored with a literal backslash; they are
# interpreted by printf's %b in the helpers below).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset / no colour

# ---------------------------------------------------------------------------
# Logging helpers
#   $1 - message; printed verbatim (backslashes in it are not interpreted).
# ---------------------------------------------------------------------------

# Prints an informational message to stdout.
print_info() {
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "${1-}"
}

# Prints a warning to stdout.
print_warn() {
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "${1-}"
}

# Prints an error to stderr so it is not lost when stdout is redirected.
print_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}" >&2
}

# ---------------------------------------------------------------------------
# Preconditions
# ---------------------------------------------------------------------------

# The script writes under /opt and /etc/systemd, so it requires root.
if [[ "$EUID" -ne 0 ]]; then
  print_error "请使用 root 用户运行此脚本"
  exit 1
fi

cd "$INSTALL_DIR" || {
  print_error "无法切换到目录 $INSTALL_DIR"
  exit 1
}

# ---------------------------------------------------------------------------
# Download
# ---------------------------------------------------------------------------
if [[ -f "$DCGM_FILE" ]]; then
  print_info "文件 $DCGM_FILE 已存在,跳过下载"
else
  print_info "开始下载 $DCGM_FILE ..."
  # Download under a temporary name and rename only on success, so that an
  # interrupted transfer is never mistaken for a complete installer by the
  # existence check above on a later run.
  download_tmp="${DCGM_FILE}.part"
  if wget -O "$download_tmp" "$DCGM_URL" \
    && mv -f -- "$download_tmp" "$DCGM_FILE"; then
    print_info "下载完成"
  else
    rm -f -- "$download_tmp"
    print_error "下载失败"
    exit 1
  fi
fi

# ---------------------------------------------------------------------------
# Install
# ---------------------------------------------------------------------------
print_info "添加执行权限..."
if chmod +x "$DCGM_FILE"; then
  print_info "权限设置完成"
else
  print_error "设置执行权限失败"
  exit 1
fi

print_info "开始安装 DCGM-DCU..."
if "./$DCGM_FILE"; then
  print_info "安装完成"
else
  print_error "安装失败"
  exit 1
fi

# Directory expected to hold the dcgm-dcu binary after installation.
DCGM_BIN_DIR="/opt/dcgm-dcu/bin"
if [[ ! -d "$DCGM_BIN_DIR" ]]; then
  print_error "安装目录 $DCGM_BIN_DIR 不存在"
  exit 1
fi

cd "$DCGM_BIN_DIR" || {
  print_error "无法切换到目录 $DCGM_BIN_DIR"
  exit 1
}

# ---------------------------------------------------------------------------
# Stop a previously running instance
# ---------------------------------------------------------------------------

# Matched against full command lines (pgrep/pkill -f).
proc_pattern="dcgm-dcu.*--port=$DCGM_PORT"

if pgrep -f "$proc_pattern" > /dev/null; then
  print_warn "DCGM-DCU 进程已在端口 $DCGM_PORT 上运行"
  print_info "正在运行的进程:"
  # -a lists PID plus full command line for every match.
  pgrep -af "$proc_pattern"

  # Single-keystroke confirmation; anything other than y/Y aborts.
  read -p "是否要停止现有进程并重新启动? (y/n): " -n 1 -r
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    print_info "停止现有进程..."
    pkill -f "$proc_pattern"
    sleep 2 # give the process time to exit
  else
    print_info "退出安装"
    exit 0
  fi
fi

# Stop and disable an already-registered unit. The pattern is anchored and
# the dot escaped so that only "<name>.service" itself matches.
if systemctl list-unit-files | grep -q "^${SERVICE_NAME}\.service"; then
  print_info "停止已存在的 $SERVICE_NAME 服务..."
  # Best effort: the unit may already be stopped or disabled.
  systemctl stop "$SERVICE_NAME" 2> /dev/null
  systemctl disable "$SERVICE_NAME" 2> /dev/null
fi

print_info "创建 systemd 服务文件..."
# ---------------------------------------------------------------------------
# Write the systemd unit, then enable, start and verify the service.
# Reads: SERVICE_NAME, SERVICE_USER, DCGM_BIN_DIR, DCGM_PORT.
# ---------------------------------------------------------------------------

# The heredoc delimiter is unquoted so the ${...} settings are expanded now;
# \$MAINPID is escaped so it reaches the unit file literally and is
# substituted by systemd at run time.
unit_file="/etc/systemd/system/${SERVICE_NAME}.service"
if ! cat > "$unit_file" << EOF
[Unit]
Description=DCGM-DCU Monitoring Service
Documentation=https://github.com/your-repo/dcgm-dcu
After=network.target network-online.target
Wants=network-online.target

[Service]
Type=simple
User=${SERVICE_USER}
WorkingDirectory=${DCGM_BIN_DIR}
ExecStart=${DCGM_BIN_DIR}/dcgm-dcu --port=${DCGM_PORT}
ExecStop=/bin/kill -TERM \$MAINPID
ExecReload=/bin/kill -HUP \$MAINPID
Restart=on-failure
RestartSec=10
StandardOutput=append:${DCGM_BIN_DIR}/log.log
StandardError=append:${DCGM_BIN_DIR}/log.log
PIDFile=/run/${SERVICE_NAME}.pid

# 安全设置
NoNewPrivileges=yes
PrivateTmp=yes
ProtectSystem=strict
ProtectHome=yes
ReadWritePaths=${DCGM_BIN_DIR}

# 资源限制
LimitNOFILE=65536
LimitNPROC=4096

[Install]
WantedBy=multi-user.target
EOF
then
  # Without the unit file every later systemctl call would fail with a
  # less helpful message, so stop here.
  print_error "无法写入 systemd 服务文件 $unit_file"
  exit 1
fi

# Make systemd pick up the new or changed unit file.
print_info "重新加载 systemd 配置..."
if ! systemctl daemon-reload; then
  print_error "重新加载 systemd 配置失败"
  exit 1
fi

# Enable start at boot.
print_info "启用 $SERVICE_NAME 服务开机自启动..."
if systemctl enable "$SERVICE_NAME"; then
  print_info "服务已设置为开机自启动"
else
  print_error "设置开机自启动失败"
  exit 1
fi

# Start the service now.
print_info "启动 $SERVICE_NAME 服务..."
if systemctl start "$SERVICE_NAME"; then
  print_info "服务启动成功"
else
  print_error "服务启动失败,请检查状态"
  systemctl status "$SERVICE_NAME" --no-pager
  exit 1
fi

# Wait briefly, then re-check: this catches a service that starts and then
# exits within the first two seconds.
sleep 2

if systemctl is-active --quiet "$SERVICE_NAME"; then
  print_info "服务运行正常"
  print_info "访问地址: http://localhost:$DCGM_PORT"
  print_info "日志文件: ${DCGM_BIN_DIR}/log.log"
  print_info "查看日志: tail -f ${DCGM_BIN_DIR}/log.log"
  print_info "查看服务状态: systemctl status $SERVICE_NAME"
  print_info "停止服务: systemctl stop $SERVICE_NAME"
  print_info "重启服务: systemctl restart $SERVICE_NAME"
else
  # Dump everything useful for diagnosing the failure before exiting.
  print_error "服务启动失败,请检查以下信息:"
  echo ""
  print_info "服务状态:"
  systemctl status "$SERVICE_NAME" --no-pager
  echo ""
  print_info "最近日志:"
  journalctl -u "$SERVICE_NAME" -n 20 --no-pager
  echo ""
  print_info "应用日志:"
  tail -n 20 "${DCGM_BIN_DIR}/log.log" 2> /dev/null || echo "无法读取日志文件"
  exit 1
fi

print_info "创建管理脚本..."
# ---------------------------------------------------------------------------
# Generate the dcgm-dcu-manager convenience wrapper and print a summary.
# Reads: SERVICE_NAME, DCGM_PORT, DCGM_BIN_DIR.
# ---------------------------------------------------------------------------
manager_script="/usr/local/bin/dcgm-dcu-manager"

# The wrapper is written in two parts:
#   1. an unquoted heredoc, so the installer's current settings are baked in
#      rather than duplicated as hard-coded literals;
#   2. a quoted heredoc, so $1, $0 and $SERVICE_NAME stay literal and are
#      evaluated when the wrapper itself runs.
{
  cat << EOF
#!/bin/bash
# Thin wrapper around systemctl/journalctl for the ${SERVICE_NAME} service.
SERVICE_NAME="${SERVICE_NAME}"
DCGM_PORT=${DCGM_PORT}
LOG_FILE="${DCGM_BIN_DIR}/log.log"
EOF
  cat << 'EOF'

case "$1" in
  start)
    # Report success only if systemctl succeeded; otherwise exit with its status.
    systemctl start "$SERVICE_NAME" || exit
    echo "DCGM-DCU 服务已启动"
    ;;
  stop)
    systemctl stop "$SERVICE_NAME" || exit
    echo "DCGM-DCU 服务已停止"
    ;;
  restart)
    systemctl restart "$SERVICE_NAME" || exit
    echo "DCGM-DCU 服务已重启"
    ;;
  status)
    systemctl status "$SERVICE_NAME"
    ;;
  logs)
    tail -f "$LOG_FILE"
    ;;
  journal)
    journalctl -u "$SERVICE_NAME" -f
    ;;
  *)
    echo "用法: $0 {start|stop|restart|status|logs|journal}"
    exit 1
    ;;
esac
EOF
} > "$manager_script" || {
  print_error "无法写入管理脚本 $manager_script"
  exit 1
}

if ! chmod +x "$manager_script"; then
  print_error "设置执行权限失败"
  exit 1
fi

print_info "管理脚本已创建: dcgm-dcu-manager"
print_info "使用示例: dcgm-dcu-manager {start|stop|restart|status|logs|journal}"

# ---------------------------------------------------------------------------
# Final summary
# ---------------------------------------------------------------------------
echo ""
echo "=========================================="
print_info "安装和配置完成!"
echo "=========================================="
print_info "服务名称: $SERVICE_NAME"
print_info "服务状态: $(systemctl is-active "$SERVICE_NAME")"
print_info "开机自启: $(systemctl is-enabled "$SERVICE_NAME")"
echo ""
print_info "常用命令:"
echo " systemctl status $SERVICE_NAME # 查看服务状态"
echo " systemctl restart $SERVICE_NAME # 重启服务"
echo " systemctl stop $SERVICE_NAME # 停止服务"
echo " journalctl -u $SERVICE_NAME -f # 查看系统日志"
echo " dcgm-dcu-manager logs # 查看应用日志"
echo "=========================================="