auto-install-server.sh 6.49 KB
Newer Older
wangkaixiong's avatar
init  
wangkaixiong committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/bin/bash

# Installer settings.
SERVICE_NAME="dcgm-dcu"
SERVICE_USER="root"  # account the service runs as; change if needed
DCGM_PORT=16081
INSTALL_DIR="/opt"
DCGM_FILE="dcgm-dcu-v2.1.0.run"
# The download URL ends in the installer file name, so build it from DCGM_FILE.
DCGM_URL="https://download.sourcefind.cn:65024/file/5/Kubernetes插件/dcgm-dcu/${DCGM_FILE}"

# ANSI colour escape sequences, stored un-expanded; they only take effect when
# printed by something that interprets backslash escapes.
NC='\033[0m'  # reset to the terminal default
RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m'

# Coloured message helpers.
#
# print_info: write "$1" to stdout behind a green [INFO] tag.
# Backslash escapes anywhere in the line are interpreted (as with echo -e).
# Globals: GREEN, NC (read)
print_info() {
    local message=$1
    printf '%b\n' "${GREEN}[INFO]${NC} ${message}"
}

# print_warn: write "$1" behind a yellow [WARN] tag.
# The line goes to stderr so that warnings stay visible when stdout is
# redirected or captured. Backslash escapes in the line are interpreted.
# Globals: YELLOW, NC (read)
print_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1" >&2
}

# print_error: write "$1" behind a red [ERROR] tag.
# The line goes to stderr, the conventional stream for diagnostics, so error
# text is not mixed into captured or piped stdout. Backslash escapes in the
# line are interpreted.
# Globals: RED, NC (read)
print_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}

# Refuse to run unless the effective user ID is 0 (root).
[[ $EUID -eq 0 ]] || {
    print_error "请使用 root 用户运行此脚本"
    exit 1
}

# Work from the installation directory; abort if it cannot be entered.
if ! cd "$INSTALL_DIR"; then
    print_error "无法切换到目录 $INSTALL_DIR"
    exit 1
fi

# Fetch the installer unless a file named $DCGM_FILE is already present in the
# current directory.
#
# The download is written to "$DCGM_FILE.part" and renamed only after wget
# reports success. Writing straight to the final name would leave a truncated
# file behind after an interrupted transfer, and the "already exists" check
# above would then skip the download on the next run and execute that file.
# -O also pins the local name, so it never depends on how wget derives a file
# name from the URL.
if [ -f "$DCGM_FILE" ]; then
    print_info "文件 $DCGM_FILE 已存在,跳过下载"
else
    print_info "开始下载 $DCGM_FILE ..."
    if wget -O "${DCGM_FILE}.part" "$DCGM_URL" \
            && mv -f -- "${DCGM_FILE}.part" "$DCGM_FILE"; then
        print_info "下载完成"
    else
        # wget -O creates the output file even when the transfer fails.
        rm -f -- "${DCGM_FILE}.part"
        print_error "下载失败"
        exit 1
    fi
fi

# Mark the installer as executable so it can be launched directly.
print_info "添加执行权限..."
chmod +x "$DCGM_FILE" || {
    print_error "设置执行权限失败"
    exit 1
}
print_info "权限设置完成"

# Run the installer from the current directory; a non-zero exit status aborts
# the script.
print_info "开始安装 DCGM-DCU..."
if ! "./${DCGM_FILE}"; then
    print_error "安装失败"
    exit 1
fi
print_info "安装完成"

# Directory expected to exist once the installer has run; the rest of the
# script cannot proceed without it.
DCGM_BIN_DIR="/opt/dcgm-dcu/bin"
[[ -d "$DCGM_BIN_DIR" ]] || {
    print_error "安装目录 $DCGM_BIN_DIR 不存在"
    exit 1
}

# Continue from inside the bin directory.
if ! cd "$DCGM_BIN_DIR"; then
    print_error "无法切换到目录 $DCGM_BIN_DIR"
    exit 1
fi

# Look for a process whose command line matches "dcgm-dcu ... --port=<port>".
# If one is found, show it and ask whether to stop it; any answer other than
# y/Y ends the script with status 0.
DCGM_PROC_PATTERN="dcgm-dcu.*--port=$DCGM_PORT"
if pgrep -f "$DCGM_PROC_PATTERN" > /dev/null; then
    print_warn "DCGM-DCU 进程已在端口 $DCGM_PORT 上运行"
    print_info "正在运行的进程:"
    # pgrep -a prints PID plus full command line and never lists itself, so no
    # "ps | grep | grep -v grep" filtering is needed.
    pgrep -a -f "$DCGM_PROC_PATTERN"
    read -p "是否要停止现有进程并重新启动? (y/n): " -n 1 -r
    echo
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        print_info "停止现有进程..."
        pkill -f "$DCGM_PROC_PATTERN"
        # Wait (up to about 10 seconds) for the process to actually exit
        # instead of sleeping a fixed time and hoping it is gone.
        for _ in 1 2 3 4 5 6 7 8 9 10; do
            pgrep -f "$DCGM_PROC_PATTERN" > /dev/null || break
            sleep 1
        done
        if pgrep -f "$DCGM_PROC_PATTERN" > /dev/null; then
            print_error "现有进程未能停止,请手动处理后重试"
            exit 1
        fi
    else
        print_info "退出安装"
        exit 0
    fi
fi

# Stop and disable a previously installed unit with this exact name, if any.
# Only the one unit is queried and its name is matched as a fixed string, so
# neither the "." in ".service" (a regex wildcard) nor a longer unit name that
# merely contains this one can trigger the branch.
if systemctl list-unit-files "${SERVICE_NAME}.service" 2>/dev/null \
        | grep -Fq -- "${SERVICE_NAME}.service"; then
    print_info "停止已存在的 $SERVICE_NAME 服务..."
    # Best effort: the unit may already be stopped or disabled.
    systemctl stop "$SERVICE_NAME" 2>/dev/null
    systemctl disable "$SERVICE_NAME" 2>/dev/null
fi

# Generate the systemd unit for the service.
# The heredoc delimiter is unquoted, so ${...} values are substituted now,
# while \$MAINPID is escaped and reaches the unit file literally for systemd
# to expand. A failed write aborts the script rather than letting the later
# daemon-reload/enable steps fail with a less obvious error.
print_info "创建 systemd 服务文件..."
UNIT_FILE="/etc/systemd/system/${SERVICE_NAME}.service"
if ! cat > "$UNIT_FILE" << EOF
[Unit]
Description=DCGM-DCU Monitoring Service
Documentation=https://github.com/your-repo/dcgm-dcu
After=network.target network-online.target
Wants=network-online.target

[Service]
Type=simple
User=${SERVICE_USER}
WorkingDirectory=${DCGM_BIN_DIR}
ExecStart=${DCGM_BIN_DIR}/dcgm-dcu --port=${DCGM_PORT}
ExecStop=/bin/kill -TERM \$MAINPID
ExecReload=/bin/kill -HUP \$MAINPID
Restart=on-failure
RestartSec=10
StandardOutput=append:${DCGM_BIN_DIR}/log.log
StandardError=append:${DCGM_BIN_DIR}/log.log
PIDFile=/run/${SERVICE_NAME}.pid

# 安全设置
NoNewPrivileges=yes
PrivateTmp=yes
ProtectSystem=strict
ProtectHome=yes
ReadWritePaths=${DCGM_BIN_DIR}

# 资源限制
LimitNOFILE=65536
LimitNPROC=4096

[Install]
WantedBy=multi-user.target
EOF
then
    print_error "写入服务文件 $UNIT_FILE 失败"
    exit 1
fi

# Ask systemd to re-read its unit files.
print_info "重新加载 systemd 配置..."
systemctl daemon-reload

# Enable the service so it starts at boot; abort if that fails.
print_info "启用 $SERVICE_NAME 服务开机自启动..."
systemctl enable "$SERVICE_NAME" || {
    print_error "设置开机自启动失败"
    exit 1
}
print_info "服务已设置为开机自启动"

# Start the service now; on failure show its status and abort.
print_info "启动 $SERVICE_NAME 服务..."
if ! systemctl start "$SERVICE_NAME"; then
    print_error "服务启动失败,请检查状态"
    systemctl status "$SERVICE_NAME" --no-pager
    exit 1
fi
print_info "服务启动成功"

# Wait two seconds, then confirm the service is still active.
sleep 2

app_log="${DCGM_BIN_DIR}/log.log"

# Not active: dump the unit status, the last journal entries and the tail of
# the application log, then abort.
if ! systemctl is-active --quiet "$SERVICE_NAME"; then
    print_error "服务启动失败,请检查以下信息:"
    echo ""
    print_info "服务状态:"
    systemctl status "$SERVICE_NAME" --no-pager
    echo ""
    print_info "最近日志:"
    journalctl -u "$SERVICE_NAME" -n 20 --no-pager
    echo ""
    print_info "应用日志:"
    tail -n 20 "$app_log" 2>/dev/null || echo "无法读取日志文件"
    exit 1
fi

# Active: print where to reach the service and how to operate it.
print_info "服务运行正常"
print_info "访问地址: http://localhost:$DCGM_PORT"
print_info "日志文件: ${app_log}"
print_info "查看日志: tail -f ${app_log}"
print_info "查看服务状态: systemctl status $SERVICE_NAME"
print_info "停止服务: systemctl stop $SERVICE_NAME"
print_info "重启服务: systemctl restart $SERVICE_NAME"

# Install a small wrapper command, dcgm-dcu-manager, around systemctl,
# journalctl and the application log.
#
# The heredoc delimiter is quoted, so the text below is written verbatim and
# every $... in it is evaluated when the wrapper runs, not now. In the wrapper,
# each success message is chained with && so it is printed only when systemctl
# succeeded, and the wrapper's exit status reflects the systemctl result.
# If the file cannot be written or made executable, a warning is printed
# instead of claiming the script was created; the installation itself goes on.
print_info "创建管理脚本..."
MANAGER_SCRIPT="/usr/local/bin/dcgm-dcu-manager"
if cat > "$MANAGER_SCRIPT" << 'EOF' && chmod +x "$MANAGER_SCRIPT"
#!/bin/bash
# Convenience wrapper for controlling the dcgm-dcu systemd service.

SERVICE_NAME="dcgm-dcu"

case "$1" in
    start)
        systemctl start "$SERVICE_NAME" && echo "DCGM-DCU 服务已启动"
        ;;
    stop)
        systemctl stop "$SERVICE_NAME" && echo "DCGM-DCU 服务已停止"
        ;;
    restart)
        systemctl restart "$SERVICE_NAME" && echo "DCGM-DCU 服务已重启"
        ;;
    status)
        systemctl status "$SERVICE_NAME"
        ;;
    logs)
        tail -f /opt/dcgm-dcu/bin/log.log
        ;;
    journal)
        journalctl -u "$SERVICE_NAME" -f
        ;;
    *)
        echo "用法: $0 {start|stop|restart|status|logs|journal}" >&2
        exit 1
        ;;
esac
EOF
then
    print_info "管理脚本已创建: dcgm-dcu-manager"
    print_info "使用示例: dcgm-dcu-manager {start|stop|restart|status|logs|journal}"
else
    print_warn "管理脚本创建失败: $MANAGER_SCRIPT"
fi

# Switch back to the previous working directory. "cd -" goes to $OLDPWD, i.e.
# the directory that was current before the most recent cd; the path it prints
# is discarded.
cd - > /dev/null

# Closing summary: service name, live state, boot-time state and a cheat sheet
# of common commands.
separator="=========================================="

echo ""
echo "$separator"
print_info "安装和配置完成!"
echo "$separator"
print_info "服务名称: $SERVICE_NAME"
print_info "服务状态: $(systemctl is-active "$SERVICE_NAME")"
print_info "开机自启: $(systemctl is-enabled "$SERVICE_NAME")"
echo ""
print_info "常用命令:"
cat << EOF
  systemctl status $SERVICE_NAME     # 查看服务状态
  systemctl restart $SERVICE_NAME    # 重启服务
  systemctl stop $SERVICE_NAME       # 停止服务
  journalctl -u $SERVICE_NAME -f     # 查看系统日志
  dcgm-dcu-manager logs              # 查看应用日志
$separator
EOF