Commit c0705977 authored by wangkaixiong's avatar wangkaixiong 🚴🏼
Browse files

init

parent d3982d85
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
# Base image: official slim Python runtime.
FROM python:3.9-slim
# System packages needed to build Python wheels, plus curl.
RUN apt-get update && apt-get install -y \
    build-essential \
    libssl-dev \
    libffi-dev \
    python3-dev \
    curl \
    && rm -rf /var/lib/apt/lists/*
# All following paths are relative to /app.
WORKDIR /app
# Install Python dependencies BEFORE copying the application sources so that
# source-only changes reuse this cached layer instead of re-installing everything.
RUN pip install --no-cache-dir \
    gradio \
    requests \
    pyyaml \
    pandas \
    plotly \
    gunicorn
# Copy the monitor script and related files.
# NOTE(review): COPY sources are resolved inside the build context, so the
# context is expected to contain public/wkx/dcgm/mon — confirm the build command.
COPY /public/wkx/dcgm/mon /app
# Directory that holds the SQLite database (see DB_PATH below).
RUN mkdir -p /app/data
# Install the startup script.
COPY entrypoint.sh /app/
RUN chmod +x /app/entrypoint.sh
# Port the Gradio UI listens on.
EXPOSE 7860
# Runtime configuration consumed by entrypoint.sh.
ENV PYTHONUNBUFFERED=1
ENV DB_PATH=/app/data/monitor.db
ENV YAML_PATH=/app/dev_list.yaml
# Start the monitor through the entrypoint script.
ENTRYPOINT ["/app/entrypoint.sh"]
# DCU 多机监控平台需求文档
## 1. 项目概述
- **项目名称**: DCU Multi-Machine Monitor
- **项目描述**: 基于 DCGM 的多台 GPU 服务器实时监控可视化平台
- **技术栈**: Python + Gradio + Plotly + SQLite
---
## 2. 功能需求
### 2.1 多机监控
- [x] 支持同时监控多台机器
- [x] 批量添加 URL(每行一个地址)
- [x] 单个添加 URL
- [x] 添加 URL 后自动开始监控
- [x] 支持移除已添加的目标
- [x] 监控数据保存到数据库
### 2.2 监控指标
从 API `http://<ip>:16081/CollectDeviceMetrics` 获取以下数据:
| 字段 | 说明 | 显示方式 |
|------|------|----------|
| PowerUsage | 功率 (W) | 折线图 |
| MemoryCap | 内存容量 | - |
| MemoryUsed | 内存使用量 | 计算利用率 |
| UtilizationRate | 利用率 (%) | 折线图 |
| Temperature | 温度 (°C) | 折线图 |
**内存利用率计算**
```
Mem Utilization (%) = ceil(MemoryUsed / MemoryCap * 100)
```
- 精度:整数(向上取整,不四舍五入)
### 2.3 数据存储
- **数据库**: SQLite (`/public/wkx/dcgm/mon/monitor.db`)
- **保留时间**: 最近 24 小时
- **自动清理**: 每次更新数据时删除超过 24 小时的数据
- **数据表**:
- `monitor_targets`: 监控目标 URL 列表
- `monitor_data`: 监控数据(包含 timestamp 字段)
### 2.4 图表展示
- [x] 4 个独立 Tab 页:Power、Memory、Utilization、Temperature
- [x] 每个设备单独的颜色曲线
- [x] 鼠标悬停显示详细数值
- [ ] 必须是折线图
### 2.5 时间轴功能
- **默认显示**: 最近 30 分钟
- **可调范围**: 5 分钟 ~ 24 小时(1440 分钟)
- [ ] 期望在 2.4 的每一个图表底部增加:选中图表后,可通过鼠标滚轮滑动来缩放时间轴
- [ ] 鼠标滚轮缩放范围:5 ~ 1440 分钟
### 2.6 UI 布局
- 图表尺寸:高度 800px(比原来增加 1 倍)
- 目标选择:下拉框切换不同机器
- 目标列表:滚动显示所有已添加的目标
---
## 3. 页面交互
### 3.1 添加监控目标
**方式一:单个添加**
```
输入框: http://10.20.100.12:16081/CollectDeviceMetrics
按钮: Add Single URL
```
**方式二:批量添加**
```
输入框: 每行一个URL
http://10.20.100.12:16081/CollectDeviceMetrics
http://10.20.100.13:16081/CollectDeviceMetrics
...
按钮: Batch Add URLs
```
### 3.2 监控控制
- **Start Monitoring**: 启动监控(每5秒采集一次)
- **Refresh Charts**: 刷新图表数据
- [ ] 增加监控失败的提示;如果失败,出现按钮,用于删除指定机器的监控任务
### 3.3 多个监控目标的显示
- [ ] 存在多个监控目标时,在监控栏中以每台机器为一个矩形区,从上向下排列,每台机器各自展示其监控目标的监控折线图
- [ ] 每个机器的矩形区的右侧,增加停机并监控目标的按钮
---
## 4. 配置文件
## 5. API 数据格式
### 5.1 输入数据
```json
[
{
"MinorNumber": 0,
"PciBusNumber": "0000:00:08.0",
"DeviceId": "TPXS320016070601",
"SubSystemName": "K100 AI",
"Temperature": 57,
"PowerUsage": 113,
"PowerCap": 400,
"MemoryCap": 68702699520,
"MemoryUsed": 2142208,
"UtilizationRate": 0,
"PcieBwMb": 0,
"Clk": 600,
"Socclk": 309,
"PerfLevel": "AUTO"
}
]
```
### 5.2 输出图表
- **Power**: 功率监控 (W)
- **Memory**: 内存利用率 (%), 范围 0-100
- **Utilization**: 利用率 (%), 范围 0-100
- **Temperature**: 温度 (°C)
#!/bin/bash
# Installation settings.
SERVICE_NAME='dcgm-dcu'
SERVICE_USER='root' # Account the service runs as; change if required
DCGM_PORT='16081'
INSTALL_DIR='/opt'
DCGM_FILE='dcgm-dcu-v2.1.0.run'
DCGM_URL='https://download.sourcefind.cn:65024/file/5/Kubernetes插件/dcgm-dcu/dcgm-dcu-v2.1.0.run'
# ANSI colour escapes, stored unexpanded and interpreted later by `echo -e`.
NC='\033[0m' # No Color (reset)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
# 打印带颜色的信息
print_info() {
  # Print $1 to stdout behind a green "[INFO]" tag. %b expands backslash
  # escapes the same way `echo -e` does, so the colour codes still render.
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
print_warn() {
  # Print $1 behind a yellow "[WARN]" tag.
  # Written to stderr so diagnostics stay out of any captured stdout.
  echo -e "${YELLOW}[WARN]${NC} $1" >&2
}
print_error() {
  # Print $1 behind a red "[ERROR]" tag.
  # Written to stderr so error text is not mixed into captured stdout.
  echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Refuse to continue unless the script is running as root.
if [[ "$EUID" -ne 0 ]]; then
  print_error "请使用 root 用户运行此脚本"
  exit 1
fi
# Enter the installation directory; abort when that is not possible.
if ! cd "$INSTALL_DIR"; then
  print_error "无法切换到目录 $INSTALL_DIR"
  exit 1
fi
# Download the installer unless a copy is already present.
if [ -f "$DCGM_FILE" ]; then
  print_info "文件 $DCGM_FILE 已存在,跳过下载"
else
  print_info "开始下载 $DCGM_FILE ..."
  # Fetch under a temporary name and rename only after wget succeeds, so an
  # interrupted transfer is never mistaken for a complete installer next run.
  download_tmp="${DCGM_FILE}.part"
  if wget -O "$download_tmp" "$DCGM_URL" && mv -f -- "$download_tmp" "$DCGM_FILE"; then
    print_info "下载完成"
  else
    rm -f -- "$download_tmp"
    print_error "下载失败"
    exit 1
  fi
fi
# Make the installer executable.
print_info "添加执行权限..."
if ! chmod +x "$DCGM_FILE"; then
  print_error "设置执行权限失败"
  exit 1
fi
print_info "权限设置完成"
# Run the installer from the current directory.
print_info "开始安装 DCGM-DCU..."
if ! ./"$DCGM_FILE"; then
  print_error "安装失败"
  exit 1
fi
print_info "安装完成"
# The binary directory must exist after installation.
DCGM_BIN_DIR="/opt/dcgm-dcu/bin"
[ -d "$DCGM_BIN_DIR" ] || {
  print_error "安装目录 $DCGM_BIN_DIR 不存在"
  exit 1
}
# Switch into the binary directory.
if ! cd "$DCGM_BIN_DIR"; then
  print_error "无法切换到目录 $DCGM_BIN_DIR"
  exit 1
fi
# Detect a dcgm-dcu process already started with this port and ask whether to replace it.
proc_pattern="dcgm-dcu.*--port=$DCGM_PORT"
if pgrep -f "$proc_pattern" > /dev/null; then
  print_warn "DCGM-DCU 进程已在端口 $DCGM_PORT 上运行"
  print_info "正在运行的进程:"
  ps aux | grep "$proc_pattern" | grep -v grep
  # Read a single keypress into $REPLY.
  read -r -n 1 -p "是否要停止现有进程并重新启动? (y/n): "
  echo
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    print_info "退出安装"
    exit 0
  fi
  print_info "停止现有进程..."
  pkill -f "$proc_pattern"
  # Short pause before continuing with the service setup.
  sleep 2
fi
# Stop and disable a previously registered unit of the same name, if any.
unit_listing=$(systemctl list-unit-files)
if grep -q "$SERVICE_NAME.service" <<< "$unit_listing"; then
  print_info "停止已存在的 $SERVICE_NAME 服务..."
  # Errors are discarded: the unit may be registered but not active or enabled.
  systemctl stop "$SERVICE_NAME" 2> /dev/null
  systemctl disable "$SERVICE_NAME" 2> /dev/null
fi
# Write the systemd unit file.
# The heredoc delimiter is unquoted, so ${SERVICE_USER}, ${DCGM_BIN_DIR} and
# ${DCGM_PORT} are expanded by this script while writing; the escaped \$MAINPID
# is written literally into the unit file.
# NOTE(review): PIDFile= is set together with Type=simple, and Documentation=
# still points at a placeholder URL — confirm both are intended.
print_info "创建 systemd 服务文件..."
cat > /etc/systemd/system/${SERVICE_NAME}.service << EOF
[Unit]
Description=DCGM-DCU Monitoring Service
Documentation=https://github.com/your-repo/dcgm-dcu
After=network.target network-online.target
Wants=network-online.target
[Service]
Type=simple
User=${SERVICE_USER}
WorkingDirectory=${DCGM_BIN_DIR}
ExecStart=${DCGM_BIN_DIR}/dcgm-dcu --port=${DCGM_PORT}
ExecStop=/bin/kill -TERM \$MAINPID
ExecReload=/bin/kill -HUP \$MAINPID
Restart=on-failure
RestartSec=10
StandardOutput=append:${DCGM_BIN_DIR}/log.log
StandardError=append:${DCGM_BIN_DIR}/log.log
PIDFile=/run/${SERVICE_NAME}.pid
# 安全设置
NoNewPrivileges=yes
PrivateTmp=yes
ProtectSystem=strict
ProtectHome=yes
ReadWritePaths=${DCGM_BIN_DIR}
# 资源限制
LimitNOFILE=65536
LimitNPROC=4096
[Install]
WantedBy=multi-user.target
EOF
# Make systemd pick up the unit file that was just written.
print_info "重新加载 systemd 配置..."
systemctl daemon-reload
# Enable start at boot.
print_info "启用 $SERVICE_NAME 服务开机自启动..."
if ! systemctl enable "$SERVICE_NAME"; then
  print_error "设置开机自启动失败"
  exit 1
fi
print_info "服务已设置为开机自启动"
# Start the service now; on failure show its status before exiting.
print_info "启动 $SERVICE_NAME 服务..."
if ! systemctl start "$SERVICE_NAME"; then
  print_error "服务启动失败,请检查状态"
  systemctl status "$SERVICE_NAME" --no-pager
  exit 1
fi
print_info "服务启动成功"
# Wait two seconds, then check that the service is still active.
sleep 2
if ! systemctl is-active --quiet "$SERVICE_NAME"; then
  # Failure path: print unit status, recent journal entries and the log tail.
  print_error "服务启动失败,请检查以下信息:"
  echo ""
  print_info "服务状态:"
  systemctl status "$SERVICE_NAME" --no-pager
  echo ""
  print_info "最近日志:"
  journalctl -u "$SERVICE_NAME" -n 20 --no-pager
  echo ""
  print_info "应用日志:"
  tail -20 "${DCGM_BIN_DIR}/log.log" 2> /dev/null || echo "无法读取日志文件"
  exit 1
fi
# Success path: print where to reach the service and how to manage it.
print_info "服务运行正常"
print_info "访问地址: http://localhost:$DCGM_PORT"
print_info "日志文件: ${DCGM_BIN_DIR}/log.log"
print_info "查看日志: tail -f ${DCGM_BIN_DIR}/log.log"
print_info "查看服务状态: systemctl status $SERVICE_NAME"
print_info "停止服务: systemctl stop $SERVICE_NAME"
print_info "重启服务: systemctl restart $SERVICE_NAME"
# Install a small management wrapper at /usr/local/bin/dcgm-dcu-manager.
# The heredoc delimiter is quoted ('EOF'), so the body is written verbatim:
# its $1 and $SERVICE_NAME references are evaluated when the wrapper runs,
# not by this installer. The wrapper carries its own copies of the service
# name, port and log path rather than inheriting the values used above.
print_info "创建管理脚本..."
cat > /usr/local/bin/dcgm-dcu-manager << 'EOF'
#!/bin/bash
SERVICE_NAME="dcgm-dcu"
DCGM_PORT=16081
case "$1" in
start)
systemctl start $SERVICE_NAME
echo "DCGM-DCU 服务已启动"
;;
stop)
systemctl stop $SERVICE_NAME
echo "DCGM-DCU 服务已停止"
;;
restart)
systemctl restart $SERVICE_NAME
echo "DCGM-DCU 服务已重启"
;;
status)
systemctl status $SERVICE_NAME
;;
logs)
tail -f /opt/dcgm-dcu/bin/log.log
;;
journal)
journalctl -u $SERVICE_NAME -f
;;
*)
echo "用法: $0 {start|stop|restart|status|logs|journal}"
exit 1
;;
esac
EOF
# Make the wrapper executable and tell the user how to call it.
chmod +x /usr/local/bin/dcgm-dcu-manager
print_info "管理脚本已创建: dcgm-dcu-manager"
print_info "使用示例: dcgm-dcu-manager {start|stop|restart|status|logs|journal}"
# Switch back to the previous working directory ($OLDPWD), silently.
cd - > /dev/null
# Final summary banner.
banner_rule="=========================================="
echo ""
echo "$banner_rule"
print_info "安装和配置完成!"
echo "$banner_rule"
print_info "服务名称: $SERVICE_NAME"
print_info "服务状态: $(systemctl is-active "$SERVICE_NAME")"
print_info "开机自启: $(systemctl is-enabled "$SERVICE_NAME")"
echo ""
# Cheat sheet of everyday commands.
print_info "常用命令:"
echo " systemctl status $SERVICE_NAME # 查看服务状态"
echo " systemctl restart $SERVICE_NAME # 重启服务"
echo " systemctl stop $SERVICE_NAME # 停止服务"
echo " journalctl -u $SERVICE_NAME -f # 查看系统日志"
echo " dcgm-dcu-manager logs # 查看应用日志"
echo "$banner_rule"
\ No newline at end of file
http://10.20.100.12:16081/CollectDeviceMetrics
http://10.20.100.13:16081/CollectDeviceMetrics
http://10.20.100.14:16081/CollectDeviceMetrics
http://10.20.100.19:16081/CollectDeviceMetrics
http://10.20.100.15:16081/CollectDeviceMetrics
#!/bin/bash
# Container entrypoint: make sure the target list and the database directory
# exist, then start the DCU monitor.
set -euo pipefail

# Fall back to fixed paths when the variables are not provided, so the
# redirection and dirname calls below never operate on an empty path.
YAML_PATH="${YAML_PATH:-/app/dev_list.yaml}"
DB_PATH="${DB_PATH:-/app/data/monitor.db}"
export YAML_PATH DB_PATH

# Create a default target list when none exists.
if [ ! -f "$YAML_PATH" ]; then
  echo "创建默认 YAML 配置文件..."
  mkdir -p "$(dirname "$YAML_PATH")"
  # The monitor reads this file as plain text, one URL per line, so the
  # defaults are written without YAML keys or list markers.
  cat > "$YAML_PATH" << 'EOF'
http://10.20.100.13:16081/CollectDeviceMetrics
http://10.20.100.12:16081/CollectDeviceMetrics
EOF
  echo "默认配置文件创建完成"
fi

# Make sure the database directory exists.
mkdir -p "$(dirname "$DB_PATH")"

echo "启动 DCU 监控服务..."
# Option 1: run the script directly (suited to debugging). exec replaces this
# shell, so signals sent to the container reach Python directly.
exec python /app/new_ds_mon.py
# Option 2: gunicorn (suited to production)
# gunicorn -w 1 -b 0.0.0.0:7860 --chdir /app new_ds_mon:app
import gradio as gr
import requests
import json
import time
import sqlite3
import os
import yaml
from datetime import datetime, timedelta
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import threading
import queue
from typing import List, Dict, Any
import re
import traceback
class DCUMultiMonitor:
def __init__(self, db_path="/public/wkx/dcgm/mon/monitor.db", yaml_path="dev_list.yaml"):
    """Initialise in-memory state, then prepare the database and load saved targets.

    Args:
        db_path: Path of the SQLite database file.
        yaml_path: Path of the file in which the target URLs are persisted.
    """
    # Storage locations.
    self.db_path, self.yaml_path = db_path, yaml_path
    # Target bookkeeping.
    self.targets = set()         # every monitored URL
    self.failed_targets = set()  # URLs whose last poll failed
    self.target_status = {}      # per-URL status flag
    # Background worker state.
    self.data_lock = threading.Lock()  # guards database access
    self.monitor_thread = None
    self.monitoring = False
    # Side effects: create the schema and read the persisted target list.
    self.ensure_db_exists()
    self.load_targets_from_yaml()
def ensure_db_exists(self):
    """Create the SQLite database and its tables when they are missing.

    If ``monitor_data`` already exists but lacks any expected column, the
    table is dropped (its rows are lost) and recreated with the full schema.
    """
    db_dir = os.path.dirname(self.db_path)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    expected_columns = ['url', 'minor_number', 'timestamp', 'power_usage', 'memory_used', 'memory_cap', 'utilization_rate', 'temperature']
    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='monitor_data';")
        if cursor.fetchone():
            cursor.execute("PRAGMA table_info(monitor_data);")
            columns = [col[1] for col in cursor.fetchall()]
            missing_columns = [col for col in expected_columns if col not in columns]
            if missing_columns:
                print(f"检测到表结构不完整,缺少列: {missing_columns}")
                print("正在重建表...")
                # The CREATE ... IF NOT EXISTS below rebuilds it.
                cursor.execute("DROP TABLE monitor_data;")
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS monitor_targets (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                added_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS monitor_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL,
                minor_number INTEGER NOT NULL,
                timestamp DATETIME NOT NULL,
                power_usage REAL,
                memory_used REAL,
                memory_cap REAL,
                utilization_rate REAL,
                temperature REAL,
                FOREIGN KEY (url) REFERENCES monitor_targets (url)
            )
        ''')
        conn.commit()
    finally:
        # Release the handle even when a statement fails.
        conn.close()
def load_targets_from_yaml(self):
    """Load the monitored URLs from ``self.yaml_path`` into ``self.targets``.

    The file is read as text, one target per line. YAML list syntax
    (``- <url>`` items under a key such as ``targets:``) is accepted as well
    as plain one-URL-per-line files. Blank lines, ``#`` comments and entries
    rejected by ``is_valid_url`` are skipped. When the file does not exist
    an empty one is created. Every loaded target is marked active in
    ``self.target_status``.
    """
    if os.path.exists(self.yaml_path):
        try:
            urls = set()
            with open(self.yaml_path, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    entry = raw_line.strip()
                    # Drop a leading YAML list marker ("- http://...").
                    if entry.startswith('-'):
                        entry = entry[1:].strip()
                    if not entry or entry.startswith('#'):
                        continue
                    if not self.is_valid_url(entry):
                        # e.g. a "targets:" mapping key, which cannot be polled.
                        print(f"忽略无效的监控目标: {entry}")
                        continue
                    urls.add(entry)
            self.targets = urls
            print(f"从YAML文件加载了 {len(self.targets)} 个监控目标")
        except Exception as e:
            print(f"读取YAML文件失败: {e}")
            self.targets = set()
    else:
        print(f"YAML文件 {self.yaml_path} 不存在,创建新文件")
        self.targets = set()
        self.save_targets_to_yaml()
    for target in self.targets:
        self.target_status[target] = True
def save_targets_to_yaml(self):
    """Write the current targets to ``self.yaml_path``, sorted, one URL per line.

    Returns:
        True when the file was written, False when any error occurred
        (the error is printed, not raised).
    """
    try:
        with open(self.yaml_path, 'w', encoding='utf-8') as out:
            out.write("".join(f"{url}\n" for url in sorted(self.targets)))
    except Exception as e:
        print(f"保存YAML文件失败: {e}")
        return False
    else:
        print(f"保存 {len(self.targets)} 个监控目标到YAML文件")
        return True
def add_target(self, url: str):
    """Add a monitoring target and persist the updated list.

    Args:
        url: Endpoint URL; surrounding whitespace is ignored and ``None``
            is treated as an empty string.

    Returns:
        ``(ok, message)``: ``ok`` is True only when the URL was valid, not
        already present, and the list was saved.
    """
    url = (url or "").strip()
    if not self.is_valid_url(url):
        return False, f"无效URL格式: {url}"
    if url in self.targets:
        return False, f"目标已存在: {url}"
    self.targets.add(url)
    self.target_status[url] = True
    if self.save_targets_to_yaml():
        print(f"已添加目标: {url}")
        return True, f"成功添加: {url}"
    # Roll back both structures so a failed save leaves no trace of the URL.
    self.targets.discard(url)
    self.target_status.pop(url, None)
    return False, f"保存失败: {url}"
def remove_target_by_ip(self, ip: str):
    """Remove the target whose host equals ``ip`` and delete its stored rows.

    The host part of each URL is compared for equality, so "10.20.100.1"
    no longer matches "10.20.100.12", and an empty ``ip`` matches nothing.
    Passing a complete target URL is also accepted.

    Returns:
        ``(ok, message)``; ``ok`` is False when no target matched.
    """
    ip = (ip or "").strip()
    matched_url = None
    if ip:
        for url in self.targets:
            if url == ip or self.extract_ip_from_url(url) == ip:
                matched_url = url
                break
    if matched_url is None:
        return False, f"未找到包含IP {ip} 的设备"

    self.targets.remove(matched_url)
    self.target_status.pop(matched_url, None)
    self.failed_targets.discard(matched_url)
    self.save_targets_to_yaml()

    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("DELETE FROM monitor_targets WHERE url = ?", (matched_url,))
        cursor.execute("DELETE FROM monitor_data WHERE url = ?", (matched_url,))
        conn.commit()
    finally:
        # Release the handle even when a DELETE fails.
        conn.close()
    return True, f"已删除: {matched_url}"
def is_valid_url(self, url: str) -> bool:
    """Return True when ``url`` looks like ``http(s)://<host>:<port>/<path>``."""
    return re.match(r'^https?://[\w\.-]+:\d+/.*$', url) is not None
def extract_ip_from_url(self, url: str) -> str:
    """Return the host part of an ``http(s)://host:port`` URL.

    The input is returned unchanged when it does not match that shape.
    """
    host_match = re.search(r'https?://([\w\.-]+):', url)
    return host_match.group(1) if host_match else url
def fetch_dcu_data(self, url: str) -> List[Dict[str, Any]]:
    """Fetch device metrics from ``url`` and add a ``MemoryUtilization`` field.

    ``MemoryUtilization`` is ``ceil(MemoryUsed / MemoryCap * 100)`` as an
    integer (rounded up, never to nearest), or 0 when ``MemoryCap`` is
    missing or not positive.

    Returns:
        The decoded list of device dicts, or ``[]`` on a timeout, connection
        error, non-200 status, non-list payload or any other failure. Every
        failure is printed, not raised.
    """
    import math  # local import: the module does not import math at top level

    try:
        # Short timeout so one unreachable host cannot stall the polling loop.
        response = requests.get(url, timeout=3)
        if response.status_code != 200:
            print(f"获取数据失败 {url}: HTTP {response.status_code}")
            return []
        data = response.json()
        if not isinstance(data, list):
            # Callers iterate over device dicts; anything else is unusable.
            print(f"获取数据失败 {url}: 返回数据不是列表")
            return []
        for item in data:
            cap = item.get('MemoryCap', 0)
            if cap > 0:
                # Multiply before dividing so exact ratios stay exact.
                item['MemoryUtilization'] = math.ceil(item.get('MemoryUsed', 0) * 100 / cap)
            else:
                item['MemoryUtilization'] = 0
        return data
    except requests.exceptions.Timeout:
        print(f"获取数据超时 {url}")
        return []
    except requests.exceptions.ConnectionError:
        print(f"连接失败 {url}")
        return []
    except Exception as e:
        print(f"获取数据失败 {url}: {str(e)}")
        return []
def save_data_to_db(self, url: str, data: List[Dict[str, Any]]):
    """Store one sample per device for ``url`` and purge rows older than 24 hours.

    Every device in ``data`` gets the same timestamp, so the devices of one
    poll line up on the time axis. Timestamps are local naive datetimes
    written as ``YYYY-MM-DD HH:MM:SS[.ffffff]`` text. Errors are printed,
    not raised; nothing is committed when an insert fails.
    """
    with self.data_lock:
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            now = datetime.now()
            cutoff_time = now - timedelta(hours=24)
            cursor.execute(
                "DELETE FROM monitor_data WHERE timestamp < ?",
                (cutoff_time.isoformat(" "),),
            )
            stamp = now.isoformat(" ")
            rows = [
                (
                    url,
                    item['MinorNumber'],
                    stamp,
                    item['PowerUsage'],
                    item['MemoryUsed'],
                    item['MemoryCap'],
                    item['UtilizationRate'],
                    item['Temperature'],
                )
                for item in data
            ]
            cursor.executemany('''
                INSERT INTO monitor_data
                (url, minor_number, timestamp, power_usage, memory_used, memory_cap, utilization_rate, temperature)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ''', rows)
            conn.commit()
        except Exception as e:
            print(f"保存数据到数据库失败 {url}: {str(e)}")
        finally:
            # Always release the handle; the original leaked it on errors.
            if conn is not None:
                conn.close()
def get_recent_data(self, url: str, minutes: int = 30) -> pd.DataFrame:
    """Return the rows stored for ``url`` during the last ``minutes`` minutes.

    The cutoff is computed from the local naive clock, the same clock
    ``save_data_to_db`` uses when writing timestamps. SQLite's
    ``datetime('now')`` is UTC and does not match those rows outside UTC.

    Args:
        url: Target URL whose samples are wanted.
        minutes: Size of the look-back window (default 30).

    Returns:
        A DataFrame ordered by timestamp, or an empty DataFrame on any
        error (the error is printed, not raised).
    """
    query = '''
        SELECT url, minor_number, timestamp, power_usage, memory_used, memory_cap, utilization_rate, temperature
        FROM monitor_data
        WHERE url = ? AND timestamp >= ?
        ORDER BY timestamp ASC
    '''
    try:
        # Same text layout as the stored timestamps, so string comparison orders correctly.
        cutoff = (datetime.now() - timedelta(minutes=minutes)).isoformat(" ")
        with self.data_lock:
            conn = sqlite3.connect(self.db_path)
            try:
                return pd.read_sql_query(query, conn, params=(url, cutoff))
            finally:
                conn.close()
    except Exception as e:
        print(f"获取数据失败: {str(e)}")
        return pd.DataFrame()
def start_monitoring(self):
    """Start the background polling thread (one poll of every target per ~5 s).

    If a previous worker is still winding down after ``stop_monitoring``,
    this waits for it to exit before starting a new one, so two workers
    never poll (and write) concurrently.

    Returns:
        A status message for the UI.
    """
    if self.monitoring:
        return "监控已在运行中"

    previous = self.monitor_thread
    if previous is not None and previous.is_alive():
        # The old worker sees monitoring == False and exits on its own.
        previous.join(timeout=10)
        if previous.is_alive():
            return "监控正在停止中,请稍后重试"

    self.monitoring = True

    def monitor_loop():
        while self.monitoring:
            for url in list(self.targets):
                if not self.monitoring:
                    break
                if not self.target_status.get(url, True):
                    continue
                try:
                    data = self.fetch_dcu_data(url)
                    if data:
                        self.save_data_to_db(url, data)
                        self.failed_targets.discard(url)
                    elif url not in self.failed_targets:
                        # A target is flagged on its first failed poll.
                        self.failed_targets.add(url)
                        print(f"设备 {url} 连接失败")
                except Exception as e:
                    print(f"监控循环中发生错误 {url}: {str(e)}")
                    self.failed_targets.add(url)
            # Sleep 5 s in short slices so a stop request is honoured promptly.
            for _ in range(50):
                if not self.monitoring:
                    break
                time.sleep(0.1)

    self.monitor_thread = threading.Thread(target=monitor_loop, daemon=True)
    self.monitor_thread.start()
    return f"监控已启动,当前监控 {len(self.targets)} 个目标"
def stop_monitoring(self):
    """Ask the polling thread to stop and wait up to one second for it.

    Returns:
        A status message for the UI.
    """
    self.monitoring = False
    worker = self.monitor_thread
    if worker:
        worker.join(timeout=1)
    return "监控已停止"
def get_all_targets(self):
    """Return a new list holding every monitored URL."""
    return [*self.targets]
def get_target_status_info(self):
    """Describe every target as a dict with ``url``, ``ip`` and ``status`` keys.

    ``status`` is "❌" for targets currently in ``failed_targets`` and "✅"
    for all others.
    """
    return [
        {
            'url': target,
            'ip': self.extract_ip_from_url(target),
            'status': "❌" if target in self.failed_targets else "✅",
        }
        for target in self.targets
    ]
def refresh_targets(self):
    """Re-read the target file, then return the per-target status list."""
    self.load_targets_from_yaml()
    status_rows = self.get_target_status_info()
    return status_rows
def create_charts_for_target(self, url: str):
    """Build the four line charts (power, memory, utilisation, temperature) for one target.

    Each device (``minor_number``) gets its own trace in every chart.
    Memory utilisation is ``ceil(memory_used / memory_cap * 100)``, rounded
    up rather than to nearest. Rows with a missing or non-positive
    ``memory_cap`` count as 0 % instead of breaking the chart.

    Returns:
        ``(fig_power, fig_memory, fig_util, fig_temp)``. Without data the
        figures only carry their titles. On error the same error figure is
        returned four times.
    """
    import math  # local import: the module does not import math at top level

    # (dataframe column, title suffix / y-axis label parts, fixed y range or None)
    chart_specs = [
        ('power_usage', '功率监控 (W)', '功率 (W)', None),
        ('mem_util', '内存利用率 (%)', '内存利用率 (%)', [0, 100]),
        ('utilization_rate', '利用率 (%)', '利用率 (%)', [0, 100]),
        ('temperature', '温度 (°C)', '温度 (°C)', None),
    ]
    try:
        host = self.extract_ip_from_url(url)
        df = self.get_recent_data(url)
        figures = [go.Figure() for _ in chart_specs]

        if df.empty:
            for fig, (_column, title, _y_title, _y_range) in zip(figures, chart_specs):
                fig.update_layout(title=f'{host} - {title}', height=600)
            return tuple(figures)

        df = df.copy()
        cap = df['memory_cap']
        # where() turns non-positive capacities into NaN so the division
        # cannot produce inf; those rows are then reported as 0 %.
        ratio = df['memory_used'] / cap.where(cap > 0) * 100
        df['mem_util'] = ratio.fillna(0).apply(math.ceil)

        for device_id in df['minor_number'].unique():
            device_data = df[df['minor_number'] == device_id]
            for fig, (column, _title, _y_title, _y_range) in zip(figures, chart_specs):
                fig.add_trace(go.Scatter(
                    x=device_data['timestamp'],
                    y=device_data[column],
                    mode='lines+markers',
                    name=f'Device {device_id}',
                    line=dict(width=2),
                    marker=dict(size=4)
                ))

        for fig, (_column, title, y_title, y_range) in zip(figures, chart_specs):
            layout = dict(
                title=f'{host} - {title}',
                xaxis_title='时间',
                yaxis_title=y_title,
                hovermode='x unified',
                height=600,
            )
            if y_range is not None:
                # Percentage charts keep a fixed 0-100 axis.
                layout['yaxis'] = dict(range=y_range)
            fig.update_layout(**layout)
        return tuple(figures)
    except Exception as e:
        print(f"创建图表时发生错误: {str(e)}")
        error_fig = go.Figure()
        error_fig.update_layout(title=f'数据加载失败: {str(e)}', height=600)
        return error_fig, error_fig, error_fig, error_fig
def refresh_all_charts(self, selected_target):
    """Return the four charts plus a status string for the selected target.

    Returns:
        ``(power, memory, utilisation, temperature, message)``. A
        placeholder figure is used four times when nothing valid is
        selected, when the target is marked failed, or when an error
        occurs.
    """
    def _titled_figure(title):
        # Empty figure that only carries a title.
        fig = go.Figure()
        fig.update_layout(title=title, height=600)
        return fig

    try:
        if not selected_target or selected_target not in self.targets:
            placeholder = _titled_figure("请选择一个监控目标")
            return placeholder, placeholder, placeholder, placeholder, ""
        if selected_target in self.failed_targets:
            error_msg = f"错误:无法获取 {selected_target} 的数据,请检查连接"
            placeholder = _titled_figure(f"{self.extract_ip_from_url(selected_target)} - 连接失败")
            return placeholder, placeholder, placeholder, placeholder, error_msg
        charts = self.create_charts_for_target(selected_target)
        return (*charts, f"最后更新: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    except Exception as e:
        print(f"刷新图表时发生错误: {str(e)}")
        failure = _titled_figure(f'刷新失败: {str(e)}')
        return failure, failure, failure, failure, f"刷新失败: {str(e)}"
# Create the shared monitor instance. DB_PATH and YAML_PATH, when set in the
# environment, override the built-in default locations.
monitor = DCUMultiMonitor(
    db_path=os.environ.get("DB_PATH", "/public/wkx/dcgm/mon/monitor.db"),
    yaml_path=os.environ.get("YAML_PATH", "dev_list.yaml"),
)
# Build the Gradio interface: a narrow control column next to a wide chart column.
with gr.Blocks(title="DCU多机监控平台", theme=gr.themes.Soft()) as demo:
gr.Markdown("# DCU多机监控平台")
with gr.Row():
# Control column: add target, start/stop/refresh, device picker, delete.
with gr.Column(scale=1):
gr.Markdown("### 添加监控目标")
single_url = gr.Textbox(label="单个URL", placeholder="http://ip:16081/CollectDeviceMetrics")
single_add_btn = gr.Button("添加单个URL", variant="primary")
gr.Markdown("### 监控控制")
start_btn = gr.Button("开始监控", variant="primary")
stop_btn = gr.Button("停止监控", variant="stop")
refresh_btn = gr.Button("刷新设备列表", variant="secondary")
# Read-only box that shows the message returned by the handlers.
status_output = gr.Textbox(label="状态", interactive=False)
gr.Markdown("### 监控设备列表")
# Choices start empty and are filled in by update_device_list.
device_radio = gr.Radio(label="选择设备", choices=[], interactive=True)
gr.Markdown("### 删除设备")
delete_ip = gr.Textbox(label="输入IP地址", placeholder="例如: 10.20.100.12")
delete_btn = gr.Button("删除设备", variant="stop")
# Chart column: one tab per metric plus the last-update text.
with gr.Column(scale=3):
gr.Markdown("### 监控图表")
with gr.Tab("功率监控"):
power_chart = gr.Plot()
with gr.Tab("内存利用率"):
memory_chart = gr.Plot()
with gr.Tab("利用率"):
utilization_chart = gr.Plot()
with gr.Tab("温度"):
temperature_chart = gr.Plot()
last_update = gr.Textbox(label="更新时间", interactive=False)
def update_device_list():
    """Rebuild the device radio group from the monitor's current targets.

    Each choice is labelled ``"<status> <ip>"`` and carries the target URL
    as its value. The selection is always cleared. On error an empty radio
    group is returned and the error is printed.
    """
    try:
        targets_info = monitor.get_target_status_info()
        if targets_info:
            choices = [(f"{info['status']} {info['ip']}", info['url']) for info in targets_info]
        else:
            choices = []
        return gr.Radio(choices=choices, label="选择设备", value=None)
    except Exception as e:
        print(f"更新设备列表时发生错误: {str(e)}")
        return gr.Radio(choices=[], label="选择设备", value=None)
def handle_single_add(url):
    """Add one URL; return the result message and the refreshed device list."""
    try:
        # The success flag is not needed: both outcomes return the same pair.
        _added, message = monitor.add_target(url)
        return message, update_device_list()
    except Exception as e:
        return f"添加失败: {str(e)}", update_device_list()
def handle_delete_device(ip):
    """Delete the device matching ``ip``.

    Returns:
        ``(message, refreshed device list, text for the input box)``. The
        input box is emptied only after a successful deletion.
    """
    try:
        cleaned = ip.strip()
        if not cleaned:
            return "请输入IP地址", update_device_list(), ip
        success, message = monitor.remove_target_by_ip(cleaned)
        box_text = "" if success else ip
        return message, update_device_list(), box_text
    except Exception as e:
        return f"删除失败: {str(e)}", update_device_list(), ip
def handle_refresh():
    """Reload the targets from disk; return a summary and the refreshed device list."""
    try:
        device_count = len(monitor.refresh_targets())
        return f"已刷新,当前监控 {device_count} 个设备", update_device_list()
    except Exception as e:
        return f"刷新失败: {str(e)}", update_device_list()
def handle_device_select(selected_url):
    """Redraw the charts for the newly selected device.

    An empty selection is passed on as ``None``. On error the same error
    figure is returned for all four charts together with the error text.
    """
    try:
        return monitor.refresh_all_charts(selected_url if selected_url else None)
    except Exception as e:
        error_msg = f"刷新图表失败: {str(e)}"
        error_fig = go.Figure()
        error_fig.update_layout(title=error_msg, height=600)
        return error_fig, error_fig, error_fig, error_fig, error_msg
# Wire the UI events to their handlers.
# Add button: message goes to the status box, device list is refreshed.
single_add_btn.click(
fn=handle_single_add,
inputs=single_url,
outputs=[status_output, device_radio]
)
# Delete button: the third output rewrites the IP input box.
delete_btn.click(
fn=handle_delete_device,
inputs=delete_ip,
outputs=[status_output, device_radio, delete_ip]
)
# Start/stop buttons call the monitor directly and show its status message.
start_btn.click(
fn=monitor.start_monitoring,
inputs=None,
outputs=status_output
)
stop_btn.click(
fn=monitor.stop_monitoring,
inputs=None,
outputs=status_output
)
# Refresh button: reload the target file and rebuild the device list.
refresh_btn.click(
fn=handle_refresh,
inputs=None,
outputs=[status_output, device_radio]
)
# Redraw all four charts whenever the selected device changes.
device_radio.change(
fn=handle_device_select,
inputs=device_radio,
outputs=[power_chart, memory_chart, utilization_chart, temperature_chart, last_update]
)
# Populate the device list when the page loads.
demo.load(fn=update_device_list, outputs=device_radio)
if __name__ == "__main__":
    # share=True asks Gradio for a public share link to this UI. Keep that
    # opt-in (GRADIO_SHARE=1) rather than exposing the dashboard by default.
    share_enabled = os.environ.get("GRADIO_SHARE", "").strip().lower() in ("1", "true", "yes")
    demo.launch(server_name="0.0.0.0", server_port=7860, share=share_enabled)
\ No newline at end of file
gradio==6.3.0
requests==2.31.0
plotly==5.20.0
pandas==2.2.2
# Required because the monitor module does `import yaml` at import time.
pyyaml
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment