Commit bc71a5ea authored by zk's avatar zk
Browse files

Add GPU DCU monitor with grouping

parents
node_modules/
data/servers.json
*.log
.env
# GPU/DCU 资源看板
一个面向测试团队共享使用的 GPU/DCU 服务器占用看板。它通过 SSH 定时在服务器上执行 `hy-smi` 或 `nvidia-smi`,集中展示每台机器的卡数、型号、显存占用、算力占用、温度、功耗、空闲/占用/离线状态。
## 推荐使用方式
最推荐的方式是:**在一台能免密 SSH 登录所有服务器的电脑或跳板机上部署一次,然后把网页地址共享给大家。**
这样每个同事不用单独安装,也不用每个人都配置服务器 SSH 密钥。大家只需要访问:
```text
http://部署机器IP:3066
```
## 快速启动
Windows 用户可以双击:
```text
start-windows.bat
```
或者手动启动:
```powershell
npm start
```
默认地址:
```text
http://localhost:3066
```
如果要给同网段同事访问,在部署机器的防火墙放行 `3066` 端口后,让大家访问:
```text
http://你的电脑IP:3066
```
## 使用前提
- 部署机器需要安装 Node.js 18 或更高版本。
- 部署机器需要能通过 SSH 免密登录目标服务器。
- 海光服务器上需要能执行 `hy-smi`。
- NVIDIA 服务器上需要能执行 `nvidia-smi`。
建议先在部署机器上验证:
```powershell
ssh root@10.0.0.12 hy-smi
ssh root@10.0.0.13 nvidia-smi
```
## 功能
- 网页端添加、编辑、删除服务器。
- 自动识别卡数。
- 自动识别显卡型号:
- 海光:通过 `hy-smi --showproductname` 识别,例如 `BW100`、`BW150`,未来新型号也会按输出自动提取。
- NVIDIA:通过 `nvidia-smi --query-gpu=name` 识别。
- 同时展示显存占用和算力占用。
- 显存泄漏或 vLLM 服务未释放显存时,即使算力为 0,也会判定该卡不可用。
- 主界面水位色块按占用率从下往上填充。
- 占用颜色分级:绿色、黄绿色、橘色、红色。
- 默认每 10 秒自动采集一次,也可以点“手动刷新”立即刷新。
## 配置说明
网页里点击“添加服务器”即可写入配置。真实配置保存在:
```text
data/servers.json
```
该文件已被 `.gitignore` 忽略,避免把真实服务器地址提交到仓库。示例配置见:
```text
data/servers.sample.json
```
服务器支持 `group` 分组字段,适合公共部署时按项目或客户线管理,例如“通信中兴组”“政府联想组”“企业浪潮组”“金融华三组”“深度组”。页面会自动生成分组筛选入口;`tags` 仍然可以用于补充“8卡”“回归”“临时”等标签。
## 环境变量
```powershell
$env:PORT=3066
$env:POLL_INTERVAL_MS=10000
$env:SSH_TIMEOUT_MS=8000
npm start
```
常用配置:
- `PORT`:网页端口,默认 `3066`。
- `POLL_INTERVAL_MS`:自动采集间隔,默认 `10000` 毫秒。
- `SSH_TIMEOUT_MS`:单台服务器 SSH 超时,默认 `8000` 毫秒。
- `SSH_PATH`:自定义 SSH 程序路径。Windows 默认使用 `C:\Windows\System32\OpenSSH\ssh.exe`。
## 上传到 GitHub
首次上传:
```powershell
git init
git add .
git commit -m "Initial GPU DCU monitor"
git branch -M main
git remote add origin https://github.com/你的用户名/你的仓库名.git
git push -u origin main
```
后续更新:
```powershell
git add .
git commit -m "Update dashboard"
git push
```
## 给同事一键使用
最简单的方式:
1. 在 GitHub 页面点击 `Code` -> `Download ZIP`。
2. 解压 ZIP。
3. 双击 `start-windows.bat`。
4. 打开或分享 `http://localhost:3066` / `http://部署机器IP:3066`。
更推荐的团队方式:
1. 由你在一台固定机器上运行 `start-windows.bat`。
2. 你在网页里添加所有服务器。
3. 同事只访问 `http://部署机器IP:3066`。
如果你想做到真正的“无需安装 Node.js,双击一个 exe 就能运行”,后续可以再做 Windows 打包版。常见路线是 Electron、pkg/nexe 或 Inno Setup,但会比当前轻量 Web 版复杂一些。
## 注意事项
- 不要把 `data/servers.json` 提交到 GitHub,里面可能包含内部服务器 IP。
- 不要把 SSH 私钥提交到 GitHub。
- 如果同事访问不了网页,优先检查部署机器防火墙是否放行了 `3066` 端口。
[
{
"id": "sample-8-card",
"name": "通信中兴测试机 A",
"host": "10.0.0.12",
"port": 22,
"user": "root",
"command": "hy-smi",
"group": "通信中兴组",
"tags": ["8卡", "公共池"]
},
{
"id": "sample-4-card",
"name": "政府联想回归机 B",
"host": "10.0.0.23",
"port": 22,
"user": "root",
"command": "hy-smi",
"group": "政府联想组",
"tags": ["4卡", "夜间回归"]
}
]
{
"name": "gpu-dcu-server-monitor",
"version": "0.1.0",
"private": true,
"description": "Shared web dashboard for monitoring GPU/DCU server occupancy.",
"type": "commonjs",
"scripts": {
"start": "node server.js"
},
"engines": {
"node": ">=18"
}
}
// Mutable client-side view state shared by the loading and rendering functions in this script.
const state = {
// Server records from the most recent /api/servers response.
servers: [],
// Status filter: "all" or a server kind taken from a .filter button's data-filter attribute.
filter: "all",
// Group filter: "all" or a group name taken from a .group-filter button.
groupFilter: "all",
// Trimmed, lower-cased text from the search input.
query: "",
// Id of the server shown in the detail panel, or null when nothing is selected.
selectedId: null,
// Delay before the next automatic reload; replaced by the value the API reports, when present.
pollIntervalMs: 10000,
// Handle of the pending reload timeout created by scheduleNextLoad.
timer: null
};
// References to the static DOM nodes this script reads and writes, looked up once at load.
const els = {
  grid: document.getElementById("serverGrid"),
  empty: document.getElementById("emptyState"),
  detail: document.getElementById("detailPanel"),
  groupFilters: document.getElementById("groupFilters"),
  groupOptions: document.getElementById("groupOptions"),
  lastRefresh: document.getElementById("lastRefresh"),
  search: document.getElementById("searchInput"),
  toast: document.getElementById("toast"),
  dialog: document.getElementById("serverDialog"),
  form: document.getElementById("serverForm"),
  dialogTitle: document.getElementById("dialogTitle"),
  deleteBtn: document.getElementById("deleteServerBtn"),
  // Inputs of the add/edit dialog, keyed by the server property they edit.
  fields: {
    id: document.getElementById("serverId"),
    name: document.getElementById("serverName"),
    host: document.getElementById("serverHost"),
    user: document.getElementById("serverUser"),
    port: document.getElementById("serverPort"),
    command: document.getElementById("serverCommand"),
    group: document.getElementById("serverGroup"),
    tags: document.getElementById("serverTags")
  }
};
// One-time wiring of the static controls, followed by the initial data load.
for (const selector of ["#addServerBtn", "#emptyAddBtn"]) {
  // Wrapped in an arrow so the click event is not handed to openDialog as a server.
  document.querySelector(selector).addEventListener("click", () => openDialog());
}
document.querySelector("#closeDialogBtn").addEventListener("click", () => els.dialog.close());
document.querySelector("#refreshBtn").addEventListener("click", manualRefresh);

// Status filter buttons: remember the chosen filter and move the "active" marker to it.
for (const button of document.querySelectorAll(".filter")) {
  button.addEventListener("click", () => {
    state.filter = button.dataset.filter;
    for (const candidate of document.querySelectorAll(".filter")) {
      candidate.classList.toggle("active", candidate === button);
    }
    render();
  });
}

// Live search: the query is stored trimmed and lower-cased.
els.search.addEventListener("input", () => {
  state.query = els.search.value.trim().toLowerCase();
  render();
});

els.form.addEventListener("submit", saveServer);
els.deleteBtn.addEventListener("click", deleteSelectedServer);

loadServers();
/**
 * Fetches the server list from /api/servers, updates the view state and re-renders.
 *
 * Errors are reported through a toast instead of being thrown. The next automatic
 * load is scheduled whether or not this one succeeded.
 *
 * @returns {Promise<boolean>} true when the list was fetched and rendered, false when
 *   an error was reported. Callers that ignore the result keep working as before.
 */
async function loadServers() {
  let loaded = false;
  try {
    const payload = await requestJson("/api/servers");
    state.servers = payload.servers || [];
    state.pollIntervalMs = payload.pollIntervalMs || state.pollIntervalMs;
    els.lastRefresh.textContent = payload.lastRefresh ? `更新 ${formatTime(payload.lastRefresh)}` : "等待刷新";
    // Select the first server by default, and fall back to it when the
    // previously selected server no longer exists.
    if (!state.selectedId && state.servers[0]) state.selectedId = state.servers[0].id;
    if (state.selectedId && !state.servers.some((server) => server.id === state.selectedId)) {
      state.selectedId = state.servers[0]?.id || null;
    }
    render();
    loaded = true;
  } catch (error) {
    showToast(error.message);
  } finally {
    scheduleNextLoad();
  }
  return loaded;
}

/**
 * (Re)arms the polling timer so loadServers runs again after state.pollIntervalMs.
 * Any previously pending timer is cancelled first, so only one is ever outstanding.
 */
function scheduleNextLoad() {
  window.clearTimeout(state.timer);
  state.timer = window.setTimeout(loadServers, state.pollIntervalMs);
}

/**
 * Handler for the manual refresh button: asks the backend to refresh, then reloads
 * the list. The button is disabled while the request is in flight.
 */
async function manualRefresh() {
  const button = document.querySelector("#refreshBtn");
  button.disabled = true;
  try {
    await requestJson("/api/refresh", { method: "POST" });
    // loadServers reports its own failures via a toast; only announce success
    // when the reload really succeeded so that error toast is not overwritten.
    const loaded = await loadServers();
    if (loaded) showToast("刷新完成");
  } catch (error) {
    showToast(error.message);
  } finally {
    button.disabled = false;
  }
}
/** Redraws every part of the page from the current view state, in a fixed order. */
function render() {
  const steps = [renderStats, renderGroups, renderGrid, renderDetail];
  for (const step of steps) {
    step();
  }
}
/**
 * Renders the group filter buttons and the group suggestions of the dialog's datalist.
 * Resets the group filter to "all" when the selected group no longer has any server.
 */
function renderGroups() {
  const summaries = groupSummaries();
  const filterStillExists = summaries.some((summary) => summary.name === state.groupFilter);
  if (!filterStillExists) {
    state.groupFilter = "all";
  }

  const filterPanel = els.groupFilters;
  if (filterPanel) {
    const allButton = groupButtonHtml("all", "全部分组", state.servers.length);
    const groupButtons = summaries.map(({ name, count }) => groupButtonHtml(name, name, count));
    filterPanel.innerHTML = allButton + groupButtons.join("");
    // The buttons were just recreated, so their click handlers are attached again.
    for (const button of filterPanel.querySelectorAll(".group-filter")) {
      button.addEventListener("click", () => {
        state.groupFilter = button.dataset.group;
        render();
      });
    }
  }

  const optionList = els.groupOptions;
  if (optionList) {
    // Built-in suggestions come first, followed by any groups already in use.
    const suggestions = new Set(["通信中兴组", "政府联想组", "企业浪潮组", "金融华三组", "深度组", "未分组"]);
    for (const { name } of summaries) {
      suggestions.add(name);
    }
    let optionsHtml = "";
    for (const name of suggestions) {
      optionsHtml += `<option value="${escapeHtml(name)}"></option>`;
    }
    optionList.innerHTML = optionsHtml;
  }
}
/**
 * Counts the loaded servers per group.
 *
 * @returns {{name: string, count: number}[]} one entry per group, sorted by name
 *   using zh-CN collation.
 */
function groupSummaries() {
  const counts = state.servers.reduce((tally, server) => {
    const name = serverGroup(server);
    return tally.set(name, (tally.get(name) ?? 0) + 1);
  }, new Map());
  const summaries = Array.from(counts, ([name, count]) => ({ name, count }));
  summaries.sort((left, right) => left.name.localeCompare(right.name, "zh-CN"));
  return summaries;
}
/**
 * Builds the markup of one group filter button.
 *
 * @param {string} value - group key stored in data-group ("all" or a group name).
 * @param {string} label - visible button text.
 * @param {number} count - number of servers shown in the badge.
 * @returns {string} HTML for the button; it carries the "active" class when value
 *   matches the current group filter.
 */
function groupButtonHtml(value, label, count) {
  const isActive = state.groupFilter === value;
  const classes = `group-filter${isActive ? " active" : ""}`;
  return [
    "",
    `<button class="${classes}" data-group="${escapeHtml(value)}" type="button">`,
    `<span>${escapeHtml(label)}</span><strong>${count}</strong>`,
    "</button>"
  ].join("\n");
}
/**
 * Recomputes the headline numbers (servers, cards, free/busy cards) and the per-status
 * server counts shown on the sidebar filter buttons, then writes them into the page.
 */
function renderStats() {
  const cardTotals = { all: 0, busy: 0, free: 0 };
  // Only these three kinds have a counter; other kinds (e.g. "pending") are not counted.
  const serversByKind = { free: 0, busy: 0, offline: 0 };

  for (const server of state.servers) {
    const status = server.status || {};
    cardTotals.all += status.totalCount || server.gpuCount || 0;
    cardTotals.busy += status.busyCount || 0;
    cardTotals.free += status.freeCount || 0;
    const kind = getServerKind(server);
    if (Object.hasOwn(serversByKind, kind)) {
      serversByKind[kind] += 1;
    }
  }

  const serverCount = state.servers.length;
  const cells = [
    ["#statServers", serverCount],
    ["#statCards", cardTotals.all],
    ["#statFreeCards", cardTotals.free],
    ["#statBusyCards", cardTotals.busy],
    ["#countAll", serverCount],
    ["#countFree", serversByKind.free],
    ["#countBusy", serversByKind.busy],
    ["#countOffline", serversByKind.offline]
  ];
  for (const [selector, value] of cells) {
    setText(selector, value);
  }
}
/**
 * Rebuilds the server card grid from the servers that pass the current filters.
 *
 * Each card selects its server on click or on Enter/Space, and carries an edit
 * button that opens the dialog for that server.
 */
function renderGrid() {
  const servers = filteredServers();
  els.grid.innerHTML = "";
  // The empty state means "no servers configured at all", not "no filter match".
  const hasServers = state.servers.length !== 0;
  els.empty.classList.toggle("hidden", hasServers);
  els.grid.classList.toggle("hidden", !hasServers);

  const selectServer = (id) => {
    state.selectedId = id;
    render();
  };

  for (const server of servers) {
    const status = server.status || {};
    const busyPercent = status.totalCount ? Math.round(((status.busyCount || 0) / status.totalCount) * 100) : 0;
    const card = document.createElement("article");
    card.className = `server-card ${serverOccupancyClass(server)} ${server.id === state.selectedId ? "selected" : ""}`;
    card.tabIndex = 0;
    card.innerHTML = serverCardHtml(server);
    // --busy is read by the stylesheet to size the filled part of the donut.
    card.style.setProperty("--busy", `${busyPercent}%`);

    card.addEventListener("click", () => selectServer(server.id));
    card.addEventListener("keydown", (event) => {
      // Ignore keys that bubble up from the edit button inside the card. Handling
      // them here would call preventDefault and stop Enter/Space from activating
      // that button for keyboard users.
      if (event.target !== card) return;
      if (event.key === "Enter" || event.key === " ") {
        event.preventDefault();
        selectServer(server.id);
      }
    });
    card.querySelector(".edit-card").addEventListener("click", (event) => {
      // Keep the click from also selecting the card.
      event.stopPropagation();
      openDialog(server);
    });

    els.grid.appendChild(card);
  }
}
/**
 * Builds the inner markup of one server card: name, address, model, status pill,
 * occupancy donut, per-card chips and tags.
 *
 * @param {object} server - server record as returned by the API.
 * @returns {string} HTML for the card body.
 */
function serverCardHtml(server) {
  const status = server.status || {};
  const kind = getServerKind(server);
  const serverLevel = serverOccupancyClass(server);
  // Only free/busy pills take the occupancy colour. In the stylesheet the
  // .status-pill.level-* rule is more specific than .status-offline and
  // .status-pending, so adding a level class there would hide the offline and
  // pending colours.
  const pillLevel = kind === "free" || kind === "busy" ? serverLevel : "";
  const totalCount = status.totalCount || server.gpuCount || 0;
  // Without user tags, show the card count (or a placeholder) next to the group.
  const fallbackTag = totalCount ? `${totalCount}卡` : "自动识别";
  const extraTags = server.tags?.length ? server.tags : [fallbackTag];
  const tags = [...new Set([serverGroup(server), ...extraTags])];
  const address = server.user ? `${server.user}@${server.host}` : server.host;
  const occupancy = totalCount ? `${status.busyCount || 0}/${totalCount}` : "识别中";
  return `
    <div class="card-head">
      <div>
        <div class="server-name">${escapeHtml(server.name)}</div>
        <div class="server-host">${escapeHtml(address)}:${escapeHtml(server.port)}</div>
        <div class="server-model">${escapeHtml(modelSummary(server))}</div>
      </div>
      <span class="status-pill status-${kind} ${pillLevel}">${kindLabel(kind)}</span>
    </div>
    <div class="gpu-ring">
      <div class="donut ${serverLevel}"><span>${escapeHtml(occupancy)}</span></div>
      <div class="summary">
        <strong>${escapeHtml(status.summary || "等待刷新")}</strong>
        <span>${status.updatedAt ? formatTime(status.updatedAt) : "未采集"}</span>
      </div>
    </div>
    <div class="gpu-grid">
      ${gpuChips(status.gpus || [], totalCount, kind)}
    </div>
    <div class="tag-list">
      ${tags.map((tag) => `<span class="tag">${escapeHtml(tag)}</span>`).join("")}
      <button class="icon-button edit-card" type="button" aria-label="编辑服务器">✎</button>
    </div>
  `;
}
/**
 * Builds the compact per-card chips shown on a server card.
 *
 * @param {object[]} gpus - per-card status entries; may be empty.
 * @param {number} count - number of chips to show; with no entries, this many
 *   placeholder chips in the "unknown" state are generated.
 * @param {string} serverKind - kind of the owning server; "offline" forces every
 *   chip into the offline style.
 * @returns {string} concatenated HTML of all chips.
 */
function gpuChips(gpus, count, serverKind) {
  const list = gpus.length ? gpus : Array.from({ length: count }, (_, index) => ({ index, state: "unknown" }));
  return list
    .slice(0, count)
    .map((gpu) => {
      const stateClass = serverKind === "offline" ? "offline" : gpu.state || "unknown";
      // Offline/unknown chips must keep their grey style. The .gpu-chip.level-*
      // rules come later in the stylesheet with equal specificity, so a level
      // class would paint them in an occupancy colour.
      const isInactive = stateClass === "offline" || stateClass === "unknown";
      const chipLevel = isInactive ? "" : gpuOccupancyClass(gpu);
      const compute = formatPercent(gpu.utilization);
      const vram = formatPercent(gpu.memoryUtilization);
      // --level drives the height of the fill inside each block.
      const computeLevel = normalizePercent(gpu.utilization);
      const vramLevel = normalizePercent(gpu.memoryUtilization);
      const computeClass = occupancyClass(gpu.utilization);
      const vramClass = occupancyClass(gpu.memoryUtilization);
      return `
        <span class="gpu-chip ${escapeHtml(stateClass)} ${chipLevel}">
          <b>#${escapeHtml(gpu.index)}</b>
          <span class="chip-metrics">
            <span class="chip-block memory ${vramClass}" style="--level:${vramLevel}%">
              <i></i>
              <em>显存</em>
              <strong>${escapeHtml(vram)}</strong>
            </span>
            <span class="chip-block compute ${computeClass}" style="--level:${computeLevel}%">
              <i></i>
              <em>算力</em>
              <strong>${escapeHtml(compute)}</strong>
            </span>
          </span>
        </span>`;
    })
    .join("");
}
/**
 * Renders the detail panel for the selected server, or a placeholder when no
 * server is selected. Every value taken from the server record is HTML-escaped.
 */
function renderDetail() {
  const server = state.servers.find((item) => item.id === state.selectedId);
  if (!server) {
    els.detail.innerHTML = `
      <div class="detail-empty">
        <div class="detail-pulse"></div>
        <h3>选择一台服务器</h3>
        <p>查看每张 GPU/DCU 卡的占用、显存、温度和连接状态。</p>
      </div>`;
    return;
  }
  const status = server.status || {};
  const kind = getServerKind(server);
  const totalCount = status.totalCount || server.gpuCount || 0;
  const occupancy = totalCount ? `${status.busyCount || 0}/${totalCount}` : "识别中";
  const latency = status.latencyMs ? `${status.latencyMs}ms` : "-";
  els.detail.innerHTML = `
    <div class="detail-head">
      <div>
        <p class="eyebrow">${totalCount ? `${escapeHtml(totalCount)}卡服务器` : "自动识别卡数"} · ${commandLabel(server.command)}</p>
        <h3>${escapeHtml(server.name)}</h3>
      </div>
      <button class="icon-button" id="detailEditBtn" type="button" aria-label="编辑服务器">✎</button>
    </div>
    <div class="detail-meta">
      <div class="meta-box"><span>状态</span><strong>${kindLabel(kind)}</strong></div>
      <div class="meta-box"><span>占用</span><strong>${escapeHtml(occupancy)}</strong></div>
      <div class="meta-box"><span>分组</span><strong>${escapeHtml(serverGroup(server))}</strong></div>
      <div class="meta-box"><span>地址</span><strong>${escapeHtml(server.host)}:${escapeHtml(server.port)}</strong></div>
      <div class="meta-box"><span>型号</span><strong>${escapeHtml(modelSummary(server))}</strong></div>
      <div class="meta-box"><span>延迟</span><strong>${escapeHtml(latency)}</strong></div>
    </div>
    ${status.error ? `<div class="meta-box"><span>错误</span><strong>${escapeHtml(status.error)}</strong></div>` : ""}
    <div class="gpu-list">
      ${(status.gpus || []).map(gpuRowHtml).join("")}
    </div>
  `;
  // The button was just created inside the panel, so look it up there.
  els.detail.querySelector("#detailEditBtn").addEventListener("click", () => openDialog(server));
}
/**
 * Builds the detail-panel row for one card: title, state, memory/compute bars
 * and the memory, temperature and power read-outs.
 *
 * @param {object} gpu - per-card status entry.
 * @returns {string} HTML for the row.
 */
function gpuRowHtml(gpu) {
  const utilization = normalizePercent(gpu.utilization);
  const memoryUtilization = normalizePercent(gpu.memoryUtilization);
  const utilizationClass = occupancyClass(gpu.utilization);
  const memoryUtilizationClass = occupancyClass(gpu.memoryUtilization);
  // Prefer absolute MiB figures; fall back to the percentage, then to "-".
  const memory = gpu.memoryTotalMiB
    ? `${gpu.memoryUsedMiB || 0}/${gpu.memoryTotalMiB} MiB`
    : gpu.memoryUtilization !== null && gpu.memoryUtilization !== undefined
      ? `${gpu.memoryUtilization}%`
      : "-";
  // Units are appended only when a reading exists, so a missing value shows as
  // "-" rather than "-W".
  const temperature = gpu.temperatureC == null ? "-" : `${gpu.temperatureC}°C`;
  const power = gpu.powerW == null ? "-" : `${gpu.powerW}W`;
  return `
    <div class="gpu-row">
      <div class="gpu-row-head">
        <strong>卡 #${escapeHtml(gpu.index)}${gpu.model ? ` · ${escapeHtml(gpu.model)}` : ""}</strong>
        <span>${gpuStateLabel(gpu.state)}</span>
      </div>
      <div class="bar-stack">
        <div class="metric-line">
          <span>显存</span>
          <div class="bar memory"><i class="${memoryUtilizationClass}" style="width:${memoryUtilization}%"></i></div>
          <strong>${formatPercent(gpu.memoryUtilization)}</strong>
        </div>
        <div class="metric-line">
          <span>算力</span>
          <div class="bar compute"><i class="${utilizationClass}" style="width:${utilization}%"></i></div>
          <strong>${formatPercent(gpu.utilization)}</strong>
        </div>
      </div>
      <div class="gpu-metrics">
        <span>显存 ${escapeHtml(memory)}</span>
        <span>温度 ${escapeHtml(temperature)}</span>
        <span>功耗 ${escapeHtml(power)}</span>
      </div>
    </div>`;
}
/**
 * Coerces a value to a number clamped to the range 0..100.
 *
 * @param {*} value - raw percentage; anything that does not convert to a finite
 *   number yields 0.
 * @returns {number} the clamped percentage.
 */
function normalizePercent(value) {
  const parsed = Number(value);
  return Number.isFinite(parsed) ? Math.min(Math.max(parsed, 0), 100) : 0;
}
/**
 * Formats a percentage for display.
 *
 * @param {*} value - raw percentage.
 * @returns {string} "-" for null, undefined, "" or non-numeric input; otherwise
 *   the number followed by "%", with one decimal place unless it is an integer.
 */
function formatPercent(value) {
  const isMissing = value === null || value === undefined || value === "";
  const parsed = isMissing ? Number.NaN : Number(value);
  if (!Number.isFinite(parsed)) {
    return "-";
  }
  const digits = Number.isInteger(parsed) ? String(parsed) : parsed.toFixed(1);
  return `${digits}%`;
}
/**
 * Maps a percentage to one of the four occupancy colour classes.
 *
 * @param {*} value - raw percentage; it is normalized to 0..100 first.
 * @returns {string} "level-critical" (>= 80), "level-warning" (>= 40),
 *   "level-low" (>= 10) or "level-free".
 */
function occupancyClass(value) {
  const percent = normalizePercent(value);
  // Ordered from the highest threshold down; the first match wins.
  const tiers = [
    [80, "level-critical"],
    [40, "level-warning"],
    [10, "level-low"]
  ];
  const tier = tiers.find(([threshold]) => percent >= threshold);
  return tier ? tier[1] : "level-free";
}
/**
 * Occupancy class of a single card, driven by whichever is higher: its memory
 * utilization or its compute utilization.
 *
 * @param {object} gpu - per-card status entry.
 * @returns {string} one of the "level-*" classes.
 */
function gpuOccupancyClass(gpu) {
  const memory = normalizePercent(gpu.memoryUtilization);
  const compute = normalizePercent(gpu.utilization);
  return occupancyClass(memory > compute ? memory : compute);
}
/**
 * Occupancy class of a whole server: the class of the highest memory or compute
 * utilization found on any of its cards.
 *
 * @param {object} server - server record as returned by the API.
 * @returns {string} one of the "level-*" classes; "level-free" when the server
 *   has no per-card data.
 */
function serverOccupancyClass(server) {
  const gpus = server.status?.gpus || [];
  if (!gpus.length) {
    return "level-free";
  }
  let peak = 0;
  for (const gpu of gpus) {
    peak = Math.max(peak, normalizePercent(gpu.memoryUtilization), normalizePercent(gpu.utilization));
  }
  return occupancyClass(peak);
}
/**
 * Returns the servers that match the status filter, the group filter and the
 * search query all at once.
 *
 * The query is matched as a substring of the lower-cased name, host, user,
 * group, model summary and tags.
 *
 * @returns {object[]} matching server records, in their original order.
 */
function filteredServers() {
  const { filter, groupFilter, query } = state;
  return state.servers.filter((server) => {
    if (filter !== "all" && filter !== getServerKind(server)) {
      return false;
    }
    const group = serverGroup(server);
    if (groupFilter !== "all" && group !== groupFilter) {
      return false;
    }
    if (!query) {
      return true;
    }
    const searchable = [server.name, server.host, server.user, group, modelSummary(server), ...(server.tags || [])];
    return searchable.join(" ").toLowerCase().includes(query);
  });
}
/**
 * Returns the display group of a server.
 *
 * @param {object} server - server record.
 * @returns {string} the trimmed group name, or "未分组" when it is missing or blank.
 */
function serverGroup(server) {
  const fallback = "未分组";
  const label = String(server.group || fallback).trim();
  return label === "" ? fallback : label;
}
/**
 * Classifies a server for filtering and colouring.
 *
 * @param {object} server - server record.
 * @returns {string} "offline" or "pending" when the status reports that state;
 *   otherwise "busy" if at least one card is busy, else "free".
 */
function getServerKind(server) {
  const { state: reported, busyCount } = server.status || {};
  if (reported === "offline" || reported === "pending") {
    return reported;
  }
  return (busyCount || 0) > 0 ? "busy" : "free";
}
/**
 * Returns the display label for a server kind.
 *
 * @param {string} kind - "free", "busy", "offline" or "pending".
 * @returns {string} the matching label, or "未知" for any other value.
 */
function kindLabel(kind) {
  const labels = { free: "空闲", busy: "占用", offline: "离线", pending: "刷新中" };
  // Own-property check so inherited keys such as "constructor" or "toString"
  // fall back to the unknown label instead of returning a function.
  return Object.hasOwn(labels, kind) ? labels[kind] : "未知";
}
/**
 * Returns the display label for the state of a single card.
 *
 * @param {string} kind - "free", "busy", "offline" or "unknown".
 * @returns {string} the matching label, or "未知" for any other value.
 */
function gpuStateLabel(kind) {
  const labels = { free: "空闲", busy: "占用", offline: "离线", unknown: "未知" };
  // Own-property check so inherited keys such as "constructor" or "toString"
  // fall back to the unknown label instead of returning a function.
  return Object.hasOwn(labels, kind) ? labels[kind] : "未知";
}
/**
 * Returns the accelerator family label for a collection command.
 *
 * @param {string} command - configured collection command.
 * @returns {string} "NVIDIA GPU" for "nvidia-smi", "海光 DCU" for anything else.
 */
function commandLabel(command) {
  if (command === "nvidia-smi") {
    return "NVIDIA GPU";
  }
  return "海光 DCU";
}
/**
 * Summarizes the card models of a server for display.
 *
 * Uses status.models when it is non-empty, otherwise the model of each card
 * entry that has one. Duplicates are removed.
 *
 * @param {object} server - server record.
 * @returns {string} the single model, several models joined with " / ", or
 *   "型号识别中" when no model is known yet.
 */
function modelSummary(server) {
  const status = server.status;
  let names = status?.models;
  if (!names?.length) {
    names = (status?.gpus || []).map((gpu) => gpu.model).filter(Boolean);
  }
  const distinct = Array.from(new Set(names));
  if (distinct.length === 0) {
    return "型号识别中";
  }
  if (distinct.length === 1) {
    return distinct[0];
  }
  return distinct.join(" / ");
}
/**
 * Opens the server dialog, pre-filled for editing when a server is given and
 * with defaults (root, port 22, hy-smi) when adding a new one.
 *
 * @param {object} [server] - server to edit; omit to add a new server.
 */
function openDialog(server) {
  const isEditing = Boolean(server);
  els.dialogTitle.textContent = isEditing ? "编辑服务器" : "添加服务器";
  // Deleting only makes sense for an existing server.
  els.deleteBtn.classList.toggle("hidden", !isEditing);

  const initialValues = {
    id: server?.id || "",
    name: server?.name || "",
    host: server?.host || "",
    user: server?.user || "root",
    port: server?.port || 22,
    command: server?.command || "hy-smi",
    group: server?.group || "",
    tags: (server?.tags || []).join(", ")
  };
  for (const [field, value] of Object.entries(initialValues)) {
    els.fields[field].value = value;
  }

  els.dialog.showModal();
  els.fields.name.focus();
}
/**
 * Submit handler of the server dialog. Creates the server (POST) when the hidden
 * id field is empty, otherwise updates it (PATCH), then reloads the list with
 * the saved server selected. Failures are shown in a toast.
 *
 * @param {Event} event - the form's submit event; its default action is cancelled.
 */
async function saveServer(event) {
  event.preventDefault();
  const { fields } = els;
  const id = fields.id.value;
  const body = {
    name: fields.name.value,
    host: fields.host.value,
    user: fields.user.value,
    port: Number(fields.port.value || 22),
    command: fields.command.value || "hy-smi",
    group: fields.group.value,
    // Sent as the raw text the user typed.
    tags: fields.tags.value
  };

  let endpoint = "/api/servers";
  let method = "POST";
  if (id) {
    endpoint = `/api/servers/${encodeURIComponent(id)}`;
    method = "PATCH";
  }

  try {
    const payload = await requestJson(endpoint, {
      method,
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body)
    });
    state.selectedId = payload.server.id;
    els.dialog.close();
    await loadServers();
    showToast("已保存");
  } catch (error) {
    showToast(error.message);
  }
}
/**
 * Deletes the server currently open in the dialog, clears the selection and
 * reloads the list. Does nothing when the dialog holds no server id. Failures
 * are shown in a toast.
 */
async function deleteSelectedServer() {
  const serverId = els.fields.id.value;
  if (!serverId) {
    return;
  }
  const endpoint = `/api/servers/${encodeURIComponent(serverId)}`;
  try {
    await requestJson(endpoint, { method: "DELETE" });
    state.selectedId = null;
    els.dialog.close();
    await loadServers();
    showToast("已删除");
  } catch (error) {
    showToast(error.message);
  }
}
/**
 * Performs a fetch and returns the parsed JSON body.
 *
 * @param {string} url - request URL.
 * @param {object} [options] - options passed straight to fetch.
 * @returns {Promise<*>} the parsed body; an empty object when it is not valid JSON.
 * @throws {Error} on a non-OK response, using the body's "error" field as the
 *   message when present, otherwise "请求失败 <status>".
 */
async function requestJson(url, options) {
  const response = await fetch(url, options);
  let payload;
  try {
    payload = await response.json();
  } catch {
    // A body that is not JSON is treated as an empty payload.
    payload = {};
  }
  if (response.ok) {
    return payload;
  }
  throw new Error(payload.error || `请求失败 ${response.status}`);
}
/**
 * Shows a short message in the toast element and hides it again after 2.6 seconds.
 * A new message restarts the countdown.
 *
 * @param {string} message - text to display.
 */
function showToast(message) {
  const { toast } = els;
  toast.textContent = message;
  toast.classList.remove("hidden");
  // Cancel the hide timer of any toast that is still showing.
  window.clearTimeout(els.toastTimer);
  els.toastTimer = window.setTimeout(() => {
    toast.classList.add("hidden");
  }, 2600);
}
/**
 * Sets the text content of the first element matching a selector.
 *
 * @param {string} selector - CSS selector of the target element.
 * @param {*} value - value to display.
 */
function setText(selector, value) {
  const target = document.querySelector(selector);
  target.textContent = value;
}
/**
 * Formats a timestamp as a 24-hour zh-CN time-of-day string.
 *
 * @param {*} value - anything accepted by the Date constructor.
 * @returns {string} the localized time string.
 */
function formatTime(value) {
  const moment = new Date(value);
  const options = { hour12: false };
  return moment.toLocaleTimeString("zh-CN", options);
}
/**
 * Escapes a value for safe insertion into HTML text or a quoted attribute.
 *
 * @param {*} value - value to escape; null and undefined become the empty string.
 * @returns {string} the text with &, <, >, " and ' replaced by entities.
 */
function escapeHtml(value) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#039;"
  };
  // Single pass: each special character is replaced exactly once.
  return String(value ?? "").replace(/[&<>"']/g, (character) => entities[character]);
}
<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>GPU/DCU 资源看板</title>
<link rel="stylesheet" href="/styles.css" />
</head>
<body>
<div class="shell">
<aside class="sidebar">
<div class="brand">
<div class="brand-mark">G</div>
<div>
<h1>GPU/DCU 资源看板</h1>
<p id="lastRefresh">等待刷新</p>
</div>
</div>
<div class="filters" role="tablist" aria-label="服务器筛选">
<button class="filter active" data-filter="all" type="button">
<span>全部</span><strong id="countAll">0</strong>
</button>
<button class="filter" data-filter="free" type="button">
<span>空闲</span><strong id="countFree">0</strong>
</button>
<button class="filter" data-filter="busy" type="button">
<span>占用</span><strong id="countBusy">0</strong>
</button>
<button class="filter" data-filter="offline" type="button">
<span>离线</span><strong id="countOffline">0</strong>
</button>
</div>
<div class="sidebar-actions">
<button class="primary-action" id="addServerBtn" type="button">
<span aria-hidden="true"></span>添加服务器
</button>
<button class="ghost-action" id="refreshBtn" type="button">
<span aria-hidden="true"></span>手动刷新
</button>
</div>
</aside>
<main class="content">
<section class="topline">
<div>
<p class="eyebrow">共享测试资源</p>
<h2>服务器占用情况</h2>
</div>
<div class="search-wrap">
<span aria-hidden="true"></span>
<input id="searchInput" type="search" placeholder="搜索服务器、IP、标签" />
</div>
</section>
<section class="stats" aria-label="资源统计">
<div class="stat">
<span>服务器</span>
<strong id="statServers">0</strong>
</div>
<div class="stat">
<span>总卡数</span>
<strong id="statCards">0</strong>
</div>
<div class="stat good">
<span>空闲卡</span>
<strong id="statFreeCards">0</strong>
</div>
<div class="stat warn">
<span>占用卡</span>
<strong id="statBusyCards">0</strong>
</div>
</section>
<section class="group-panel" id="groupFilters" aria-label="服务器分组"></section>
<section class="server-grid" id="serverGrid" aria-live="polite"></section>
<section class="empty-state hidden" id="emptyState">
<div class="empty-icon"></div>
<h3>添加第一台服务器</h3>
<p>配置 SSH 登录信息后,看板会定时采集显存和算力占用。</p>
<button class="primary-action compact" id="emptyAddBtn" type="button">添加服务器</button>
</section>
</main>
<aside class="detail" id="detailPanel">
<div class="detail-empty">
<div class="detail-pulse"></div>
<h3>选择一台服务器</h3>
<p>查看每张加速卡的占用、显存、温度和连接状态。</p>
</div>
</aside>
</div>
<dialog id="serverDialog" class="server-dialog">
<form id="serverForm" method="dialog">
<div class="dialog-head">
<div>
<p class="eyebrow">服务器配置</p>
<h3 id="dialogTitle">添加服务器</h3>
</div>
<button class="icon-button" id="closeDialogBtn" type="button" aria-label="关闭">×</button>
</div>
<input id="serverId" type="hidden" />
<label>
<span>名称</span>
<input id="serverName" autocomplete="off" required placeholder="例如:算法公共机 A" />
</label>
<label>
<span>Host / IP</span>
<input id="serverHost" autocomplete="off" required placeholder="10.0.0.12" />
</label>
<div class="field-row">
<label>
<span>SSH 用户</span>
<input id="serverUser" autocomplete="username" value="root" placeholder="root" />
</label>
<label>
<span>端口</span>
<input id="serverPort" type="number" min="1" max="65535" value="22" />
</label>
</div>
<label>
<span>分组</span>
<input id="serverGroup" list="groupOptions" autocomplete="off" placeholder="通信中兴组" />
<datalist id="groupOptions"></datalist>
</label>
<label>
<span>标签</span>
<input id="serverTags" placeholder="公共池, 回归" />
</label>
<label>
<span>采集命令</span>
<select id="serverCommand">
<option value="hy-smi">hy-smi(海光 DCU)</option>
<option value="nvidia-smi">nvidia-smi(NVIDIA GPU)</option>
</select>
</label>
<div class="dialog-actions">
<button class="ghost-action compact" id="deleteServerBtn" type="button">删除</button>
<button class="primary-action compact" type="submit">保存</button>
</div>
</form>
</dialog>
<div class="toast hidden" id="toast"></div>
<script src="/app.js"></script>
</body>
</html>
:root {
color-scheme: light;
--bg: #eef3f6;
--panel: #ffffff;
--panel-soft: #f7fafb;
--text: #17212b;
--muted: #687684;
--line: #dbe4e9;
--teal: #0f9f9a;
--teal-dark: #087873;
--green: #19a35b;
--amber: #d28411;
--red: #d84949;
--blue: #2c75d6;
--shadow: 0 20px 60px rgba(30, 51, 61, 0.12);
font-family: Inter, "Segoe UI", "Microsoft YaHei", system-ui, sans-serif;
}
* {
box-sizing: border-box;
}
html,
body {
width: 100%;
height: 100%;
}
body {
margin: 0;
background:
linear-gradient(135deg, rgba(15, 159, 154, 0.08), transparent 34%),
linear-gradient(315deg, rgba(44, 117, 214, 0.08), transparent 32%),
var(--bg);
color: var(--text);
overflow: hidden;
}
button,
input,
select {
font: inherit;
}
button {
cursor: pointer;
}
.shell {
display: grid;
grid-template-columns: 252px minmax(560px, 1fr) 340px;
gap: 12px;
width: 100%;
height: 100dvh;
min-height: 0;
padding: 12px;
}
.sidebar,
.content,
.detail {
background: rgba(255, 255, 255, 0.86);
border: 1px solid rgba(219, 228, 233, 0.86);
box-shadow: var(--shadow);
backdrop-filter: blur(18px);
}
.sidebar {
display: flex;
flex-direction: column;
gap: 18px;
min-height: 0;
border-radius: 18px;
padding: 16px;
}
.brand {
display: flex;
align-items: center;
gap: 12px;
}
.brand-mark {
display: grid;
place-items: center;
width: 42px;
height: 42px;
border-radius: 12px;
background: var(--text);
color: white;
font-weight: 800;
}
h1,
h2,
h3,
p {
margin: 0;
}
h1 {
font-size: 18px;
}
h2 {
margin-top: 4px;
font-size: 30px;
letter-spacing: 0;
}
.brand p,
.detail-empty p,
.empty-state p {
margin-top: 4px;
color: var(--muted);
font-size: 13px;
}
.filters {
display: grid;
gap: 8px;
}
.filter {
display: flex;
align-items: center;
justify-content: space-between;
min-height: 44px;
border: 1px solid transparent;
border-radius: 10px;
background: transparent;
color: var(--muted);
padding: 0 12px;
}
.filter.active,
.filter:hover {
border-color: var(--line);
background: var(--panel-soft);
color: var(--text);
}
.filter strong {
color: inherit;
}
.sidebar-actions {
display: grid;
gap: 10px;
margin-top: auto;
}
.primary-action,
.ghost-action {
display: inline-flex;
align-items: center;
justify-content: center;
gap: 8px;
min-height: 44px;
border-radius: 10px;
border: 1px solid transparent;
padding: 0 14px;
font-weight: 700;
}
.primary-action {
background: var(--teal);
color: #fff;
}
.primary-action:hover {
background: var(--teal-dark);
}
.ghost-action {
background: var(--panel-soft);
border-color: var(--line);
color: var(--text);
}
.compact {
min-height: 38px;
}
.content {
display: flex;
flex-direction: column;
min-height: 0;
border-radius: 18px;
padding: 18px;
overflow: hidden;
}
.topline {
display: flex;
align-items: center;
justify-content: space-between;
gap: 18px;
flex: 0 0 auto;
}
.eyebrow {
color: var(--teal-dark);
font-size: 12px;
font-weight: 800;
}
.search-wrap {
display: flex;
align-items: center;
gap: 8px;
width: min(360px, 100%);
min-height: 42px;
border: 1px solid var(--line);
border-radius: 10px;
background: #fff;
padding: 0 12px;
color: var(--muted);
}
.search-wrap input {
width: 100%;
border: 0;
outline: 0;
background: transparent;
color: var(--text);
}
.stats {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: 10px;
margin: 14px 0;
flex: 0 0 auto;
}
.group-panel {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin: 0 0 12px;
flex: 0 0 auto;
}
.group-filter {
display: inline-flex;
align-items: center;
gap: 8px;
min-height: 34px;
border: 1px solid var(--line);
border-radius: 999px;
background: #fff;
color: var(--muted);
padding: 0 12px;
font-size: 13px;
font-weight: 800;
}
.group-filter.active,
.group-filter:hover {
border-color: rgba(15, 159, 154, 0.48);
background: #e9f7f5;
color: var(--teal-dark);
}
.group-filter strong {
min-width: 18px;
border-radius: 999px;
background: rgba(15, 159, 154, 0.12);
color: inherit;
padding: 2px 6px;
text-align: center;
font-size: 12px;
}
.stat {
min-height: 74px;
border: 1px solid var(--line);
border-radius: 12px;
background: var(--panel-soft);
padding: 12px;
}
.stat span {
color: var(--muted);
font-size: 13px;
}
.stat strong {
display: block;
margin-top: 4px;
font-size: 25px;
}
.stat.good strong {
color: var(--green);
}
.stat.warn strong {
color: var(--amber);
}
.server-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
gap: 12px;
align-items: start;
flex: 1 1 auto;
min-height: 0;
overflow-y: auto;
padding: 2px 4px 12px 2px;
}
.server-card {
min-height: 0;
min-width: 0;
border: 1px solid var(--line);
border-radius: 12px;
background: #fff;
padding: 13px;
transition: transform 0.16s ease, box-shadow 0.16s ease, border-color 0.16s ease;
}
.server-card:hover,
.server-card.selected {
transform: translateY(-2px);
border-color: rgba(15, 159, 154, 0.45);
box-shadow: 0 18px 40px rgba(30, 51, 61, 0.12);
}
.card-head {
display: flex;
align-items: start;
justify-content: space-between;
gap: 12px;
}
.server-name {
font-size: 17px;
font-weight: 800;
word-break: break-word;
}
.server-host {
margin-top: 4px;
color: var(--muted);
font-size: 12px;
}
.server-model {
margin-top: 4px;
color: var(--teal-dark);
font-size: 12px;
font-weight: 800;
overflow-wrap: anywhere;
}
.status-pill {
flex: 0 0 auto;
border-radius: 999px;
padding: 5px 10px;
color: #fff;
font-size: 12px;
font-weight: 800;
}
.status-pill.level-free,
.status-pill.level-low,
.status-pill.level-warning,
.status-pill.level-critical {
background: var(--level-color);
}
.status-free {
background: var(--green);
}
.status-busy {
background: var(--amber);
}
.status-offline {
background: var(--red);
}
.status-pending {
background: var(--blue);
}
.gpu-ring {
display: grid;
grid-template-columns: 72px 1fr;
align-items: center;
gap: 12px;
margin: 12px 0;
}
.donut {
position: relative;
display: grid;
place-items: center;
width: 72px;
height: 72px;
border-radius: 50%;
background: conic-gradient(var(--level-color, var(--green)) var(--busy, 0%), #dfe8ec 0);
}
.donut::before {
content: "";
position: absolute;
width: 50px;
height: 50px;
border-radius: 50%;
background: #fff;
}
.donut span {
position: relative;
z-index: 1;
font-weight: 900;
}
.summary strong {
display: block;
font-size: 21px;
}
.summary span {
color: var(--muted);
font-size: 12px;
}
.gpu-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(58px, 1fr));
gap: 6px;
}
.gpu-chip {
display: grid;
grid-template-rows: 16px 1fr;
gap: 4px;
min-height: 70px;
border: 1px solid #cfe7dd;
border-radius: 8px;
padding: 4px;
font-size: 10px;
font-weight: 800;
background: #e8f5ef;
color: #0c7040;
line-height: 1.2;
}
.gpu-chip b {
color: currentColor;
font-size: 11px;
font-weight: 800;
text-align: center;
}
.gpu-chip.offline,
.gpu-chip.unknown {
background: #edf2f5;
border-color: #d7e0e5;
color: #687684;
}
.gpu-chip.level-free {
background: #e8f5ef;
border-color: #bde4cf;
color: #0c7040;
}
.gpu-chip.level-low {
background: #f1f7dc;
border-color: #d8e899;
color: #617414;
}
.gpu-chip.level-warning {
background: #fff2da;
border-color: #f2cf93;
color: #9b5d08;
}
.gpu-chip.level-critical {
background: #fde8e8;
border-color: #f3b2b2;
color: #9f2f2f;
}
.chip-metrics {
display: grid;
grid-template-columns: repeat(2, minmax(24px, 1fr));
gap: 4px;
min-width: 0;
}
.chip-block {
position: relative;
display: grid;
align-content: center;
justify-items: center;
min-width: 0;
min-height: 44px;
overflow: hidden;
border-radius: 6px;
background: rgba(255, 255, 255, 0.78);
color: var(--text);
}
.chip-block i {
position: absolute;
inset: auto 0 0 0;
width: 100%;
height: var(--level);
opacity: 0.86;
}
.chip-block.memory i {
background: var(--level-color, #19a35b);
}
.chip-block.compute i {
background: var(--level-color, #19a35b);
}
.level-free {
--level-color: #19a35b;
}
.level-low {
--level-color: #a9c83d;
}
.level-warning {
--level-color: #e39a24;
}
.level-critical {
--level-color: #d84949;
}
.chip-block em,
.chip-block strong {
position: relative;
z-index: 1;
font-style: normal;
}
.chip-block em {
font-size: 9px;
color: rgba(23, 33, 43, 0.74);
}
.chip-block strong {
font-size: 10px;
color: var(--text);
}
.tag-list {
display: flex;
flex-wrap: wrap;
gap: 6px;
margin-top: 14px;
}
.tag {
border-radius: 999px;
background: #eef6f6;
color: var(--teal-dark);
padding: 4px 8px;
font-size: 12px;
font-weight: 700;
}
.detail {
min-height: 0;
border-radius: 18px;
padding: 16px;
overflow-y: auto;
}
.detail-empty {
display: grid;
place-items: center;
align-content: center;
min-height: calc(100dvh - 56px);
text-align: center;
}
.detail-pulse,
.empty-icon {
display: grid;
place-items: center;
width: 58px;
height: 58px;
border-radius: 16px;
margin-bottom: 14px;
background: #e7f6f4;
color: var(--teal-dark);
}
.detail-pulse::before {
content: "";
width: 24px;
height: 24px;
border-radius: 50%;
background: var(--teal);
}
.detail-head {
display: flex;
align-items: start;
justify-content: space-between;
gap: 12px;
margin-bottom: 16px;
}
.detail-head h3 {
font-size: 22px;
}
.icon-button {
display: grid;
place-items: center;
width: 36px;
height: 36px;
border: 1px solid var(--line);
border-radius: 10px;
background: #fff;
color: var(--text);
font-size: 20px;
}
.detail-meta {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 10px;
margin-bottom: 16px;
}
.meta-box,
.gpu-row {
border: 1px solid var(--line);
border-radius: 10px;
background: var(--panel-soft);
padding: 12px;
}
.meta-box span,
.gpu-row span {
color: var(--muted);
font-size: 12px;
}
.meta-box strong {
display: block;
margin-top: 4px;
overflow-wrap: anywhere;
}
.gpu-list {
display: grid;
gap: 10px;
}
.gpu-row {
background: #fff;
}
.gpu-row-head {
display: flex;
align-items: center;
justify-content: space-between;
gap: 10px;
margin-bottom: 9px;
}
.bar {
height: 9px;
border-radius: 999px;
background: #e4edf1;
overflow: hidden;
}
.bar > i {
display: block;
width: 0;
height: 100%;
border-radius: inherit;
background: var(--level-color, #19a35b);
}
.bar.memory > i {
background: var(--level-color, #19a35b);
}
.bar.compute > i {
background: var(--level-color, #19a35b);
}
.bar-stack {
display: grid;
gap: 8px;
}
.metric-line {
display: grid;
grid-template-columns: 34px minmax(0, 1fr) 44px;
align-items: center;
gap: 8px;
}
.metric-line strong {
color: var(--text);
font-size: 12px;
text-align: right;
}
.gpu-metrics {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 8px;
margin-top: 9px;
font-size: 12px;
}
.empty-state {
display: grid;
place-items: center;
min-height: 360px;
border: 1px dashed #b8c9d1;
border-radius: 14px;
background: rgba(255, 255, 255, 0.55);
text-align: center;
}
.empty-state .primary-action {
margin-top: 14px;
}
.hidden {
display: none !important;
}
.server-dialog {
width: min(540px, calc(100vw - 28px));
border: 1px solid var(--line);
border-radius: 16px;
box-shadow: var(--shadow);
padding: 0;
}
.server-dialog::backdrop {
background: rgba(23, 33, 43, 0.32);
backdrop-filter: blur(4px);
}
.server-dialog form {
display: grid;
gap: 14px;
padding: 20px;
}
.dialog-head,
.dialog-actions,
.field-row {
display: flex;
gap: 12px;
}
.dialog-head,
.dialog-actions {
align-items: center;
justify-content: space-between;
}
label {
display: grid;
gap: 6px;
width: 100%;
color: var(--muted);
font-size: 13px;
font-weight: 700;
}
input,
select {
width: 100%;
min-height: 40px;
border: 1px solid var(--line);
border-radius: 9px;
outline: 0;
background: #fff;
color: var(--text);
padding: 0 11px;
}
input:focus,
select:focus {
border-color: rgba(15, 159, 154, 0.72);
box-shadow: 0 0 0 3px rgba(15, 159, 154, 0.12);
}
.toast {
position: fixed;
right: 22px;
bottom: 22px;
max-width: min(420px, calc(100vw - 44px));
border-radius: 12px;
background: var(--text);
color: #fff;
padding: 12px 14px;
box-shadow: var(--shadow);
font-size: 14px;
}
/*
 * Up to 1500px wide: the page scrolls as a whole, the shell drops to two
 * columns, and the detail panel spans the full width below them.
 */
@media (max-width: 1500px) {
body {
overflow: auto;
}
.shell {
grid-template-columns: 240px minmax(560px, 1fr);
height: auto;
min-height: 100dvh;
}
.content {
min-height: calc(100dvh - 24px);
}
.detail {
grid-column: 1 / -1;
max-height: none;
}
.detail-empty {
min-height: 220px;
}
}
/*
 * Up to 900px wide: single-column shell, two-column filter/stat grids,
 * one server card per row, and header/field rows stacked vertically.
 */
@media (max-width: 900px) {
body {
overflow: auto;
}
.shell {
grid-template-columns: 1fr;
height: auto;
padding: 10px;
}
.content {
min-height: auto;
}
.sidebar {
gap: 14px;
}
.filters,
.stats {
grid-template-columns: repeat(2, minmax(0, 1fr));
}
.server-grid {
grid-template-columns: minmax(0, 1fr);
overflow: visible;
}
.topline,
.field-row {
align-items: stretch;
flex-direction: column;
}
h2 {
font-size: 24px;
}
}
/* Up to 480px wide: GPU tiles are laid out two per row. */
@media (max-width: 480px) {
.gpu-grid {
grid-template-columns: repeat(2, minmax(0, 1fr));
}
}
const http = require("node:http");
const fs = require("node:fs");
const path = require("node:path");
const { spawn } = require("node:child_process");
const { randomUUID } = require("node:crypto");
/**
 * Reads an integer setting from the environment, falling back to a default.
 *
 * A plain `Number(env || fallback)` yields NaN for non-numeric input; a NaN
 * or out-of-range delay makes Node timers fire every 1 ms, which would
 * hammer every server with SSH probes. Invalid values are therefore ignored
 * with a warning instead of being passed through.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or invalid.
 * @param {number} min - Smallest accepted value (inclusive).
 * @param {number} max - Largest accepted value (inclusive).
 * @returns {number} The parsed integer or `fallback`.
 */
function readIntEnv(name, fallback, min, max) {
  const raw = process.env[name];
  if (raw === undefined || String(raw).trim() === "") return fallback;
  const parsed = Number(raw);
  if (!Number.isInteger(parsed) || parsed < min || parsed > max) {
    console.warn(`Ignoring invalid ${name}=${JSON.stringify(raw)}; using ${fallback}`);
    return fallback;
  }
  return parsed;
}
// 2147483647 is the largest delay Node timers accept before clamping to 1 ms.
const PORT = readIntEnv("PORT", 3066, 0, 65535);
const POLL_INTERVAL_MS = readIntEnv("POLL_INTERVAL_MS", 10000, 1, 2147483647);
const SSH_TIMEOUT_MS = readIntEnv("SSH_TIMEOUT_MS", 8000, 1, 2147483647);
// Directory containing this script; all other paths are derived from it.
const ROOT = __dirname;
// Folder and JSON file where the server list is stored.
const DATA_DIR = path.join(ROOT, "data");
const CONFIG_PATH = path.join(DATA_DIR, "servers.json");
// Folder whose files are served to the browser.
const PUBLIC_DIR = path.join(ROOT, "public");
// Content-Type by file extension; extensions not listed are served as application/octet-stream.
const MIME_TYPES = {
".html": "text/html; charset=utf-8",
".css": "text/css; charset=utf-8",
".js": "application/javascript; charset=utf-8",
".json": "application/json; charset=utf-8",
".svg": "image/svg+xml; charset=utf-8",
".png": "image/png",
".ico": "image/x-icon"
};
// Latest status object per server id, written by refreshServer.
let statusCache = new Map();
// ISO timestamp of the last completed refreshAll pass, or null before the first one.
let lastRefresh = null;
// Promise of the refreshAll pass currently running, or null when idle.
let refreshInFlight = null;
// Whether the running pass was started with includeModels.
let refreshInFlightIncludesModels = false;
/**
 * Makes sure the data directory and the server config file exist.
 * A missing config file is created holding an empty JSON array; an existing
 * one is left untouched.
 */
function ensureDataFile() {
  const hasDataDir = fs.existsSync(DATA_DIR);
  if (!hasDataDir) {
    fs.mkdirSync(DATA_DIR, { recursive: true });
  }
  if (fs.existsSync(CONFIG_PATH)) return;
  fs.writeFileSync(CONFIG_PATH, "[]\n", "utf8");
}
/**
 * Loads the configured servers from disk.
 *
 * @returns {object[]} Normalized server records. Entries that fail
 *   normalization are dropped; an unreadable, malformed, or non-array config
 *   yields an empty list (read/parse errors are logged).
 */
function loadServers() {
  ensureDataFile();
  try {
    const text = fs.readFileSync(CONFIG_PATH, "utf8");
    const entries = JSON.parse(text);
    if (!Array.isArray(entries)) return [];
    const servers = [];
    for (const entry of entries) {
      const normalized = normalizeServer(entry);
      if (normalized) servers.push(normalized);
    }
    return servers;
  } catch (error) {
    console.error("Failed to read server config:", error.message);
    return [];
  }
}
/**
 * Writes the server list to the config file.
 * The JSON is first written to a sibling ".tmp" file and then renamed over
 * the real file, so a half-written config is never left in place.
 *
 * @param {object[]} servers - Server records to store.
 */
function saveServers(servers) {
  ensureDataFile();
  const stagingPath = `${CONFIG_PATH}.tmp`;
  const serialized = `${JSON.stringify(servers, null, 2)}\n`;
  fs.writeFileSync(stagingPath, serialized, "utf8");
  fs.renameSync(stagingPath, CONFIG_PATH);
}
/**
 * Turns arbitrary input (API body or stored record) into a canonical server
 * record.
 *
 * @param {*} input - Candidate server object.
 * @returns {object|null} The normalized record, or null when the input is
 *   not an object or has no host. The name defaults to the host, the user to
 *   "root", the port to 22, and the group falls back from `group` to `team`.
 *   Tags may be an array or a comma-separated string and are capped at six.
 *   A fresh UUID is assigned when no id is supplied.
 */
function normalizeServer(input) {
  if (!input || typeof input !== "object") return null;
  const host = String(input.host || "").trim();
  if (host === "") return null;

  const name = String(input.name || host).trim();
  const user = String(input.user || "root").trim();
  const command = normalizeCommand(input.command);
  const group = normalizeGroup(input.group || input.team);

  const rawTags = Array.isArray(input.tags)
    ? input.tags.map((tag) => String(tag))
    : String(input.tags || "").split(",");
  const tags = rawTags
    .map((tag) => tag.trim())
    .filter(Boolean)
    .slice(0, 6);

  const id = String(input.id || randomUUID());
  const port = clampInt(input.port, 1, 65535, 22);
  const gpuCount = optionalInt(input.gpuCount, 0, 32);
  const models = normalizeModels(input.models);
  const gpuModels = normalizeGpuModels(input.gpuModels);

  return { id, name, host, port, user, gpuCount, command, group, tags, models, gpuModels };
}
/**
 * Canonicalizes a group name: whitespace runs become single spaces and the
 * ends are trimmed.
 *
 * @param {*} group - Raw group value.
 * @returns {string} The cleaned name, or "未分组" when it is empty.
 */
function normalizeGroup(group) {
  const words = String(group || "")
    .split(/\s+/)
    .filter(Boolean);
  return words.length > 0 ? words.join(" ") : "未分组";
}
/**
 * Cleans a list of model names.
 *
 * @param {*} models - Candidate list.
 * @returns {string[]} Up to 32 normalized names; names that normalize to
 *   nothing are dropped, and non-array input yields an empty list.
 */
function normalizeModels(models) {
  if (!Array.isArray(models)) return [];
  const cleaned = [];
  for (const entry of models) {
    const name = normalizeModelName(entry);
    if (name) cleaned.push(name);
  }
  return cleaned.slice(0, 32);
}
/**
 * Cleans the per-GPU model list stored with a server.
 *
 * @param {*} gpuModels - Candidate list of `{ index, model, vendor }`.
 * @returns {{index: number, model: (string|null), vendor: (string|null)}[]}
 *   Up to 32 entries. Non-object entries and entries with neither a model
 *   nor a vendor are dropped; the index is forced into 0..31.
 */
function normalizeGpuModels(gpuModels) {
  if (!Array.isArray(gpuModels)) return [];
  const cleaned = [];
  for (const entry of gpuModels) {
    if (!entry || typeof entry !== "object") continue;
    const index = optionalInt(entry.index, 0, 31);
    const model = normalizeModelName(entry.model);
    const vendor = normalizeModelName(entry.vendor);
    if (!model && !vendor) continue;
    cleaned.push({ index, model, vendor });
  }
  return cleaned.slice(0, 32);
}
/**
 * Parses a base-10 integer and forces it into a range.
 *
 * @param {*} value - Value to parse (leading digits are accepted).
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @param {*} fallback - Returned when no finite integer can be parsed.
 * @returns {*} The clamped integer, or `fallback`.
 */
function clampInt(value, min, max, fallback) {
  const parsed = Number.parseInt(value, 10);
  if (!Number.isFinite(parsed)) return fallback;
  const raised = Math.max(min, parsed);
  return Math.min(max, raised);
}
/**
 * Like clampInt, but treats a missing value as zero.
 *
 * @param {*} value - Value to parse.
 * @param {number} min - Lower bound applied to supplied values.
 * @param {number} max - Upper bound applied to supplied values.
 * @returns {number} 0 for undefined, null, "" or unparsable input;
 *   otherwise the clamped integer.
 */
function optionalInt(value, min, max) {
  const isBlank = value === undefined || value === null || value === "";
  return isBlank ? 0 : clampInt(value, min, max, 0);
}
/**
 * Builds the API view of a server: its record plus a `status` field.
 *
 * @param {object} server - Normalized server record.
 * @returns {object} The record with the cached status, or a pending
 *   placeholder when the server has not been probed yet.
 */
function publicServer(server) {
  const status = statusCache.get(server.id) || createPendingStatus(server);
  return { ...server, status };
}
/**
 * Builds the placeholder status shown before a server has been probed.
 *
 * One "unknown" GPU slot is created per configured GPU, with saved model
 * information applied. Every slot is counted as free.
 *
 * @param {object} server - Normalized server record.
 * @returns {object} A status object in the "pending" state.
 */
function createPendingStatus(server) {
  const totalCount = server.gpuCount || 0;
  const placeholders = [];
  for (let index = 0; index < totalCount; index += 1) {
    placeholders.push({
      index,
      state: "unknown",
      utilization: null,
      memoryUtilization: null,
      memoryUsedMiB: null,
      memoryTotalMiB: null,
      temperatureC: null,
      powerW: null,
      raw: ""
    });
  }
  const gpus = applySavedModels(placeholders, server);
  // Prefer the stored model list; otherwise derive it from the GPU slots.
  const hasSavedModels = Boolean(server.models && server.models.length);
  const models = hasSavedModels ? server.models : collectModels(gpus);
  return {
    state: "pending",
    summary: "等待刷新",
    updatedAt: null,
    latencyMs: null,
    busyCount: 0,
    freeCount: totalCount,
    totalCount,
    models,
    gpus,
    error: null
  };
}
/**
 * Probes every configured server, sharing a single pass between concurrent
 * callers.
 *
 * A caller that does not need model detection joins whatever pass is
 * running. A caller that needs it joins a running pass that already includes
 * models; otherwise it waits for the running pass to finish and then
 * re-checks. The re-check matters: several waiters can wake up together,
 * and only the first may start a new pass — the others must join it rather
 * than launch duplicate SSH probes.
 *
 * @param {{includeModels?: boolean}} [options] - Set `includeModels` to also
 *   detect card models.
 * @returns {Promise<object[]>} Per-server results from refreshServer.
 */
async function refreshAll(options = {}) {
  const includeModels = Boolean(options.includeModels);
  while (refreshInFlight) {
    if (!includeModels || refreshInFlightIncludesModels) return refreshInFlight;
    await refreshInFlight;
  }
  const servers = loadServers();
  const run = Promise.all(servers.map((server) => refreshServer(server, { includeModels })))
    .then((results) => {
      lastRefresh = new Date().toISOString();
      return results;
    })
    .finally(() => {
      // Only clear the shared state if it still describes this pass.
      if (refreshInFlight === run) {
        refreshInFlight = null;
        refreshInFlightIncludesModels = false;
      }
    });
  refreshInFlightIncludesModels = includeModels;
  refreshInFlight = run;
  return run;
}
/**
 * Probes one server over SSH and stores the resulting status in statusCache.
 *
 * On success the status is "online" with per-GPU metrics. On a probe or
 * parse failure it is "offline" with placeholder GPUs and the error message.
 * Model detection (when requested) is best-effort: a failure is logged and
 * the saved model information is kept.
 *
 * Persisting the detected GPU count and models is also best-effort and runs
 * outside the probe's error handling, so a config write failure is logged
 * instead of flipping a reachable server to "连接失败".
 *
 * @param {object} server - Normalized server record.
 * @param {{includeModels?: boolean}} [options] - Set `includeModels` to also
 *   run the model-detection command.
 * @returns {Promise<{id: string, ok: boolean, error?: string}>} Outcome of
 *   the probe; this function does not reject for probe failures.
 */
async function refreshServer(server, options = {}) {
  const includeModels = Boolean(options.includeModels);
  const started = Date.now();
  let gpus;
  let models;
  let totalCount;
  try {
    const output = await runProbeCommand(server);
    const parsed = parseProbeOutput(output.stdout, server.gpuCount, server.command);
    gpus = applySavedModels(parsed.gpus, server);
    if (includeModels) {
      try {
        const modelOutput = await runProbeCommand(server, buildModelCommand(server.command));
        const modelByIndex = parseModelOutput(modelOutput.stdout, server.command);
        gpus = mergeGpuModels(gpus, modelByIndex);
      } catch (modelError) {
        console.warn(`Model detection failed for ${server.host}: ${modelError.message}`);
      }
    }
    models = collectModels(gpus);
    totalCount = Math.max(parsed.totalCount, gpus.length);
    const busyCount = gpus.filter((gpu) => gpu.state === "busy").length;
    statusCache.set(server.id, {
      state: "online",
      summary: busyCount > 0 ? `${busyCount}/${totalCount} 占用` : "全部空闲",
      updatedAt: new Date().toISOString(),
      latencyMs: Date.now() - started,
      busyCount,
      freeCount: Math.max(totalCount - busyCount, 0),
      totalCount,
      models,
      gpus,
      error: null
    });
  } catch (error) {
    const pending = createPendingStatus(server);
    statusCache.set(server.id, {
      state: "offline",
      summary: "连接失败",
      updatedAt: new Date().toISOString(),
      latencyMs: Date.now() - started,
      busyCount: 0,
      freeCount: 0,
      totalCount: server.gpuCount,
      models: pending.models,
      gpus: pending.gpus,
      error: error.message
    });
    return { id: server.id, ok: false, error: error.message };
  }
  if (server.gpuCount !== totalCount || includeModels) {
    try {
      persistDetectedServerInfo(server.id, {
        gpuCount: totalCount,
        models: includeModels ? models : undefined,
        gpuModels: includeModels ? extractGpuModels(gpus) : undefined
      });
    } catch (persistError) {
      console.warn(`Failed to persist detected info for ${server.host}: ${persistError.message}`);
    }
  }
  return { id: server.id, ok: true };
}
/**
 * Runs a command on a server through the local ssh client.
 *
 * ssh is started in batch mode (no password prompts), accepts new host keys
 * automatically, and is killed once SSH_TIMEOUT_MS elapses.
 *
 * The host and user come from the unauthenticated web API. A target that
 * starts with "-" would be parsed by ssh as an option (for example
 * "-oProxyCommand=..."), which would run arbitrary commands on the machine
 * hosting this dashboard — such targets are rejected before ssh is spawned.
 *
 * @param {object} server - Record with `host`, `port`, and optional `user`.
 * @param {string} [remoteCommand] - Command to run remotely; defaults to the
 *   metrics probe for the server's tool.
 * @returns {Promise<{stdout: string, stderr: string}>} Output on exit code 0.
 *   Rejects with an Error for an invalid target, a timeout, a spawn failure,
 *   or a non-zero exit code.
 */
function runProbeCommand(server, remoteCommand = buildRemoteCommand(server.command)) {
  const target = server.user ? `${server.user}@${server.host}` : server.host;
  if (String(target).startsWith("-")) {
    return Promise.reject(new Error("无效的 SSH 目标: 主机或用户名不能以 - 开头"));
  }
  const sshPath = process.env.SSH_PATH || (process.platform === "win32" ? "C:\\Windows\\System32\\OpenSSH\\ssh.exe" : "ssh");
  const args = [
    "-p",
    String(server.port),
    "-o",
    "BatchMode=yes",
    "-o",
    `ConnectTimeout=${Math.ceil(SSH_TIMEOUT_MS / 1000)}`,
    "-o",
    "StrictHostKeyChecking=accept-new",
    target,
    remoteCommand
  ];
  return new Promise((resolve, reject) => {
    const child = spawn(sshPath, args, { windowsHide: true });
    let stdout = "";
    let stderr = "";
    let settled = false;
    const timer = setTimeout(() => {
      if (settled) return;
      settled = true;
      child.kill();
      reject(new Error(`SSH 超时 (${Math.ceil(SSH_TIMEOUT_MS / 1000)}s)`));
    }, SSH_TIMEOUT_MS);
    // Decode at stream level so a multi-byte character split across two
    // chunks is not turned into replacement characters.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", (error) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      reject(new Error(`无法启动 ssh: ${error.message}`));
    });
    child.on("close", (code) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      if (code === 0) {
        resolve({ stdout, stderr });
      } else {
        reject(new Error(cleanError(stderr || stdout || `ssh exit ${code}`)));
      }
    });
  });
}
/**
 * Maps a requested probe tool onto one of the two supported tools.
 *
 * @param {*} command - Requested tool name.
 * @returns {string} "nvidia-smi" when exactly that was requested (after
 *   trimming); "hy-smi" for anything else.
 */
function normalizeCommand(command) {
  const requested = String(command || "hy-smi").trim();
  if (requested === "nvidia-smi") return "nvidia-smi";
  return "hy-smi";
}
/**
 * Returns the remote command line that collects GPU metrics.
 *
 * @param {string} command - Normalized tool name.
 * @returns {string} An nvidia-smi CSV query (index, GPU and memory
 *   utilization, memory used/total, temperature, power draw) for
 *   "nvidia-smi"; plain "hy-smi" otherwise.
 */
function buildRemoteCommand(command) {
  if (command !== "nvidia-smi") return "hy-smi";
  const query = "--query-gpu=index,utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu,power.draw";
  const format = "--format=csv,noheader,nounits";
  return `nvidia-smi ${query} ${format}`;
}
/**
 * Returns the remote command line that reports card models.
 *
 * @param {string} command - Normalized tool name.
 * @returns {string} An nvidia-smi CSV query for index and name for
 *   "nvidia-smi"; "hy-smi --showproductname" otherwise.
 */
function buildModelCommand(command) {
  if (command !== "nvidia-smi") return "hy-smi --showproductname";
  const query = "--query-gpu=index,name";
  const format = "--format=csv,noheader,nounits";
  return `nvidia-smi ${query} ${format}`;
}
/**
 * Stores auto-detected facts (GPU count, model lists) in a server's config
 * entry.
 *
 * The config file is only rewritten when the stored entry would actually
 * change. Model lists are passed on every manual refresh, so treating their
 * mere presence as a change would rewrite the file each time for nothing.
 *
 * @param {string} serverId - Id of the server to update; unknown ids are ignored.
 * @param {{gpuCount?: number, models?: string[], gpuModels?: object[]}} detected
 *   - Detected values. A falsy `gpuCount` and non-array lists are skipped.
 */
function persistDetectedServerInfo(serverId, detected) {
  const servers = loadServers();
  const index = servers.findIndex((server) => server.id === serverId);
  if (index === -1) return;
  const current = servers[index];
  const next = { ...current };
  if (detected.gpuCount && next.gpuCount !== detected.gpuCount) {
    next.gpuCount = detected.gpuCount;
  }
  if (Array.isArray(detected.models)) {
    next.models = detected.models;
  }
  if (Array.isArray(detected.gpuModels)) {
    next.gpuModels = detected.gpuModels;
  }
  // `next` is a spread of `current`, so key order matches and the
  // serialized forms are directly comparable.
  if (JSON.stringify(next) === JSON.stringify(current)) return;
  servers[index] = next;
  saveServers(servers);
}
/**
 * Condenses multi-line command output into a short one-line error message.
 *
 * @param {*} message - Raw output (typically stderr).
 * @returns {string} The last three non-blank lines, trimmed and joined with
 *   " | ", cut to 240 characters.
 */
function cleanError(message) {
  const meaningful = [];
  for (const rawLine of String(message).replace(/\r/g, "").split("\n")) {
    const line = rawLine.trim();
    if (line) meaningful.push(line);
  }
  const tail = meaningful.slice(-3);
  return tail.join(" | ").slice(0, 240);
}
/**
 * Parses metrics output with the parser that matches the probe tool.
 *
 * @param {string} output - Raw stdout of the probe command.
 * @param {number} expectedCount - Configured GPU count (0 when unknown).
 * @param {string} command - Normalized tool name.
 * @returns {object} Result of parseNvidiaSmi for "nvidia-smi", of parseHySmi
 *   otherwise.
 */
function parseProbeOutput(output, expectedCount, command) {
  const parse = command === "nvidia-smi" ? parseNvidiaSmi : parseHySmi;
  return parse(output, expectedCount);
}
/**
 * Parses the CSV rows produced by the nvidia-smi metrics query
 * (index, utilization.gpu, utilization.memory, memory.used, memory.total,
 * temperature.gpu, power.draw).
 *
 * Memory occupancy is derived from memory.used / memory.total whenever both
 * are available. nvidia-smi's `utilization.memory` measures how busy the
 * memory controller was, not how full the memory is, so an idle process
 * holding most of the memory would otherwise show ~0%. The reported value is
 * used only as a fallback when occupancy cannot be computed.
 *
 * Rows with fewer than seven fields, or with an index outside 0..31 (the
 * same range the hy-smi parser accepts), are ignored.
 *
 * @param {string} output - Raw stdout of the query.
 * @param {number} expectedCount - Configured GPU count (0 when unknown).
 * @returns {object} Result of finalizeParsedGpus for the parsed rows.
 */
function parseNvidiaSmi(output, expectedCount) {
  const lines = String(output || "")
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean);
  const byIndex = new Map();
  for (const line of lines) {
    const parts = line.split(",").map((part) => part.trim());
    if (parts.length < 7) continue;
    const index = Number.parseInt(parts[0], 10);
    if (!Number.isFinite(index) || index < 0 || index > 31) continue;
    const gpu = createGpu(index);
    const reportedMemoryUtilization = parseNullableNumber(parts[2]);
    gpu.utilization = parseNullableNumber(parts[1]);
    gpu.memoryUsedMiB = parseNullableNumber(parts[3]);
    gpu.memoryTotalMiB = parseNullableNumber(parts[4]);
    gpu.temperatureC = parseNullableNumber(parts[5]);
    gpu.powerW = parseNullableNumber(parts[6]);
    gpu.raw = line.slice(0, 280);
    const canComputeOccupancy =
      Boolean(gpu.memoryTotalMiB) && (gpu.memoryUsedMiB !== null || reportedMemoryUtilization === null);
    gpu.memoryUtilization = canComputeOccupancy
      ? Math.round(((gpu.memoryUsedMiB || 0) / gpu.memoryTotalMiB) * 1000) / 10
      : reportedMemoryUtilization;
    byIndex.set(index, gpu);
  }
  return finalizeParsedGpus(byIndex, expectedCount);
}
/**
 * Parses combined hy-smi output in which a "__GPU_MONITOR_PRODUCT__" marker
 * separates the metrics section from the product-name section.
 *
 * NOTE(review): no caller and no producer of this marker is visible in this
 * part of the file — confirm whether this function is still used.
 *
 * @param {string} output - Metrics output, optionally followed by the marker
 *   and product-name output.
 * @param {number} expectedCount - Configured GPU count (0 when unknown).
 * @returns {object} The parseHySmi result, with model and vendor merged into
 *   each GPU when product names were found.
 */
function parseHySmiWithProduct(output, expectedCount) {
  const sections = String(output || "").split("__GPU_MONITOR_PRODUCT__");
  const metricsOutput = sections[0];
  const productOutput = sections.length > 1 ? sections[1] : "";
  const parsed = parseHySmi(metricsOutput, expectedCount);
  const productByIndex = parseHyProductNames(productOutput);
  if (productByIndex.size === 0) return parsed;
  const gpus = parsed.gpus.map((gpu) => {
    const product = productByIndex.get(gpu.index) || {};
    return { ...gpu, ...product };
  });
  return { ...parsed, models: collectModels(gpus), gpus };
}
/**
 * Parses model-detection output with the parser that matches the probe tool.
 *
 * @param {string} output - Raw stdout of the model command.
 * @param {string} command - Normalized tool name.
 * @returns {Map<number, object>} Model info keyed by GPU index.
 */
function parseModelOutput(output, command) {
  const parse = command === "nvidia-smi" ? parseNvidiaModels : parseHyProductNames;
  return parse(output);
}
/**
 * Parses "index, name" CSV rows from the nvidia-smi model query.
 *
 * @param {string} output - Raw stdout of the query.
 * @returns {Map<number, {model: string}>} Model name keyed by GPU index.
 *   Rows without a numeric index or a usable name are skipped; commas inside
 *   the name are kept.
 */
function parseNvidiaModels(output) {
  const byIndex = new Map();
  for (const rawLine of String(output || "").split(/\r?\n/)) {
    const line = rawLine.trim();
    if (!line) continue;
    const [indexText, ...nameParts] = line.split(",").map((part) => part.trim());
    if (nameParts.length === 0) continue;
    const index = Number.parseInt(indexText, 10);
    const model = normalizeModelName(nameParts.join(", "));
    if (!Number.isFinite(index) || !model) continue;
    byIndex.set(index, { model });
  }
  return byIndex;
}
/**
 * Parses `hy-smi --showproductname` output.
 *
 * Recognizes lines of the form "HCU[n] ... Card Series: X" (stored as
 * `model`) and "HCU[n] ... Card Vendor: Y" (stored as `vendor`); the prefix
 * may also be DCU or GPU. A line is tested for the series form first.
 *
 * @param {string} output - Raw stdout of the command.
 * @returns {Map<number, {model?: (string|null), vendor?: (string|null)}>}
 *   Product info keyed by card index.
 */
function parseHyProductNames(output) {
  const fields = [
    { key: "model", pattern: /\b(?:HCU|DCU|GPU)\[(\d{1,2})\].*?\bCard\s+Series\s*:\s*(.+)$/i },
    { key: "vendor", pattern: /\b(?:HCU|DCU|GPU)\[(\d{1,2})\].*?\bCard\s+Vendor\s*:\s*(.+)$/i }
  ];
  const byIndex = new Map();
  for (const line of String(output || "").split(/\r?\n/)) {
    for (const { key, pattern } of fields) {
      const match = line.match(pattern);
      if (!match) continue;
      const index = Number.parseInt(match[1], 10);
      const existing = byIndex.get(index) || {};
      byIndex.set(index, { ...existing, [key]: normalizeModelName(match[2]) });
      break;
    }
  }
  return byIndex;
}
/**
 * Parses hy-smi metrics output.
 *
 * Each line that carries a recognizable card index (0..31) is merged into
 * that card's record. If no line does, a fallback treats every line that
 * starts with a one- or two-digit number and mentions a metric unit as one
 * card, numbered in order of appearance.
 *
 * The fallback is limited to `expectedCount` rows when the count is known
 * and to 32 rows otherwise. Limiting by a count of 0 would discard every
 * row and break card-count auto-detection for servers added without one.
 *
 * @param {string} output - Raw stdout of hy-smi.
 * @param {number} expectedCount - Configured GPU count (0 when unknown).
 * @returns {object} Result of finalizeParsedGpus for the parsed cards.
 */
function parseHySmi(output, expectedCount) {
  const lines = String(output || "").split(/\r?\n/);
  const byIndex = new Map();
  for (const line of lines) {
    const index = getGpuIndex(line);
    if (index === null || index < 0 || index > 31) continue;
    const existing = byIndex.get(index) || createGpu(index);
    const next = parseGpuLine(line, existing);
    byIndex.set(index, next);
  }
  if (byIndex.size === 0) {
    const metricLines = lines.filter((line) => /^\s*\d{1,2}\s+/.test(line) && /%|MiB|GiB|W|C/i.test(line));
    const limit = expectedCount > 0 ? Math.min(expectedCount, 32) : 32;
    metricLines.slice(0, limit).forEach((line, index) => {
      byIndex.set(index, parseGpuLine(line, createGpu(index)));
    });
  }
  return finalizeParsedGpus(byIndex, expectedCount);
}
/**
 * Turns a sparse index→GPU map into a dense, classified GPU list.
 *
 * The list length is the larger of the expected count and the highest parsed
 * index plus one; missing slots are filled with blank GPUs. Each GPU is
 * marked "busy", "free", or "unknown" (neither utilization nor memory usage
 * was reported).
 *
 * @param {Map<number, object>} byIndex - Parsed GPUs keyed by index.
 * @param {number} expectedCount - Configured GPU count (0 when unknown).
 * @returns {{totalCount: number, busyCount: number, models: string[], gpus: object[]}}
 */
function finalizeParsedGpus(byIndex, expectedCount) {
  const detectedCount = byIndex.size ? Math.max(...byIndex.keys()) + 1 : 0;
  const totalCount = Math.max(expectedCount || 0, detectedCount);
  const gpus = [];
  let busyCount = 0;
  for (let index = 0; index < totalCount; index += 1) {
    const parsed = byIndex.get(index) || createGpu(index);
    let state;
    if (isGpuBusy(parsed)) {
      state = "busy";
      busyCount += 1;
    } else if (parsed.utilization === null && parsed.memoryUsedMiB === null) {
      state = "unknown";
    } else {
      state = "free";
    }
    gpus.push({ ...parsed, state });
  }
  return { totalCount, busyCount, models: collectModels(gpus), gpus };
}
/**
 * Creates a blank GPU record: state "unknown", every metric null, empty raw
 * text.
 *
 * @param {number} index - Card index.
 * @returns {object} A fresh record for that index.
 */
function createGpu(index) {
  const blank = {
    index: null,
    state: "unknown",
    utilization: null,
    memoryUtilization: null,
    memoryUsedMiB: null,
    memoryTotalMiB: null,
    temperatureC: null,
    powerW: null,
    model: null,
    vendor: null,
    raw: ""
  };
  return { ...blank, index };
}
/**
 * Extracts the card index from one line of hy-smi style output.
 *
 * Recognized shapes, in priority order: a table row "| n ... |", a row
 * starting with "n <temp>C", a row starting with "n DCU|GPU|card", and a
 * label such as "DCU 3", "GPU:3", "card#3" or "card-3" anywhere in the line.
 *
 * @param {*} line - Line of text.
 * @returns {number|null} The one- or two-digit index, or null if none found.
 */
function getGpuIndex(line) {
  const text = String(line);
  const patterns = [
    /^\s*\|\s*(\d{1,2})\s+[^|]+?\|/,
    /^\s*(\d{1,2})\s+\d+(?:\.\d+)?C\s+/i,
    /^\s*(\d{1,2})\s+(?:DCU|GPU|card)/i,
    /\b(?:DCU|GPU|card)\s*[:#-]?\s*(\d{1,2})\b/i
  ];
  const firstHit = patterns.map((pattern) => pattern.exec(text)).find(Boolean);
  return firstHit ? Number.parseInt(firstHit[1], 10) : null;
}
/**
 * Merges the metrics found on one output line into a GPU record.
 *
 * - Percentages in 0..100: the last one becomes `utilization`; on a table
 *   row ("n <temp>C ...") with at least two, the first is the memory
 *   percentage and the second the utilization.
 * - "<n>C" sets the temperature, "<n>W" the power draw.
 * - "<used> MiB|GiB|MB|GB / <total> ..." sets memory used and total in MiB.
 * - The trimmed line is appended to `raw` (joined with " | ", max 280 chars).
 *
 * @param {string} line - One line of probe output.
 * @param {object} gpu - Record to extend; it is not modified.
 * @returns {object} A new record carrying the merged values.
 */
function parseGpuLine(line, gpu) {
  const text = String(line);
  const mergedRaw = [gpu.raw, line.trim()].filter(Boolean).join(" | ").slice(0, 280);
  const next = { ...gpu, raw: mergedRaw };

  const percentages = [];
  for (const match of text.matchAll(/(\d+(?:\.\d+)?)\s*%/g)) {
    const value = Number.parseFloat(match[1]);
    if (value >= 0 && value <= 100) percentages.push(value);
  }
  const isTableRow = /^\s*\d{1,2}\s+\d+(?:\.\d+)?C\s+/i.test(line);
  if (isTableRow && percentages.length >= 2) {
    next.utilization = percentages[1];
    next.memoryUtilization = percentages[0];
  } else if (percentages.length > 0) {
    next.utilization = percentages[percentages.length - 1];
  }

  const temperature = text.match(/(\d+(?:\.\d+)?)\s*C\b/i);
  if (temperature) {
    next.temperatureC = Number.parseFloat(temperature[1]);
  }
  const power = text.match(/(\d+(?:\.\d+)?)\s*W\b/i);
  if (power) {
    next.powerW = Number.parseFloat(power[1]);
  }
  const memory = text.match(/(\d+(?:\.\d+)?)\s*(MiB|GiB|MB|GB)\s*\/\s*(\d+(?:\.\d+)?)\s*(MiB|GiB|MB|GB)/i);
  if (memory) {
    const [, usedText, usedUnit, totalText, totalUnit] = memory;
    next.memoryUsedMiB = toMiB(Number.parseFloat(usedText), usedUnit);
    next.memoryTotalMiB = toMiB(Number.parseFloat(totalText), totalUnit);
  }
  return next;
}
/**
 * Converts a memory amount to whole MiB.
 *
 * @param {number} value - Amount in the given unit.
 * @param {string} unit - Unit label; any label containing "g" (GiB, GB) is
 *   multiplied by 1024, everything else is taken as MiB already.
 * @returns {number} Rounded amount in MiB.
 */
function toMiB(value, unit) {
  const factor = /g/i.test(unit) ? 1024 : 1;
  return Math.round(value * factor);
}
/**
 * Extracts a number from a metrics field such as "45", "60.5 W" or "[N/A]".
 *
 * Everything except digits, dots and minus signs is stripped before parsing.
 * Only null and undefined count as "no value": a numeric 0 must stay 0
 * rather than being mistaken for a missing field.
 *
 * @param {*} value - Field text or number.
 * @returns {number|null} The parsed number, or null when none is present.
 */
function parseNullableNumber(value) {
  const text = value === null || value === undefined ? "" : String(value);
  const normalized = text.replace(/[^\d.-]/g, "");
  if (!normalized) return null;
  const parsed = Number.parseFloat(normalized);
  return Number.isFinite(parsed) ? parsed : null;
}
/**
 * Cleans a model or vendor name reported by a probe tool.
 *
 * Whitespace runs collapse to one space, leading/trailing dashes and colons
 * are stripped, and the result is trimmed.
 *
 * @param {*} value - Raw name.
 * @returns {string|null} The cleaned name, or null when it is empty or one
 *   of the placeholders "N/A" / "unknown" (case-insensitive).
 */
function normalizeModelName(value) {
  const collapsed = String(value || "").replace(/\s+/g, " ");
  const stripped = collapsed.replace(/^[-:]+|[-:]+$/g, "");
  const text = stripped.trim();
  if (!text) return null;
  const placeholders = [/^N\/A$/i, /^unknown$/i];
  return placeholders.some((pattern) => pattern.test(text)) ? null : text;
}
/**
 * Lists the distinct model names present on a set of GPUs.
 *
 * @param {object[]} gpus - GPU records.
 * @returns {string[]} Unique non-empty `model` values in first-seen order.
 */
function collectModels(gpus) {
  const seen = new Set();
  for (const { model } of gpus) {
    if (model) seen.add(model);
  }
  return [...seen];
}
/**
 * Fills in missing model and vendor fields from what is stored for the
 * server.
 *
 * A value already on the GPU wins; next comes the saved entry for the same
 * index; for the model only, a server with exactly one known model uses it
 * for every card.
 *
 * @param {object[]} gpus - GPU records (not modified).
 * @param {object} server - Server record with optional `gpuModels` and `models`.
 * @returns {object[]} New records with `model` and `vendor` set (null when
 *   unknown).
 */
function applySavedModels(gpus, server) {
  const savedByIndex = new Map();
  for (const saved of server.gpuModels || []) {
    savedByIndex.set(saved.index, saved);
  }
  const knownModels = server.models || [];
  const fallbackModel = knownModels.length === 1 ? server.models[0] : null;
  return gpus.map((gpu) => {
    const saved = savedByIndex.get(gpu.index) || {};
    const model = gpu.model || saved.model || fallbackModel || null;
    const vendor = gpu.vendor || saved.vendor || null;
    return { ...gpu, model, vendor };
  });
}
/**
 * Overlays freshly detected model and vendor names onto the GPU list.
 *
 * Detected values win over what a GPU already has. The list is extended
 * with blank GPUs when detection reports a higher index than the list
 * covers. GPUs are taken by array position, so the list is expected to be
 * ordered by index.
 *
 * @param {object[]} gpus - GPU records (not modified).
 * @param {Map<number, object>} modelByIndex - Detected info keyed by index.
 * @returns {object[]} New records with `model` and `vendor` set (null when
 *   unknown).
 */
function mergeGpuModels(gpus, modelByIndex) {
  const detectedCount = modelByIndex.size ? Math.max(...modelByIndex.keys()) + 1 : 0;
  const totalCount = Math.max(gpus.length, detectedCount);
  const merged = [];
  for (let position = 0; position < totalCount; position += 1) {
    const gpu = gpus[position] || createGpu(position);
    const detected = modelByIndex.get(gpu.index) || {};
    merged.push({
      ...gpu,
      model: detected.model || gpu.model || null,
      vendor: detected.vendor || gpu.vendor || null
    });
  }
  return merged;
}
/**
 * Reduces GPU records to the per-card model info that is stored in the
 * config.
 *
 * @param {object[]} gpus - GPU records.
 * @returns {{index: number, model: (string|null), vendor: (string|null)}[]}
 *   One entry per GPU that has a model or a vendor.
 */
function extractGpuModels(gpus) {
  const entries = [];
  for (const gpu of gpus) {
    const model = gpu.model || null;
    const vendor = gpu.vendor || null;
    if (model || vendor) entries.push({ index: gpu.index, model, vendor });
  }
  return entries;
}
/**
 * Decides whether a GPU counts as occupied.
 *
 * @param {object} gpu - GPU record.
 * @returns {boolean} True when utilization is at least 10%, memory
 *   utilization is at least 10%, or at least 512 MiB of memory is in use.
 *   Non-numeric metrics never make a GPU busy.
 */
function isGpuBusy(gpu) {
  const reaches = (metric, threshold) => typeof metric === "number" && metric >= threshold;
  return reaches(gpu.utilization, 10) || reaches(gpu.memoryUtilization, 10) || reaches(gpu.memoryUsedMiB, 512);
}
/**
 * Sends a JSON response that must not be cached.
 *
 * @param {object} res - HTTP response.
 * @param {number} statusCode - HTTP status code.
 * @param {*} payload - Value to serialize as the body.
 */
function sendJson(res, statusCode, payload) {
  // Serialize before touching the response so a failure leaves it unwritten.
  const serialized = JSON.stringify(payload);
  const headers = {
    "Content-Type": "application/json; charset=utf-8",
    "Cache-Control": "no-store"
  };
  res.writeHead(statusCode, headers);
  res.end(serialized);
}
/**
 * Reads and parses a JSON request body of at most 1 MiB.
 *
 * Chunks are collected as bytes and decoded once at the end. Decoding each
 * chunk on its own corrupts any multi-byte character (for example a Chinese
 * group name) that straddles a chunk boundary. The size limit is likewise
 * counted in bytes, and nothing more is buffered once it is exceeded.
 *
 * @param {object} req - Incoming HTTP request stream.
 * @returns {Promise<*>} The parsed value, or `{}` for an empty body. Rejects
 *   with "请求体过大" when the body is too large, "JSON 格式无效" when it
 *   does not parse, or with the stream's own error.
 */
function readJson(req) {
  const maxBytes = 1024 * 1024;
  return new Promise((resolve, reject) => {
    const chunks = [];
    let receivedBytes = 0;
    let tooLarge = false;
    req.on("data", (chunk) => {
      if (tooLarge) return;
      // Chunks are strings if an encoding was set on the stream upstream.
      const bytes = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      receivedBytes += bytes.length;
      if (receivedBytes > maxBytes) {
        tooLarge = true;
        chunks.length = 0;
        reject(new Error("请求体过大"));
        req.destroy();
        return;
      }
      chunks.push(bytes);
    });
    req.on("end", () => {
      if (tooLarge) return;
      const body = Buffer.concat(chunks).toString("utf8");
      if (!body.trim()) return resolve({});
      try {
        resolve(JSON.parse(body));
      } catch {
        reject(new Error("JSON 格式无效"));
      }
    });
    req.on("error", reject);
  });
}
/**
 * Serves a file from PUBLIC_DIR ("/" maps to index.html).
 *
 * Responses: 200 with the file, 400 for a URL that cannot be parsed or
 * decoded, 403 for a path that resolves outside PUBLIC_DIR, 404 when the
 * file cannot be read.
 *
 * Hardening:
 * - URL parsing and percent-decoding are guarded; a malformed escape such as
 *   "%E0%A4%A" would otherwise throw inside the request listener.
 * - The containment check requires a path separator after PUBLIC_DIR. A bare
 *   prefix check would let "/..%2Fpublic-backup/x" reach a sibling directory
 *   whose name merely starts with "public".
 * - The URL base is a fixed placeholder, so the client-controlled Host
 *   header is not involved; only the path is used.
 *
 * @param {object} req - Incoming HTTP request.
 * @param {object} res - HTTP response.
 */
function serveStatic(req, res) {
  let requestedPath;
  try {
    const url = new URL(req.url, "http://localhost");
    requestedPath = decodeURIComponent(url.pathname === "/" ? "/index.html" : url.pathname);
  } catch {
    res.writeHead(400);
    res.end("Bad request");
    return;
  }
  // fs rejects paths containing NUL by throwing synchronously.
  if (requestedPath.includes("\0")) {
    res.writeHead(400);
    res.end("Bad request");
    return;
  }
  const filePath = path.normalize(path.join(PUBLIC_DIR, requestedPath));
  if (filePath !== PUBLIC_DIR && !filePath.startsWith(`${PUBLIC_DIR}${path.sep}`)) {
    res.writeHead(403);
    res.end("Forbidden");
    return;
  }
  fs.readFile(filePath, (error, content) => {
    if (error) {
      res.writeHead(404);
      res.end("Not found");
      return;
    }
    const ext = path.extname(filePath);
    res.writeHead(200, {
      "Content-Type": MIME_TYPES[ext] || "application/octet-stream",
      "Cache-Control": "no-store"
    });
    res.end(content);
  });
}
/**
 * Routes and handles the JSON API.
 *
 * - GET    /api/servers      list servers with their current status
 * - POST   /api/servers      add a server, then probe it in the background
 * - PATCH  /api/servers/:id  update a server, then probe it in the background
 * - DELETE /api/servers/:id  remove a server and its cached status
 * - POST   /api/refresh      probe all servers (with model detection) and wait
 *
 * Anything else gets a 404. Invalid bodies get a 400 with the error message.
 *
 * When adding, a client-supplied id that is already in use is replaced by a
 * fresh one; two servers sharing an id would also share a cached status and
 * could not be edited or deleted independently.
 *
 * The URL base is a fixed placeholder, so a missing or malformed Host header
 * cannot make URL parsing fail; only the path is used.
 *
 * @param {object} req - Incoming HTTP request.
 * @param {object} res - HTTP response.
 * @returns {Promise<void>} Settles once the response has been sent; rejects
 *   on unexpected errors (for example a failed config write).
 */
async function handleApi(req, res) {
  const url = new URL(req.url, "http://localhost");
  const parts = url.pathname.split("/").filter(Boolean);
  const targetsOneServer = parts[0] === "api" && parts[1] === "servers" && Boolean(parts[2]);

  if (req.method === "GET" && url.pathname === "/api/servers") {
    const servers = loadServers().map(publicServer);
    sendJson(res, 200, {
      servers,
      lastRefresh,
      pollIntervalMs: POLL_INTERVAL_MS,
      refreshing: Boolean(refreshInFlight)
    });
    return;
  }

  if (req.method === "POST" && url.pathname === "/api/servers") {
    try {
      const body = await readJson(req);
      let server = normalizeServer(body);
      if (!server) {
        sendJson(res, 400, { error: "服务器地址不能为空" });
        return;
      }
      const servers = loadServers();
      if (servers.some((existing) => existing.id === server.id)) {
        // Re-normalize without the colliding id so a new one is generated.
        server = normalizeServer({ ...body, id: undefined });
      }
      servers.push(server);
      saveServers(servers);
      refreshServer(server, { includeModels: true }).catch((error) => console.error(error));
      sendJson(res, 201, { server: publicServer(server) });
    } catch (error) {
      sendJson(res, 400, { error: error.message });
    }
    return;
  }

  if (req.method === "PATCH" && targetsOneServer) {
    try {
      const body = await readJson(req);
      const servers = loadServers();
      const index = servers.findIndex((server) => server.id === parts[2]);
      if (index === -1) {
        sendJson(res, 404, { error: "服务器不存在" });
        return;
      }
      // The stored id always wins so an update cannot re-key a server.
      const updated = normalizeServer({ ...servers[index], ...body, id: servers[index].id });
      if (!updated) {
        sendJson(res, 400, { error: "服务器地址不能为空" });
        return;
      }
      servers[index] = updated;
      saveServers(servers);
      refreshServer(updated, { includeModels: true }).catch((error) => console.error(error));
      sendJson(res, 200, { server: publicServer(updated) });
    } catch (error) {
      sendJson(res, 400, { error: error.message });
    }
    return;
  }

  if (req.method === "DELETE" && targetsOneServer) {
    const servers = loadServers();
    const nextServers = servers.filter((server) => server.id !== parts[2]);
    if (nextServers.length === servers.length) {
      sendJson(res, 404, { error: "服务器不存在" });
      return;
    }
    saveServers(nextServers);
    statusCache.delete(parts[2]);
    sendJson(res, 200, { ok: true });
    return;
  }

  if (req.method === "POST" && url.pathname === "/api/refresh") {
    const result = await refreshAll({ includeModels: true });
    sendJson(res, 200, { ok: true, result, lastRefresh });
    return;
  }

  sendJson(res, 404, { error: "API 不存在" });
}
// HTTP entry point: "/api/..." goes to the JSON API, everything else is
// served as a static file.
//
// An exception thrown synchronously inside a request listener is uncaught
// and takes the whole process down, so one malformed request could stop the
// dashboard for everybody. Both paths are therefore guarded, and an error
// response is only written if no headers have gone out yet.
const server = http.createServer((req, res) => {
  const failRequest = (error) => {
    console.error(error);
    if (!res.headersSent) {
      sendJson(res, 500, { error: error.message });
    } else if (!res.writableEnded) {
      res.end();
    }
  };
  try {
    if (req.url.startsWith("/api/")) {
      handleApi(req, res).catch(failRequest);
      return;
    }
    serveStatic(req, res);
  } catch (error) {
    failRequest(error);
  }
});
// Startup: make sure the config file exists, probe once right away, then
// keep probing on a fixed interval. The server listens on all interfaces so
// other machines on the network can open the dashboard.
ensureDataFile();
refreshAll().catch((error) => console.error(error));
setInterval(() => {
  refreshAll().catch((error) => console.error(error));
}, POLL_INTERVAL_MS);
server.listen(PORT, "0.0.0.0", () => {
  console.log(`GPU/DCU monitor is running at http://localhost:${PORT}`);
});
@echo off
rem Windows launcher for the GPU/DCU dashboard.
rem Keeps environment changes local to this script.
setlocal
rem Run from the folder that contains this script, whatever the caller's cwd.
cd /d "%~dp0"
rem Abort with install instructions when node is not on PATH.
where node >nul 2>nul
if errorlevel 1 (
echo Node.js was not found.
echo Please install Node.js 18 or newer from https://nodejs.org/
echo.
pause
exit /b 1
)
rem Create the data folder on first run.
if not exist data mkdir data
echo Starting GPU/DCU resource dashboard...
echo Open http://localhost:3066 in your browser.
echo Press Ctrl+C in this window to stop the dashboard.
echo.
rem Open the dashboard in the default browser (the empty "" is the window title
rem argument of start), then run the server in this window.
rem NOTE(review): the URL is hard-coded to port 3066 — confirm this is intended
rem when a different port is configured.
start "" "http://localhost:3066"
npm start
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment