Commit 187bf098 authored by zk's avatar zk
Browse files

Add model asset inventory

parent a932965f
node_modules/ node_modules/
data/servers.json data/servers.json
data/backups/
*.log *.log
*.tar *.tar
*.zip *.zip
......
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
## 2026-05-28 ## 2026-05-28
- 新增服务器配置定期备份,默认保存到 `data/backups/` 并保留最近 30 份。
- 新增模型资产盘点,支持展示每台服务器 `/models`、`/public`、`/data` 等目录下的模型文件/目录和 Docker images。
- 新增“刷新模型资产”入口,并支持按模型路径、模型名和镜像名称进行搜索。
- 新增采集并发控制,GPU/DCU 状态默认最多同时采集 8 台,模型资产默认最多同时采集 3 台。
- 新增 `hy-smi` 登录 Shell 兜底采集方式,兼容需要加载启动脚本后才能使用 DTK 环境的服务器。 - 新增 `hy-smi` 登录 Shell 兜底采集方式,兼容需要加载启动脚本后才能使用 DTK 环境的服务器。
- 验证本地可以访问 `10.17.26.107`,并把同样的采集方式同步到公共部署服务。 - 验证本地可以访问 `10.17.26.107`,并把同样的采集方式同步到公共部署服务。
......
...@@ -24,8 +24,10 @@ ...@@ -24,8 +24,10 @@
- 同时显示显存占用和算力占用。 - 同时显示显存占用和算力占用。
- 根据显存占用和算力占用综合判断每张卡是否可用。 - 根据显存占用和算力占用综合判断每张卡是否可用。
- 主界面用水位色块展示每张卡的显存和算力占用。 - 主界面用水位色块展示每张卡的显存和算力占用。
- 右上角支持按服务器名称、IP、分组、标签和型号进行模糊搜索。 - 盘点每台服务器 `/models``/public``/data` 等目录下的模型文件/目录,并展示 Docker images。
- 右上角支持按服务器名称、IP、分组、标签、型号、模型路径和镜像名称进行模糊搜索。
- 型号只在新增服务器和手动刷新时重新识别,日常自动刷新只采集占用数据。 - 型号只在新增服务器和手动刷新时重新识别,日常自动刷新只采集占用数据。
- 定期备份服务器配置文件,便于误操作后恢复。
## 环境准备 ## 环境准备
...@@ -152,6 +154,7 @@ ssh root@10.0.0.13 nvidia-smi ...@@ -152,6 +154,7 @@ ssh root@10.0.0.13 nvidia-smi
Environment=PORT=3066 Environment=PORT=3066
Environment=POLL_INTERVAL_MS=10000 Environment=POLL_INTERVAL_MS=10000
Environment=SSH_TIMEOUT_MS=20000 Environment=SSH_TIMEOUT_MS=20000
Environment=ASSET_REFRESH_INTERVAL_MS=1800000
ExecStart=/usr/bin/node /opt/gpu-dcu-monitor/server.js ExecStart=/usr/bin/node /opt/gpu-dcu-monitor/server.js
Restart=always Restart=always
RestartSec=3 RestartSec=3
...@@ -199,6 +202,14 @@ data/servers.sample.json ...@@ -199,6 +202,14 @@ data/servers.sample.json
服务器配置支持 `group` 分组字段。页面会根据已有服务器自动生成分组筛选入口;`tags` 用于补充额外标签。 服务器配置支持 `group` 分组字段。页面会根据已有服务器自动生成分组筛选入口;`tags` 用于补充额外标签。
程序会定期把服务器配置备份到:
```text
data/backups/
```
默认保留最近 30 份备份。
## 环境变量 ## 环境变量
Windows PowerShell 示例: Windows PowerShell 示例:
...@@ -221,6 +232,14 @@ PORT=3066 POLL_INTERVAL_MS=10000 SSH_TIMEOUT_MS=20000 npm start ...@@ -221,6 +232,14 @@ PORT=3066 POLL_INTERVAL_MS=10000 SSH_TIMEOUT_MS=20000 npm start
- `PORT`:网页端口,默认 `3066` - `PORT`:网页端口,默认 `3066`
- `POLL_INTERVAL_MS`:自动采集间隔,默认 `10000` 毫秒。 - `POLL_INTERVAL_MS`:自动采集间隔,默认 `10000` 毫秒。
- `SSH_TIMEOUT_MS`:单台服务器 SSH/采集命令超时,默认 `20000` 毫秒。部分 NVIDIA 机器执行 `nvidia-smi` 较慢时可以继续调大。 - `SSH_TIMEOUT_MS`:单台服务器 SSH/采集命令超时,默认 `20000` 毫秒。部分 NVIDIA 机器执行 `nvidia-smi` 较慢时可以继续调大。
- `REFRESH_CONCURRENCY`:GPU/DCU 状态采集并发数,默认 `8`
- `ASSET_REFRESH_INTERVAL_MS`:模型资产和 Docker 镜像盘点间隔,默认 `1800000` 毫秒。
- `ASSET_SSH_TIMEOUT_MS`:单台服务器资产盘点 SSH 超时,默认 `30000` 毫秒。
- `ASSET_CONCURRENCY`:资产盘点并发数,默认 `3`
- `ASSET_PATHS`:模型目录扫描路径,默认 `/models,/public,/data`
- `ASSET_MAX_ITEMS`:每台服务器最多返回的模型条目和镜像条目数量,默认 `160`
- `BACKUP_INTERVAL_MS`:服务器配置定期备份间隔,默认 `86400000` 毫秒。
- `BACKUP_RETENTION`:服务器配置备份保留份数,默认 `30`
- `SSH_PATH`:自定义 SSH 程序路径。Windows 默认使用 `C:\Windows\System32\OpenSSH\ssh.exe` - `SSH_PATH`:自定义 SSH 程序路径。Windows 默认使用 `C:\Windows\System32\OpenSSH\ssh.exe`
## 运维命令 ## 运维命令
...@@ -244,3 +263,4 @@ ss -lntp | grep 3066 ...@@ -244,3 +263,4 @@ ss -lntp | grep 3066
- 如果页面能打开但服务器显示离线,优先在部署机器上手动执行 `ssh root@目标IP hy-smi``ssh root@目标IP nvidia-smi` - 如果页面能打开但服务器显示离线,优先在部署机器上手动执行 `ssh root@目标IP hy-smi``ssh root@目标IP nvidia-smi`
- 如果 NVIDIA 服务器偶发超时,可以调大 `SSH_TIMEOUT_MS` - 如果 NVIDIA 服务器偶发超时,可以调大 `SSH_TIMEOUT_MS`
- 如果多人共同查看,建议部署在一台固定机器上,由这台机器统一采集。 - 如果多人共同查看,建议部署在一台固定机器上,由这台机器统一采集。
- 模型资产盘点默认不计算目录大小,避免对大模型盘产生明显 IO 压力。
...@@ -5,6 +5,8 @@ const state = { ...@@ -5,6 +5,8 @@ const state = {
query: "", query: "",
selectedId: null, selectedId: null,
pollIntervalMs: 10000, pollIntervalMs: 10000,
assetRefreshing: false,
assetDetailLoading: false,
timer: null timer: null
}; };
...@@ -37,6 +39,7 @@ document.querySelector("#addServerBtn").addEventListener("click", () => openDial ...@@ -37,6 +39,7 @@ document.querySelector("#addServerBtn").addEventListener("click", () => openDial
document.querySelector("#emptyAddBtn").addEventListener("click", () => openDialog()); document.querySelector("#emptyAddBtn").addEventListener("click", () => openDialog());
document.querySelector("#closeDialogBtn").addEventListener("click", () => els.dialog.close()); document.querySelector("#closeDialogBtn").addEventListener("click", () => els.dialog.close());
document.querySelector("#refreshBtn").addEventListener("click", manualRefresh); document.querySelector("#refreshBtn").addEventListener("click", manualRefresh);
document.querySelector("#assetRefreshBtn").addEventListener("click", refreshAssets);
document.querySelectorAll(".filter").forEach((button) => { document.querySelectorAll(".filter").forEach((button) => {
button.addEventListener("click", () => { button.addEventListener("click", () => {
state.filter = button.dataset.filter; state.filter = button.dataset.filter;
...@@ -58,12 +61,14 @@ async function loadServers() { ...@@ -58,12 +61,14 @@ async function loadServers() {
const payload = await requestJson("/api/servers"); const payload = await requestJson("/api/servers");
state.servers = payload.servers || []; state.servers = payload.servers || [];
state.pollIntervalMs = payload.pollIntervalMs || state.pollIntervalMs; state.pollIntervalMs = payload.pollIntervalMs || state.pollIntervalMs;
state.assetRefreshing = Boolean(payload.assetRefreshing);
els.lastRefresh.textContent = payload.lastRefresh ? `更新 ${formatTime(payload.lastRefresh)}` : "等待刷新"; els.lastRefresh.textContent = payload.lastRefresh ? `更新 ${formatTime(payload.lastRefresh)}` : "等待刷新";
if (!state.selectedId && state.servers[0]) state.selectedId = state.servers[0].id; if (!state.selectedId && state.servers[0]) state.selectedId = state.servers[0].id;
if (state.selectedId && !state.servers.some((server) => server.id === state.selectedId)) { if (state.selectedId && !state.servers.some((server) => server.id === state.selectedId)) {
state.selectedId = state.servers[0]?.id || null; state.selectedId = state.servers[0]?.id || null;
} }
render(); render();
loadSelectedAssets();
scheduleNextLoad(); scheduleNextLoad();
} catch (error) { } catch (error) {
showToast(error.message); showToast(error.message);
...@@ -71,6 +76,23 @@ async function loadServers() { ...@@ -71,6 +76,23 @@ async function loadServers() {
} }
} }
/**
 * Fetch the detailed asset inventory of the currently selected server and
 * merge it into `state.servers`.
 *
 * Only one request runs at a time (`state.assetDetailLoading`). The server id
 * is pinned when the request is issued so the response is always stored on the
 * server it belongs to, even if the user selects another card while the
 * request is in flight. In that case the detail panel is not re-rendered for
 * the stale selection, and a follow-up load is started for the new one.
 *
 * Failures are logged with `console.warn` and otherwise ignored.
 *
 * @returns {Promise<void>}
 */
async function loadSelectedAssets() {
  if (!state.selectedId || state.assetDetailLoading) return;
  // Remember which server this request is for; state.selectedId may change
  // before the response arrives.
  const requestedId = state.selectedId;
  state.assetDetailLoading = true;
  try {
    const payload = await requestJson(`/api/servers/${encodeURIComponent(requestedId)}/assets`);
    const server = state.servers.find((item) => item.id === requestedId);
    if (server && payload.assets) {
      server.assets = payload.assets;
      // Only repaint the detail panel when it still shows this server.
      if (state.selectedId === requestedId) renderDetail();
    }
  } catch (error) {
    console.warn(error);
  } finally {
    state.assetDetailLoading = false;
  }
  // A selection made while the request was in flight was rejected by the guard
  // at the top, so load the details for the new selection now.
  if (state.selectedId && state.selectedId !== requestedId) {
    await loadSelectedAssets();
  }
}
function scheduleNextLoad() { function scheduleNextLoad() {
window.clearTimeout(state.timer); window.clearTimeout(state.timer);
state.timer = window.setTimeout(loadServers, state.pollIntervalMs); state.timer = window.setTimeout(loadServers, state.pollIntervalMs);
...@@ -90,6 +112,20 @@ async function manualRefresh() { ...@@ -90,6 +112,20 @@ async function manualRefresh() {
} }
} }
/**
 * Click handler for the "refresh model assets" button.
 *
 * Disables the button, asks the backend to re-run the asset inventory, reloads
 * the server list and reports the outcome through a toast. The button is
 * re-enabled whether the request succeeded or failed.
 *
 * @returns {Promise<void>}
 */
async function refreshAssets() {
  const trigger = document.querySelector("#assetRefreshBtn");
  const setBusy = (busy) => {
    trigger.disabled = busy;
  };

  setBusy(true);
  try {
    await requestJson("/api/assets/refresh", { method: "POST" });
    await loadServers();
    showToast("模型资产刷新完成");
  } catch (failure) {
    showToast(failure.message);
  } finally {
    setBusy(false);
  }
}
function render() { function render() {
renderStats(); renderStats();
renderGroups(); renderGroups();
...@@ -165,6 +201,8 @@ function renderStats() { ...@@ -165,6 +201,8 @@ function renderStats() {
setText("#countFree", totals.freeServers); setText("#countFree", totals.freeServers);
setText("#countBusy", totals.busyServers); setText("#countBusy", totals.busyServers);
setText("#countOffline", totals.offlineServers); setText("#countOffline", totals.offlineServers);
const assetButton = document.querySelector("#assetRefreshBtn");
if (assetButton) assetButton.disabled = state.assetRefreshing;
} }
function renderGrid() { function renderGrid() {
...@@ -183,12 +221,14 @@ function renderGrid() { ...@@ -183,12 +221,14 @@ function renderGrid() {
card.addEventListener("click", () => { card.addEventListener("click", () => {
state.selectedId = server.id; state.selectedId = server.id;
render(); render();
loadSelectedAssets();
}); });
card.addEventListener("keydown", (event) => { card.addEventListener("keydown", (event) => {
if (event.key === "Enter" || event.key === " ") { if (event.key === "Enter" || event.key === " ") {
event.preventDefault(); event.preventDefault();
state.selectedId = server.id; state.selectedId = server.id;
render(); render();
loadSelectedAssets();
} }
}); });
card.querySelector(".edit-card").addEventListener("click", (event) => { card.querySelector(".edit-card").addEventListener("click", (event) => {
...@@ -202,6 +242,7 @@ function renderGrid() { ...@@ -202,6 +242,7 @@ function renderGrid() {
function serverCardHtml(server) { function serverCardHtml(server) {
const status = server.status || {}; const status = server.status || {};
const assets = server.assets || {};
const kind = getServerKind(server); const kind = getServerKind(server);
const serverLevel = serverOccupancyClass(server); const serverLevel = serverOccupancyClass(server);
const totalCount = status.totalCount || server.gpuCount || 0; const totalCount = status.totalCount || server.gpuCount || 0;
...@@ -225,6 +266,11 @@ function serverCardHtml(server) { ...@@ -225,6 +266,11 @@ function serverCardHtml(server) {
<div class="gpu-grid"> <div class="gpu-grid">
${gpuChips(status.gpus || [], totalCount, kind)} ${gpuChips(status.gpus || [], totalCount, kind)}
</div> </div>
<div class="asset-summary ${assets.state === "failed" ? "failed" : ""}">
<span>模型 ${assets.modelCount || 0}</span>
<span>镜像 ${assets.dockerCount || 0}</span>
<em>${assetUpdatedText(assets)}</em>
</div>
<div class="tag-list"> <div class="tag-list">
${tags.map((tag) => `<span class="tag">${escapeHtml(tag)}</span>`).join("")} ${tags.map((tag) => `<span class="tag">${escapeHtml(tag)}</span>`).join("")}
<button class="icon-button edit-card" type="button" aria-label="编辑服务器">✎</button> <button class="icon-button edit-card" type="button" aria-label="编辑服务器">✎</button>
...@@ -278,6 +324,7 @@ function renderDetail() { ...@@ -278,6 +324,7 @@ function renderDetail() {
} }
const status = server.status || {}; const status = server.status || {};
const assets = server.assets || {};
const kind = getServerKind(server); const kind = getServerKind(server);
const totalCount = status.totalCount || server.gpuCount || 0; const totalCount = status.totalCount || server.gpuCount || 0;
els.detail.innerHTML = ` els.detail.innerHTML = `
...@@ -300,10 +347,66 @@ function renderDetail() { ...@@ -300,10 +347,66 @@ function renderDetail() {
<div class="gpu-list"> <div class="gpu-list">
${(status.gpus || []).map(gpuRowHtml).join("")} ${(status.gpus || []).map(gpuRowHtml).join("")}
</div> </div>
${assetPanelHtml(assets)}
`; `;
document.querySelector("#detailEditBtn").addEventListener("click", () => openDialog(server)); document.querySelector("#detailEditBtn").addEventListener("click", () => openDialog(server));
} }
/**
 * Build the HTML for the "model assets" section of the detail panel.
 *
 * Each column renders at most 80 entries. When a list is empty, the summary
 * count (`modelCount` / `dockerCount`) decides between a "loading details"
 * placeholder and a "nothing found" placeholder.
 *
 * @param {object} assets Asset status of one server.
 * @returns {string} HTML markup for the panel.
 */
function assetPanelHtml(assets) {
  const models = assets.modelItems || [];
  const images = assets.dockerImages || [];

  // Rendered entries when present, otherwise one of the two placeholders.
  const listOrPlaceholder = (entries, renderEntry, knownCount, loadingHtml, emptyHtml) => {
    if (entries.length) {
      return entries.slice(0, 80).map(renderEntry).join("");
    }
    return knownCount ? loadingHtml : emptyHtml;
  };

  const modelListHtml = listOrPlaceholder(
    models,
    modelItemHtml,
    assets.modelCount,
    `<div class="asset-empty">正在加载模型详情</div>`,
    `<div class="asset-empty">未发现模型目录或文件</div>`
  );
  const dockerListHtml = listOrPlaceholder(
    images,
    dockerImageHtml,
    assets.dockerCount,
    `<div class="asset-empty">正在加载镜像详情</div>`,
    `<div class="asset-empty">未发现 Docker 镜像</div>`
  );
  const updatedLabel = assetUpdatedText(assets);
  const errorHtml = assets.error ? `<div class="asset-error">${escapeHtml(assets.error)}</div>` : "";

  return `
<section class="asset-panel">
<div class="asset-head">
<div>
<p class="eyebrow">模型资产</p>
<h3>模型路径与镜像</h3>
</div>
<span>${updatedLabel}</span>
</div>
${errorHtml}
<div class="asset-columns">
<div class="asset-column">
<div class="asset-title"><strong>挂载目录模型</strong><span>${models.length}</span></div>
<div class="asset-list">${modelListHtml}</div>
</div>
<div class="asset-column">
<div class="asset-title"><strong>Docker Images</strong><span>${images.length}</span></div>
<div class="asset-list">${dockerListHtml}</div>
</div>
</div>
</section>`;
}
/**
 * Render one model file/directory entry of the asset panel.
 *
 * Missing name, path or root fall back to "-". Every entry whose `type` is
 * not "file" is labelled as a directory.
 *
 * @param {object} item Model entry with `name`, `path`, `root` and `type`.
 * @returns {string} HTML markup for the entry.
 */
function modelItemHtml(item) {
  const label = escapeHtml(item.name || "-");
  const location = escapeHtml(item.path || "-");
  const origin = escapeHtml(item.root || "-");
  const kindSuffix = item.type === "file" ? " · 文件" : " · 目录";
  return `
<div class="asset-item">
<strong>${label}</strong>
<span>${location}</span>
<em>${origin}${kindSuffix}</em>
</div>`;
}
/**
 * Render one Docker image entry of the asset panel.
 *
 * Every missing field is shown as "-".
 *
 * @param {object} image Image entry with `repository`, `tag`, `imageId`,
 *   `size` and `created`.
 * @returns {string} HTML markup for the entry.
 */
function dockerImageHtml(image) {
  const repo = escapeHtml(image.repository || "-");
  const tag = escapeHtml(image.tag || "-");
  const shortId = escapeHtml(image.imageId || "-");
  const diskSize = escapeHtml(image.size || "-");
  const age = escapeHtml(image.created || "-");
  return `
<div class="asset-item">
<strong>${repo}:${tag}</strong>
<span>${shortId} · ${diskSize}</span>
<em>${age}</em>
</div>`;
}
function gpuRowHtml(gpu) { function gpuRowHtml(gpu) {
const utilization = normalizePercent(gpu.utilization); const utilization = normalizePercent(gpu.utilization);
const memoryUtilization = normalizePercent(gpu.memoryUtilization); const memoryUtilization = normalizePercent(gpu.memoryUtilization);
...@@ -377,11 +480,35 @@ function filteredServers() { ...@@ -377,11 +480,35 @@ function filteredServers() {
const kind = getServerKind(server); const kind = getServerKind(server);
const matchesFilter = state.filter === "all" || state.filter === kind; const matchesFilter = state.filter === "all" || state.filter === kind;
const matchesGroup = state.groupFilter === "all" || serverGroup(server) === state.groupFilter; const matchesGroup = state.groupFilter === "all" || serverGroup(server) === state.groupFilter;
const text = [server.name, server.host, server.user, serverGroup(server), modelSummary(server), ...(server.tags || [])].join(" ").toLowerCase(); const text = [
server.name,
server.host,
server.user,
serverGroup(server),
modelSummary(server),
...(server.tags || []),
assetSearchText(server)
]
.join(" ")
.toLowerCase();
return matchesFilter && matchesGroup && (!state.query || text.includes(state.query)); return matchesFilter && matchesGroup && (!state.query || text.includes(state.query));
}); });
} }
/**
 * Produce the text the search box matches a server's assets against.
 *
 * A non-empty `assets.searchText` is used as is. Otherwise the text is
 * assembled from model name/path/root and image repository/tag/id, skipping
 * empty values.
 *
 * @param {object} server Server entry, optionally carrying `assets`.
 * @returns {string} Space-separated search text (may be empty).
 */
function assetSearchText(server) {
  const inventory = server.assets || {};
  if (inventory.searchText) {
    return inventory.searchText;
  }

  const words = [];
  const collect = (...candidates) => {
    for (const candidate of candidates) {
      if (candidate) words.push(candidate);
    }
  };
  (inventory.modelItems || []).forEach((item) => collect(item.name, item.path, item.root));
  (inventory.dockerImages || []).forEach((image) => collect(image.repository, image.tag, image.imageId));
  return words.join(" ");
}
/**
 * Short label describing the state of a server's asset inventory.
 *
 * @param {object} [assets] Asset status; may be missing.
 * @returns {string} "not inventoried" when there is no `updatedAt`, "failed"
 *   when the last run failed, otherwise the formatted inventory time (all in
 *   the UI's language).
 */
function assetUpdatedText(assets) {
  const inventoriedAt = assets?.updatedAt;
  if (!inventoriedAt) {
    return "未盘点";
  }
  return assets.state === "failed" ? "盘点失败" : `盘点 ${formatTime(inventoriedAt)}`;
}
function serverGroup(server) { function serverGroup(server) {
return String(server.group || "未分组").trim() || "未分组"; return String(server.group || "未分组").trim() || "未分组";
} }
......
...@@ -39,6 +39,9 @@ ...@@ -39,6 +39,9 @@
<button class="ghost-action" id="refreshBtn" type="button"> <button class="ghost-action" id="refreshBtn" type="button">
<span aria-hidden="true"></span>手动刷新 <span aria-hidden="true"></span>手动刷新
</button> </button>
<button class="ghost-action" id="assetRefreshBtn" type="button">
<span aria-hidden="true"></span>刷新模型资产
</button>
</div> </div>
</aside> </aside>
...@@ -50,7 +53,7 @@ ...@@ -50,7 +53,7 @@
</div> </div>
<div class="search-wrap"> <div class="search-wrap">
<span aria-hidden="true"></span> <span aria-hidden="true"></span>
<input id="searchInput" type="search" placeholder="搜索服务器、IP、标签" /> <input id="searchInput" type="search" placeholder="搜索服务器、IP、模型、镜像" />
</div> </div>
</section> </section>
......
...@@ -303,7 +303,7 @@ h2 { ...@@ -303,7 +303,7 @@ h2 {
.server-grid { .server-grid {
display: grid; display: grid;
grid-template-columns: repeat(auto-fill, minmax(min(100%, 270px), 1fr)); grid-template-columns: repeat(auto-fill, minmax(min(100%, 270px), 1fr));
grid-auto-rows: minmax(340px, auto); grid-auto-rows: minmax(368px, auto);
gap: 14px; gap: 14px;
align-items: stretch; align-items: stretch;
flex: 1 1 auto; flex: 1 1 auto;
...@@ -316,7 +316,7 @@ h2 { ...@@ -316,7 +316,7 @@ h2 {
position: relative; position: relative;
display: flex; display: flex;
flex-direction: column; flex-direction: column;
min-height: 340px; min-height: 368px;
min-width: 0; min-width: 0;
border: 1px solid var(--line); border: 1px solid var(--line);
border-radius: 12px; border-radius: 12px;
...@@ -573,6 +573,40 @@ h2 { ...@@ -573,6 +573,40 @@ h2 {
margin-top: 12px; margin-top: 12px;
} }
.asset-summary {
display: grid;
grid-template-columns: auto auto minmax(0, 1fr);
gap: 6px;
align-items: center;
min-width: 0;
margin-top: 10px;
color: var(--muted);
font-size: 12px;
}
.asset-summary span {
border-radius: 999px;
background: #eef6f6;
color: var(--teal-dark);
padding: 3px 7px;
font-weight: 800;
}
.asset-summary em {
min-width: 0;
overflow: hidden;
color: var(--muted);
font-style: normal;
text-align: right;
text-overflow: ellipsis;
white-space: nowrap;
}
.asset-summary.failed span {
background: #fde8e8;
color: #9f2f2f;
}
.tag { .tag {
max-width: 100%; max-width: 100%;
border-radius: 999px; border-radius: 999px;
...@@ -680,6 +714,97 @@ h2 { ...@@ -680,6 +714,97 @@ h2 {
gap: 10px; gap: 10px;
} }
.asset-panel {
display: grid;
gap: 12px;
margin-top: 16px;
}
.asset-head,
.asset-title {
display: flex;
align-items: center;
justify-content: space-between;
gap: 10px;
}
.asset-head h3 {
margin-top: 2px;
font-size: 18px;
}
.asset-head span,
.asset-title span,
.asset-item span,
.asset-item em,
.asset-empty,
.asset-error {
color: var(--muted);
font-size: 12px;
}
.asset-columns {
display: grid;
grid-template-columns: 1fr;
gap: 12px;
}
.asset-column {
display: grid;
gap: 8px;
min-width: 0;
}
.asset-title {
border-bottom: 1px solid var(--line);
padding-bottom: 6px;
}
.asset-list {
display: grid;
gap: 8px;
max-height: 360px;
overflow-y: auto;
padding-right: 4px;
}
.asset-item {
display: grid;
gap: 3px;
min-width: 0;
border: 1px solid var(--line);
border-radius: 9px;
background: #fff;
padding: 9px;
}
.asset-item strong,
.asset-item span,
.asset-item em {
min-width: 0;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.asset-item em {
font-style: normal;
}
.asset-empty,
.asset-error {
border: 1px dashed var(--line);
border-radius: 9px;
background: var(--panel-soft);
padding: 10px;
}
.asset-error {
border-color: #f3b2b2;
background: #fde8e8;
color: #9f2f2f;
}
.gpu-row { .gpu-row {
background: #fff; background: #fff;
} }
......
...@@ -7,9 +7,18 @@ const crypto = require("crypto"); ...@@ -7,9 +7,18 @@ const crypto = require("crypto");
const PORT = Number(process.env.PORT || 3066); const PORT = Number(process.env.PORT || 3066);
const POLL_INTERVAL_MS = Number(process.env.POLL_INTERVAL_MS || 10000); const POLL_INTERVAL_MS = Number(process.env.POLL_INTERVAL_MS || 10000);
const SSH_TIMEOUT_MS = Number(process.env.SSH_TIMEOUT_MS || 20000); const SSH_TIMEOUT_MS = Number(process.env.SSH_TIMEOUT_MS || 20000);
const REFRESH_CONCURRENCY = clampInt(process.env.REFRESH_CONCURRENCY, 1, 32, 8);
const ASSET_REFRESH_INTERVAL_MS = Number(process.env.ASSET_REFRESH_INTERVAL_MS || 30 * 60 * 1000);
const ASSET_SSH_TIMEOUT_MS = Number(process.env.ASSET_SSH_TIMEOUT_MS || 30000);
const ASSET_CONCURRENCY = clampInt(process.env.ASSET_CONCURRENCY, 1, 16, 3);
const ASSET_MAX_ITEMS = clampInt(process.env.ASSET_MAX_ITEMS, 20, 1000, 160);
const ASSET_PATHS = parseCsv(process.env.ASSET_PATHS || "/models,/public,/data");
const BACKUP_INTERVAL_MS = Number(process.env.BACKUP_INTERVAL_MS || 24 * 60 * 60 * 1000);
const BACKUP_RETENTION = clampInt(process.env.BACKUP_RETENTION, 1, 365, 30);
const ROOT = __dirname; const ROOT = __dirname;
const DATA_DIR = path.join(ROOT, "data"); const DATA_DIR = path.join(ROOT, "data");
const CONFIG_PATH = path.join(DATA_DIR, "servers.json"); const CONFIG_PATH = path.join(DATA_DIR, "servers.json");
const BACKUP_DIR = path.join(DATA_DIR, "backups");
const PUBLIC_DIR = path.join(ROOT, "public"); const PUBLIC_DIR = path.join(ROOT, "public");
const MIME_TYPES = { const MIME_TYPES = {
...@@ -23,9 +32,12 @@ const MIME_TYPES = { ...@@ -23,9 +32,12 @@ const MIME_TYPES = {
}; };
let statusCache = new Map(); let statusCache = new Map();
let assetCache = new Map();
let lastRefresh = null; let lastRefresh = null;
let lastAssetRefresh = null;
let refreshInFlight = null; let refreshInFlight = null;
let refreshInFlightIncludesModels = false; let refreshInFlightIncludesModels = false;
let assetRefreshInFlight = null;
function createId() { function createId() {
if (typeof crypto.randomUUID === "function") return crypto.randomUUID(); if (typeof crypto.randomUUID === "function") return crypto.randomUUID();
...@@ -38,6 +50,9 @@ function ensureDataFile() { ...@@ -38,6 +50,9 @@ function ensureDataFile() {
if (!fs.existsSync(DATA_DIR)) { if (!fs.existsSync(DATA_DIR)) {
fs.mkdirSync(DATA_DIR, { recursive: true }); fs.mkdirSync(DATA_DIR, { recursive: true });
} }
if (!fs.existsSync(BACKUP_DIR)) {
fs.mkdirSync(BACKUP_DIR, { recursive: true });
}
if (!fs.existsSync(CONFIG_PATH)) { if (!fs.existsSync(CONFIG_PATH)) {
fs.writeFileSync(CONFIG_PATH, "[]\n", "utf8"); fs.writeFileSync(CONFIG_PATH, "[]\n", "utf8");
} }
...@@ -62,6 +77,61 @@ function saveServers(servers) { ...@@ -62,6 +77,61 @@ function saveServers(servers) {
fs.renameSync(tmp, CONFIG_PATH); fs.renameSync(tmp, CONFIG_PATH);
} }
/**
 * Copy the server configuration file into the backup directory and prune old
 * backups afterwards.
 *
 * The backup is named `servers.<timestamp>[.<reason>].json`. Copy or prune
 * failures are logged as warnings and never thrown to the caller.
 *
 * @param {string} [reason] Optional label appended to the file name.
 * @returns {void}
 */
function backupServerConfig(reason) {
  ensureDataFile();
  if (!fs.existsSync(CONFIG_PATH)) {
    return;
  }

  const nameParts = ["servers", compactTimestamp(new Date())];
  if (reason) {
    nameParts.push(reason);
  }
  nameParts.push("json");
  const destination = path.join(BACKUP_DIR, nameParts.join("."));

  try {
    fs.copyFileSync(CONFIG_PATH, destination);
    pruneBackups();
  } catch (failure) {
    console.warn(`Server config backup failed: ${failure.message}`);
  }
}
/**
 * Delete the oldest server-config backups so that at most `BACKUP_RETENTION`
 * remain in `BACKUP_DIR`.
 *
 * Only files named `servers.<14 digits>…​.json` are considered; anything else
 * in the directory is left alone. Backups are ordered by the timestamp in the
 * file name rather than by file mtime: a copied file may inherit the source's
 * mtime on some platforms, which would make mtime ordering delete the newest
 * backups instead of the oldest. Ordering by name also avoids a `stat` call
 * per file that could throw and abort the whole prune.
 *
 * Failure to delete a single backup is logged and does not stop the rest.
 *
 * @returns {void}
 */
function pruneBackups() {
  if (!fs.existsSync(BACKUP_DIR)) return;
  const backupName = /^servers\.(\d{14}).*\.json$/;
  // Newest first: compare the fixed-width timestamp, then the full name so the
  // order is deterministic for backups taken within the same second.
  const newestFirst = (a, b) => {
    if (a.stamp !== b.stamp) return a.stamp < b.stamp ? 1 : -1;
    if (a.name !== b.name) return a.name < b.name ? 1 : -1;
    return 0;
  };
  const backups = fs
    .readdirSync(BACKUP_DIR)
    .map((name) => ({ name, match: backupName.exec(name) }))
    .filter((entry) => entry.match !== null)
    .map(({ name, match }) => ({
      name,
      stamp: match[1],
      filePath: path.join(BACKUP_DIR, name)
    }))
    .sort(newestFirst);
  backups.slice(BACKUP_RETENTION).forEach((backup) => {
    try {
      fs.unlinkSync(backup.filePath);
    } catch (error) {
      console.warn(`Failed to prune backup ${backup.name}: ${error.message}`);
    }
  });
}
/**
 * Format a date as a compact `YYYYMMDDHHmmss` string in local time.
 *
 * @param {Date} date Date to format.
 * @returns {string} Year followed by zero-padded month, day, hours, minutes
 *   and seconds.
 */
function compactTimestamp(date) {
  const twoDigits = (part) => `${part}`.padStart(2, "0");
  const clock = [
    date.getMonth() + 1, // getMonth() is zero-based
    date.getDate(),
    date.getHours(),
    date.getMinutes(),
    date.getSeconds()
  ];
  return `${date.getFullYear()}${clock.map((part) => twoDigits(part)).join("")}`;
}
/**
 * Split a comma-separated setting into trimmed, non-empty entries.
 *
 * @param {*} value Raw setting; falsy values yield an empty list.
 * @returns {string[]} At most the first 24 non-empty entries.
 */
function parseCsv(value) {
  const entries = [];
  for (const piece of String(value || "").split(",")) {
    if (entries.length === 24) break;
    const cleaned = piece.trim();
    if (cleaned) {
      entries.push(cleaned);
    }
  }
  return entries;
}
function normalizeServer(input) { function normalizeServer(input) {
if (!input || typeof input !== "object") return null; if (!input || typeof input !== "object") return null;
const host = String(input.host || "").trim(); const host = String(input.host || "").trim();
...@@ -131,11 +201,60 @@ function optionalInt(value, min, max) { ...@@ -131,11 +201,60 @@ function optionalInt(value, min, max) {
return clampInt(value, min, max, 0); return clampInt(value, min, max, 0);
} }
function publicServer(server) { function publicServer(server, options = {}) {
const cached = statusCache.get(server.id); const cached = statusCache.get(server.id);
const assets = assetCache.get(server.id);
return { return {
...server, ...server,
status: cached || createPendingStatus(server) status: cached || createPendingStatus(server),
assets: publicAssetStatus(assets || createPendingAssetStatus(), options.includeAssetDetails)
};
}
/**
 * Shape a cached asset status for an API response.
 *
 * The summary fields and the search text are always included. The full model
 * and image lists are only attached when `includeDetails` is truthy; otherwise
 * both lists are returned empty.
 *
 * @param {object} assets Cached (or pending) asset status.
 * @param {boolean} [includeDetails] Whether to include the item lists.
 * @returns {object} Asset status safe to send to the client.
 */
function publicAssetStatus(assets, includeDetails) {
  const models = assets.modelItems || [];
  const images = assets.dockerImages || [];
  return {
    state: assets.state,
    updatedAt: assets.updatedAt,
    latencyMs: assets.latencyMs,
    paths: assets.paths,
    // Fall back to the list length when no (non-zero) count was recorded.
    modelCount: assets.modelCount || models.length,
    dockerCount: assets.dockerCount || images.length,
    searchText: assetSearchText(models, images),
    error: assets.error || null,
    modelItems: includeDetails ? models : [],
    dockerImages: includeDetails ? images : []
  };
}
/**
 * Build the search text sent to the client for one server's assets.
 *
 * Joins model name/path/root and image repository/tag/id with spaces,
 * skipping empty values, and truncates the result to 12000 characters.
 *
 * @param {object[]} modelItems Model entries.
 * @param {object[]} dockerImages Docker image entries.
 * @returns {string} Search text, at most 12000 characters long.
 */
function assetSearchText(modelItems, dockerImages) {
  const words = [];
  const collect = (...candidates) => {
    for (const candidate of candidates) {
      if (candidate) words.push(candidate);
    }
  };
  modelItems.forEach((item) => collect(item.name, item.path, item.root));
  dockerImages.forEach((image) => collect(image.repository, image.tag, image.imageId));
  return words.join(" ").slice(0, 12000);
}
/**
 * Placeholder asset status for a server that has not been inventoried yet.
 *
 * @returns {object} Status in the "pending" state with zero counts, fresh
 *   empty lists and the configured scan paths.
 */
function createPendingAssetStatus() {
  const nothingFound = {
    modelCount: 0,
    dockerCount: 0,
    modelItems: [],
    dockerImages: []
  };
  return {
    state: "pending",
    updatedAt: null,
    latencyMs: null,
    paths: ASSET_PATHS,
    ...nothingFound,
    error: null
  };
}
...@@ -177,7 +296,7 @@ async function refreshAll(options = {}) { ...@@ -177,7 +296,7 @@ async function refreshAll(options = {}) {
} }
const servers = loadServers(); const servers = loadServers();
refreshInFlightIncludesModels = includeModels; refreshInFlightIncludesModels = includeModels;
refreshInFlight = Promise.all(servers.map((server) => refreshServer(server, { includeModels }))) refreshInFlight = mapWithConcurrency(servers, REFRESH_CONCURRENCY, (server) => refreshServer(server, { includeModels }))
.then((results) => { .then((results) => {
lastRefresh = new Date().toISOString(); lastRefresh = new Date().toISOString();
return results; return results;
...@@ -189,6 +308,35 @@ async function refreshAll(options = {}) { ...@@ -189,6 +308,35 @@ async function refreshAll(options = {}) {
return refreshInFlight; return refreshInFlight;
} }
/**
 * Run the asset inventory for every configured server.
 *
 * Concurrent callers share one run: while a sweep is in flight, its promise
 * is returned instead of starting another. The in-flight marker is cleared
 * when the sweep settles, whether it succeeded or failed, and
 * `lastAssetRefresh` is only updated on success.
 *
 * @returns {Promise<object[]>} Per-server results from `refreshServerAssets`.
 */
async function refreshAssetsAll() {
  if (assetRefreshInFlight) {
    return assetRefreshInFlight;
  }

  const targets = loadServers();
  const sweep = async () => {
    try {
      const outcomes = await mapWithConcurrency(targets, ASSET_CONCURRENCY, refreshServerAssets);
      lastAssetRefresh = new Date().toISOString();
      return outcomes;
    } finally {
      assetRefreshInFlight = null;
    }
  };

  assetRefreshInFlight = sweep();
  return assetRefreshInFlight;
}
/**
 * Map `items` through an async `worker`, running at most `limit` calls at a
 * time.
 *
 * A fixed number of lanes pull the next unclaimed index from a shared cursor
 * until the list is exhausted, so results keep the order of `items` whatever
 * order the calls finish in. The returned promise rejects as soon as any
 * worker call rejects.
 *
 * @param {Array} items Inputs to process.
 * @param {number} limit Maximum number of concurrent worker calls.
 * @param {(item: *, index: number) => Promise<*>} worker Async mapper.
 * @returns {Promise<Array>} Results positioned like their inputs.
 */
async function mapWithConcurrency(items, limit, worker) {
  const output = new Array(items.length);
  let cursor = 0;

  // Claiming an index is synchronous, so two lanes never take the same one.
  const drain = async () => {
    for (let index = cursor++; index < items.length; index = cursor++) {
      output[index] = await worker(items[index], index);
    }
  };

  const laneCount = Math.min(limit, items.length);
  await Promise.all(Array.from({ length: laneCount }, () => drain()));
  return output;
}
async function refreshServer(server, options = {}) { async function refreshServer(server, options = {}) {
const includeModels = Boolean(options.includeModels); const includeModels = Boolean(options.includeModels);
const started = Date.now(); const started = Date.now();
...@@ -253,7 +401,40 @@ async function refreshServer(server, options = {}) { ...@@ -253,7 +401,40 @@ async function refreshServer(server, options = {}) {
} }
} }
function runProbeCommand(server, remoteCommand = buildRemoteCommand(server.command)) { async function refreshServerAssets(server) {
const started = Date.now();
try {
const output = await runProbeCommand(server, buildAssetCommand(), ASSET_SSH_TIMEOUT_MS);
const parsed = parseAssetOutput(output.stdout);
const status = {
state: "online",
updatedAt: new Date().toISOString(),
latencyMs: Date.now() - started,
paths: ASSET_PATHS,
modelCount: parsed.modelItems.length,
dockerCount: parsed.dockerImages.length,
modelItems: parsed.modelItems,
dockerImages: parsed.dockerImages,
error: null
};
assetCache.set(server.id, status);
return { id: server.id, ok: true };
} catch (error) {
const previous = assetCache.get(server.id) || createPendingAssetStatus();
const status = {
...previous,
state: "failed",
updatedAt: new Date().toISOString(),
latencyMs: Date.now() - started,
paths: ASSET_PATHS,
error: error.message
};
assetCache.set(server.id, status);
return { id: server.id, ok: false, error: error.message };
}
}
function runProbeCommand(server, remoteCommand = buildRemoteCommand(server.command), timeoutMs = SSH_TIMEOUT_MS) {
const target = server.user ? `${server.user}@${server.host}` : server.host; const target = server.user ? `${server.user}@${server.host}` : server.host;
const sshPath = process.env.SSH_PATH || (process.platform === "win32" ? "C:\\Windows\\System32\\OpenSSH\\ssh.exe" : "ssh"); const sshPath = process.env.SSH_PATH || (process.platform === "win32" ? "C:\\Windows\\System32\\OpenSSH\\ssh.exe" : "ssh");
const args = [ const args = [
...@@ -262,7 +443,7 @@ function runProbeCommand(server, remoteCommand = buildRemoteCommand(server.comma ...@@ -262,7 +443,7 @@ function runProbeCommand(server, remoteCommand = buildRemoteCommand(server.comma
"-o", "-o",
"BatchMode=yes", "BatchMode=yes",
"-o", "-o",
`ConnectTimeout=${Math.ceil(SSH_TIMEOUT_MS / 1000)}`, `ConnectTimeout=${Math.ceil(timeoutMs / 1000)}`,
"-o", "-o",
"StrictHostKeyChecking=accept-new", "StrictHostKeyChecking=accept-new",
target, target,
...@@ -279,8 +460,8 @@ function runProbeCommand(server, remoteCommand = buildRemoteCommand(server.comma ...@@ -279,8 +460,8 @@ function runProbeCommand(server, remoteCommand = buildRemoteCommand(server.comma
if (settled) return; if (settled) return;
settled = true; settled = true;
child.kill(); child.kill();
reject(new Error(`SSH 超时 (${Math.ceil(SSH_TIMEOUT_MS / 1000)}s)`)); reject(new Error(`SSH 超时 (${Math.ceil(timeoutMs / 1000)}s)`));
}, SSH_TIMEOUT_MS); }, timeoutMs);
child.stdout.on("data", (chunk) => { child.stdout.on("data", (chunk) => {
stdout += chunk.toString("utf8"); stdout += chunk.toString("utf8");
...@@ -334,6 +515,34 @@ function buildModelCommand(command) { ...@@ -334,6 +515,34 @@ function buildModelCommand(command) {
return buildHySmiCommand("--showproductname"); return buildHySmiCommand("--showproductname");
} }
/**
 * Build the single shell command line used to inventory model assets and
 * Docker images on a remote server.
 *
 * The command prints a `__GPU_MONITOR_MODELS__` marker followed by
 * tab-separated `MODEL` records, then a `__GPU_MONITOR_DOCKER__` marker
 * followed by tab-separated `DOCKER` records.
 *
 * @returns {string} Shell command text.
 */
function buildAssetCommand() {
// Fall back to the built-in directories when no scan paths are configured.
const paths = ASSET_PATHS.length ? ASSET_PATHS : ["/models", "/public", "/data"];
const pathList = paths.map(shellQuote).join(" ");
// Share the overall budget between paths, but allow at least 20 lines each.
const perPathLimit = Math.max(20, Math.ceil(ASSET_MAX_ITEMS / paths.length));
// The doubled backslashes keep \t, \n and \( literal in the generated text,
// so find (not JavaScript) interprets them.
const modelCommand = [
`for p in ${pathList}; do`,
`if [ -d "$p" ]; then`,
`{`,
// Directories up to two levels below the path, skipping dot-directories and __pycache__.
`find "$p" -mindepth 1 -maxdepth 2 -type d ! -name '.*' ! -name '__pycache__' -printf 'MODEL\\t%p\\td\\t%TY-%Tm-%Td %TH:%TM\\n' 2>/dev/null;`,
// Files directly inside the path whose extension is in the listed set.
`find "$p" -mindepth 1 -maxdepth 1 -type f \\( -iname '*.gguf' -o -iname '*.safetensors' -o -iname '*.bin' -o -iname '*.onnx' -o -iname '*.pt' -o -iname '*.pth' -o -iname '*.ckpt' \\) -printf 'MODEL\\t%p\\tf\\t%TY-%Tm-%Td %TH:%TM\\n' 2>/dev/null;`,
`} | head -n ${perPathLimit};`,
`fi;`,
`done | head -n ${ASSET_MAX_ITEMS}`
].join(" ");
// One DOCKER record per image; errors from docker itself are discarded.
const dockerCommand = [
"docker images",
"--format 'DOCKER\\t{{.Repository}}\\t{{.Tag}}\\t{{.ID}}\\t{{.Size}}\\t{{.CreatedSince}}'",
`2>/dev/null | head -n ${ASSET_MAX_ITEMS}`
].join(" ");
// NOTE(review): the trailing "|| true" follows the docker pipeline, presumably
// so a failing docker step does not fail the whole command — confirm.
return [
"printf '__GPU_MONITOR_MODELS__\\n';",
modelCommand,
"; printf '__GPU_MONITOR_DOCKER__\\n';",
dockerCommand,
"|| true"
].join(" ");
}
function buildHySmiCommand(args) { function buildHySmiCommand(args) {
const command = ["hy-smi", args].filter(Boolean).join(" "); const command = ["hy-smi", args].filter(Boolean).join(" ");
return `(${command} 2>/dev/null || bash -ilc ${shellQuote(command)})`; return `(${command} 2>/dev/null || bash -ilc ${shellQuote(command)})`;
...@@ -439,6 +648,82 @@ function parseModelOutput(output, command) { ...@@ -439,6 +648,82 @@ function parseModelOutput(output, command) {
return parseHyProductNames(output); return parseHyProductNames(output);
} }
/**
 * Parse the text printed by the remote asset command.
 *
 * The output has two sections introduced by the `__GPU_MONITOR_MODELS__` and
 * `__GPU_MONITOR_DOCKER__` marker lines. Within a section only tab-separated
 * records with the matching `MODEL` / `DOCKER` prefix are read; everything
 * else is ignored. Images without a repository (or with `<none>`) are
 * dropped. Both lists are de-duplicated and capped at `ASSET_MAX_ITEMS`.
 *
 * @param {string} output Raw stdout of the asset command.
 * @returns {{modelItems: object[], dockerImages: object[]}} Parsed inventory.
 */
function parseAssetOutput(output) {
  const sectionByMarker = {
    __GPU_MONITOR_MODELS__: "models",
    __GPU_MONITOR_DOCKER__: "docker"
  };

  // MODEL <path> <d|f> <modified>; returns null when the record has no usable path.
  const toModelItem = (record) => {
    const fields = record.split("\t");
    const filePath = fields[1] || "";
    const baseName = path.posix.basename(filePath.replace(/\\/g, "/"));
    if (!filePath || !baseName) return null;
    return {
      name: normalizeAssetName(baseName),
      path: filePath,
      root: assetRoot(filePath),
      type: fields[2] === "f" ? "file" : "dir",
      modifiedAt: fields[3] || ""
    };
  };

  // DOCKER <repository> <tag> <id> <size> <created…>; returns null for untagged repositories.
  const toDockerImage = (record) => {
    const fields = record.split("\t");
    const repository = normalizeAssetName(fields[1]);
    if (!repository || repository === "<none>") return null;
    return {
      repository,
      tag: normalizeAssetName(fields[2]) || "<none>",
      imageId: normalizeAssetName(fields[3]),
      size: normalizeAssetName(fields[4]),
      created: normalizeAssetName(fields.slice(5).join(" "))
    };
  };

  const models = [];
  const images = [];
  let activeSection = "";
  for (const rawLine of String(output || "").split(/\r?\n/)) {
    const record = rawLine.trim();
    if (!record) continue;

    if (Object.hasOwn(sectionByMarker, record)) {
      activeSection = sectionByMarker[record];
    } else if (activeSection === "models" && record.startsWith("MODEL\t")) {
      const item = toModelItem(record);
      if (item) models.push(item);
    } else if (activeSection === "docker" && record.startsWith("DOCKER\t")) {
      const image = toDockerImage(record);
      if (image) images.push(image);
    }
  }

  const modelKey = (item) => `${item.path}:${item.type}`;
  const imageKey = (item) => `${item.repository}:${item.tag}:${item.imageId}`;
  return {
    modelItems: dedupeBy(models, modelKey).slice(0, ASSET_MAX_ITEMS),
    dockerImages: dedupeBy(images, imageKey).slice(0, ASSET_MAX_ITEMS)
  };
}
/**
 * Normalizes a display value: converts it to a string, collapses each run of
 * whitespace into one space and removes leading/trailing whitespace.
 *
 * @param {*} value - Value to normalize; any falsy value yields "".
 * @returns {string} The whitespace-normalized text.
 */
function normalizeAssetName(value) {
  const text = value ? String(value) : "";
  // Splitting on whitespace runs and dropping the empty edge pieces both
  // collapses inner whitespace and trims the ends.
  return text.split(/\s+/).filter(Boolean).join(" ");
}
/**
 * Determines which inventory root a path belongs to.
 *
 * Backslashes are first converted to forward slashes. The first entry of
 * ASSET_PATHS that equals the path, or is a directory prefix of it, is
 * returned. When nothing matches, the first two "/"-separated segments of the
 * path are used, and "/" is returned if that is empty too.
 *
 * @param {*} filePath - Path of a discovered asset; falsy values act as "".
 * @returns {string} The matching root, the two-segment prefix, or "/".
 */
function assetRoot(filePath) {
  const unixPath = String(filePath || "").replace(/\\/g, "/");

  let matchedRoot;
  for (const candidate of ASSET_PATHS) {
    if (unixPath === candidate || unixPath.startsWith(`${candidate}/`)) {
      matchedRoot = candidate;
      break;
    }
  }
  if (matchedRoot) {
    return matchedRoot;
  }

  const leadingSegments = unixPath.split("/").slice(0, 2);
  const prefix = leadingSegments.join("/");
  return prefix ? prefix : "/";
}
/**
 * Removes duplicates from a sequence, keeping the first item seen for each key
 * and preserving the original order.
 *
 * @param {Iterable<*>} items - Items to filter.
 * @param {function(*): *} keyFn - Called once per item to compute its key.
 * @returns {Array<*>} A new array with the first item for every distinct key.
 */
function dedupeBy(items, keyFn) {
  // A Map iterates in insertion order, so the values come back in the order
  // their keys were first encountered.
  const firstByKey = new Map();
  for (const entry of items) {
    const key = keyFn(entry);
    if (!firstByKey.has(key)) {
      firstByKey.set(key, entry);
    }
  }
  return [...firstByKey.values()];
}
function parseNvidiaModels(output) { function parseNvidiaModels(output) {
const byIndex = new Map(); const byIndex = new Map();
const lines = String(output || "") const lines = String(output || "")
...@@ -716,12 +1001,30 @@ async function handleApi(req, res) { ...@@ -716,12 +1001,30 @@ async function handleApi(req, res) {
const parts = url.pathname.split("/").filter(Boolean); const parts = url.pathname.split("/").filter(Boolean);
if (req.method === "GET" && url.pathname === "/api/servers") { if (req.method === "GET" && url.pathname === "/api/servers") {
const servers = loadServers().map(publicServer); const includeAssetDetails = url.searchParams.get("assetDetails") === "1";
const servers = loadServers().map((server) => publicServer(server, { includeAssetDetails }));
sendJson(res, 200, { sendJson(res, 200, {
servers, servers,
lastRefresh, lastRefresh,
lastAssetRefresh,
pollIntervalMs: POLL_INTERVAL_MS, pollIntervalMs: POLL_INTERVAL_MS,
refreshing: Boolean(refreshInFlight) refreshing: Boolean(refreshInFlight),
assetRefreshing: Boolean(assetRefreshInFlight),
assetRefreshIntervalMs: ASSET_REFRESH_INTERVAL_MS,
assetPaths: ASSET_PATHS
});
return;
}
if (req.method === "GET" && parts[0] === "api" && parts[1] === "servers" && parts[2] && parts[3] === "assets") {
const servers = loadServers();
const server = servers.find((item) => item.id === parts[2]);
if (!server) {
sendJson(res, 404, { error: "服务器不存在" });
return;
}
sendJson(res, 200, {
assets: publicAssetStatus(assetCache.get(server.id) || createPendingAssetStatus(), true)
}); });
return; return;
} }
...@@ -788,6 +1091,12 @@ async function handleApi(req, res) { ...@@ -788,6 +1091,12 @@ async function handleApi(req, res) {
return; return;
} }
if (req.method === "POST" && url.pathname === "/api/assets/refresh") {
const result = await refreshAssetsAll();
sendJson(res, 200, { ok: true, result, lastAssetRefresh });
return;
}
sendJson(res, 404, { error: "API 不存在" }); sendJson(res, 404, { error: "API 不存在" });
} }
...@@ -803,10 +1112,20 @@ const server = http.createServer((req, res) => { ...@@ -803,10 +1112,20 @@ const server = http.createServer((req, res) => {
}); });
// Process startup: prepare the data file, take a "startup" backup of the
// server configuration and trigger the first status refresh right away.
// Refresh rejections are logged with console.error.
ensureDataFile(); ensureDataFile();
backupServerConfig("startup");
refreshAll().catch((error) => console.error(error)); refreshAll().catch((error) => console.error(error));
// The first asset inventory is started 2000 ms after launch rather than immediately.
setTimeout(() => {
refreshAssetsAll().catch((error) => console.error(error));
}, 2000);
// Recurring status refresh, every POLL_INTERVAL_MS.
setInterval(() => { setInterval(() => {
refreshAll().catch((error) => console.error(error)); refreshAll().catch((error) => console.error(error));
}, POLL_INTERVAL_MS); }, POLL_INTERVAL_MS);
// Recurring "scheduled" configuration backup, every BACKUP_INTERVAL_MS.
setInterval(() => {
backupServerConfig("scheduled");
}, BACKUP_INTERVAL_MS);
// Recurring asset inventory refresh, every ASSET_REFRESH_INTERVAL_MS.
setInterval(() => {
refreshAssetsAll().catch((error) => console.error(error));
}, ASSET_REFRESH_INTERVAL_MS);
server.listen(PORT, "0.0.0.0", () => { server.listen(PORT, "0.0.0.0", () => {
console.log(`GPU/DCU monitor is running at http://localhost:${PORT}`); console.log(`GPU/DCU monitor is running at http://localhost:${PORT}`);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment