Commit 15f150c7 authored by one's avatar one
Browse files

[hytop] Add `--wait-idle-seconds`, move `--timeout` to the root command

parent 34675024
......@@ -4,6 +4,7 @@
```bash
uv pip install -e .
hytop --help
hytop gpu --help
```
......@@ -11,7 +12,23 @@ hytop gpu --help
- Python >= 3.10
- Python packages: `rich`, `typer`
- Passwordless SSH for remote monitoring
- Passwordless SSH for remote nodes
## `hytop`
```bash
# Show the version number
hytop --version
# Specify a timeout for the subcommand
hytop --timeout 300 [COMMAND]
# 0.5-second interval and 5-second rolling window for the subcommand
hytop -n 0.5 --window 5 [COMMAND]
# Specify a list of nodes for the subcommand
hytop -H node01,node02 [COMMAND]
```
## `hytop gpu`
......@@ -22,15 +39,18 @@ A lightweight script for live `hy-smi` polling with rolling averages across mult
Simple examples:
```bash
# Local node, all GPUs, 5-second rolling window
hytop -n 1 --window 5 gpu
# Local node, all GPUs
hytop gpu
# Two nodes, monitor only GPU 0 and 1
hytop -H node01,node02 -n 1 gpu --devices 0,1
# Two nodes, 0.5-second interval
hytop -H node01,node02 -n 0.5 gpu
# Exit with code 0 when all monitored GPUs are available
hytop gpu --devices 0,1 --wait-idle
# Wait for GPUs to be idle for 30 seconds before exiting
hytop gpu --devices 0,1 --wait-idle --wait-idle-seconds 30
# Wait at most 300s for availability (exit 124 on timeout)
hytop --timeout 300 gpu --devices 0,1 --wait-idle
......@@ -42,7 +62,7 @@ hytop gpu --showpower --showtemp
Queue jobs in shared environments:
```bash
if hytop -H node01,node02 gpu --wait-idle --timeout 300; then
if hytop -H node01,node02 --timeout 300 gpu --wait-idle; then
echo "GPUs available, starting workload..."
# YOUR COMMAND HERE (e.g., python train.py)
else
......
......@@ -42,7 +42,12 @@ def gpu(
wait_idle: bool = typer.Option(
False,
"--wait-idle",
help="Exit 0 when all monitored GPUs have zero VRAM/HCU avg in the configured window.",
help="Exit 0 when all monitored GPUs have zero VRAM/HCU avg.",
),
wait_idle_seconds: float = typer.Option(
10.0,
"--wait-idle-seconds",
help="How long GPUs must stay idle before exiting. Effective only with --wait-idle.",
),
showtemp: bool = typer.Option(
False,
......@@ -74,11 +79,6 @@ def gpu(
callback=remember_show_flag_callback,
help="Display GPU utilization.",
),
timeout: Optional[float] = typer.Option(
None,
"--timeout",
help="Max runtime in seconds. Effective only with --wait-idle.",
),
) -> None:
"""GPU monitoring commands."""
......@@ -90,6 +90,7 @@ def gpu(
host_list = ctx.obj["hosts"]
interval = ctx.obj["interval"]
window_value = ctx.obj["window"]
timeout_value = ctx.obj.get("timeout")
selected_show_flags = {
"showtemp": showtemp,
"showpower": showpower,
......@@ -111,11 +112,6 @@ def gpu(
parsed_device_filter: Optional[Set[int]] = None
if device_filter:
parsed_device_filter = set(parse_csv_ints(device_filter, "--devices"))
timeout_value = (
float(timeout)
if timeout is not None
else None
)
except ValueError as exc:
typer.echo(f"argument error: {exc}", err=True)
raise typer.Exit(code=2) from exc
......@@ -127,6 +123,7 @@ def gpu(
window=window_value,
interval=interval,
wait_idle=wait_idle,
wait_idle_duration=max(wait_idle_seconds, interval),
timeout=timeout_value,
)
raise typer.Exit(code=code)
......
......@@ -262,6 +262,7 @@ def run_monitor(
interval: float,
wait_idle: bool,
timeout: Optional[float],
wait_idle_duration: float = 10.0,
) -> int:
"""Run the asynchronous collector + periodic renderer monitor loop.
......@@ -271,6 +272,7 @@ def run_monitor(
window: Rolling window length in seconds.
interval: Sampling interval in seconds.
wait_idle: Whether to exit when all monitored GPUs become idle.
wait_idle_duration: How long GPUs must stay idle before exiting.
timeout: Optional timeout for wait-idle mode.
Returns:
......@@ -334,7 +336,7 @@ def run_monitor(
refresh=True,
)
elapsed_since_start = time.monotonic() - started
warmup_done = elapsed_since_start >= state.max_window
warmup_done = elapsed_since_start >= wait_idle_duration
if (
wait_idle
and warmup_done
......
from __future__ import annotations
from typing import Optional
import typer
from hytop import __version__
......@@ -50,11 +52,17 @@ def root(
"--window",
help="Single rolling window in seconds. Default: 5.0",
),
timeout: Optional[float] = typer.Option(
None,
"--timeout",
help="Max runtime in seconds.",
),
) -> None:
"""Root callback that parses global options and stores them in context."""
try:
host_list = parse_csv_strings(hosts, "--hosts")
window_value = parse_positive_float(str(window), "--window")
timeout_value = float(timeout) if timeout is not None else None
except ValueError as exc:
typer.echo(f"argument error: {exc}", err=True)
raise typer.Exit(code=2) from exc
......@@ -63,6 +71,7 @@ def root(
"hosts": host_list,
"interval": interval,
"window": window_value,
"timeout": timeout_value,
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment