Commit 15f150c7 authored by one
Browse files

[hytop] Add `--wait-idle-seconds`, move `--timeout` to the root command

parent 34675024
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
```bash ```bash
uv pip install -e . uv pip install -e .
hytop --help
hytop gpu --help hytop gpu --help
``` ```
...@@ -11,7 +12,23 @@ hytop gpu --help ...@@ -11,7 +12,23 @@ hytop gpu --help
- Python >= 3.10 - Python >= 3.10
- Python packages: `rich`, `typer` - Python packages: `rich`, `typer`
- Passwordless SSH for remote monitoring - Passwordless SSH for remote monitoring
## `hytop`
```bash
# Show the version number
hytop --version
# Specify a timeout for the subcommand
hytop --timeout 300 [COMMAND]
# 0.5-second interval and 5-second rolling window for the subcommand
hytop -n 0.5 --window 5 [COMMAND]
# Specify a list of nodes for the subcommand
hytop -H node01,node02 [COMMAND]
```
## `hytop gpu` ## `hytop gpu`
...@@ -22,15 +39,18 @@ A lightweight script for live `hy-smi` polling with rolling averages across mult ...@@ -22,15 +39,18 @@ A lightweight script for live `hy-smi` polling with rolling averages across mult
Simple examples: Simple examples:
```bash ```bash
# Local node, all GPUs, 5-second rolling window # Local node, all GPUs
hytop -n 1 --window 5 gpu hytop gpu
# Two nodes, monitor only GPU 0 and 1 # Two nodes, 0.5-second interval
hytop -H node01,node02 -n 1 gpu --devices 0,1 hytop -H node01,node02 -n 0.5 gpu
# Exit with code 0 when all monitored GPUs are available # Exit with code 0 when all monitored GPUs are available
hytop gpu --devices 0,1 --wait-idle hytop gpu --devices 0,1 --wait-idle
# Wait for GPUs to be idle for 30 seconds before exiting
hytop gpu --devices 0,1 --wait-idle --wait-idle-seconds 30
# Wait at most 300s for availability (exit 124 on timeout) # Wait at most 300s for availability (exit 124 on timeout)
hytop gpu --devices 0,1 --wait-idle --timeout 300 hytop --timeout 300 gpu --devices 0,1 --wait-idle
...@@ -42,7 +62,7 @@ hytop gpu --showpower --showtemp ...@@ -42,7 +62,7 @@ hytop gpu --showpower --showtemp
Queue jobs in shared environments: Queue jobs in shared environments:
```bash ```bash
if hytop -H node01,node02 gpu --wait-idle --timeout 300; then if hytop -H node01,node02 --timeout 300 gpu --wait-idle; then
echo "GPUs available, starting workload..." echo "GPUs available, starting workload..."
# YOUR COMMAND HERE (e.g., python train.py) # YOUR COMMAND HERE (e.g., python train.py)
else else
......
...@@ -42,7 +42,12 @@ def gpu( ...@@ -42,7 +42,12 @@ def gpu(
wait_idle: bool = typer.Option( wait_idle: bool = typer.Option(
False, False,
"--wait-idle", "--wait-idle",
help="Exit 0 when all monitored GPUs have zero VRAM/HCU avg in the configured window.", help="Exit 0 when all monitored GPUs have zero VRAM/HCU avg.",
),
wait_idle_seconds: float = typer.Option(
10.0,
"--wait-idle-seconds",
help="How long GPUs must stay idle before exiting. Effective only with --wait-idle.",
), ),
showtemp: bool = typer.Option( showtemp: bool = typer.Option(
False, False,
...@@ -74,11 +79,6 @@ def gpu( ...@@ -74,11 +79,6 @@ def gpu(
callback=remember_show_flag_callback, callback=remember_show_flag_callback,
help="Display GPU utilization.", help="Display GPU utilization.",
), ),
timeout: Optional[float] = typer.Option(
None,
"--timeout",
help="Max runtime in seconds. Effective only with --wait-idle.",
),
) -> None: ) -> None:
"""GPU monitoring commands.""" """GPU monitoring commands."""
...@@ -90,6 +90,7 @@ def gpu( ...@@ -90,6 +90,7 @@ def gpu(
host_list = ctx.obj["hosts"] host_list = ctx.obj["hosts"]
interval = ctx.obj["interval"] interval = ctx.obj["interval"]
window_value = ctx.obj["window"] window_value = ctx.obj["window"]
timeout_value = ctx.obj.get("timeout")
selected_show_flags = { selected_show_flags = {
"showtemp": showtemp, "showtemp": showtemp,
"showpower": showpower, "showpower": showpower,
...@@ -111,11 +112,6 @@ def gpu( ...@@ -111,11 +112,6 @@ def gpu(
parsed_device_filter: Optional[Set[int]] = None parsed_device_filter: Optional[Set[int]] = None
if device_filter: if device_filter:
parsed_device_filter = set(parse_csv_ints(device_filter, "--devices")) parsed_device_filter = set(parse_csv_ints(device_filter, "--devices"))
timeout_value = (
float(timeout)
if timeout is not None
else None
)
except ValueError as exc: except ValueError as exc:
typer.echo(f"argument error: {exc}", err=True) typer.echo(f"argument error: {exc}", err=True)
raise typer.Exit(code=2) from exc raise typer.Exit(code=2) from exc
...@@ -127,6 +123,7 @@ def gpu( ...@@ -127,6 +123,7 @@ def gpu(
window=window_value, window=window_value,
interval=interval, interval=interval,
wait_idle=wait_idle, wait_idle=wait_idle,
wait_idle_duration=max(wait_idle_seconds, interval),
timeout=timeout_value, timeout=timeout_value,
) )
raise typer.Exit(code=code) raise typer.Exit(code=code)
......
...@@ -262,6 +262,7 @@ def run_monitor( ...@@ -262,6 +262,7 @@ def run_monitor(
interval: float, interval: float,
wait_idle: bool, wait_idle: bool,
timeout: Optional[float], timeout: Optional[float],
wait_idle_duration: float = 10.0,
) -> int: ) -> int:
"""Run the asynchronous collector + periodic renderer monitor loop. """Run the asynchronous collector + periodic renderer monitor loop.
...@@ -271,6 +272,7 @@ def run_monitor( ...@@ -271,6 +272,7 @@ def run_monitor(
window: Rolling window length in seconds. window: Rolling window length in seconds.
interval: Sampling interval in seconds. interval: Sampling interval in seconds.
wait_idle: Whether to exit when all monitored GPUs become idle. wait_idle: Whether to exit when all monitored GPUs become idle.
wait_idle_duration: How long GPUs must stay idle before exiting.
timeout: Optional timeout for wait-idle mode. timeout: Optional timeout for wait-idle mode.
Returns: Returns:
...@@ -334,7 +336,7 @@ def run_monitor( ...@@ -334,7 +336,7 @@ def run_monitor(
refresh=True, refresh=True,
) )
elapsed_since_start = time.monotonic() - started elapsed_since_start = time.monotonic() - started
warmup_done = elapsed_since_start >= state.max_window warmup_done = elapsed_since_start >= wait_idle_duration
if ( if (
wait_idle wait_idle
and warmup_done and warmup_done
......
from __future__ import annotations from __future__ import annotations
from typing import Optional
import typer import typer
from hytop import __version__ from hytop import __version__
...@@ -50,11 +52,17 @@ def root( ...@@ -50,11 +52,17 @@ def root(
"--window", "--window",
help="Single rolling window in seconds. Default: 5.0", help="Single rolling window in seconds. Default: 5.0",
), ),
timeout: Optional[float] = typer.Option(
None,
"--timeout",
help="Max runtime in seconds.",
),
) -> None: ) -> None:
"""Root callback that parses global options and stores them in context.""" """Root callback that parses global options and stores them in context."""
try: try:
host_list = parse_csv_strings(hosts, "--hosts") host_list = parse_csv_strings(hosts, "--hosts")
window_value = parse_positive_float(str(window), "--window") window_value = parse_positive_float(str(window), "--window")
timeout_value = float(timeout) if timeout is not None else None
except ValueError as exc: except ValueError as exc:
typer.echo(f"argument error: {exc}", err=True) typer.echo(f"argument error: {exc}", err=True)
raise typer.Exit(code=2) from exc raise typer.Exit(code=2) from exc
...@@ -63,6 +71,7 @@ def root( ...@@ -63,6 +71,7 @@ def root(
"hosts": host_list, "hosts": host_list,
"interval": interval, "interval": interval,
"window": window_value, "window": window_value,
"timeout": timeout_value,
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment