service.py 10 KB
Newer Older
one's avatar
one committed
1
2
3
4
5
from __future__ import annotations

import sys
import threading
import time
one's avatar
one committed
6
from collections.abc import Sequence
one's avatar
one committed
7
8
9

from hytop.core.history import SlidingHistory
from hytop.core.ssh import collect_from_host
one's avatar
one committed
10
from hytop.gpu.metrics import hy_smi_args_for_show_flags
one's avatar
one committed
11
12
13
14
from hytop.gpu.models import HostSnapshot, MonitorState, NodeResult
from hytop.gpu.parser import parse_hy_smi_output


one's avatar
one committed
15
16
17
18
19
20
def collect_node(
    host: str,
    ssh_timeout: float,
    cmd_timeout: float,
    hy_smi_args: Sequence[str],
) -> NodeResult:
    """Collect one host snapshot and parse it into structured samples.

    Args:
        host: Hostname or localhost alias.
        ssh_timeout: SSH connect timeout in seconds.
        cmd_timeout: Command timeout in seconds.
        hy_smi_args: Arguments forwarded to the hy-smi invocation on the host.

    Returns:
        Normalized collection result for the host.
    """

    raw = collect_from_host(
        host=host, ssh_timeout=ssh_timeout, cmd_timeout=cmd_timeout, hy_smi_args=hy_smi_args
    )
    if raw.error:
        return NodeResult(host=host, samples={}, error=raw.error)
    # Timestamp with a monotonic clock so downstream freshness checks are
    # immune to wall-clock adjustments.
    sample_ts = time.monotonic()
    samples, parse_error = parse_hy_smi_output(raw.stdout, sample_ts=sample_ts)
    if not samples:
        reason = parse_error or "unknown parse error"
        return NodeResult(host=host, samples={}, error=f"no gpu rows parsed ({reason})")
    return NodeResult(host=host, samples=samples)


def host_collector_loop(
    host: str,
    ssh_timeout: float,
    cmd_timeout: float,
    hy_smi_args: Sequence[str],
    interval: float,
    state: dict[str, HostSnapshot],
    state_lock: threading.Lock,
    stop_event: threading.Event,
) -> None:
    """Continuously collect one host and publish latest snapshot state.

    Args:
        host: Hostname to collect.
        ssh_timeout: SSH connect timeout in seconds.
        cmd_timeout: Command timeout in seconds.
        hy_smi_args: Arguments forwarded to the hy-smi invocation on the host.
        interval: Desired collection interval in seconds.
        state: Shared per-host snapshot map.
        state_lock: Lock guarding shared state writes.
        stop_event: Stop signal for graceful shutdown.
    """

    while not stop_event.is_set():
        started = time.monotonic()
        result = collect_node(host, ssh_timeout, cmd_timeout, hy_smi_args)
        with state_lock:
            snapshot = state[host]
            # Bumping seq signals the render loop that a new result exists.
            snapshot.seq += 1
            snapshot.updated_ts = time.monotonic()
            snapshot.result = result
        # Sleep only the remainder of the interval; waiting on the stop event
        # lets shutdown interrupt the sleep immediately.
        sleep_s = max(0.0, interval - (time.monotonic() - started))
        if stop_event.wait(sleep_s):
            break


def availability_ready(
    window: float,
    histories: dict[tuple[str, int], SlidingHistory],
    monitored_keys: set[tuple[str, int]],
    hosts: list[str],
    errors: dict[str, str],
) -> bool:
    """Check whether every monitored GPU satisfies idle availability criteria.

    Args:
        window: Rolling window length in seconds.
        histories: Sliding histories by host+gpu key.
        monitored_keys: Effective host+gpu keys to evaluate.
        hosts: Host list used for host-level error checks.
        errors: Latest host-level errors.

    Returns:
        True when all monitored GPUs are fresh and window averages are idle.
    """

    if not monitored_keys:
        return False
    now = time.monotonic()
    # Any host-level collection error disqualifies the whole fleet.
    if any(errors.get(h) for h in hosts):
        return False

    def _gpu_idle(key: tuple[str, int]) -> bool:
        history = histories.get(key)
        if history is None:
            return False
        newest = history.latest()
        # Require a fresh sample with both utilization fields present.
        if newest is None or (now - newest.ts) > window:
            return False
        if newest.vram_pct is None or newest.gpu_pct is None:
            return False
        # Both rolling averages must be exactly zero over the window.
        return (
            history.avg("vram_pct", window, now) == 0.0
            and history.avg("gpu_pct", window, now) == 0.0
        )

    return all(_gpu_idle(key) for key in monitored_keys)


def init_monitor_state(
    hosts: list[str],
    device_filter: set[int] | None,
    max_window: float,
) -> MonitorState:
    """Create initial monitor state for the run.

    Args:
        hosts: Host list.
        device_filter: Optional set of GPU ids to monitor.
        max_window: Sliding window length in seconds.

    Returns:
        Initialized monitor state object.
    """

    # With an explicit device filter the monitored set is the cross product of
    # hosts and requested GPU ids; otherwise keys are discovered at runtime.
    if device_filter:
        monitored_keys = {(host, dev) for host in hosts for dev in device_filter}
    else:
        monitored_keys = set()
    return MonitorState(
        max_window=max_window,
        histories={},
        discovered_keys=set(),
        last_applied_sample_ts={},
        monitored_keys=monitored_keys,
        errors={},
        host_state={host: HostSnapshot() for host in hosts},
        processed_seq={host: 0 for host in hosts},
        state_lock=threading.Lock(),
        stop_event=threading.Event(),
    )


def start_collectors(
    hosts: list[str],
    ssh_timeout: float,
    cmd_timeout: float,
    hy_smi_args: Sequence[str],
    interval: float,
    state: MonitorState,
) -> list[threading.Thread]:
    """Start one daemon collector thread per host.

    Args:
        hosts: Host list.
        ssh_timeout: SSH connect timeout in seconds.
        cmd_timeout: Command timeout in seconds.
        hy_smi_args: Arguments forwarded to each hy-smi invocation.
        interval: Desired collection interval in seconds.
        state: Shared monitor state.

    Returns:
        Started collector thread list.
    """

    workers: list[threading.Thread] = []
    for host in hosts:
        # Daemon threads never block interpreter exit; graceful shutdown is
        # coordinated through state.stop_event instead.
        worker = threading.Thread(
            target=host_collector_loop,
            args=(
                host,
                ssh_timeout,
                cmd_timeout,
                hy_smi_args,
                interval,
                state.host_state,
                state.state_lock,
                state.stop_event,
            ),
            daemon=True,
            name=f"collector-{host}",
        )
        worker.start()
        workers.append(worker)
    return workers


one's avatar
one committed
195
def drain_pending_nodes(hosts: list[str], state: MonitorState) -> list[NodeResult]:
    """Fetch unseen host snapshots since the previous render tick.

    Args:
        hosts: Host list used to preserve deterministic ordering.
        state: Shared monitor state.

    Returns:
        Newly published node results to apply this tick.
    """

    fresh: list[NodeResult] = []
    with state.state_lock:
        for host in hosts:
            snap = state.host_state[host]
            if snap.seq > state.processed_seq[host]:
                # Mark the snapshot consumed even when its result is missing,
                # so the same sequence number is never re-examined.
                state.processed_seq[host] = snap.seq
                if snap.result is not None:
                    fresh.append(snap.result)
    return fresh


def apply_node_results(
    nodes: list[NodeResult],
    device_filter: set[int] | None,
    state: MonitorState,
) -> None:
    """Apply collected node results into histories and error state.

    Args:
        nodes: Newly collected node results.
        device_filter: Optional GPU id filter.
        state: Shared monitor state.
    """

    for node in nodes:
        if node.error:
            state.errors[node.host] = node.error
            continue
        # A clean result clears any stale error for this host.
        state.errors.pop(node.host, None)
        for gpu_id, sample in node.samples.items():
            key = (node.host, gpu_id)
            state.discovered_keys.add(key)
            if device_filter is not None and gpu_id not in device_filter:
                continue
            if key not in state.histories:
                state.histories[key] = SlidingHistory(max_window_s=state.max_window)
            history = state.histories[key]
            prev_ts = state.last_applied_sample_ts.get(key)
            # Skip duplicates: only samples strictly newer than the last
            # applied timestamp enter the history.
            if prev_ts is None or sample.ts > prev_ts:
                history.add(sample)
                state.last_applied_sample_ts[key] = sample.ts


def run_monitor(
    hosts: list[str],
    device_filter: set[int] | None,
    show_flags: Sequence[str],
    window: float,
    interval: float,
    wait_idle: bool,
    timeout: float | None,
    wait_idle_duration: float = 10.0,
) -> int:
    """Run the GPU monitor as a Textual TUI application.

    Starts per-host collector threads, then launches the Textual App which
    polls the shared MonitorState and refreshes the DataTable on a timer.
    The app returns an integer exit code via ``App.exit(code)``.

    Args:
        hosts: Host list to monitor.
        device_filter: Optional GPU id filter.
        show_flags: Ordered list of show flags controlling displayed columns.
        window: Rolling window length in seconds.
        interval: Sampling interval in seconds.
        wait_idle: Whether to exit when all monitored GPUs become idle.
        timeout: Optional timeout for wait-idle mode.
        wait_idle_duration: How long GPUs must stay idle before exiting.

    Returns:
        Process-style exit code:
            0 for success,
            2 for invalid arguments,
            124 for timeout in wait-idle mode,
            130 when interrupted by user.
    """

    # Import here to avoid a circular import at module load time.
    from hytop.gpu.app import GpuMonitorApp

    if interval <= 0:
        print("argument error: --interval must be > 0", file=sys.stderr)
        return 2
    if interval > window:
        print("argument error: --interval must be <= --window value", file=sys.stderr)
        return 2

    state = init_monitor_state(hosts=hosts, device_filter=device_filter, max_window=window)
    hy_smi_args = hy_smi_args_for_show_flags(show_flags, wait_idle=wait_idle)
    # Derive SSH/command timeouts from the sampling interval, clamped so slow
    # intervals never yield unbounded timeouts and fast ones stay workable.
    ssh_timeout = min(max(5 * interval, 2.0), 5.0)
    cmd_timeout = min(max(10 * interval, 5.0), 10.0)
    started = time.monotonic()

    workers = start_collectors(
        hosts=hosts,
        ssh_timeout=ssh_timeout,
        cmd_timeout=cmd_timeout,
        hy_smi_args=hy_smi_args,
        interval=interval,
        state=state,
    )

    app = GpuMonitorApp(
        hosts=hosts,
        show_flags=show_flags,
        window=window,
        interval=interval,
        wait_idle=wait_idle,
        wait_idle_duration=wait_idle_duration,
        timeout=timeout,
        state=state,
        device_filter=device_filter,
        started=started,
    )

    try:
        app.run()
    except KeyboardInterrupt:
        # Ctrl-C is treated as a normal shutdown path; the finally block stops
        # the collector threads either way.
        pass
    finally:
        state.stop_event.set()
        for worker in workers:
            # Bounded join keeps shutdown snappy even if a collector is
            # mid-SSH; daemon threads cannot block process exit.
            worker.join(timeout=min(0.2, interval))

    return app.return_code or 0