# runtime.py

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import logging
import time
from pathlib import Path

import requests

from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Directory three levels above the one containing this file; presumably the
# repository root -- re-check the index if this file is ever moved.
REPO_ROOT = Path(__file__).resolve().parents[3]
# Path <REPO_ROOT>/dynamo/bin.
DYNAMO_BIN = REPO_ROOT / "dynamo" / "bin"
# NOTE(review): not used in the visible part of this file; judging by the name
# it is a lower bound on the fraction of GPU memory expected to be returned --
# confirm against the callers.
MIN_EXPECTED_MEMORY_RETURN_FRACTION = 0.6


def get_gpu_memory_used(device: int = 0) -> int:
    """Return the amount of memory currently in use on one GPU.

    NVML is initialised for the duration of the query and shut down again
    before returning, even when the query fails.

    Args:
        device: NVML index of the GPU to inspect.

    Returns:
        The ``used`` field of NVML's memory info for the device (NVML
        documents this value as a byte count).
    """
    # Imported lazily so that merely importing this module does not require
    # pynvml to be installed.
    import pynvml as nvml

    nvml.nvmlInit()
    try:
        gpu_handle = nvml.nvmlDeviceGetHandleByIndex(device)
        memory_info = nvml.nvmlDeviceGetMemoryInfo(gpu_handle)
        used_memory = memory_info.used
    finally:
        nvml.nvmlShutdown()
    return used_memory


def send_completion(
    port: int,
    prompt: str = "Hello",
    max_retries: int = 3,
    retry_delay: float = 1.0,
) -> dict:
    """POST a completion request to a local server, retrying on failure.

    The request goes to ``http://localhost:<port>/v1/completions`` for the
    model ``FAULT_TOLERANCE_MODEL_NAME`` with ``max_tokens`` fixed at 20 and
    a 120 second timeout per attempt.

    Args:
        port: Port of the HTTP server on localhost.
        prompt: Prompt text sent in the request body.
        max_retries: Total number of attempts to make (not the number of
            retries after the first attempt). Must be at least 1.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The decoded JSON response, which has a non-empty ``"choices"`` entry.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        requests.exceptions.RequestException: If the final attempt failed at
            the HTTP level (connection error, timeout, error status, ...).
        AssertionError: If the final attempt returned a response with no
            ``"choices"``.
    """
    # Without this guard the loop would not run at all and the function would
    # end in `raise None`, which surfaces as an unrelated TypeError.
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    url = f"http://localhost:{port}/v1/completions"
    payload = {
        "model": FAULT_TOLERANCE_MODEL_NAME,
        "prompt": prompt,
        "max_tokens": 20,
    }

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(url, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json()
            if not result.get("choices"):
                # Raised explicitly instead of via `assert` so the check is
                # not stripped when Python runs with -O; the exception type
                # and message are unchanged.
                raise AssertionError("No choices in response")
        except (requests.exceptions.RequestException, AssertionError) as exc:
            last_error = exc
            # Only log and sleep when another attempt will follow.
            if attempt < max_retries:
                logger.debug(
                    "send_completion attempt %d/%d failed: %s",
                    attempt,
                    max_retries,
                    exc,
                )
                time.sleep(retry_delay)
        else:
            if attempt > 1:
                logger.info("send_completion succeeded after %d attempts", attempt)
            return result

    # Every attempt failed; surface the error from the last one.
    raise last_error  # type: ignore[misc]