# vram_utils.py

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU VRAM utilities for parallel test execution.

Functions:
    detect_gpus()                  Enumerate GPUs via pynvml
    auto_worker_count(gpus, limit) Calculate slot count for -n auto
    write_test_meta(items)         Serialize profiled/requested vram + timeout
    load_test_meta()               Read the serialized test metadata
    print_gpu_plan(gpus, limit, would_run)  Dry-run GPU plan summary

Usage:
    # Sequential (filter only)
    pytest --max-vram-gib=10 -m "gpu_1 and vllm" tests/serve/

    # Parallel (VRAM-aware scheduling)
    pytest --max-vram-gib=10 -n auto -m "gpu_1 and vllm" tests/serve/
"""

from __future__ import annotations

import json
import logging
import os
import tempfile

import pynvml

_logger = logging.getLogger(__name__)

# When 2+ tests run concurrently, reserve 15% of GPU VRAM for CUDA context
# overhead across processes.  A single test gets the full GPU (0% margin).
VRAM_MULTI_PROC_MARGIN = 0.15

_TEST_META_FILENAME = "pytest_gpu_parallel_test_meta.json"


def detect_gpus() -> list[dict]:
    """Enumerate the GPUs visible through NVML (pynvml).

    Returns:
        One dict per GPU with the keys 'index' (NVML device index), 'name'
        (device name, always a ``str``) and 'total_mib' (total device memory
        in whole MiB).  An empty list is returned when ``nvmlInit`` raises
        ``NVMLError`` or when NVML reports zero devices.

    Raises:
        pynvml.NVMLError: If a device query fails after NVML was initialized
            successfully; NVML is still shut down in that case.
    """
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        return []
    try:
        gpus = []
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            # Older pynvml / nvidia-ml-py releases return bytes here.
            # Normalize to str so the name prints cleanly and stays
            # JSON-serializable regardless of the installed version.
            if isinstance(name, bytes):
                name = name.decode("utf-8", errors="replace")
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpus.append(
                {
                    "index": i,
                    "name": name,
                    "total_mib": mem.total // (1024 * 1024),
                }
            )
        return gpus
    finally:
        # Always pair the successful nvmlInit() with a shutdown.
        pynvml.nvmlShutdown()


def auto_worker_count(
    gpus: list[dict],
    vram_limit: float,
    test_profiled_gibs: list[float] | None = None,
) -> int:
    """Return the total number of worker slots to use for ``-n auto``.

    The per-GPU budget is the smallest GPU's total memory (in GiB) reduced by
    VRAM_MULTI_PROC_MARGIN.  That budget is divided by the smallest positive
    entry of ``test_profiled_gibs`` when there is one, otherwise by
    ``vram_limit``; the truncated quotient (at least 1) is the slot count per
    GPU, and the result is that count multiplied by the number of GPUs.

    With no GPUs or a non-positive ``vram_limit`` the result is simply the
    number of GPUs, or 1 when there are none.
    """
    gpu_count = len(gpus)
    if gpu_count == 0 or vram_limit <= 0:
        return gpu_count or 1

    # Size the budget from the smallest GPU so the slot count fits every GPU.
    smallest_gpu_gib = min(gpu["total_mib"] for gpu in gpus) / 1024.0
    usable_gib = smallest_gpu_gib * (1.0 - VRAM_MULTI_PROC_MARGIN)

    # Prefer the smallest positive profiled size; otherwise use the limit.
    per_test_gib = vram_limit
    positive_sizes = [size for size in (test_profiled_gibs or ()) if size > 0]
    if positive_sizes:
        per_test_gib = min(positive_sizes)

    if per_test_gib > 0:
        slots_per_gpu = max(1, int(usable_gib / per_test_gib))
    else:
        slots_per_gpu = 1
    return gpu_count * slots_per_gpu


def write_test_meta(items, dest_dir: str | None = None) -> None:
    """Serialize per-test marker metadata to a JSON file.

    Called from pytest_collection_modifyitems so the GPU orchestrator can
    read test metadata without re-collecting.

    For every item, the first positional argument of each of these markers
    is stored under the marker's own name: profiled_vram_gib,
    requested_vllm_kv_cache_bytes, timeout, requested_sglang_kv_tokens,
    requested_trtllm_kv_tokens, requested_trtllm_vram_gib.  A ``skip`` marker
    is stored as 'skip_reason' (its ``reason`` keyword, else its first
    positional argument, else the string "skipped").  Items without any of
    these markers are omitted.

    Args:
        items: Collected pytest items; each must provide
            ``get_closest_marker(name)`` and ``nodeid``.
        dest_dir: Directory to write into.  Defaults to the system temp
            directory when None or empty.

    Nothing is written (and an existing file is left untouched) when no item
    carries any of the markers above.

    Raises:
        OSError: If the file cannot be created or moved into place.
        TypeError: If a marker argument is not JSON-serializable.
    """
    # Markers copied verbatim; the order here is the key order in the JSON.
    value_markers = (
        "profiled_vram_gib",
        "requested_vllm_kv_cache_bytes",
        "timeout",
        "requested_sglang_kv_tokens",
        "requested_trtllm_kv_tokens",
        "requested_trtllm_vram_gib",
    )

    test_meta: dict[str, dict] = {}
    for item in items:
        meta: dict = {}
        for marker_name in value_markers:
            mark = item.get_closest_marker(marker_name)
            if mark and mark.args:
                meta[marker_name] = mark.args[0]
        skip_mark = item.get_closest_marker("skip")
        if skip_mark:
            reason = skip_mark.kwargs.get("reason", "")
            if not reason and skip_mark.args:
                reason = skip_mark.args[0]
            meta["skip_reason"] = reason or "skipped"
        if meta:
            test_meta[item.nodeid] = meta

    if not test_meta:
        return

    target_dir = dest_dir or tempfile.gettempdir()
    path = os.path.join(target_dir, _TEST_META_FILENAME)
    # Write to a private temp file and rename it into place.  Readers (and
    # other processes writing the same file) never observe a half-written
    # document, a failed dump leaves the previous file intact, and a symlink
    # planted at the predictable destination path is replaced rather than
    # followed.  NOTE: mkstemp creates the file readable by the owner only.
    fd, tmp_path = tempfile.mkstemp(
        dir=target_dir, prefix=_TEST_META_FILENAME + ".", suffix=".tmp"
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(test_meta, f)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave the scratch file behind; then surface the error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise

def load_test_meta(src_dir: str | None = None) -> dict[str, dict]:
    """Load the nodeid -> {profiled_vram_gib, timeout, ...} map.

    Args:
        src_dir: Directory containing the metadata file.  Defaults to the
            system temp directory when None or empty, which is also the
            default location used by write_test_meta(); pass the same value
            that was given to its ``dest_dir`` to read a custom location.

    Returns:
        The decoded mapping, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    path = os.path.join(src_dir or tempfile.gettempdir(), _TEST_META_FILENAME)
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Loading is best-effort: OSError covers missing/unreadable files,
        # ValueError covers json.JSONDecodeError and undecodable bytes.
        return {}
    # Callers index the result by nodeid; never hand back a list or scalar.
    return data if isinstance(data, dict) else {}


def print_gpu_plan(
    gpus: list[dict], vram_limit: float, would_run: list[tuple[str, float]]
) -> None:
    """Print the GPU-parallel plan section for --dry-run output.

    Args:
        gpus: GPU dicts with 'index', 'name' and 'total_mib' keys.  When
            empty, only the section header and a "no GPUs" line are printed.
        vram_limit: Per-test VRAM limit in GiB; used as the slot size when
            no entry of ``would_run`` has a positive size.
        would_run: (label, profiled GiB) pairs; only the size is used, and
            sizes that are None or not positive are ignored.
    """
    rule = "=" * 60

    if not gpus:
        # min() over an empty GPU list would raise ValueError; a dry run on
        # a machine without GPUs should still produce readable output.
        print(f"\n{rule}")
        print("GPU-Parallel Plan")
        print(rule)
        print("  No GPUs detected; no GPU-parallel plan available.")
        return

    # Budget is sized from the smallest GPU, minus the multi-process margin.
    min_gpu_gib = min(g["total_mib"] for g in gpus) / 1024.0
    budget_gib = min_gpu_gib * (1.0 - VRAM_MULTI_PROC_MARGIN)
    profiled_gibs = [gib for _, gib in would_run if gib is not None and gib > 0]
    min_test_gib = min(profiled_gibs) if profiled_gibs else vram_limit
    auto_slots = max(1, int(budget_gib / min_test_gib)) if min_test_gib > 0 else 1

    print(f"\n{rule}")
    print("GPU-Parallel Plan")
    print(rule)
    for gpu in gpus:
        gib = gpu["total_mib"] / 1024
        print(f"  GPU {gpu['index']}: {gpu['name']} ({gib:.1f} GiB)")
    print(f"\n  Usable VRAM: {budget_gib:.0f} GiB")
    print("\n  Run options:")
    print("    (no -n)  : sequential, 1 test at a time")
    print(
        f"    -n auto  : up to {auto_slots} slots per GPU "
        f"({budget_gib:.0f} / {min_test_gib:.0f} GiB smallest test)"
    )
    print(f"    -n N     : N concurrent slots across {len(gpus)} GPU(s)")
    print("\n  Usage:")
    print(
        f"    pytest --max-vram-gib={vram_limit:.0f} -n {auto_slots} "
        f'-m "gpu_1 and vllm" tests/serve/'
    )