# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Mock GPU allocation test for profiler validation.

Local-only: this test is skipped in CI (GitHub Actions / GitLab CI).
Do NOT mark it as pre_merge, post_merge, nightly, or e2e -- it exists
solely to validate profile_pytest.py's binary search locally.
"""

import logging
import os
import time

import pytest

# Module-wide skip: any of these variables being present in the environment
# (with any value, including the empty string) is treated as "running in CI".
pytestmark = pytest.mark.skipif(
    any(name in os.environ for name in ("CI", "GITHUB_ACTIONS", "GITLAB_CI")),
    reason="Mock GPU allocation test is for local profiling only, not CI",
)

# Import torch through pytest so that a missing install is reported as a skip
# with the reason below instead of an ImportError.
torch = pytest.importorskip("torch", reason="torch required for GPU allocation test")

logger = logging.getLogger(__name__)

# Size of the mock allocation in MiB (4 GiB).
ALLOC_MIB = 4 * 1024


# This cannot be pre_merge, post_merge, nightly, or e2e. It's a mock test for local testing.
@pytest.mark.gpu_1
@pytest.mark.timeout(30)
def test_mock_4gb_gpu_alloc():
    """Allocate 4 GiB of GPU VRAM, hold 2s, release. Honors _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES.

    The test is skipped when CUDA is not available. When the environment
    variable ``_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`` is set, it is parsed as
    an integer byte count and used as a cap: if ``ALLOC_MIB`` exceeds the cap,
    the test fails before any GPU memory is allocated.

    Raises:
        RuntimeError: If ``ALLOC_MIB`` is larger than the cap derived from
            ``_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES``.
        ValueError: If ``_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES`` is set but is
            not a valid integer (propagated from ``int()``).
    """
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    device = 0
    total_mib = torch.cuda.get_device_properties(device).total_memory / (1024 * 1024)

    kv_bytes_str = os.environ.get("_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES")
    if kv_bytes_str is not None:
        # The override is expressed in bytes; convert to MiB to compare with ALLOC_MIB.
        cap_mib = int(kv_bytes_str) / (1024 * 1024)
        logger.info(
            "_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=%s -> cap %.0f MiB (%.1f GiB) of %.0f MiB total",
            kv_bytes_str,
            cap_mib,
            cap_mib / 1024,
            total_mib,
        )
        if ALLOC_MIB > cap_mib:
            raise RuntimeError(
                f"Requested {ALLOC_MIB} MiB exceeds KV cache cap "
                f"of {cap_mib:.0f} MiB ({kv_bytes_str} bytes)"
            )

    # float32 elements are 4 bytes each, so this many elements occupy ALLOC_MIB MiB.
    num_elements = (ALLOC_MIB * 1024 * 1024) // 4
    logger.info(
        "Allocating %d MiB (%.1f GiB) on cuda:%d ...",
        ALLOC_MIB,
        ALLOC_MIB / 1024,
        device,
    )

    tensor = torch.empty(num_elements, dtype=torch.float32, device=f"cuda:{device}")
    try:
        logger.info(
            "Allocated. torch reports %.0f MiB in use.",
            torch.cuda.memory_allocated(device) / (1024 * 1024),
        )

        # Hold the allocation for a fixed interval before releasing it.
        time.sleep(2.0)
    finally:
        # Release in ``finally`` so the allocation is dropped even if the hold
        # phase raises (e.g. an interrupt during the sleep); otherwise the
        # local reference would remain alive in this frame.
        del tensor
        torch.cuda.empty_cache()

    logger.info(
        "Released. torch reports %.0f MiB in use.",
        torch.cuda.memory_allocated(device) / (1024 * 1024),
    )