feat: GPU VRAM profiler via memory fraction injection + profiled test markers...

feat: GPU VRAM profiler via memory fraction injection + profiled test markers (part 2 - vLLM only) (#6719) Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat: GPU VRAM profiler via memory fraction injection + profiled test markers...
feat: GPU VRAM profiler via memory fraction injection + profiled test markers (part 2 - vLLM only) (#6719) Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
0b20745e · Keiven C · GitHub · d047851e · 0b20745e
Unverified Commit 0b20745e authored Mar 18, 2026 by Keiven C Committed by GitHub Mar 18, 2026
Show whitespace changes
Inline Side-by-side

Showing with 79 additions and 0 deletions

tests/utils/test_mock_gpu_alloc.py tests/utils/test_mock_gpu_alloc.py +79 -0

No files found.
--- a/tests/utils/test_mock_gpu_alloc.py
+++ b/tests/utils/test_mock_gpu_alloc.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Mock GPU allocation test for profiler validation.
+
+Local-only: this test is skipped in CI (GitHub Actions / GitLab CI).
+Do NOT mark it as pre_merge, post_merge, nightly, or e2e -- it exists
+solely to validate profile_pytest.py's binary search locally.
+"""
+
+import logging
+import os
+import time
+
+import pytest
+
+pytestmark = pytest.mark.skipif(
+    os.environ.get("CI") is not None
+    or os.environ.get("GITHUB_ACTIONS") is not None
+    or os.environ.get("GITLAB_CI") is not None,
+    reason="Mock GPU allocation test is for local profiling only, not CI",
+)
+
+torch = pytest.importorskip("torch", reason="torch required for GPU allocation test")
+
+logger = logging.getLogger(__name__)
+
+ALLOC_MIB = 4096  # 4 GiB
+
+
+# This cannot be pre_merge, post_merge, nightly, or e2e. It's a mock test for local testing.
+@pytest.mark.gpu_1
+@pytest.mark.timeout(30)
+def test_mock_4gb_gpu_alloc():
+    """Allocate 4 GiB of GPU VRAM, hold 2s, release. Honors _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE."""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    device = 0
+    total_mib = torch.cuda.get_device_properties(device).total_memory / (1024 * 1024)
+
+    gpu_util = os.environ.get("_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE")
+    if gpu_util is not None:
+        cap_mib = total_mib * float(gpu_util)
+        logger.info(
+            "_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE=%.2f -> cap %.0f MiB (%.1f GiB) of %.0f MiB total",
+            float(gpu_util),
+            cap_mib,
+            cap_mib / 1024,
+            total_mib,
+        )
+        if ALLOC_MIB > cap_mib:
+            raise RuntimeError(
+                f"Requested {ALLOC_MIB} MiB exceeds _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE "
+                f"cap of {cap_mib:.0f} MiB ({gpu_util})"
+            )
+
+    num_elements = (ALLOC_MIB * 1024 * 1024) // 4
+    logger.info(
+        "Allocating %d MiB (%.1f GiB) on cuda:%d ...",
+        ALLOC_MIB,
+        ALLOC_MIB / 1024,
+        device,
+    )
+
+    tensor = torch.empty(num_elements, dtype=torch.float32, device=f"cuda:{device}")
+    logger.info(
+        "Allocated. torch reports %.0f MiB in use.",
+        torch.cuda.memory_allocated(device) / (1024 * 1024),
+    )
+
+    time.sleep(2.0)
+
+    del tensor
+    torch.cuda.empty_cache()
+    logger.info(
+        "Released. torch reports %.0f MiB in use.",
+        torch.cuda.memory_allocated(device) / (1024 * 1024),
+    )