Unverified Commit 0b20745e authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

feat: GPU VRAM profiler via memory fraction injection + profiled test markers...


feat: GPU VRAM profiler via memory fraction injection + profiled test markers (part 2 - vLLM only) (#6719)
Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent d047851e
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Mock GPU allocation test for profiler validation.
Local-only: this test is skipped in CI (GitHub Actions / GitLab CI).
Do NOT mark it as pre_merge, post_merge, nightly, or e2e -- it exists
solely to validate profile_pytest.py's binary search locally.
"""
import logging
import os
import time
import pytest
pytestmark = pytest.mark.skipif(
os.environ.get("CI") is not None
or os.environ.get("GITHUB_ACTIONS") is not None
or os.environ.get("GITLAB_CI") is not None,
reason="Mock GPU allocation test is for local profiling only, not CI",
)
torch = pytest.importorskip("torch", reason="torch required for GPU allocation test")
logger = logging.getLogger(__name__)
ALLOC_MIB = 4096 # 4 GiB
# This cannot be pre_merge, post_merge, nightly, or e2e. It's a mock test for local testing.
@pytest.mark.gpu_1
@pytest.mark.timeout(30)
def test_mock_4gb_gpu_alloc():
"""Allocate 4 GiB of GPU VRAM, hold 2s, release. Honors _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE."""
if not torch.cuda.is_available():
pytest.skip("CUDA not available")
device = 0
total_mib = torch.cuda.get_device_properties(device).total_memory / (1024 * 1024)
gpu_util = os.environ.get("_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE")
if gpu_util is not None:
cap_mib = total_mib * float(gpu_util)
logger.info(
"_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE=%.2f -> cap %.0f MiB (%.1f GiB) of %.0f MiB total",
float(gpu_util),
cap_mib,
cap_mib / 1024,
total_mib,
)
if ALLOC_MIB > cap_mib:
raise RuntimeError(
f"Requested {ALLOC_MIB} MiB exceeds _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE "
f"cap of {cap_mib:.0f} MiB ({gpu_util})"
)
num_elements = (ALLOC_MIB * 1024 * 1024) // 4
logger.info(
"Allocating %d MiB (%.1f GiB) on cuda:%d ...",
ALLOC_MIB,
ALLOC_MIB / 1024,
device,
)
tensor = torch.empty(num_elements, dtype=torch.float32, device=f"cuda:{device}")
logger.info(
"Allocated. torch reports %.0f MiB in use.",
torch.cuda.memory_allocated(device) / (1024 * 1024),
)
time.sleep(2.0)
del tensor
torch.cuda.empty_cache()
logger.info(
"Released. torch reports %.0f MiB in use.",
torch.cuda.memory_allocated(device) / (1024 * 1024),
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment