Unverified Commit 995e9a20 authored by haosdent, committed by GitHub
Browse files

[Bugfix] Use is_integrated to detect UMA GPUs for memory reporting (#35356)


Signed-off-by: haosdent <haosdent@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 739e5945
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import torch import torch
from vllm_test_utils.monitor import monitor from vllm_test_utils.monitor import monitor
...@@ -61,3 +63,62 @@ def test_memory_profiling(): ...@@ -61,3 +63,62 @@ def test_memory_profiling():
del weights del weights
lib.cudaFree(handle1) lib.cudaFree(handle1)
lib.cudaFree(handle2) lib.cudaFree(handle2)
def test_memory_snapshot_uses_psutil_on_integrated_gpu():
    """On integrated (UMA) GPUs, free_memory should come from psutil."""
    gib = 1024**3
    cuda_free_bytes = 40 * gib
    cuda_total_bytes = 120 * gib
    psutil_available_bytes = 100 * gib

    with (
        patch("vllm.utils.mem_utils.current_platform") as platform_mock,
        patch("vllm.utils.mem_utils.psutil") as psutil_mock,
    ):
        # The platform claims to be integrated, and its CUDA-side "free"
        # figure deliberately differs from the value psutil will report.
        platform_mock.is_integrated_gpu.return_value = True
        platform_mock.mem_get_info.return_value = (cuda_free_bytes, cuda_total_bytes)
        platform_mock.memory_stats.return_value = {"allocated_bytes.all.peak": 0}
        platform_mock.memory_reserved.return_value = 0
        platform_mock.current_device = lambda: "cuda:0"

        # virtual_memory() returns an auto-created mock; only its
        # `available` attribute is configured here.
        psutil_mock.virtual_memory.return_value.available = psutil_available_bytes

        snapshot = MemorySnapshot(device="cuda:0")

        # Free memory must be the psutil value, while the total still
        # comes from mem_get_info.
        assert snapshot.free_memory == psutil_available_bytes
        assert snapshot.total_memory == cuda_total_bytes
        psutil_mock.virtual_memory.assert_called_once()
def test_memory_snapshot_uses_cuda_on_discrete_gpu():
    """On discrete GPUs, free_memory should come from CUDA mem_get_info."""
    gib = 1024**3
    cuda_free_bytes = 70 * gib
    cuda_total_bytes = 80 * gib

    with (
        patch("vllm.utils.mem_utils.current_platform") as platform_mock,
        patch("vllm.utils.mem_utils.psutil") as psutil_mock,
    ):
        # The platform reports a non-integrated GPU.
        platform_mock.is_integrated_gpu.return_value = False
        platform_mock.mem_get_info.return_value = (cuda_free_bytes, cuda_total_bytes)
        platform_mock.memory_stats.return_value = {"allocated_bytes.all.peak": 0}
        platform_mock.memory_reserved.return_value = 0
        platform_mock.current_device = lambda: "cuda:0"

        snapshot = MemorySnapshot(device="cuda:0")

        # Both figures must be exactly what mem_get_info returned, and
        # psutil must not have been consulted.
        assert snapshot.free_memory == cuda_free_bytes
        assert snapshot.total_memory == cuda_total_bytes
        psutil_mock.virtual_memory.assert_not_called()
...@@ -548,6 +548,10 @@ class CudaPlatformBase(Platform): ...@@ -548,6 +548,10 @@ class CudaPlatformBase(Platform):
"""Currently, only Hopper and Blackwell GPUs are supported.""" """Currently, only Hopper and Blackwell GPUs are supported."""
return cls.is_device_capability(90) or cls.is_device_capability_family(100) return cls.is_device_capability(90) or cls.is_device_capability_family(100)
@classmethod
def is_integrated_gpu(cls, device_id: int = 0) -> bool:
    """Return whether CUDA device ``device_id`` reports itself as integrated.

    Args:
        device_id: Index of the CUDA device to query. Defaults to 0.

    Returns:
        The ``is_integrated`` field of the device properties, coerced
        to ``bool``.
    """
    device_props = torch.cuda.get_device_properties(device_id)
    return bool(device_props.is_integrated)
@classmethod
def num_compute_units(cls, device_id: int = 0) -> int:
    """Return the multiprocessor count reported for CUDA device ``device_id``.

    Args:
        device_id: Index of the CUDA device to query. Defaults to 0.
    """
    device_props = torch.cuda.get_device_properties(device_id)
    return device_props.multi_processor_count
......
...@@ -734,6 +734,18 @@ class Platform: ...@@ -734,6 +734,18 @@ class Platform:
""" """
return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase" # noqa return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase" # noqa
@classmethod
def is_integrated_gpu(cls, device_id: int = 0) -> bool:
    """
    Tell whether the GPU is an integrated (UMA) device, i.e. one that
    shares system memory with the CPU.

    On UMA systems (e.g. NVIDIA GH200, DGX Spark, Jetson Orin),
    cudaMemGetInfo may underreport free memory because it does not
    account for reclaimable OS memory (page cache, buffers).

    This base implementation ignores ``device_id`` and always answers
    ``False``.
    """
    integrated = False
    return integrated
@classmethod @classmethod
def supports_mx(cls) -> bool: def supports_mx(cls) -> bool:
""" """
......
...@@ -106,22 +106,12 @@ class MemorySnapshot: ...@@ -106,22 +106,12 @@ class MemorySnapshot:
) )
self.free_memory, self.total_memory = current_platform.mem_get_info(device) self.free_memory, self.total_memory = current_platform.mem_get_info(device)
shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark if current_platform.is_integrated_gpu(device.index):
if ( # On UMA (Unified Memory Architecture) platforms where CPU and
current_platform.is_cuda() # GPU share physical memory (e.g. GH200, DGX Spark, Jetson Orin),
and current_platform.get_device_capability(device.index) # cudaMemGetInfo underreports free memory because it does not
in shared_sysmem_device_mem_sms # account for reclaimable OS memory (page cache, buffers).
): # Use psutil to get the true available memory.
# On UMA (Orin, Thor and Spark) platform,
# where both CPU and GPU rely on system memory,
# the cudaMemGetInfo function shows the amount of free system memory
# rather than what’s actually available.
# In the case,
# torch.cuda.mem_get_info() only reports "free" memory,
# which can be lower than what is actually
# available due to not including cache memory.
# There’s also a comprehensive reference page
# that explains how you can compute the proper value yourself.
# https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device
self.free_memory = psutil.virtual_memory().available self.free_memory = psutil.virtual_memory().available
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment