[Bugfix] Use is_integrated to detect UMA GPUs for memory reporting (#35356)

Signed-off-by: haosdent <haosdent@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Bugfix] Use is_integrated to detect UMA GPUs for memory reporting (#35356)
Signed-off-by: haosdent <haosdent@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
995e9a20 · haosdent · GitHub · 739e5945 · 995e9a20 · 995e9a20
Unverified Commit 995e9a20 authored Apr 14, 2026 by haosdent Committed by GitHub Apr 13, 2026
4 changed files
--- a/tests/utils_/test_mem_utils.py
+++ b/tests/utils_/test_mem_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from unittest.mock import MagicMock, patch
 import torch
 from vllm_test_utils.monitor import monitor
@@ -61,3 +63,62 @@ def test_memory_profiling():
    del weights
    lib.cudaFree(handle1)
    lib.cudaFree(handle2)
+def test_memory_snapshot_uses_psutil_on_integrated_gpu():
+    """On integrated (UMA) GPUs, free_memory should come from psutil.
+
+    ``current_platform`` and ``psutil`` are both patched inside
+    ``vllm.utils.mem_utils``; the test only checks which of the two mocked
+    sources ends up in the snapshot.
+    """
+    # The three values are deliberately distinct so the assertions below can
+    # tell which source each snapshot field was taken from.
+    mock_cuda_free = 40 * 1024**3
+    mock_cuda_total = 120 * 1024**3
+    mock_psutil_available = 100 * 1024**3
+    with (
+        patch("vllm.utils.mem_utils.current_platform") as mock_platform,
+        patch("vllm.utils.mem_utils.psutil") as mock_psutil,
+    ):
+        # mem_get_info() yields a (free, total) pair.
+        mock_platform.mem_get_info.return_value = (
+            mock_cuda_free,
+            mock_cuda_total,
+        )
+        # Report the device as integrated so the psutil branch is taken.
+        mock_platform.is_integrated_gpu.return_value = True
+        # Remaining platform queries are stubbed with zeros; presumably
+        # MemorySnapshot reads them while measuring (not visible in this diff).
+        mock_platform.memory_stats.return_value = {
+            "allocated_bytes.all.peak": 0,
+        }
+        mock_platform.memory_reserved.return_value = 0
+        mock_platform.current_device = lambda: "cuda:0"
+        # psutil.virtual_memory() returns an object whose ``available``
+        # attribute is the value expected in the snapshot.
+        mock_vmem = MagicMock()
+        mock_vmem.available = mock_psutil_available
+        mock_psutil.virtual_memory.return_value = mock_vmem
+        snapshot = MemorySnapshot(device="cuda:0")
+        # Free memory is overridden by psutil; total memory still comes from
+        # mem_get_info().
+        assert snapshot.free_memory == mock_psutil_available
+        assert snapshot.total_memory == mock_cuda_total
+        mock_psutil.virtual_memory.assert_called_once()
+def test_memory_snapshot_uses_cuda_on_discrete_gpu():
+    """On discrete GPUs, free_memory should come from CUDA mem_get_info.
+
+    ``current_platform`` and ``psutil`` are both patched inside
+    ``vllm.utils.mem_utils``; psutil must not be consulted at all.
+    """
+    mock_cuda_free = 70 * 1024**3
+    mock_cuda_total = 80 * 1024**3
+    with (
+        patch("vllm.utils.mem_utils.current_platform") as mock_platform,
+        patch("vllm.utils.mem_utils.psutil") as mock_psutil,
+    ):
+        # mem_get_info() yields a (free, total) pair.
+        mock_platform.mem_get_info.return_value = (
+            mock_cuda_free,
+            mock_cuda_total,
+        )
+        # Report the device as NOT integrated so the psutil branch is skipped.
+        mock_platform.is_integrated_gpu.return_value = False
+        # Remaining platform queries are stubbed with zeros; presumably
+        # MemorySnapshot reads them while measuring (not visible in this diff).
+        mock_platform.memory_stats.return_value = {
+            "allocated_bytes.all.peak": 0,
+        }
+        mock_platform.memory_reserved.return_value = 0
+        mock_platform.current_device = lambda: "cuda:0"
+        snapshot = MemorySnapshot(device="cuda:0")
+        # Both fields come straight from mem_get_info(), and psutil is never
+        # touched.
+        assert snapshot.free_memory == mock_cuda_free
+        assert snapshot.total_memory == mock_cuda_total
+        mock_psutil.virtual_memory.assert_not_called()
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -548,6 +548,10 @@ class CudaPlatformBase(Platform):
        """Currently, only Hopper and Blackwell GPUs are supported."""
        return cls.is_device_capability(90) or cls.is_device_capability_family(100)
+    @classmethod
+    def is_integrated_gpu(cls, device_id: int = 0) -> bool:
+        """Return whether CUDA device ``device_id`` is reported as integrated.
+
+        Reads the ``is_integrated`` field of
+        ``torch.cuda.get_device_properties(device_id)`` and coerces it to
+        ``bool``.
+        """
+        return bool(torch.cuda.get_device_properties(device_id).is_integrated)
    @classmethod
    def num_compute_units(cls, device_id: int = 0) -> int:
        return torch.cuda.get_device_properties(device_id).multi_processor_count

--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -734,6 +734,18 @@ class Platform:
        """
        return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase"  # noqa
+    @classmethod
+    def is_integrated_gpu(cls, device_id: int = 0) -> bool:
+        """
+        Returns whether the GPU is an integrated (UMA) device that shares
+        system memory with the CPU.
+
+        On UMA systems (e.g. NVIDIA GH200, DGX Spark, Jetson Orin),
+        cudaMemGetInfo may underreport free memory because it does not
+        account for reclaimable OS memory (page cache, buffers).
+
+        This base implementation ignores ``device_id`` and always returns
+        ``False``; platform subclasses that can detect integrated devices
+        are expected to override it.
+
+        Args:
+            device_id: Index of the device to query.
+
+        Returns:
+            ``True`` if the device is integrated, ``False`` otherwise.
+        """
+        return False
    @classmethod
    def supports_mx(cls) -> bool:
        """

--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -106,22 +106,12 @@ class MemorySnapshot:
        )
        self.free_memory, self.total_memory = current_platform.mem_get_info(device)
-        shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1))  # Orin, Thor, Spark
+        if current_platform.is_integrated_gpu(device.index):
-        if (
+            # On UMA (Unified Memory Architecture) platforms where CPU and
-            current_platform.is_cuda()
+            # GPU share physical memory (e.g. GH200, DGX Spark, Jetson Orin),
-            and current_platform.get_device_capability(device.index)
+            # cudaMemGetInfo underreports free memory because it does not
-            in shared_sysmem_device_mem_sms
+            # account for reclaimable OS memory (page cache, buffers).
-        ):
+            # Use psutil to get the true available memory.
-            # On UMA (Orin, Thor and Spark) platform,
-            # where both CPU and GPU rely on system memory,
-            # the cudaMemGetInfo function shows the amount of free system memory
-            # rather than what’s actually available.
-            # In the case,
-            # torch.cuda.mem_get_info() only reports "free" memory,
-            # which can be lower than what is actually
-            # available due to not including cache memory.
-            # There’s also a comprehensive reference page
-            # that explains how you can compute the proper value yourself.
            # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device
            self.free_memory = psutil.virtual_memory().available