# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from unittest.mock import MagicMock, patch

import torch
from vllm_test_utils.monitor import monitor

from vllm.utils.mem_utils import MemorySnapshot, memory_profiling

from ..utils import create_new_process_for_each_test


@create_new_process_for_each_test()
def test_memory_profiling():
    """Check ``memory_profiling`` against a scripted allocation sequence.

    The test stages raw CUDA allocations (outside of torch) and torch tensor
    allocations of known sizes, then compares the profiler's reported
    non-torch increase and torch peak increase with those sizes.
    """
    # Imported lazily inside the test, as in the original layout.
    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

    mib = 1024 * 1024

    cudart = CudaRTLibrary()

    # Memory used by other processes shows up as cuda usage outside of torch;
    # fake that with a 512 MiB raw allocation made before the baseline.
    external_alloc = cudart.cudaMalloc(512 * mib)

    baseline = MemorySnapshot()

    # "Load weights": 128 * 1024 * 1024 float32 elements == 512 MiB.
    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
    weights_nbytes = 512 * mib

    def measure_current_non_torch():
        """Return used device memory minus what torch reports as reserved."""
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        return (total_bytes - free_bytes) - torch.accelerator.memory_reserved()

    with (
        memory_profiling(
            baseline_snapshot=baseline, weights_memory=weights_nbytes
        ) as result,
        monitor(measure_current_non_torch) as monitored_values,
    ):
        # make a memory spike, 1 GiB
        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
        del spike

        # Extra 256 MiB of non-torch memory (simulates NCCL).
        nccl_like_alloc = cudart.cudaMalloc(256 * mib)

    # This value is analytic and therefore exact: the only non-torch growth
    # while monitoring is the 256 MiB allocation above.
    samples = monitored_values.values
    assert samples[-1] - samples[0] == 256 * mib

    # The profiler's own figure is only required to be within 5% of the
    # expected value. The tolerance comes from the cuda runtime, which cannot
    # be controlled at byte granularity (error is <10 MiB in practice).
    non_torch_ratio = result.non_torch_increase / (256 * mib)  # noqa
    assert abs(non_torch_ratio - 1) <= 0.05
    assert result.torch_peak_increase == 1024 * mib

    # Release everything the test allocated.
    del weights
    cudart.cudaFree(external_alloc)
    cudart.cudaFree(nccl_like_alloc)


def test_memory_snapshot_uses_psutil_on_integrated_gpu():
    """On integrated (UMA) GPUs, free_memory should come from psutil."""
    gib = 1024**3
    cuda_free = 40 * gib
    cuda_total = 120 * gib
    psutil_available = 100 * gib

    with (
        patch("vllm.utils.mem_utils.current_platform") as platform_mock,
        patch("vllm.utils.mem_utils.psutil") as psutil_mock,
    ):
        # Platform stub reporting an integrated GPU.
        platform_mock.mem_get_info.return_value = (cuda_free, cuda_total)
        platform_mock.is_integrated_gpu.return_value = True
        platform_mock.memory_stats.return_value = {"allocated_bytes.all.peak": 0}
        platform_mock.memory_reserved.return_value = 0
        platform_mock.current_device = lambda: "cuda:0"

        # psutil stub whose virtual_memory() exposes a distinct `available`.
        psutil_mock.virtual_memory.return_value = MagicMock(
            available=psutil_available
        )

        snapshot = MemorySnapshot(device="cuda:0")

        # Free memory must match psutil; total still comes from mem_get_info.
        assert snapshot.free_memory == psutil_available
        assert snapshot.total_memory == cuda_total
        psutil_mock.virtual_memory.assert_called_once()


def test_memory_snapshot_uses_cuda_on_discrete_gpu():
    """On discrete GPUs, free_memory should come from CUDA mem_get_info."""
    gib = 1024**3
    cuda_free = 70 * gib
    cuda_total = 80 * gib

    with (
        patch("vllm.utils.mem_utils.current_platform") as platform_mock,
        patch("vllm.utils.mem_utils.psutil") as psutil_mock,
    ):
        # Platform stub reporting a discrete GPU.
        platform_mock.mem_get_info.return_value = (cuda_free, cuda_total)
        platform_mock.is_integrated_gpu.return_value = False
        platform_mock.memory_stats.return_value = {"allocated_bytes.all.peak": 0}
        platform_mock.memory_reserved.return_value = 0
        platform_mock.current_device = lambda: "cuda:0"

        snapshot = MemorySnapshot(device="cuda:0")

        # Both figures must match mem_get_info, and psutil must stay unused.
        assert snapshot.free_memory == cuda_free
        assert snapshot.total_memory == cuda_total
        psutil_mock.virtual_memory.assert_not_called()