add large_gpu_test

0dc55ec0 · zhuwenwen · 47be5a1c · 0dc55ec0 · 0dc55ec0 · 0dc55ec0
Commit 0dc55ec0 authored Dec 03, 2024 by zhuwenwen
6 changed files
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -23,7 +23,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.model_executor.model_loader.loader import get_model_loader
 from vllm.platforms import current_platform
-from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
+from vllm.utils import (FlexibleArgumentParser, GB_bytes,cuda_device_count_stateless,
                        get_open_port, is_hip)
 import vllm.envs as envs
 import os
@@ -459,6 +459,36 @@ def fork_new_process_for_each_test(
    return wrapper
+def large_gpu_test(*, min_gb: int):
+    """
+    Decorate a test to be skipped if no GPU is available or it does not have
+    sufficient memory.
+    Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
+    """
+    try:
+        if current_platform.is_cpu():
+            memory_gb = 0
+        else:
+            memory_gb = current_platform.get_device_total_memory() / GB_bytes
+    except Exception as e:
+        warnings.warn(
+            f"An error occurred when finding the available memory: {e}",
+            stacklevel=2,
+        )
+        memory_gb = 0
+    test_skipif = pytest.mark.skipif(
+        memory_gb < min_gb,
+        reason=f"Need at least {memory_gb}GB GPU memory to run the test.",
+    )
+    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
+        return test_skipif(fork_new_process_for_each_test(f))
+    return wrapper
 def multi_gpu_test(*, num_gpus: int):
    """

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
+import psutil
 import torch
 from .interface import Platform, PlatformEnum
@@ -9,6 +10,10 @@ class CpuPlatform(Platform):
    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return "cpu"
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        return psutil.virtual_memory().total
    @classmethod
    def inference_mode(cls):

--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -83,6 +83,11 @@ class Platform:
    def get_device_name(cls, device_id: int = 0) -> str:
        raise NotImplementedError
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Get the total memory of a device in bytes.
+
+        Args:
+            device_id: Index of the device to query (defaults to 0).
+
+        Raises:
+            NotImplementedError: Always, in this base implementation;
+                platform subclasses are expected to override it.
+        """
+        raise NotImplementedError
    @classmethod
    def inference_mode(cls):
        """A device-specific wrapper of `torch.inference_mode`.

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -29,3 +29,8 @@ class RocmPlatform(Platform):
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(device_id)
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        device_props = torch.cuda.get_device_properties(device_id)
+        return device_props.total_memory
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -9,6 +9,10 @@ class TpuPlatform(Platform):
    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        raise NotImplementedError
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Total device memory in bytes; not implemented for this platform.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError
    @classmethod
    def inference_mode(cls):

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -119,6 +119,9 @@ STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
 STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
 STR_INVALID_VAL: str = "INVALID"
+# Decimal (SI) unit: 10**9 bytes, distinct from the binary GiB (2**30 bytes).
+GB_bytes = 1_000_000_000
+"""The number of bytes in one gigabyte (GB)."""
 GiB_bytes = 1 << 30
 """The number of bytes in one gibibyte (GiB)."""