[CI/Build] Avoid CUDA initialization (#8534)

6ffa3f31 · Cyrus Leung · GitHub · e3515729 · 6ffa3f31 · 6ffa3f31
Unverified Commit 6ffa3f31 authored Sep 18, 2024 by Cyrus Leung Committed by GitHub Sep 18, 2024
15 changed files
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -29,8 +29,9 @@ def query_marlin_supported_quant_types(has_zp: bool,
                                       device_capability: Optional[int] = None
                                       ):
    if device_capability is None:
-        major, minor = current_platform.get_device_capability()
-        device_capability = major * 10 + minor
+        capability_tuple = current_platform.get_device_capability()
+        device_capability = (-1 if capability_tuple is None else
+                             capability_tuple.to_int())

    if device_capability < 80:
        return []
@@ -52,8 +53,9 @@ def _check_marlin_supported(
        device_capability: Optional[int] = None) -> Tuple[bool, Optional[str]]:

    if device_capability is None:
-        major, minor = current_platform.get_device_capability()
-        device_capability = major * 10 + minor
+        capability_tuple = current_platform.get_device_capability()
+        device_capability = (-1 if capability_tuple is None else
+                             capability_tuple.to_int())

    supported_types = query_marlin_supported_quant_types(
        has_zp, device_capability)

--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -10,8 +10,7 @@ from .marlin_utils import marlin_make_workspace, marlin_permute_scales


 def is_fp8_marlin_supported():
-    capability = current_platform.get_device_capability()
-    return capability[0] >= 8
+    """Return whether the current platform has device capability >= 80.
+
+    The check is delegated to ``current_platform.has_device_capability``
+    with the integer form ``80``.
+    """
+    return current_platform.has_device_capability(80)


 def apply_fp8_marlin_linear(

--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -17,8 +17,9 @@ def cutlass_fp8_supported() -> bool:
    # cutlass is not supported on Rocm
    if is_hip():
        return False
-    capability = current_platform.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
+
+    capability_tuple = current_platform.get_device_capability()
+    capability = -1 if capability_tuple is None else capability_tuple.to_int()

    return ops.cutlass_scaled_mm_supports_fp8(capability)


--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -97,10 +97,10 @@ def _get_quantization_config(
    """Get the quantization config."""
    if model_config.quantization is not None:
        quant_config = get_quant_config(model_config, load_config)
-        capability = current_platform.get_device_capability()  # type: ignore
+        capability_tuple = current_platform.get_device_capability()

-        if capability is not None:
-            capability = capability[0] * 10 + capability[1]
+        if capability_tuple is not None:
+            capability = capability_tuple.to_int()
            if capability < quant_config.get_min_capability():
                raise ValueError(
                    f"The quantization method {model_config.quantization} "

--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -207,7 +207,7 @@ class Qwen2VisionAttention(nn.Module):
                selected_backend = backend_name_to_enum(backend_by_env_var)
        if selected_backend is None:
            # For Volta and Turing GPUs, use xformers instead.
-            device_available = current_platform.get_device_capability()[0] >= 8
+            device_available = current_platform.has_device_capability(80)
            if device_available:
                from transformers.utils import is_flash_attn_2_available


--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
 """Utils for model executor."""
-import random
 from typing import Any, Dict, Optional

-import numpy as np
 import torch

+from vllm.utils import seed_everything
+

 def set_random_seed(seed: int) -> None:
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
+    """Seed RNGs by delegating to ``vllm.utils.seed_everything``."""
+    seed_everything(seed)


 def set_weight_attrs(

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -6,10 +6,10 @@ from .interface import Platform, PlatformEnum
 class CpuPlatform(Platform):
+    """Platform implementation tagged with ``PlatformEnum.CPU``."""
+
    _enum = PlatformEnum.CPU

-    @staticmethod
-    def get_device_name(device_id: int = 0) -> str:
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Return ``"cpu"``; ``device_id`` is not consulted."""
        return "cpu"

-    @staticmethod
-    def inference_mode():
+    @classmethod
+    def inference_mode(cls):
+        """Return a ``torch.no_grad()`` context."""
        return torch.no_grad()
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -11,7 +11,7 @@ from typing_extensions import ParamSpec

 from vllm.logger import init_logger

-from .interface import Platform, PlatformEnum
+from .interface import DeviceCapability, Platform, PlatformEnum

 logger = init_logger(__name__)

@@ -96,19 +96,20 @@ def device_id_to_physical_device_id(device_id: int) -> int:
 class CudaPlatform(Platform):
    _enum = PlatformEnum.CUDA

-    @staticmethod
-    def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
+    @classmethod
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        """
+        Return the capability of ``device_id`` as a ``DeviceCapability``.
+
+        The id is first mapped through ``device_id_to_physical_device_id``;
+        the ``(major, minor)`` pair comes from
+        ``get_physical_device_capability``.
+        """
        physical_device_id = device_id_to_physical_device_id(device_id)
-        return get_physical_device_capability(physical_device_id)
+        major, minor = get_physical_device_capability(physical_device_id)
+        return DeviceCapability(major=major, minor=minor)

-    @staticmethod
-    def get_device_name(device_id: int = 0) -> str:
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """
+        Return the name of ``device_id``.
+
+        The id is first mapped through ``device_id_to_physical_device_id``;
+        the name comes from ``get_physical_device_name``.
+        """
        physical_device_id = device_id_to_physical_device_id(device_id)
        return get_physical_device_name(physical_device_id)

-    @staticmethod
+    @classmethod
    @with_nvml_context
-    def is_full_nvlink(physical_device_ids: List[int]) -> bool:
+    def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
        """
        query if the set of gpus are fully connected by nvlink (1 hop)
        """

--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
 import enum
-from typing import Optional, Tuple
+from typing import NamedTuple, Optional, Tuple, Union

 import torch

@@ -12,6 +12,23 @@ class PlatformEnum(enum.Enum):
    UNSPECIFIED = enum.auto()


+class DeviceCapability(NamedTuple):
+    """
+    Device capability expressed as a ``(major, minor)`` pair.
+
+    Being a :class:`NamedTuple`, an instance unpacks and compares like a
+    plain ``(major, minor)`` tuple.
+    """
+    major: int
+    minor: int
+
+    def as_version_str(self) -> str:
+        """Format the capability as ``"<major>.<minor>"``."""
+        return f"{self.major}.{self.minor}"
+
+    def to_int(self) -> int:
+        """
+        Express device capability as an integer ``<major><minor>``.
+
+        It is assumed that the minor version is always a single digit.
+
+        Raises:
+            ValueError: If ``minor`` is outside ``0..9``, because the
+                integer encoding would then be ambiguous.
+        """
+        # Raise explicitly rather than ``assert``: assertions are removed
+        # under ``python -O``, which would let an ambiguous value through.
+        if not 0 <= self.minor < 10:
+            raise ValueError(
+                f"Minor version must be a single digit, got {self.minor}")
+        return self.major * 10 + self.minor
+
+
 class Platform:
    _enum: PlatformEnum

@@ -27,16 +44,47 @@ class Platform:
    def is_cpu(self) -> bool:
        return self._enum == PlatformEnum.CPU

-    @staticmethod
-    def get_device_capability(device_id: int = 0) -> Optional[Tuple[int, int]]:
+    def is_cuda_alike(self) -> bool:
+        """
+        Stateless version of :func:`torch.cuda.is_available`.
+
+        True when this platform's ``_enum`` is ``PlatformEnum.CUDA`` or
+        ``PlatformEnum.ROCM``; only the enum is inspected.
+        """
+        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
+
+    @classmethod
+    def get_device_capability(
+        cls,
+        device_id: int = 0,
+    ) -> Optional[DeviceCapability]:
+        """
+        Stateless version of :func:`torch.cuda.get_device_capability`.
+
+        The base implementation returns ``None``, i.e. no capability is
+        reported.
+        """
        return None

-    @staticmethod
-    def get_device_name(device_id: int = 0) -> str:
+    @classmethod
+    def has_device_capability(
+        cls,
+        capability: Union[Tuple[int, int], int],
+        device_id: int = 0,
+    ) -> bool:
+        """
+        Report whether the device meets a minimum device capability.
+
+        ``capability`` is the required minimum, given either as:
+
+        - A tuple ``(major, minor)``, compared as a tuple.
+        - An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
+
+        Returns ``False`` when :meth:`get_device_capability` yields ``None``.
+        """
+        detected = cls.get_device_capability(device_id=device_id)
+        if detected is None:
+            return False
+
+        # Compare like with like: tuple against tuple, int against int.
+        is_tuple = isinstance(capability, tuple)
+        comparable = detected if is_tuple else detected.to_int()
+        return comparable >= capability
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Return the device name; the base class does not implement it."""
        raise NotImplementedError

-    @staticmethod
-    def inference_mode():
+    @classmethod
+    def inference_mode(cls):
        """A device-specific wrapper of `torch.inference_mode`.

        This wrapper is recommended because some hardware backends such as TPU

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
 import os
 from functools import lru_cache
-from typing import Tuple

 import torch

 from vllm.logger import init_logger

-from .interface import Platform, PlatformEnum
+from .interface import DeviceCapability, Platform, PlatformEnum

 logger = init_logger(__name__)

@@ -20,12 +19,13 @@ if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
 class RocmPlatform(Platform):
+    """Platform implementation tagged with ``PlatformEnum.ROCM``."""
+
    _enum = PlatformEnum.ROCM

-    @staticmethod
+    @classmethod
    @lru_cache(maxsize=8)
-    def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
-        return torch.cuda.get_device_capability(device_id)
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        """
+        Return the capability torch reports for ``device_id``.
+
+        Results are memoized by ``lru_cache`` (up to 8 entries).
+        """
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)

-    @staticmethod
+    @classmethod
    @lru_cache(maxsize=8)
-    def get_device_name(device_id: int = 0) -> str:
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """
+        Return the device name torch reports for ``device_id``.
+
+        Results are memoized by ``lru_cache`` (up to 8 entries).
+        """
        return torch.cuda.get_device_name(device_id)
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -6,6 +6,10 @@ from .interface import Platform, PlatformEnum
 class TpuPlatform(Platform):
+    """Platform implementation tagged with ``PlatformEnum.TPU``."""
+
    _enum = PlatformEnum.TPU

-    @staticmethod
-    def inference_mode():
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Not implemented for this platform; always raises."""
+        raise NotImplementedError
+
+    @classmethod
+    def inference_mode(cls):
+        """Return a ``torch.no_grad()`` context."""
        return torch.no_grad()
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -8,13 +8,15 @@ from huggingface_hub import file_exists, hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError
 from safetensors.torch import load_file as safe_load_file

+from vllm.platforms import current_platform
+
 WEIGHTS_NAME = "adapter_model.bin"
 SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors"


 # Get current device name based on available devices
 def infer_device() -> str:
-    if torch.cuda.is_available():
+    """Return ``"cuda"`` on a CUDA-like platform, otherwise ``"cpu"``.
+
+    The decision uses ``current_platform.is_cuda_alike()``.
+    """
+    if current_platform.is_cuda_alike():
        return "cuda"
    return "cpu"


--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -17,6 +17,7 @@ import torch

 import vllm.envs as envs
 from vllm.connections import global_http_connection
+from vllm.platforms import current_platform
 from vllm.version import __version__ as VLLM_VERSION

 _config_home = envs.VLLM_CONFIG_ROOT
@@ -151,7 +152,7 @@ class UsageMessage:
                           usage_context: UsageContext,
                           extra_kvs: Dict[str, Any]) -> None:
        # Platform information
-        if torch.cuda.is_available():
+        if current_platform.is_cuda_alike():
            device_property = torch.cuda.get_device_properties(0)
            self.gpu_count = torch.cuda.device_count()
            self.gpu_type = device_property.name

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -5,6 +5,7 @@ import datetime
 import enum
 import gc
 import os
+import random
 import socket
 import subprocess
 import sys
@@ -32,6 +33,7 @@ from typing_extensions import ParamSpec, TypeIs, assert_never

 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
+from vllm.platforms import current_platform

 logger = init_logger(__name__)

@@ -373,6 +375,22 @@ def get_cpu_memory() -> int:
    return psutil.virtual_memory().total


+def seed_everything(seed: int) -> None:
+    """
+    Set the seed of each random module.
+
+    Seeds Python's ``random``, NumPy and PyTorch's default generator, plus
+    the CUDA/ROCm generators when the platform is CUDA-like and the XPU
+    generators when ``is_xpu()`` is true.
+
+    Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    # Seed torch's default generator as well: the accelerator-specific
+    # calls below do not cover it, and without this CPU-side torch
+    # sampling would not be reproducible.
+    torch.manual_seed(seed)
+
+    if current_platform.is_cuda_alike():
+        torch.cuda.manual_seed_all(seed)
+
+    if is_xpu():
+        torch.xpu.manual_seed_all(seed)
+
+
 def random_uuid() -> str:
    return str(uuid.uuid4().hex)

@@ -634,9 +652,7 @@ def create_kv_caches_with_random_flash(
    seed: int = 0,
    device: Optional[str] = "cuda",
 ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)

    torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
    key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
@@ -678,9 +694,7 @@ def create_kv_caches_with_random(
            f"Does not support key cache of type fp8 with head_size {head_size}"
        )

-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)

    torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)

@@ -750,7 +764,7 @@ class CudaMemoryProfiler:

    def current_memory_usage(self) -> float:
        # Return the memory usage in bytes.
-        if torch.cuda.is_available():
+        if current_platform.is_cuda_alike():
            torch.cuda.reset_peak_memory_stats(self.device)
            mem = torch.cuda.max_memory_allocated(self.device)
        elif is_xpu():

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -454,14 +454,20 @@ def init_worker_distributed_environment(

 def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
    # Check if the GPU supports the dtype.
-    if torch_dtype == torch.bfloat16:
-        compute_capability = current_platform.get_device_capability()
-        if compute_capability[0] < 8:
+    # Only bfloat16 is checked: it requires device capability >= 80.
+    # Raises ValueError when the requirement is not met, including when
+    # the platform reports no capability at all.
+    if torch_dtype == torch.bfloat16:  # noqa: SIM102
+        if not current_platform.has_device_capability(80):
+            capability = current_platform.get_device_capability()
            gpu_name = current_platform.get_device_name()
+
+            if capability is None:
+                compute_str = "does not have a compute capability"
+            else:
+                version_str = capability.as_version_str()
+                compute_str = f"has compute capability {version_str}"
+
            raise ValueError(
                "Bfloat16 is only supported on GPUs with compute capability "
-                f"of at least 8.0. Your {gpu_name} GPU has compute capability "
-                f"{compute_capability[0]}.{compute_capability[1]}. "
+                f"of at least 8.0. Your {gpu_name} GPU {compute_str}. "
-                "You can use float16 instead by explicitly setting the"
+                "You can use float16 instead by explicitly setting the "
                "`dtype` flag in CLI, for example: --dtype=half.")