Commit 6ffa3f31 (unverified), authored by Cyrus Leung, committed by GitHub
Browse files

[CI/Build] Avoid CUDA initialization (#8534)

parent e3515729
......@@ -29,8 +29,9 @@ def query_marlin_supported_quant_types(has_zp: bool,
device_capability: Optional[int] = None
):
if device_capability is None:
major, minor = current_platform.get_device_capability()
device_capability = major * 10 + minor
capability_tuple = current_platform.get_device_capability()
device_capability = (-1 if capability_tuple is None else
capability_tuple.to_int())
if device_capability < 80:
return []
......@@ -52,8 +53,9 @@ def _check_marlin_supported(
device_capability: Optional[int] = None) -> Tuple[bool, Optional[str]]:
if device_capability is None:
major, minor = current_platform.get_device_capability()
device_capability = major * 10 + minor
capability_tuple = current_platform.get_device_capability()
device_capability = (-1 if capability_tuple is None else
capability_tuple.to_int())
supported_types = query_marlin_supported_quant_types(
has_zp, device_capability)
......
......@@ -10,8 +10,7 @@ from .marlin_utils import marlin_make_workspace, marlin_permute_scales
def is_fp8_marlin_supported():
capability = current_platform.get_device_capability()
return capability[0] >= 8
return current_platform.has_device_capability(80)
def apply_fp8_marlin_linear(
......
......@@ -17,8 +17,9 @@ def cutlass_fp8_supported() -> bool:
# cutlass is not supported on Rocm
if is_hip():
return False
capability = current_platform.get_device_capability()
capability = capability[0] * 10 + capability[1]
capability_tuple = current_platform.get_device_capability()
capability = -1 if capability_tuple is None else capability_tuple.to_int()
return ops.cutlass_scaled_mm_supports_fp8(capability)
......
......@@ -97,10 +97,10 @@ def _get_quantization_config(
"""Get the quantization config."""
if model_config.quantization is not None:
quant_config = get_quant_config(model_config, load_config)
capability = current_platform.get_device_capability() # type: ignore
capability_tuple = current_platform.get_device_capability()
if capability is not None:
capability = capability[0] * 10 + capability[1]
if capability_tuple is not None:
capability = capability_tuple.to_int()
if capability < quant_config.get_min_capability():
raise ValueError(
f"The quantization method {model_config.quantization} "
......
......@@ -207,7 +207,7 @@ class Qwen2VisionAttention(nn.Module):
selected_backend = backend_name_to_enum(backend_by_env_var)
if selected_backend is None:
# For Volta and Turing GPUs, use xformers instead.
device_available = current_platform.get_device_capability()[0] >= 8
device_available = current_platform.has_device_capability(80)
if device_available:
from transformers.utils import is_flash_attn_2_available
......
"""Utils for model executor."""
import random
from typing import Any, Dict, Optional
import numpy as np
import torch
from vllm.utils import seed_everything
def set_random_seed(seed: int) -> None:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
seed_everything(seed)
def set_weight_attrs(
......
......@@ -6,10 +6,10 @@ from .interface import Platform, PlatformEnum
class CpuPlatform(Platform):
_enum = PlatformEnum.CPU
@staticmethod
def get_device_name(device_id: int = 0) -> str:
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
return "cpu"
@staticmethod
def inference_mode():
@classmethod
def inference_mode(cls):
return torch.no_grad()
......@@ -11,7 +11,7 @@ from typing_extensions import ParamSpec
from vllm.logger import init_logger
from .interface import Platform, PlatformEnum
from .interface import DeviceCapability, Platform, PlatformEnum
logger = init_logger(__name__)
......@@ -96,19 +96,20 @@ def device_id_to_physical_device_id(device_id: int) -> int:
class CudaPlatform(Platform):
_enum = PlatformEnum.CUDA
@staticmethod
def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
@classmethod
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
physical_device_id = device_id_to_physical_device_id(device_id)
return get_physical_device_capability(physical_device_id)
major, minor = get_physical_device_capability(physical_device_id)
return DeviceCapability(major=major, minor=minor)
@staticmethod
def get_device_name(device_id: int = 0) -> str:
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
physical_device_id = device_id_to_physical_device_id(device_id)
return get_physical_device_name(physical_device_id)
@staticmethod
@classmethod
@with_nvml_context
def is_full_nvlink(physical_device_ids: List[int]) -> bool:
def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
"""
query if the set of gpus are fully connected by nvlink (1 hop)
"""
......
import enum
from typing import Optional, Tuple
from typing import NamedTuple, Optional, Tuple, Union
import torch
......@@ -12,6 +12,23 @@ class PlatformEnum(enum.Enum):
UNSPECIFIED = enum.auto()
class DeviceCapability(NamedTuple):
    """Compute capability of a device as a ``(major, minor)`` pair.

    Being a ``NamedTuple``, instances compare lexicographically with each
    other and with plain ``(major, minor)`` tuples.
    """

    major: int
    minor: int

    def as_version_str(self) -> str:
        """Return the capability formatted as ``"<major>.<minor>"``."""
        return f"{self.major}.{self.minor}"

    def to_int(self) -> int:
        """
        Express device capability as an integer ``<major><minor>``.

        It is assumed that the minor version is always a single digit.

        Raises:
            ValueError: If ``minor`` is not in ``[0, 10)``; the integer
                encoding would be ambiguous otherwise (e.g. 8.10 and 9.0
                would both map to 90).
        """
        # Explicit check rather than ``assert`` so that the validation is
        # still performed when Python runs with ``-O``.
        if not 0 <= self.minor < 10:
            raise ValueError(
                "minor version must be a single digit in [0, 10), "
                f"got {self.minor}")
        return self.major * 10 + self.minor
class Platform:
_enum: PlatformEnum
......@@ -27,16 +44,47 @@ class Platform:
def is_cpu(self) -> bool:
return self._enum == PlatformEnum.CPU
@staticmethod
def get_device_capability(device_id: int = 0) -> Optional[Tuple[int, int]]:
def is_cuda_alike(self) -> bool:
"""Stateless version of :func:`torch.cuda.is_available`."""
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
@classmethod
def get_device_capability(
cls,
device_id: int = 0,
) -> Optional[DeviceCapability]:
"""Stateless version of :func:`torch.cuda.get_device_capability`."""
return None
@staticmethod
def get_device_name(device_id: int = 0) -> str:
@classmethod
def has_device_capability(
    cls,
    capability: Union[Tuple[int, int], int],
    device_id: int = 0,
) -> bool:
    """
    Check whether the device meets a minimum device capability.

    ``capability`` may be given in either of two forms:

    - A tuple ``(major, minor)``, compared against the device's
      capability tuple.
    - An integer ``<major><minor>``, compared against the device's
      capability converted with its ``to_int()`` method.

    Returns ``False`` when ``get_device_capability`` reports ``None``
    for ``device_id``.
    """
    device_cap = cls.get_device_capability(device_id=device_id)

    # No capability reported: nothing can be satisfied.
    if device_cap is None:
        return False

    # Integer form: compare using the ``<major><minor>`` encoding.
    if not isinstance(capability, tuple):
        return device_cap.to_int() >= capability

    # Tuple form: plain lexicographic tuple comparison.
    return device_cap >= capability
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
raise NotImplementedError
@staticmethod
def inference_mode():
@classmethod
def inference_mode(cls):
"""A device-specific wrapper of `torch.inference_mode`.
This wrapper is recommended because some hardware backends such as TPU
......
import os
from functools import lru_cache
from typing import Tuple
import torch
from vllm.logger import init_logger
from .interface import Platform, PlatformEnum
from .interface import DeviceCapability, Platform, PlatformEnum
logger = init_logger(__name__)
......@@ -20,12 +19,13 @@ if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
class RocmPlatform(Platform):
_enum = PlatformEnum.ROCM
@staticmethod
@classmethod
@lru_cache(maxsize=8)
def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
return torch.cuda.get_device_capability(device_id)
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
major, minor = torch.cuda.get_device_capability(device_id)
return DeviceCapability(major=major, minor=minor)
@staticmethod
@classmethod
@lru_cache(maxsize=8)
def get_device_name(device_id: int = 0) -> str:
def get_device_name(cls, device_id: int = 0) -> str:
return torch.cuda.get_device_name(device_id)
......@@ -6,6 +6,10 @@ from .interface import Platform, PlatformEnum
class TpuPlatform(Platform):
_enum = PlatformEnum.TPU
@staticmethod
def inference_mode():
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
raise NotImplementedError
@classmethod
def inference_mode(cls):
return torch.no_grad()
......@@ -8,13 +8,15 @@ from huggingface_hub import file_exists, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from safetensors.torch import load_file as safe_load_file
from vllm.platforms import current_platform
WEIGHTS_NAME = "adapter_model.bin"
SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors"
# Get current device name based on available devices
def infer_device() -> str:
if torch.cuda.is_available():
if current_platform.is_cuda_alike():
return "cuda"
return "cpu"
......
......@@ -17,6 +17,7 @@ import torch
import vllm.envs as envs
from vllm.connections import global_http_connection
from vllm.platforms import current_platform
from vllm.version import __version__ as VLLM_VERSION
_config_home = envs.VLLM_CONFIG_ROOT
......@@ -151,7 +152,7 @@ class UsageMessage:
usage_context: UsageContext,
extra_kvs: Dict[str, Any]) -> None:
# Platform information
if torch.cuda.is_available():
if current_platform.is_cuda_alike():
device_property = torch.cuda.get_device_properties(0)
self.gpu_count = torch.cuda.device_count()
self.gpu_type = device_property.name
......
......@@ -5,6 +5,7 @@ import datetime
import enum
import gc
import os
import random
import socket
import subprocess
import sys
......@@ -32,6 +33,7 @@ from typing_extensions import ParamSpec, TypeIs, assert_never
import vllm.envs as envs
from vllm.logger import enable_trace_function_call, init_logger
from vllm.platforms import current_platform
logger = init_logger(__name__)
......@@ -373,6 +375,22 @@ def get_cpu_memory() -> int:
return psutil.virtual_memory().total
def seed_everything(seed: int) -> None:
    """
    Set the seed of each random module.

    Seeds Python's ``random``, NumPy and PyTorch's default generator, plus
    the CUDA/ROCm and XPU generators when the corresponding platform is in
    use.

    Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20

    Args:
        seed: The seed value applied to every random number generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # The call sites this helper replaces seeded torch's default generator
    # explicitly (``torch.manual_seed`` / ``torch.random.manual_seed``);
    # keep doing so, otherwise CPU-side torch randomness is left unseeded.
    torch.manual_seed(seed)

    # Platform check instead of ``torch.cuda.is_available()``.
    if current_platform.is_cuda_alike():
        torch.cuda.manual_seed_all(seed)

    if is_xpu():
        torch.xpu.manual_seed_all(seed)
def random_uuid() -> str:
return str(uuid.uuid4().hex)
......@@ -634,9 +652,7 @@ def create_kv_caches_with_random_flash(
seed: int = 0,
device: Optional[str] = "cuda",
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
seed_everything(seed)
torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
......@@ -678,9 +694,7 @@ def create_kv_caches_with_random(
f"Does not support key cache of type fp8 with head_size {head_size}"
)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
seed_everything(seed)
torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
......@@ -750,7 +764,7 @@ class CudaMemoryProfiler:
def current_memory_usage(self) -> float:
# Return the memory usage in bytes.
if torch.cuda.is_available():
if current_platform.is_cuda_alike():
torch.cuda.reset_peak_memory_stats(self.device)
mem = torch.cuda.max_memory_allocated(self.device)
elif is_xpu():
......
......@@ -454,14 +454,20 @@ def init_worker_distributed_environment(
def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
# Check if the GPU supports the dtype.
if torch_dtype == torch.bfloat16:
compute_capability = current_platform.get_device_capability()
if compute_capability[0] < 8:
if torch_dtype == torch.bfloat16: # noqa: SIM102
if not current_platform.has_device_capability(80):
capability = current_platform.get_device_capability()
gpu_name = current_platform.get_device_name()
if capability is None:
compute_str = "does not have a compute capability"
else:
version_str = capability.as_version_str()
compute_str = f"has compute capability {version_str}"
raise ValueError(
"Bfloat16 is only supported on GPUs with compute capability "
f"of at least 8.0. Your {gpu_name} GPU has compute capability "
f"{compute_capability[0]}.{compute_capability[1]}. "
f"of at least 8.0. Your {gpu_name} GPU {compute_str}. "
"You can use float16 instead by explicitly setting the"
"`dtype` flag in CLI, for example: --dtype=half.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment