"vscode:/vscode.git/clone" did not exist on "68c09efc37e87032640cf8db571eaf486bd744ac"
Unverified Commit f5792c7c authored by Conroy Cheers's avatar Conroy Cheers Committed by GitHub
Browse files

[Hardware][NVIDIA] Add non-NVML CUDA mode for Jetson (#9735)


Signed-off-by: default avatarConroy Cheers <conroy@corncheese.org>
parent db66e018
...@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) ...@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures. # Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
...@@ -249,7 +249,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -249,7 +249,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs. # Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that # Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet. # are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
set(MARLIN_SRCS set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu" "csrc/quantization/fp8/fp8_marlin.cu"
...@@ -301,7 +301,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -301,7 +301,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x. # kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x # subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS) if (SCALED_MM_2X_ARCHS)
...@@ -427,7 +427,7 @@ set_gencode_flags_for_srcs( ...@@ -427,7 +427,7 @@ set_gencode_flags_for_srcs(
CUDA_ARCHS "${CUDA_ARCHS}") CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h" "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
......
...@@ -28,7 +28,15 @@ try: ...@@ -28,7 +28,15 @@ try:
finally: finally:
pynvml.nvmlShutdown() pynvml.nvmlShutdown()
except Exception: except Exception:
pass # CUDA is supported on Jetson, but NVML may not be.
import os
def cuda_is_jetson() -> bool:
return os.path.isfile("/etc/nv_tegra_release") \
or os.path.exists("/sys/class/tegra-firmware")
if cuda_is_jetson():
is_cuda = True
is_rocm = False is_rocm = False
......
...@@ -4,7 +4,7 @@ pynvml. However, it should not initialize cuda context. ...@@ -4,7 +4,7 @@ pynvml. However, it should not initialize cuda context.
import os import os
from functools import lru_cache, wraps from functools import lru_cache, wraps
from typing import TYPE_CHECKING, Callable, List, Tuple, TypeVar from typing import TYPE_CHECKING, Callable, List, TypeVar
import pynvml import pynvml
import torch import torch
...@@ -38,10 +38,23 @@ if pynvml.__file__.endswith("__init__.py"): ...@@ -38,10 +38,23 @@ if pynvml.__file__.endswith("__init__.py"):
# see https://github.com/huggingface/diffusers/issues/9704 for details # see https://github.com/huggingface/diffusers/issues/9704 for details
torch.backends.cuda.enable_cudnn_sdp(False) torch.backends.cuda.enable_cudnn_sdp(False)
# NVML utils
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, def device_id_to_physical_device_id(device_id: int) -> int:
# all the related functions work on real physical device ids. if "CUDA_VISIBLE_DEVICES" in os.environ:
# the major benefit of using NVML is that it will not initialize CUDA device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
if device_ids == [""]:
msg = (
"CUDA_VISIBLE_DEVICES is set to empty string, which means"
" GPU support is disabled. If you are using ray, please unset"
" the environment variable `CUDA_VISIBLE_DEVICES` inside the"
" worker/actor. "
"Check https://github.com/vllm-project/vllm/issues/8402 for"
" more information.")
raise RuntimeError(msg)
physical_device_id = device_ids[device_id]
return int(physical_device_id)
else:
return device_id
def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]: def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
...@@ -57,87 +70,75 @@ def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]: ...@@ -57,87 +70,75 @@ def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
return wrapper return wrapper
@lru_cache(maxsize=8) class CudaPlatformBase(Platform):
@with_nvml_context _enum = PlatformEnum.CUDA
def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]: device_type: str = "cuda"
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) dispatch_key: str = "CUDA"
return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
@lru_cache(maxsize=8)
@with_nvml_context
def get_physical_device_name(device_id: int = 0) -> str:
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
return pynvml.nvmlDeviceGetName(handle)
@lru_cache(maxsize=8)
@with_nvml_context
def get_physical_device_total_memory(device_id: int = 0) -> int:
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
@with_nvml_context @classmethod
def warn_if_different_devices(): def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
device_ids: int = pynvml.nvmlDeviceGetCount() raise NotImplementedError
if device_ids > 1:
device_names = [get_physical_device_name(i) for i in range(device_ids)]
if len(set(device_names)) > 1 and os.environ.get(
"CUDA_DEVICE_ORDER") != "PCI_BUS_ID":
logger.warning(
"Detected different devices in the system: \n%s\nPlease"
" make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
"avoid unexpected behavior.", "\n".join(device_names))
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
raise NotImplementedError
try: @classmethod
from sphinx.ext.autodoc.mock import _MockModule def get_device_total_memory(cls, device_id: int = 0) -> int:
raise NotImplementedError
if not isinstance(pynvml, _MockModule): @classmethod
warn_if_different_devices() def is_full_nvlink(cls, device_ids: List[int]) -> bool:
except ModuleNotFoundError: raise NotImplementedError
warn_if_different_devices()
@classmethod
def log_warnings(cls):
pass
def device_id_to_physical_device_id(device_id: int) -> int: @classmethod
if "CUDA_VISIBLE_DEVICES" in os.environ: def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") parallel_config = vllm_config.parallel_config
if device_ids == [""]: scheduler_config = vllm_config.scheduler_config
msg = ( if parallel_config.worker_cls == "auto":
"CUDA_VISIBLE_DEVICES is set to empty string, which means" if scheduler_config.is_multi_step:
" GPU support is disabled. If you are using ray, please unset" parallel_config.worker_cls = \
" the environment variable `CUDA_VISIBLE_DEVICES` inside the" "vllm.worker.multi_step_worker.MultiStepWorker"
" worker/actor. " elif vllm_config.speculative_config:
"Check https://github.com/vllm-project/vllm/issues/8402 for" parallel_config.worker_cls = \
" more information.") "vllm.spec_decode.spec_decode_worker.create_spec_worker"
raise RuntimeError(msg)
physical_device_id = device_ids[device_id]
return int(physical_device_id)
else: else:
return device_id parallel_config.worker_cls = "vllm.worker.worker.Worker"
class CudaPlatform(Platform): # NVML utils
_enum = PlatformEnum.CUDA # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
device_type: str = "cuda" # all the related functions work on real physical device ids.
dispatch_key: str = "CUDA" # the major benefit of using NVML is that it will not initialize CUDA
class NvmlCudaPlatform(CudaPlatformBase):
@classmethod @classmethod
@lru_cache(maxsize=8)
@with_nvml_context
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
physical_device_id = device_id_to_physical_device_id(device_id) physical_device_id = device_id_to_physical_device_id(device_id)
major, minor = get_physical_device_capability(physical_device_id) handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
return DeviceCapability(major=major, minor=minor) return DeviceCapability(major=major, minor=minor)
@classmethod @classmethod
@lru_cache(maxsize=8)
@with_nvml_context
def get_device_name(cls, device_id: int = 0) -> str: def get_device_name(cls, device_id: int = 0) -> str:
physical_device_id = device_id_to_physical_device_id(device_id) physical_device_id = device_id_to_physical_device_id(device_id)
return get_physical_device_name(physical_device_id) return cls._get_physical_device_name(physical_device_id)
@classmethod @classmethod
@lru_cache(maxsize=8)
@with_nvml_context
def get_device_total_memory(cls, device_id: int = 0) -> int: def get_device_total_memory(cls, device_id: int = 0) -> int:
physical_device_id = device_id_to_physical_device_id(device_id) physical_device_id = device_id_to_physical_device_id(device_id)
return get_physical_device_total_memory(physical_device_id) handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
@classmethod @classmethod
@with_nvml_context @with_nvml_context
...@@ -153,27 +154,86 @@ class CudaPlatform(Platform): ...@@ -153,27 +154,86 @@ class CudaPlatform(Platform):
if i < j: if i < j:
try: try:
p2p_status = pynvml.nvmlDeviceGetP2PStatus( p2p_status = pynvml.nvmlDeviceGetP2PStatus(
handle, peer_handle, handle,
pynvml.NVML_P2P_CAPS_INDEX_NVLINK) peer_handle,
pynvml.NVML_P2P_CAPS_INDEX_NVLINK,
)
if p2p_status != pynvml.NVML_P2P_STATUS_OK: if p2p_status != pynvml.NVML_P2P_STATUS_OK:
return False return False
except pynvml.NVMLError: except pynvml.NVMLError:
logger.exception( logger.exception(
"NVLink detection failed. This is normal if your" "NVLink detection failed. This is normal if"
" machine has no NVLink equipped.") " your machine has no NVLink equipped.")
return False return False
return True return True
@classmethod @classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None: def _get_physical_device_name(cls, device_id: int = 0) -> str:
parallel_config = vllm_config.parallel_config handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
scheduler_config = vllm_config.scheduler_config return pynvml.nvmlDeviceGetName(handle)
if parallel_config.worker_cls == "auto":
if scheduler_config.is_multi_step: @classmethod
parallel_config.worker_cls = \ @with_nvml_context
"vllm.worker.multi_step_worker.MultiStepWorker" def log_warnings(cls):
elif vllm_config.speculative_config: device_ids: int = pynvml.nvmlDeviceGetCount()
parallel_config.worker_cls = \ if device_ids > 1:
"vllm.spec_decode.spec_decode_worker.create_spec_worker" device_names = [
else: cls._get_physical_device_name(i) for i in range(device_ids)
parallel_config.worker_cls = "vllm.worker.worker.Worker" ]
if (len(set(device_names)) > 1
and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
logger.warning(
"Detected different devices in the system: \n%s\nPlease"
" make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
"avoid unexpected behavior.",
"\n".join(device_names),
)
class NonNvmlCudaPlatform(CudaPlatformBase):
@classmethod
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
major, minor = torch.cuda.get_device_capability(device_id)
return DeviceCapability(major=major, minor=minor)
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
return torch.cuda.get_device_name(device_id)
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.cuda.get_device_properties(device_id)
return device_props.total_memory
@classmethod
def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
logger.exception(
"NVLink detection not possible, as context support was"
" not found. Assuming no NVLink available.")
return False
# Autodetect either NVML-enabled or non-NVML platform
# based on whether NVML is available.
nvml_available = False
try:
try:
pynvml.nvmlInit()
nvml_available = True
except Exception:
# On Jetson, NVML is not supported.
nvml_available = False
finally:
if nvml_available:
pynvml.nvmlShutdown()
CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
try:
from sphinx.ext.autodoc.mock import _MockModule
if not isinstance(pynvml, _MockModule):
CudaPlatform.log_warnings()
except ModuleNotFoundError:
CudaPlatform.log_warnings()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment