Unverified Commit 2f6d17cb authored by kourosh hakhamaneshi's avatar kourosh hakhamaneshi Committed by GitHub
Browse files

[rocm][ray] Fix: Unify Ray device visibility handling across CUDA and ROCm (#33308)


Signed-off-by: default avatarKourosh Hakhamaneshi <kourosh@anyscale.com>
parent 192ad464
...@@ -15,8 +15,6 @@ FROM ${BASE_IMAGE} AS base ...@@ -15,8 +15,6 @@ FROM ${BASE_IMAGE} AS base
ARG ARG_PYTORCH_ROCM_ARCH ARG ARG_PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
# Install some basic utilities # Install some basic utilities
RUN apt-get update -q -y && apt-get install -q -y \ RUN apt-get update -q -y && apt-get install -q -y \
......
...@@ -60,6 +60,11 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch): ...@@ -60,6 +60,11 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
runtime_env = { runtime_env = {
"env_vars": { "env_vars": {
"TEST_ENV_VAR": "test_value", "TEST_ENV_VAR": "test_value",
# In future Ray versions this will be the default: when a task or actor
# is created with num_gpus=None/0, Ray will not override the visible
# devices env var. Without it, the override leaves no GPUs visible on a
# GPU machine.
"RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO": "0",
}, },
} }
......
...@@ -102,6 +102,9 @@ class CudaPlatformBase(Platform): ...@@ -102,6 +102,9 @@ class CudaPlatformBase(Platform):
ray_device_key: str = "GPU" ray_device_key: str = "GPU"
dist_backend: str = "nccl" dist_backend: str = "nccl"
device_control_env_var: str = "CUDA_VISIBLE_DEVICES" device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
ray_noset_device_env_vars: list[str] = [
"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES",
]
@property @property
def supported_dtypes(self) -> list[torch.dtype]: def supported_dtypes(self) -> list[torch.dtype]:
......
...@@ -116,6 +116,11 @@ class Platform: ...@@ -116,6 +116,11 @@ class Platform:
# https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa
device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER" device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"
# environment variables that need to be set to 1 to prevent ray from
# setting the visible devices e.g.
# RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES
ray_noset_device_env_vars: list[str] = []
# The torch.compile backend for compiling simple and # The torch.compile backend for compiling simple and
# standalone functions. The default value is "inductor" to keep # standalone functions. The default value is "inductor" to keep
# the same behavior as PyTorch. # the same behavior as PyTorch.
......
...@@ -194,6 +194,11 @@ class RocmPlatform(Platform): ...@@ -194,6 +194,11 @@ class RocmPlatform(Platform):
dist_backend: str = "nccl" dist_backend: str = "nccl"
# rocm shares the same device control env var as CUDA # rocm shares the same device control env var as CUDA
device_control_env_var: str = "CUDA_VISIBLE_DEVICES" device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
ray_noset_device_env_vars: list[str] = [
"RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES",
"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES",
"RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES",
]
supported_quantization: list[str] = [ supported_quantization: list[str] = [
"awq", "awq",
......
...@@ -69,6 +69,8 @@ class RayDistributedExecutor(Executor): ...@@ -69,6 +69,8 @@ class RayDistributedExecutor(Executor):
"VLLM_HOST_PORT", "VLLM_HOST_PORT",
"LOCAL_RANK", "LOCAL_RANK",
"CUDA_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES",
"HIP_VISIBLE_DEVICES",
"ROCR_VISIBLE_DEVICES",
} }
# These non-vLLM env vars are copied from the driver to workers # These non-vLLM env vars are copied from the driver to workers
...@@ -146,6 +148,14 @@ class RayDistributedExecutor(Executor): ...@@ -146,6 +148,14 @@ class RayDistributedExecutor(Executor):
return ray_remote_kwargs return ray_remote_kwargs
def _update_noset_device_env_vars(self, ray_remote_kwargs):
    """Add the platform's Ray "noset" flags to the actor runtime env.

    Every name listed in ``current_platform.ray_noset_device_env_vars`` is
    set to ``"1"`` inside ``ray_remote_kwargs["runtime_env"]["env_vars"]``.
    The ``runtime_env`` and ``env_vars`` dicts are created when absent, and
    existing entries with the same names are overwritten.

    Args:
        ray_remote_kwargs: Keyword arguments for the Ray remote call. The
            mapping is modified in place.

    Returns:
        The same ``ray_remote_kwargs`` object that was passed in.
    """
    # Walk (and lazily create) runtime_env -> env_vars in one chain.
    worker_env = ray_remote_kwargs.setdefault("runtime_env", {}).setdefault(
        "env_vars", {}
    )
    noset_flags = dict.fromkeys(current_platform.ray_noset_device_env_vars, "1")
    worker_env.update(noset_flags)
    return ray_remote_kwargs
# child class could overwrite this to return actual env vars. # child class could overwrite this to return actual env vars.
def _get_env_vars_to_be_updated(self): def _get_env_vars_to_be_updated(self):
return self._env_vars_for_all_workers return self._env_vars_for_all_workers
...@@ -169,6 +179,11 @@ class RayDistributedExecutor(Executor): ...@@ -169,6 +179,11 @@ class RayDistributedExecutor(Executor):
ray_remote_kwargs ray_remote_kwargs
) )
# Ray actors in vLLM are set up so that Ray does not set the visible
# devices for them; the variables are left untouched. Internally we index
# the right GPU with local_rank, similar to how mp mode works.
self._update_noset_device_env_vars(ray_remote_kwargs)
# Create the workers. # Create the workers.
bundle_indices: list[int] bundle_indices: list[int]
if envs.VLLM_RAY_BUNDLE_INDICES: if envs.VLLM_RAY_BUNDLE_INDICES:
...@@ -303,6 +318,15 @@ class RayDistributedExecutor(Executor): ...@@ -303,6 +318,15 @@ class RayDistributedExecutor(Executor):
) )
# Set environment variables for the driver and workers. # Set environment variables for the driver and workers.
# We set CUDA_VISIBLE_DEVICES to ALL GPUs on the node for each worker.
# This is needed because:
# 1. Ray's compiled DAG needs to find the allocated GPU in
# CUDA_VISIBLE_DEVICES.
# 2. vLLM's communication layer (NCCL, CustomAllreduce) needs to see
# all GPUs for P2P checks and communication setup. If this were the
# only reason, we could instead have left the visible devices
# unset.
# Each worker will use local_rank to index into the visible devices.
all_args_to_update_environment_variables = [ all_args_to_update_environment_variables = [
{ {
current_platform.device_control_env_var: ",".join( current_platform.device_control_env_var: ",".join(
......
...@@ -209,6 +209,7 @@ class Worker(WorkerBase): ...@@ -209,6 +209,7 @@ class Worker(WorkerBase):
f"be less than or equal to the number of visible devices " f"be less than or equal to the number of visible devices "
f"({visible_device_count})." f"({visible_device_count})."
) )
self.device = torch.device(f"cuda:{self.local_rank}") self.device = torch.device(f"cuda:{self.local_rank}")
current_platform.set_device(self.device) current_platform.set_device(self.device)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Callable from collections.abc import Callable
from typing import TYPE_CHECKING, Any, TypeVar from typing import TYPE_CHECKING, Any, TypeVar
...@@ -221,11 +220,6 @@ class WorkerWrapperBase: ...@@ -221,11 +220,6 @@ class WorkerWrapperBase:
envs_list: list[dict[str, str]], envs_list: list[dict[str, str]],
) -> None: ) -> None:
envs = envs_list[self.rpc_rank] envs = envs_list[self.rpc_rank]
key = "CUDA_VISIBLE_DEVICES"
if key in envs and key in os.environ:
# overwriting CUDA_VISIBLE_DEVICES is desired behavior
# suppress the warning in `update_environment_variables`
del os.environ[key]
update_environment_variables(envs) update_environment_variables(envs)
def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None: def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment