"vscode:/vscode.git/clone" did not exist on "fd46df331de3dd1832e601cf589204f424312241"
Unverified Commit 9273a427 authored by Nick Hill, committed by GitHub
Browse files

[Misc] Allow enabling NCCL for DP sync when async scheduling (#32197)


Signed-off-by: Nick Hill <nickhill123@gmail.com>
parent 78d13ea9
......@@ -2,10 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Callable
from typing import TYPE_CHECKING, Any, Literal
import torch
from pydantic import Field, model_validator
from pydantic import Field, field_validator, model_validator
from pydantic.dataclasses import dataclass
from torch.distributed import ProcessGroup, ReduceOp
from typing_extensions import Self
......@@ -182,9 +183,12 @@ class ParallelConfig:
threshold, microbatching will be used. Otherwise, the request will be
processed in a single batch."""
disable_nccl_for_dp_synchronization: bool = False
disable_nccl_for_dp_synchronization: bool = Field(default=None)
"""Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
to use Gloo instead of NCCL for its all reduce"""
to use Gloo instead of NCCL for its all reduce.
Defaults to True when async scheduling is enabled, False otherwise.
"""
ray_workers_use_nsight: bool = False
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
......@@ -292,6 +296,12 @@ class ParallelConfig:
should only be set by API server scale-out.
"""
@field_validator("disable_nccl_for_dp_synchronization", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Let a `None` placeholder bypass validation.

    A `None` value is returned unchanged without invoking the wrapped
    handler; any other value is passed to `handler` and its result is
    returned.
    """
    # Guard clause: `None` is passed through unchanged, with no validation.
    if value is None:
        return None
    return handler(value)
@model_validator(mode="after")
def _validate_parallel_config(self) -> Self:
if self._api_process_rank >= self._api_process_count:
......
......@@ -209,9 +209,7 @@ class SchedulerConfig:
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
"""Skip validation if the value is `None` when initialisation is delayed."""
if value is None:
return value
return handler(value)
return None if value is None else handler(value)
def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
if is_encoder_decoder:
......
......@@ -629,20 +629,22 @@ class VllmConfig:
else:
self.scheduler_config.async_scheduling = True
if (
self.scheduler_config.async_scheduling
and not self.parallel_config.disable_nccl_for_dp_synchronization
):
logger.info_once(
"Disabling NCCL for DP synchronization when using async scheduling."
)
self.parallel_config.disable_nccl_for_dp_synchronization = True
logger.info_once(
"Asynchronous scheduling is %s.",
"enabled" if self.scheduler_config.async_scheduling else "disabled",
)
if self.parallel_config.disable_nccl_for_dp_synchronization is None:
if self.scheduler_config.async_scheduling:
logger.info_once(
"Disabling NCCL for DP synchronization "
"when using async scheduling.",
scope="local",
)
self.parallel_config.disable_nccl_for_dp_synchronization = True
else:
self.parallel_config.disable_nccl_for_dp_synchronization = False
from vllm.platforms import current_platform
if (
......
......@@ -413,7 +413,7 @@ class EngineArgs:
ubatch_size: int = ParallelConfig.ubatch_size
dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold
disable_nccl_for_dp_synchronization: bool = (
disable_nccl_for_dp_synchronization: bool | None = (
ParallelConfig.disable_nccl_for_dp_synchronization
)
eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment