Unverified Commit 49bfc538 authored by Sage Moore's avatar Sage Moore Committed by GitHub
Browse files

Update num_tokens_across_dp to use nccl instead of gloo (#24105)


Signed-off-by: default avatarSage Moore <sage@neuralmagic.com>
parent a0b26701
...@@ -92,6 +92,7 @@ if TYPE_CHECKING: ...@@ -92,6 +92,7 @@ if TYPE_CHECKING:
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False VLLM_SKIP_P2P_CHECK: bool = False
VLLM_DISABLED_KERNELS: list[str] = [] VLLM_DISABLED_KERNELS: list[str] = []
VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION: bool = False
VLLM_USE_V1: bool = True VLLM_USE_V1: bool = True
VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER: bool = False
VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
...@@ -745,6 +746,13 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -745,6 +746,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[ lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[
"VLLM_DISABLED_KERNELS"].split(","), "VLLM_DISABLED_KERNELS"].split(","),
# Swaps the all reduce backend that we use to coordinate the DP padding
# information from NCCL to gloo.
"VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION":
lambda:
(os.getenv("VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION", "False").lower() in
("true", "1")),
# If set, use the V1 code path. # If set, use the V1 code path.
"VLLM_USE_V1": "VLLM_USE_V1":
lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))), lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),
......
...@@ -13,6 +13,7 @@ import torch.distributed as dist ...@@ -13,6 +13,7 @@ import torch.distributed as dist
import vllm.envs as envs import vllm.envs as envs
from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.abstract import AttentionMetadata
...@@ -75,14 +76,26 @@ class DPMetadata: ...@@ -75,14 +76,26 @@ class DPMetadata:
Gather the num_tokens across all DP ranks and return results in a Gather the num_tokens across all DP ranks and return results in a
CPU tensor of size dp_size. CPU tensor of size dp_size.
""" """
from vllm.distributed.parallel_state import get_dp_group
device = current_platform.device_type
group = get_dp_group().device_group
# Transfering this tensor from GPU to CPU will introduce a GPU sync
# point that could adversely affect performance of vllm with asynch
# scheduling. This environment variable exists to quickly disable
# this optimization if we run into this case.
if envs.VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION:
logger.info_once(
"Using CPU all reduce to syncronize DP padding between ranks.")
device = "cpu"
group = get_dp_group().cpu_group
num_tokens_across_dp = [0] * dp_size num_tokens_across_dp = [0] * dp_size
num_tokens_across_dp[dp_rank] = num_tokens num_tokens_across_dp[dp_rank] = num_tokens
num_tokens_tensor = torch.tensor(num_tokens_across_dp, num_tokens_tensor = torch.tensor(num_tokens_across_dp,
device="cpu", device=device,
dtype=torch.int32) dtype=torch.int32)
from vllm.distributed.parallel_state import get_dp_group dist.all_reduce(num_tokens_tensor, group=group)
dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) return num_tokens_tensor.cpu()
return num_tokens_tensor
@staticmethod @staticmethod
def make( def make(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment