Unverified commit 0cd259b2, authored by Nick Hill, committed by GitHub
Browse files

[BugFix] Fix P/D with non-MoE DP (#33037)


Signed-off-by: Nick Hill <nickhill123@gmail.com>
parent 83fb2d09
......@@ -911,6 +911,17 @@ class EngineCoreProc(EngineCore):
set_process_title("EngineCore")
decorate_logs()
if data_parallel and vllm_config.kv_transfer_config is not None:
# modify the engine_id and append the local_dp_rank to it to ensure
# that the kv_transfer_config is unique for each DP rank.
vllm_config.kv_transfer_config.engine_id = (
f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}"
)
logger.debug(
"Setting kv_transfer_config.engine_id to %s",
vllm_config.kv_transfer_config.engine_id,
)
parallel_config.data_parallel_index = dp_rank
if data_parallel and vllm_config.model_config.is_moe:
# Set data parallel rank for this engine process.
......@@ -1285,17 +1296,6 @@ class DPEngineCoreProc(EngineCoreProc):
assert local_dp_rank is not None
assert 0 <= local_dp_rank <= dp_rank < dp_size
if vllm_config.kv_transfer_config is not None:
# modify the engine_id and append the local_dp_rank to it to ensure
# that the kv_transfer_config is unique for each DP rank.
vllm_config.kv_transfer_config.engine_id = (
f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}"
)
logger.debug(
"Setting kv_transfer_config.engine_id to %s",
vllm_config.kv_transfer_config.engine_id,
)
self.dp_rank = dp_rank
self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
......
......@@ -313,6 +313,13 @@ class CoreEngineActorManager:
dp_vllm_config.parallel_config.placement_group = pg
local_client = index < local_engine_count
if dp_size > 1 and dp_vllm_config.kv_transfer_config is not None:
# modify the engine_id and append the local_dp_rank to it to ensure
# that the kv_transfer_config is unique for each DP rank.
dp_vllm_config.kv_transfer_config.engine_id = (
f"{dp_vllm_config.kv_transfer_config.engine_id}_dp{local_index}"
)
# Ray XPU known issue: dpctl initializes the GPU runtime early, so
# setting device env vars in Ray actor's initialization method
# will not affect device selection. See:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment