"docs/backends/vscode:/vscode.git/clone" did not exist on "9fa8125cc8a926845e4c3e45341a486a7404bb98"
Unverified commit 2da403e3, authored by Alec, committed by GitHub
Browse files

refactor: add explicit non-leader node handling in vLLM (#5597)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
parent 50f1e0e1
......@@ -148,7 +148,7 @@ jobs:
- { major_minor: '12.9', major: '12' }
name: vllm-build-test (cuda${{ matrix.cuda_version.major_minor}}, ${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
timeout-minutes: 90
timeout-minutes: 240
env:
FRAMEWORK: vllm
steps: &runtime-container-build-push-test
......
......@@ -48,6 +48,19 @@ configure_dynamo_logging()
logger = logging.getLogger(__name__)
async def _handle_non_leader_node(dp_rank: int) -> None:
    """
    Park a non-leader node (data_parallel_rank >= 1) in a multi-node deployment.

    Non-leader nodes run vLLM workers but don't serve Dynamo endpoints, so
    this coroutine only logs the detected rank and then waits on an event
    that is never set.

    Args:
        dp_rank: The data-parallel rank reported in the log message.
    """
    message = (
        f"Non-leader node detected (data_parallel_rank={dp_rank}). "
        "Skipping endpoint serving."
    )
    logger.info(message)

    # Nothing sets this event, so the await never completes on its own;
    # the process is terminated via signal handlers.
    never_set = asyncio.Event()
    await never_set.wait()
async def graceful_shutdown(runtime):
"""
Shutdown dynamo distributed runtime.
......@@ -452,11 +465,13 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
runtime.register_engine_route("wake", handler.wake)
logger.info("Registered engine routes: /engine/sleep, /engine/wake")
# Handle non-leader nodes - don't serve endpoints
if config.engine_args.data_parallel_rank:
await _handle_non_leader_node(config.engine_args.data_parallel_rank)
return
# Register prefill model with ModelType.Prefill
if not config.engine_args.data_parallel_rank: # if rank is 0 or None then register
model_input = (
ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
)
model_input = ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
await register_vllm_model(
model_input,
ModelType.Prefill,
......@@ -575,16 +590,16 @@ async def init(runtime: DistributedRuntime, config: Config):
runtime.register_engine_route("wake", handler.wake)
logger.info("Registered engine routes: /engine/sleep, /engine/wake")
if not config.engine_args.data_parallel_rank: # if rank is 0 or None then register
# Handle non-leader nodes - don't serve endpoints
if config.engine_args.data_parallel_rank:
await _handle_non_leader_node(config.engine_args.data_parallel_rank)
return
# Parse endpoint types from --dyn-endpoint-types flag
model_type = parse_endpoint_types(config.dyn_endpoint_types)
logger.info(
f"Registering model with endpoint types: {config.dyn_endpoint_types}"
)
logger.info(f"Registering model with endpoint types: {config.dyn_endpoint_types}")
model_input = (
ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
)
model_input = ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
# Warn if custom template provided but chat endpoint not enabled
if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment