Unverified commit 714b4234, authored by Roger Wang, committed by GitHub
Browse files

fix: propagate vLLM --stream-interval to Dynamo frontend (#8101)


Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent d94b350d
......@@ -466,12 +466,30 @@ class EngineFactory:
input_processor = InputProcessor(vllm_config)
tokenizer = input_processor.get_tokenizer()
# Resolve stream_interval: env var override > backend config > default (20)
stream_interval = self.stream_interval
if not os.getenv("DYN_VLLM_STREAM_INTERVAL"):
backend_interval = (
mdc.runtime_config().get("runtime_data", {}).get("stream_interval")
)
if backend_interval is not None:
try:
stream_interval = max(1, int(backend_interval))
except (TypeError, ValueError):
logger.warning(
"Invalid stream_interval=%r from backend runtime_config, "
"using default=%d",
backend_interval,
stream_interval,
)
output_processor = OutputProcessor(
tokenizer,
log_stats=False,
stream_interval=self.stream_interval,
stream_interval=stream_interval,
)
logger.info("vLLM OutputProcessor stream_interval=%d", self.stream_interval)
logger.info("vLLM OutputProcessor stream_interval=%d", stream_interval)
tool_parser_name = self.flags.tool_call_parser or mdc.runtime_config().get(
"tool_call_parser"
......
......@@ -126,10 +126,10 @@ def cross_validate_config(
"""Validate dynamo and engine config together. This should not modify the configs."""
if hasattr(engine_config, "stream_interval") and engine_config.stream_interval != 1:
logger.warning(
"--stream-interval is currently not respected in Dynamo. "
"Dynamo uses its own post-processing implementation on the frontend, "
"bypassing vLLM's OutputProcessor buffering."
logger.info(
"--stream-interval=%d will be propagated to the Dynamo frontend. "
"Set DYN_VLLM_STREAM_INTERVAL env var to override.",
engine_config.stream_interval,
)
# Validate --gms-shadow-mode requires --load-format gms
......
......@@ -656,6 +656,13 @@ async def register_vllm_model(
config.exclude_tools_when_tool_choice_none
)
# Propagate stream_interval so the frontend can respect --stream-interval.
# set_engine_specific requires a JSON-encoded string (the Rust binding
# parses it with serde_json::from_str); str(int) happens to be valid JSON.
stream_interval = getattr(config.engine_args, "stream_interval", None)
if stream_interval is not None:
runtime_config.set_engine_specific("stream_interval", str(stream_interval))
# Get data_parallel_size from vllm_config (defaults to 1)
dp_range = get_dp_range_for_worker(vllm_config)
runtime_config.data_parallel_start_rank = dp_range[0]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment