"examples/vllm_v0/components/worker.py" did not exist on "08fd28978c1480e5ec07f4dc82d9befa24908230"
Unverified Commit f91b42b9 authored by Graham King, committed by GitHub
Browse files

feat(frontend): Reduce Python-side overhead in the vLLM chat path (#6437)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent fd839b8d
......@@ -82,7 +82,8 @@ class FrontendConfig(ConfigBase):
event_plane: str
chat_processor: str
enable_anthropic_api: bool
exp_python_factory: bool
debug_perf: bool
preprocess_workers: int
def validate(self) -> None:
if bool(self.tls_cert_path) ^ bool(self.tls_key_path): # ^ is XOR
......@@ -515,9 +516,10 @@ class FrontendArgGroup(ArgGroup):
)
add_argument(
g,
flag_name="--chat-processor",
flag_name="--dyn-chat-processor",
env_var="DYN_CHAT_PROCESSOR",
default="dynamo",
dest="chat_processor",
help=(
"[EXPERIMENTAL] When set to 'vllm', use local vllm for the pre and post "
"processor."
......@@ -527,11 +529,28 @@ class FrontendArgGroup(ArgGroup):
add_negatable_bool_argument(
g,
flag_name="--exp-python-factory",
env_var="DYN_EXP_PYTHON_FACTORY",
flag_name="--dyn-debug-perf",
env_var="DYN_DEBUG_PERF",
default=False,
dest="debug_perf",
help=(
"[EXPERIMENTAL] Enable Python-based engine factory. When set, engines will be "
"created via a Python callback instead of the default Rust pipeline."
"[EXPERIMENTAL] Enable performance instrumentation for diagnosing preprocessing bottlenecks. "
"Logs per-function timing, request concurrency, and hot-path section durations. "
"'--dyn-chat-processor vllm' only."
),
)
add_argument(
g,
flag_name="--dyn-preprocess-workers",
env_var="DYN_PREPROCESS_WORKERS",
default=0,
dest="preprocess_workers",
help=(
"[EXPERIMENTAL] Number of worker processes for preprocessing and output processing. "
"When > 0, offloads CPU-bound work (tokenization, template rendering, "
"detokenization) to a ProcessPoolExecutor with N workers, each with its "
"own GIL. 0 (default) keeps all processing on the main event loop. '--dyn-chat-processor vllm' only."
),
arg_type=int,
)
......@@ -57,7 +57,7 @@ def setup_engine_factory(
"""
from .vllm_processor import EngineFactory
return EngineFactory(runtime, router_config, config, vllm_flags)
return EngineFactory(runtime, router_config, config, vllm_flags, config.debug_perf)
def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Performance instrumentation for diagnosing frontend preprocessing bottlenecks.
Activated by passing --dyn-debug-perf to dynamo.frontend.
"""
from __future__ import annotations
import logging
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Concurrency gauge
# ---------------------------------------------------------------------------
# Number of request generators currently running.
_active_requests = 0
# Highest value _active_requests has reached since module import.
_peak_requests = 0


def enter_generator() -> int:
    """Record that a request generator started and return the new active count.

    Also raises the recorded peak when the new count exceeds it.

    This function is not synchronized. ``+=`` on a module global is a
    separate load, add and store, and Python does not guarantee that sequence
    to be atomic across threads, even with the GIL. The counts are therefore
    exact only when every caller runs on the same thread.
    NOTE(review): presumably all callers run on the frontend's event loop
    thread -- confirm before calling this from other threads.
    """
    global _active_requests, _peak_requests
    _active_requests += 1
    _peak_requests = max(_peak_requests, _active_requests)
    return _active_requests


def exit_generator() -> int:
    """Record that a request generator finished and return the new active count.

    The count is not clamped: a call without a matching ``enter_generator``
    drives it negative, which makes unbalanced callers visible.
    """
    global _active_requests
    _active_requests -= 1
    return _active_requests


def get_active_requests() -> int:
    """Return the current number of active request generators."""
    return _active_requests


def get_peak_requests() -> int:
    """Return the highest active count recorded so far."""
    return _peak_requests
......@@ -3,6 +3,7 @@
from __future__ import annotations
import os
from collections.abc import Sequence
from dataclasses import dataclass
from typing import Any
......@@ -13,6 +14,7 @@ from vllm.reasoning import ReasoningParser
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser
from vllm.utils.async_utils import AsyncMicrobatchTokenizer
@dataclass
......@@ -24,6 +26,19 @@ class PreprocessResult:
prompt_token_ids: list[int]
_ASYNC_TOKENIZER_POOL: dict[int, AsyncMicrobatchTokenizer] = {}
SKIP_REQUEST_VALIDATION = os.getenv("DYN_VLLM_SKIP_REQUEST_VALIDATION", "1") == "1"
def _get_async_tokenizer(tokenizer: TokenizerLike) -> AsyncMicrobatchTokenizer:
    """Return the pooled AsyncMicrobatchTokenizer for ``tokenizer``.

    The pool is keyed by ``id(tokenizer)``, so the lookup is by object
    identity: the wrapper is created on the first call for a given tokenizer
    object and reused on later calls.
    """
    pool_key = id(tokenizer)
    cached = _ASYNC_TOKENIZER_POOL.get(pool_key)
    if cached is not None:
        return cached

    # First request for this tokenizer object: build the wrapper and keep it.
    fresh = AsyncMicrobatchTokenizer(tokenizer)
    _ASYNC_TOKENIZER_POOL[pool_key] = fresh
    return fresh
def _materialize_assistant_tool_calls(
messages: Sequence[Any],
) -> list[dict[str, Any] | Any]:
......@@ -53,13 +68,33 @@ def _materialize_assistant_tool_calls(
return normalized
async def preprocess_chat_request(
request: dict[str, Any],
def _prepare_request(
request: dict[str, Any] | ChatCompletionRequest,
*,
tokenizer: TokenizerLike,
renderer,
tool_parser_class: type[ToolParser] | None,
) -> PreprocessResult:
) -> tuple[
ChatCompletionRequest, ToolParser | None, dict[str, Any], Any, dict[str, Any]
]:
"""Validate request and build arguments for template rendering.
Returns:
request_for_sampling: Validated ChatCompletionRequest.
tool_parser: Instantiated tool parser, or None.
chat_template_kwargs: Template kwargs (for PreprocessResult).
messages_for_render: Messages to pass as first arg to render_messages.
render_kwargs: Keyword arguments for render_messages / render_messages_async.
"""
if isinstance(request, ChatCompletionRequest):
request_for_sampling = request
elif SKIP_REQUEST_VALIDATION:
# Trusted fast path; caller must provide OpenAI-compatible payload.
request_for_sampling = ChatCompletionRequest.model_construct(**request)
if request_for_sampling.tools and any(
not hasattr(tool, "model_dump") for tool in request_for_sampling.tools
):
request_for_sampling = ChatCompletionRequest.model_validate(request)
else:
request_for_sampling = ChatCompletionRequest.model_validate(request)
tool_parser: ToolParser | None = None
......@@ -88,8 +123,7 @@ async def preprocess_chat_request(
else request_for_sampling.messages
)
_, engine_prompt = await renderer.render_messages_async(
messages_for_render,
render_kwargs = dict(
chat_template=request_for_sampling.chat_template,
chat_template_content_format="auto",
add_generation_prompt=request_for_sampling.add_generation_prompt,
......@@ -100,6 +134,73 @@ async def preprocess_chat_request(
**chat_template_kwargs,
)
return (
request_for_sampling,
tool_parser,
chat_template_kwargs,
messages_for_render,
render_kwargs,
)
async def preprocess_chat_request(
    request: dict[str, Any] | ChatCompletionRequest,
    *,
    tokenizer: TokenizerLike,
    renderer,
    tool_parser_class: type[ToolParser] | None,
) -> PreprocessResult:
    """Render a chat request asynchronously and collect its prompt token ids.

    The request is first passed to ``_prepare_request``. The messages and
    keyword arguments it returns are rendered with
    ``renderer.render_messages_async``. If the rendered engine prompt already
    contains ``"prompt_token_ids"`` they are used directly; otherwise the
    prompt text under ``"prompt"`` is encoded with the pooled async tokenizer
    for ``tokenizer``.

    Returns:
        A PreprocessResult holding the validated request, the tool parser,
        the chat template kwargs, the engine prompt and the token id list.
    """
    prepared = _prepare_request(
        request, tokenizer=tokenizer, tool_parser_class=tool_parser_class
    )
    (
        sampling_request,
        parser,
        template_kwargs,
        chat_messages,
        render_kwargs,
    ) = prepared

    rendered = await renderer.render_messages_async(chat_messages, **render_kwargs)
    _, engine_prompt = rendered

    if "prompt_token_ids" not in engine_prompt:
        # The renderer produced text only, so tokenize it here.
        encoded = await _get_async_tokenizer(tokenizer)(
            engine_prompt["prompt"],
            add_special_tokens=sampling_request.add_special_tokens,
        )
        token_source = encoded.input_ids
    else:
        token_source = engine_prompt["prompt_token_ids"]

    return PreprocessResult(
        request_for_sampling=sampling_request,
        tool_parser=parser,
        chat_template_kwargs=template_kwargs,
        engine_prompt=engine_prompt,
        prompt_token_ids=list(token_source),
    )
def preprocess_chat_request_sync(
request: dict[str, Any] | ChatCompletionRequest,
*,
tokenizer: TokenizerLike,
renderer,
tool_parser_class: type[ToolParser] | None,
) -> PreprocessResult:
"""Sync version of preprocess_chat_request for worker processes."""
(
request_for_sampling,
tool_parser,
chat_template_kwargs,
messages,
render_kwargs,
) = _prepare_request(
request, tokenizer=tokenizer, tool_parser_class=tool_parser_class
)
_, engine_prompt = renderer.render_messages(messages, **render_kwargs)
if "prompt_token_ids" in engine_prompt:
tokens = list(engine_prompt["prompt_token_ids"])
else:
......@@ -141,6 +242,9 @@ class StreamingPostProcessor:
if reasoning_parser_class
else None
)
self._fast_plain_text = (
self.tool_parser is None and self.reasoning_parser is None
)
self._control_markers = tuple(
t for t in getattr(tokenizer, "all_special_tokens", ()) if t
......@@ -191,6 +295,23 @@ class StreamingPostProcessor:
# to text. Re-detokenizing from token_ids can reintroduce stop markers.
delta_text = output.text or ""
if self._fast_plain_text:
if delta_text:
delta: dict[str, Any] = {
"role": "assistant",
"content": delta_text,
}
elif output.finish_reason:
delta = {}
else:
return None
return {
"index": output.index,
"delta": delta,
"finish_reason": output.finish_reason,
"logprobs": output.logprobs,
}
current_text = self.previous_text + delta_text
current_token_ids = self.previous_token_ids + delta_token_ids
......
......@@ -159,7 +159,11 @@ async def launch_workers(args, extra_engine_args_path):
logger.info(f"Creating mocker worker {worker_id + 1}/{args.num_workers}")
# Create a separate DistributedRuntime for this worker (on same event loop)
runtime = DistributedRuntime(loop, args.discovery_backend, args.request_plane)
runtime = DistributedRuntime(
loop,
args.discovery_backend,
args.request_plane,
)
runtimes.append(runtime)
# Determine which engine args file to use
......
......@@ -559,3 +559,75 @@ class TestVllmRendererApi:
"ReasoningParser.is_reasoning_end_streaming signature changed; "
f"expected ['self', 'input_ids', 'delta_ids'], got {end_params}"
)
def test_preprocess_worker_result_picklability(self):
    """Check that a PreprocessWorkerResult survives a pickle round-trip.

    ``_preprocess_worker`` returns this dataclass through a
    ProcessPoolExecutor Future, so a field that cannot be pickled would
    break the pool path.
    """
    import pickle

    from dynamo.frontend.vllm_processor import PreprocessWorkerResult

    stop_conditions = {
        "max_tokens": 100,
        "stop": [],
        "stop_token_ids": [2],
        "min_tokens": 0,
        "ignore_eos": False,
    }
    sampling_options = {
        "n": 1,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.0,
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
        "seed": None,
    }
    output_options = {
        "logprobs": None,
        "prompt_logprobs": None,
        "skip_special_tokens": True,
    }
    dynamo_payload = {
        "model": "test-model",
        "token_ids": [1, 2, 3],
        "stop_conditions": stop_conditions,
        "sampling_options": sampling_options,
        "output_options": output_options,
        "eos_token_ids": [2],
        "annotations": [],
    }
    engine_request = EngineCoreRequest(
        request_id="test-123",
        prompt_token_ids=[1, 2, 3],
        mm_features=None,
        sampling_params=SamplingParams(),
        pooling_params=None,
        eos_token_id=2,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
        prompt_embeds=None,
        client_index=0,
        current_wave=0,
        priority=0,
        trace_headers=None,
    )

    original = PreprocessWorkerResult(
        dynamo_preproc=dynamo_payload,
        tokens=[1, 2, 3],
        vllm_preproc=engine_request,
        # A separate SamplingParams instance, distinct from the one inside
        # the engine request.
        sampling_params=SamplingParams(),
        request_for_sampling={"model": "test-model", "tools": None},
        chat_template_kwargs={"reasoning_effort": None},
    )

    round_tripped = pickle.loads(pickle.dumps(original))

    assert round_tripped.dynamo_preproc == original.dynamo_preproc
    assert round_tripped.tokens == original.tokens
    assert round_tripped.vllm_preproc.request_id == "test-123"
    assert round_tripped.request_for_sampling == original.request_for_sampling
    assert round_tripped.chat_template_kwargs == original.chat_template_kwargs
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment