feat(frontend): Reduce Python-side overhead in the vLLM chat path (#6437)

Signed-off-by: Graham King <grahamk@nvidia.com>

feat(frontend): Reduce Python-side overhead in the vLLM chat path (#6437)
Signed-off-by: Graham King <grahamk@nvidia.com>
f91b42b9 · Graham King · GitHub · fd839b8d · f91b42b9 · f91b42b9
Unverified Commit f91b42b9 authored Feb 20, 2026 by Graham King Committed by GitHub Feb 20, 2026
7 changed files
--- a/components/src/dynamo/frontend/frontend_args.py
+++ b/components/src/dynamo/frontend/frontend_args.py
@@ -82,7 +82,8 @@ class FrontendConfig(ConfigBase):
    event_plane: str
    chat_processor: str
    enable_anthropic_api: bool
-    exp_python_factory: bool
+    debug_perf: bool
+    preprocess_workers: int

    def validate(self) -> None:
        if bool(self.tls_cert_path) ^ bool(self.tls_key_path):  # ^ is XOR
@@ -515,9 +516,10 @@ class FrontendArgGroup(ArgGroup):
        )
        add_argument(
            g,
-            flag_name="--chat-processor",
+            flag_name="--dyn-chat-processor",
            env_var="DYN_CHAT_PROCESSOR",
            default="dynamo",
+            dest="chat_processor",
            help=(
                "[EXPERIMENTAL] When set to 'vllm', use local vllm for the pre and post "
                "processor."
@@ -527,11 +529,28 @@ class FrontendArgGroup(ArgGroup):

        add_negatable_bool_argument(
            g,
-            flag_name="--exp-python-factory",
-            env_var="DYN_EXP_PYTHON_FACTORY",
+            flag_name="--dyn-debug-perf",
+            env_var="DYN_DEBUG_PERF",
            default=False,
+            dest="debug_perf",
            help=(
-                "[EXPERIMENTAL] Enable Python-based engine factory. When set, engines will be "
-                "created via a Python callback instead of the default Rust pipeline."
+                "[EXPERIMENTAL] Enable performance instrumentation for diagnosing preprocessing bottlenecks. "
+                "Logs per-function timing, request concurrency, and hot-path section durations. "
+                "'--dyn-chat-processor vllm' only."
            ),
        )
+
+        add_argument(
+            g,
+            flag_name="--dyn-preprocess-workers",
+            env_var="DYN_PREPROCESS_WORKERS",
+            default=0,
+            dest="preprocess_workers",
+            help=(
+                "[EXPERIMENTAL] Number of worker processes for preprocessing and output processing. "
+                "When > 0, offloads CPU-bound work (tokenization, template rendering, "
+                "detokenization) to a ProcessPoolExecutor with N workers, each with its "
+                "own GIL. 0 (default) keeps all processing on the main event loop. '--dyn-chat-processor vllm' only."
+            ),
+            arg_type=int,
+        )
--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -57,7 +57,7 @@ def setup_engine_factory(
    """
    from .vllm_processor import EngineFactory

-    return EngineFactory(runtime, router_config, config, vllm_flags)
+    return EngineFactory(runtime, router_config, config, vllm_flags, config.debug_perf)


 def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:

--- a/components/src/dynamo/frontend/perf_instrumentation.py
+++ b/components/src/dynamo/frontend/perf_instrumentation.py
+#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+
+"""
+Performance instrumentation for diagnosing frontend preprocessing bottlenecks.
+
+Activated by passing --dyn-debug-perf to dynamo.frontend.
+"""
+
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Concurrency gauge
+# ---------------------------------------------------------------------------
+
+# Number of requests currently counted as active; changed only by
+# enter_generator() / exit_generator() in the code shown here.
+_active_requests = 0
+# Highest value _active_requests has reached; no function shown here resets it.
+_peak_requests = 0
+
+
+def enter_generator() -> int:
+    """Increment the active request count and return the new count.
+
+    Also raises the recorded peak when the new count exceeds it.
+
+    NOTE(review): there is no lock here. ``_active_requests += 1`` is a
+    read-modify-write on a global, and the GIL alone does not make that
+    atomic across threads. This is only safe when every caller runs on the
+    same thread (e.g. a single asyncio event loop) -- TODO confirm against
+    the call sites.
+    """
+    global _active_requests, _peak_requests
+    _active_requests += 1
+    count = _active_requests
+    if count > _peak_requests:
+        _peak_requests = count
+    return count
+
+
+def exit_generator() -> int:
+    """Decrement the active request count and return the new count.
+
+    The recorded peak is left untouched. Nothing here stops the count from
+    going below zero; presumably each call is paired with an earlier
+    enter_generator() -- TODO confirm at the call sites.
+    """
+    global _active_requests
+    _active_requests -= 1
+    return _active_requests
+
+
+def get_active_requests() -> int:
+    """Return the current value of the active request counter."""
+    return _active_requests
+
+
+def get_peak_requests() -> int:
+    """Return the highest active request count recorded so far by enter_generator()."""
+    return _peak_requests
--- a/components/src/dynamo/frontend/prepost.py
+++ b/components/src/dynamo/frontend/prepost.py
@@ -3,6 +3,7 @@

 from __future__ import annotations

+import os
 from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Any
@@ -13,6 +14,7 @@ from vllm.reasoning import ReasoningParser
 from vllm.sampling_params import SamplingParams
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser
+from vllm.utils.async_utils import AsyncMicrobatchTokenizer


 @dataclass
@@ -24,6 +26,19 @@ class PreprocessResult:
    prompt_token_ids: list[int]


+# Cache of async tokenizer wrappers, keyed by id() of the wrapped tokenizer.
+_ASYNC_TOKENIZER_POOL: dict[int, AsyncMicrobatchTokenizer] = {}
+# Read once at import time. True unless DYN_VLLM_SKIP_REQUEST_VALIDATION is set
+# to something other than "1", i.e. validation is skipped by default. When
+# True, dict requests are first built with model_construct() in
+# _prepare_request rather than model_validate().
+SKIP_REQUEST_VALIDATION = os.getenv("DYN_VLLM_SKIP_REQUEST_VALIDATION", "1") == "1"
+
+
+def _get_async_tokenizer(tokenizer: TokenizerLike) -> AsyncMicrobatchTokenizer:
+    """Return the cached AsyncMicrobatchTokenizer for ``tokenizer``.
+
+    A wrapper is created and stored on first use, so each distinct tokenizer
+    object gets one wrapper. Entries are keyed by ``id(tokenizer)`` and are
+    not evicted by this function.
+
+    NOTE(review): id() values can be reused once an object is freed. This
+    relies on the tokenizer staying alive while its entry exists; presumably
+    the wrapper keeps a reference to it -- TODO confirm.
+    """
+    key = id(tokenizer)
+    async_tokenizer = _ASYNC_TOKENIZER_POOL.get(key)
+    if async_tokenizer is None:
+        async_tokenizer = AsyncMicrobatchTokenizer(tokenizer)
+        _ASYNC_TOKENIZER_POOL[key] = async_tokenizer
+    return async_tokenizer
+
+
 def _materialize_assistant_tool_calls(
    messages: Sequence[Any],
 ) -> list[dict[str, Any] | Any]:
@@ -53,13 +68,33 @@ def _materialize_assistant_tool_calls(
    return normalized


-async def preprocess_chat_request(
-    request: dict[str, Any],
+def _prepare_request(
+    request: dict[str, Any] | ChatCompletionRequest,
    *,
    tokenizer: TokenizerLike,
-    renderer,
    tool_parser_class: type[ToolParser] | None,
-) -> PreprocessResult:
+) -> tuple[
+    ChatCompletionRequest, ToolParser | None, dict[str, Any], Any, dict[str, Any]
+]:
+    """Validate request and build arguments for template rendering.
+
+    Returns:
+        request_for_sampling: Validated ChatCompletionRequest.
+        tool_parser: Instantiated tool parser, or None.
+        chat_template_kwargs: Template kwargs (for PreprocessResult).
+        messages_for_render: Messages to pass as first arg to render_messages.
+        render_kwargs: Keyword arguments for render_messages / render_messages_async.
+    """
+    if isinstance(request, ChatCompletionRequest):
+        request_for_sampling = request
+    elif SKIP_REQUEST_VALIDATION:
+        # Trusted fast path; caller must provide OpenAI-compatible payload.
+        request_for_sampling = ChatCompletionRequest.model_construct(**request)
+        if request_for_sampling.tools and any(
+            not hasattr(tool, "model_dump") for tool in request_for_sampling.tools
+        ):
+            request_for_sampling = ChatCompletionRequest.model_validate(request)
+    else:
        request_for_sampling = ChatCompletionRequest.model_validate(request)

    tool_parser: ToolParser | None = None
@@ -88,8 +123,7 @@ async def preprocess_chat_request(
        else request_for_sampling.messages
    )

-    _, engine_prompt = await renderer.render_messages_async(
-        messages_for_render,
+    render_kwargs = dict(
        chat_template=request_for_sampling.chat_template,
        chat_template_content_format="auto",
        add_generation_prompt=request_for_sampling.add_generation_prompt,
@@ -100,6 +134,73 @@ async def preprocess_chat_request(
        **chat_template_kwargs,
    )

+    return (
+        request_for_sampling,
+        tool_parser,
+        chat_template_kwargs,
+        messages_for_render,
+        render_kwargs,
+    )
+
+
+async def preprocess_chat_request(
+    request: dict[str, Any] | ChatCompletionRequest,
+    *,
+    tokenizer: TokenizerLike,
+    renderer,
+    tool_parser_class: type[ToolParser] | None,
+) -> PreprocessResult:
+    """Async variant: render a chat request and produce its prompt token ids.
+
+    Args:
+        request: Raw request dict or an already-built ChatCompletionRequest.
+        tokenizer: Tokenizer used when the rendered prompt carries no token ids.
+        renderer: Object providing ``render_messages_async``.
+        tool_parser_class: Tool parser class forwarded to _prepare_request, or None.
+
+    Returns:
+        PreprocessResult holding the request, tool parser, template kwargs,
+        rendered engine prompt and prompt token ids.
+    """
+    (
+        request_for_sampling,
+        tool_parser,
+        chat_template_kwargs,
+        messages,
+        render_kwargs,
+    ) = _prepare_request(
+        request, tokenizer=tokenizer, tool_parser_class=tool_parser_class
+    )
+
+    # Only the second element of the renderer's result is used.
+    _, engine_prompt = await renderer.render_messages_async(messages, **render_kwargs)
+
+    # Use token ids from the rendered prompt when present; otherwise tokenize
+    # the prompt text through the cached async tokenizer wrapper.
+    if "prompt_token_ids" in engine_prompt:
+        tokens = list(engine_prompt["prompt_token_ids"])
+    else:
+        async_tokenizer = _get_async_tokenizer(tokenizer)
+        encoded = await async_tokenizer(
+            engine_prompt["prompt"],
+            add_special_tokens=request_for_sampling.add_special_tokens,
+        )
+        tokens = list(encoded.input_ids)
+
+    return PreprocessResult(
+        request_for_sampling=request_for_sampling,
+        tool_parser=tool_parser,
+        chat_template_kwargs=chat_template_kwargs,
+        engine_prompt=engine_prompt,
+        prompt_token_ids=tokens,
+    )
+
+
+def preprocess_chat_request_sync(
+    request: dict[str, Any] | ChatCompletionRequest,
+    *,
+    tokenizer: TokenizerLike,
+    renderer,
+    tool_parser_class: type[ToolParser] | None,
+) -> PreprocessResult:
+    """Sync version of preprocess_chat_request for worker processes."""
+    (
+        request_for_sampling,
+        tool_parser,
+        chat_template_kwargs,
+        messages,
+        render_kwargs,
+    ) = _prepare_request(
+        request, tokenizer=tokenizer, tool_parser_class=tool_parser_class
+    )
+
+    _, engine_prompt = renderer.render_messages(messages, **render_kwargs)
+
    if "prompt_token_ids" in engine_prompt:
        tokens = list(engine_prompt["prompt_token_ids"])
    else:
@@ -141,6 +242,9 @@ class StreamingPostProcessor:
            if reasoning_parser_class
            else None
        )
+        self._fast_plain_text = (
+            self.tool_parser is None and self.reasoning_parser is None
+        )

        self._control_markers = tuple(
            t for t in getattr(tokenizer, "all_special_tokens", ()) if t
@@ -191,6 +295,23 @@ class StreamingPostProcessor:
        # to text. Re-detokenizing from token_ids can reintroduce stop markers.
        delta_text = output.text or ""

+        if self._fast_plain_text:
+            if delta_text:
+                delta: dict[str, Any] = {
+                    "role": "assistant",
+                    "content": delta_text,
+                }
+            elif output.finish_reason:
+                delta = {}
+            else:
+                return None
+            return {
+                "index": output.index,
+                "delta": delta,
+                "finish_reason": output.finish_reason,
+                "logprobs": output.logprobs,
+            }
+
        current_text = self.previous_text + delta_text
        current_token_ids = self.previous_token_ids + delta_token_ids


--- a/components/src/dynamo/frontend/vllm_processor.py
+++ b/components/src/dynamo/frontend/vllm_processor.py
--- a/components/src/dynamo/mocker/main.py
+++ b/components/src/dynamo/mocker/main.py
@@ -159,7 +159,11 @@ async def launch_workers(args, extra_engine_args_path):
        logger.info(f"Creating mocker worker {worker_id + 1}/{args.num_workers}")

        # Create a separate DistributedRuntime for this worker (on same event loop)
-        runtime = DistributedRuntime(loop, args.discovery_backend, args.request_plane)
+        runtime = DistributedRuntime(
+            loop,
+            args.discovery_backend,
+            args.request_plane,
+        )
        runtimes.append(runtime)

        # Determine which engine args file to use

--- a/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
@@ -559,3 +559,75 @@ class TestVllmRendererApi:
            "ReasoningParser.is_reasoning_end_streaming signature changed; "
            f"expected ['self', 'input_ids', 'delta_ids'], got {end_params}"
        )
+
+    def test_preprocess_worker_result_picklability(self):
+        """Verify PreprocessWorkerResult survives pickle round-trip.
+
+        _preprocess_worker returns this dataclass via a ProcessPoolExecutor
+        Future. If any field becomes unpicklable, the pool path breaks.
+        """
+        import pickle
+
+        from dynamo.frontend.vllm_processor import PreprocessWorkerResult
+
+        # NOTE(review): EngineCoreRequest and SamplingParams are not imported in
+        # this method; presumably they are imported at module level -- confirm.
+        result = PreprocessWorkerResult(
+            dynamo_preproc={
+                "model": "test-model",
+                "token_ids": [1, 2, 3],
+                "stop_conditions": {
+                    "max_tokens": 100,
+                    "stop": [],
+                    "stop_token_ids": [2],
+                    "min_tokens": 0,
+                    "ignore_eos": False,
+                },
+                "sampling_options": {
+                    "n": 1,
+                    "presence_penalty": 0.0,
+                    "frequency_penalty": 0.0,
+                    "repetition_penalty": 1.0,
+                    "temperature": 1.0,
+                    "top_p": 1.0,
+                    "top_k": 0,
+                    "min_p": 0.0,
+                    "seed": None,
+                },
+                "output_options": {
+                    "logprobs": None,
+                    "prompt_logprobs": None,
+                    "skip_special_tokens": True,
+                },
+                "eos_token_ids": [2],
+                "annotations": [],
+            },
+            tokens=[1, 2, 3],
+            vllm_preproc=EngineCoreRequest(
+                request_id="test-123",
+                prompt_token_ids=[1, 2, 3],
+                mm_features=None,
+                sampling_params=SamplingParams(),
+                pooling_params=None,
+                eos_token_id=2,
+                arrival_time=0.0,
+                lora_request=None,
+                cache_salt=None,
+                data_parallel_rank=None,
+                prompt_embeds=None,
+                client_index=0,
+                current_wave=0,
+                priority=0,
+                trace_headers=None,
+            ),
+            sampling_params=SamplingParams(),
+            request_for_sampling={"model": "test-model", "tools": None},
+            chat_template_kwargs={"reasoning_effort": None},
+        )
+
+        # Round-trip through pickle, as happens when a result crosses the
+        # process boundary.
+        data = pickle.dumps(result)
+        restored = pickle.loads(data)
+
+        # Only these fields are compared; sampling_params and the other
+        # vllm_preproc attributes are not asserted on.
+        assert restored.dynamo_preproc == result.dynamo_preproc
+        assert restored.tokens == result.tokens
+        assert restored.vllm_preproc.request_id == "test-123"
+        assert restored.request_for_sampling == result.request_for_sampling
+        assert restored.chat_template_kwargs == result.chat_template_kwargs