chore(frontend): Remove the multi-processing vllm processor path (#7005)

Signed-off-by: Graham King <grahamk@nvidia.com>

chore(frontend): Remove the multi-processing vllm processor path (#7005)
Signed-off-by: Graham King <grahamk@nvidia.com>
35b0ce62 · Graham King · GitHub · 8381e28a · 35b0ce62 · 35b0ce62
Unverified commit 35b0ce62, authored Mar 06, 2026 by Graham King; committed by GitHub on Mar 06, 2026.
4 changed files
--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -228,7 +228,7 @@ async def async_main():
    if config.chat_processor == "vllm":
        assert (
            vllm_flags is not None
-        ), "vllm_flags is required when chat_processor is vllm"
+        ), "vllm_flags is required when chat processor is vllm"
        chat_engine_factory = setup_engine_factory(
            runtime, router_config, config, vllm_flags
        ).chat_engine_factory

--- a/components/src/dynamo/frontend/prepost.py
+++ b/components/src/dynamo/frontend/prepost.py
@@ -187,43 +187,6 @@ async def preprocess_chat_request(
    )
-def preprocess_chat_request_sync(
-    request: dict[str, Any] | ChatCompletionRequest,
-    *,
-    tokenizer: TokenizerLike,
-    renderer,
-    tool_parser_class: type[ToolParser] | None,
-) -> PreprocessResult:
-    """Sync version of preprocess_chat_request for worker processes."""
-    (
-        request_for_sampling,
-        tool_parser,
-        chat_template_kwargs,
-        messages,
-        chat_params,
-    ) = _prepare_request(
-        request, tokenizer=tokenizer, tool_parser_class=tool_parser_class
-    )
-    _, engine_prompt = renderer.render_messages(messages, chat_params)
-    if "prompt_token_ids" in engine_prompt:
-        tokens = list(engine_prompt["prompt_token_ids"])
-    else:
-        tokens = tokenizer.encode(
-            engine_prompt["prompt"],
-            add_special_tokens=request_for_sampling.add_special_tokens,
-        )
-    return PreprocessResult(
-        request_for_sampling=request_for_sampling,
-        tool_parser=tool_parser,
-        chat_template_kwargs=chat_template_kwargs,
-        engine_prompt=engine_prompt,
-        prompt_token_ids=tokens,
-    )
 class StreamingPostProcessor:
    def __init__(
        self,

--- a/components/src/dynamo/frontend/vllm_processor.py
+++ b/components/src/dynamo/frontend/vllm_processor.py
@@ -12,9 +12,6 @@ import time
 import uuid
 from argparse import Namespace
 from collections.abc import AsyncGenerator
-from concurrent.futures import ProcessPoolExecutor
-from concurrent.futures import wait as _futures_wait
-from dataclasses import dataclass
 from typing import Any
 from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig
@@ -39,11 +36,7 @@ from dynamo.llm import (
 )
 from dynamo.runtime import Client, DistributedRuntime
-from .prepost import (
+from .prepost import StreamingPostProcessor, preprocess_chat_request
-    StreamingPostProcessor,
-    preprocess_chat_request,
-    preprocess_chat_request_sync,
-)
 logger = logging.getLogger(__name__)
@@ -79,194 +72,6 @@ def map_finish_reason(raw_reason: str | None) -> FinishReason | None:
    return mapped
-# --- Worker process globals (initialized once per process by _init_worker) ---
-_w_input_processor: InputProcessor | None = None
-_w_tokenizer: Any = None
-_w_tool_parser_class: type[ToolParser] | None = None
-class _PreprocessError(Exception):
-    """Raised by _preprocess_worker for user-facing errors (e.g., n!=1)."""
-    def __init__(self, error_dict: dict[str, Any]):
-        self.error_dict = error_dict
-        super().__init__(str(error_dict))
-@dataclass
-class PreprocessWorkerResult:
-    """Picklable return value from the preprocess worker."""
-    dynamo_preproc: dict[str, Any]
-    tokens: list[int]
-    vllm_preproc: EngineCoreRequest
-    sampling_params: SamplingParams
-    request_for_sampling: Any  # ChatCompletionRequest (Pydantic model, picklable)
-    chat_template_kwargs: dict[str, Any]
-def _init_worker(
-    model_path: str,
-    tokenizer_mode: str,
-    config_format: str,
-    load_format: str,
-    tool_parser_name: str | None,
-) -> None:
-    """Initialize a worker process with its own VllmConfig and InputProcessor."""
-    global _w_input_processor, _w_tokenizer, _w_tool_parser_class
-    global _w_reasoning_parser_class
-    model_config = ModelConfig(
-        model=model_path,
-        tokenizer_mode=tokenizer_mode,
-        config_format=config_format,
-    )
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        load_config=LoadConfig(load_format=load_format),
-        cache_config=CacheConfig(),
-    )
-    _w_input_processor = InputProcessor(vllm_config)
-    _w_tokenizer = _w_input_processor.get_tokenizer()
-    if tool_parser_name:
-        _w_tool_parser_class = ToolParserManager.get_tool_parser(tool_parser_name)
-    else:
-        _w_tool_parser_class = None
-def _worker_warmup() -> bool:
-    """Dummy task to ensure worker process is fully initialized."""
-    return True
-def _preprocess_worker(
-    request: dict[str, Any],
-    request_id: str,
-    model_name: str,
-) -> PreprocessWorkerResult:
-    """Preprocess a request in a worker process and return a picklable result."""
-    assert _w_input_processor is not None
-    pre = preprocess_chat_request_sync(
-        request,
-        tokenizer=_w_tokenizer,
-        renderer=_w_input_processor.renderer,
-        tool_parser_class=_w_tool_parser_class,
-    )
-    request_for_sampling = pre.request_for_sampling
-    engine_prompt = pre.engine_prompt
-    tokens = pre.prompt_token_ids
-    if request_for_sampling.max_completion_tokens is not None:
-        max_tokens = request_for_sampling.max_completion_tokens
-    elif request_for_sampling.max_tokens is not None:
-        max_tokens = request_for_sampling.max_tokens
-    else:
-        max_tokens = None
-    sampling_params = SamplingParams(
-        output_kind=RequestOutputKind.DELTA,
-        max_tokens=max_tokens,
-    )
-    for k, v in _w_input_processor.generation_config_fields.items():
-        if hasattr(sampling_params, k):
-            setattr(sampling_params, k, v)
-    sampling_fields = (
-        set(getattr(SamplingParams, "__annotations__", ()))
-        & set(type(request_for_sampling).model_fields)
-    ) - {"max_tokens", "logprobs", "output_kind"}
-    for k in sorted(sampling_fields):
-        v = getattr(request_for_sampling, k, None)
-        if v is not None:
-            setattr(sampling_params, k, v)
-    logprobs = request_for_sampling.logprobs
-    top_logprobs = request_for_sampling.top_logprobs
-    if logprobs is True:
-        sampling_params.logprobs = top_logprobs or 1
-    elif isinstance(logprobs, int) and not isinstance(logprobs, bool):
-        sampling_params.logprobs = logprobs
-    elif top_logprobs not in (None, 0):
-        sampling_params.logprobs = top_logprobs
-    prompt_inputs = TokensPrompt(prompt_token_ids=tokens)
-    if "multi_modal_data" in engine_prompt:
-        prompt_inputs["multi_modal_data"] = engine_prompt["multi_modal_data"]
-    if "multi_modal_uuids" in engine_prompt:
-        prompt_inputs["multi_modal_uuids"] = engine_prompt["multi_modal_uuids"]
-    if request_for_sampling.cache_salt is not None:
-        prompt_inputs["cache_salt"] = request_for_sampling.cache_salt
-    if request_for_sampling.mm_processor_kwargs is not None:
-        prompt_inputs["mm_processor_kwargs"] = request_for_sampling.mm_processor_kwargs
-    vllm_preproc: EngineCoreRequest = _w_input_processor.process_inputs(
-        request_id,
-        prompt_inputs,
-        sampling_params,
-    )
-    InputProcessor.assign_request_id(vllm_preproc)
-    sp = vllm_preproc.sampling_params
-    if sp.n != 1:
-        raise _PreprocessError(
-            {
-                "error": {
-                    "message": (
-                        f"Unsupported value: 'n={sp.n}'. "
-                        "This endpoint currently supports only n=1."
-                    ),
-                    "type": "invalid_request_error",
-                    "param": "n",
-                    "code": "unsupported_value",
-                }
-            }
-        )
-    dynamo_preproc = {
-        "model": model_name,
-        "token_ids": tokens,
-        "stop_conditions": {
-            "max_tokens": sp.max_tokens,
-            "stop": sp.stop,
-            "stop_token_ids": sp.stop_token_ids,
-            "min_tokens": sp.min_tokens,
-            "ignore_eos": sp.ignore_eos,
-        },
-        "sampling_options": {
-            "n": sp.n,
-            "presence_penalty": sp.presence_penalty,
-            "frequency_penalty": sp.frequency_penalty,
-            "repetition_penalty": sp.repetition_penalty,
-            "temperature": sp.temperature,
-            "top_p": sp.top_p,
-            "top_k": sp.top_k,
-            "min_p": sp.min_p,
-            "seed": sp.seed,
-        },
-        "output_options": {
-            "logprobs": sp.logprobs,
-            "prompt_logprobs": sp.prompt_logprobs,
-            "skip_special_tokens": sp.skip_special_tokens,
-        },
-        "eos_token_ids": [vllm_preproc.eos_token_id]
-        if vllm_preproc.eos_token_id is not None
-        else [],
-        "annotations": [],
-    }
-    return PreprocessWorkerResult(
-        dynamo_preproc=dynamo_preproc,
-        tokens=tokens,
-        vllm_preproc=vllm_preproc,
-        sampling_params=sampling_params,
-        request_for_sampling=request_for_sampling,
-        chat_template_kwargs=pre.chat_template_kwargs,
-    )
 class VllmProcessor:
    def __init__(
        self,
@@ -277,8 +82,6 @@ class VllmProcessor:
        tool_parser_class: type[ToolParser] | None,
        reasoning_parser_class: type[ReasoningParser] | None,
        debug_perf: bool = False,
-        preprocess_pool: ProcessPoolExecutor | None = None,
-        preprocess_workers: int = 0,
    ):
        self.tokenizer = tokenizer
        self.input_processor = input_processor
@@ -288,15 +91,6 @@ class VllmProcessor:
        self.tool_parser_class = tool_parser_class
        self.reasoning_parser_class = reasoning_parser_class
        self.debug_perf = debug_perf
-        self.preprocess_pool = preprocess_pool
-        if preprocess_pool is not None:
-            # Allow a small buffer beyond the worker count so the pool's
-            # internal queue always has work ready when a worker finishes.
-            self._worker_semaphore: asyncio.Semaphore | None = asyncio.Semaphore(
-                preprocess_workers + 2
-            )
-        else:
-            self._worker_semaphore = None
    # Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but
    # it has a lot of fields.
@@ -306,7 +100,7 @@ class VllmProcessor:
    ) -> AsyncGenerator[dict[str, Any], None]:
        """
        Run a single request through the engine. Does pre and post processing on this machine, delegates
-        model inference to a worker using the router.
+        model inference to a backend using the router.
        """
        # ** VllmProcessor.generator called: {'messages': [{'role': 'user', 'content': 'What is the capital of Tuvalu?'}], 'model': '/home/grahamk/llms/Qwen3-0.6B', 'max_completion_tokens': 1000, 'stream': False}
@@ -319,14 +113,8 @@ class VllmProcessor:
            logger.info("[perf] generator enter: active_requests=%d", active)
        try:
-            if self.preprocess_pool is None:
+            async for item in self._generator_inner(request):
-                # Single process
+                yield item
-                async for item in self._generator_inner(request):
-                    yield item
-            else:
-                # Multi process
-                async for item in self._generator_inner_pool(request):
-                    yield item
        finally:
            if self.debug_perf:
                active = exit_generator()
@@ -525,7 +313,6 @@ class VllmProcessor:
        vllm_preproc: EngineCoreRequest,
        post: StreamingPostProcessor,
    ) -> AsyncGenerator[dict[str, Any], None]:
-        """Shared streaming logic for both single-process and pool paths."""
        self.output_processor.add_request(vllm_preproc, None)
        token_count = 0
@@ -632,77 +419,6 @@ class VllmProcessor:
                    post_proc_total_ms / token_count,
                )
-    async def _generator_inner_pool(
-        self, request: dict[str, Any]
-    ) -> AsyncGenerator[dict[str, Any], None]:
-        """Process a request using the worker pool.
-        Phase 1: Preprocess in a worker process (semaphore held).
-        Phase 2: Remote inference via router (no worker held).
-        Phase 3: Post-process tokens in the main process.
-        """
-        request_id = random_uuid()
-        # --- Phase 1: Preprocess (semaphore held) ---
-        try:
-            assert self._worker_semaphore is not None
-            async with self._worker_semaphore:
-                assert self.preprocess_pool is not None
-                future = self.preprocess_pool.submit(
-                    _preprocess_worker, request, request_id, request["model"]
-                )
-                preproc_result: PreprocessWorkerResult = await asyncio.wrap_future(
-                    future
-                )
-            # Semaphore + worker released here
-        except _PreprocessError as exc:
-            yield exc.error_dict
-            return
-        except Exception as exc:
-            logger.exception("Worker preprocessing failed for request %s", request_id)
-            yield {
-                "error": {
-                    "message": f"Worker error: {exc}",
-                    "type": "internal_error",
-                }
-            }
-            return
-        # --- Between phases: reconstruct main-process objects ---
-        dynamo_preproc = preproc_result.dynamo_preproc
-        tokens = preproc_result.tokens
-        vllm_preproc = preproc_result.vllm_preproc
-        sampling_params = preproc_result.sampling_params
-        request_for_sampling = preproc_result.request_for_sampling
-        tool_parser = None
-        if (
-            self.tool_parser_class
-            and request_for_sampling.tools
-            and request_for_sampling.tool_choice != "none"
-        ):
-            tool_parser = self.tool_parser_class(self.tokenizer)
-        post = StreamingPostProcessor(
-            tokenizer=self.tokenizer,
-            request_for_sampling=request_for_sampling,
-            sampling_params=sampling_params,
-            prompt_token_ids=tokens,
-            tool_parser=tool_parser,
-            reasoning_parser_class=self.reasoning_parser_class,
-            chat_template_kwargs=preproc_result.chat_template_kwargs,
-        )
-        async for item in self._generate_and_stream(
-            request_id,
-            request,
-            dynamo_preproc,
-            tokens,
-            vllm_preproc,
-            post,
-        ):
-            yield item
 class EngineFactory:
    def __init__(
@@ -713,6 +429,11 @@ class EngineFactory:
        flags: Namespace,
        debug_perf: bool = False,
    ):
+        if config.preprocess_workers != 0:
+            raise RuntimeError(
+                "preprocess_workers > 0 is not supported by vllm preprocessor"
+            )
        self.runtime = runtime
        self.router_config = router_config
        self.config = config
@@ -808,46 +529,6 @@ class EngineFactory:
                router_mode=self.router_config.router_mode
            )
-        preprocess_pool = None
-        preprocess_workers = self.config.preprocess_workers
-        if preprocess_workers > 0:
-            logger.info(
-                "Creating preprocess worker pool with %d workers for model %s",
-                preprocess_workers,
-                source_path,
-            )
-            preprocess_pool = ProcessPoolExecutor(
-                max_workers=preprocess_workers,
-                initializer=_init_worker,
-                initargs=(
-                    source_path,
-                    tokenizer_mode,
-                    config_format,
-                    load_format,
-                    tool_parser_name,
-                ),
-            )
-            # Warm up all workers to ensure initialization completes
-            futures = [
-                preprocess_pool.submit(_worker_warmup)
-                for _ in range(preprocess_workers)
-            ]
-            done, not_done = _futures_wait(futures, timeout=120)
-            if not_done:
-                for f in not_done:
-                    f.cancel()
-                preprocess_pool.shutdown(wait=False, cancel_futures=True)
-                raise RuntimeError(
-                    "Timed out waiting for preprocess worker pool warmup"
-                )
-            try:
-                for f in done:
-                    f.result()  # Raises if initializer failed
-            except Exception:
-                preprocess_pool.shutdown(wait=False, cancel_futures=True)
-                raise
-            logger.info("Preprocess worker pool ready (%d workers)", preprocess_workers)
        gen = VllmProcessor(
            tokenizer,
            input_processor,
@@ -856,8 +537,6 @@ class EngineFactory:
            tool_parser_class,
            reasoning_parser_class,
            debug_perf=self.debug_perf,
-            preprocess_pool=preprocess_pool,
-            preprocess_workers=preprocess_workers,
        )
        return PythonAsyncEngine(gen.generator, loop)
--- a/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
@@ -566,75 +566,3 @@ class TestVllmRendererApi:
            "ReasoningParser.is_reasoning_end_streaming signature changed; "
            f"expected ['self', 'input_ids', 'delta_ids'], got {end_params}"
        )
-    def test_preprocess_worker_result_picklability(self):
-        """Verify PreprocessWorkerResult survives pickle round-trip.
-        _preprocess_worker returns this dataclass via a ProcessPoolExecutor
-        Future. If any field becomes unpicklable, the pool path breaks.
-        """
-        import pickle
-        from dynamo.frontend.vllm_processor import PreprocessWorkerResult
-        result = PreprocessWorkerResult(
-            dynamo_preproc={
-                "model": "test-model",
-                "token_ids": [1, 2, 3],
-                "stop_conditions": {
-                    "max_tokens": 100,
-                    "stop": [],
-                    "stop_token_ids": [2],
-                    "min_tokens": 0,
-                    "ignore_eos": False,
-                },
-                "sampling_options": {
-                    "n": 1,
-                    "presence_penalty": 0.0,
-                    "frequency_penalty": 0.0,
-                    "repetition_penalty": 1.0,
-                    "temperature": 1.0,
-                    "top_p": 1.0,
-                    "top_k": 0,
-                    "min_p": 0.0,
-                    "seed": None,
-                },
-                "output_options": {
-                    "logprobs": None,
-                    "prompt_logprobs": None,
-                    "skip_special_tokens": True,
-                },
-                "eos_token_ids": [2],
-                "annotations": [],
-            },
-            tokens=[1, 2, 3],
-            vllm_preproc=EngineCoreRequest(
-                request_id="test-123",
-                prompt_token_ids=[1, 2, 3],
-                mm_features=None,
-                sampling_params=SamplingParams(),
-                pooling_params=None,
-                eos_token_id=2,
-                arrival_time=0.0,
-                lora_request=None,
-                cache_salt=None,
-                data_parallel_rank=None,
-                prompt_embeds=None,
-                client_index=0,
-                current_wave=0,
-                priority=0,
-                trace_headers=None,
-            ),
-            sampling_params=SamplingParams(),
-            request_for_sampling={"model": "test-model", "tools": None},
-            chat_template_kwargs={"reasoning_effort": None},
-        )
-        data = pickle.dumps(result)
-        restored = pickle.loads(data)
-        assert restored.dynamo_preproc == result.dynamo_preproc
-        assert restored.tokens == result.tokens
-        assert restored.vllm_preproc.request_id == "test-123"
-        assert restored.request_for_sampling == result.request_for_sampling
-        assert restored.chat_template_kwargs == result.chat_template_kwargs