[Renderer] Deprecate code paths for old input processing (#34775)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Renderer] Deprecate code paths for old input processing (#34775)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
a766b303 · Cyrus Leung · GitHub · 1faa8cb7 · a766b303 · a766b303
Unverified Commit a766b303 authored Feb 18, 2026 by Cyrus Leung Committed by GitHub Feb 18, 2026
6 changed files
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -155,3 +155,4 @@ The interface for the model/module may change during vLLM's development. If you
    - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
    - `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
+    - The `prompt` parameter of `Platform.validate_request` is deprecated and will be removed in v0.18.0. The new signature is `validate_request(cls, processed_inputs, params)`.
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -519,7 +519,6 @@ class LLM:
            ),
            params=seq_params,
            lora_requests=seq_lora_requests,
-            tokenization_kwargs=tokenization_kwargs,
            priorities=seq_priority,
        )

@@ -1813,7 +1812,6 @@ class LLM:
            params=seq_params,
            use_tqdm=use_tqdm,
            lora_requests=seq_lora_requests,
-            tokenization_kwargs=tokenization_kwargs,
            priorities=seq_priority,
        )

@@ -1872,7 +1870,6 @@ class LLM:
            params=seq_params,
            lora_requests=seq_lora_requests,
            use_tqdm=use_tqdm,
-            tokenization_kwargs=tokenization_kwargs,
        )

    def _render_and_run_requests(
@@ -1881,7 +1878,6 @@ class LLM:
        params: Sequence[SamplingParams | PoolingParams],
        *,
        lora_requests: Sequence[LoRARequest | None] | None = None,
-        tokenization_kwargs: dict[str, Any] | None = None,
        priorities: Sequence[int] | None = None,
        use_tqdm: bool | Callable[..., tqdm] = True,
    ):
@@ -1899,7 +1895,6 @@ class LLM:
            prompts=prompts,
            params=params,
            lora_requests=lora_requests,
-            tokenization_kwargs=tokenization_kwargs,
            priorities=priorities,
        )

@@ -1911,7 +1906,6 @@ class LLM:
        params: Sequence[SamplingParams | PoolingParams],
        *,
        lora_requests: Sequence[LoRARequest | None] | None = None,
-        tokenization_kwargs: dict[str, Any] | None = None,
        priorities: Sequence[int] | None = None,
    ) -> list[str]:
        added_request_ids: list[str] = []
@@ -1922,7 +1916,6 @@ class LLM:
                    prompt,
                    params[i],
                    lora_request=None if lora_requests is None else lora_requests[i],
-                    tokenization_kwargs=tokenization_kwargs,
                    priority=0 if priorities is None else priorities[i],
                )
                added_request_ids.append(request_id)
@@ -1938,7 +1931,6 @@ class LLM:
        prompt: ProcessorInputs,
        params: SamplingParams | PoolingParams,
        lora_request: LoRARequest | None = None,
-        tokenization_kwargs: dict[str, Any] | None = None,
        priority: int = 0,
    ) -> str:
        if isinstance(params, SamplingParams):
@@ -1947,27 +1939,11 @@ class LLM:

        request_id = str(next(self.request_counter))

-        if params.truncate_prompt_tokens is not None:
-            params_type = type(params).__name__
-            warnings.warn(
-                f"The `truncate_prompt_tokens` parameter in `{params_type}` "
-                "is deprecated and will be removed in v0.16. "
-                "Please pass it via `tokenization_kwargs` instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
-            )
-
        return self.llm_engine.add_request(
            request_id,
            prompt,
            params,
            lora_request=lora_request,
-            tokenization_kwargs=tokenization_kwargs,
            priority=priority,
        )


--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -17,7 +17,7 @@ if TYPE_CHECKING:
    from torch.distributed import PrefixStore, ProcessGroup

    from vllm.config import VllmConfig
-    from vllm.inputs import ProcessorInputs, PromptType
+    from vllm.inputs import ProcessorInputs
    from vllm.pooling_params import PoolingParams
    from vllm.sampling_params import SamplingParams
    from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -568,9 +568,8 @@ class Platform:
    @classmethod
    def validate_request(
        cls,
-        prompt: "PromptType | ProcessorInputs",
-        params: "SamplingParams | PoolingParams",
        processed_inputs: "ProcessorInputs",
+        params: "SamplingParams | PoolingParams",
    ) -> None:
        """Raises if this request is unsupported on this platform"""


--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.outputs import STREAM_FINISHED, PoolingRequestOutput, RequestOutput
 from vllm.plugins.io_processors import get_io_processor
 from vllm.pooling_params import PoolingParams
-from vllm.renderers import merge_kwargs, renderer_from_config
+from vllm.renderers import renderer_from_config
 from vllm.renderers.inputs.preprocess import extract_prompt_components
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.tasks import SupportedTask
@@ -319,21 +319,6 @@ class AsyncLLM(EngineClient):
                "prompt logprobs"
            )

-        if params.truncate_prompt_tokens is not None:
-            params_type = type(params).__name__
-            warnings.warn(
-                f"The `truncate_prompt_tokens` parameter in `{params_type}` "
-                "is deprecated and will be removed in v0.16. "
-                "Please pass it via `tokenization_kwargs` instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
-            )
-
        if isinstance(prompt, AsyncGenerator):
            if reasoning_ended is not None:
                raise NotImplementedError
@@ -353,6 +338,12 @@ class AsyncLLM(EngineClient):

        # Convert Input --> Request.
        if isinstance(prompt, EngineCoreRequest):
+            logger.warning_once(
+                "Passing EngineCoreRequest to AsyncLLM.generate() and .add_request() "
+                "is deprecated and will be removed in v0.18. You should instead pass "
+                "the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
+            )
+
            request = prompt
            if request_id != request.request_id:
                logger.warning_once(

--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import time
+import warnings
 from collections.abc import Mapping
 from typing import Any, Literal

@@ -28,6 +29,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.tasks import POOLING_TASKS, SupportedTask
 from vllm.tokenizers import TokenizerLike
 from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
+from vllm.utils.func_utils import supports_kw
 from vllm.utils.jsontree import json_iter_leaves
 from vllm.v1.engine import EngineCoreRequest

@@ -72,6 +74,33 @@ class InputProcessor:
            mm_registry=mm_registry,
        )

+        from vllm.platforms import current_platform
+
+        platform_validate_request = current_platform.validate_request
+        if supports_kw(platform_validate_request, "prompt"):
+            logger.warning_once(
+                "The signature of Platform.validate_request has changed from "
+                "`(cls, prompt, params, processed_inputs) -> None` to "
+                "`(cls, processed_inputs, params) -> None`. The old signature "
+                "will no longer be supported starting from v0.18."
+            )
+
+            orig_validate_request = platform_validate_request
+
+            def compat_validate_request(
+                processed_inputs: ProcessorInputs,
+                params: SamplingParams | PoolingParams,
+            ):
+                return orig_validate_request(
+                    processed_inputs,
+                    params,
+                    processed_inputs,  # type: ignore
+                )  # type: ignore
+
+            platform_validate_request = compat_validate_request
+
+        self._platform_validate_request = platform_validate_request
+
    @property
    def tokenizer(self) -> TokenizerLike | None:
        return self.renderer.tokenizer
@@ -87,6 +116,16 @@ class InputProcessor:
        supported_tasks: tuple[SupportedTask, ...] | None,
    ):
        """Raise `ValueError` if SamplingParams or PoolingParams is not valid."""
+        if params.truncate_prompt_tokens is not None:
+            params_type = type(params).__name__
+            warnings.warn(
+                f"The `truncate_prompt_tokens` parameter in `{params_type}` "
+                "is deprecated and will be removed in v0.17. "
+                "Please pass it via `tokenization_kwargs` instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
        if isinstance(params, SamplingParams):
            params.verify(
                self.model_config,
@@ -211,11 +250,24 @@ class InputProcessor:
            )

        if isinstance(prompt, dict) and "type" in prompt:
+            if tokenization_kwargs:
+                logger.warning_once(
+                    "Passing tokenization_kwargs to InputProcessor is deprecated "
+                    "and will be removed in v0.18. You should instead pass "
+                    "them to Renderer.render_cmpl() or Renderer.render_chat()."
+                )
+
            if arrival_time is None:
                arrival_time = prompt.get("arrival_time", time.time())  # type: ignore[assignment]

            processed_inputs: ProcessorInputs = prompt  # type: ignore[assignment]
        else:
+            logger.warning_once(
+                "Passing raw prompts to InputProcessor is deprecated "
+                "and will be removed in v0.18. You should instead pass "
+                "the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
+            )
+
            if arrival_time is None:
                arrival_time = time.time()

@@ -224,13 +276,7 @@ class InputProcessor:
                tokenization_kwargs=tokenization_kwargs,
            )

-        from vllm.platforms import current_platform
-
-        current_platform.validate_request(
-            prompt=prompt,
-            params=params,
-            processed_inputs=processed_inputs,
-        )
+        self._platform_validate_request(processed_inputs, params)

        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
        self._validate_model_inputs(encoder_inputs, decoder_inputs)

--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -234,10 +234,16 @@ class LLMEngine:

        # Process raw inputs into the request.
        if isinstance(prompt, EngineCoreRequest):
+            logger.warning_once(
+                "Passing EngineCoreRequest to LLMEngine.generate() and .add_request() "
+                "is deprecated and will be removed in v0.18. You should instead pass "
+                "the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
+            )
+
            request = prompt
            if request_id != request.request_id:
                logger.warning_once(
-                    "AsyncLLM.add_request() was passed a request_id parameter that "
+                    "LLMEngine.add_request() was passed a request_id parameter that "
                    "does not match the EngineCoreRequest.request_id attribute. The "
                    "latter will be used, and the former will be ignored."
                )