[Deprecation] Deprecate code in 0.17 as scheduled (#35441)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

[Deprecation] Deprecate code in 0.17 as scheduled (#35441)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
e113a301 · Wentao Ye · GitHub · 1dafb29f · e113a301 · e113a301
Unverified commit e113a301 · authored Feb 28, 2026 by Wentao Ye · committed by GitHub on Feb 28, 2026
20 changed files
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -683,13 +683,13 @@ async def test_params_not_supported(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_normalize(server: RemoteOpenAIServer, model_name: str):
+async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
-    async def get_outputs(normalize):
+    async def get_outputs(use_activation):
        request_args = {
            "model": MODEL_NAME,
            "input": input_text,
            "encoding_format": "float",
-            "normalize": normalize,
+            "use_activation": use_activation,
        }
        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
@@ -697,9 +697,9 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
        return torch.tensor([x["embedding"] for x in outputs["data"]])
-    default = await get_outputs(normalize=None)
+    default = await get_outputs(use_activation=None)
-    w_normal = await get_outputs(normalize=True)
+    w_normal = await get_outputs(use_activation=True)
-    wo_normal = await get_outputs(normalize=False)
+    wo_normal = await get_outputs(use_activation=False)
    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (

--- a/vllm/entrypoints/grpc_server.py
+++ b/vllm/entrypoints/grpc_server.py
@@ -101,11 +101,15 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
            sampling_params = self._sampling_params_from_proto(
                request.sampling_params, stream=request.stream
            )
+            tokenization_kwargs = self._tokenization_kwargs_from_proto(
+                request.sampling_params
+            )
            async for output in self.async_llm.generate(
                prompt=prompt,
                sampling_params=sampling_params,
                request_id=request_id,
+                tokenization_kwargs=tokenization_kwargs,
            ):
                # Convert vLLM output to protobuf
                # For streaming, always send chunks
@@ -308,9 +312,6 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
            seed=params.seed if params.HasField("seed") else None,
            include_stop_str_in_output=params.include_stop_str_in_output,
            logit_bias=dict(params.logit_bias) if params.logit_bias else None,
-            truncate_prompt_tokens=params.truncate_prompt_tokens
-            if params.HasField("truncate_prompt_tokens")
-            else None,
            structured_outputs=structured_outputs,
            # detokenize must be True if stop strings are used
            detokenize=bool(stop),
@@ -319,6 +320,14 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
            else RequestOutputKind.FINAL_ONLY,
        )
+    @staticmethod
+    def _tokenization_kwargs_from_proto(
+        params: vllm_engine_pb2.SamplingParams,
+    ) -> dict[str, int] | None:
+        if params.HasField("truncate_prompt_tokens"):
+            return {"truncate_prompt_tokens": params.truncate_prompt_tokens}
+        return None
    @staticmethod
    def _chunk_response(output: RequestOutput) -> vllm_engine_pb2.GenerateResponse:
        """

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-import warnings
 from collections.abc import Callable, Iterable, Sequence
 from typing import TYPE_CHECKING, Any
@@ -1030,7 +1029,6 @@ class LLM:
        prompts: PromptType | Sequence[PromptType] | DataPrompt,
        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
        *,
-        truncate_prompt_tokens: int | None = None,
        use_tqdm: bool | Callable[..., tqdm] = True,
        lora_request: list[LoRARequest] | LoRARequest | None = None,
        pooling_task: PoolingTask | None = None,
@@ -1088,20 +1086,6 @@ class LLM:
                "pooling model."
            )
-        if truncate_prompt_tokens is not None:
-            warnings.warn(
-                "The `truncate_prompt_tokens` parameter in `LLM.encode()` "
-                "is deprecated and will be removed in v0.16. "
-                "Please pass it via `tokenization_kwargs` instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=truncate_prompt_tokens),
-            )
        if use_io_processor := (isinstance(prompts, dict) and "data" in prompts):
            if self.io_processor is None:
                raise ValueError(
@@ -1185,7 +1169,6 @@ class LLM:
        self,
        prompts: PromptType | Sequence[PromptType],
        *,
-        truncate_prompt_tokens: int | None = None,
        use_tqdm: bool | Callable[..., tqdm] = True,
        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
        lora_request: list[LoRARequest] | LoRARequest | None = None,
@@ -1221,12 +1204,6 @@ class LLM:
                "Try converting the model using `--convert embed`."
            )
-        if truncate_prompt_tokens is not None:
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=truncate_prompt_tokens),
-            )
        items = self.encode(
            prompts,
            use_tqdm=use_tqdm,
@@ -1294,7 +1271,6 @@ class LLM:
        /,
        *,
        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
-        truncate_prompt_tokens: int | None = None,
        use_tqdm: bool | Callable[..., tqdm] = True,
        lora_request: list[LoRARequest] | LoRARequest | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
@@ -1319,13 +1295,11 @@ class LLM:
            A list of `PoolingRequestOutput` objects containing the
            pooled hidden states in the same order as the input prompts.
        """
        return self.encode(
            prompts,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            pooling_params=pooling_params,
-            truncate_prompt_tokens=truncate_prompt_tokens,
            pooling_task="token_classify",
            tokenization_kwargs=tokenization_kwargs,
        )
@@ -1771,23 +1745,15 @@ class LLM:
        seq_prompts = prompt_to_seq(prompts)
        seq_params = self._params_to_seq(params, len(seq_prompts))
        seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_prompts))
-        seq_tok_kwargs = [
-            merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
-            )
-            for param in seq_params
-        ]
        seq_priority = self._priority_to_seq(priority, len(prompts))
        return self._render_and_add_requests(
            prompts=(
-                self._preprocess_cmpl_one(prompt, tok_kwargs)
+                self._preprocess_cmpl_one(prompt, tokenization_kwargs)
-                for prompt, tok_kwargs in zip(
+                for prompt in maybe_tqdm(
-                    maybe_tqdm(
+                    seq_prompts,
-                        seq_prompts, use_tqdm=use_tqdm, desc="Rendering prompts"
+                    use_tqdm=use_tqdm,
-                    ),
+                    desc="Rendering prompts",
-                    seq_tok_kwargs,
                )
            ),
            params=seq_params,
@@ -1841,13 +1807,6 @@ class LLM:
        seq_convs = conversation_to_seq(messages)
        seq_params = self._params_to_seq(params, len(seq_convs))
        seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_convs))
-        seq_tok_kwargs = [
-            merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
-            )
-            for param in seq_params
-        ]
        return self._render_and_run_requests(
            prompts=(
@@ -1859,16 +1818,13 @@ class LLM:
                    add_generation_prompt=add_generation_prompt,
                    continue_final_message=continue_final_message,
                    tools=tools,
-                    tokenization_kwargs=tok_kwargs,
+                    tokenization_kwargs=tokenization_kwargs,
                    mm_processor_kwargs=mm_processor_kwargs,
                )
-                for conversation, tok_kwargs in zip(
+                for conversation in maybe_tqdm(
-                    maybe_tqdm(
+                    seq_convs,
-                        seq_convs,
+                    use_tqdm=use_tqdm,
-                        use_tqdm=use_tqdm,
+                    desc="Rendering conversations",
-                        desc="Rendering conversations",
-                    ),
-                    seq_tok_kwargs,
                )
            ),
            params=seq_params,

--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -490,7 +490,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
            skip_special_tokens=self.skip_special_tokens,
            spaces_between_special_tokens=self.spaces_between_special_tokens,
            include_stop_str_in_output=self.include_stop_str_in_output,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA
            if self.stream
            else RequestOutputKind.FINAL_ONLY,

--- a/vllm/entrypoints/openai/completion/protocol.py
+++ b/vllm/entrypoints/openai/completion/protocol.py
@@ -302,7 +302,6 @@ class CompletionRequest(OpenAIBaseModel):
            skip_special_tokens=self.skip_special_tokens,
            spaces_between_special_tokens=self.spaces_between_special_tokens,
            include_stop_str_in_output=self.include_stop_str_in_output,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA
            if self.stream
            else RequestOutputKind.FINAL_ONLY,

--- a/vllm/entrypoints/openai/translations/__init__.py
+++ b/vllm/entrypoints/openai/translations/__init__.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import warnings
-warnings.warn(
-    "The 'vllm.entrypoints.openai.translations' module has been renamed to "
-    "'vllm.entrypoints.openai.speech_to_text'. Please update your imports. "
-    "This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
--- a/vllm/entrypoints/openai/translations/api_router.py
+++ b/vllm/entrypoints/openai/translations/api_router.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import warnings
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.api_router' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.api_router'. Please update your "
-    "imports. This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-from vllm.entrypoints.openai.speech_to_text.api_router import *  # noqa: F401,F403,E402
--- a/vllm/entrypoints/openai/translations/protocol.py
+++ b/vllm/entrypoints/openai/translations/protocol.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import warnings
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.protocol' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.protocol'. Please update your "
-    "imports. This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-from vllm.entrypoints.openai.speech_to_text.protocol import *  # noqa: F401,F403,E402
--- a/vllm/entrypoints/openai/translations/serving.py
+++ b/vllm/entrypoints/openai/translations/serving.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import warnings
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.serving' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.serving'. Please update your "
-    "imports. This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-from vllm.entrypoints.openai.speech_to_text.serving import *  # noqa: F401,F403,E402
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ b/vllm/entrypoints/openai/translations/speech_to_text.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import warnings
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.speech_to_text' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.speech_to_text'. Please update "
-    "your imports. This backward-compatible alias will be removed in version "
-    "0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-from vllm.entrypoints.openai.speech_to_text.speech_to_text import *  # noqa: F401,F403,E402
--- a/vllm/entrypoints/pooling/base/protocol.py
+++ b/vllm/entrypoints/pooling/base/protocol.py
@@ -190,10 +190,6 @@ class EmbedRequestMixin(EncodingRequestMixin):
        description="Whether to use activation for the pooler outputs. "
        "`None` uses the pooler's default, which is `True` in most cases.",
    )
-    normalize: bool | None = Field(
-        default=None,
-        description="Deprecated; please pass `use_activation` instead",
-    )
    # --8<-- [end:embed-extra-params]

--- a/vllm/entrypoints/pooling/classify/protocol.py
+++ b/vllm/entrypoints/pooling/classify/protocol.py
@@ -40,7 +40,6 @@ class ClassificationCompletionRequest(
    def to_pooling_params(self):
        return PoolingParams(
            task="classify",
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=self.use_activation,
        )
@@ -63,7 +62,6 @@ class ClassificationChatRequest(
    def to_pooling_params(self):
        return PoolingParams(
            task="classify",
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=self.use_activation,
        )

--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -14,12 +14,9 @@ from vllm.entrypoints.pooling.base.protocol import (
    EmbedRequestMixin,
    PoolingBasicRequestMixin,
 )
-from vllm.logger import init_logger
 from vllm.renderers import TokenizeParams
 from vllm.utils import random_uuid
-logger = init_logger(__name__)
 def _get_max_total_output_tokens(
    model_config: ModelConfig,
@@ -60,18 +57,10 @@ class EmbeddingCompletionRequest(
        )
    def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
        return PoolingParams(
            task="embed",
            dimensions=self.dimensions,
            use_activation=self.use_activation,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
        )
@@ -97,18 +86,10 @@ class EmbeddingChatRequest(
        )
    def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
        return PoolingParams(
            task="embed",
            dimensions=self.dimensions,
            use_activation=self.use_activation,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
        )

--- a/vllm/entrypoints/pooling/pooling/protocol.py
+++ b/vllm/entrypoints/pooling/pooling/protocol.py
@@ -16,13 +16,10 @@ from vllm.entrypoints.pooling.base.protocol import (
    EncodingRequestMixin,
    PoolingBasicRequestMixin,
 )
-from vllm.logger import init_logger
 from vllm.renderers import TokenizeParams
 from vllm.tasks import PoolingTask
 from vllm.utils import random_uuid
-logger = init_logger(__name__)
 class PoolingCompletionRequest(
    PoolingBasicRequestMixin,
@@ -45,16 +42,8 @@ class PoolingCompletionRequest(
        )
    def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
        return PoolingParams(
            task=self.task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=self.use_activation,
            dimensions=self.dimensions,
        )
@@ -78,16 +67,8 @@ class PoolingChatRequest(
        )
    def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
        return PoolingParams(
            task=self.task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=self.use_activation,
            dimensions=self.dimensions,
        )

--- a/vllm/entrypoints/pooling/score/protocol.py
+++ b/vllm/entrypoints/pooling/score/protocol.py
@@ -37,7 +37,6 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
    def to_pooling_params(self, task: PoolingTask = "score"):
        return PoolingParams(
            task=task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=self.use_activation,
        )
@@ -113,7 +112,6 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
    def to_pooling_params(self, task: PoolingTask = "score"):
        return PoolingParams(
            task=task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=self.use_activation,
        )

--- a/vllm/model_executor/layers/mamba/mamba_utils.py
+++ b/vllm/model_executor/layers/mamba/mamba_utils.py
@@ -289,9 +289,6 @@ def get_temporal_copy_spec(
    )
-get_full_copy_spec = get_temporal_copy_spec
 class MambaStateCopyFuncCalculator:
    @classmethod
    def linear_attention_state_copy_func(cls):

--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -43,12 +43,9 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 IMAGE_TOKEN = "<image>"
-IMAGE_PLACEHOLDER_ID = 151669
 VIDEO_TOKEN = "<video>"
-VIDEO_PLACEHOLDER_ID = 151670
 INDICATOR_IDS = [151672, 151673, 151674, 151675]
 IMAGE_PAD_TOKEN_ID = 151655
-THINK_END_TOKEN_ID = 151668
 class Ovis2_5ImagePatchInputs(TensorSchema):

--- a/vllm/multimodal/processing/processor.py
+++ b/vllm/multimodal/processing/processor.py
@@ -17,7 +17,7 @@ from typing import (
 import regex as re
 import torch
-from typing_extensions import TypeVar, assert_never, deprecated
+from typing_extensions import TypeVar, assert_never
 from vllm.logger import init_logger
 from vllm.tokenizers import TokenizerLike
@@ -996,16 +996,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        self.data_parser = self.info.get_data_parser()
-    @property
-    @deprecated("Will be removed in v0.17. Use `info.supported_mm_limits` instead.")
-    def supported_mm_limits(self):
-        return self.info.supported_mm_limits
-    @property
-    @deprecated("Will be removed in v0.17. Use `info.allowed_mm_limits` instead.")
-    def allowed_mm_limits(self):
-        return self.info.allowed_mm_limits
    def __call__(
        self,
        prompt: str,

--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import mimetypes
-import warnings
 from collections import defaultdict
 from collections.abc import Generator, Sequence
 from itertools import groupby
@@ -30,23 +29,6 @@ else:
    torch = LazyLoader("torch", globals(), "torch")
-def __getattr__(name: str):
-    if name == "MEDIA_CONNECTOR_REGISTRY":
-        from .media import MEDIA_CONNECTOR_REGISTRY
-        warnings.warn(
-            "`vllm.multimodal.utils.MEDIA_CONNECTOR_REGISTRY` "
-            "has been moved to `vllm.multimodal.media.MEDIA_CONNECTOR_REGISTRY`. "
-            "The old name will be removed in v0.17.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        return MEDIA_CONNECTOR_REGISTRY
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 def encode_audio_base64(
    audio: np.ndarray,
    sampling_rate: int,

--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from copy import deepcopy
-from typing import Annotated, Any
+from typing import Any
 import msgspec
@@ -19,10 +19,6 @@ class PoolingParams(
    """API parameters for pooling models.
    Attributes:
-        truncate_prompt_tokens: Controls prompt truncation.
-            Set to -1 to use the model's default truncation size.
-            Set to k to keep only the last k tokens (left truncation).
-            Set to None to disable truncation.
        use_activation: Whether to apply activation function to the pooler outputs.
            `None` uses the pooler's default, which is `True` in most cases.
        dimensions: Reduce the dimensions of embeddings
@@ -30,7 +26,6 @@ class PoolingParams(
    """
    # --8<-- [start:common-pooling-params]
-    truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
    use_activation: bool | None = None
    # --8<-- [end:common-pooling-params]
@@ -198,7 +193,6 @@ class PoolingParams(
            f"returned_token_ids={self.returned_token_ids}, "
            f"requires_token_ids={self.requires_token_ids}, "
            f"skip_reading_prefix_cache={self.skip_reading_prefix_cache}, "
-            f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
            f"extra_kwargs={self.extra_kwargs})"
        )