[Bugfix] Read truncate_prompt_tokens from pooling_params in AsyncLLM.encode() (#31013)

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>

[Bugfix] Read truncate_prompt_tokens from pooling_params in AsyncLLM.encode() (#31013)
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
1501a407 · Jeffrey Wang · GitHub · ff2168bc · 1501a407 · 1501a407
Commit 1501a407 (unverified signature), authored by Jeffrey Wang on Dec 20, 2025; committed by GitHub on Dec 20, 2025.
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 20 additions and 2 deletions

vllm/engine/protocol.py vllm/engine/protocol.py +5 -1

vllm/v1/engine/async_llm.py vllm/v1/engine/async_llm.py +15 -1

No files found.
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -71,7 +71,11 @@ class EngineClient(ABC):
        truncate_prompt_tokens: int | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model."""
+        """Generate outputs for a request from a pooling model.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove this argument in v0.15.
+        """
        ...

    @abstractmethod

--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -4,6 +4,7 @@ import asyncio
 import os
 import socket
 import time
+import warnings
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from copy import copy
 from typing import Any, cast
@@ -627,6 +628,9 @@ class AsyncLLM(EngineClient):

        The caller of generate() iterates the returned AsyncGenerator,
        returning the RequestOutput back to the caller.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove truncate_prompt_tokens in v0.15.
        """

        try:
@@ -641,9 +645,19 @@ class AsyncLLM(EngineClient):

            if tokenization_kwargs is None:
                tokenization_kwargs = {}
+
+            if truncate_prompt_tokens is not None:
+                warnings.warn(
+                    "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
+                    "is deprecated and will be removed in v0.15. "
+                    "Please use `pooling_params.truncate_prompt_tokens` instead.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+
            _validate_truncation_size(
                self.model_config.max_model_len,
-                truncate_prompt_tokens,
+                pooling_params.truncate_prompt_tokens,
                tokenization_kwargs,
            )