"examples/vscode:/vscode.git/clone" did not exist on "6e7b1c4b591a7d735fc93792e53cb5592cfab4f2"
Unverified commit a766b303, authored by Cyrus Leung, committed by GitHub
Browse files

[Renderer] Deprecate code paths for old input processing (#34775)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 1faa8cb7
......@@ -155,3 +155,4 @@ The interface for the model/module may change during vLLM's development. If you
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add a new attention backend to `AttentionBackendEnum` instead.
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
- The `prompt` parameter of `Platform.validate_request` is deprecated and will be removed in v0.18.0. Please implement the new `(cls, processed_inputs, params)` signature instead.
......@@ -519,7 +519,6 @@ class LLM:
),
params=seq_params,
lora_requests=seq_lora_requests,
tokenization_kwargs=tokenization_kwargs,
priorities=seq_priority,
)
......@@ -1813,7 +1812,6 @@ class LLM:
params=seq_params,
use_tqdm=use_tqdm,
lora_requests=seq_lora_requests,
tokenization_kwargs=tokenization_kwargs,
priorities=seq_priority,
)
......@@ -1872,7 +1870,6 @@ class LLM:
params=seq_params,
lora_requests=seq_lora_requests,
use_tqdm=use_tqdm,
tokenization_kwargs=tokenization_kwargs,
)
def _render_and_run_requests(
......@@ -1881,7 +1878,6 @@ class LLM:
params: Sequence[SamplingParams | PoolingParams],
*,
lora_requests: Sequence[LoRARequest | None] | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
priorities: Sequence[int] | None = None,
use_tqdm: bool | Callable[..., tqdm] = True,
):
......@@ -1899,7 +1895,6 @@ class LLM:
prompts=prompts,
params=params,
lora_requests=lora_requests,
tokenization_kwargs=tokenization_kwargs,
priorities=priorities,
)
......@@ -1911,7 +1906,6 @@ class LLM:
params: Sequence[SamplingParams | PoolingParams],
*,
lora_requests: Sequence[LoRARequest | None] | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
priorities: Sequence[int] | None = None,
) -> list[str]:
added_request_ids: list[str] = []
......@@ -1922,7 +1916,6 @@ class LLM:
prompt,
params[i],
lora_request=None if lora_requests is None else lora_requests[i],
tokenization_kwargs=tokenization_kwargs,
priority=0 if priorities is None else priorities[i],
)
added_request_ids.append(request_id)
......@@ -1938,7 +1931,6 @@ class LLM:
prompt: ProcessorInputs,
params: SamplingParams | PoolingParams,
lora_request: LoRARequest | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
priority: int = 0,
) -> str:
if isinstance(params, SamplingParams):
......@@ -1947,27 +1939,11 @@ class LLM:
request_id = str(next(self.request_counter))
if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.16. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)
tokenization_kwargs = merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
)
return self.llm_engine.add_request(
request_id,
prompt,
params,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
priority=priority,
)
......
......@@ -17,7 +17,7 @@ if TYPE_CHECKING:
from torch.distributed import PrefixStore, ProcessGroup
from vllm.config import VllmConfig
from vllm.inputs import ProcessorInputs, PromptType
from vllm.inputs import ProcessorInputs
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
......@@ -568,9 +568,8 @@ class Platform:
@classmethod
def validate_request(
cls,
prompt: "PromptType | ProcessorInputs",
params: "SamplingParams | PoolingParams",
processed_inputs: "ProcessorInputs",
params: "SamplingParams | PoolingParams",
) -> None:
"""Raises if this request is unsupported on this platform"""
......
......@@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import STREAM_FINISHED, PoolingRequestOutput, RequestOutput
from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.renderers import merge_kwargs, renderer_from_config
from vllm.renderers import renderer_from_config
from vllm.renderers.inputs.preprocess import extract_prompt_components
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.tasks import SupportedTask
......@@ -319,21 +319,6 @@ class AsyncLLM(EngineClient):
"prompt logprobs"
)
if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.16. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)
tokenization_kwargs = merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
)
if isinstance(prompt, AsyncGenerator):
if reasoning_ended is not None:
raise NotImplementedError
......@@ -353,6 +338,12 @@ class AsyncLLM(EngineClient):
# Convert Input --> Request.
if isinstance(prompt, EngineCoreRequest):
logger.warning_once(
"Passing EngineCoreRequest to AsyncLLM.generate() and .add_requests() "
"is deprecated and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)
request = prompt
if request_id != request.request_id:
logger.warning_once(
......
......@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
import warnings
from collections.abc import Mapping
from typing import Any, Literal
......@@ -28,6 +29,7 @@ from vllm.sampling_params import SamplingParams
from vllm.tasks import POOLING_TASKS, SupportedTask
from vllm.tokenizers import TokenizerLike
from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
from vllm.utils.func_utils import supports_kw
from vllm.utils.jsontree import json_iter_leaves
from vllm.v1.engine import EngineCoreRequest
......@@ -72,6 +74,33 @@ class InputProcessor:
mm_registry=mm_registry,
)
from vllm.platforms import current_platform
platform_validate_request = current_platform.validate_request
if supports_kw(platform_validate_request, "prompt"):
logger.warning_once(
"The signature of Platform.validate_request has changed from "
"`(cls, prompt, params, processed_inputs) -> None` to "
"`(cls, processed_inputs, params) -> None`. The old signature "
"will no longer be supported starting from v0.18."
)
orig_validate_request = platform_validate_request
def compat_validate_request(
processed_inputs: ProcessorInputs,
params: SamplingParams | PoolingParams,
):
return orig_validate_request(
processed_inputs,
params,
processed_inputs, # type: ignore
) # type: ignore
platform_validate_request = compat_validate_request
self._platform_validate_request = platform_validate_request
@property
def tokenizer(self) -> TokenizerLike | None:
return self.renderer.tokenizer
......@@ -87,6 +116,16 @@ class InputProcessor:
supported_tasks: tuple[SupportedTask, ...] | None,
):
"""Raise `ValueError` if SamplingParams or PoolingParams is not valid."""
if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.17. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)
if isinstance(params, SamplingParams):
params.verify(
self.model_config,
......@@ -211,11 +250,24 @@ class InputProcessor:
)
if isinstance(prompt, dict) and "type" in prompt:
if tokenization_kwargs:
logger.warning_once(
"Passing tokenization_kwargs to InputProcessor is deprecated "
"and will be removed in v0.18. You should instead pass "
"them to Renderer.render_cmpl() or Renderer.render_chat()."
)
if arrival_time is None:
arrival_time = prompt.get("arrival_time", time.time()) # type: ignore[assignment]
processed_inputs: ProcessorInputs = prompt # type: ignore[assignment]
else:
logger.warning_once(
"Passing raw prompts to InputProcessor is deprecated "
"and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)
if arrival_time is None:
arrival_time = time.time()
......@@ -224,13 +276,7 @@ class InputProcessor:
tokenization_kwargs=tokenization_kwargs,
)
from vllm.platforms import current_platform
current_platform.validate_request(
prompt=prompt,
params=params,
processed_inputs=processed_inputs,
)
self._platform_validate_request(processed_inputs, params)
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
self._validate_model_inputs(encoder_inputs, decoder_inputs)
......
......@@ -234,10 +234,16 @@ class LLMEngine:
# Process raw inputs into the request.
if isinstance(prompt, EngineCoreRequest):
logger.warning_once(
"Passing EngineCoreRequest to LLMEngine.generate() and .add_requests() "
"is deprecated and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)
request = prompt
if request_id != request.request_id:
logger.warning_once(
"AsyncLLM.add_request() was passed a request_id parameter that "
"LLMEngine.add_request() was passed a request_id parameter that "
"does not match the EngineCoreRequest.request_id attribute. The "
"latter will be used, and the former will be ignored."
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment