"tests/vscode:/vscode.git/clone" did not exist on "e1a3f5e831a467b2867a66e0e56ac0f70ed44394"
Unverified commit a766b303, authored by Cyrus Leung, committed by GitHub
Browse files

[Renderer] Deprecate code paths for old input processing (#34775)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 1faa8cb7
...@@ -155,3 +155,4 @@ The interface for the model/module may change during vLLM's development. If you ...@@ -155,3 +155,4 @@ The interface for the model/module may change during vLLM's development. If you
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0. - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead. - `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
- The `prompt` parameter of `Platform.validate_request` is deprecated and will be removed in v0.18.0. Please update overrides to the new signature `(cls, processed_inputs, params)`.
...@@ -519,7 +519,6 @@ class LLM: ...@@ -519,7 +519,6 @@ class LLM:
), ),
params=seq_params, params=seq_params,
lora_requests=seq_lora_requests, lora_requests=seq_lora_requests,
tokenization_kwargs=tokenization_kwargs,
priorities=seq_priority, priorities=seq_priority,
) )
...@@ -1813,7 +1812,6 @@ class LLM: ...@@ -1813,7 +1812,6 @@ class LLM:
params=seq_params, params=seq_params,
use_tqdm=use_tqdm, use_tqdm=use_tqdm,
lora_requests=seq_lora_requests, lora_requests=seq_lora_requests,
tokenization_kwargs=tokenization_kwargs,
priorities=seq_priority, priorities=seq_priority,
) )
...@@ -1872,7 +1870,6 @@ class LLM: ...@@ -1872,7 +1870,6 @@ class LLM:
params=seq_params, params=seq_params,
lora_requests=seq_lora_requests, lora_requests=seq_lora_requests,
use_tqdm=use_tqdm, use_tqdm=use_tqdm,
tokenization_kwargs=tokenization_kwargs,
) )
def _render_and_run_requests( def _render_and_run_requests(
...@@ -1881,7 +1878,6 @@ class LLM: ...@@ -1881,7 +1878,6 @@ class LLM:
params: Sequence[SamplingParams | PoolingParams], params: Sequence[SamplingParams | PoolingParams],
*, *,
lora_requests: Sequence[LoRARequest | None] | None = None, lora_requests: Sequence[LoRARequest | None] | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
priorities: Sequence[int] | None = None, priorities: Sequence[int] | None = None,
use_tqdm: bool | Callable[..., tqdm] = True, use_tqdm: bool | Callable[..., tqdm] = True,
): ):
...@@ -1899,7 +1895,6 @@ class LLM: ...@@ -1899,7 +1895,6 @@ class LLM:
prompts=prompts, prompts=prompts,
params=params, params=params,
lora_requests=lora_requests, lora_requests=lora_requests,
tokenization_kwargs=tokenization_kwargs,
priorities=priorities, priorities=priorities,
) )
...@@ -1911,7 +1906,6 @@ class LLM: ...@@ -1911,7 +1906,6 @@ class LLM:
params: Sequence[SamplingParams | PoolingParams], params: Sequence[SamplingParams | PoolingParams],
*, *,
lora_requests: Sequence[LoRARequest | None] | None = None, lora_requests: Sequence[LoRARequest | None] | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
priorities: Sequence[int] | None = None, priorities: Sequence[int] | None = None,
) -> list[str]: ) -> list[str]:
added_request_ids: list[str] = [] added_request_ids: list[str] = []
...@@ -1922,7 +1916,6 @@ class LLM: ...@@ -1922,7 +1916,6 @@ class LLM:
prompt, prompt,
params[i], params[i],
lora_request=None if lora_requests is None else lora_requests[i], lora_request=None if lora_requests is None else lora_requests[i],
tokenization_kwargs=tokenization_kwargs,
priority=0 if priorities is None else priorities[i], priority=0 if priorities is None else priorities[i],
) )
added_request_ids.append(request_id) added_request_ids.append(request_id)
...@@ -1938,7 +1931,6 @@ class LLM: ...@@ -1938,7 +1931,6 @@ class LLM:
prompt: ProcessorInputs, prompt: ProcessorInputs,
params: SamplingParams | PoolingParams, params: SamplingParams | PoolingParams,
lora_request: LoRARequest | None = None, lora_request: LoRARequest | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
priority: int = 0, priority: int = 0,
) -> str: ) -> str:
if isinstance(params, SamplingParams): if isinstance(params, SamplingParams):
...@@ -1947,27 +1939,11 @@ class LLM: ...@@ -1947,27 +1939,11 @@ class LLM:
request_id = str(next(self.request_counter)) request_id = str(next(self.request_counter))
if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.16. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)
tokenization_kwargs = merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
)
return self.llm_engine.add_request( return self.llm_engine.add_request(
request_id, request_id,
prompt, prompt,
params, params,
lora_request=lora_request, lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
priority=priority, priority=priority,
) )
......
...@@ -17,7 +17,7 @@ if TYPE_CHECKING: ...@@ -17,7 +17,7 @@ if TYPE_CHECKING:
from torch.distributed import PrefixStore, ProcessGroup from torch.distributed import PrefixStore, ProcessGroup
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.inputs import ProcessorInputs, PromptType from vllm.inputs import ProcessorInputs
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
...@@ -568,9 +568,8 @@ class Platform: ...@@ -568,9 +568,8 @@ class Platform:
@classmethod @classmethod
def validate_request( def validate_request(
cls, cls,
prompt: "PromptType | ProcessorInputs",
params: "SamplingParams | PoolingParams",
processed_inputs: "ProcessorInputs", processed_inputs: "ProcessorInputs",
params: "SamplingParams | PoolingParams",
) -> None: ) -> None:
"""Raises if this request is unsupported on this platform""" """Raises if this request is unsupported on this platform"""
......
...@@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry ...@@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import STREAM_FINISHED, PoolingRequestOutput, RequestOutput from vllm.outputs import STREAM_FINISHED, PoolingRequestOutput, RequestOutput
from vllm.plugins.io_processors import get_io_processor from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.renderers import merge_kwargs, renderer_from_config from vllm.renderers import renderer_from_config
from vllm.renderers.inputs.preprocess import extract_prompt_components from vllm.renderers.inputs.preprocess import extract_prompt_components
from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.tasks import SupportedTask from vllm.tasks import SupportedTask
...@@ -319,21 +319,6 @@ class AsyncLLM(EngineClient): ...@@ -319,21 +319,6 @@ class AsyncLLM(EngineClient):
"prompt logprobs" "prompt logprobs"
) )
if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.16. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)
tokenization_kwargs = merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
)
if isinstance(prompt, AsyncGenerator): if isinstance(prompt, AsyncGenerator):
if reasoning_ended is not None: if reasoning_ended is not None:
raise NotImplementedError raise NotImplementedError
...@@ -353,6 +338,12 @@ class AsyncLLM(EngineClient): ...@@ -353,6 +338,12 @@ class AsyncLLM(EngineClient):
# Convert Input --> Request. # Convert Input --> Request.
if isinstance(prompt, EngineCoreRequest): if isinstance(prompt, EngineCoreRequest):
logger.warning_once(
"Passing EngineCoreRequest to AsyncLLM.generate() and .add_requests() "
"is deprecated and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)
request = prompt request = prompt
if request_id != request.request_id: if request_id != request.request_id:
logger.warning_once( logger.warning_once(
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time import time
import warnings
from collections.abc import Mapping from collections.abc import Mapping
from typing import Any, Literal from typing import Any, Literal
...@@ -28,6 +29,7 @@ from vllm.sampling_params import SamplingParams ...@@ -28,6 +29,7 @@ from vllm.sampling_params import SamplingParams
from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.tasks import POOLING_TASKS, SupportedTask
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
from vllm.utils.func_utils import supports_kw
from vllm.utils.jsontree import json_iter_leaves from vllm.utils.jsontree import json_iter_leaves
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
...@@ -72,6 +74,33 @@ class InputProcessor: ...@@ -72,6 +74,33 @@ class InputProcessor:
mm_registry=mm_registry, mm_registry=mm_registry,
) )
from vllm.platforms import current_platform
platform_validate_request = current_platform.validate_request
if supports_kw(platform_validate_request, "prompt"):
logger.warning_once(
"The signature of Platform.validate_request has changed from "
"`(cls, prompt, params, processed_inputs) -> None` to "
"`(cls, processed_inputs, params) -> None`. The old signature "
"will no longer be supported starting from v0.18."
)
orig_validate_request = platform_validate_request
def compat_validate_request(
processed_inputs: ProcessorInputs,
params: SamplingParams | PoolingParams,
):
return orig_validate_request(
processed_inputs,
params,
processed_inputs, # type: ignore
) # type: ignore
platform_validate_request = compat_validate_request
self._platform_validate_request = platform_validate_request
@property @property
def tokenizer(self) -> TokenizerLike | None: def tokenizer(self) -> TokenizerLike | None:
return self.renderer.tokenizer return self.renderer.tokenizer
...@@ -87,6 +116,16 @@ class InputProcessor: ...@@ -87,6 +116,16 @@ class InputProcessor:
supported_tasks: tuple[SupportedTask, ...] | None, supported_tasks: tuple[SupportedTask, ...] | None,
): ):
"""Raise `ValueError` if SamplingParams or PoolingParams is not valid.""" """Raise `ValueError` if SamplingParams or PoolingParams is not valid."""
if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.17. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)
if isinstance(params, SamplingParams): if isinstance(params, SamplingParams):
params.verify( params.verify(
self.model_config, self.model_config,
...@@ -211,11 +250,24 @@ class InputProcessor: ...@@ -211,11 +250,24 @@ class InputProcessor:
) )
if isinstance(prompt, dict) and "type" in prompt: if isinstance(prompt, dict) and "type" in prompt:
if tokenization_kwargs:
logger.warning_once(
"Passing tokenization_kwargs to InputProcessor is deprecated "
"and will be removed in v0.18. You should instead pass "
"them to Renderer.render_cmpl() or Renderer.render_chat()."
)
if arrival_time is None: if arrival_time is None:
arrival_time = prompt.get("arrival_time", time.time()) # type: ignore[assignment] arrival_time = prompt.get("arrival_time", time.time()) # type: ignore[assignment]
processed_inputs: ProcessorInputs = prompt # type: ignore[assignment] processed_inputs: ProcessorInputs = prompt # type: ignore[assignment]
else: else:
logger.warning_once(
"Passing raw prompts to InputProcessor is deprecated "
"and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)
if arrival_time is None: if arrival_time is None:
arrival_time = time.time() arrival_time = time.time()
...@@ -224,13 +276,7 @@ class InputProcessor: ...@@ -224,13 +276,7 @@ class InputProcessor:
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
) )
from vllm.platforms import current_platform self._platform_validate_request(processed_inputs, params)
current_platform.validate_request(
prompt=prompt,
params=params,
processed_inputs=processed_inputs,
)
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
self._validate_model_inputs(encoder_inputs, decoder_inputs) self._validate_model_inputs(encoder_inputs, decoder_inputs)
......
...@@ -234,10 +234,16 @@ class LLMEngine: ...@@ -234,10 +234,16 @@ class LLMEngine:
# Process raw inputs into the request. # Process raw inputs into the request.
if isinstance(prompt, EngineCoreRequest): if isinstance(prompt, EngineCoreRequest):
logger.warning_once(
"Passing EngineCoreRequest to LLMEngine.generate() and .add_requests() "
"is deprecated and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)
request = prompt request = prompt
if request_id != request.request_id: if request_id != request.request_id:
logger.warning_once( logger.warning_once(
"AsyncLLM.add_request() was passed a request_id parameter that " "LLMEngine.add_request() was passed a request_id parameter that "
"does not match the EngineCoreRequest.request_id attribute. The " "does not match the EngineCoreRequest.request_id attribute. The "
"latter will be used, and the former will be ignored." "latter will be used, and the former will be ignored."
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment