Unverified commit 56212b49, authored by Graham King, committed by GitHub
Browse files

fix(frontend): Update vllm processor for vllm 0.16 (#6799)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent 9fe03dd8
...@@ -11,6 +11,7 @@ from typing import Any ...@@ -11,6 +11,7 @@ from typing import Any
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall
from vllm.reasoning import ReasoningParser from vllm.reasoning import ReasoningParser
from vllm.renderers import ChatParams
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser from vllm.tool_parsers import ToolParser
...@@ -73,9 +74,7 @@ def _prepare_request( ...@@ -73,9 +74,7 @@ def _prepare_request(
*, *,
tokenizer: TokenizerLike, tokenizer: TokenizerLike,
tool_parser_class: type[ToolParser] | None, tool_parser_class: type[ToolParser] | None,
) -> tuple[ ) -> tuple[ChatCompletionRequest, ToolParser | None, dict[str, Any], Any, ChatParams]:
ChatCompletionRequest, ToolParser | None, dict[str, Any], Any, dict[str, Any]
]:
"""Validate request and build arguments for template rendering. """Validate request and build arguments for template rendering.
Returns: Returns:
...@@ -83,7 +82,7 @@ def _prepare_request( ...@@ -83,7 +82,7 @@ def _prepare_request(
tool_parser: Instantiated tool parser, or None. tool_parser: Instantiated tool parser, or None.
chat_template_kwargs: Template kwargs (for PreprocessResult). chat_template_kwargs: Template kwargs (for PreprocessResult).
messages_for_render: Messages to pass as first arg to render_messages. messages_for_render: Messages to pass as first arg to render_messages.
render_kwargs: Keyword arguments for render_messages / render_messages_async. chat_params: ChatParams for render_messages / render_messages_async.
""" """
if isinstance(request, ChatCompletionRequest): if isinstance(request, ChatCompletionRequest):
request_for_sampling = request request_for_sampling = request
...@@ -123,15 +122,17 @@ def _prepare_request( ...@@ -123,15 +122,17 @@ def _prepare_request(
else request_for_sampling.messages else request_for_sampling.messages
) )
render_kwargs = dict( chat_params = ChatParams(
chat_template=request_for_sampling.chat_template, chat_template=request_for_sampling.chat_template,
chat_template_content_format="auto", chat_template_content_format="auto",
add_generation_prompt=request_for_sampling.add_generation_prompt, chat_template_kwargs=dict(
continue_final_message=request_for_sampling.continue_final_message, add_generation_prompt=request_for_sampling.add_generation_prompt,
tools=tool_dicts, continue_final_message=request_for_sampling.continue_final_message,
documents=request_for_sampling.documents, tools=tool_dicts,
tokenize=tokenize_in_template, documents=request_for_sampling.documents,
**chat_template_kwargs, tokenize=tokenize_in_template,
**chat_template_kwargs,
),
) )
return ( return (
...@@ -139,7 +140,7 @@ def _prepare_request( ...@@ -139,7 +140,7 @@ def _prepare_request(
tool_parser, tool_parser,
chat_template_kwargs, chat_template_kwargs,
messages_for_render, messages_for_render,
render_kwargs, chat_params,
) )
...@@ -155,12 +156,12 @@ async def preprocess_chat_request( ...@@ -155,12 +156,12 @@ async def preprocess_chat_request(
tool_parser, tool_parser,
chat_template_kwargs, chat_template_kwargs,
messages, messages,
render_kwargs, chat_params,
) = _prepare_request( ) = _prepare_request(
request, tokenizer=tokenizer, tool_parser_class=tool_parser_class request, tokenizer=tokenizer, tool_parser_class=tool_parser_class
) )
_, engine_prompt = await renderer.render_messages_async(messages, **render_kwargs) _, engine_prompt = await renderer.render_messages_async(messages, chat_params)
if "prompt_token_ids" in engine_prompt: if "prompt_token_ids" in engine_prompt:
tokens = list(engine_prompt["prompt_token_ids"]) tokens = list(engine_prompt["prompt_token_ids"])
...@@ -194,12 +195,12 @@ def preprocess_chat_request_sync( ...@@ -194,12 +195,12 @@ def preprocess_chat_request_sync(
tool_parser, tool_parser,
chat_template_kwargs, chat_template_kwargs,
messages, messages,
render_kwargs, chat_params,
) = _prepare_request( ) = _prepare_request(
request, tokenizer=tokenizer, tool_parser_class=tool_parser_class request, tokenizer=tokenizer, tool_parser_class=tool_parser_class
) )
_, engine_prompt = renderer.render_messages(messages, **render_kwargs) _, engine_prompt = renderer.render_messages(messages, chat_params)
if "prompt_token_ids" in engine_prompt: if "prompt_token_ids" in engine_prompt:
tokens = list(engine_prompt["prompt_token_ids"]) tokens = list(engine_prompt["prompt_token_ids"])
......
...@@ -83,8 +83,6 @@ def map_finish_reason(raw_reason: str | None) -> FinishReason | None: ...@@ -83,8 +83,6 @@ def map_finish_reason(raw_reason: str | None) -> FinishReason | None:
_w_input_processor: InputProcessor | None = None _w_input_processor: InputProcessor | None = None
_w_tokenizer: Any = None _w_tokenizer: Any = None
_w_tool_parser_class: type[ToolParser] | None = None _w_tool_parser_class: type[ToolParser] | None = None
_w_reasoning_parser_class: type[ReasoningParser] | None = None
_w_stream_interval: int = 20
class _PreprocessError(Exception): class _PreprocessError(Exception):
...@@ -113,12 +111,9 @@ def _init_worker( ...@@ -113,12 +111,9 @@ def _init_worker(
config_format: str, config_format: str,
load_format: str, load_format: str,
tool_parser_name: str | None, tool_parser_name: str | None,
reasoning_parser_name: str | None,
stream_interval: int,
) -> None: ) -> None:
"""Initialize a worker process with its own VllmConfig and InputProcessor.""" """Initialize a worker process with its own VllmConfig and InputProcessor."""
global _w_input_processor, _w_tokenizer, _w_tool_parser_class global _w_input_processor, _w_tokenizer, _w_tool_parser_class
global _w_reasoning_parser_class, _w_stream_interval
model_config = ModelConfig( model_config = ModelConfig(
model=model_path, model=model_path,
...@@ -139,14 +134,6 @@ def _init_worker( ...@@ -139,14 +134,6 @@ def _init_worker(
else: else:
_w_tool_parser_class = None _w_tool_parser_class = None
if reasoning_parser_name:
_w_reasoning_parser_class = ReasoningParserManager.get_reasoning_parser(
reasoning_parser_name
)
else:
_w_reasoning_parser_class = None
_w_stream_interval = max(1, stream_interval)
def _worker_warmup() -> bool: def _worker_warmup() -> bool:
"""Dummy task to ensure worker process is fully initialized.""" """Dummy task to ensure worker process is fully initialized."""
...@@ -158,11 +145,7 @@ def _preprocess_worker( ...@@ -158,11 +145,7 @@ def _preprocess_worker(
request_id: str, request_id: str,
model_name: str, model_name: str,
) -> PreprocessWorkerResult: ) -> PreprocessWorkerResult:
"""Preprocess a request in a worker process and return a picklable result. """Preprocess a request in a worker process and return a picklable result."""
This replaces _request_handler's Phase A. No queues — errors propagate
naturally via the Future.
"""
pre = preprocess_chat_request_sync( pre = preprocess_chat_request_sync(
request, request,
tokenizer=_w_tokenizer, tokenizer=_w_tokenizer,
...@@ -838,8 +821,6 @@ class EngineFactory: ...@@ -838,8 +821,6 @@ class EngineFactory:
config_format, config_format,
load_format, load_format,
tool_parser_name, tool_parser_name,
reasoning_parser_name,
self.stream_interval,
), ),
) )
# Warm up all workers to ensure initialization completes # Warm up all workers to ensure initialization completes
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment