Unverified commit 56212b49, authored by Graham King, committed by GitHub
Browse files

fix(frontend): Update vllm processor for vllm 0.16 (#6799)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent 9fe03dd8
......@@ -11,6 +11,7 @@ from typing import Any
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall
from vllm.reasoning import ReasoningParser
from vllm.renderers import ChatParams
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser
......@@ -73,9 +74,7 @@ def _prepare_request(
*,
tokenizer: TokenizerLike,
tool_parser_class: type[ToolParser] | None,
) -> tuple[
ChatCompletionRequest, ToolParser | None, dict[str, Any], Any, dict[str, Any]
]:
) -> tuple[ChatCompletionRequest, ToolParser | None, dict[str, Any], Any, ChatParams]:
"""Validate request and build arguments for template rendering.
Returns:
......@@ -83,7 +82,7 @@ def _prepare_request(
tool_parser: Instantiated tool parser, or None.
chat_template_kwargs: Template kwargs (for PreprocessResult).
messages_for_render: Messages to pass as first arg to render_messages.
render_kwargs: Keyword arguments for render_messages / render_messages_async.
chat_params: ChatParams for render_messages / render_messages_async.
"""
if isinstance(request, ChatCompletionRequest):
request_for_sampling = request
......@@ -123,15 +122,17 @@ def _prepare_request(
else request_for_sampling.messages
)
render_kwargs = dict(
chat_params = ChatParams(
chat_template=request_for_sampling.chat_template,
chat_template_content_format="auto",
add_generation_prompt=request_for_sampling.add_generation_prompt,
continue_final_message=request_for_sampling.continue_final_message,
tools=tool_dicts,
documents=request_for_sampling.documents,
tokenize=tokenize_in_template,
**chat_template_kwargs,
chat_template_kwargs=dict(
add_generation_prompt=request_for_sampling.add_generation_prompt,
continue_final_message=request_for_sampling.continue_final_message,
tools=tool_dicts,
documents=request_for_sampling.documents,
tokenize=tokenize_in_template,
**chat_template_kwargs,
),
)
return (
......@@ -139,7 +140,7 @@ def _prepare_request(
tool_parser,
chat_template_kwargs,
messages_for_render,
render_kwargs,
chat_params,
)
......@@ -155,12 +156,12 @@ async def preprocess_chat_request(
tool_parser,
chat_template_kwargs,
messages,
render_kwargs,
chat_params,
) = _prepare_request(
request, tokenizer=tokenizer, tool_parser_class=tool_parser_class
)
_, engine_prompt = await renderer.render_messages_async(messages, **render_kwargs)
_, engine_prompt = await renderer.render_messages_async(messages, chat_params)
if "prompt_token_ids" in engine_prompt:
tokens = list(engine_prompt["prompt_token_ids"])
......@@ -194,12 +195,12 @@ def preprocess_chat_request_sync(
tool_parser,
chat_template_kwargs,
messages,
render_kwargs,
chat_params,
) = _prepare_request(
request, tokenizer=tokenizer, tool_parser_class=tool_parser_class
)
_, engine_prompt = renderer.render_messages(messages, **render_kwargs)
_, engine_prompt = renderer.render_messages(messages, chat_params)
if "prompt_token_ids" in engine_prompt:
tokens = list(engine_prompt["prompt_token_ids"])
......
......@@ -83,8 +83,6 @@ def map_finish_reason(raw_reason: str | None) -> FinishReason | None:
_w_input_processor: InputProcessor | None = None
_w_tokenizer: Any = None
_w_tool_parser_class: type[ToolParser] | None = None
_w_reasoning_parser_class: type[ReasoningParser] | None = None
_w_stream_interval: int = 20
class _PreprocessError(Exception):
......@@ -113,12 +111,9 @@ def _init_worker(
config_format: str,
load_format: str,
tool_parser_name: str | None,
reasoning_parser_name: str | None,
stream_interval: int,
) -> None:
"""Initialize a worker process with its own VllmConfig and InputProcessor."""
global _w_input_processor, _w_tokenizer, _w_tool_parser_class
global _w_reasoning_parser_class, _w_stream_interval
model_config = ModelConfig(
model=model_path,
......@@ -139,14 +134,6 @@ def _init_worker(
else:
_w_tool_parser_class = None
if reasoning_parser_name:
_w_reasoning_parser_class = ReasoningParserManager.get_reasoning_parser(
reasoning_parser_name
)
else:
_w_reasoning_parser_class = None
_w_stream_interval = max(1, stream_interval)
def _worker_warmup() -> bool:
"""Dummy task to ensure worker process is fully initialized."""
......@@ -158,11 +145,7 @@ def _preprocess_worker(
request_id: str,
model_name: str,
) -> PreprocessWorkerResult:
"""Preprocess a request in a worker process and return a picklable result.
This replaces _request_handler's Phase A. No queues — errors propagate
naturally via the Future.
"""
"""Preprocess a request in a worker process and return a picklable result."""
pre = preprocess_chat_request_sync(
request,
tokenizer=_w_tokenizer,
......@@ -838,8 +821,6 @@ class EngineFactory:
config_format,
load_format,
tool_parser_name,
reasoning_parser_name,
self.stream_interval,
),
)
# Warm up all workers to ensure initialization completes
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment