Commit d43be1f3 (unverified), authored by Kris Hung, committed by GitHub
Browse files

fix: Fix chat processor for vllm video/audio examples (#6689)

parent 55c0a769
...@@ -15,7 +15,8 @@ from typing import AsyncIterator, Tuple, Union ...@@ -15,7 +15,8 @@ from typing import AsyncIterator, Tuple, Union
import uvloop import uvloop
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.tokenizers import TokenizerLike as AnyTokenizer from vllm.tokenizers import TokenizerLike as AnyTokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
......
...@@ -20,15 +20,15 @@ from typing import AsyncIterator, List, Optional, Protocol, Union, runtime_check ...@@ -20,15 +20,15 @@ from typing import AsyncIterator, List, Optional, Protocol, Union, runtime_check
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.chat_utils import ConversationMessage from vllm.entrypoints.chat_utils import ConversationMessage
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
ChatCompletionRequest, from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
CompletionRequest, from vllm.entrypoints.openai.completion.protocol import CompletionRequest
RequestResponseMetadata, from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
) from vllm.entrypoints.openai.engine.protocol import RequestResponseMetadata
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.renderers.registry import renderer_from_config
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.tokenizers import TokenizerLike as AnyTokenizer from vllm.tokenizers import TokenizerLike as AnyTokenizer
...@@ -41,6 +41,7 @@ class StubEngineClient: ...@@ -41,6 +41,7 @@ class StubEngineClient:
def __init__(self, model_config: ModelConfig): def __init__(self, model_config: ModelConfig):
self.model_config = model_config self.model_config = model_config
self.renderer = renderer_from_config(model_config)
self.input_processor = None self.input_processor = None
self.io_processor = None self.io_processor = None
...@@ -154,9 +155,6 @@ class ChatProcessor: ...@@ -154,9 +155,6 @@ class ChatProcessor:
async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult: async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request) request = self.parse_raw_request(raw_request)
# TODO: Revisit this later when adding multi-modal support for the frontend.
# If no chat template is provided and tokenizer doesn't have one,
# use a simple format that just concatenates messages
if not request.chat_template and not self.tokenizer.chat_template: if not request.chat_template and not self.tokenizer.chat_template:
chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}User: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n{% endif %}{% endfor %}Assistant:" chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}User: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n{% endif %}{% endfor %}Assistant:"
else: else:
...@@ -167,20 +165,14 @@ class ChatProcessor: ...@@ -167,20 +165,14 @@ class ChatProcessor:
engine_prompts, engine_prompts,
) = await self.openai_serving._preprocess_chat( ) = await self.openai_serving._preprocess_chat(
request, request,
self.tokenizer,
request.messages, request.messages,
chat_template=chat_template, default_template=chat_template,
chat_template_content_format=self.openai_serving.chat_template_content_format, default_template_content_format=self.openai_serving.chat_template_content_format,
add_generation_prompt=request.add_generation_prompt, default_template_kwargs=None,
continue_final_message=request.continue_final_message,
tool_dicts=None, tool_dicts=None,
documents=request.documents, tool_parser=None,
chat_template_kwargs=request.chat_template_kwargs,
tool_parser=self.openai_serving.tool_parser,
add_special_tokens=request.add_special_tokens,
) )
# In newer vLLM, _preprocess_chat returns (conversation, engine_prompts) - 2 values
if not conversation or not engine_prompts: if not conversation or not engine_prompts:
raise ValueError( raise ValueError(
"Preprocessing returned empty conversation or engine_prompts" "Preprocessing returned empty conversation or engine_prompts"
...@@ -305,19 +297,14 @@ class CompletionsProcessor: ...@@ -305,19 +297,14 @@ class CompletionsProcessor:
async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult: async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request) request = self.parse_raw_request(raw_request)
# In newer vLLM, _preprocess_completion was removed engine_prompts = await self.openai_serving._preprocess_completion(
# Use the renderer approach instead request,
renderer = self.openai_serving._get_renderer(self.tokenizer) prompt_input=request.prompt,
config = self.openai_serving._build_render_config(request)
engine_prompts = await renderer.render_prompt_and_embeds(
prompt_or_prompts=request.prompt,
prompt_embeds=getattr(request, "prompt_embeds", None), prompt_embeds=getattr(request, "prompt_embeds", None),
config=config,
) )
# engine_prompts is now a list of TokensPrompt
if not engine_prompts: if not engine_prompts:
raise ValueError("Renderer returned empty engine_prompts") raise ValueError("Preprocessing returned empty engine_prompts")
return PreprocessResult(None, engine_prompts[0]) return PreprocessResult(None, engine_prompts[0])
async def stream_response( async def stream_response(
...@@ -332,6 +319,7 @@ class CompletionsProcessor: ...@@ -332,6 +319,7 @@ class CompletionsProcessor:
raise ValueError("Only streaming responses are supported") raise ValueError("Only streaming responses are supported")
async for raw_response in self.openai_serving.completion_stream_generator( async for raw_response in self.openai_serving.completion_stream_generator(
request, request,
[], # engine_prompts (not needed for streaming output)
result_generator, result_generator,
request_id, request_id,
int(time.time()), # created_time int(time.time()), # created_time
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment