# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence from typing import Any, Final import partial_json_parser import regex as re from fastapi import Request from openai_harmony import Message as OpenAIMessage from partial_json_parser.core.options import Allow from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, ConversationMessage, get_history_tool_calls_cnt, make_tool_call_id, ) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, ChatCompletionLogProbsContent, ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, ) from vllm.entrypoints.openai.chat_completion.stream_harmony import ( TokenState, extract_harmony_streaming_delta, ) from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, RequestResponseMetadata, ToolCall, UsageInfo, ) from vllm.entrypoints.openai.engine.serving import ( GenerationError, OpenAIServing, clamp_prompt_logprobs, ) from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.entrypoints.openai.parser.harmony_utils import ( get_developer_message, get_stop_tokens_for_assistant_actions, get_streamable_parser_for_assistant, get_system_message, parse_chat_inputs_to_harmony_messages, parse_chat_output, render_for_completion, ) from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.utils import get_max_tokens, should_include_usage from vllm.inputs.data import ProcessorInputs, TokensPrompt from vllm.logger import init_logger from vllm.logprobs import Logprob from vllm.outputs import CompletionOutput, RequestOutput from vllm.parser import ParserManager from vllm.reasoning import ReasoningParser from vllm.renderers import ChatParams from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.tokenizers import TokenizerLike from vllm.tool_parsers import ToolParser from vllm.tool_parsers.mistral_tool_parser import MistralToolCall from vllm.tool_parsers.utils import partial_json_loads from vllm.utils.collection_utils import as_list from vllm.utils.mistral import is_mistral_tokenizer from vllm.utils.mistral import mt as _mt logger = init_logger(__name__) class OpenAIServingChat(OpenAIServing): def __init__( self, engine_client: EngineClient, models: OpenAIServingModels, response_role: str, *, request_logger: RequestLogger | None, chat_template: str | None, chat_template_content_format: ChatTemplateContentFormatOption, trust_request_chat_template: bool = False, return_tokens_as_token_ids: bool = False, reasoning_parser: str = "", enable_auto_tools: bool = False, exclude_tools_when_tool_choice_none: bool = False, tool_parser: str | None = None, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, enable_log_outputs: bool = False, enable_log_deltas: bool = True, default_chat_template_kwargs: dict[str, Any] | None = None, ) -> None: super().__init__( engine_client=engine_client, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, ) self.response_role = response_role self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format self.trust_request_chat_template = trust_request_chat_template self.default_chat_template_kwargs = default_chat_template_kwargs or {} self.enable_log_outputs = enable_log_outputs self.enable_log_deltas = enable_log_deltas # set up reasoning parser self.reasoning_parser_cls = ParserManager.get_reasoning_parser( reasoning_parser_name=reasoning_parser ) # set up tool use self.enable_auto_tools: bool = enable_auto_tools self.tool_parser = ParserManager.get_tool_parser( tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools, model_name=self.model_config.model, ) self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none self.enable_prompt_tokens_details = enable_prompt_tokens_details self.enable_force_include_usage = enable_force_include_usage self.default_sampling_params = self.model_config.get_diff_sampling_param() mc = self.model_config self.override_max_tokens = ( self.default_sampling_params.get("max_tokens") if mc.generation_config not in ("auto", "vllm") else getattr(mc, "override_generation_config", {}).get("max_new_tokens") ) self.use_harmony = self.model_config.hf_config.model_type == "gpt_oss" if self.use_harmony: if "stop_token_ids" not in self.default_sampling_params: self.default_sampling_params["stop_token_ids"] = [] self.default_sampling_params["stop_token_ids"].extend( get_stop_tokens_for_assistant_actions() ) # Handle tool call ID type for Kimi K2 (supporting test mocking via overrides) hf_overrides = getattr(self.model_config, "hf_overrides", None) if self.model_config.hf_text_config.model_type == "kimi_k2" or ( isinstance(hf_overrides, dict) and hf_overrides.get("model_type") == "kimi_k2" ): self.tool_call_id_type = "kimi_k2" else: self.tool_call_id_type = "random" # NOTE(woosuk): While OpenAI's chat completion API supports browsing # for some models, currently vLLM doesn't support it. Please use the # Responses API instead. self.supports_browsing = False self.browser_tool = None # NOTE(woosuk): Chat completion API does not support code interpreter. # Please use the Responses API instead. self.supports_code_interpreter = False self.python_tool = None def warmup(self) -> None: self.renderer.warmup( ChatParams( chat_template=self.chat_template, chat_template_content_format=self.chat_template_content_format, chat_template_kwargs=self.default_chat_template_kwargs, ) ) async def render_chat_request( self, request: ChatCompletionRequest, ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse: """ render chat request by validating and preprocessing inputs. Returns: A tuple of (conversation, engine_prompts) on success, or an ErrorResponse on failure. """ error_check_ret = await self._check_model(request) if error_check_ret is not None: logger.error("Error with model %s", error_check_ret) return error_check_ret # If the engine is dead, raise the engine's DEAD_ERROR. # This is required for the streaming case, where we return a # success status before we actually start generating text :). if self.engine_client.errored: raise self.engine_client.dead_error tokenizer = self.renderer.tokenizer tool_parser = self.tool_parser if is_mistral_tokenizer(tokenizer): # because of issues with pydantic we need to potentially # re-serialize the tool_calls field of the request # for more info: see comment in `maybe_serialize_tool_calls` _mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type] _mt.truncate_tool_call_ids(request) # type: ignore[arg-type] _mt.validate_request_params(request) # Check if tool parsing is unavailable (common condition) tool_parsing_unavailable = ( tool_parser is None and not is_mistral_tokenizer(tokenizer) and not self.use_harmony ) # Validate tool_choice when tool parsing is required but unavailable if tool_parsing_unavailable and request.tool_choice not in ( None, "none", ): if request.tool_choice == "auto" and not self.enable_auto_tools: # for hf tokenizers, "auto" tools requires # --enable-auto-tool-choice and --tool-call-parser return self.create_error_response( '"auto" tool choice requires ' "--enable-auto-tool-choice and --tool-call-parser to be set" ) elif request.tool_choice != "auto": # "required" or named tool requires tool parser return self.create_error_response( f'tool_choice="{request.tool_choice}" requires ' "--tool-call-parser to be set" ) if request.tools is None or ( request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none ): tool_dicts = None else: tool_dicts = [tool.model_dump() for tool in request.tools] if not self.use_harmony: # Common case. error_check_ret = self._validate_chat_template( request_chat_template=request.chat_template, chat_template_kwargs=request.chat_template_kwargs, trust_request_chat_template=self.trust_request_chat_template, ) if error_check_ret is not None: return error_check_ret conversation, engine_prompts = await self._preprocess_chat( request, request.messages, default_template=self.chat_template, default_template_content_format=self.chat_template_content_format, default_template_kwargs=self.default_chat_template_kwargs, tool_dicts=tool_dicts, tool_parser=tool_parser, ) else: # For GPT-OSS. should_include_tools = tool_dicts is not None conversation, engine_prompts = self._make_request_with_harmony( request, should_include_tools ) return conversation, engine_prompts async def create_chat_completion( self, request: ChatCompletionRequest, raw_request: Request | None = None, ) -> AsyncGenerator[str, None] | ChatCompletionResponse | ErrorResponse: """ Chat Completion API similar to OpenAI's API. See https://platform.openai.com/docs/api-reference/chat/create for the API specification. This API mimics the OpenAI Chat Completion API. """ # Streaming response tokenizer = self.renderer.tokenizer assert tokenizer is not None reasoning_parser: ReasoningParser | None = None if self.reasoning_parser_cls: # Pass the same chat template kwargs as used in tokenization chat_template_kwargs = self._prepare_extra_chat_template_kwargs( request.chat_template_kwargs, self.default_chat_template_kwargs, ) reasoning_parser = self.reasoning_parser_cls( tokenizer, chat_template_kwargs=chat_template_kwargs, # type: ignore[call-arg] ) result = await self.render_chat_request(request) if isinstance(result, ErrorResponse): return result conversation, engine_prompts = result request_id = ( f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}" ) request_metadata = RequestResponseMetadata(request_id=request_id) if raw_request: raw_request.state.request_metadata = request_metadata lora_request = self._maybe_get_adapters(request, supports_default_mm_loras=True) model_name = self.models.model_name(lora_request) # Extract data_parallel_rank from header (router can inject it) data_parallel_rank = self._get_data_parallel_rank(raw_request) # Schedule the request and get the result generator. max_model_len = self.model_config.max_model_len generators: list[AsyncGenerator[RequestOutput, None]] = [] for i, engine_prompt in enumerate(engine_prompts): prompt_token_ids = self._extract_prompt_components(engine_prompt).token_ids # If we are creating sub requests for multiple prompts, ensure that they # have unique request ids. sub_request_id = ( request_id if len(engine_prompts) == 1 else f"{request_id}_{i}" ) max_tokens = get_max_tokens( max_model_len, request.max_completion_tokens if request.max_completion_tokens is not None else request.max_tokens, self._extract_prompt_len(engine_prompt), self.default_sampling_params, self.override_max_tokens, ) sampling_params: SamplingParams | BeamSearchParams if request.use_beam_search: sampling_params = request.to_beam_search_params( max_tokens, self.default_sampling_params ) else: sampling_params = request.to_sampling_params( max_tokens, self.default_sampling_params, ) self._log_inputs( sub_request_id, engine_prompt, params=sampling_params, lora_request=lora_request, ) trace_headers = ( None if raw_request is None else await self._get_trace_headers(raw_request.headers) ) if isinstance(sampling_params, BeamSearchParams): generator = self.beam_search( prompt=engine_prompt, request_id=sub_request_id, params=sampling_params, lora_request=lora_request, trace_headers=trace_headers, ) else: reasoning_ended = ( reasoning_parser.is_reasoning_end(prompt_token_ids or []) if reasoning_parser else None ) generator = self.engine_client.generate( engine_prompt, sampling_params, sub_request_id, lora_request=lora_request, trace_headers=trace_headers, priority=request.priority, data_parallel_rank=data_parallel_rank, reasoning_ended=reasoning_ended, ) generators.append(generator) assert len(generators) == 1 (result_generator,) = generators if request.stream: return self.chat_completion_stream_generator( request, result_generator, request_id, model_name, conversation, tokenizer, request_metadata, reasoning_parser, ) return await self.chat_completion_full_generator( request, result_generator, request_id, model_name, conversation, tokenizer, request_metadata, reasoning_parser, ) def get_chat_request_role(self, request: ChatCompletionRequest) -> str: if request.add_generation_prompt: return self.response_role return request.messages[-1]["role"] @staticmethod def _bracket_level(s: str, opening="{", closing="}") -> int: """ Calculate the current level of nested brackets in a given string. """ level = 0 for char in s: if char == opening: level += 1 elif char == closing: level -= 1 return level @staticmethod def _filter_delta_text(delta_text: str, previous_text: str) -> tuple[str, bool]: # remove last '},' of the tool definition stemming from the # "name"/"parameters" outer object or closing ']' of the tool list # count occurrences of opening and closing curly braces and # once level 0 is reached stop outputting text # if 0 is reached while parsing the delta_text we know the current # tool will finish in this current iteration bracket_level = OpenAIServingChat._bracket_level(previous_text) updated_delta, passed_zero = "", False for c in delta_text: if c == "{": bracket_level += 1 passed_zero = bracket_level == 0 elif c == "}": bracket_level -= 1 passed_zero = bracket_level == 0 if bracket_level != 0: updated_delta += c else: # if a comma is reached at level 0 we can stop if c == ",": break return updated_delta, passed_zero def extract_tool_call_required_streaming( self, previous_text: str, current_text: str | None, delta_text: str, function_name_returned: bool, tool_call_idx: int | None = None, ) -> tuple[DeltaMessage | None, bool]: if current_text is None or current_text == "": # if the current text is empty, we cannot parse it return None, function_name_returned try: flags = Allow.ALL obj, _ = partial_json_loads(current_text, flags) except ( partial_json_parser.core.exceptions.MalformedJSON, json.JSONDecodeError, ): logger.debug("not enough tokens to parse into JSON yet") obj = None # check if the current text is a valid array # containing a partial tool calling object # if not repeat if obj is None or not isinstance(obj, list) or not len(obj) > 0: function_name_returned = False delta_message = None else: _, finishes_previous_tool = OpenAIServingChat._filter_delta_text( delta_text, previous_text ) # take the last tool call from the generated list current_tool_call = obj[-1] # once parameters have been generated the name is complete as well if not finishes_previous_tool and ( "name" not in current_tool_call or "parameters" not in current_tool_call ): function_name_returned = False delta_message = None else: if not function_name_returned: # get partly generated arguments from the latest tool call param_match = re.search( r'.*"parameters":\s*(.*)', current_text, re.DOTALL ) arguments = param_match.group(1) if param_match else "" arguments, _ = OpenAIServingChat._filter_delta_text( arguments, previous_text ) # if this iteration finishes a previous tool call but a # new incomplete tool is already generated, take the # previous from the list if finishes_previous_tool and "parameters" not in current_tool_call: current_tool_call = obj[-2] function_name_returned = True tool_call_id = make_tool_call_id( id_type=self.tool_call_id_type, func_name=current_tool_call["name"], idx=tool_call_idx, ) delta_message = DeltaMessage( tool_calls=[ DeltaToolCall( id=tool_call_id, function=DeltaFunctionCall( name=current_tool_call["name"], arguments=arguments ), index=len(obj) - 1, type="function", ) ] ) else: delta_text, _ = OpenAIServingChat._filter_delta_text( delta_text, previous_text ) if delta_text != "": delta_message = DeltaMessage( tool_calls=[ DeltaToolCall( function=DeltaFunctionCall( # OpenAI API returns None # instead of name every time name=None, arguments=delta_text, ), index=len(obj) - 1, ) ] ) else: delta_message = None return delta_message, function_name_returned async def chat_completion_stream_generator( self, request: ChatCompletionRequest, result_generator: AsyncIterator[RequestOutput], request_id: str, model_name: str, conversation: list[ConversationMessage], tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, reasoning_parser: ReasoningParser | None = None, ) -> AsyncGenerator[str, None]: created_time = int(time.time()) chunk_object_type: Final = "chat.completion.chunk" first_iteration = True # Send response for each token for each request.n (index) num_choices = 1 if request.n is None else request.n previous_num_tokens = [0] * num_choices finish_reason_sent = [False] * num_choices num_prompt_tokens = 0 num_cached_tokens = None if self.use_harmony: harmony_parsers = [ get_streamable_parser_for_assistant() for _ in range(num_choices) ] harmony_tools_streamed = [False] * num_choices tools_streamed = [False] * num_choices if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam): tool_choice_function_name = request.tool_choice.function.name else: tool_choice_function_name = None # Determine whether tools are in use with "auto" tool choice tool_choice_auto = ( not tool_choice_function_name and self._should_stream_with_auto_tool_parsing(request) ) all_previous_token_ids: list[list[int]] | None function_name_returned = [False] * num_choices if self.tool_call_id_type == "kimi_k2": history_tool_call_cnt = get_history_tool_calls_cnt(conversation) else: history_tool_call_cnt = 0 # Always track previous_texts for comprehensive output logging previous_texts = [""] * num_choices # Only one of these will be used, thus previous_texts and # all_previous_token_ids will not be used twice in the same iteration. if tool_choice_auto or reasoning_parser: # These are only required in "auto" tool choice case all_previous_token_ids = [[]] * num_choices # For reasoning parser and tool call all enabled added_content_delta_arr = [False] * num_choices reasoning_end_arr = [False] * num_choices prompt_is_reasoning_end_arr: list[bool | None] = [None] * num_choices else: all_previous_token_ids = None # Prepare the tool parser if it's needed try: if tool_choice_auto and self.tool_parser: if tokenizer is None: raise ValueError( "Tokenizer not available when `skip_tokenizer_init=True`" ) tool_parsers: list[ToolParser | None] = [ self.tool_parser(tokenizer) ] * num_choices else: tool_parsers = [None] * num_choices except Exception as e: logger.exception("Error in tool parser creation.") data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" yield "data: [DONE]\n\n" return stream_options = request.stream_options include_usage, include_continuous_usage = should_include_usage( stream_options, self.enable_force_include_usage ) try: async for res in result_generator: if res.prompt_token_ids is not None: num_prompt_tokens = len(res.prompt_token_ids) if res.encoder_prompt_token_ids is not None: num_prompt_tokens += len(res.encoder_prompt_token_ids) # We need to do it here, because if there are exceptions in # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). if first_iteration: num_cached_tokens = res.num_cached_tokens # Send first response for each request.n (index) with # the role role = self.get_chat_request_role(request) # NOTE num_choices defaults to 1 so this usually executes # once per request for i in range(num_choices): choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage( role=role, content="", ), logprobs=None, finish_reason=None, ) # return prompt_token_ids at the first chunk ever chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, created=created_time, choices=[choice_data], model=model_name, prompt_token_ids=( res.prompt_token_ids if request.return_token_ids else None ), ) # if continuous usage stats are requested, add it if include_continuous_usage: chunk.usage = UsageInfo( prompt_tokens=num_prompt_tokens, completion_tokens=0, total_tokens=num_prompt_tokens, ) data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" # Send response to echo the input portion of the # last message if request.echo: last_msg_content: str | list[dict[str, str]] = "" if ( conversation and "content" in conversation[-1] and conversation[-1].get("role") == role ): last_msg_content = conversation[-1]["content"] or "" if last_msg_content: for i in range(num_choices): choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage(content=last_msg_content), logprobs=None, finish_reason=None, ) chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, created=created_time, choices=[choice_data], model=model_name, ) if include_continuous_usage: chunk.usage = UsageInfo( prompt_tokens=num_prompt_tokens, completion_tokens=0, total_tokens=num_prompt_tokens, ) data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" first_iteration = False for output in res.outputs: i = output.index tool_parser = tool_parsers[i] if ( reasoning_parser and res.prompt_token_ids and prompt_is_reasoning_end_arr[i] is None ): # only check once per choice, because prompt_token_ids # are the same for all deltas in that choice prompt_is_reasoning_end_arr[i] = ( reasoning_parser.is_reasoning_end(res.prompt_token_ids) ) if finish_reason_sent[i]: continue if request.logprobs and request.top_logprobs is not None: assert output.logprobs is not None, "Did not output logprobs" logprobs = self._create_chat_logprobs( token_ids=output.token_ids, top_logprobs=output.logprobs, tokenizer=tokenizer, num_output_top_logprobs=request.top_logprobs, return_as_token_id=request.return_tokens_as_token_ids, ) else: logprobs = None if self.use_harmony: harmony_parser = harmony_parsers[i] prev_recipient = harmony_parser.current_recipient # Track accumulated content per token with their state token_states: list[TokenState] = [] for token_id in output.token_ids: harmony_parser.process(token_id) token_delta = harmony_parser.last_content_delta or "" token_states.append( TokenState( harmony_parser.current_channel, harmony_parser.current_recipient, token_delta, ) ) delta_text = "".join(delta for _, _, delta in token_states) cur_channel = harmony_parser.current_channel # handle the case where several tokens where generated at once # including the final token, leading to a delta in the text # but the current channel to be empty (start state) if not cur_channel and delta_text: cur_channel = "final" else: delta_text = output.text if ( not delta_text and not output.token_ids and not previous_num_tokens[i] ): # Chunked prefill case, don't return empty chunks continue delta_message: DeltaMessage | None # just update previous_texts and previous_token_ids if tool_choice_auto or reasoning_parser: assert previous_texts is not None assert all_previous_token_ids is not None previous_text = previous_texts[i] previous_token_ids = all_previous_token_ids[i] current_text = previous_text + delta_text # avoid the None + list error. if previous_token_ids: current_token_ids = previous_token_ids + as_list( output.token_ids ) else: current_token_ids = as_list(output.token_ids) if self.use_harmony: delta_message, tools_streamed_flag = ( extract_harmony_streaming_delta( harmony_parser=harmony_parser, token_states=token_states, prev_recipient=prev_recipient, include_reasoning=request.include_reasoning, ) ) harmony_tools_streamed[i] |= tools_streamed_flag # handle streaming deltas for tools with named tool_choice elif tool_choice_function_name: # When encountering think end id in prompt_token_ids # i.e {"enable_thinking": False}, # check BEFORE calling the parser to avoid a spurious # reasoning delta on the first chunk. if ( reasoning_parser and not reasoning_end_arr[i] and prompt_is_reasoning_end_arr[i] ): reasoning_end_arr[i] = True if ( reasoning_parser and not reasoning_end_arr[i] and not reasoning_parser.is_reasoning_end( previous_token_ids ) ): assert reasoning_parser is not None delta_message = ( reasoning_parser.extract_reasoning_streaming( previous_text, current_text, delta_text, previous_token_ids, current_token_ids, output.token_ids, ) ) # When encountering think end id in delta_token_ids, # set reasoning status to end. # Only keep 'content', remove 'reasoning'. if reasoning_parser.is_reasoning_end( as_list(output.token_ids) ): reasoning_end_arr[i] = True if delta_message and delta_message.content: # This need to be added to next `delta_text` current_text = delta_message.content delta_message.content = None else: current_text = "" else: # Just to add remaining `content` if reasoning_parser: delta_text = previous_text + delta_text current_text = "" if function_name_returned[i]: delta_tool_call = DeltaToolCall( function=DeltaFunctionCall(arguments=delta_text), index=i, ) else: # Generate ID based on tokenizer type if is_mistral_tokenizer(tokenizer): tool_call_id = MistralToolCall.generate_random_id() else: tool_call_id = make_tool_call_id( id_type=self.tool_call_id_type, func_name=tool_choice_function_name, idx=history_tool_call_cnt, ) delta_tool_call = DeltaToolCall( id=tool_call_id, type="function", function=DeltaFunctionCall( name=tool_choice_function_name, arguments=delta_text, ), index=i, ) function_name_returned[i] = True history_tool_call_cnt += 1 delta_message = DeltaMessage( tool_calls=[ delta_tool_call, ] ) tools_streamed[i] = True elif request.tool_choice == "required": assert previous_texts is not None previous_text = previous_texts[i] current_text = previous_text + delta_text fn_name_returned = function_name_returned[i] output_token_ids = as_list(output.token_ids) if ( reasoning_parser is not None and not reasoning_end_arr[i] and prompt_is_reasoning_end_arr[i] ): reasoning_end_arr[i] = True if reasoning_parser and not reasoning_end_arr[i]: delta_message = ( reasoning_parser.extract_reasoning_streaming( previous_text, current_text, delta_text, previous_token_ids, current_token_ids, output_token_ids, ) ) if reasoning_parser.is_reasoning_end(output_token_ids): reasoning_end_arr[i] = True if delta_message and delta_message.content: current_text = delta_message.content delta_message.content = None else: # reasoning ended current_text = "" else: # either finished reasoning or no reasoning at all content = current_text delta_message, function_name_returned[i] = ( self.extract_tool_call_required_streaming( previous_text=previous_text, current_text=content, delta_text=delta_text, function_name_returned=fn_name_returned, tool_call_idx=history_tool_call_cnt, ) ) if ( delta_message and delta_message.tool_calls and delta_message.tool_calls[0].id is not None ): history_tool_call_cnt += 1 tools_streamed[i] = True # handle streaming deltas for tools with "auto" tool choice # and reasoning parser elif tool_choice_auto and reasoning_parser: assert tool_parser is not None assert added_content_delta_arr is not None assert reasoning_end_arr is not None output_token_ids = as_list(output.token_ids) if not reasoning_end_arr[i]: # When encountering think end id in prompt_token_ids # i.e {"enable_thinking": False}, # set reasoning status to end. if prompt_is_reasoning_end_arr[i]: reasoning_end_arr[i] = True current_token_ids = output_token_ids # Don't update current_text, keep it as is from delta else: delta_message = ( reasoning_parser.extract_reasoning_streaming( previous_text, current_text, delta_text, previous_token_ids, current_token_ids, output_token_ids, ) ) # When encountering think end id in delta_token_ids, # set reasoning status to end. # Remove the text and token ids related # to 'reasoning'. if reasoning_parser.is_reasoning_end(output_token_ids): reasoning_end_arr[i] = True current_token_ids = ( reasoning_parser.extract_content_ids( output_token_ids ) ) if delta_message and delta_message.content: current_text = delta_message.content delta_message.content = None else: current_text = "" # handle tool calls only after reasoning is done, if reasoning_end_arr[i]: delta_token_ids = output_token_ids # First time to tool call, # add the remaining text and token ids # to delta from previous if not added_content_delta_arr[i]: added_content_delta_arr[i] = True previous_text = "" previous_token_ids = [] delta_text = current_text delta_token_ids = current_token_ids delta_message = tool_parser.extract_tool_calls_streaming( previous_text=previous_text, current_text=current_text, delta_text=delta_text, previous_token_ids=previous_token_ids, current_token_ids=current_token_ids, delta_token_ids=delta_token_ids, request=request, ) if delta_message and delta_message.tool_calls: tools_streamed[i] = True # when only tool calls elif tool_choice_auto: assert tool_parser is not None delta_message = tool_parser.extract_tool_calls_streaming( previous_text=previous_text, current_text=current_text, delta_text=delta_text, previous_token_ids=previous_token_ids, current_token_ids=current_token_ids, delta_token_ids=output.token_ids, request=request, ) if delta_message and delta_message.tool_calls: tools_streamed[i] = True # when only reasoning elif reasoning_parser: # When encountering think end id in prompt_token_ids # i.e {"enable_thinking": False}, # set reasoning status to end. # Route all generated tokens as content directly. if prompt_is_reasoning_end_arr[i]: delta_message = DeltaMessage(content=delta_text) else: delta_message = ( reasoning_parser.extract_reasoning_streaming( previous_text, current_text, delta_text, previous_token_ids, current_token_ids, output.token_ids, ) ) # handle streaming just a content delta else: delta_message = DeltaMessage(content=delta_text) # update the previous values for the next iteration if (tool_choice_auto or reasoning_parser) and not self.use_harmony: assert previous_texts is not None assert all_previous_token_ids is not None previous_texts[i] = current_text all_previous_token_ids[i] = current_token_ids else: # Update for comprehensive logging even in simple case assert previous_texts is not None previous_texts[i] += delta_text # set the previous values for the next iteration previous_num_tokens[i] += len(output.token_ids) # if the message delta is None (e.g. because it was a # "control token" for tool calls or the parser otherwise # wasn't ready to send a token, then # get the next token without streaming a chunk if delta_message is None: # NOTE: If return_token_ids is enabled, we still need to # send a chunk with token_ids even if delta_message is None # to ensure all tokens are included in the response if ( output.finish_reason is None and not request.return_token_ids ): continue delta_message = DeltaMessage() # Log streaming delta if output logging is enabled if self.enable_log_outputs and self.request_logger: delta_content_parts = [] if delta_message.content: delta_content_parts.append(delta_message.content) if delta_message.reasoning: reasoning = delta_message.reasoning delta_content_parts.append(f"[reasoning: {reasoning}]") if delta_message.tool_calls: tool_args = "".join( tc.function.arguments for tc in delta_message.tool_calls if tc.function and tc.function.arguments ) if tool_args: delta_content_parts.append(f"[tool_calls: {tool_args}]") if delta_content_parts and self.enable_log_deltas: delta_content = " ".join(delta_content_parts) self.request_logger.log_outputs( request_id=request_id, outputs=delta_content, output_token_ids=as_list(output.token_ids), finish_reason=output.finish_reason, is_streaming=True, delta=True, ) if output.finish_reason is None: # Send token-by-token response for each request.n choice_data = ChatCompletionResponseStreamChoice( index=i, delta=delta_message, logprobs=logprobs, finish_reason=None, token_ids=( as_list(output.token_ids) if request.return_token_ids else None ), ) # if the model is finished generating else: # check for error finish reason and abort streaming # finish_reason='error' indicates a retryable error self._raise_if_error(output.finish_reason, request_id) # check to make sure we haven't "forgotten" to stream # any tokens that were generated but previously # matched by partial json parsing # only happens if we are NOT using structured outputs auto_tools_called = False if tool_parser: auto_tools_called = len(tool_parser.prev_tool_call_arr) > 0 index = ( len(tool_parser.prev_tool_call_arr) - 1 if auto_tools_called else 0 ) else: index = 0 if ( self._should_check_for_unstreamed_tool_arg_tokens( delta_message, output ) and tool_parser ): latest_delta_len = 0 if ( isinstance( delta_message.tool_calls[0].function, DeltaFunctionCall, ) ) and isinstance( delta_message.tool_calls[0].function.arguments, str ): latest_delta_len = len( delta_message.tool_calls[0].function.arguments ) # get the expected call based on partial JSON # parsing which "autocompletes" the JSON. # Tool parsers (e.g. Qwen3Coder) store # arguments as a JSON string in # prev_tool_call_arr. Calling json.dumps() # on an already-serialized string would # double-serialize it (e.g. '{"k":1}' becomes # '"{\\"k\\":1}"'), which then causes the # replace() below to fail and append the # entire double-serialized string as a # spurious final delta. args = tool_parser.prev_tool_call_arr[index].get( "arguments", {} ) if isinstance(args, str): expected_call = args else: expected_call = json.dumps(args, ensure_ascii=False) # get what we've streamed so far for arguments # for the current tool actual_call = tool_parser.streamed_args_for_tool[index] if latest_delta_len > 0: actual_call = actual_call[:-latest_delta_len] # check to see if there's anything left to stream remaining_call = expected_call.replace(actual_call, "", 1) # set that as a delta message delta_message = self._create_remaining_args_delta( delta_message, remaining_call, index ) # Send the finish response for each request.n only once # In OpenAI's API, when a tool is called, the # finish_reason is: # "tool_calls" for "auto" or "required" tool calls, # and "stop" for named tool calls. if ( auto_tools_called or (tools_streamed[i] and not tool_choice_function_name) or (self.use_harmony and harmony_tools_streamed[i]) ): finish_reason_ = "tool_calls" else: finish_reason_ = ( output.finish_reason if output.finish_reason else "stop" ) choice_data = ChatCompletionResponseStreamChoice( index=i, delta=delta_message, logprobs=logprobs, finish_reason=finish_reason_, stop_reason=output.stop_reason, token_ids=( as_list(output.token_ids) if request.return_token_ids else None ), ) finish_reason_sent[i] = True choice_data = maybe_filter_parallel_tool_calls(choice_data, request) chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, created=created_time, choices=[choice_data], model=model_name, ) # handle usage stats if requested & if continuous if include_continuous_usage: completion_tokens = previous_num_tokens[i] chunk.usage = UsageInfo( prompt_tokens=num_prompt_tokens, completion_tokens=completion_tokens, total_tokens=num_prompt_tokens + completion_tokens, ) data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" # once the final token is handled, if stream_options.include_usage # is sent, send the usage if include_usage: completion_tokens = sum(previous_num_tokens) final_usage = UsageInfo( prompt_tokens=num_prompt_tokens, completion_tokens=completion_tokens, total_tokens=num_prompt_tokens + completion_tokens, ) if self.enable_prompt_tokens_details and num_cached_tokens: final_usage.prompt_tokens_details = PromptTokenUsageInfo( cached_tokens=num_cached_tokens ) final_usage_chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, created=created_time, choices=[], model=model_name, usage=final_usage, ) final_usage_data = final_usage_chunk.model_dump_json( exclude_unset=True, exclude_none=True ) yield f"data: {final_usage_data}\n\n" # report to FastAPI middleware aggregate usage across all choices num_completion_tokens = sum(previous_num_tokens) request_metadata.final_usage_info = UsageInfo( prompt_tokens=num_prompt_tokens, completion_tokens=num_completion_tokens, total_tokens=num_prompt_tokens + num_completion_tokens, ) # Log complete streaming response if output logging is enabled if self.enable_log_outputs and self.request_logger: # Log the complete response for each choice for i in range(num_choices): full_text = ( previous_texts[i] if previous_texts and i < len(previous_texts) else f"" ) self.request_logger.log_outputs( request_id=request_id, outputs=full_text, output_token_ids=None, # Consider also logging all token IDs finish_reason="streaming_complete", is_streaming=True, delta=False, ) except GenerationError as e: yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: logger.exception("Error in chat completion stream generator.") data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" # Send the final done message after all response.n are finished yield "data: [DONE]\n\n" async def chat_completion_full_generator( self, request: ChatCompletionRequest, result_generator: AsyncIterator[RequestOutput], request_id: str, model_name: str, conversation: list[ConversationMessage], tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, reasoning_parser: ReasoningParser | None = None, ) -> ErrorResponse | ChatCompletionResponse: from vllm.tokenizers.mistral import MistralTokenizer created_time = int(time.time()) final_res: RequestOutput | None = None try: async for res in result_generator: final_res = res except asyncio.CancelledError: return self.create_error_response("Client disconnected") assert final_res is not None choices: list[ChatCompletionResponseChoice] = [] if self.tool_call_id_type == "kimi_k2": history_tool_call_cnt = get_history_tool_calls_cnt(conversation) else: history_tool_call_cnt = 0 role = self.get_chat_request_role(request) for output in final_res.outputs: # check for error finish reason and raise GenerationError # finish_reason='error' indicates a retryable request-level internal error self._raise_if_error(output.finish_reason, request_id) token_ids = output.token_ids out_logprobs = output.logprobs tool_call_info = None if request.logprobs and request.top_logprobs is not None: assert out_logprobs is not None, "Did not output logprobs" logprobs = self._create_chat_logprobs( token_ids=token_ids, top_logprobs=out_logprobs, num_output_top_logprobs=request.top_logprobs, tokenizer=tokenizer, return_as_token_id=request.return_tokens_as_token_ids, ) else: logprobs = None if self.use_harmony: reasoning, content, _ = parse_chat_output(token_ids) if not request.include_reasoning: reasoning = None if self.tool_parser is not None: if tokenizer is None: raise ValueError( "Tokenizer not available when `skip_tokenizer_init=True`" ) tool_parser = self.tool_parser(tokenizer) # NOTE: We use token_ids for openai tool parser tool_call_info = tool_parser.extract_tool_calls( "", request=request, token_ids=token_ids, # type: ignore ) content = tool_call_info.content message = ChatMessage( role=role, reasoning=reasoning, content=content, tool_calls=tool_call_info.tool_calls, ) else: message = ChatMessage( role=role, reasoning=reasoning, content=content, ) choice_data = ChatCompletionResponseChoice( index=output.index, message=message, logprobs=logprobs, finish_reason=( "tool_calls" if (tool_call_info is not None and tool_call_info.tools_called) else output.finish_reason if output.finish_reason else "stop" ), stop_reason=output.stop_reason, token_ids=( as_list(output.token_ids) if request.return_token_ids else None ), ) choices.append(choice_data) continue if reasoning_parser: # If the reasoning parser is enabled, # tool calls are extracted exclusively from the content. reasoning, content = reasoning_parser.extract_reasoning( output.text, request=request ) if not request.include_reasoning: reasoning = None else: reasoning = None content = output.text auto_tools_called = False # if auto tools are not enabled, and a named tool choice using # outlines is not being used tool_calls, content = self._parse_tool_calls_from_content( request=request, tokenizer=tokenizer, content=content, enable_auto_tools=self.enable_auto_tools, tool_parser_cls=self.tool_parser, ) tool_call_class = ( MistralToolCall if is_mistral_tokenizer(tokenizer) else ToolCall ) if (not self.enable_auto_tools or not self.tool_parser) and ( not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam) and request.tool_choice != "required" ): message = ChatMessage(role=role, reasoning=reasoning, content=content) elif ( request.tool_choice and type(request.tool_choice) is ChatCompletionNamedToolChoiceParam ): assert tool_calls is not None and len(tool_calls) > 0 tool_call_class_items = [] for idx, tc in enumerate(tool_calls): # Use native ID if available (e.g., Kimi K2), # otherwise generate ID with correct id_type if tc.id: tool_call_class_items.append( tool_call_class(id=tc.id, function=tc) ) else: # Generate ID using the correct format (kimi_k2 or random), # but leave it to the class if it's Mistral to preserve # 9-char IDs if isinstance(tokenizer, MistralTokenizer): tool_call_class_items.append(tool_call_class(function=tc)) else: generated_id = make_tool_call_id( id_type=self.tool_call_id_type, func_name=tc.name, idx=history_tool_call_cnt, ) tool_call_class_items.append( tool_call_class(id=generated_id, function=tc) ) history_tool_call_cnt += 1 message = ChatMessage( role=role, reasoning=reasoning, content="", tool_calls=tool_call_class_items, ) elif request.tool_choice and request.tool_choice == "required": tool_call_class_items = [] tool_calls = tool_calls or [] for idx, tool_call in enumerate(tool_calls): # Use native ID if available, # otherwise generate ID with correct id_type if tool_call.id: tool_call_class_items.append( tool_call_class(id=tool_call.id, function=tool_call) ) else: # Generate ID using the correct format (kimi_k2 or random), # but leave it to the class if it's Mistral to preserve # 9-char IDs if isinstance(tokenizer, MistralTokenizer): tool_call_class_items.append( tool_call_class(function=tool_call) ) else: generated_id = make_tool_call_id( id_type=self.tool_call_id_type, func_name=tool_call.name, idx=history_tool_call_cnt, ) tool_call_class_items.append( tool_call_class(id=generated_id, function=tool_call) ) history_tool_call_cnt += 1 message = ChatMessage( role=role, content="", tool_calls=tool_call_class_items, reasoning=reasoning, ) # if the request doesn't use tool choice # OR specifies to not use a tool elif not request.tool_choice or request.tool_choice == "none": message = ChatMessage(role=role, reasoning=reasoning, content=content) # handle when there are tools and tool choice is auto elif ( request.tools and (request.tool_choice == "auto" or request.tool_choice is None) and self.enable_auto_tools and self.tool_parser ): # In the OpenAI API the finish_reason is "tools_called" # if the tool choice is auto and the model produced a tool # call. The same is not true for named function calls auto_tools_called = tool_calls is not None and len(tool_calls) > 0 if tool_calls: tool_call_items = [] for idx, tc in enumerate(tool_calls): # Use native ID if available (e.g., Kimi K2), # otherwise generate ID with correct id_type if tc.id: tool_call_items.append( tool_call_class(id=tc.id, function=tc) ) else: # Generate ID using the correct format (kimi_k2 or random), # but leave it to the class if it's Mistral to preserve # 9-char IDs if isinstance(tokenizer, MistralTokenizer): tool_call_items.append(tool_call_class(function=tc)) else: generated_id = make_tool_call_id( id_type=self.tool_call_id_type, func_name=tc.name, idx=history_tool_call_cnt, ) tool_call_items.append( tool_call_class(id=generated_id, function=tc) ) history_tool_call_cnt += 1 message = ChatMessage( role=role, reasoning=reasoning, content=content, tool_calls=tool_call_items, ) else: # FOR NOW make it a chat message; we will have to detect # the type to make it later. ret_content = content # try to use content return from tool parser first, # tool parser may do some modify for the content. if content and len(content) > 0: ret_content = content message = ChatMessage( role=role, reasoning=reasoning, content=ret_content, ) # undetermined case that is still important to handle else: logger.error( "Error in chat_completion_full_generator - cannot determine" " if tools should be extracted. Returning a standard chat " "completion." ) message = ChatMessage(role=role, reasoning=reasoning, content=content) # In OpenAI's API, when a tool is called, the finish_reason is: # "tool_calls" for "auto" or "required" tool calls, # and "stop" for named tool calls. is_finish_reason_tool_calls = auto_tools_called or ( request.tool_choice and request.tool_choice == "required" and output.finish_reason == "stop" ) choice_data = ChatCompletionResponseChoice( index=output.index, message=message, logprobs=logprobs, finish_reason="tool_calls" if is_finish_reason_tool_calls else output.finish_reason if output.finish_reason else "stop", stop_reason=output.stop_reason, token_ids=( as_list(output.token_ids) if request.return_token_ids else None ), ) choice_data = maybe_filter_parallel_tool_calls(choice_data, request) choices.append(choice_data) if request.echo: last_msg_content: str | list[dict[str, str]] = "" if ( conversation and "content" in conversation[-1] and conversation[-1].get("role") == role ): last_msg_content = conversation[-1]["content"] or "" if isinstance(last_msg_content, list): last_msg_content = "\n".join(msg["text"] for msg in last_msg_content) for choice in choices: full_message = last_msg_content + (choice.message.content or "") choice.message.content = full_message assert final_res.prompt_token_ids is not None num_prompt_tokens = len(final_res.prompt_token_ids) if final_res.encoder_prompt_token_ids is not None: num_prompt_tokens += len(final_res.encoder_prompt_token_ids) num_generated_tokens = sum( len(output.token_ids) for output in final_res.outputs ) usage = UsageInfo( prompt_tokens=num_prompt_tokens, completion_tokens=num_generated_tokens, total_tokens=num_prompt_tokens + num_generated_tokens, ) if self.enable_prompt_tokens_details and final_res.num_cached_tokens: usage.prompt_tokens_details = PromptTokenUsageInfo( cached_tokens=final_res.num_cached_tokens ) request_metadata.final_usage_info = usage response = ChatCompletionResponse( id=request_id, created=created_time, model=model_name, choices=choices, usage=usage, prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs), prompt_token_ids=( final_res.prompt_token_ids if request.return_token_ids else None ), kv_transfer_params=final_res.kv_transfer_params, ) # Log complete response if output logging is enabled if self.enable_log_outputs and self.request_logger: for choice in choices: output_text = "" if choice.message.content: output_text = choice.message.content elif choice.message.tool_calls: # For tool calls, log the function name and arguments tool_call_descriptions = [] for tc in choice.message.tool_calls: # type: ignore function_call: FunctionCall = tc.function # type: ignore tool_call_descriptions.append( f"{function_call.name}({function_call.arguments})" ) tool_calls_str = ", ".join(tool_call_descriptions) output_text = f"[tool_calls: {tool_calls_str}]" if output_text: # Get the corresponding output token IDs output_token_ids = None if choice.index < len(final_res.outputs): output_token_ids = final_res.outputs[choice.index].token_ids self.request_logger.log_outputs( request_id=request_id, outputs=output_text, output_token_ids=output_token_ids, finish_reason=choice.finish_reason, is_streaming=False, delta=False, ) return response def _get_top_logprobs( self, logprobs: dict[int, Logprob], top_logprobs: int | None, tokenizer: TokenizerLike | None, should_return_as_token_id: bool, ) -> list[ChatCompletionLogProb]: return [ ChatCompletionLogProb( token=( token := self._get_decoded_token( p[1], p[0], tokenizer, return_as_token_id=should_return_as_token_id, ) ), logprob=max(p[1].logprob, -9999.0), bytes=list(token.encode("utf-8", errors="replace")), ) for i, p in enumerate(logprobs.items()) if (top_logprobs and i < top_logprobs or top_logprobs == -1) ] def _create_chat_logprobs( self, token_ids: GenericSequence[int], top_logprobs: GenericSequence[dict[int, Logprob] | None], tokenizer: TokenizerLike | None, num_output_top_logprobs: int | None = None, return_as_token_id: bool | None = None, ) -> ChatCompletionLogProbs: """Create OpenAI-style logprobs.""" logprobs_content: list[ChatCompletionLogProbsContent] = [] should_return_as_token_id = ( return_as_token_id if return_as_token_id is not None else self.return_tokens_as_token_ids ) for i, token_id in enumerate(token_ids): step_top_logprobs = top_logprobs[i] if step_top_logprobs is None or step_top_logprobs.get(token_id) is None: if should_return_as_token_id: token = f"token_id:{token_id}" else: if tokenizer is None: raise ValueError( "Unable to get tokenizer because `skip_tokenizer_init=True`" ) token = tokenizer.decode(token_id) logprobs_content.append( ChatCompletionLogProbsContent( token=token, bytes=list(token.encode("utf-8", errors="replace")), ) ) else: step_token = step_top_logprobs[token_id] step_decoded = step_token.decoded_token logprobs_content.append( ChatCompletionLogProbsContent( token=self._get_decoded_token( step_token, token_id, tokenizer, should_return_as_token_id, ), logprob=max(step_token.logprob, -9999.0), bytes=( None if step_decoded is None else list(step_decoded.encode("utf-8", errors="replace")) ), top_logprobs=self._get_top_logprobs( step_top_logprobs, num_output_top_logprobs, tokenizer, should_return_as_token_id, ), ) ) return ChatCompletionLogProbs(content=logprobs_content) def _should_stream_with_auto_tool_parsing(self, request: ChatCompletionRequest): """ Utility function to check if streamed tokens should go through the tool call parser that was configured. We only want to do this IF user-provided tools are set, a tool parser is configured, "auto" tool choice is enabled, and the request's tool choice field indicates that "auto" tool choice should be used. """ return ( request.tools and self.tool_parser and self.enable_auto_tools and request.tool_choice in ["auto", None] ) def _should_check_for_unstreamed_tool_arg_tokens( self, delta_message: DeltaMessage | None, output: CompletionOutput, ) -> bool: """ Check to see if we should check for unstreamed tool arguments tokens. This is only applicable when auto tool parsing is enabled, the delta is a tool call with arguments. """ return bool( # if there is a delta message that includes tool calls which # include a function that has arguments output.finish_reason is not None and self.enable_auto_tools and self.tool_parser and delta_message and delta_message.tool_calls and delta_message.tool_calls[0] and delta_message.tool_calls[0].function and delta_message.tool_calls[0].function.arguments is not None ) @staticmethod def _create_remaining_args_delta( delta_message: DeltaMessage, remaining_call: str, index: int, ) -> DeltaMessage: """ Create a delta message for remaining tool arguments, preserving id/type/name from the original delta. """ original_tc = next( (tc for tc in delta_message.tool_calls if tc.index == index), None, ) original_fn = original_tc.function if original_tc else None return DeltaMessage( tool_calls=[ DeltaToolCall( index=index, id=original_tc.id if original_tc else None, type=original_tc.type if original_tc else None, function=DeltaFunctionCall( name=original_fn.name if original_fn else None, arguments=remaining_call, ), ) ] ) def _make_request_with_harmony( self, request: ChatCompletionRequest, should_include_tools: bool = True, ): messages: list[OpenAIMessage] = [] # because of issues with pydantic we need to potentially # re-serialize the tool_calls field of the request # for more info: see comment in `maybe_serialize_tool_calls` _mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type] # Add system message. # NOTE: In Chat Completion API, browsing is enabled by default # if the model supports it. TODO: Support browsing. assert not self.supports_browsing assert not self.supports_code_interpreter if (reasoning_effort := request.reasoning_effort) == "none": raise ValueError(f"Harmony does not support {reasoning_effort=}") sys_msg = get_system_message( reasoning_effort=reasoning_effort, browser_description=None, python_description=None, with_custom_tools=should_include_tools, ) messages.append(sys_msg) # Add developer message. if request.tools: dev_msg = get_developer_message( tools=request.tools if should_include_tools else None # type: ignore[arg-type] ) messages.append(dev_msg) # Add user message. messages.extend(parse_chat_inputs_to_harmony_messages(request.messages)) # Render prompt token ids. prompt_token_ids = render_for_completion(messages) engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids) # Add cache_salt if provided in the request if request.cache_salt is not None: engine_prompt["cache_salt"] = request.cache_salt return messages, [engine_prompt]