# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the OpenAI-compatible chat serving layer.

The first half starts a real ``RemoteOpenAIServer`` running the GPT-OSS model
and exercises tool calling end to end.  The second half builds an
``OpenAIServingChat`` on top of a mocked engine and checks how request
parameters are turned into the arguments passed to ``engine.generate``.
"""

import asyncio
import os
from contextlib import suppress
from dataclasses import dataclass, field
from typing import Any
from unittest.mock import AsyncMock, MagicMock

import pytest
import pytest_asyncio
from openai import OpenAI

from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.tokenizers import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM

from ...utils import RemoteOpenAIServer, models_path_prefix

GPT_OSS_MODEL_NAME = os.path.join(models_path_prefix, "openai/gpt-oss-20b")


@pytest.fixture(scope="module")
def monkeypatch_module():
    """Yield a module-scoped ``MonkeyPatch`` and undo its changes afterwards."""
    from _pytest.monkeypatch import MonkeyPatch

    mpatch = MonkeyPatch()
    yield mpatch
    mpatch.undo()


@pytest.fixture(
    scope="module",
    params=[True, False],
    ids=["with_tool_parser", "without_tool_parser"],
)
def with_tool_parser(request) -> bool:
    """Parametrize the module over servers with and without a tool parser."""
    return request.param


@pytest.fixture(scope="module")
def default_server_args(with_tool_parser: bool):
    """Return the CLI arguments used to launch the GPT-OSS test server.

    The tool-call parser flags are appended only when ``with_tool_parser``
    is true.
    """
    args = [
        # Run in eager mode and keep the context window small so the server
        # starts quickly in CI (no dtype flag is passed here).
        "--enforce-eager",
        "--max-model-len",
        "4096",
        "--reasoning-parser",
        "openai_gptoss",
        "--gpu-memory-utilization",
        "0.8",
    ]
    if with_tool_parser:
        args.extend(
            [
                "--tool-call-parser",
                "openai",
                "--enable-auto-tool-choice",
            ]
        )
    return args


@pytest.fixture(scope="module")
def gptoss_server(
    monkeypatch_module: pytest.MonkeyPatch, default_server_args: list[str]
):
    """Start a ``RemoteOpenAIServer`` for GPT-OSS with the Triton attention backend.

    The environment variable is set inside a monkeypatch context so it is
    restored once the server has been torn down.
    """
    with monkeypatch_module.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
        with RemoteOpenAIServer(
            GPT_OSS_MODEL_NAME, default_server_args
        ) as remote_server:
            yield remote_server


@pytest_asyncio.fixture
async def gptoss_client(gptoss_server):
    """Yield an async client bound to the running GPT-OSS server."""
    async with gptoss_server.get_async_client() as async_client:
        yield async_client


# NOTE(review): the tests below annotate ``gptoss_client`` as ``OpenAI`` but
# await its methods, so the object is presumably an ``AsyncOpenAI`` instance;
# confirm against ``RemoteOpenAIServer.get_async_client``.
@pytest.mark.asyncio
async def test_gpt_oss_chat_tool_call_streaming(
    gptoss_client: OpenAI, with_tool_parser: bool
):
    """Stream a weather question and check tool-call deltas vs. plain content.

    With a tool parser the stream must carry a function name and arguments;
    without one it must carry only text content.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {"type": "string"},
                        "state": {"type": "string"},
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                        },
                    },
                    "required": ["city", "state", "unit"],
                },
            },
        }
    ]

    messages = [
        {"role": "user", "content": "What is the weather in Dallas, TX?"},
    ]

    stream = await gptoss_client.chat.completions.create(
        model=GPT_OSS_MODEL_NAME,
        messages=messages,
        tools=tools if with_tool_parser else None,
        stream=True,
    )

    # Accumulate the streamed pieces so they can be asserted on as a whole.
    name = None
    args_buf = ""
    content_buf = ""
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            tc = delta.tool_calls[0]
            if tc.function and tc.function.name:
                name = tc.function.name
            if tc.function and tc.function.arguments:
                args_buf += tc.function.arguments
        if getattr(delta, "content", None):
            content_buf += delta.content

    if with_tool_parser:
        assert name is not None
        assert len(args_buf) > 0
    else:
        assert name is None
        assert len(args_buf) == 0
        assert len(content_buf) > 0


@pytest.mark.asyncio
async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI, with_tool_parser: bool):
    """Run a two-turn conversation where the first turn must be a tool call.

    The second turn only needs to produce either content or another tool call.
    """
    if not with_tool_parser:
        pytest.skip("skip non-tool for multi-turn tests")
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {"type": "string"},
                        "state": {"type": "string"},
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                        },
                    },
                    "required": ["city", "state", "unit"],
                },
            },
        }
    ]

    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "What is the weather in Dallas, TX with celsius?"},
    ]

    first = await gptoss_client.chat.completions.create(
        model=GPT_OSS_MODEL_NAME,
        messages=messages,
        tools=tools,
        temperature=0.0,
    )
    first_msg = first.choices[0].message
    assert first_msg.tool_calls is not None and len(first_msg.tool_calls) > 0
    tc = first_msg.tool_calls[0]
    assert tc.function is not None and tc.function.name == "get_current_weather"
    args1 = tc.function.arguments
    assert args1 is not None and len(args1) > 0
    assert not first_msg.content

    # Feed the tool-call arguments back as an assistant turn, then follow up.
    messages.append({"role": "assistant", "content": args1})
    messages.append(
        {"role": "user", "content": "Now convert to celsius and return JSON only"}
    )

    second = await gptoss_client.chat.completions.create(
        model=GPT_OSS_MODEL_NAME,
        messages=messages,
        tools=tools,
        temperature=0.0,
    )
    second_msg = second.choices[0].message
    assert (second_msg.content is not None and len(second_msg.content) > 0) or (
        second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0
    )


@pytest.mark.asyncio
async def test_gpt_oss_tool_message_array_content(
    gptoss_client: OpenAI, with_tool_parser: bool
):
    """Test that tool messages support both string and array content formats."""
    if not with_tool_parser:
        pytest.skip("skip non-tool for array content tests")

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {"type": "string"},
                        "state": {"type": "string"},
                    },
                    "required": ["city", "state"],
                },
            },
        }
    ]

    # Test 1: Tool message with string content
    messages_string = [
        {"role": "user", "content": "What's the weather in Paris?"},
        {
            "role": "assistant",
            "tool_calls": [
                {
                    "id": "call_123",
                    "type": "function",
                    "function": {
                        "name": "get_weather",
                        "arguments": '{"city": "Paris", "state": "TX"}',
                    },
                }
            ],
        },
        {"role": "tool", "content": "The weather in Paris, TX is sunny, 22°C"},
    ]

    response_string = await gptoss_client.chat.completions.create(
        model=GPT_OSS_MODEL_NAME,
        messages=messages_string,
        tools=tools,
        temperature=0.0,
    )

    assert response_string is not None
    assert response_string.choices[0].message is not None

    # Test 2: Tool message with array content
    messages_array = [
        {"role": "user", "content": "What's the weather in Dallas?"},
        {
            "role": "assistant",
            "tool_calls": [
                {
                    "id": "call_456",
                    "type": "function",
                    "function": {
                        "name": "get_weather",
                        "arguments": '{"city": "Dallas", "state": "TX"}',
                    },
                }
            ],
        },
        {
            "role": "tool",
            "content": [
                {"type": "text", "text": "f2e897a7-2705-4337-8193-2a8f57b81618"}
            ],
        },
    ]

    response_array = await gptoss_client.chat.completions.create(
        model=GPT_OSS_MODEL_NAME,
        messages=messages_array,
        tools=tools,
        temperature=0.0,
    )

    assert response_array is not None
    assert response_array.choices[0].message is not None

    # Test 3: Tool message with multiple array content items
    messages_multi_array = [
        {"role": "user", "content": "Search for information"},
        {
            "role": "assistant",
            "tool_calls": [
                {
                    "id": "call_789",
                    "type": "function",
                    "function": {
                        "name": "get_weather",
                        "arguments": '{"city": "Austin", "state": "TX"}',
                    },
                }
            ],
        },
        {
            "role": "tool",
            "content": [
                {"type": "text", "text": "Weather data: "},
                {"type": "text", "text": "Austin, TX - Partly cloudy, 25°C"},
                {"type": "text", "text": " with 60% humidity"},
            ],
        },
    ]

    response_multi_array = await gptoss_client.chat.completions.create(
        model=GPT_OSS_MODEL_NAME,
        messages=messages_multi_array,
        tools=tools,
        temperature=0.0,
    )

    assert response_multi_array is not None
    assert response_multi_array.choices[0].message is not None


MODEL_NAME = os.path.join(models_path_prefix, "openai-community/gpt2")
MODEL_NAME_SHORT = os.path.join(models_path_prefix, "gpt2")
CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS = [
    BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
    BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
]


@dataclass
class MockHFConfig:
    """Minimal stand-in for a Hugging Face config; only ``model_type`` is set."""

    model_type: str = "any"


@dataclass
class MockModelConfig:
    """Stand-in model config exposing the attributes the mocked tests set up.

    Unannotated names are plain class attributes shared by all instances;
    annotated names are dataclass fields and therefore per-instance.
    """

    task = "generate"
    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    # Per-instance on purpose: a test overrides ``model_type``, and a shared
    # class-level object would leak that override into every other test.
    hf_config: MockHFConfig = field(default_factory=MockHFConfig)
    logits_processors: list[str] | None = None
    logits_processor_pattern = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False

    def get_diff_sampling_param(self):
        """Return ``diff_sampling_param``, or an empty dict when it is unset."""
        return self.diff_sampling_param or {}


def _build_mock_engine(model_config: MockModelConfig | None = None) -> MagicMock:
    """Create a ``MagicMock`` engine (spec ``AsyncLLM``) wired for these tests.

    Args:
        model_config: Config to attach to the engine.  A fresh
            ``MockModelConfig`` is created when omitted.

    Returns:
        The mock engine with tokenizer, ``errored`` flag, model config and
        input/IO processors populated.
    """
    engine = MagicMock(spec=AsyncLLM)
    engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    engine.errored = False
    engine.model_config = MockModelConfig() if model_config is None else model_config
    engine.input_processor = MagicMock()
    engine.io_processor = MagicMock()
    return engine


def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
    """Build an ``OpenAIServingChat`` whose ``_process_inputs`` is stubbed out.

    The stub is an ``AsyncMock`` so tests can inspect the engine prompt it was
    awaited with; it returns a dict copy of that prompt and an empty dict.
    """
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_chat = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
        request_logger=None,
    )

    async def _fake_process_inputs(
        request_id,
        engine_prompt,
        sampling_params,
        *,
        lora_request,
        trace_headers,
        priority,
    ):
        return dict(engine_prompt), {}

    serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
    return serving_chat


@dataclass
class MockEngine:
    """Lightweight engine double carrying only config and processor mocks."""

    model_config: MockModelConfig = field(default_factory=MockModelConfig)
    input_processor: MagicMock = field(default_factory=MagicMock)
    io_processor: MagicMock = field(default_factory=MagicMock)


async def _async_serving_chat_init():
    """Construct an ``OpenAIServingChat`` from a ``MockEngine`` inside a coroutine."""
    engine = MockEngine()

    models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
    serving_completion = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
        request_logger=None,
    )
    return serving_completion


def test_async_serving_chat_init():
    """Constructing the serving object keeps the chat template it was given."""
    serving_completion = asyncio.run(_async_serving_chat_init())
    assert serving_completion.chat_template == CHAT_TEMPLATE


@pytest.mark.asyncio
async def test_serving_chat_returns_correct_model_name():
    """The full model name is reported for short, empty and missing names."""
    mock_engine = _build_mock_engine()

    serving_chat = _build_serving_chat(mock_engine)
    messages = [{"role": "user", "content": "what is 1+1?"}]

    # Replace the generator with a stub that echoes back its fourth positional
    # argument, which the assertions below compare against the model name.
    async def return_model_name(*args):
        return args[3]

    serving_chat.chat_completion_full_generator = return_model_name

    # Test that full name is returned when short name is requested
    req = ChatCompletionRequest(model=MODEL_NAME_SHORT, messages=messages)
    assert await serving_chat.create_chat_completion(req) == MODEL_NAME

    # Test that full name is returned when empty string is specified
    req = ChatCompletionRequest(model="", messages=messages)
    assert await serving_chat.create_chat_completion(req) == MODEL_NAME

    # Test that full name is returned when no model is specified
    req = ChatCompletionRequest(messages=messages)
    assert await serving_chat.create_chat_completion(req) == MODEL_NAME


@pytest.mark.asyncio
async def test_serving_chat_should_set_correct_max_tokens():
    """Check ``max_tokens`` passed to ``generate`` for request/server limits.

    Each call is wrapped in ``suppress(Exception)``: only the arguments
    recorded on the mocked ``generate`` are inspected, not the call's outcome.
    """
    mock_engine = _build_mock_engine()

    serving_chat = _build_serving_chat(mock_engine)

    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "what is 1+1?"}],
    )

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    # Presumably max_model_len (100) minus the prompt token count.
    assert mock_engine.generate.call_args.args[1].max_tokens == 93

    req.max_tokens = 10
    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].max_tokens == 10

    # Setting server's max_tokens in the generation_config.json
    # lower than context_window - prompt_tokens
    mock_model_config = MockModelConfig()
    mock_model_config.diff_sampling_param = {
        "max_tokens": 10  # Setting server-side max_tokens limit
    }

    # Reinitialize the engine with new settings
    mock_engine = _build_mock_engine(mock_model_config)

    # Initialize the serving chat
    serving_chat = _build_serving_chat(mock_engine)

    # Test Case 1: No max_tokens specified in request
    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "what is 1+1?"}],
    )

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].max_tokens == 10

    # Test Case 2: Request's max_tokens set higher than server accepts
    req.max_tokens = 15

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].max_tokens == 10

    # Test Case 3: Request's max_tokens set lower than server accepts
    req.max_tokens = 5

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].max_tokens == 5

    # Setting server's max_tokens in the generation_config.json
    # higher than context_window - prompt_tokens
    mock_model_config = MockModelConfig()
    mock_model_config.diff_sampling_param = {
        "max_tokens": 200  # Setting server-side max_tokens limit
    }

    # Reinitialize the engine with new settings
    mock_engine = _build_mock_engine(mock_model_config)

    # Initialize the serving chat
    serving_chat = _build_serving_chat(mock_engine)

    # Test case 1: No max_tokens specified, defaults to context_window
    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "what is 1+1?"}],
    )

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].max_tokens == 93

    # Test Case 2: Request's max_tokens set higher than server accepts
    req.max_tokens = 100

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].max_tokens == 93

    # Test Case 3: Request's max_tokens set lower than server accepts
    req.max_tokens = 5

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].max_tokens == 5


@pytest.mark.asyncio
async def test_serving_chat_could_load_correct_generation_config():
    """Server-side sampling defaults apply unless the request overrides them."""
    mock_model_config = MockModelConfig()
    mock_model_config.diff_sampling_param = {
        "temperature": 0.5,
        "repetition_penalty": 1.05,
    }

    mock_engine = _build_mock_engine(mock_model_config)

    # Initialize the serving chat
    serving_chat = _build_serving_chat(mock_engine)

    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "what is 1+1?"}],
    )

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].temperature == 0.5
    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05

    # Test the param when user set it
    req.temperature = 0.1

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].temperature == 0.1
    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05

    # Test When temperature==0.0
    req.temperature = 0.0

    with suppress(Exception):
        await serving_chat.create_chat_completion(req)

    assert mock_engine.generate.call_args.args[1].temperature == 0.0
    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05


@pytest.mark.parametrize("model_type", ["gpt_oss", "any"])
@pytest.mark.asyncio
async def test_serving_chat_did_set_correct_cache_salt(model_type):
    """``cache_salt`` reaches the engine prompt only when the request sets it."""
    mock_model_config = MockModelConfig()
    # Assign a fresh config object instead of mutating the existing one so the
    # parametrized model type can never leak into other tests.
    mock_model_config.hf_config = MockHFConfig(model_type=model_type)

    mock_engine = _build_mock_engine(mock_model_config)

    serving_chat = _build_serving_chat(mock_engine)

    # Test cache_salt
    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "what is 1+1?"}],
    )

    # By default, cache_salt in the engine prompt is not set
    with suppress(Exception):
        await serving_chat.create_chat_completion(req)
    engine_prompt = serving_chat._process_inputs.await_args_list[0].args[1]
    assert "cache_salt" not in engine_prompt

    # Test with certain cache_salt
    req.cache_salt = "test_salt"
    with suppress(Exception):
        await serving_chat.create_chat_completion(req)
    engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
    assert engine_prompt.get("cache_salt") == "test_salt"


@pytest.mark.asyncio
async def test_serving_chat_data_parallel_rank_extraction():
    """Test that data_parallel_rank is properly extracted from header and
    passed to engine."""
    mock_engine = _build_mock_engine()

    # Mock the generate method to return an async generator
    async def mock_generate(*args, **kwargs):
        # Yield a fake RequestOutput
        from vllm.outputs import CompletionOutput, RequestOutput

        yield RequestOutput(
            request_id="test-request",
            prompt="test prompt",
            prompt_token_ids=[1, 2, 3],
            prompt_logprobs=None,
            outputs=[
                CompletionOutput(
                    index=0,
                    text="test response",
                    token_ids=[4, 5, 6],
                    cumulative_logprob=0.0,
                    logprobs=None,
                    finish_reason="stop",
                    stop_reason=None,
                )
            ],
            finished=True,
        )

    mock_engine.generate = AsyncMock(side_effect=mock_generate)

    serving_chat = _build_serving_chat(mock_engine)

    # Test when data_parallel_rank is present in header
    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "what is 1+1?"}],
    )

    # Mock request with X-data-parallel-rank header
    mock_raw_request = MagicMock()
    mock_raw_request.headers = {"X-data-parallel-rank": "2"}
    mock_raw_request.state = MagicMock()

    with suppress(Exception):
        await serving_chat.create_chat_completion(req, mock_raw_request)

    # Verify that data_parallel_rank was passed to engine.generate
    assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs
    assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] == 2

    # Test when data_parallel_rank is not present (defaults to None)
    req_no_dp = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "what is 2+2?"}],
    )

    # Mock request with no header
    mock_raw_request_no_dp = MagicMock()
    mock_raw_request_no_dp.headers = {}
    mock_raw_request_no_dp.state = MagicMock()

    with suppress(Exception):
        await serving_chat.create_chat_completion(req_no_dp, mock_raw_request_no_dp)

    # Verify that data_parallel_rank defaults to None
    assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs
    assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] is None