Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 38d80967 · 38d80967
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -201,3 +201,32 @@ table: "table_1" | "table_2"
 condition: column "=" number
 number: "1" | "2"
 """)
+@pytest.fixture(scope="session")
+def zephyr_lora_files():
+    """Download zephyr LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
+@pytest.fixture(scope="session")
+def zephyr_lora_added_tokens_files(zephyr_lora_files):
+    """Create zephyr LoRA files with added tokens once per test session."""
+    import shutil
+    from tempfile import TemporaryDirectory
+    from transformers import AutoTokenizer
+    tmp_dir = TemporaryDirectory()
+    tmp_model_dir = f"{tmp_dir.name}/zephyr"
+    shutil.copytree(zephyr_lora_files, tmp_model_dir)
+    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+    # Copy tokenizer to adapter and add some unique tokens
+    # 32000, 32001, 32002
+    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
+                                 special_tokens=True)
+    assert added == 3
+    tokenizer.save_pretrained(tmp_model_dir)
+    yield tmp_model_dir
+    tmp_dir.cleanup()
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -7,7 +7,7 @@ import pytest
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
-from ..openai.test_vision import TEST_IMAGE_URLS
+from ..openai.test_vision import TEST_IMAGE_ASSETS
 @pytest.fixture(scope="function")
@@ -95,7 +95,8 @@ def vision_llm():
 @pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+                         [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
+                         indirect=True)
 def test_chat_multi_image(vision_llm, image_urls: list[str]):
    messages = [{
        "role":

--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
            )
            # Need to re-import huggingface_hub
-            # and friends to setup offline mode
+            # and friends to set up offline mode
            _re_import_modules()
            # Cached model files should be used in offline mode
            for model_config in MODEL_CONFIGS:
@@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
                disable_connect,
            )
            # Need to re-import huggingface_hub
-            # and friends to setup offline mode
+            # and friends to set up offline mode
            _re_import_modules()
            engine_args = EngineArgs(model="facebook/opt-125m")
            LLM(**dataclasses.asdict(engine_args))

--- a/tests/entrypoints/openai/conftest.py
+++ b/tests/entrypoints/openai/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+from vllm.assets.audio import AudioAsset
+@pytest.fixture
+def mary_had_lamb():
+    path = AudioAsset('mary_had_lamb').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+@pytest.fixture
+def winning_call():
+    path = AudioAsset('winning_call').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+@pytest.fixture
+def foscolo():
+    # Test translation it->en
+    path = AudioAsset('azacinto_foscolo').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -32,7 +32,7 @@ def to_bytes(y, sr):
 async def transcribe_audio(client, tokenizer, y, sr):
    # Send loaded audio directly instead of loading from disk,
-    # dont account for that time though
+    # don't account for that time though
    with to_bytes(y, sr) as f:
        start_time = time.perf_counter()
        transcription = await client.audio.transcriptions.create(

--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -12,11 +12,9 @@ import pytest_asyncio
 import regex as re
 import requests
 import torch
-from openai import BadRequestError, OpenAI
+from openai import BadRequestError
 from ...utils import RemoteOpenAIServer
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -970,59 +968,6 @@ async def test_long_seed(client: openai.AsyncOpenAI):
                or "less_than_equal" in exc_info.value.message)
-@pytest.mark.asyncio
-async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer):
-    url = f"http://localhost:{server.port}/v1/chat/completions"
-    headers = {
-        "Content-Type": "application/json",
-    }
-    data = {
-        # model_name is avoided here.
-        "messages": [{
-            "role": "system",
-            "content": "You are a helpful assistant."
-        }, {
-            "role": "user",
-            "content": "what is 1+1?"
-        }],
-        "max_tokens":
-        5
-    }
-    response = requests.post(url, headers=headers, json=data)
-    response_data = response.json()
-    print(response_data)
-    assert response_data.get("model") == MODEL_NAME
-    choice = response_data.get("choices")[0]
-    message = choice.get("message")
-    assert message is not None
-    content = message.get("content")
-    assert content is not None
-    assert len(content) > 0
-@pytest.mark.asyncio
-async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer):
-    openai_api_key = "EMPTY"
-    openai_api_base = f"http://localhost:{server.port}/v1"
-    client = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-    messages = [
-        {
-            "role": "user",
-            "content": "Hello, vLLM!"
-        },
-    ]
-    response = client.chat.completions.create(
-        model="",  # empty string
-        messages=messages,
-    )
-    assert response.model == MODEL_NAME
 @pytest.mark.asyncio
 async def test_invocations(server: RemoteOpenAIServer,
                           client: openai.AsyncOpenAI):

--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -104,7 +104,9 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
        trust_remote_code=model_info.trust_remote_code,
        revision=model_info.revision,
        hf_overrides=model_info.hf_overrides,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
    # Initialize the tokenizer
    tokenizer = get_tokenizer(

--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -3,8 +3,6 @@
 # imports for guided decoding tests
 import json
 import os
-import shutil
-from tempfile import TemporaryDirectory
 from typing import Optional
 import jsonschema
@@ -14,9 +12,7 @@ import pytest_asyncio
 import regex as re
 import requests
 # downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError
-from transformers import AutoTokenizer
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically these adapters use a different base model,
 # but we're not testing generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
 GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-@pytest.fixture(scope="module")
-def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    tmp_dir = TemporaryDirectory()
-    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    # Copy tokenizer to adapter and add some unique tokens
-    # 32000, 32001, 32002
-    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-                                 special_tokens=True)
-    assert added == 3
-    tokenizer.save_pretrained(tmp_model_dir)
-    yield tmp_model_dir
-    tmp_dir.cleanup()
 @pytest.fixture(scope="module")
 def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
    return [

--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -3,48 +3,23 @@
 import base64
 import io
-import shutil
-from tempfile import TemporaryDirectory
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 import torch
 # downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError
-from transformers import AutoConfig, AutoTokenizer
+from transformers import AutoConfig
 from ...utils import RemoteOpenAIServer
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
 CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-@pytest.fixture(scope="module")
-def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    tmp_dir = TemporaryDirectory()
-    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    # Copy tokenizer to adapter and add some unique tokens
-    # 32000, 32001, 32002
-    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-                                 special_tokens=True)
-    assert added == 3
-    tokenizer.save_pretrained(tmp_model_dir)
-    yield tmp_model_dir
-    tmp_dir.cleanup()
 @pytest.fixture(scope="module")
 def default_server_args(
    zephyr_lora_files,

--- a/tests/entrypoints/openai/test_encoder_decoder.py
+++ b/tests/entrypoints/openai/test_encoder_decoder.py
@@ -30,6 +30,7 @@ async def client(server):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(reason="bart is not yet supported in V1")
 async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",

--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -9,8 +9,6 @@ from contextlib import suppress
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from ...utils import RemoteOpenAIServer
@@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
 BADREQUEST_CASES = [
    (
@@ -48,11 +45,6 @@ BADREQUEST_CASES = [
 ]
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
 @pytest.fixture(scope="module")
 def monkeypatch_module():
    from _pytest.monkeypatch import MonkeyPatch

--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
    "vllm:request_params_max_tokens_sum",
    "vllm:request_params_max_tokens_bucket",
    "vllm:request_params_max_tokens_count",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:inter_token_latency_seconds_sum",
+    "vllm:inter_token_latency_seconds_bucket",
+    "vllm:inter_token_latency_seconds_count",
    "vllm:e2e_request_latency_seconds_sum",
    "vllm:e2e_request_latency_seconds_bucket",
    "vllm:e2e_request_latency_seconds_count",
@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
    "vllm:request_decode_time_seconds_count",
 ]
-HIDDEN_DEPRECATED_METRICS: list[str] = []
+HIDDEN_DEPRECATED_METRICS: list[str] = [
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
+]
 @pytest.mark.asyncio
@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
    assert response.status_code == HTTPStatus.OK
    for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        if (not server.show_hidden_metrics
+        if (metric in HIDDEN_DEPRECATED_METRICS
-                and metric not in HIDDEN_DEPRECATED_METRICS):
+                and not server.show_hidden_metrics):
-            assert metric in response.text
+            continue
+        assert metric in response.text
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -4,8 +4,6 @@
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from ...utils import RemoteOpenAIServer
@@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -10,7 +10,7 @@ import pytest
 import regex as re
 import torch
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.renderer import BaseRenderer
 from ...utils import RemoteOpenAIServer
@@ -27,12 +27,16 @@ async def test_empty_prompt():
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
-        with pytest.raises(openai.BadRequestError,
+        with pytest.raises(
-                           match="decoder prompt cannot be empty"):
+                openai.BadRequestError,
+                match=
+                "Either prompt or prompt_embeds must be provided and non-empty."
+        ):
            await client.completions.create(model=model_name,
                                            prompt="",
                                            max_tokens=5,
-                                            temperature=0.0)
+                                            temperature=0.0,
+                                            extra_body={"prompt_embeds": []})
 @pytest.mark.asyncio
@@ -83,7 +87,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
    buffer.seek(0)
    encoded_tensor = pybase64.b64encode(buffer.getvalue())
-    loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor)
+    loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
    assert len(loaded_prompt_embeds) == 1
    loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
    assert loaded_tensor.device.type == "cpu"

--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -275,7 +275,8 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_streaming(client: OpenAI, model_name: str):
+@pytest.mark.parametrize("background", [True, False])
+async def test_streaming(client: OpenAI, model_name: str, background: bool):
    # TODO: Add back when web search and code interpreter are available in CI
    prompts = [
        "tell me a story about a cat in 20 words",
@@ -300,11 +301,16 @@ async def test_streaming(client: OpenAI, model_name: str):
                # },
            ],
            stream=True,
+            background=background,
        )
        events = []
        current_event_mode = None
+        resp_id = None
        async for event in response:
+            if event.type == "response.created":
+                resp_id = event.response.id
            if current_event_mode != event.type:
                current_event_mode = event.type
                print(f"\n[{event.type}] ", end="", flush=True)
@@ -322,6 +328,17 @@ async def test_streaming(client: OpenAI, model_name: str):
        assert len(events) > 0
+        if background:
+            starting_after = 5
+            async with await client.responses.retrieve(
+                    response_id=resp_id,
+                    stream=True,
+                    starting_after=starting_after) as stream:
+                counter = starting_after
+                async for event in stream:
+                    counter += 1
+                    assert event == events[counter]
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])

--- a/tests/entrypoints/openai/test_return_token_ids.py
+++ b/tests/entrypoints/openai/test_return_token_ids.py
@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
                logprobs_token_ids.append(token_id)
        # When echo=True, the logprobs include both prompt and response tokens
-        # The token_ids field should match the the suffix of response portion
+        # The token_ids field should match the suffix of response portion
        # The prompt_token_ids should match the prompt portion
        assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
        response_token_ids_length = len(completion.choices[0].token_ids)

--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 from ...utils import RemoteOpenAIServer
 from .test_completion import default_server_args  # noqa: F401
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401
 from .test_completion import MODEL_NAME

--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
 import asyncio
 from contextlib import suppress
 from dataclasses import dataclass, field
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 from unittest.mock import MagicMock
 import pytest
+import pytest_asyncio
 from vllm.config import MultiModalConfig
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
@@ -17,9 +20,205 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                    OpenAIServingModels)
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from ...utils import RemoteOpenAIServer
+if TYPE_CHECKING:
+    from openai import OpenAI
+GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
+@pytest.fixture(scope="module")
+def monkeypatch_module():
+    from _pytest.monkeypatch import MonkeyPatch
+    mpatch = MonkeyPatch()
+    yield mpatch
+    mpatch.undo()
+@pytest.fixture(scope="module",
+                params=[True, False],
+                ids=["with_tool_parser", "without_tool_parser"])
+def with_tool_parser(request) -> bool:
+    return request.param
+@pytest.fixture(scope="module")
+def default_server_args(with_tool_parser: bool):
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--enforce-eager",
+        "--max-model-len",
+        "4096",
+        "--reasoning-parser",
+        "openai_gptoss",
+        "--gpu-memory-utilization",
+        "0.8",
+    ]
+    if with_tool_parser:
+        args.extend([
+            "--tool-call-parser",
+            "openai",
+            "--enable-auto-tool-choice",
+        ])
+    return args
+@pytest.fixture(scope="module")
+def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
+                  default_server_args: list[str]):
+    with monkeypatch_module.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
+        with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
+                                default_server_args) as remote_server:
+            yield remote_server
+@pytest_asyncio.fixture
+async def gptoss_client(gptoss_server):
+    async with gptoss_server.get_async_client() as async_client:
+        yield async_client
+@pytest.mark.asyncio
+async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI,
+                                                with_tool_parser: bool):
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string"
+                    },
+                    "state": {
+                        "type": "string"
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["city", "state", "unit"],
+            },
+        },
+    }]
+    messages = [
+        {
+            "role": "user",
+            "content": "What is the weather in Dallas, TX?"
+        },
+    ]
+    stream = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages,
+        tools=tools if with_tool_parser else None,
+        stream=True)
+    name = None
+    args_buf = ""
+    content_buf = ""
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.tool_calls:
+            tc = delta.tool_calls[0]
+            if tc.function and tc.function.name:
+                name = tc.function.name
+            if tc.function and tc.function.arguments:
+                args_buf += tc.function.arguments
+        if getattr(delta, "content", None):
+            content_buf += delta.content
+    if with_tool_parser:
+        assert name is not None
+        assert len(args_buf) > 0
+    else:
+        assert name is None
+        assert len(args_buf) == 0
+        assert len(content_buf) > 0
+@pytest.mark.asyncio
+async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
+                                       with_tool_parser: bool):
+    if not with_tool_parser:
+        pytest.skip("skip non-tool for multi-turn tests")
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string"
+                    },
+                    "state": {
+                        "type": "string"
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["city", "state", "unit"],
+            },
+        },
+    }]
+    messages = [
+        {
+            "role": "system",
+            "content": "you are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": "What is the weather in Dallas, TX with celsius?"
+        },
+    ]
+    first = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages,
+        tools=tools,
+        temperature=0.0,
+    )
+    first_msg = first.choices[0].message
+    assert first_msg.tool_calls is not None and len(first_msg.tool_calls) > 0
+    tc = first_msg.tool_calls[0]
+    assert tc.function is not None and tc.function.name == "get_current_weather"
+    args1 = tc.function.arguments
+    assert args1 is not None and len(args1) > 0
+    messages.append({"role": "assistant", "content": args1})
+    messages.append({
+        "role": "user",
+        "content": "Now convert to celsius and return JSON only"
+    })
+    second = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages,
+        tools=tools,
+        temperature=0.0,
+    )
+    second_msg = second.choices[0].message
+    assert (second_msg.content is not None and len(second_msg.content) > 0) or \
+        (second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0)
 MODEL_NAME = "openai-community/gpt2"
+MODEL_NAME_SHORT = "gpt2"
 CHAT_TEMPLATE = "Dummy chat template for testing {}"
-BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
+BASE_MODEL_PATHS = [
+    BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
+    BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT)
+]
 @dataclass
@@ -75,6 +274,42 @@ def test_async_serving_chat_init():
    assert serving_completion.chat_template == CHAT_TEMPLATE
+@pytest.mark.asyncio
+async def test_serving_chat_returns_correct_model_name():
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=MockModelConfig())
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     MockModelConfig(),
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+    messages = [{"role": "user", "content": "what is 1+1?"}]
+    async def return_model_name(*args):
+        return args[3]
+    serving_chat.chat_completion_full_generator = return_model_name
+    # Test that full name is returned when short name is requested
+    req = ChatCompletionRequest(model=MODEL_NAME_SHORT, messages=messages)
+    assert await serving_chat.create_chat_completion(req) == MODEL_NAME
+    # Test that full name is returned when empty string is specified
+    req = ChatCompletionRequest(model="", messages=messages)
+    assert await serving_chat.create_chat_completion(req) == MODEL_NAME
+    # Test that full name is returned when no model is specified
+    req = ChatCompletionRequest(messages=messages)
+    assert await serving_chat.create_chat_completion(req) == MODEL_NAME
 @pytest.mark.asyncio
 async def test_serving_chat_should_set_correct_max_tokens():
    mock_engine = MagicMock(spec=MQLLMEngineClient)
@@ -313,7 +548,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
        }],
    )
-    # By default cache_salt in the engine prompt is not set
+    # By default, cache_salt in the engine prompt is not set
    with suppress(Exception):
        await serving_chat.create_chat_completion(req)
    assert "cache_salt" not in mock_engine.generate.call_args.args[0]

--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@@ -11,7 +11,7 @@ import torch
 from ...utils import RemoteOpenAIServer
-MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
+MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 DTYPE = "float16"
@@ -35,7 +35,9 @@ def server():
        "--trust-remote-code",
        "--skip-tokenizer-init",
        "--max-num-seqs",
-        "32"
+        "32",
+        "--model-impl",
+        "terratorch"
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:

--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -8,8 +8,6 @@ import requests
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from ...utils import RemoteOpenAIServer
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"