Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/openai/test_root_path.py
+++ b/tests/entrypoints/openai/test_root_path.py
@@ -2,7 +2,7 @@
 import contextlib
 import os
-from typing import Any, List, NamedTuple
+from typing import Any, NamedTuple
 import openai  # use the official client for correctness check
 import pytest
@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
 # # any model with a chat template should work here
 MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
-DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
 API_KEY = "abc-123"
 ERROR_API_KEY = "abc"
 ROOT_PATH = "llm"
@@ -28,8 +27,6 @@ def server():
        "4080",
        "--root-path",  # use --root-path=/llm for testing
        "/" + ROOT_PATH,
-        "--chat-template",
-        DUMMY_CHAT_TEMPLATE,
    ]
    envs = os.environ.copy()
@@ -40,7 +37,7 @@ def server():
 class TestCase(NamedTuple):
    model_name: str
-    base_url: List[str]
+    base_url: list[str]
    api_key: str
    expected_error: Any

--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -36,10 +36,10 @@ INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/c
 INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
-INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
+INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}
-{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
+{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}
 {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
 INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
@@ -55,7 +55,7 @@ def test_empty_file():
        proc = subprocess.Popen([
            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
            input_file.name, "-o", output_file.name, "--model",
-            os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
+            os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
        ], )
        proc.communicate()
        proc.wait()
@@ -115,7 +115,7 @@ def test_embeddings():
        proc = subprocess.Popen([
            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
            input_file.name, "-o", output_file.name, "--model",
-            os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
+            os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
        ], )
        proc.communicate()
        proc.wait()

--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
 # SPDX-License-Identifier: Apache-2.0
 import os
+import math
+from typing import Any
 import pytest
 import requests
+import torch.nn.functional as F
+from torch import tensor
 from vllm.entrypoints.openai.protocol import ScoreResponse
 from ...utils import RemoteOpenAIServer, models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3")
+MODELS = [
+    {
+        "name": os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"),
-@pytest.fixture(scope="module")
+        "is_cross_encoder": True
-def server():
+    },
-    args = ["--enforce-eager", "--max-model-len", "100"]
+    {
+        "name": "BAAI/bge-base-en-v1.5",
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        "is_cross_encoder": False
+    },
+]
+DTYPE = "half"
+def run_transformers(hf_model, model, text_pairs):
+    if model["is_cross_encoder"]:
+        return hf_model.predict(text_pairs).tolist()
+    else:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        return [
+            F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0)
+            for pair in hf_embeddings
+        ]
+@pytest.fixture(scope="class", params=MODELS)
+def model(request):
+    yield request.param
+@pytest.fixture(scope="class")
+def server(model: dict[str, Any]):
+    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
+    with RemoteOpenAIServer(model["name"], args) as remote_server:
        yield remote_server
-@pytest.mark.asyncio
+@pytest.fixture(scope="class")
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def runner(model: dict[str, Any], hf_runner):
-def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str):
+    kwargs = {
-    text_1 = "What is the capital of France?"
+        "dtype": DTYPE,
-    text_2 = [
+        "is_cross_encoder" if model["is_cross_encoder"]\
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+              else "is_sentence_transformer": True
-    ]
+    }
-    score_response = requests.post(server.url_for("score"),
+    with hf_runner(model["name"], **kwargs) as hf_model:
-                                   json={
+        yield hf_model
-                                       "model": model_name,
-                                       "text_1": text_1,
-                                       "text_2": text_2,
+class TestModel:
-                                   })
-    score_response.raise_for_status()
+    def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer,
-    score = ScoreResponse.model_validate(score_response.json())
+                                    model: dict[str, Any], runner):
+        text_1 = "What is the capital of France?"
-    assert score.id is not None
+        text_2 = [
-    assert score.data is not None
+            "The capital of Brazil is Brasilia.",
-    assert len(score.data) == 2
+            "The capital of France is Paris."
-    assert score.data[0].score <= 0.01
+        ]
-    assert score.data[1].score >= 0.9
+        score_response = requests.post(server.url_for("score"),
+                                       json={
-@pytest.mark.asyncio
+                                           "model": model["name"],
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
+                                           "text_1": text_1,
-def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str):
+                                           "text_2": text_2,
-    text_1 = [
+                                       })
-        "What is the capital of the United States?",
+        score_response.raise_for_status()
-        "What is the capital of France?"
+        score = ScoreResponse.model_validate(score_response.json())
-    ]
-    text_2 = [
+        assert score.id is not None
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+        assert score.data is not None
-    ]
+        assert len(score.data) == 2
-    score_response = requests.post(server.url_for("score"),
+        vllm_outputs = [d.score for d in score.data]
-                                   json={
-                                       "model": model_name,
+        text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]]
-                                       "text_1": text_1,
+        hf_outputs = run_transformers(runner, model, text_pairs)
-                                       "text_2": text_2,
-                                   })
+        for i in range(len(vllm_outputs)):
-    score_response.raise_for_status()
+            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
-    score = ScoreResponse.model_validate(score_response.json())
+    def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
-    assert score.id is not None
+                                     model: dict[str, Any], runner):
-    assert score.data is not None
+        text_1 = [
-    assert len(score.data) == 2
+            "What is the capital of the United States?",
-    assert score.data[0].score <= 0.01
+            "What is the capital of France?"
-    assert score.data[1].score >= 0.9
+        ]
+        text_2 = [
+            "The capital of Brazil is Brasilia.",
-@pytest.mark.asyncio
+            "The capital of France is Paris."
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
+        ]
-def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str):
-    text_1 = "What is the capital of France?"
+        score_response = requests.post(server.url_for("score"),
-    text_2 = "The capital of France is Paris."
+                                       json={
+                                           "model": model["name"],
-    score_response = requests.post(server.url_for("score"),
+                                           "text_1": text_1,
-                                   json={
+                                           "text_2": text_2,
-                                       "model": model_name,
+                                       })
-                                       "text_1": text_1,
+        score_response.raise_for_status()
-                                       "text_2": text_2,
+        score = ScoreResponse.model_validate(score_response.json())
-                                   })
-    score_response.raise_for_status()
+        assert score.id is not None
-    score = ScoreResponse.model_validate(score_response.json())
+        assert score.data is not None
+        assert len(score.data) == 2
-    assert score.id is not None
-    assert score.data is not None
+        vllm_outputs = [d.score for d in score.data]
-    assert len(score.data) == 1
-    assert score.data[0].score >= 0.9
+        text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]]
+        hf_outputs = run_transformers(runner, model, text_pairs)
-@pytest.mark.asyncio
+        for i in range(len(vllm_outputs)):
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
+            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
-def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str):
+    def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
-    text_1 = "What is the capital of France?" * 20
+                                   model: dict[str, Any], runner):
-    text_2 = [
+        text_1 = "What is the capital of France?"
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+        text_2 = "The capital of France is Paris."
-    ]
+        score_response = requests.post(server.url_for("score"),
-    score_response = requests.post(server.url_for("score"),
+                                       json={
-                                   json={
+                                           "model": model["name"],
-                                       "model": model_name,
+                                           "text_1": text_1,
-                                       "text_1": text_1,
+                                           "text_2": text_2,
-                                       "text_2": text_2,
+                                       })
-                                   })
+        score_response.raise_for_status()
-    assert score_response.status_code == 400
+        score = ScoreResponse.model_validate(score_response.json())
-    # Assert just a small fragments of the response
-    assert "Please reduce the length of the input." in \
+        assert score.id is not None
-        score_response.text
+        assert score.data is not None
+        assert len(score.data) == 1
-    # Test truncation
-    score_response = requests.post(server.url_for("score"),
+        vllm_outputs = [d.score for d in score.data]
-                                   json={
-                                       "model": model_name,
+        text_pairs = [[text_1, text_2]]
-                                       "text_1": text_1,
+        hf_outputs = run_transformers(runner, model, text_pairs)
-                                       "text_2": text_2,
-                                       "truncate_prompt_tokens": 101
+        for i in range(len(vllm_outputs)):
-                                   })
+            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
-    assert score_response.status_code == 400
-    assert "Please, select a smaller truncation size." in \
+    def test_score_max_model_len(self, server: RemoteOpenAIServer,
-        score_response.text
+                                 model: dict[str, Any]):
+        text_1 = "What is the capital of France?" * 20
+        text_2 = [
+            "The capital of Brazil is Brasilia.",
+            "The capital of France is Paris."
+        ]
+        score_response = requests.post(server.url_for("score"),
+                                       json={
+                                           "model": model["name"],
+                                           "text_1": text_1,
+                                           "text_2": text_2,
+                                       })
+        assert score_response.status_code == 400
+        # Assert just a small fragment of the response
+        assert "Please reduce the length of the input." in \
+            score_response.text
+        # Test truncation
+        score_response = requests.post(server.url_for("score"),
+                                       json={
+                                           "model": model["name"],
+                                           "text_1": text_1,
+                                           "text_2": text_2,
+                                           "truncate_prompt_tokens": 101
+                                       })
+        assert score_response.status_code == 400
+        assert "Please, select a smaller truncation size." in \
+            score_response.text
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -39,6 +39,7 @@ class MockModelConfig:
    diff_sampling_param: Optional[dict] = None
    allowed_local_media_path: str = ""
    encoder_config = None
+    generation_config: str = "auto"
    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}

--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -9,8 +9,8 @@ import os
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
-                                              LoadLoraAdapterRequest,
+                                              LoadLoRAAdapterRequest,
-                                              UnloadLoraAdapterRequest)
+                                              UnloadLoRAAdapterRequest)
 from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                    OpenAIServingModels)
 from vllm.lora.request import LoRARequest
@@ -53,7 +53,7 @@ async def test_serving_model_name():
 @pytest.mark.asyncio
 async def test_load_lora_adapter_success():
    serving_models = await _async_serving_models_init()
-    request = LoadLoraAdapterRequest(lora_name="adapter",
+    request = LoadLoRAAdapterRequest(lora_name="adapter",
                                     lora_path="/path/to/adapter2")
    response = await serving_models.load_lora_adapter(request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
@@ -64,7 +64,7 @@ async def test_load_lora_adapter_success():
 @pytest.mark.asyncio
 async def test_load_lora_adapter_missing_fields():
    serving_models = await _async_serving_models_init()
-    request = LoadLoraAdapterRequest(lora_name="", lora_path="")
+    request = LoadLoRAAdapterRequest(lora_name="", lora_path="")
    response = await serving_models.load_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "InvalidUserInput"
@@ -74,14 +74,14 @@ async def test_load_lora_adapter_missing_fields():
 @pytest.mark.asyncio
 async def test_load_lora_adapter_duplicate():
    serving_models = await _async_serving_models_init()
-    request = LoadLoraAdapterRequest(lora_name="adapter1",
+    request = LoadLoRAAdapterRequest(lora_name="adapter1",
                                     lora_path="/path/to/adapter1")
    response = await serving_models.load_lora_adapter(request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
        lora_name='adapter1')
    assert len(serving_models.lora_requests) == 1
-    request = LoadLoraAdapterRequest(lora_name="adapter1",
+    request = LoadLoRAAdapterRequest(lora_name="adapter1",
                                     lora_path="/path/to/adapter1")
    response = await serving_models.load_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
@@ -93,12 +93,12 @@ async def test_load_lora_adapter_duplicate():
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_success():
    serving_models = await _async_serving_models_init()
-    request = LoadLoraAdapterRequest(lora_name="adapter1",
+    request = LoadLoRAAdapterRequest(lora_name="adapter1",
                                     lora_path="/path/to/adapter1")
    response = await serving_models.load_lora_adapter(request)
    assert len(serving_models.lora_requests) == 1
-    request = UnloadLoraAdapterRequest(lora_name="adapter1")
+    request = UnloadLoRAAdapterRequest(lora_name="adapter1")
    response = await serving_models.unload_lora_adapter(request)
    assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
        lora_name='adapter1')
@@ -108,7 +108,7 @@ async def test_unload_lora_adapter_success():
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_missing_fields():
    serving_models = await _async_serving_models_init()
-    request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
+    request = UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None)
    response = await serving_models.unload_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "InvalidUserInput"
@@ -118,7 +118,7 @@ async def test_unload_lora_adapter_missing_fields():
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_not_found():
    serving_models = await _async_serving_models_init()
-    request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
+    request = UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter")
    response = await serving_models.unload_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "NotFoundError"

--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/openai/test_sleep.py
@@ -28,5 +28,12 @@ def test_sleep_mode():
        response = requests.post(remote_server.url_for("/sleep"),
                                 data={"level": "1"})
        assert response.status_code == 200
+        response = requests.get(remote_server.url_for("/is_sleeping"))
+        assert response.status_code == 200
+        assert response.json().get("is_sleeping") is True
        response = requests.post(remote_server.url_for("/wake_up"))
        assert response.status_code == 200
+        response = requests.get(remote_server.url_for("/is_sleeping"))
+        assert response.status_code == 200
+        assert response.json().get("is_sleeping") is False
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -3,12 +3,14 @@
 # imports for guided decoding tests
 import io
 import json
+from unittest.mock import patch
 import librosa
 import numpy as np
 import openai
 import pytest
 import soundfile as sf
+from openai._base_client import AsyncAPIClient
 from vllm.assets.audio import AudioAsset
@@ -120,3 +122,73 @@ async def test_completion_endpoints():
        res = await client.completions.create(model=model_name, prompt="Hello")
        assert res.code == 400
        assert res.message == "The model does not support Completions API"
+@pytest.mark.asyncio
+async def test_streaming_response(winning_call):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    transcription = ""
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        res_no_stream = await client.audio.transcriptions.create(
+            model=model_name,
+            file=winning_call,
+            response_format="json",
+            language="en",
+            temperature=0.0)
+        # Streaming is not exposed by the client's transcription API, so this
+        # only works when the openai client is patched to use streaming mode.
+        original_post = AsyncAPIClient.post
+        async def post_with_stream(*args, **kwargs):
+            kwargs['stream'] = True
+            return await original_post(*args, **kwargs)
+        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
+            client = remote_server.get_async_client()
+            res = await client.audio.transcriptions.create(
+                model=model_name,
+                file=winning_call,
+                language="en",
+                temperature=0.0,
+                extra_body=dict(stream=True))
+            # Reconstruct from chunks and validate
+            async for chunk in res:
+                # just a chunk
+                text = chunk.choices[0]['delta']['content']
+                transcription += text
+        assert transcription == res_no_stream.text
+@pytest.mark.asyncio
+async def test_stream_options(winning_call):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        original_post = AsyncAPIClient.post
+        async def post_with_stream(*args, **kwargs):
+            kwargs['stream'] = True
+            return await original_post(*args, **kwargs)
+        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
+            client = remote_server.get_async_client()
+            res = await client.audio.transcriptions.create(
+                model=model_name,
+                file=winning_call,
+                language="en",
+                temperature=0.0,
+                extra_body=dict(stream=True,
+                                stream_include_usage=True,
+                                stream_continuous_usage_stats=True))
+            final = False
+            continuous = True
+            async for chunk in res:
+                if not len(chunk.choices):
+                    # final usage sent
+                    final = True
+                else:
+                    continuous = continuous and hasattr(chunk, 'usage')
+            assert final and continuous
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Dict, List
 import os
 import openai
 import pytest
@@ -34,8 +32,6 @@ def server():
    args = [
        "--task",
        "generate",
-        "--dtype",
-        "bfloat16",
        "--max-model-len",
        "32768",
        "--max-num-seqs",
@@ -57,7 +53,7 @@ async def client(server):
 @pytest.fixture(scope="session")
-def base64_encoded_video() -> Dict[str, str]:
+def base64_encoded_video() -> dict[str, str]:
    return {
        video_url: encode_video_base64(fetch_video(video_url))
        for video_url in TEST_VIDEO_URLS
@@ -99,7 +95,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
+        completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
    message = choice.message
    message = chat_completion.choices[0].message
@@ -159,7 +155,7 @@ async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 async def test_single_chat_session_video_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, video_url: str,
-        base64_encoded_video: Dict[str, str]):
+        base64_encoded_video: dict[str, str]):
    messages = [{
        "role":
@@ -192,7 +188,7 @@ async def test_single_chat_session_video_base64encoded(
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
+        completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
    message = choice.message
    message = chat_completion.choices[0].message
@@ -217,7 +213,7 @@ async def test_single_chat_session_video_base64encoded(
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 async def test_single_chat_session_video_base64encoded_beamsearch(
        client: openai.AsyncOpenAI, model_name: str, video_url: str,
-        base64_encoded_video: Dict[str, str]):
+        base64_encoded_video: dict[str, str]):
    messages = [{
        "role":
@@ -287,7 +283,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI,
        temperature=0.0,
        stream=True,
    )
-    chunks: List[str] = []
+    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
@@ -310,7 +306,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI,
    "video_urls",
    [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))])
 async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str,
-                                 video_urls: List[str]):
+                                 video_urls: list[str]):
    messages = [{
        "role":

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Dict, List
 import openai
 import pytest
 import os
@@ -36,8 +34,6 @@ def server():
    args = [
        "--task",
        "generate",
-        "--dtype",
-        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
@@ -59,7 +55,7 @@ async def client(server):
 @pytest.fixture(scope="session")
-def base64_encoded_image() -> Dict[str, str]:
+def base64_encoded_image() -> dict[str, str]:
    return {
        image_url: encode_image_base64(fetch_image(image_url))
        for image_url in TEST_IMAGE_URLS
@@ -161,7 +157,7 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_single_chat_session_image_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, image_url: str,
-        base64_encoded_image: Dict[str, str]):
+        base64_encoded_image: dict[str, str]):
    messages = [{
        "role":
@@ -219,7 +215,7 @@ async def test_single_chat_session_image_base64encoded(
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_single_chat_session_image_base64encoded_beamsearch(
        client: openai.AsyncOpenAI, model_name: str, image_url: str,
-        base64_encoded_image: Dict[str, str]):
+        base64_encoded_image: dict[str, str]):
    messages = [{
        "role":
@@ -289,7 +285,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
        temperature=0.0,
        stream=True,
    )
-    chunks: List[str] = []
+    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
@@ -312,7 +308,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
    "image_urls",
    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
 async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
-                                 image_urls: List[str]):
+                                 image_urls: list[str]):
    messages = [{
        "role":

--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Dict
 import os
 import pytest
 import requests
@@ -38,8 +36,6 @@ def server():
    args = [
        "--task",
        "embed",
-        "--dtype",
-        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
@@ -57,7 +53,7 @@ def server():
 @pytest.fixture(scope="session")
-def base64_encoded_image() -> Dict[str, str]:
+def base64_encoded_image() -> dict[str, str]:
    return {
        image_url: encode_image_base64(fetch_image(image_url))
        for image_url in TEST_IMAGE_URLS

--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List
 from unittest.mock import MagicMock
 import pytest
@@ -125,7 +124,7 @@ TEST_CASES = [
 @pytest.mark.parametrize("streaming, model_output, expected_tool_calls",
                         TEST_CASES)
 def test_tool_call(streaming: bool, model_output: str,
-                   expected_tool_calls: List[FunctionCall]):
+                   expected_tool_calls: list[FunctionCall]):
    mock_tokenizer = MagicMock()
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        mock_tokenizer)

--- a/tests/entrypoints/openai/tool_parsers/utils.py
+++ b/tests/entrypoints/openai/tool_parsers/utils.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Iterable, List, Tuple, Union
+from collections.abc import Iterable
+from typing import Union
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage,
@@ -12,7 +13,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser
 class StreamingToolReconstructor:
    def __init__(self, assert_one_tool_per_delta: bool = True):
-        self.tool_calls: List[ToolCall] = []
+        self.tool_calls: list[ToolCall] = []
        self.other_content: str = ""
        self._assert_one_tool_per_delta = assert_one_tool_per_delta
@@ -72,7 +73,7 @@ def run_tool_extraction(
    request: Union[ChatCompletionRequest, None] = None,
    streaming: bool = False,
    assert_one_tool_per_delta: bool = True,
-) -> Tuple[Union[str, None], List[ToolCall]]:
+) -> tuple[Union[str, None], list[ToolCall]]:
    if streaming:
        reconstructor = run_tool_extraction_streaming(
            tool_parser,
@@ -106,7 +107,7 @@ def run_tool_extraction_streaming(
    reconstructor = StreamingToolReconstructor(
        assert_one_tool_per_delta=assert_one_tool_per_delta)
    previous_text = ""
-    previous_tokens: List[int] = []
+    previous_tokens: list[int] = []
    for delta in model_deltas:
        token_delta = [
            tool_parser.vocab.get(token)

--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -5,10 +5,13 @@ from typing import Optional
 import pytest
 import os
+from packaging.version import Version
+from transformers import __version__ as TRANSFORMERS_VERSION
 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
-from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
+from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template,
+                                         _try_extract_ast, load_chat_template,
                                         parse_chat_messages,
                                         parse_chat_messages_futures,
                                         resolve_chat_template_content_format)
@@ -22,11 +25,14 @@ from ..utils import VLLM_PATH
 EXAMPLES_DIR = VLLM_PATH / "examples"
 PHI3V_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
 ULTRAVOX_MODEL_ID = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
 QWEN2VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
+QWEN25VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")
 MLLAMA_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct")
 LLAMA_GUARD_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-Guard-3-1B")
+HERMES_MODEL_ID = os.path.join(models_path_prefix, "NousResearch/Hermes-3-Llama-3.1-8B")
 @pytest.fixture(scope="function")
@@ -36,7 +42,7 @@ def phi3v_model_config():
                       tokenizer=PHI3V_MODEL_ID,
                       tokenizer_mode="auto",
                       trust_remote_code=True,
-                       dtype="bfloat16",
+                       dtype="auto",
                       seed=0,
                       limit_mm_per_prompt={
                           "image": 2,
@@ -60,7 +66,7 @@ def mllama_model_config():
                       tokenizer=MLLAMA_MODEL_ID,
                       tokenizer_mode="auto",
                       trust_remote_code=True,
-                       dtype="bfloat16",
+                       dtype="auto",
                       seed=0,
                       limit_mm_per_prompt={
                           "image": 2,
@@ -671,7 +677,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
                               tokenizer=MLLAMA_MODEL_ID,
                               tokenizer_mode="auto",
                               trust_remote_code=True,
-                               dtype="bfloat16",
+                               dtype="auto",
                               seed=0,
                               limit_mm_per_prompt={
                                   "image": 2,
@@ -705,25 +711,70 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
    vllm_result = apply_hf_chat_template(
        tokenizer,
+        trust_remote_code=model_config.trust_remote_code,
        conversation=conversation,
        chat_template=None,
+        tools=None,
        add_generation_prompt=True,
    )
    assert hf_result == vllm_result
+@pytest.mark.parametrize(
+    "model",
+    [
+        QWEN2VL_MODEL_ID,  # tokenizer.chat_template is of type str
+        HERMES_MODEL_ID,  # tokenizer.chat_template is of type dict
+    ])
+@pytest.mark.parametrize("use_tools", [True, False])
+def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
+    """Check that the resolved chat template is a str, with or without tools."""
+    # Build the tokenizer group and grab the underlying tokenizer
+    tokenizer_group = TokenizerGroup(
+        model,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+    tokenizer = tokenizer_group.tokenizer
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "dummy_function_name",
+            "description": "This is a dummy function",
+            "parameters": sample_json_schema
+        }
+    }] if use_tools else None
+    # Test detecting the tokenizer's chat_template
+    chat_template = _resolve_hf_chat_template(
+        tokenizer,
+        chat_template=None,
+        tools=tools,
+        trust_remote_code=True,
+    )
+    assert isinstance(chat_template, str)
 # yapf: disable
 @pytest.mark.parametrize(
    ("model", "expected_format"),
    [(PHI3V_MODEL_ID, "string"),
     (QWEN2VL_MODEL_ID, "openai"),
+     (QWEN25VL_MODEL_ID, "openai"),
     (ULTRAVOX_MODEL_ID, "string"),
     (MLLAMA_MODEL_ID, "openai"),
     (LLAMA_GUARD_MODEL_ID, "openai")],
 )
 # yapf: enable
 def test_resolve_content_format_hf_defined(model, expected_format):
+    if model == QWEN25VL_MODEL_ID and Version(TRANSFORMERS_VERSION) < Version(
+            "4.49.0"):
+        pytest.skip("Qwen2.5-VL requires transformers>=4.49.0")
    tokenizer_group = TokenizerGroup(
        model,
        enable_lora=False,
@@ -732,7 +783,13 @@ def test_resolve_content_format_hf_defined(model, expected_format):
    )
    tokenizer = tokenizer_group.tokenizer
-    chat_template = tokenizer.chat_template
+    # Test detecting the tokenizer's chat_template
+    chat_template = _resolve_hf_chat_template(
+        tokenizer,
+        chat_template=None,
+        tools=None,
+        trust_remote_code=True,
+    )
    assert isinstance(chat_template, str)
    print("[TEXT]")
@@ -742,8 +799,10 @@ def test_resolve_content_format_hf_defined(model, expected_format):
    resolved_format = resolve_chat_template_content_format(
        None,  # Test detecting the tokenizer's chat_template
+        None,
        "auto",
        tokenizer,
+        trust_remote_code=True,
    )
    assert resolved_format == expected_format
@@ -793,8 +852,10 @@ def test_resolve_content_format_examples(template_path, expected_format):
    resolved_format = resolve_chat_template_content_format(
        chat_template,
+        None,
        "auto",
        dummy_tokenizer,
+        trust_remote_code=True,
    )
    assert resolved_format == expected_format
--- a/tests/entrypoints/test_ssl_cert_refresher.py
+++ b/tests/entrypoints/test_ssl_cert_refresher.py
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+import tempfile
+from pathlib import Path
+from ssl import SSLContext
+import pytest
+from vllm.entrypoints.ssl import SSLCertRefresher
+class MockSSLContext(SSLContext):
+    """SSLContext stand-in that only counts certificate reload calls."""
+
+    def __init__(self):
+        # Both reload counters start at zero.
+        self.load_ca_count = 0
+        self.load_cert_chain_count = 0
+
+    def load_cert_chain(
+        self,
+        certfile,
+        keyfile=None,
+        password=None,
+    ):
+        """Count one cert-chain reload; the arguments are ignored."""
+        self.load_cert_chain_count = self.load_cert_chain_count + 1
+
+    def load_verify_locations(
+        self,
+        cafile=None,
+        capath=None,
+        cadata=None,
+    ):
+        """Count one CA reload; the arguments are ignored."""
+        self.load_ca_count = self.load_ca_count + 1
+def create_file() -> str:
+    """Create an empty, persistent temporary file and return its path.
+
+    The file is created in the platform's default temporary directory
+    (instead of a hard-coded ``/tmp``) and is not deleted on close; the
+    caller is responsible for removing it.
+    """
+    with tempfile.NamedTemporaryFile(delete=False) as f:
+        return f.name
+def touch_file(path: str) -> None:
+    """Touch ``path`` so its modification time is refreshed."""
+    target = Path(path)
+    target.touch()
+@pytest.mark.asyncio
+async def test_ssl_refresher():
+    """Check that touching key/cert/CA files triggers the matching reloads.
+
+    Reload counts are read from ``MockSSLContext`` after each one-second
+    wait. After ``stop()`` further touches must not change the counts.
+    The refresher is always stopped and the temporary files are always
+    removed, even when an assertion fails.
+    """
+    ssl_context = MockSSLContext()
+    key_path = create_file()
+    cert_path = create_file()
+    ca_path = create_file()
+    ssl_refresher = SSLCertRefresher(ssl_context, key_path, cert_path, ca_path)
+    # Tracks whether stop() already ran so the cleanup calls it at most once.
+    stopped = False
+    try:
+        # Nothing has been touched yet, so nothing may have been reloaded.
+        await asyncio.sleep(1)
+        assert ssl_context.load_cert_chain_count == 0
+        assert ssl_context.load_ca_count == 0
+        # Touching the key alone reloads the cert chain only.
+        touch_file(key_path)
+        await asyncio.sleep(1)
+        assert ssl_context.load_cert_chain_count == 1
+        assert ssl_context.load_ca_count == 0
+        # Touching the cert and the CA reloads each of them once.
+        touch_file(cert_path)
+        touch_file(ca_path)
+        await asyncio.sleep(1)
+        assert ssl_context.load_cert_chain_count == 2
+        assert ssl_context.load_ca_count == 1
+        # Once stopped, further touches must be ignored.
+        ssl_refresher.stop()
+        stopped = True
+        touch_file(cert_path)
+        touch_file(ca_path)
+        await asyncio.sleep(1)
+        assert ssl_context.load_cert_chain_count == 2
+        assert ssl_context.load_ca_count == 1
+    finally:
+        if not stopped:
+            ssl_refresher.stop()
+        # The files were created with delete=False; remove them explicitly.
+        for path in (key_path, cert_path, ca_path):
+            Path(path).unlink(missing_ok=True)
--- a/tests/fastsafetensors_loader/__init__.py
+++ b/tests/fastsafetensors_loader/__init__.py
--- a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py
+++ b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py
+# SPDX-License-Identifier: Apache-2.0
+from vllm import SamplingParams
+from vllm.config import LoadFormat
+# Model loaded by the test in this module.
+test_model = "openai-community/gpt2"
+# Prompts passed to ``llm.generate``.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
+def test_model_loader_download_files(vllm_runner):
+    """Generate with the FASTSAFETENSORS load format and expect output."""
+    runner_ctx = vllm_runner(test_model,
+                             load_format=LoadFormat.FASTSAFETENSORS)
+    with runner_ctx as llm:
+        deserialized_outputs = llm.generate(prompts, sampling_params)
+        assert deserialized_outputs
--- a/tests/fastsafetensors_loader/test_weight_utils.py
+++ b/tests/fastsafetensors_loader/test_weight_utils.py
+# SPDX-License-Identifier: Apache-2.0
+import glob
+import tempfile
+import huggingface_hub.constants
+import torch
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf, fastsafetensors_weights_iterator,
+    safetensors_weights_iterator)
+def test_fastsafetensors_model_loader():
+    """Compare tensors from the fastsafetensors and safetensors iterators.
+
+    Downloads the gpt2 ``*.safetensors`` files into a temporary cache
+    directory, loads them through both weight iterators, and checks that
+    every tensor has the same dtype, shape and values.
+    """
+    # HF_HUB_OFFLINE is process-wide state; remember it so the override
+    # below does not leak into tests that run afterwards.
+    prev_offline = huggingface_hub.constants.HF_HUB_OFFLINE
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            huggingface_hub.constants.HF_HUB_OFFLINE = False
+            download_weights_from_hf("openai-community/gpt2",
+                                     allow_patterns=["*.safetensors"],
+                                     cache_dir=tmpdir)
+            safetensors = glob.glob(f"{tmpdir}/**/*.safetensors",
+                                    recursive=True)
+            assert len(safetensors) > 0
+            fastsafetensors_tensors = {}
+            hf_safetensors_tensors = {}
+            for name, tensor in fastsafetensors_weights_iterator(
+                    safetensors, True):
+                fastsafetensors_tensors[name] = tensor
+            for name, tensor in safetensors_weights_iterator(
+                    safetensors, True):
+                hf_safetensors_tensors[name] = tensor
+            assert len(fastsafetensors_tensors) == len(hf_safetensors_tensors)
+            for name, fastsafetensors_tensor in fastsafetensors_tensors.items(
+            ):
+                # Move to CPU before comparing against the reference tensor.
+                fastsafetensors_tensor = fastsafetensors_tensor.to('cpu')
+                assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[
+                    name].dtype
+                assert fastsafetensors_tensor.shape == hf_safetensors_tensors[
+                    name].shape
+                assert torch.all(
+                    fastsafetensors_tensor.eq(hf_safetensors_tensors[name]))
+    finally:
+        huggingface_hub.constants.HF_HUB_OFFLINE = prev_offline
+if __name__ == "__main__":
+    test_fastsafetensors_model_loader()
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional, Tuple, Union
+from typing import Optional, Union
 import torch
@@ -9,8 +9,7 @@ from vllm.platforms import current_platform
 # Using the default value (240.0) from pytorch will cause accuracy
 # issue on dynamic quantization models. Here use 224.0 for rocm.
 ROCM_FP8_MAX = 224.0
-FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() \
+FP8_DTYPE = current_platform.fp8_dtype()
-                else torch.float8_e4m3fn
 def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
@@ -19,7 +18,7 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
 def ref_dynamic_per_token_quant(x: torch.tensor,
                                quant_dtype: torch.dtype,
                                scale_ub: Optional[torch.tensor] = None) \
-        -> Tuple[torch.tensor, torch.tensor]:
+        -> tuple[torch.tensor, torch.tensor]:
    assert quant_dtype in [torch.int8, FP8_DTYPE]
    if scale_ub is not None:
@@ -68,7 +67,7 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
 # ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant
 # kernel
 def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
-                    -> Tuple[torch.tensor, torch.tensor]:
+                    -> tuple[torch.tensor, torch.tensor]:
    fp8_traits = torch.finfo(FP8_DTYPE)
    fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \

--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
 # SPDX-License-Identifier: Apache-2.0
 import random
-from typing import Type
 import pytest
 import torch
@@ -86,7 +85,7 @@ def test_act_and_mul(
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_activation(
-    activation: Type[torch.nn.Module],
+    activation: type[torch.nn.Module],
    num_tokens: int,
    d: int,
    dtype: torch.dtype,

--- a/tests/kernels/test_allspark_gemm.py
+++ b/tests/kernels/test_allspark_gemm.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.allspark_utils import (
+    ALLSPARK_AMPERE_K_ALIGN, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
+    ALLSPARK_AMPERE_N_ALIGN)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    quantize_weights)
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+def is_gptq_allspark_supported(min_capability: int,
+                               max_capability: int) -> bool:
+    """Return whether the current device capability lies in the given range.
+
+    Non-CUDA platforms are reported as unsupported. Both bounds are
+    inclusive and are compared against ``capability.to_int()``.
+    """
+    if current_platform.is_cuda():
+        capability = current_platform.get_device_capability()
+        assert capability is not None
+        return min_capability <= capability.to_int() <= max_capability
+    return False
+# (m, n_factor, k_factor) triples. The test uses m as-is and multiplies the
+# n and k factors by ALLSPARK_AMPERE_N_ALIGN / ALLSPARK_AMPERE_K_ALIGN.
+MNK_FACTORS = [
+    (1, 4, 8),
+    (13, 17, 67),
+    (26, 37, 13),
+    (48, 16, 24),
+    (67, 13, 88),
+    (257, 13, 11),
+    (658, 13, 11),
+    (1033, 9, 17),
+]
+# dtypes used for the random input and weight tensors.
+DTYPES = [torch.float16, torch.bfloat16]
+# Values of the has_zp flag passed to quantize_weights and the kernels.
+HAS_ZP_OPTS = [False, True]
+def compute_max_diff(output, output_ref):
+    """Return mean(|output - output_ref|) / mean(|output_ref|).
+
+    Despite the name, this is a mean relative error, not a maximum.
+    """
+    mean_abs_err = torch.mean(torch.abs(output - output_ref))
+    mean_abs_ref = torch.mean(torch.abs(output_ref))
+    return mean_abs_err / mean_abs_ref
+def rand_data(shape, dtype=torch.float16):
+    """Return a ``torch.randn`` tensor of ``shape`` and ``dtype`` on "cuda"."""
+    sample = torch.randn(shape, dtype=dtype, device="cuda")
+    return sample
+# Skipped unless is_gptq_allspark_supported(80, 89) holds on this platform.
+@pytest.mark.skipif(
+    not is_gptq_allspark_supported(80, 89),
+    reason="AllSpark Ampere kernel is not supported on this GPU type.")
+@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
+@pytest.mark.parametrize("group_size", [-1])
+@pytest.mark.parametrize("has_zp", HAS_ZP_OPTS)
+@pytest.mark.parametrize("dtype", DTYPES)
+def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype):
+    # Compares ops.allspark_w8a16_gemm on repacked uint8 weights against
+    # torch.matmul with the reference weights returned by quantize_weights.
+    m_factor, n_factor, k_factor = mnk_factors
+    # m is used directly; n and k are scaled to the AllSpark alignment.
+    m = m_factor
+    n = n_factor * ALLSPARK_AMPERE_N_ALIGN
+    k = k_factor * ALLSPARK_AMPERE_K_ALIGN
+    input = rand_data((m, k), dtype=dtype)
+    weight = rand_data((k, n), dtype=dtype)
+    # Quantize the weight as uint8b128, with zero points when has_zp is set
+    w_ref, qw, s, zp = quantize_weights(weight, scalar_types.uint8b128,
+                                        group_size, has_zp)
+    qw = qw.to(torch.uint8)
+    if has_zp:
+        zp = zp.to(dtype)
+    # Device properties supply the SM count and the major*10+minor version.
+    properties = torch.cuda.get_device_properties(qw.device.index)
+    sm_count = properties.multi_processor_count
+    sm_version = properties.major * 10 + properties.minor
+    # Round n up to the next multiple of 32.
+    n_32align = (n + 32 - 1) // 32 * 32
+    qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
+        qw, s, zp, has_zp)
+    # opcheck both custom ops before running the GEMM itself.
+    opcheck(torch.ops._C.rearrange_kn_weight_as_n32k16_order,
+            (qw, s, zp, has_zp, qw_reorder, s_reorder, zp_reorder, k, n,
+             n_32align))
+    opcheck(torch.ops._C.allspark_w8a16_gemm,
+            (input, qw_reorder, s_reorder, zp_reorder, n, group_size, sm_count,
+             sm_version, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, has_zp, True),
+            test_utils=DEFAULT_OPCHECK_TEST_UTILS)
+    output = ops.allspark_w8a16_gemm(input, qw_reorder, s_reorder, zp_reorder,
+                                     n, group_size, sm_count, sm_version,
+                                     ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
+                                     has_zp, True)
+    output_ref = torch.matmul(input, w_ref)
+    torch.cuda.synchronize()
+    # compute_max_diff returns a mean relative error; it must stay below 0.04.
+    max_diff = compute_max_diff(output, output_ref)
+    assert max_diff < 0.04