Merge tag 'v0.7.1' into v0.7.1-dev

afd0da21 · zhuwenwen · 1a11f127 · 4f4d427a · afd0da21 · afd0da21
Commit afd0da21 authored Feb 03, 2025 by zhuwenwen
20 changed files
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
+# unit test for `examples/offline_inference/torchrun_example.py`
+import random
+import torch.distributed as dist
+from vllm import LLM, SamplingParams
+from vllm.distributed.parallel_state import get_world_group
+# Create prompts
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+# set different `gpu_memory_utilization` and `swap_space` for different ranks,
+# to test if all ranks agree on the same kv cache configuration.
+llm = LLM(model="facebook/opt-125m",
+          tensor_parallel_size=2,
+          distributed_executor_backend="external_launcher",
+          gpu_memory_utilization=random.uniform(0.7, 0.9),
+          swap_space=random.randint(1, 4))
+outputs = llm.generate(prompts, sampling_params)
+cpu_group = get_world_group().cpu_group
+torch_rank = dist.get_rank(group=cpu_group)
+def test_consistent_across_ranks(obj):
+    """Assert that this rank's `obj` equals the value held by rank 0.
+
+    Rank 0 broadcasts its `obj` over `cpu_group`; every other rank receives
+    that value and asserts it equals its own local `obj`.
+    """
+    if torch_rank == 0:
+        dist.broadcast_object_list([obj], src=0, group=cpu_group)
+    else:
+        # one-slot placeholder that the broadcast fills with rank 0's object
+        container = [None]
+        dist.broadcast_object_list(container, src=0, group=cpu_group)
+        assert container[0] == obj
+test_consistent_across_ranks(
+    llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
+test_consistent_across_ranks(
+    llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
+# all ranks should have the same outputs
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    test_consistent_across_ranks(prompt)
+    test_consistent_across_ranks(generated_text)
+    print(f"Rank {torch_rank}, Prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
 import asyncio
 import os
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import pytest
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
-from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
+from vllm.executor.uniproc_executor import UniProcExecutor
 from vllm.sampling_params import SamplingParams
 import os
 from ..utils import models_path_prefix
@@ -16,21 +17,20 @@ class Mock:
    ...
-class CustomGPUExecutor(GPUExecutor):
+class CustomUniExecutor(UniProcExecutor):
-    def execute_model(self, *args, **kwargs):
+    def collective_rpc(self,
+                       method: Union[str, Callable],
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
        # Drop marker to show that this was run
        with open(".marker", "w"):
            ...
-        return super().execute_model(*args, **kwargs)
+        return super().collective_rpc(method, timeout, args, kwargs)
-class CustomGPUExecutorAsync(GPUExecutorAsync):
+CustomUniExecutorAsync = CustomUniExecutor
-    async def execute_model_async(self, *args, **kwargs):
-        with open(".marker", "w"):
-            ...
-        return await super().execute_model_async(*args, **kwargs)
 @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@@ -43,10 +43,6 @@ def test_custom_executor_type_checking(model):
        engine_args = AsyncEngineArgs(model=model,
                                      distributed_executor_backend=Mock)
        AsyncLLMEngine.from_engine_args(engine_args)
-    with pytest.raises(TypeError):
-        engine_args = AsyncEngineArgs(
-            model=model, distributed_executor_backend=CustomGPUExecutor)
-        AsyncLLMEngine.from_engine_args(engine_args)
 @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@@ -57,7 +53,9 @@ def test_custom_executor(model, tmp_path):
        assert not os.path.exists(".marker")
        engine_args = EngineArgs(
-            model=model, distributed_executor_backend=CustomGPUExecutor)
+            model=model,
+            distributed_executor_backend=CustomUniExecutor,
+        )
        engine = LLMEngine.from_engine_args(engine_args)
        sampling_params = SamplingParams(max_tokens=1)
@@ -77,7 +75,7 @@ def test_custom_executor_async(model, tmp_path):
        assert not os.path.exists(".marker")
        engine_args = AsyncEngineArgs(
-            model=model, distributed_executor_backend=CustomGPUExecutorAsync)
+            model=model, distributed_executor_backend=CustomUniExecutorAsync)
        engine = AsyncLLMEngine.from_engine_args(engine_args)
        sampling_params = SamplingParams(max_tokens=1)

--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -6,16 +6,15 @@ from typing import Any, List, Tuple
 import pytest
+from vllm.config import VllmConfig
 from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                  ResultHandler, WorkerMonitor)
+from vllm.worker.worker_base import WorkerWrapperBase
-class DummyWorker:
+class DummyWorkerWrapper(WorkerWrapperBase):
    """Dummy version of vllm.worker.worker.Worker"""
-    def __init__(self, rank: int):
-        self.rank = rank
    def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
        sleep(0.05)
@@ -23,14 +22,15 @@ class DummyWorker:
            # simulate error case
            raise worker_input
-        return self.rank, input
+        return self.rpc_rank, input
 def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
    result_handler = ResultHandler()
+    vllm_config = VllmConfig()
    workers = [
-        ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
+        ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config,
-        for rank in range(8)
+                             rank) for rank in range(8)
    ]
    worker_monitor = WorkerMonitor(workers, result_handler)

--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
+import pytest
+from vllm import LLM
+from ...utils import fork_new_process_for_each_test
+@pytest.mark.parametrize("tp_size", [1, 2])
+@pytest.mark.parametrize("backend", ["mp", "ray"])
+@fork_new_process_for_each_test
+def test_collective_rpc(tp_size, backend):
+    """Check that `LLM.collective_rpc` returns `list(range(tp_size))` for a
+    rank-echoing method, given either by name or as a callable."""
+    # with tp_size == 1 the backend is reset to None below, so the "mp" and
+    # "ray" variants would be the same run; keep only one of them
+    if tp_size == 1 and backend == "ray":
+        pytest.skip("Skip duplicate test case")
+    if tp_size == 1:
+        backend = None
+    # intentionally define the method and class in the test function,
+    # to test if they can be serialized and sent to the workers
+    def echo_rank(self):
+        return self.rank
+    from vllm.worker.worker import Worker
+    class MyWorker(Worker):
+        def echo_rank(self):
+            return self.rank
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+              enforce_eager=True,
+              load_format="dummy",
+              tensor_parallel_size=tp_size,
+              distributed_executor_backend=backend,
+              worker_cls=MyWorker)
+    # first by method name (resolved on MyWorker), then as a local callable
+    for method in ["echo_rank", echo_rank]:
+        assert llm.collective_rpc(method) == list(range(tp_size))
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -107,3 +107,10 @@ def test_multiple_pooling_params(llm: LLM):
    # pooling_params is None, default params should be applied
    outputs = llm.encode(PROMPTS, pooling_params=None)
    assert len(PROMPTS) == len(outputs)
+@pytest.mark.skip_global_cleanup
+def test_right_side_truncation(llm: LLM):
+    # Embeddings models should truncate the end of the prompt
+    tokenizer = llm.get_tokenizer()
+    assert tokenizer.truncation_side == "right"
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py
--- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+from typing import List
+import pytest
+from transformers import AutoTokenizer
+from tests.entrypoints.openai.reasoning_parsers.utils import (
+    run_reasoning_extraction)
+from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
+                                                       ReasoningParserManager)
+# name passed to ReasoningParserManager.get_reasoning_parser in the test
+parser_name = "deepseek_r1"
+# tags that delimit the reasoning section in the sample outputs; the test
+# adds both to the tokenizer before tokenizing
+start_token = "<think>"
+end_token = "</think>"
+# Each case pairs a raw model "output" with the "reasoning_content" and
+# "content" the parser is expected to extract from it.
+SIMPLE_REASONING = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+# nothing follows the closing tag, so the expected content is None
+COMPLETE_REASONING = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+# no tags at all: the whole output is expected back as content
+NO_REASONING = {
+    "output": "This is a reasoning section",
+    "reasoning_content": None,
+    "content": "This is a reasoning section",
+}
+# newlines appear in both the reasoning and the content part
+MULTIPLE_LINES = {
+    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "reasoning_content": "This\nThat",
+    "content": "This is the rest\nThat",
+}
+# empty reasoning section, non-streaming expectation: reasoning is ""
+SHORTEST_REASONING_NO_STREAMING = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": "",
+    "content": "This is the rest",
+}
+# same output, streaming expectation: reasoning is None
+SHORTEST_REASONING = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+}
+# (streaming, expected-case dict) pairs fed to test_reasoning.
+# Every id is unique and names the mode it exercises: the bare id is the
+# non-streaming run, the "_streaming" suffix is the streaming run.
+TEST_CASES = [
+    pytest.param(
+        False,
+        SIMPLE_REASONING,
+        id="simple_reasoning",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_REASONING,
+        id="simple_reasoning_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_reasoning",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_reasoning_streaming",
+    ),
+    pytest.param(
+        False,
+        NO_REASONING,
+        id="no_reasoning",
+    ),
+    pytest.param(
+        True,
+        NO_REASONING,
+        id="no_reasoning_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES,
+        id="multiple_lines",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        True,
+        SHORTEST_REASONING,
+        id="shortest_streaming",
+    ),
+    pytest.param(
+        False,
+        SHORTEST_REASONING_NO_STREAMING,
+        id="shortest",
+    ),
+]
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+):
+    """Run the parser named by `parser_name` over one case's output and
+    compare the extracted reasoning and content with the expected values.
+
+    `streaming` selects which extraction path run_reasoning_extraction uses;
+    `param_dict` supplies "output", "reasoning_content" and "content".
+    """
+    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+    # register the think tags with the tokenizer before tokenizing
+    tokenizer.add_tokens([start_token, end_token])
+    output = tokenizer.tokenize(param_dict["output"])
+    # decode everything to tokens
+    output_tokens: List[str] = [
+        tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(tokenizer)
+    reasoning, content = run_reasoning_extraction(parser,
+                                                  output_tokens,
+                                                  streaming=streaming)
+    assert reasoning == param_dict["reasoning_content"]
+    assert content == param_dict["content"]
--- a/tests/entrypoints/openai/reasoning_parsers/utils.py
+++ b/tests/entrypoints/openai/reasoning_parsers/utils.py
+from typing import List, Optional, Tuple, Union
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser
+class StreamingReasoningReconstructor:
+    """Accumulates streamed deltas into one reasoning string and one
+    content string."""
+    def __init__(self):
+        # both start as None; the first fragment replaces None and later
+        # fragments are concatenated onto it
+        self.reasoning_content = None
+        self.other_content = None
+    def append_delta(self, delta: DeltaMessage):
+        """Fold one delta into the accumulated text.
+
+        A delta whose `content` is not None is appended to `other_content`;
+        any other delta has its `reasoning_content` appended to
+        `reasoning_content`.
+        """
+        # content and the reasoning content should not be present
+        # at the same time
+        assert delta.content is None or delta.reasoning_content is None, (
+            "Both content and reasoning content are present in the "
+            "delta message")
+        if delta.content is not None:
+            if self.other_content is None:
+                self.other_content = delta.content
+            else:
+                self.other_content += delta.content
+        else:
+            if self.reasoning_content is None:
+                self.reasoning_content = delta.reasoning_content
+            else:
+                self.reasoning_content += delta.reasoning_content
+def run_reasoning_extraction(
+    reasoning_parser: ReasoningParser,
+    model_output: List[str],
+    request: Union[ChatCompletionRequest, None] = None,
+    streaming: bool = False,
+) -> Tuple[Optional[str], Optional[str]]:
+    """Extract `(reasoning, content)` from `model_output` with the parser.
+
+    With `streaming` set, the pieces are fed through the streaming helper
+    and the accumulated strings are returned; otherwise the non-streaming
+    helper is used and its result is returned unchanged.
+    """
+    if streaming:
+        reconstructor = run_reasoning_extraction_streaming(
+            reasoning_parser,
+            model_output,
+            request,
+        )
+        return (
+            reconstructor.reasoning_content,
+            # an empty accumulated content string is reported as None
+            reconstructor.other_content or None,
+        )
+    else:
+        reasoning, content = run_reasoning_extraction_nonstreaming(
+            reasoning_parser, model_output, request)
+        return reasoning, content
+def run_reasoning_extraction_nonstreaming(
+    reasoning_parser: ReasoningParser,
+    model_output: List[str],
+    request: Union[ChatCompletionRequest, None] = None,
+) -> Tuple[Optional[str], Optional[str]]:
+    """Join `model_output` into one string and return the parser's
+    `extract_reasoning_content` result for it."""
+    # fall back to a minimal placeholder request when none is supplied
+    request = request or ChatCompletionRequest(messages=[], model="test-model")
+    return reasoning_parser.extract_reasoning_content(
+        model_output=''.join(model_output), request=request)
+def run_reasoning_extraction_streaming(
+    reasoning_parser: ReasoningParser,
+    model_deltas: List[str],
+    request: Union[ChatCompletionRequest, None] = None,
+) -> StreamingReasoningReconstructor:
+    """Feed `model_deltas` to the parser one piece at a time and return a
+    reconstructor holding everything the parser emitted.
+
+    For each piece the parser receives the text and token ids seen so far,
+    the text and token ids including the piece, and the piece itself.
+    """
+    request = request or ChatCompletionRequest(messages=[], model="test-model")
+    reconstructor = StreamingReasoningReconstructor()
+    previous_text = ""
+    previous_tokens: List[int] = []
+    for delta in model_deltas:
+        # vocab ids for this piece; tokens missing from the vocab are dropped
+        token_delta = [
+            reasoning_parser.vocab.get(token)
+            for token in reasoning_parser.model_tokenizer.tokenize(delta)
+            if token in reasoning_parser.vocab
+        ]
+        current_text = previous_text + delta
+        current_tokens = previous_tokens + token_delta
+        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+            previous_text,
+            current_text,
+            delta,
+            previous_tokens,
+            current_tokens,
+            token_delta,
+        )
+        # a None result carries nothing to accumulate for this piece
+        if delta_message is not None:
+            reconstructor.append_delta(delta_message)
+        previous_text = current_text
+        previous_tokens = current_tokens
+    return reconstructor
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -4,7 +4,7 @@ import pytest
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                              validate_parsed_serve_args)
-from vllm.entrypoints.openai.serving_engine import LoRAModulePath
+from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.utils import FlexibleArgumentParser
 from ...utils import VLLM_PATH
@@ -116,6 +116,35 @@ def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    validate_parsed_serve_args(args)
+def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
+    """Ensure validation fails if reasoning is enabled with auto tool choice"""
+    args = serve_parser.parse_args(args=[
+        "--enable-auto-tool-choice",
+        "--enable-reasoning",
+    ])
+    with pytest.raises(TypeError):
+        validate_parsed_serve_args(args)
+def test_enable_reasoning_passes_with_reasoning_parser(serve_parser):
+    """Ensure validation passes if reasoning is enabled 
+    with a reasoning parser"""
+    args = serve_parser.parse_args(args=[
+        "--enable-reasoning",
+        "--reasoning-parser",
+        "deepseek_r1",
+    ])
+    validate_parsed_serve_args(args)
+def test_enable_reasoning_fails_without_reasoning_parser(serve_parser):
+    """Ensure validation fails if reasoning is enabled 
+    without a reasoning parser"""
+    args = serve_parser.parse_args(args=["--enable-reasoning"])
+    with pytest.raises(TypeError):
+        validate_parsed_serve_args(args)
 def test_chat_template_validation_for_happy_paths(serve_parser):
    """Ensure validation passes if the chat template exists"""
    args = serve_parser.parse_args(

--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -29,6 +29,8 @@ PA_NAME = os.path.join(models_path_prefix, "swapnilbp/llama_tweet_ptune")
 # need to change to match the prompt adapter
 PA_NUM_VIRTUAL_TOKENS = 8
+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
@@ -638,8 +640,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-                         ["outlines", "lm-format-enforcer"])
 async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
@@ -661,8 +662,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-                         ["outlines", "lm-format-enforcer"])
 async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
@@ -683,8 +683,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-                         ["outlines", "lm-format-enforcer"])
 async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
@@ -764,8 +763,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-                         ["outlines", "lm-format-enforcer"])
 async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):

--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
+import asyncio
+import json
+import shutil
+from contextlib import suppress
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+from ...utils import RemoteOpenAIServer
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# generation quality here
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+# Each entry is (test_name, config_change, expected_error): the overrides
+# are merged into a copied adapter_config.json, and expected_error is the
+# pattern the resulting BadRequestError must match.
+BADREQUEST_CASES = [
+    (
+        "test_rank",
+        {
+            "r": 1024
+        },
+        "is greater than max_lora_rank",
+    ),
+    (
+        "test_bias",
+        {
+            "bias": "all"
+        },
+        "Adapter bias cannot be used without bias_enabled",
+    ),
+    ("test_dora", {
+        "use_dora": True
+    }, "does not yet support DoRA"),
+    (
+        "test_modules_to_save",
+        {
+            "modules_to_save": ["lm_head"]
+        },
+        "only supports modules_to_save being None",
+    ),
+]
+@pytest.fixture(scope="module")
+def zephyr_lora_files():
+    """Module-scoped fixture: the result of `snapshot_download` for
+    `LORA_NAME`, which the tests in this file use as the adapter path."""
+    return snapshot_download(repo_id=LORA_NAME)
+@pytest.fixture(scope="module")
+def server_with_lora_modules_json(zephyr_lora_files):
+    """Module-scoped fixture yielding a RemoteOpenAIServer for MODEL_NAME,
+    started with two JSON-described LoRA modules and with
+    VLLM_ALLOW_RUNTIME_LORA_UPDATING set."""
+    # Define the json format LoRA module configurations
+    lora_module_1 = {
+        "name": "zephyr-lora",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME
+    }
+    # second module points at the same files under a different name
+    lora_module_2 = {
+        "name": "zephyr-lora2",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME
+    }
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--lora-modules",
+        json.dumps(lora_module_1),
+        json.dumps(lora_module_2),
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "64",
+    ]
+    # Enable the /v1/load_lora_adapter endpoint
+    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server_with_lora_modules_json):
+    """Yield an async client obtained from the LoRA-enabled server; the
+    `async with` block closes it when the test is done."""
+    async with server_with_lora_modules_json.get_async_client(
+    ) as async_client:
+        yield async_client
+@pytest.mark.asyncio
+async def test_static_lora_lineage(client: openai.AsyncOpenAI,
+                                   zephyr_lora_files):
+    """Check id/root/parent of the base model and of the two LoRA modules
+    configured at server start, as reported by the models listing."""
+    models = await client.models.list()
+    models = models.data
+    # the first entry is expected to be the base model, the rest the LoRAs
+    served_model = models[0]
+    lora_models = models[1:]
+    assert served_model.id == MODEL_NAME
+    assert served_model.root == MODEL_NAME
+    assert served_model.parent is None
+    assert all(lora_model.root == zephyr_lora_files
+               for lora_model in lora_models)
+    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
+    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[1].id == "zephyr-lora2"
+@pytest.mark.asyncio
+async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI,
+                                    zephyr_lora_files):
+    """Load an adapter through `load_lora_adapter` and check that the last
+    entry of the models listing carries its id, root and parent."""
+    response = await client.post("load_lora_adapter",
+                                 cast_to=str,
+                                 body={
+                                     "lora_name": "zephyr-lora-3",
+                                     "lora_path": zephyr_lora_files
+                                 })
+    # Ensure adapter loads before querying /models
+    assert "success" in response
+    models = await client.models.list()
+    models = models.data
+    dynamic_lora_model = models[-1]
+    assert dynamic_lora_model.root == zephyr_lora_files
+    assert dynamic_lora_model.parent == MODEL_NAME
+    assert dynamic_lora_model.id == "zephyr-lora-3"
+@pytest.mark.asyncio
+async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
+    """Loading an adapter from the path "/not/an/adapter" must raise
+    `openai.NotFoundError`."""
+    with pytest.raises(openai.NotFoundError):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "notfound",
+                              "lora_path": "/not/an/adapter"
+                          })
+@pytest.mark.asyncio
+async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI,
+                                          tmp_path):
+    """Loading an adapter whose adapter_config.json is not valid JSON must
+    raise `openai.BadRequestError`."""
+    # build an adapter directory containing only a malformed config file
+    invalid_files = tmp_path / "invalid_files"
+    invalid_files.mkdir()
+    (invalid_files / "adapter_config.json").write_text("this is not json")
+    with pytest.raises(openai.BadRequestError):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "invalid-json",
+                              "lora_path": str(invalid_files)
+                          })
+@pytest.mark.asyncio
+@pytest.mark.parametrize("test_name,config_change,expected_error",
+                         BADREQUEST_CASES)
+async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
+                                        zephyr_lora_files, test_name: str,
+                                        config_change: dict,
+                                        expected_error: str):
+    """Copy the adapter files, apply `config_change` to the copy's
+    adapter_config.json, and expect loading it to raise a
+    `openai.BadRequestError` matching `expected_error`."""
+    # Create test directory
+    test_dir = tmp_path / test_name
+    # Copy adapter files
+    shutil.copytree(zephyr_lora_files, test_dir)
+    # Load and modify configuration
+    config_path = test_dir / "adapter_config.json"
+    with open(config_path) as f:
+        adapter_config = json.load(f)
+    # Apply configuration changes
+    adapter_config.update(config_change)
+    # Save modified configuration
+    with open(config_path, "w") as f:
+        json.dump(adapter_config, f)
+    # Test loading the adapter
+    with pytest.raises(openai.BadRequestError, match=expected_error):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": test_name,
+                              "lora_path": str(test_dir)
+                          })
+@pytest.mark.asyncio
+async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
+                                      zephyr_lora_files):
+    """Validate that many loras can be dynamically registered and inferenced
+    with concurrently"""
+    # This test file configures the server with --max-cpu-loras=2 and this test
+    # will concurrently load 10 adapters, so it should flex the LRU cache
+    async def load_and_run_adapter(adapter_name: str):
+        """Register one adapter, then send three completion requests to it."""
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": adapter_name,
+                              "lora_path": str(zephyr_lora_files)
+                          })
+        for _ in range(3):
+            await client.completions.create(
+                model=adapter_name,
+                prompt=["Hello there", "Foo bar bazz buzz"],
+                max_tokens=5,
+            )
+    lora_tasks = [
+        asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))
+        for i in range(10)
+    ]
+    # gather(return_exceptions=True) hands back each task's exception as a
+    # value, so the check below actually sees failures. asyncio.wait returns
+    # Task objects, which are never Exception instances, so the same check
+    # on its result could not fail.
+    results = await asyncio.gather(*lora_tasks, return_exceptions=True)
+    for r in results:
+        assert not isinstance(r, Exception), f"Got exception {r}"
+@pytest.mark.asyncio
+async def test_loading_invalid_adapters_does_not_break_others(
+        client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files):
+    """Keep sending completions to a known-good adapter while repeatedly
+    trying to load bad adapters, then check that no good request failed and
+    that a new valid adapter can still be loaded and used."""
+    invalid_files = tmp_path / "invalid_files"
+    invalid_files.mkdir()
+    (invalid_files / "adapter_config.json").write_text("this is not json")
+    stop_good_requests_event = asyncio.Event()
+    async def run_good_requests(client):
+        # Run chat completions requests until event set
+        results = []
+        while not stop_good_requests_event.is_set():
+            try:
+                batch = await client.completions.create(
+                    model="zephyr-lora",
+                    prompt=["Hello there", "Foo bar bazz buzz"],
+                    max_tokens=5,
+                )
+                results.append(batch)
+            except Exception as e:
+                # record the failure instead of raising so the final check
+                # can report it
+                results.append(e)
+        return results
+    # Create task to run good requests
+    good_task = asyncio.create_task(run_good_requests(client))
+    # Run a bunch of bad adapter loads
+    for _ in range(25):
+        with suppress(openai.NotFoundError):
+            await client.post("load_lora_adapter",
+                              cast_to=str,
+                              body={
+                                  "lora_name": "notfound",
+                                  "lora_path": "/not/an/adapter"
+                              })
+    for _ in range(25):
+        with suppress(openai.BadRequestError):
+            await client.post("load_lora_adapter",
+                              cast_to=str,
+                              body={
+                                  "lora_name": "invalid",
+                                  "lora_path": str(invalid_files)
+                              })
+    # Ensure all the running requests with lora adapters succeeded
+    stop_good_requests_event.set()
+    results = await good_task
+    for r in results:
+        assert not isinstance(r, Exception), f"Got exception {r}"
+    # Ensure we can load another adapter and run it
+    await client.post("load_lora_adapter",
+                      cast_to=str,
+                      body={
+                          "lora_name": "valid",
+                          "lora_path": zephyr_lora_files
+                      })
+    await client.completions.create(
+        model="valid",
+        prompt=["Hello there", "Foo bar bazz buzz"],
+        max_tokens=5,
+    )
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -17,6 +17,24 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+@pytest.fixture(scope="module", params=[True, False])
+def use_v1(request):
+    # Module-scoped variant of run_with_both_engines
+    #
+    # Use this fixture to run a test with both v0 and v1, and
+    # also to conditionalize the test logic e.g.
+    #
+    # def test_metrics_exist(use_v1, server, client):
+    #     ...
+    #     expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
+    #     for metric in expected:
+    #         assert metric in response.text
+    #
+    # @skip_v1 wouldn't work here because this is a module-level
+    # fixture - per-function decorators would have no effect
+    yield request.param
 @pytest.fixture(scope="module")
 def default_server_args():
    return [
@@ -37,10 +55,12 @@ def default_server_args():
                    "--enable-chunked-prefill",
                    "--disable-frontend-multiprocessing",
                ])
-def server(default_server_args, request):
+def server(use_v1, default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+    env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0')
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args,
+                            env_dict=env_dict) as remote_server:
        yield remote_server
@@ -85,7 +105,7 @@ EXPECTED_VALUES = {
 @pytest.mark.asyncio
 async def test_metrics_counts(server: RemoteOpenAIServer,
-                              client: openai.AsyncClient):
+                              client: openai.AsyncClient, use_v1: bool):
    for _ in range(_NUM_REQUESTS):
        # sending a request triggers the metrics to be logged.
        await client.completions.create(
@@ -99,6 +119,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
    # Loop over all expected metric_families
    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
+        if use_v1 and metric_family not in EXPECTED_METRICS_V1:
+            continue
        found_metric = False
        # Check to see if the metric_family is found in the prom endpoint.
@@ -175,10 +198,30 @@ EXPECTED_METRICS = [
    "swap_space_bytes",
 ]
+# Metric names that test_metrics_exist expects to find in the /metrics
+# response text when use_v1 is true. test_metrics_counts also skips any
+# metric family that is not listed here when use_v1 is true.
+EXPECTED_METRICS_V1 = [
+    "vllm:num_requests_running",
+    "vllm:num_requests_waiting",
+    "vllm:gpu_cache_usage_perc",
+    "vllm:prompt_tokens_total",
+    "vllm:generation_tokens_total",
+    "vllm:request_prompt_tokens_sum",
+    "vllm:request_prompt_tokens_bucket",
+    "vllm:request_prompt_tokens_count",
+    "vllm:request_generation_tokens_sum",
+    "vllm:request_generation_tokens_bucket",
+    "vllm:request_generation_tokens_count",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
+]
 @pytest.mark.asyncio
 async def test_metrics_exist(server: RemoteOpenAIServer,
-                             client: openai.AsyncClient):
+                             client: openai.AsyncClient, use_v1: bool):
    # sending a request triggers the metrics to be logged.
    await client.completions.create(model=MODEL_NAME,
                                    prompt="Hello, my name is",
@@ -188,11 +231,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK
-    for metric in EXPECTED_METRICS:
+    for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
        assert metric in response.text
-def test_metrics_exist_run_batch():
+def test_metrics_exist_run_batch(use_v1: bool):
+    if use_v1:
+        pytest.skip("Skipping test on vllm V1")
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501
    #base_url = "0.0.0.0"

--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
+import pytest
+import requests
+from vllm.entrypoints.openai.protocol import RerankResponse
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "BAAI/bge-reranker-base"
+@pytest.fixture(scope="module")
+def server():
+    """Yield a RemoteOpenAIServer for MODEL_NAME, shared across the module."""
+    # The small --max-model-len is what test_rerank_max_model_len exercises:
+    # it sends an over-long query and expects the request to be rejected.
+    args = ["--enforce-eager", "--max-model-len", "100"]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+# NOTE(review): this test is a plain `def` that uses blocking `requests`
+# calls, so the asyncio marker looks unnecessary here - confirm and drop it.
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
+    """POST one query and two documents to the rerank endpoint.
+
+    The response must validate as a RerankResponse and contain two results:
+    the first with a relevance score of at least 0.9 and the second with a
+    score of at most 0.01.
+    """
+    query = "What is the capital of France?"
+    documents = [
+        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+    ]
+    rerank_response = requests.post(server.url_for("rerank"),
+                                    json={
+                                        "model": model_name,
+                                        "query": query,
+                                        "documents": documents,
+                                    })
+    rerank_response.raise_for_status()
+    rerank = RerankResponse.model_validate(rerank_response.json())
+    assert rerank.id is not None
+    assert rerank.results is not None
+    assert len(rerank.results) == 2
+    # presumably results come back ordered by descending relevance, since the
+    # matching document is second in the input - verify against the server.
+    assert rerank.results[0].relevance_score >= 0.9
+    assert rerank.results[1].relevance_score <= 0.01
+# NOTE(review): this test is a plain `def` that uses blocking `requests`
+# calls, so the asyncio marker looks unnecessary here - confirm and drop it.
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_top_n(server: RemoteOpenAIServer, model_name: str):
+    """POST three documents with "top_n": 2 and expect exactly two results.
+
+    The score thresholds are the same as in test_rerank_texts: at least 0.9
+    for the first result and at most 0.01 for the second.
+    """
+    query = "What is the capital of France?"
+    documents = [
+        "The capital of Brazil is Brasilia.",
+        "The capital of France is Paris.", "Cross-encoder models are neat"
+    ]
+    rerank_response = requests.post(server.url_for("rerank"),
+                                    json={
+                                        "model": model_name,
+                                        "query": query,
+                                        "documents": documents,
+                                        "top_n": 2
+                                    })
+    rerank_response.raise_for_status()
+    rerank = RerankResponse.model_validate(rerank_response.json())
+    assert rerank.id is not None
+    assert rerank.results is not None
+    # Three documents were sent, but only two results are expected back.
+    assert len(rerank.results) == 2
+    assert rerank.results[0].relevance_score >= 0.9
+    assert rerank.results[1].relevance_score <= 0.01
+# NOTE(review): this test is a plain `def` that uses blocking `requests`
+# calls, so the asyncio marker looks unnecessary here - confirm and drop it.
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
+    """Expect HTTP 400 when the rerank query is far too long.
+
+    The server fixture is started with --max-model-len 100, and the query
+    here is the same sentence repeated 100 times.
+    """
+    query = "What is the capital of France?" * 100
+    documents = [
+        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+    ]
+    rerank_response = requests.post(server.url_for("rerank"),
+                                    json={
+                                        "model": model_name,
+                                        "query": query,
+                                        "documents": documents
+                                    })
+    assert rerank_response.status_code == 400
+    # Assert just a small fragment of the response
+    assert "Please reduce the length of the input." in \
+        rerank_response.text
\ No newline at end of file
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
+import json
 import subprocess
 import sys
 import os
@@ -39,6 +40,9 @@ INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "
 {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
 {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
+# Batch input for test_score: two /v1/score requests, one JSON object per
+# line, each scoring one text_1 query against a two-element text_2 list.
+INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
 def test_empty_file():
    with tempfile.NamedTemporaryFile(
@@ -120,3 +124,36 @@ def test_embeddings():
            # Ensure that the output format conforms to the openai api.
            # Validation should throw if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)
+def test_score():
+    """Run the run_batch entrypoint on INPUT_SCORE_BATCH and check the output.
+
+    The subprocess must exit with return code 0. Every line of the output
+    file must validate as a BatchRequestOutput and carry "error": null.
+    """
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write(INPUT_SCORE_BATCH)
+        # Flush so the subprocess, which opens the file by name, sees the
+        # data written above.
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.run_batch",
+            "-i",
+            input_file.name,
+            "-o",
+            output_file.name,
+            "--model",
+            "BAAI/bge-reranker-v2-m3",
+        ], )
+        # Block until the batch run has finished before reading its output.
+        proc.communicate()
+        proc.wait()
+        assert proc.returncode == 0, f"{proc=}"
+        contents = output_file.read()
+        for line in contents.strip().split("\n"):
+            # Ensure that the output format conforms to the openai api.
+            # Validation should throw if the schema is wrong.
+            BatchRequestOutput.model_validate_json(line)
+            # Ensure that there is no error in the response.
+            line_dict = json.loads(line)
+            assert isinstance(line_dict, dict)
+            assert line_dict["error"] is None
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -11,9 +11,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3")
 @pytest.fixture(scope="module")
 def server():
-    args = [
+    args = ["--enforce-eager", "--max-model-len", "100"]
-        "--enforce-eager",
-    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@@ -21,8 +19,7 @@ def server():
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
+def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str):
-                                      model_name: str):
    text_1 = "What is the capital of France?"
    text_2 = [
        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
@@ -46,8 +43,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
+def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str):
-                                       model_name: str):
    text_1 = [
        "What is the capital of the United States?",
        "What is the capital of France?"
@@ -74,8 +70,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
+def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str):
-                                     model_name: str):
    text_1 = "What is the capital of France?"
    text_2 = "The capital of France is Paris."
@@ -92,3 +87,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
    assert score.data is not None
    assert len(score.data) == 1
    assert score.data[0].score >= 0.9
+# NOTE(review): this test is a plain `def` that uses blocking `requests`
+# calls, so the asyncio marker looks unnecessary here - confirm and drop it.
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str):
+    """Expect HTTP 400 for an over-long input and for an oversized truncation.
+
+    The server fixture is started with --max-model-len 100. The first request
+    sends text_1 repeated 20 times. The second repeats it with
+    "truncate_prompt_tokens": 101, one above that limit.
+    """
+    text_1 = "What is the capital of France?" * 20
+    text_2 = [
+        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+    ]
+    score_response = requests.post(server.url_for("score"),
+                                   json={
+                                       "model": model_name,
+                                       "text_1": text_1,
+                                       "text_2": text_2,
+                                   })
+    assert score_response.status_code == 400
+    # Assert just a small fragment of the response
+    assert "Please reduce the length of the input." in \
+        score_response.text
+    # Test truncation
+    score_response = requests.post(server.url_for("score"),
+                                   json={
+                                       "model": model_name,
+                                       "text_1": text_1,
+                                       "text_2": text_2,
+                                       "truncate_prompt_tokens": 101
+                                   })
+    assert score_response.status_code == 400
+    assert "Please, select a smaller truncation size." in \
+        score_response.text
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -8,7 +8,8 @@ from vllm.config import MultiModalConfig
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from ...utils import models_path_prefix
@@ -34,6 +35,8 @@ class MockModelConfig:
    hf_config = MockHFConfig()
    logits_processor_pattern = None
    diff_sampling_param: Optional[dict] = None
+    allowed_local_media_path: str = ""
+    encoder_config = None
    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
@@ -50,14 +53,13 @@ async def _async_serving_chat_init():
    engine = MockEngine()
    model_config = await engine.get_model_config()
+    models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS)
    serving_completion = OpenAIServingChat(engine,
                                           model_config,
-                                           BASE_MODEL_PATHS,
+                                           models,
                                           response_role="assistant",
                                           chat_template=CHAT_TEMPLATE,
                                           chat_template_content_format="auto",
-                                           lora_modules=None,
-                                           prompt_adapters=None,
                                           request_logger=None)
    return serving_completion
@@ -72,14 +74,15 @@ def test_serving_chat_should_set_correct_max_tokens():
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=MockModelConfig())
    serving_chat = OpenAIServingChat(mock_engine,
                                     MockModelConfig(),
-                                     BASE_MODEL_PATHS,
+                                     models,
                                     response_role="assistant",
                                     chat_template=CHAT_TEMPLATE,
                                     chat_template_content_format="auto",
-                                     lora_modules=None,
-                                     prompt_adapters=None,
                                     request_logger=None)
    req = ChatCompletionRequest(
        model=MODEL_NAME,
@@ -101,6 +104,116 @@ def test_serving_chat_should_set_correct_max_tokens():
    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+    # Setting server's max_tokens in the generation_config.json
+    # lower than context_window - prompt_tokens
+    mock_model_config = MockModelConfig()
+    mock_model_config.diff_sampling_param = {
+        "max_tokens": 10  # Setting server-side max_tokens limit
+    }
+    # Reinitialize the engine with new settings
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+    # Test Case 1: No max_tokens specified in request
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+        guided_decoding_backend="outlines",
+    )
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+    # Test Case 2: Request's max_tokens set higher than server accepts
+    req.max_tokens = 15
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+    # Test Case 3: Request's max_tokens set lower than server accepts
+    req.max_tokens = 5
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+    assert mock_engine.generate.call_args.args[1].max_tokens == 5
+    # Setting server's max_tokens in the generation_config.json
+    # higher than context_window - prompt_tokens
+    mock_model_config = MockModelConfig()
+    mock_model_config.diff_sampling_param = {
+        "max_tokens": 200  # Setting server-side max_tokens limit
+    }
+    # Reinitialize the engine with new settings
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+    # Test case 1: No max_tokens specified, defaults to context_window
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+        guided_decoding_backend="outlines",
+    )
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+    assert mock_engine.generate.call_args.args[1].max_tokens == 93
+    # Test Case 2: Request's max_tokens set higher than server accepts
+    req.max_tokens = 100
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+    assert mock_engine.generate.call_args.args[1].max_tokens == 93
+    # Test Case 3: Request's max_tokens set lower than server accepts
+    req.max_tokens = 5
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+    assert mock_engine.generate.call_args.args[1].max_tokens == 5
 def test_serving_chat_could_load_correct_generation_config():
@@ -115,14 +228,15 @@ def test_serving_chat_could_load_correct_generation_config():
    mock_engine.errored = False
    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
    serving_chat = OpenAIServingChat(mock_engine,
                                     mock_model_config,
-                                     BASE_MODEL_PATHS,
+                                     models,
                                     response_role="assistant",
                                     chat_template=CHAT_TEMPLATE,
                                     chat_template_content_format="auto",
-                                     lora_modules=None,
-                                     prompt_adapters=None,
                                     request_logger=None)
    req = ChatCompletionRequest(
        model=MODEL_NAME,

--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_engine.py
@@ -9,8 +9,8 @@ from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                              LoadLoraAdapterRequest,
                                              UnloadLoraAdapterRequest)
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.lora.request import LoRARequest
 from ...utils import models_path_prefix
@@ -22,47 +22,48 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
    "Success: LoRA adapter '{lora_name}' removed successfully.")
-async def _async_serving_engine_init():
+async def _async_serving_models_init() -> OpenAIServingModels:
-    mock_engine_client = MagicMock(spec=EngineClient)
    mock_model_config = MagicMock(spec=ModelConfig)
+    mock_engine_client = MagicMock(spec=EngineClient)
    # Set the max_model_len attribute to avoid missing attribute
    mock_model_config.max_model_len = 2048
-    serving_engine = OpenAIServing(mock_engine_client,
+    serving_models = OpenAIServingModels(engine_client=mock_engine_client,
-                                   mock_model_config,
+                                         base_model_paths=BASE_MODEL_PATHS,
-                                   BASE_MODEL_PATHS,
+                                         model_config=mock_model_config,
-                                   lora_modules=None,
+                                         lora_modules=None,
-                                   prompt_adapters=None,
+                                         prompt_adapters=None)
-                                   request_logger=None)
+    await serving_models.init_static_loras()
-    return serving_engine
+    return serving_models
 @pytest.mark.asyncio
 async def test_serving_model_name():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
-    assert serving_engine._get_model_name(None) == MODEL_NAME
+    assert serving_models.model_name(None) == MODEL_NAME
    request = LoRARequest(lora_name="adapter",
                          lora_path="/path/to/adapter2",
                          lora_int_id=1)
-    assert serving_engine._get_model_name(request) == request.lora_name
+    assert serving_models.model_name(request) == request.lora_name
 @pytest.mark.asyncio
 async def test_load_lora_adapter_success():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
    request = LoadLoraAdapterRequest(lora_name="adapter",
                                     lora_path="/path/to/adapter2")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
-    assert len(serving_engine.lora_requests) == 1
+    assert len(serving_models.lora_requests) == 1
-    assert serving_engine.lora_requests[0].lora_name == "adapter"
+    assert serving_models.lora_requests[0].lora_name == "adapter"
 @pytest.mark.asyncio
 async def test_load_lora_adapter_missing_fields():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
    request = LoadLoraAdapterRequest(lora_name="", lora_path="")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "InvalidUserInput"
    assert response.code == HTTPStatus.BAD_REQUEST
@@ -70,43 +71,43 @@ async def test_load_lora_adapter_missing_fields():
 @pytest.mark.asyncio
 async def test_load_lora_adapter_duplicate():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
    request = LoadLoraAdapterRequest(lora_name="adapter1",
                                     lora_path="/path/to/adapter1")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
        lora_name='adapter1')
-    assert len(serving_engine.lora_requests) == 1
+    assert len(serving_models.lora_requests) == 1
    request = LoadLoraAdapterRequest(lora_name="adapter1",
                                     lora_path="/path/to/adapter1")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "InvalidUserInput"
    assert response.code == HTTPStatus.BAD_REQUEST
-    assert len(serving_engine.lora_requests) == 1
+    assert len(serving_models.lora_requests) == 1
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_success():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
    request = LoadLoraAdapterRequest(lora_name="adapter1",
                                     lora_path="/path/to/adapter1")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
-    assert len(serving_engine.lora_requests) == 1
+    assert len(serving_models.lora_requests) == 1
    request = UnloadLoraAdapterRequest(lora_name="adapter1")
-    response = await serving_engine.unload_lora_adapter(request)
+    response = await serving_models.unload_lora_adapter(request)
    assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
        lora_name='adapter1')
-    assert len(serving_engine.lora_requests) == 0
+    assert len(serving_models.lora_requests) == 0
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_missing_fields():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
    request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
-    response = await serving_engine.unload_lora_adapter(request)
+    response = await serving_models.unload_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "InvalidUserInput"
    assert response.code == HTTPStatus.BAD_REQUEST
@@ -114,9 +115,9 @@ async def test_unload_lora_adapter_missing_fields():
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_not_found():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
    request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
-    response = await serving_engine.unload_lora_adapter(request)
+    response = await serving_models.unload_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
-    assert response.type == "InvalidUserInput"
+    assert response.type == "NotFoundError"
-    assert response.code == HTTPStatus.BAD_REQUEST
+    assert response.code == HTTPStatus.NOT_FOUND
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
-import json
-import os
 import openai
 import pytest
@@ -10,16 +7,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
 @pytest.mark.asyncio
-async def test_shutdown_on_engine_failure(tmp_path):
+async def test_shutdown_on_engine_failure():
-    # Use a bad adapter to crash the engine
-    # (This test will fail when that bug is fixed)
-    adapter_path = tmp_path / "bad_adapter"
-    os.mkdir(adapter_path)
-    with open(adapter_path / "adapter_model_config.json", "w") as f:
-        json.dump({"not": "real"}, f)
-    with open(adapter_path / "adapter_model.safetensors", "wb") as f:
-        f.write(b"this is fake")
    # dtype, max-len etc set so that this can run in CI
    args = [
        "--dtype",
@@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path):
        "--enforce-eager",
        "--max-num-seqs",
        "128",
-        "--enable-lora",
-        "--lora-modules",
-        f"bad-adapter={tmp_path / 'bad_adapter'}",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -39,9 +24,13 @@ async def test_shutdown_on_engine_failure(tmp_path):
            with pytest.raises(
                (openai.APIConnectionError, openai.InternalServerError)):
-                # This crashes the engine
+                # Asking for lots of prompt logprobs will currently crash the
-                await client.completions.create(model="bad-adapter",
+                # engine. This may change in the future when that bug is fixed
-                                                prompt="Hello, my name is")
+                prompt = "Hello " * 4000
+                await client.completions.create(
+                    model=MODEL_NAME,
+                    prompt=prompt,
+                    extra_body={"prompt_logprobs": 10})
            # Now the server should shut down
            return_code = remote_server.proc.wait(timeout=8)

--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -99,5 +99,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 3072
    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 765
+    assert embeddings.usage.prompt_tokens == 764
-    assert embeddings.usage.total_tokens == 765
+    assert embeddings.usage.total_tokens == 764
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -3,7 +3,6 @@ from typing import Optional
 import pytest
 import os
-from PIL import Image
 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
@@ -93,10 +92,7 @@ def _assert_mm_data_is_image_input(
    image_data = mm_data.get("image")
    assert image_data is not None
-    if image_count == 1:
+    assert isinstance(image_data, list) and len(image_data) == image_count
-        assert isinstance(image_data, Image.Image)
-    else:
-        assert isinstance(image_data, list) and len(image_data) == image_count
 def test_parse_chat_messages_single_image(
@@ -760,10 +756,12 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     ("template_chatglm.jinja", "string"),
     ("template_chatglm2.jinja", "string"),
     ("template_chatml.jinja", "string"),
+     ("template_deepseek_vl2.jinja", "string"),
     ("template_falcon_180b.jinja", "string"),
     ("template_falcon.jinja", "string"),
     ("template_inkbot.jinja", "string"),
     ("template_llava.jinja", "string"),
+     ("template_pixtral_hf.jinja", "openai"),
     ("template_vlm2vec.jinja", "openai"),
     ("tool_chat_template_granite_20b_fc.jinja", "string"),
     ("tool_chat_template_hermes.jinja", "string"),