Merge tag 'v0.6.5' into v0.6.5-dev

4d3a2c28 · zhuwenwen · 92ec5d8e · 2d1b9baa · 4d3a2c28 · 4d3a2c28
Commit 4d3a2c28 authored Dec 30, 2024 by zhuwenwen
20 changed files
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
+from typing import List
+import os
+import pytest
+from vllm import LLM
+from ..openai.test_vision import TEST_IMAGE_URLS
+from ...utils import models_path_prefix
+def test_chat():
+    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
+    prompt1 = "Explain the concept of entropy."
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt1
+        },
+    ]
+    outputs = llm.chat(messages)
+    assert len(outputs) == 1
+def test_multi_chat():
+    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
+    prompt1 = "Explain the concept of entropy."
+    prompt2 = "Explain what among us is."
+    conversation1 = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt1
+        },
+    ]
+    conversation2 = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt2
+        },
+    ]
+    messages = [conversation1, conversation2]
+    outputs = llm.chat(messages)
+    assert len(outputs) == 2
+@pytest.mark.parametrize("image_urls",
+                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+def test_chat_multi_image(image_urls: List[str]):
+    llm = LLM(
+        model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
+        dtype="bfloat16",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 2},
+    )
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            } for image_url in image_urls),
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    outputs = llm.chat(messages)
+    assert len(outputs) >= 0
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -4,9 +4,8 @@ from typing import List
 import pytest
 import os
-from vllm import LLM, EmbeddingRequestOutput, PoolingParams
+from vllm import LLM, PoolingParams, PoolingRequestOutput
+from vllm.distributed import cleanup_dist_env_and_memory
-from ...conftest import cleanup
 from ...utils import models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
@@ -43,29 +42,14 @@ def llm():
        del llm
-    cleanup()
+    cleanup_dist_env_and_memory()
-def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
+def assert_outputs_equal(o1: List[PoolingRequestOutput],
-                         o2: List[EmbeddingRequestOutput]):
+                         o2: List[PoolingRequestOutput]):
    assert [o.outputs for o in o1] == [o.outputs for o in o2]
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize('prompt', PROMPTS)
-def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
-    pooling_params = PoolingParams()
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.encode(prompts=prompt, pooling_params=pooling_params)
-    v2_output = llm.encode(prompt, pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.encode({"prompt": prompt}, pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
 def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
@@ -81,25 +65,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
    assert_outputs_equal(v1_output, v2_output)
-@pytest.mark.skip_global_cleanup
-def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
-    pooling_params = PoolingParams()
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.encode(prompts=PROMPTS, pooling_params=pooling_params)
-    v2_output = llm.encode(PROMPTS, pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.encode(
-        [{
-            "prompt": p
-        } for p in PROMPTS],
-        pooling_params=pooling_params,
-    )
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
    pooling_params = PoolingParams()

--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -5,9 +5,7 @@ import os
 import pytest
 from vllm import LLM, RequestOutput, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
-from ...conftest import cleanup
-from ..openai.test_vision import TEST_IMAGE_URLS
 from ...utils import models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m")
@@ -42,30 +40,13 @@ def llm():
        del llm
-    cleanup()
+    cleanup_dist_env_and_memory()
 def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
    assert [o.outputs for o in o1] == [o.outputs for o in o2]
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize('prompt', PROMPTS)
-def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
-    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.generate(prompts=prompt,
-                                 sampling_params=sampling_params)
-    v2_output = llm.generate(prompt, sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.generate({"prompt": prompt},
-                             sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
 def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
@@ -81,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
    assert_outputs_equal(v1_output, v2_output)
-@pytest.mark.skip_global_cleanup
-def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
-    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.generate(prompts=PROMPTS,
-                                 sampling_params=sampling_params)
-    v2_output = llm.generate(PROMPTS, sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.generate(
-        [{
-            "prompt": p
-        } for p in PROMPTS],
-        sampling_params=sampling_params,
-    )
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
@@ -143,90 +104,3 @@ def test_multiple_sampling_params(llm: LLM):
    # sampling_params is None, default params should be applied
    outputs = llm.generate(PROMPTS, sampling_params=None)
    assert len(PROMPTS) == len(outputs)
-def test_chat():
-    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"))
-    prompt1 = "Explain the concept of entropy."
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
-    ]
-    outputs = llm.chat(messages)
-    assert len(outputs) == 1
-def test_multi_chat():
-    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"))
-    prompt1 = "Explain the concept of entropy."
-    prompt2 = "Explain what among us is."
-    conversation1 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
-    ]
-    conversation2 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt2
-        },
-    ]
-    messages = [conversation1, conversation2]
-    outputs = llm.chat(messages)
-    assert len(outputs) == 2
-@pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
-def test_chat_multi_image(image_urls: List[str]):
-    llm = LLM(
-        model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
-        dtype="bfloat16",
-        max_model_len=4096,
-        max_num_seqs=5,
-        enforce_eager=True,
-        trust_remote_code=True,
-        limit_mm_per_prompt={"image": 2},
-    )
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            } for image_url in image_urls),
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
-    outputs = llm.chat(messages)
-    assert len(outputs) >= 0
--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -6,9 +6,8 @@ import os
 from huggingface_hub import snapshot_download
 from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
-from ...conftest import cleanup
 from ...utils import models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
@@ -41,7 +40,7 @@ def llm():
        del llm
-    cleanup()
+    cleanup_dist_env_and_memory()
 @pytest.fixture(scope="module")

--- a/tests/entrypoints/llm/test_gpu_utilization.py
+++ b/tests/entrypoints/llm/test_gpu_utilization.py
+import os
+from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix
+def test_gpu_memory_utilization():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    # makes sure gpu_memory_utilization is per-instance limit,
+    # not a global limit
+    llms = [
+        LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
+            gpu_memory_utilization=0.3,
+            enforce_eager=True) for i in range(3)
+    ]
+    for llm in llms:
+        outputs = llm.generate(prompts, sampling_params)
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -6,11 +6,11 @@ import jsonschema
 import pytest
 import os
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
-from vllm.sampling_params import SamplingParams
-from ...conftest import cleanup
+from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from ...utils import models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
@@ -25,7 +25,7 @@ def llm():
    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)
        del llm
-    cleanup()
+    cleanup_dist_env_and_memory()
 @pytest.mark.skip_global_cleanup
@@ -33,14 +33,12 @@ def test_guided_regex(sample_regex, llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-    )
+        guided_decoding=GuidedDecodingParams(regex=sample_regex))
-    outputs = llm.generate(
+    outputs = llm.generate(prompts=[
-        prompts=[
+        f"Give an example IPv4 address with this regex: {sample_regex}"
-            f"Give an example IPv4 address with this regex: {sample_regex}"
+    ] * 2,
-        ] * 2,
+                           sampling_params=sampling_params,
-        sampling_params=sampling_params,
+                           use_tqdm=True)
-        use_tqdm=True,
-        guided_options_request=dict(guided_regex=sample_regex))
    assert outputs is not None
    for output in outputs:
@@ -59,15 +57,13 @@ def test_guided_json_completion(sample_json_schema, llm):
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
-    )
+        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
-    outputs = llm.generate(
+    outputs = llm.generate(prompts=[
-        prompts=[
+        f"Give an example JSON for an employee profile "
-            f"Give an example JSON for an employee profile "
+        f"that fits this schema: {sample_json_schema}"
-            f"that fits this schema: {sample_json_schema}"
+    ] * 2,
-        ] * 2,
+                           sampling_params=sampling_params,
-        sampling_params=sampling_params,
+                           use_tqdm=True)
-        use_tqdm=True,
-        guided_options_request=dict(guided_json=sample_json_schema))
    assert outputs is not None
@@ -83,17 +79,72 @@ def test_guided_json_completion(sample_json_schema, llm):
        jsonschema.validate(instance=output_json, schema=sample_json_schema)
+@pytest.mark.skip_global_cleanup
+def test_guided_complex_json_completion(sample_complex_json_schema, llm):
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(json=sample_complex_json_schema))
+    outputs = llm.generate(prompts=[
+        f"Give an example JSON for an assignment grade "
+        f"that fits this schema: {sample_complex_json_schema}"
+    ] * 2,
+                           sampling_params=sampling_params,
+                           use_tqdm=True)
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        output_json = json.loads(generated_text)
+        jsonschema.validate(instance=output_json,
+                            schema=sample_complex_json_schema)
+@pytest.mark.skip_global_cleanup
+def test_guided_definition_json_completion(sample_definition_json_schema, llm):
+    sampling_params = SamplingParams(temperature=1.0,
+                                     max_tokens=1000,
+                                     guided_decoding=GuidedDecodingParams(
+                                         json=sample_definition_json_schema))
+    outputs = llm.generate(prompts=[
+        f"Give an example JSON for solving 8x + 7 = -23 "
+        f"that fits this schema: {sample_definition_json_schema}"
+    ] * 2,
+                           sampling_params=sampling_params,
+                           use_tqdm=True)
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        output_json = json.loads(generated_text)
+        jsonschema.validate(instance=output_json,
+                            schema=sample_definition_json_schema)
 @pytest.mark.skip_global_cleanup
 def test_guided_choice_completion(sample_guided_choice, llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-    )
+        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
    outputs = llm.generate(
        prompts="The best language for type-safe systems programming is ",
        sampling_params=sampling_params,
-        use_tqdm=True,
+        use_tqdm=True)
-        guided_options_request=dict(guided_choice=sample_guided_choice))
    assert outputs is not None
    for output in outputs:
@@ -114,13 +165,13 @@ def test_guided_grammar(sample_sql_statements, llm):
        temperature=0.8,
        top_p=0.95,
        max_tokens=1000,
-    )
+        guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
    outputs = llm.generate(
        prompts=("Generate a sql state that select col_1 from "
                 "table_1 where it is equals to 1"),
        sampling_params=sampling_params,
        use_tqdm=True,
-        guided_options_request=dict(guided_grammar=sample_sql_statements))
+    )
    assert outputs is not None
    for output in outputs:
@@ -142,3 +193,55 @@ def test_guided_grammar(sample_sql_statements, llm):
        assert generated_text.strip() == ground_truth
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+@pytest.mark.skip_global_cleanup
+def test_guided_options_request_deprecation_warning(sample_regex, llm):
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    with pytest.warns(DeprecationWarning, match="guided_options_request"):
+        llm.generate(prompts="This should fail",
+                     sampling_params=sampling_params,
+                     use_tqdm=True,
+                     guided_options_request=dict(guided_regex=sample_regex))
+@pytest.mark.skip_global_cleanup
+def test_validation_against_both_guided_decoding_options(sample_regex, llm):
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        guided_decoding=GuidedDecodingParams(regex=sample_regex))
+    with pytest.raises(ValueError, match="Cannot set both"):
+        llm.generate(prompts="This should fail",
+                     sampling_params=sampling_params,
+                     use_tqdm=True,
+                     guided_options_request=dict(guided_regex=sample_regex))
+@pytest.mark.skip_global_cleanup
+def test_guided_json_object(llm):
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=100,
+        guided_decoding=GuidedDecodingParams(json_object=True))
+    outputs = llm.generate(
+        prompts=("Generate a JSON object describing a person with name "
+                 "and age for John Smith who is 31 years old."),
+        sampling_params=sampling_params,
+        use_tqdm=True)
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        generated_text = output.outputs[0].text
+        print(generated_text)
+        assert generated_text is not None
+        # Parse to verify it is valid JSON
+        parsed_json = json.loads(generated_text)
+        assert isinstance(parsed_json, dict)
--- a/tests/entrypoints/llm/test_init.py
+++ b/tests/entrypoints/llm/test_init.py
+import os
+import pytest
+from vllm import LLM
+from ...utils import error_on_warning, models_path_prefix
+MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m")
+def test_pos_args_deprecated():
+    with error_on_warning(DeprecationWarning):
+        LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)
+    with error_on_warning(DeprecationWarning):
+        LLM(MODEL_NAME, tokenizer=MODEL_NAME)
+    with pytest.warns(DeprecationWarning, match="'tokenizer'"):
+        LLM(MODEL_NAME, MODEL_NAME)
+    with pytest.warns(DeprecationWarning,
+                      match="'tokenizer', 'tokenizer_mode'"):
+        LLM(MODEL_NAME, MODEL_NAME, "auto")
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
 import sys
 import os
+from contextlib import nullcontext
+from vllm_test_utils import BlameResult, blame
 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 from ...utils import models_path_prefix
-def test_lazy_outlines(sample_regex):
+def run_normal():
-    """If users don't use guided decoding, outlines should not be imported.
-    """
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
@@ -16,6 +18,7 @@ def test_lazy_outlines(sample_regex):
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    # Create an LLM without guided decoding as a baseline.
    llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
              enforce_eager=True,
              gpu_memory_utilization=0.3)
@@ -25,9 +28,13 @@ def test_lazy_outlines(sample_regex):
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    # make sure outlines is not imported
+    # Destroy the LLM object and free up the GPU memory.
-    assert 'outlines' not in sys.modules
+    del llm
+    cleanup_dist_env_and_memory()
+def run_lmfe(sample_regex):
+    # Create an LLM with guided decoding enabled.
    llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
              enforce_eager=True,
              guided_decoding_backend="lm-format-enforcer",
@@ -46,5 +53,26 @@ def test_lazy_outlines(sample_regex):
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+def test_lazy_outlines(sample_regex):
+    """If users don't use guided decoding, outlines should not be imported.
+    """
    # make sure outlines is not imported
-    assert 'outlines' not in sys.modules
+    module_name = "outlines"
+    # In CI, we only check finally if the module is imported.
+    # If it is indeed imported, we can rerun the test with `use_blame=True`,
+    # which will trace every function call to find the first import location,
+    # and help find the root cause.
+    # We don't run it in CI by default because it is slow.
+    use_blame = False
+    context = blame(
+        lambda: module_name in sys.modules) if use_blame else nullcontext()
+    with context as result:
+        run_normal()
+        run_lmfe(sample_regex)
+    if use_blame:
+        assert isinstance(result, BlameResult)
+        print(f"the first import location is:\n{result.trace_stack}")
+    assert module_name not in sys.modules, (
+        f"Module {module_name} is imported. To see the first"
+        f" import location, run the test with `use_blame=True`.")
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -5,7 +5,22 @@ from vllm import LLM
 from ...utils import models_path_prefix
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
 def test_empty_prompt():
-    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"))
+    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), enforce_eager=True)
    with pytest.raises(ValueError, match='Prompt cannot be empty'):
        llm.generate([""])
+@pytest.mark.skip_v1
+def test_out_of_vocab_token():
+    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), enforce_eager=True)
+    with pytest.raises(ValueError, match='out of vocabulary'):
+        llm.generate({"prompt_token_ids": [999999]})
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
 """Tests for HF_HUB_OFFLINE mode"""
 import importlib
 import sys
-import weakref
 import os
 import pytest
 from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory
-from ...conftest import cleanup
 from ...utils import models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m")
-@pytest.fixture(scope="module")
+MODEL_CONFIGS = [
-def llm():
+    {
-    # pytest caches the fixture so we use weakref.proxy to
+        "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
-    # enable garbage collection
+        "enforce_eager": True,
-    llm = LLM(model=MODEL_NAME,
+        "gpu_memory_utilization": 0.20,
-              max_num_batched_tokens=4096,
+        "max_model_len": 64,
-              tensor_parallel_size=1,
+        "max_num_batched_tokens": 64,
-              gpu_memory_utilization=0.10,
+        "max_num_seqs": 64,
-              enforce_eager=True)
+        "tensor_parallel_size": 1,
+    },
+    {
+        "model":  os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1"),
+        "enforce_eager": True,
+        "gpu_memory_utilization": 0.95,
+        "max_model_len": 64,
+        "max_num_batched_tokens": 64,
+        "max_num_seqs": 64,
+        "tensor_parallel_size": 1,
+        "tokenizer_mode": "mistral",
+    },
+]
-    with llm.deprecate_legacy_api():
-        yield weakref.proxy(llm)
-        del llm
+@pytest.fixture(scope="module")
+def cache_models():
+    # Cache model files first
+    for model_config in MODEL_CONFIGS:
+        LLM(**model_config)
+        cleanup_dist_env_and_memory()
-    cleanup()
+    yield
 @pytest.mark.skip_global_cleanup
-def test_offline_mode(llm: LLM, monkeypatch):
+@pytest.mark.usefixtures("cache_models")
-    # we use the llm fixture to ensure the model files are in-cache
+def test_offline_mode(monkeypatch):
-    del llm
    # Set HF to offline mode and ensure we can still construct an LLM
    try:
        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
        # Need to re-import huggingface_hub and friends to setup offline mode
        _re_import_modules()
        # Cached model files should be used in offline mode
-        LLM(model=MODEL_NAME,
+        for model_config in MODEL_CONFIGS:
-            max_num_batched_tokens=4096,
+            LLM(**model_config)
-            tensor_parallel_size=1,
-            gpu_memory_utilization=0.10,
-            enforce_eager=True)
    finally:
        # Reset the environment after the test
        # NB: Assuming tests are run in online mode

--- a/tests/entrypoints/openai/test_accuracy.py
+++ b/tests/entrypoints/openai/test_accuracy.py
@@ -11,6 +11,8 @@ import lm_eval
 import pytest
 import os
+from vllm.platforms import current_platform
 from ...utils import RemoteOpenAIServer, models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
@@ -19,22 +21,33 @@ TASK = "gsm8k"
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
 EXPECTED_VALUE = 0.58
-DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
+DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"]
 MORE_ARGS_LIST = [
+    [],  # Default
    ["--enable-chunked-prefill"],  # Chunked
    ["--num-scheduler-steps", "8"],  # MS
    ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"]  # MS+Stream
 ]
+MAX_WAIT_SECONDS = None
+if current_platform.is_tpu():
+    MORE_ARGS_LIST = [
+        [],  # Default
+        # ["--num-scheduler-steps", "8"], # Multi-step << currently fails
+    ]
+    MAX_WAIT_SECONDS = 600
+def run_test(more_args):
+    """Run the end to end accuracy test."""
-@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
-def test_lm_eval_accuracy(more_args):
    args = list(DEFAULT_ARGS)
    args.extend(more_args)
    print(f"Running with: {args}")
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(
+            MODEL_NAME, args,
+            max_wait_seconds=MAX_WAIT_SECONDS) as remote_server:
        url = f"{remote_server.url_for('v1')}/completions"
        model_args = (
@@ -52,3 +65,22 @@ def test_lm_eval_accuracy(more_args):
        assert (measured_value - RTOL < EXPECTED_VALUE
                and measured_value + RTOL > EXPECTED_VALUE
                ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
+@pytest.mark.skipif(not current_platform.is_cuda(),
+                    reason="V1 currently only supported on CUDA")
+def test_lm_eval_accuracy_v1_engine(monkeypatch):
+    """Run with the V1 Engine."""
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        run_test([])
+@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
+def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
+    """Run with the V0 Engine."""
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        run_test(more_args)
--- a/tests/entrypoints/openai/test_async_tokenization.py
+++ b/tests/entrypoints/openai/test_async_tokenization.py
+import asyncio
+import contextlib
+import random
+import time
+from typing import Callable
+import os
+import openai
+import pytest
+import pytest_asyncio
+import requests
+from tests.utils import RemoteOpenAIServer
+from ...utils import models_path_prefix
+MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
+@pytest.fixture(scope="module")
+def server():  # noqa: F811
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "128",
+        "--load-format",
+        "dummy",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ids=["completion", "chat"],
+    argnames=["create_func_gen", "content_body"],
+    argvalues=[
+        (lambda x: x.completions.create, {
+            "prompt": " ".join(['A'] * 10_000)
+        }),
+        (lambda x: x.chat.completions.create, {
+            "messages": [{
+                "role": "user",
+                "content": " ".join(['A'] * 10_000)
+            }]
+        }),
+    ],
+)
+async def test_with_and_without_truncate(
+    server: RemoteOpenAIServer,
+    client: openai.AsyncOpenAI,
+    create_func_gen: Callable,
+    content_body: dict,
+):
+    create_func = create_func_gen(client)
+    body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}
+    num_requests = 10
+    truncate_prompt_tokens = ([1000] * (num_requests // 2) + [None] *
+                              (num_requests - num_requests // 2))
+    random.shuffle(truncate_prompt_tokens)
+    bodies = [{
+        **body, "extra_body": {
+            'truncate_prompt_tokens': t
+        }
+    } for t in truncate_prompt_tokens]
+    async def get_status_code(**kwargs):
+        try:
+            await create_func(**kwargs)
+            return 200
+        except openai.APIStatusError as e:
+            return e.status_code
+    responses = await asyncio.gather(*[get_status_code(**b) for b in bodies])
+    assert 500 not in responses
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ids=["single completion", "multiple completions", "chat"],
+    argnames=["create_func_gen", "content_body"],
+    argvalues=[
+        (lambda x: x.completions.create, {
+            "prompt": " ".join(['A'] * 300_000)
+        }),
+        (lambda x: x.completions.create, {
+            "prompt": [" ".join(['A'] * 300_000)] * 2
+        }),
+        (lambda x: x.chat.completions.create, {
+            "messages": [{
+                "role": "user",
+                "content": " ".join(['A'] * 300_000)
+            }]
+        }),
+    ],
+)
+async def test_healthcheck_response_time(
+    server: RemoteOpenAIServer,
+    client: openai.AsyncOpenAI,
+    create_func_gen: Callable,
+    content_body: dict,
+):
+    num_requests = 50
+    create_func = create_func_gen(client)
+    body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}
+    def get_response_time(url):
+        start_time = time.monotonic()
+        res = requests.get(url)
+        end_time = time.monotonic()
+        assert res.status_code == 200
+        return end_time - start_time
+    no_load_response_time = get_response_time(server.url_for("health"))
+    tasks = [
+        asyncio.create_task(create_func(**body)) for _ in range(num_requests)
+    ]
+    await asyncio.sleep(1)  # give the tasks a chance to start running
+    load_response_time = get_response_time(server.url_for("health"))
+    with contextlib.suppress(openai.APIStatusError):
+        await asyncio.gather(*tasks)
+    assert load_response_time < 100 * no_load_response_time
+    assert load_response_time < 0.1
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -24,8 +24,11 @@ def server():
        "--dtype",
        "bfloat16",
        "--max-model-len",
-        "4096",
+        "2048",
+        "--max-num-seqs",
+        "5",
        "--enforce-eager",
+        "--trust-remote-code",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -69,11 +72,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
    }]
    # test single completion
-    chat_completion = await client.chat.completions.create(model=model_name,
+    chat_completion = await client.chat.completions.create(
-                                                           messages=messages,
+        model=model_name,
-                                                           max_tokens=10,
+        messages=messages,
-                                                           logprobs=True,
+        max_completion_tokens=10,
-                                                           top_logprobs=5)
+        logprobs=True,
+        top_logprobs=5)
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
@@ -92,7 +96,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@@ -124,11 +128,12 @@ async def test_single_chat_session_audio_base64encoded(
    }]
    # test single completion
-    chat_completion = await client.chat.completions.create(model=model_name,
+    chat_completion = await client.chat.completions.create(
-                                                           messages=messages,
+        model=model_name,
-                                                           max_tokens=10,
+        messages=messages,
-                                                           logprobs=True,
+        max_completion_tokens=10,
-                                                           top_logprobs=5)
+        logprobs=True,
+        top_logprobs=5)
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
@@ -147,7 +152,62 @@ async def test_single_chat_session_audio_base64encoded(
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_single_chat_session_input_audio(
+        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
+        base64_encoded_audio: Dict[str, str]):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        logprobs=True,
+        top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+    message = choice.message
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@@ -179,7 +239,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
@@ -189,7 +249,67 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks: List[str] = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
+    assert "".join(chunks) == output
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
+                                          model_name: str, audio_url: str,
+                                          base64_encoded_audio: Dict[str,
+                                                                     str]):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
@@ -214,7 +334,8 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
 async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
-                                 audio_url: str):
+                                 audio_url: str,
+                                 base64_encoded_audio: Dict[str, str]):
    messages = [{
        "role":
@@ -227,9 +348,10 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
                }
            },
            {
-                "type": "audio_url",
+                "type": "input_audio",
-                "audio_url": {
+                "input_audio": {
-                    "url": audio_url
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
                }
            },
            {
@@ -243,7 +365,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
-            max_tokens=10,
+            max_completion_tokens=10,
            temperature=0.0,
        )

--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
+import asyncio
 from http import HTTPStatus
+from typing import List
 import openai
 import pytest
@@ -13,8 +15,44 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
+@pytest.fixture(scope='module')
+def server_args(request: pytest.FixtureRequest) -> List[str]:
+    """ Provide extra arguments to the server via indirect parametrization
+    Usage:
+    >>> @pytest.mark.parametrize(
+    >>>     "server_args",
+    >>>     [
+    >>>         ["--disable-frontend-multiprocessing"],
+    >>>         [
+    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
+    >>>             "--enable-auto-tool-choice",
+    >>>         ],
+    >>>     ],
+    >>>     indirect=True,
+    >>> )
+    >>> def test_foo(server, client):
+    >>>     ...
+    This will run `test_foo` twice with servers with:
+    - `--disable-frontend-multiprocessing`
+    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
+    """
+    if not hasattr(request, "param"):
+        return []
+    val = request.param
+    if isinstance(val, str):
+        return [val]
+    return request.param
 @pytest.fixture(scope="module")
-def server():
+def server(server_args):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
@@ -24,6 +62,7 @@ def server():
        "--enforce-eager",
        "--max-num-seqs",
        "128",
+        *server_args,
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -36,20 +75,83 @@ async def client(server):
        yield async_client
+@pytest.mark.parametrize(
+    "server_args",
+    [
+        pytest.param([], id="default-frontend-multiprocessing"),
+        pytest.param(["--disable-frontend-multiprocessing"],
+                     id="disable-frontend-multiprocessing")
+    ],
+    indirect=True,
+)
 @pytest.mark.asyncio
-async def test_show_version(client: openai.AsyncOpenAI):
+async def test_show_version(server: RemoteOpenAIServer):
-    base_url = str(client.base_url)[:-3].strip("/")
+    response = requests.get(server.url_for("version"))
-    response = requests.get(base_url + "/version")
    response.raise_for_status()
    assert response.json() == {"version": VLLM_VERSION}
+@pytest.mark.parametrize(
+    "server_args",
+    [
+        pytest.param([], id="default-frontend-multiprocessing"),
+        pytest.param(["--disable-frontend-multiprocessing"],
+                     id="disable-frontend-multiprocessing")
+    ],
+    indirect=True,
+)
 @pytest.mark.asyncio
-async def test_check_health(client: openai.AsyncOpenAI):
+async def test_check_health(server: RemoteOpenAIServer):
-    base_url = str(client.base_url)[:-3].strip("/")
+    response = requests.get(server.url_for("health"))
-    response = requests.get(base_url + "/health")
    assert response.status_code == HTTPStatus.OK
+@pytest.mark.parametrize(
+    "server_args",
+    [
+        pytest.param(["--max-model-len", "10100"],
+                     id="default-frontend-multiprocessing"),
+        pytest.param(
+            ["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
+            id="disable-frontend-multiprocessing")
+    ],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_request_cancellation(server: RemoteOpenAIServer):
+    # clunky test: send an ungodly amount of load in with short timeouts
+    # then ensure that it still responds quickly afterwards
+    chat_input = [{"role": "user", "content": "Write a long story"}]
+    client = server.get_async_client(timeout=0.5)
+    tasks = []
+    # Request about 2 million tokens
+    for _ in range(200):
+        task = asyncio.create_task(
+            client.chat.completions.create(messages=chat_input,
+                                           model=MODEL_NAME,
+                                           max_tokens=10000,
+                                           extra_body={"min_tokens": 10000}))
+        tasks.append(task)
+    done, pending = await asyncio.wait(tasks,
+                                       return_when=asyncio.ALL_COMPLETED)
+    # Make sure all requests were sent to the server and timed out
+    # (We don't want to hide other errors like 400s that would invalidate this
+    # test)
+    assert len(pending) == 0
+    for d in done:
+        with pytest.raises(openai.APITimeoutError):
+            d.result()
+    # If the server had not cancelled all the other requests, then it would not
+    # be able to respond to this one within the timeout
+    client = server.get_async_client(timeout=5)
+    response = await client.chat.completions.create(messages=chat_input,
+                                                    model=MODEL_NAME,
+                                                    max_tokens=10)
+    assert len(response.choices) == 1
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -17,9 +17,6 @@ from .test_completion import zephyr_lora_files  # noqa: F401
 # any model with a chat template should work here
 MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
-LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora") 
 @pytest.fixture(scope="module")
@@ -69,11 +66,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
        "content": "what is 1+1?"
    }]
-    chat_completion = await client.chat.completions.create(model=model_name,
+    chat_completion = await client.chat.completions.create(
-                                                           messages=messages,
+        model=model_name,
-                                                           max_tokens=5,
+        messages=messages,
-                                                           temperature=0.0,
+        max_completion_tokens=5,
-                                                           logprobs=False)
+        temperature=0.0,
+        logprobs=False)
    choice = chat_completion.choices[0]
    assert choice.logprobs is None
@@ -94,12 +92,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
        "content": "what is 1+1?"
    }]
-    chat_completion = await client.chat.completions.create(model=model_name,
+    chat_completion = await client.chat.completions.create(
-                                                           messages=messages,
+        model=model_name,
-                                                           max_tokens=5,
+        messages=messages,
-                                                           temperature=0.0,
+        max_completion_tokens=5,
-                                                           logprobs=True,
+        temperature=0.0,
-                                                           top_logprobs=0)
+        logprobs=True,
+        top_logprobs=0)
    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
@@ -121,12 +120,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
        "content": "what is 1+1?"
    }]
-    chat_completion = await client.chat.completions.create(model=model_name,
+    chat_completion = await client.chat.completions.create(
-                                                           messages=messages,
+        model=model_name,
-                                                           max_tokens=5,
+        messages=messages,
-                                                           temperature=0.0,
+        max_completion_tokens=5,
-                                                           logprobs=True,
+        temperature=0.0,
-                                                           top_logprobs=5)
+        logprobs=True,
+        top_logprobs=5)
    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
@@ -153,7 +153,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        stream = await client.chat.completions.create(model=model_name,
                                                      messages=messages,
-                                                      max_tokens=10,
+                                                      max_completion_tokens=10,
                                                      logprobs=True,
                                                      top_logprobs=21,
                                                      stream=True)
@@ -163,16 +163,17 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(model=model_name,
                                             messages=messages,
-                                             max_tokens=10,
+                                             max_completion_tokens=10,
                                             logprobs=True,
                                             top_logprobs=30,
                                             stream=False)
    # the server should still work afterwards
-    chat_completion = await client.chat.completions.create(model=model_name,
+    chat_completion = await client.chat.completions.create(
-                                                           messages=messages,
+        model=model_name,
-                                                           max_tokens=10,
+        messages=messages,
-                                                           stream=False)
+        max_completion_tokens=10,
+        stream=False)
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@@ -275,11 +276,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
    }]
    # test single completion
-    chat_completion = await client.chat.completions.create(model=model_name,
+    chat_completion = await client.chat.completions.create(
-                                                           messages=messages,
+        model=model_name,
-                                                           max_tokens=10,
+        messages=messages,
-                                                           logprobs=True,
+        max_completion_tokens=10,
-                                                           top_logprobs=5)
+        logprobs=True,
+        top_logprobs=5)
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1
@@ -298,7 +300,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@@ -323,7 +325,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
@@ -333,7 +335,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
@@ -377,7 +379,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": False})
@@ -388,7 +390,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
    #                                   "continuous_usage_stats": False}}
    stream = await client.chat.completions.create(model=model_name,
                                                  messages=messages,
-                                                  max_tokens=10,
+                                                  max_completion_tokens=10,
                                                  temperature=0.0,
                                                  stream=True,
                                                  stream_options={
@@ -417,7 +419,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
-            max_tokens=10,
+            max_completion_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": None})
@@ -427,7 +429,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
-            max_tokens=10,
+            max_completion_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": True})
@@ -437,19 +439,29 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
+        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
-            "continuous_usage_stats": True
+            "continuous_usage_stats": True,
        },
    )
+    last_completion_tokens = 0
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
-        assert chunk.usage.completion_tokens >= 0
+        assert last_completion_tokens == 0 or \
+               chunk.usage.completion_tokens > last_completion_tokens or \
+               (
+                   not chunk.choices and
+                   chunk.usage.completion_tokens == last_completion_tokens
+               )
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
+        last_completion_tokens = chunk.usage.completion_tokens
+    assert last_completion_tokens == 10
 # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
@@ -474,7 +486,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
        extra_body=dict(guided_choice=sample_guided_choice,
                        guided_decoding_backend=guided_decoding_backend))
    choice1 = chat_completion.choices[0].message.content
@@ -488,7 +500,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
        extra_body=dict(guided_choice=sample_guided_choice,
                        guided_decoding_backend=guided_decoding_backend))
    choice2 = chat_completion.choices[0].message.content
@@ -515,7 +527,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
-        max_tokens=1000,
+        max_completion_tokens=1000,
        extra_body=dict(guided_json=sample_json_schema,
                        guided_decoding_backend=guided_decoding_backend))
    message = chat_completion.choices[0].message
@@ -533,7 +545,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
-        max_tokens=1000,
+        max_completion_tokens=1000,
        extra_body=dict(guided_json=sample_json_schema,
                        guided_decoding_backend=guided_decoding_backend))
    message = chat_completion.choices[0].message
@@ -561,7 +573,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
-        max_tokens=20,
+        max_completion_tokens=20,
        extra_body=dict(guided_regex=sample_regex,
                        guided_decoding_backend=guided_decoding_backend))
    ip1 = chat_completion.choices[0].message.content
@@ -573,7 +585,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
-        max_tokens=20,
+        max_completion_tokens=20,
        extra_body=dict(guided_regex=sample_regex,
                        guided_decoding_backend=guided_decoding_backend))
    ip2 = chat_completion.choices[0].message.content
@@ -621,7 +633,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
-        max_tokens=10,
+        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(guided_choice=sample_guided_choice,
@@ -658,7 +670,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
-        max_tokens=1000,
+        max_completion_tokens=1000,
        tools=[{
            "type": "function",
            "function": {
@@ -692,7 +704,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
-        max_tokens=1000,
+        max_completion_tokens=1000,
        tools=[{
            "type": "function",
            "function": {
@@ -748,7 +760,7 @@ async def test_required_tool_use_not_yet_supported(
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
-            max_tokens=1000,
+            max_completion_tokens=1000,
            tools=[{
                "type": "function",
                "function": {
@@ -763,7 +775,7 @@ async def test_required_tool_use_not_yet_supported(
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
-            max_tokens=1000,
+            max_completion_tokens=1000,
            tools=[{
                "type": "function",
                "function": {
@@ -794,7 +806,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(model=MODEL_NAME,
                                             messages=messages,
-                                             max_tokens=1000,
+                                             max_completion_tokens=1000,
                                             tool_choice={
                                                 "type": "function",
                                                 "function": {
@@ -807,7 +819,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
-            max_tokens=1000,
+            max_completion_tokens=1000,
            tools=[{
                "type": "function",
                "function": {
@@ -822,6 +834,20 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
                    "name": "nondefined_function_name"
                }
            })
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_completion_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": sample_json_schema
+                }
+            }],
+            tool_choice={})
 @pytest.mark.asyncio
@@ -846,14 +872,28 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
 async def test_response_format_json_schema(client: openai.AsyncOpenAI):
+    prompt = 'what is 1+1? The format is "result": 2'
+    # Check that this prompt cannot lead to a valid JSON without json_schema
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{
-                "role":
+                "role": "user",
-                "user",
+                "content": prompt
-                "content": ('what is 1+1? please respond with a JSON object, '
+            }],
-                            'the format is {"result": 2}')
+        )
+        content = resp.choices[0].message.content
+        assert content is not None
+        with pytest.raises((json.JSONDecodeError, AssertionError)):
+            loaded = json.loads(content)
+            assert loaded == {"result": 2}, loaded
+    for _ in range(2):
+        resp = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[{
+                "role": "user",
+                "content": prompt
            }],
            response_format={
                "type": "json_schema",
@@ -878,19 +918,19 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-async def test_extra_fields(client: openai.AsyncOpenAI):
+async def test_extra_fields_allowed(client: openai.AsyncOpenAI):
-    with pytest.raises(BadRequestError) as exc_info:
+    resp = await client.chat.completions.create(
-        await client.chat.completions.create(
+        model=MODEL_NAME,
-            model=MODEL_NAME,
+        messages=[{
-            messages=[{
+            "role": "user",
-                "role": "system",
+            "content": "what is 1+1?",
-                "content": "You are a helpful assistant.",
+            "extra_field": "0",
-                "extra_field": "0",
+        }],  # type: ignore
-            }],  # type: ignore
+        temperature=0,
-            temperature=0,
+        seed=0)
-            seed=0)
+    content = resp.choices[0].message.content
-    assert "extra_forbidden" in exc_info.value.message
+    assert content is not None
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_chat_echo.py
+++ b/tests/entrypoints/openai/test_chat_echo.py
+from typing import NamedTuple
+import os
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+from ...utils import RemoteOpenAIServer, models_path_prefix
+# any model with a chat template should work here
+MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
+DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--enforce-eager",
+        "--max-model-len",
+        "4080",
+        "--chat-template",
+        DUMMY_CHAT_TEMPLATE,
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+class TestCase(NamedTuple):
+    model_name: str
+    echo: bool
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(model_name=MODEL_NAME, echo=True),
+        TestCase(model_name=MODEL_NAME, echo=False)
+    ],
+)
+async def test_chat_session_with_echo_and_continue_final_message(
+        client: openai.AsyncOpenAI, test_case: TestCase):
+    saying: str = "Here is a common saying about apple. An apple a day, keeps"
+    # test echo with continue_final_message parameter
+    chat_completion = await client.chat.completions.create(
+        model=test_case.model_name,
+        messages=[{
+            "role": "user",
+            "content": "tell me a common saying"
+        }, {
+            "role": "assistant",
+            "content": saying
+        }],
+        extra_body={
+            "echo": test_case.echo,
+            "continue_final_message": True,
+            "add_generation_prompt": False
+        })
+    assert chat_completion.id is not None
+    assert len(chat_completion.choices) == 1
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "stop"
+    message = choice.message
+    if test_case.echo:
+        assert message.content is not None and saying in message.content
+    else:
+        assert message.content is not None and saying not in message.content
+    assert message.role == "assistant"
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -13,7 +13,7 @@ assert chatml_jinja_path.exists()
 # Define models, templates, and their corresponding expected outputs
 MODEL_TEMPLATE_GENERATON_OUTPUT = [
-    (os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, True, """<|im_start|>user
+    (os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, True, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
@@ -21,12 +21,20 @@ Hi there!<|im_end|>
 What is the capital of<|im_end|>
 <|im_start|>assistant
 """),
-    (os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, False, """<|im_start|>user
+    (os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, False, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
 <|im_start|>user
-What is the capital of""")
+What is the capital of"""),
+    (os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, False, True, """<|im_start|>user
+Hello<|im_end|>
+<|im_start|>assistant
+Hi there!<|im_end|>
+<|im_start|>user
+What is the capital of<|im_end|>
+<|im_start|>assistant
+The capital of"""),
 ]
 TEST_MESSAGES = [
@@ -43,6 +51,10 @@ TEST_MESSAGES = [
        'content': 'What is the capital of'
    },
 ]
+ASSISTANT_MESSAGE_TO_CONTINUE = {
+    'role': 'assistant',
+    'content': 'The capital of'
+}
 def test_load_chat_template():
@@ -74,10 +86,10 @@ def test_no_load_chat_template_literallike():
 @pytest.mark.parametrize(
-    "model,template,add_generation_prompt,expected_output",
+    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATON_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
-                        expected_output):
+                        continue_final_message, expected_output):
    # Initialize the tokenizer
    tokenizer = get_tokenizer(tokenizer_name=model)
    template_content = load_chat_template(chat_template=template)
@@ -85,8 +97,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(
        model=model,
-        messages=TEST_MESSAGES,
+        messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
-        add_generation_prompt=add_generation_prompt)
+        if continue_final_message else TEST_MESSAGES,
+        add_generation_prompt=add_generation_prompt,
+        continue_final_message=continue_final_message,
+    )
    # Call the function and get the result
    result = apply_hf_chat_template(
@@ -94,6 +109,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
        conversation=mock_request.messages,
        chat_template=mock_request.chat_template or template_content,
        add_generation_prompt=mock_request.add_generation_prompt,
+        continue_final_message=mock_request.continue_final_message,
    )
    # Test assertion

--- a/tests/entrypoints/openai/test_chunked_prompt.py
+++ b/tests/entrypoints/openai/test_chunked_prompt.py
+import os
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+from ...utils import RemoteOpenAIServer, models_path_prefix
+# any model with a chat template should work here
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # scheduler / chunked-prefill config below
+        "--max-num-seqs",
+        "128",
+        "--enable-chunked-prefill",
+        "--max-num-batched-tokens",
+        "1000",
+        # large prompts create a lot of output
+        "--disable-log-requests",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+@pytest.mark.asyncio
+async def test_completion_stream_options_and_logprobs_with_long_prompts(
+        client: openai.AsyncOpenAI):
+    # Test stream with long prompt
+    prompt = "What is the capital of France?" * 400
+    stream = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True,
+        },
+        logprobs=5,
+    )
+    tokens_received = 0
+    finished = False
+    async for chunk in stream:
+        assert chunk.usage.prompt_tokens >= 0
+        assert chunk.usage.completion_tokens >= 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+        if not finished:
+            tokens_received += 1
+            assert chunk.choices[0].text
+            if chunk.choices[0].finish_reason is not None:
+                finished = True
+        if finished:
+            assert chunk.usage.completion_tokens == tokens_received
+@pytest.mark.asyncio
+async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
+        client: openai.AsyncOpenAI):
+    # Test stream with long prompt
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "What is the capital of France?" * 400
+    }]
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True,
+        },
+        logprobs=True,
+        top_logprobs=5,
+    )
+    tokens_received = 0
+    empty_chunks_received = 0
+    finished = False
+    async for chunk in stream:
+        assert chunk.usage.prompt_tokens >= 0
+        assert chunk.usage.completion_tokens >= 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+        if not finished:
+            if chunk.choices[0].delta.content == "":
+                # when no tokens have been generated yet
+                assert chunk.usage.completion_tokens == 0
+                assert chunk.choices[0].logprobs is None
+                empty_chunks_received += 1
+            else:
+                tokens_received += 1
+            if chunk.choices[0].finish_reason is not None:
+                finished = True
+        if finished:
+            assert chunk.usage.completion_tokens == tokens_received
+    assert empty_chunks_received <= 1
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
 import json
-import unittest
-from vllm.entrypoints.openai.cli_args import make_arg_parser
+import pytest
+from vllm.entrypoints.openai.cli_args import (make_arg_parser,
+                                              validate_parsed_serve_args)
 from vllm.entrypoints.openai.serving_engine import LoRAModulePath
 from vllm.utils import FlexibleArgumentParser
+from ...utils import VLLM_PATH
 LORA_MODULE = {
    "name": "module2",
    "path": "/path/to/module2",
    "base_model_name": "llama"
 }
+CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
+assert CHATML_JINJA_PATH.exists()
-class TestLoraParserAction(unittest.TestCase):
+@pytest.fixture
+def serve_parser():
+    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
+    return make_arg_parser(parser)
-    def setUp(self):
-        # Setting up argparse parser for tests
-        parser = FlexibleArgumentParser(
-            description="vLLM's remote OpenAI server.")
-        self.parser = make_arg_parser(parser)
-    def test_valid_key_value_format(self):
+### Tests for LoRA module parsing
-        # Test old format: name=path
+def test_valid_key_value_format(serve_parser):
-        args = self.parser.parse_args([
+    # Test old format: name=path
-            '--lora-modules',
+    args = serve_parser.parse_args([
-            'module1=/path/to/module1',
+        '--lora-modules',
+        'module1=/path/to/module1',
+    ])
+    expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
+    assert args.lora_modules == expected
+def test_valid_json_format(serve_parser):
+    # Test valid JSON format input
+    args = serve_parser.parse_args([
+        '--lora-modules',
+        json.dumps(LORA_MODULE),
+    ])
+    expected = [
+        LoRAModulePath(name='module2',
+                       path='/path/to/module2',
+                       base_model_name='llama')
+    ]
+    assert args.lora_modules == expected
+def test_invalid_json_format(serve_parser):
+    # Test invalid JSON format input, missing closing brace
+    with pytest.raises(SystemExit):
+        serve_parser.parse_args([
+            '--lora-modules', '{"name": "module3", "path": "/path/to/module3"'
        ])
-        expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
-        self.assertEqual(args.lora_modules, expected)
-    def test_valid_json_format(self):
-        # Test valid JSON format input
+def test_invalid_type_error(serve_parser):
-        args = self.parser.parse_args([
+    # Test type error when values are not JSON or key=value
+    with pytest.raises(SystemExit):
+        serve_parser.parse_args([
            '--lora-modules',
-            json.dumps(LORA_MODULE),
+            'invalid_format'  # This is not JSON or key=value format
        ])
-        expected = [
-            LoRAModulePath(name='module2',
-                           path='/path/to/module2',
+def test_invalid_json_field(serve_parser):
-                           base_model_name='llama')
+    # Test valid JSON format but missing required fields
-        ]
+    with pytest.raises(SystemExit):
-        self.assertEqual(args.lora_modules, expected)
+        serve_parser.parse_args([
-    def test_invalid_json_format(self):
-        # Test invalid JSON format input, missing closing brace
-        with self.assertRaises(SystemExit):
-            self.parser.parse_args([
-                '--lora-modules',
-                '{"name": "module3", "path": "/path/to/module3"'
-            ])
-    def test_invalid_type_error(self):
-        # Test type error when values are not JSON or key=value
-        with self.assertRaises(SystemExit):
-            self.parser.parse_args([
-                '--lora-modules',
-                'invalid_format'  # This is not JSON or key=value format
-            ])
-    def test_invalid_json_field(self):
-        # Test valid JSON format but missing required fields
-        with self.assertRaises(SystemExit):
-            self.parser.parse_args([
-                '--lora-modules',
-                '{"name": "module4"}'  # Missing required 'path' field
-            ])
-    def test_empty_values(self):
-        # Test when no LoRA modules are provided
-        args = self.parser.parse_args(['--lora-modules', ''])
-        self.assertEqual(args.lora_modules, [])
-    def test_multiple_valid_inputs(self):
-        # Test multiple valid inputs (both old and JSON format)
-        args = self.parser.parse_args([
            '--lora-modules',
-            'module1=/path/to/module1',
+            '{"name": "module4"}'  # Missing required 'path' field
-            json.dumps(LORA_MODULE),
        ])
-        expected = [
-            LoRAModulePath(name='module1', path='/path/to/module1'),
-            LoRAModulePath(name='module2',
-                           path='/path/to/module2',
-                           base_model_name='llama')
-        ]
-        self.assertEqual(args.lora_modules, expected)
-if __name__ == '__main__':
+def test_empty_values(serve_parser):
-    unittest.main()
+    # Test when no LoRA modules are provided
+    args = serve_parser.parse_args(['--lora-modules', ''])
+    assert args.lora_modules == []
+def test_multiple_valid_inputs(serve_parser):
+    # Test multiple valid inputs (both old and JSON format)
+    args = serve_parser.parse_args([
+        '--lora-modules',
+        'module1=/path/to/module1',
+        json.dumps(LORA_MODULE),
+    ])
+    expected = [
+        LoRAModulePath(name='module1', path='/path/to/module1'),
+        LoRAModulePath(name='module2',
+                       path='/path/to/module2',
+                       base_model_name='llama')
+    ]
+    assert args.lora_modules == expected
+### Tests for serve argument validation that run prior to loading
+def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
+    """Ensure validation fails if tool choice is enabled with no call parser"""
+    # NOTE: despite "passes" in the test name, validation must fail here:
+    # --enable-auto-tool-choice without --tool-call-parser is expected to raise
+    args = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
+    with pytest.raises(TypeError):
+        validate_parsed_serve_args(args)
+def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
+    """Ensure validation passes with tool choice enabled with a call parser"""
+    args = serve_parser.parse_args(args=[
+        "--enable-auto-tool-choice",
+        "--tool-call-parser",
+        "mistral",
+    ])
+    validate_parsed_serve_args(args)
+def test_chat_template_validation_for_happy_paths(serve_parser):
+    """Ensure validation passes if the chat template exists"""
+    args = serve_parser.parse_args(
+        args=["--chat-template",
+              CHATML_JINJA_PATH.absolute().as_posix()])
+    validate_parsed_serve_args(args)
+def test_chat_template_validation_for_sad_paths(serve_parser):
+    """Ensure validation fails if the chat template doesn't exist"""
+    args = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
+    with pytest.raises(ValueError):
+        validate_parsed_serve_args(args)
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -160,15 +160,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
 async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
    # test using token IDs
-    completion = await client.completions.create(
+    with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
-        model=MODEL_NAME,
+        # Added tokens should be rejected by the base model
-        prompt=[0, 0, 32000, 32001, 32002],
+        await client.completions.create(
-        echo=True,
+            model=MODEL_NAME,
-        max_tokens=5,
+            prompt=[0, 0, 32000, 32001, 32002],
-        temperature=0.0,
+            echo=True,
-    )
+            max_tokens=5,
-    # Added tokens should not appear in tokenized prompt
+            temperature=0.0,
-    assert "vllm" not in completion.choices[0].text
+        )
 @pytest.mark.asyncio
@@ -343,6 +343,40 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
    assert "".join(chunks) == single_output
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+)
+async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
+    """Streaming for parallel sampling.
+    The tokens from multiple samples are flattened into a single stream,
+    with an index to indicate which sample the token belongs to.
+    """
+    prompt = "What is an LLM?"
+    n = 3
+    max_tokens = 5
+    stream = await client.completions.create(model=model_name,
+                                             prompt=prompt,
+                                             max_tokens=max_tokens,
+                                             n=n,
+                                             stream=True)
+    chunks: List[List[str]] = [[] for i in range(n)]
+    finish_reason_count = 0
+    async for chunk in stream:
+        index = chunk.choices[0].index
+        text = chunk.choices[0].text
+        chunks[index].append(text)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    assert finish_reason_count == n
+    for chunk in chunks:
+        assert len(chunk) == max_tokens
+        print("".join(chunk))
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
    "model_name",
@@ -506,8 +540,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
            max_tokens=5,
            temperature=0.0,
            extra_body=dict(
-                # NOTE: this has to be true for n > 1 in vLLM, but not necessary
+                # NOTE: this has to be true for n > 1 in vLLM, but
-                # for official client.
+                # not necessary for official client.
                use_beam_search=True),
        )
        assert len(batch.choices) == 4