Merge branch 'v0.6.3.post1-dev'

ad385667 · zhuwenwen · be0967c1 · 903593d3 · ad385667 · ad385667
Commit ad385667 authored Oct 23, 2024 by zhuwenwen
20 changed files
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -49,21 +49,6 @@ def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
    assert [o.outputs for o in o1] == [o.outputs for o in o2]
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize('prompt', PROMPTS)
-def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
-    pooling_params = PoolingParams()
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.encode(prompts=prompt, pooling_params=pooling_params)
-    v2_output = llm.encode(prompt, pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.encode({"prompt": prompt}, pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
 def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
@@ -79,25 +64,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
    assert_outputs_equal(v1_output, v2_output)
-@pytest.mark.skip_global_cleanup
-def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
-    pooling_params = PoolingParams()
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.encode(prompts=PROMPTS, pooling_params=pooling_params)
-    v2_output = llm.encode(PROMPTS, pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.encode(
-        [{
-            "prompt": p
-        } for p in PROMPTS],
-        pooling_params=pooling_params,
-    )
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
    pooling_params = PoolingParams()

--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -6,6 +6,7 @@ import pytest
 from vllm import LLM, RequestOutput, SamplingParams
 from ...conftest import cleanup
+from ..openai.test_vision import TEST_IMAGE_URLS
 MODEL_NAME = "facebook/opt-125m"
@@ -46,23 +47,6 @@ def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
    assert [o.outputs for o in o1] == [o.outputs for o in o2]
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize('prompt', PROMPTS)
-def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
-    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.generate(prompts=prompt,
-                                 sampling_params=sampling_params)
-    v2_output = llm.generate(prompt, sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.generate({"prompt": prompt},
-                             sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
 def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
@@ -78,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
    assert_outputs_equal(v1_output, v2_output)
-@pytest.mark.skip_global_cleanup
-def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
-    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.generate(prompts=PROMPTS,
-                                 sampling_params=sampling_params)
-    v2_output = llm.generate(PROMPTS, sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.generate(
-        [{
-            "prompt": p
-        } for p in PROMPTS],
-        sampling_params=sampling_params,
-    )
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
@@ -140,3 +104,90 @@ def test_multiple_sampling_params(llm: LLM):
    # sampling_params is None, default params should be applied
    outputs = llm.generate(PROMPTS, sampling_params=None)
    assert len(PROMPTS) == len(outputs)
+def test_chat():
+    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    prompt1 = "Explain the concept of entropy."
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt1
+        },
+    ]
+    outputs = llm.chat(messages)
+    assert len(outputs) == 1
+def test_multi_chat():
+    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    prompt1 = "Explain the concept of entropy."
+    prompt2 = "Explain what among us is."
+    conversation1 = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt1
+        },
+    ]
+    conversation2 = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt2
+        },
+    ]
+    messages = [conversation1, conversation2]
+    outputs = llm.chat(messages)
+    assert len(outputs) == 2
+@pytest.mark.parametrize("image_urls",
+                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+def test_chat_multi_image(image_urls: List[str]):
+    llm = LLM(
+        model="microsoft/Phi-3.5-vision-instruct",
+        dtype="bfloat16",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 2},
+    )
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            } for image_url in image_urls),
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    outputs = llm.chat(messages)
+    assert len(outputs) >= 0
--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -50,7 +50,7 @@ def zephyr_lora_files():
 @pytest.mark.skip_global_cleanup
 def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
    lora_request = [
-        LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files)
+        LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
        for idx in range(len(PROMPTS))
    ]
    # Multiple SamplingParams should be matched with each prompt

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -7,7 +7,7 @@ import pytest
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from ...conftest import cleanup
@@ -31,14 +31,12 @@ def test_guided_regex(sample_regex, llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-    )
+        guided_decoding=GuidedDecodingParams(regex=sample_regex))
-    outputs = llm.generate(
+    outputs = llm.generate(prompts=[
-        prompts=[
+        f"Give an example IPv4 address with this regex: {sample_regex}"
-            f"Give an example IPv4 address with this regex: {sample_regex}"
+    ] * 2,
-        ] * 2,
+                           sampling_params=sampling_params,
-        sampling_params=sampling_params,
+                           use_tqdm=True)
-        use_tqdm=True,
-        guided_options_request=dict(guided_regex=sample_regex))
    assert outputs is not None
    for output in outputs:
@@ -57,15 +55,13 @@ def test_guided_json_completion(sample_json_schema, llm):
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
-    )
+        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
-    outputs = llm.generate(
+    outputs = llm.generate(prompts=[
-        prompts=[
+        f"Give an example JSON for an employee profile "
-            f"Give an example JSON for an employee profile "
+        f"that fits this schema: {sample_json_schema}"
-            f"that fits this schema: {sample_json_schema}"
+    ] * 2,
-        ] * 2,
+                           sampling_params=sampling_params,
-        sampling_params=sampling_params,
+                           use_tqdm=True)
-        use_tqdm=True,
-        guided_options_request=dict(guided_json=sample_json_schema))
    assert outputs is not None
@@ -86,12 +82,11 @@ def test_guided_choice_completion(sample_guided_choice, llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-    )
+        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
    outputs = llm.generate(
        prompts="The best language for type-safe systems programming is ",
        sampling_params=sampling_params,
-        use_tqdm=True,
+        use_tqdm=True)
-        guided_options_request=dict(guided_choice=sample_guided_choice))
    assert outputs is not None
    for output in outputs:
@@ -112,13 +107,13 @@ def test_guided_grammar(sample_sql_statements, llm):
        temperature=0.8,
        top_p=0.95,
        max_tokens=1000,
-    )
+        guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
    outputs = llm.generate(
        prompts=("Generate a sql state that select col_1 from "
                 "table_1 where it is equals to 1"),
        sampling_params=sampling_params,
        use_tqdm=True,
-        guided_options_request=dict(guided_grammar=sample_sql_statements))
+    )
    assert outputs is not None
    for output in outputs:
@@ -140,3 +135,28 @@ def test_guided_grammar(sample_sql_statements, llm):
        assert generated_text.strip() == ground_truth
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+@pytest.mark.skip_global_cleanup
+def test_guided_options_request_deprecation_warning(sample_regex, llm):
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    with pytest.warns(DeprecationWarning, match="guided_options_request"):
+        llm.generate(prompts="This should fail",
+                     sampling_params=sampling_params,
+                     use_tqdm=True,
+                     guided_options_request=dict(guided_regex=sample_regex))
+@pytest.mark.skip_global_cleanup
+def test_validation_against_both_guided_decoding_options(sample_regex, llm):
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        guided_decoding=GuidedDecodingParams(regex=sample_regex))
+    with pytest.raises(ValueError, match="Cannot set both"):
+        llm.generate(prompts="This should fail",
+                     sampling_params=sampling_params,
+                     use_tqdm=True,
+                     guided_options_request=dict(guided_regex=sample_regex))
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
+import sys
+from vllm import LLM, SamplingParams
+def test_lazy_outlines(sample_regex):
+    """If users don't use guided decoding, outlines should not be imported.
+    """
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    llm = LLM(model="facebook/opt-125m",
+              enforce_eager=True,
+              gpu_memory_utilization=0.3)
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # make sure outlines is not imported
+    assert 'outlines' not in sys.modules
+    llm = LLM(model="facebook/opt-125m",
+              enforce_eager=True,
+              guided_decoding_backend="lm-format-enforcer",
+              gpu_memory_utilization=0.3)
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    outputs = llm.generate(
+        prompts=[
+            f"Give an example IPv4 address with this regex: {sample_regex}"
+        ] * 2,
+        sampling_params=sampling_params,
+        use_tqdm=True,
+        guided_options_request=dict(guided_regex=sample_regex))
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # make sure outlines is not imported
+    assert 'outlines' not in sys.modules
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
+import pytest
+from vllm import LLM
+def test_empty_prompt():
+    llm = LLM(model="gpt2")
+    with pytest.raises(ValueError, match='Prompt cannot be empty'):
+        llm.generate([""])
--- a/tests/entrypoints/offline_mode/__init__.py
+++ b/tests/entrypoints/offline_mode/__init__.py
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
+"""Tests for HF_HUB_OFFLINE mode"""
+import importlib
+import sys
+import weakref
+import pytest
+from vllm import LLM
+from ...conftest import cleanup
+MODEL_NAME = "facebook/opt-125m"
+@pytest.fixture(scope="module")
+def llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(model=MODEL_NAME,
+              max_num_batched_tokens=4096,
+              tensor_parallel_size=1,
+              gpu_memory_utilization=0.10,
+              enforce_eager=True)
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+        del llm
+    cleanup()
+@pytest.mark.skip_global_cleanup
+def test_offline_mode(llm: LLM, monkeypatch):
+    # we use the llm fixture to ensure the model files are in-cache
+    del llm
+    # Set HF to offline mode and ensure we can still construct an LLM
+    try:
+        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
+        # Need to re-import huggingface_hub and friends to setup offline mode
+        _re_import_modules()
+        # Cached model files should be used in offline mode
+        LLM(model=MODEL_NAME,
+            max_num_batched_tokens=4096,
+            tensor_parallel_size=1,
+            gpu_memory_utilization=0.10,
+            enforce_eager=True)
+    finally:
+        # Reset the environment after the test
+        # NB: Assuming tests are run in online mode
+        monkeypatch.delenv("HF_HUB_OFFLINE")
+        _re_import_modules()
+        pass
+def _re_import_modules():
+    hf_hub_module_names = [
+        k for k in sys.modules if k.startswith("huggingface_hub")
+    ]
+    transformers_module_names = [
+        k for k in sys.modules if k.startswith("transformers")
+        and not k.startswith("transformers_modules")
+    ]
+    reload_exception = None
+    for module_name in hf_hub_module_names + transformers_module_names:
+        try:
+            importlib.reload(sys.modules[module_name])
+        except Exception as e:
+            reload_exception = e
+            # Try to continue clean up so that other tests are less likely to
+            # be affected
+    # Error this test if reloading a module failed
+    if reload_exception is not None:
+        raise reload_exception
--- a/tests/entrypoints/openai/test_accuracy.py
+++ b/tests/entrypoints/openai/test_accuracy.py
+"""
+This file test accuracy of the vLLM server via LMEval.
+It uses local-completions, which interacts with vLLM
+through the OAI API with N concurrent connections.
+This simulates real work usage of the API and makes
+sure that the zmq frontend mp RPC message passing and
+AsyncLLMEngine are working correctly.
+"""
+import lm_eval
+import pytest
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
+NUM_CONCURRENT = 500
+TASK = "gsm8k"
+FILTER = "exact_match,strict-match"
+RTOL = 0.03
+EXPECTED_VALUE = 0.58
+DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
+MORE_ARGS_LIST = [
+    ["--enable-chunked-prefill"],  # Chunked
+    ["--num-scheduler-steps", "8"],  # MS
+    ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"]  # MS+Stream
+]
+@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
+def test_lm_eval_accuracy(more_args):
+    args = list(DEFAULT_ARGS)
+    args.extend(more_args)
+    print(f"Running with: {args}")
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        url = f"{remote_server.url_for('v1')}/completions"
+        model_args = (
+            f"model={MODEL_NAME},"
+            f"base_url={url},"
+            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+        results = lm_eval.simple_evaluate(
+            model="local-completions",
+            model_args=model_args,
+            tasks=TASK,
+        )
+        measured_value = results["results"][TASK][FILTER]
+        assert (measured_value - RTOL < EXPECTED_VALUE
+                and measured_value + RTOL > EXPECTED_VALUE
+                ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
+from typing import Dict, List
+import openai
+import pytest
+import pytest_asyncio
+from vllm.assets.audio import AudioAsset
+from vllm.multimodal.utils import encode_audio_base64, fetch_audio
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "fixie-ai/ultravox-v0_3"
+TEST_AUDIO_URLS = [
+    AudioAsset("winning_call").url,
+]
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "5",
+        "--enforce-eager",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+@pytest.fixture(scope="session")
+def base64_encoded_audio() -> Dict[str, str]:
+    return {
+        audio_url: encode_audio_base64(*fetch_audio(audio_url))
+        for audio_url in TEST_AUDIO_URLS
+    }
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
+                                         model_name: str, audio_url: str):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    "url": audio_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+    message = choice.message
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_single_chat_session_audio_base64encoded(
+        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
+        base64_encoded_audio: Dict[str, str]):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    "url":
+                    f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+    message = choice.message
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
+                                    model_name: str, audio_url: str):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    "url": audio_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks: List[str] = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
+    assert "".join(chunks) == output
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
+                                 audio_url: str):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    "url": audio_url
+                }
+            },
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    "url": audio_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+    with pytest.raises(openai.BadRequestError):  # test multi-audio input
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+        )
+    # the server should still work afterwards
+    completion = await client.completions.create(
+        model=model_name,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    completion = completion.choices[0].text
+    assert completion is not None and len(completion) >= 0
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
 from http import HTTPStatus
+from typing import List
 import openai
 import pytest
+import pytest_asyncio
 import requests
 from vllm.version import __version__ as VLLM_VERSION
@@ -11,8 +13,44 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+@pytest.fixture(scope='module')
+def server_args(request: pytest.FixtureRequest) -> List[str]:
+    """ Provide extra arguments to the server via indirect parametrization
+    Usage:
+    >>> @pytest.mark.parametrize(
+    >>>     "server_args",
+    >>>     [
+    >>>         ["--disable-frontend-multiprocessing"],
+    >>>         [
+    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
+    >>>             "--enable-auto-tool-choice",
+    >>>         ],
+    >>>     ],
+    >>>     indirect=True,
+    >>> )
+    >>> def test_foo(server, client):
+    >>>     ...
+    This will run `test_foo` twice with servers with:
+    - `--disable-frontend-multiprocessing`
+    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
+    """
+    if not hasattr(request, "param"):
+        return []
+    val = request.param
+    if isinstance(val, str):
+        return [val]
+    return request.param
 @pytest.fixture(scope="module")
-def server():
+def server(server_args):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
@@ -22,17 +60,28 @@ def server():
        "--enforce-eager",
        "--max-num-seqs",
        "128",
+        *server_args,
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
+@pytest.mark.parametrize(
+    "server_args",
+    [
+        pytest.param([], id="default-frontend-multiprocessing"),
+        pytest.param(["--disable-frontend-multiprocessing"],
+                     id="disable-frontend-multiprocessing")
+    ],
+    indirect=True,
+)
 @pytest.mark.asyncio
 async def test_show_version(client: openai.AsyncOpenAI):
    base_url = str(client.base_url)[:-3].strip("/")
@@ -43,6 +92,15 @@ async def test_show_version(client: openai.AsyncOpenAI):
    assert response.json() == {"version": VLLM_VERSION}
+@pytest.mark.parametrize(
+    "server_args",
+    [
+        pytest.param([], id="default-frontend-multiprocessing"),
+        pytest.param(["--disable-frontend-multiprocessing"],
+                     id="disable-frontend-multiprocessing")
+    ],
+    indirect=True,
+)
 @pytest.mark.asyncio
 async def test_check_health(client: openai.AsyncOpenAI):
    base_url = str(client.base_url)[:-3].strip("/")
@@ -50,12 +108,3 @@ async def test_check_health(client: openai.AsyncOpenAI):
    response = requests.get(base_url + "/health")
    assert response.status_code == HTTPStatus.OK
-@pytest.mark.asyncio
-async def test_log_metrics(client: openai.AsyncOpenAI):
-    base_url = str(client.base_url)[:-3].strip("/")
-    response = requests.get(base_url + "/metrics")
-    assert response.status_code == HTTPStatus.OK
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
 # imports for guided decoding tests
 import json
 import re
-from typing import List
+from typing import Dict, List, Optional
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
+import pytest_asyncio
 import torch
 from openai import BadRequestError
@@ -46,9 +47,10 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio
@@ -174,6 +176,88 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
    assert message.content is not None and len(message.content) >= 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name, prompt_logprobs",
+    [(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
+)
+async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
+                                    model_name: str,
+                                    prompt_logprobs: Optional[int]):
+    params: Dict = {
+        "messages": [{
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }, {
+            "role": "user",
+            "content": "Who won the world series in 2020?"
+        }, {
+            "role":
+            "assistant",
+            "content":
+            "The Los Angeles Dodgers won the World Series in 2020."
+        }, {
+            "role": "user",
+            "content": "Where was it played?"
+        }],
+        "model":
+        model_name
+    }
+    if prompt_logprobs is not None:
+        params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
+    if prompt_logprobs is not None and prompt_logprobs < 0:
+        with pytest.raises(BadRequestError):
+            await client.chat.completions.create(**params)
+    else:
+        completion = await client.chat.completions.create(**params)
+        if prompt_logprobs is not None:
+            assert completion.prompt_logprobs is not None
+            assert len(completion.prompt_logprobs) > 0
+        else:
+            assert completion.prompt_logprobs is None
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI,
+                                                  model_name: str):
+    params: Dict = {
+        "messages": [{
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }, {
+            "role": "user",
+            "content": "Who won the world series in 2020?"
+        }, {
+            "role":
+            "assistant",
+            "content":
+            "The Los Angeles Dodgers won the World Series in 2020."
+        }, {
+            "role": "user",
+            "content": "Where was it played?"
+        }],
+        "model":
+        model_name,
+        "extra_body": {
+            "prompt_logprobs": 1
+        }
+    }
+    completion_1 = await client.chat.completions.create(**params)
+    params["extra_body"] = {"prompt_logprobs": 2}
+    completion_2 = await client.chat.completions.create(**params)
+    assert len(completion_1.prompt_logprobs[3]) == 1
+    assert len(completion_2.prompt_logprobs[3]) == 2
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
    "model_name",
@@ -349,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
        model=model_name,
        messages=messages,
        max_tokens=10,
+        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
-            "continuous_usage_stats": True
+            "continuous_usage_stats": True,
        },
    )
+    last_completion_tokens = 0
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
-        assert chunk.usage.completion_tokens >= 0
+        assert last_completion_tokens == 0 or \
+               chunk.usage.completion_tokens > last_completion_tokens or \
+               (
+                   not chunk.choices and
+                   chunk.usage.completion_tokens == last_completion_tokens
+               )
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
+        last_completion_tokens = chunk.usage.completion_tokens
+    assert last_completion_tokens == 10
 # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
@@ -755,6 +849,39 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
        assert loaded == {"result": 2}, loaded
+@pytest.mark.asyncio
+async def test_response_format_json_schema(client: openai.AsyncOpenAI):
+    for _ in range(2):
+        resp = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[{
+                "role":
+                "user",
+                "content": ('what is 1+1? please respond with a JSON object, '
+                            'the format is {"result": 2}')
+            }],
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "foo_test",
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "result": {
+                                "type": "integer"
+                            },
+                        },
+                    },
+                }
+            })
+        content = resp.choices[0].message.content
+        assert content is not None
+        loaded = json.loads(content)
+        assert loaded == {"result": 2}, loaded
 @pytest.mark.asyncio
 async def test_extra_fields(client: openai.AsyncOpenAI):
    with pytest.raises(BadRequestError) as exc_info:

--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
-import os
-import pathlib
 import pytest
-from vllm.entrypoints.chat_utils import load_chat_template
+from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
+                                         load_chat_template)
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer
-chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
+from ...utils import VLLM_PATH
-    __file__))).parent.parent / "examples/template_chatml.jinja"
+chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
 # Define models, templates, and their corresponding expected outputs
 MODEL_TEMPLATE_GENERATON_OUTPUT = [
-    ("facebook/opt-125m", None, True,
+    ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
-     "Hello</s>Hi there!</s>What is the capital of</s>"),
-    ("facebook/opt-125m", None, False,
-     "Hello</s>Hi there!</s>What is the capital of</s>"),
-    ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
@@ -25,12 +20,20 @@ Hi there!<|im_end|>
 What is the capital of<|im_end|>
 <|im_start|>assistant
 """),
-    ("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, False, False, """<|im_start|>user
+Hello<|im_end|>
+<|im_start|>assistant
+Hi there!<|im_end|>
+<|im_start|>user
+What is the capital of"""),
+    ("facebook/opt-125m", chatml_jinja_path, False, True, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
 <|im_start|>user
-What is the capital of""")
+What is the capital of<|im_end|>
+<|im_start|>assistant
+The capital of"""),
 ]
 TEST_MESSAGES = [
@@ -47,6 +50,10 @@ TEST_MESSAGES = [
        'content': 'What is the capital of'
    },
 ]
+ASSISTANT_MESSAGE_TO_CONTINUE = {
+    'role': 'assistant',
+    'content': 'The capital of'
+}
 def test_load_chat_template():
@@ -78,10 +85,10 @@ def test_no_load_chat_template_literallike():
 @pytest.mark.parametrize(
-    "model,template,add_generation_prompt,expected_output",
+    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATON_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
-                        expected_output):
+                        continue_final_message, expected_output):
    # Initialize the tokenizer
    tokenizer = get_tokenizer(tokenizer_name=model)
    template_content = load_chat_template(chat_template=template)
@@ -89,15 +96,20 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(
        model=model,
-        messages=TEST_MESSAGES,
+        messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
-        add_generation_prompt=add_generation_prompt)
+        if continue_final_message else TEST_MESSAGES,
+        add_generation_prompt=add_generation_prompt,
+        continue_final_message=continue_final_message,
+    )
    # Call the function and get the result
-    result = tokenizer.apply_chat_template(
+    result = apply_hf_chat_template(
+        tokenizer,
        conversation=mock_request.messages,
-        tokenize=False,
+        chat_template=mock_request.chat_template or template_content,
        add_generation_prompt=mock_request.add_generation_prompt,
-        chat_template=mock_request.chat_template or template_content)
+        continue_final_message=mock_request.continue_final_message,
+    )
    # Test assertion
    assert result == expected_output, (

--- a/tests/entrypoints/openai/test_chunked_prompt.py
+++ b/tests/entrypoints/openai/test_chunked_prompt.py
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+from ...utils import RemoteOpenAIServer
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--max-num-seqs",
+        "128",
+        "--enable-chunked-prefill",
+        "--max-num-batched-tokens",
+        "1000",
+        # large prompts create a lot of output
+        "--disable-log-requests",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+@pytest.mark.asyncio
+async def test_completion_stream_options_and_logprobs_with_long_prompts(
+        client: openai.AsyncOpenAI):
+    # Test stream with long prompt
+    prompt = "What is the capital of France?" * 400
+    stream = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True,
+        },
+        logprobs=5,
+    )
+    tokens_received = 0
+    finished = False
+    async for chunk in stream:
+        assert chunk.usage.prompt_tokens >= 0
+        assert chunk.usage.completion_tokens >= 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+        if not finished:
+            tokens_received += 1
+            assert chunk.choices[0].text
+            if chunk.choices[0].finish_reason is not None:
+                finished = True
+        if finished:
+            assert chunk.usage.completion_tokens == tokens_received
+@pytest.mark.asyncio
+async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
+        client: openai.AsyncOpenAI):
+    # Test stream with long prompt
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "What is the capital of France?" * 400
+    }]
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True,
+        },
+        logprobs=True,
+        top_logprobs=5,
+    )
+    tokens_received = 0
+    empty_chunks_received = 0
+    finished = False
+    async for chunk in stream:
+        assert chunk.usage.prompt_tokens >= 0
+        assert chunk.usage.completion_tokens >= 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+        if not finished:
+            if chunk.choices[0].delta.content == "":
+                # when there is no tokens generated
+                assert chunk.usage.completion_tokens == 0
+                assert chunk.choices[0].logprobs is None
+                empty_chunks_received += 1
+            else:
+                tokens_received += 1
+            if chunk.choices[0].finish_reason is not None:
+                finished = True
+        if finished:
+            assert chunk.usage.completion_tokens == tokens_received
+    assert empty_chunks_received <= 1
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
+import json
+import pytest
+from vllm.entrypoints.openai.cli_args import (make_arg_parser,
+                                              validate_parsed_serve_args)
+from vllm.entrypoints.openai.serving_engine import LoRAModulePath
+from vllm.utils import FlexibleArgumentParser
+from ...utils import VLLM_PATH
+LORA_MODULE = {
+    "name": "module2",
+    "path": "/path/to/module2",
+    "base_model_name": "llama"
+}
+CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
+assert CHATML_JINJA_PATH.exists()
+@pytest.fixture
+def serve_parser():
+    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
+    return make_arg_parser(parser)
+### Tests for Lora module parsing
+def test_valid_key_value_format(serve_parser):
+    # Test old format: name=path
+    args = serve_parser.parse_args([
+        '--lora-modules',
+        'module1=/path/to/module1',
+    ])
+    expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
+    assert args.lora_modules == expected
+def test_valid_json_format(serve_parser):
+    # Test valid JSON format input
+    args = serve_parser.parse_args([
+        '--lora-modules',
+        json.dumps(LORA_MODULE),
+    ])
+    expected = [
+        LoRAModulePath(name='module2',
+                       path='/path/to/module2',
+                       base_model_name='llama')
+    ]
+    assert args.lora_modules == expected
+def test_invalid_json_format(serve_parser):
+    # Test invalid JSON format input, missing closing brace
+    with pytest.raises(SystemExit):
+        serve_parser.parse_args([
+            '--lora-modules', '{"name": "module3", "path": "/path/to/module3"'
+        ])
+def test_invalid_type_error(serve_parser):
+    # Test type error when values are not JSON or key=value
+    with pytest.raises(SystemExit):
+        serve_parser.parse_args([
+            '--lora-modules',
+            'invalid_format'  # This is not JSON or key=value format
+        ])
+def test_invalid_json_field(serve_parser):
+    # Test valid JSON format but missing required fields
+    with pytest.raises(SystemExit):
+        serve_parser.parse_args([
+            '--lora-modules',
+            '{"name": "module4"}'  # Missing required 'path' field
+        ])
+def test_empty_values(serve_parser):
+    # Test when no LoRA modules are provided
+    args = serve_parser.parse_args(['--lora-modules', ''])
+    assert args.lora_modules == []
+def test_multiple_valid_inputs(serve_parser):
+    # Test multiple valid inputs (both old and JSON format)
+    args = serve_parser.parse_args([
+        '--lora-modules',
+        'module1=/path/to/module1',
+        json.dumps(LORA_MODULE),
+    ])
+    expected = [
+        LoRAModulePath(name='module1', path='/path/to/module1'),
+        LoRAModulePath(name='module2',
+                       path='/path/to/module2',
+                       base_model_name='llama')
+    ]
+    assert args.lora_modules == expected
+### Tests for serve argument validation that run prior to loading
+def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
+    """Ensure validation fails if tool choice is enabled with no call parser"""
+    # If we enable-auto-tool-choice, explode with no tool-call-parser
+    args = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
+    with pytest.raises(TypeError):
+        validate_parsed_serve_args(args)
+def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
+    """Ensure validation passes with tool choice enabled with a call parser"""
+    args = serve_parser.parse_args(args=[
+        "--enable-auto-tool-choice",
+        "--tool-call-parser",
+        "mistral",
+    ])
+    validate_parsed_serve_args(args)
+def test_chat_template_validation_for_happy_paths(serve_parser):
+    """Ensure validation passes if the chat template exists"""
+    args = serve_parser.parse_args(
+        args=["--chat-template",
+              CHATML_JINJA_PATH.absolute().as_posix()])
+    validate_parsed_serve_args(args)
+def test_chat_template_validation_for_sad_paths(serve_parser):
+    """Ensure validation fails if the chat template doesn't exist"""
+    args = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
+    with pytest.raises(ValueError):
+        validate_parsed_serve_args(args)
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -3,11 +3,12 @@ import json
 import re
 import shutil
 from tempfile import TemporaryDirectory
-from typing import List
+from typing import Dict, List, Optional
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
+import pytest_asyncio
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
@@ -87,15 +88,19 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
    ]
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="module",
-def server(default_server_args):
+                params=["", "--disable-frontend-multiprocessing"])
+def server(default_server_args, request):
+    if request.param:
+        default_server_args.append(request.param)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio
@@ -132,6 +137,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
        temperature=0.0,
    )
    assert len(completion.choices[0].text) >= 1
+    assert completion.choices[0].prompt_logprobs is None
 @pytest.mark.asyncio
@@ -269,6 +275,37 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
    assert len(completion.choices[0].text) >= 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1),
+                                                         (MODEL_NAME, 0),
+                                                         (MODEL_NAME, 1),
+                                                         (MODEL_NAME, None)])
+async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
+                                          model_name: str,
+                                          prompt_logprobs: Optional[int]):
+    params: Dict = {
+        "prompt": ["A robot may not injure another robot", "My name is"],
+        "model": model_name,
+    }
+    if prompt_logprobs is not None:
+        params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
+    if prompt_logprobs is not None and prompt_logprobs < 0:
+        with pytest.raises(BadRequestError):
+            await client.completions.create(**params)
+    else:
+        completion = await client.completions.create(**params)
+        if prompt_logprobs is not None:
+            assert completion.choices[0].prompt_logprobs is not None
+            assert len(completion.choices[0].prompt_logprobs) > 0
+            assert completion.choices[1].prompt_logprobs is not None
+            assert len(completion.choices[1].prompt_logprobs) > 0
+        else:
+            assert completion.choices[0].prompt_logprobs is None
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
    "model_name",
@@ -466,8 +503,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
            max_tokens=5,
            temperature=0.0,
            extra_body=dict(
-                # NOTE: this has to be true for n > 1 in vLLM, but not necessary
+                # NOTE: this has to be true for n > 1 in vLLM, but
-                # for official client.
+                # not necessary for official client.
                use_beam_search=True),
        )
        assert len(batch.choices) == 4

--- a/tests/entrypoints/openai/test_disable_mp.py
+++ b/tests/entrypoints/openai/test_disable_mp.py
-"""
-Repeat of tests in test_completion.py with the non-mp backend.
-"""
-# imports for guided decoding tests
-import json
-import re
-import shutil
-from tempfile import TemporaryDirectory
-from typing import List
-import jsonschema
-import openai  # use the official client for correctness check
-import pytest
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
-from openai import BadRequestError
-from transformers import AutoTokenizer
-from vllm.transformers_utils.tokenizer import get_tokenizer
-from ...utils import RemoteOpenAIServer
-# any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically these adapters use a different base model,
-# but we're not testing generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
-PA_NAME = "swapnilbp/llama_tweet_ptune"
-# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
-# need to change to match the prompt adapter
-PA_NUM_VIRTUAL_TOKENS = 8
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-@pytest.fixture(scope="module")
-def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    tmp_dir = TemporaryDirectory()
-    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    # Copy tokenizer to adapter and add some unique tokens
-    # 32000, 32001, 32002
-    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-                                 special_tokens=True)
-    assert added == 3
-    tokenizer.save_pretrained(tmp_model_dir)
-    yield tmp_model_dir
-    tmp_dir.cleanup()
-@pytest.fixture(scope="module")
-def zephyr_pa_files():
-    return snapshot_download(repo_id=PA_NAME)
-@pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
-                        zephyr_pa_files):
-    return [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--max-num-seqs",
-        "128",
-        "--enforce-eager",
-        # lora config
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        # pa config
-        "--enable-prompt-adapter",
-        "--prompt-adapters",
-        f"zephyr-pa={zephyr_pa_files}",
-        f"zephyr-pa2={zephyr_pa_files}",
-        "--max-prompt-adapters",
-        "2",
-        "--max-prompt-adapter-token",
-        "128",
-        "--disable-frontend-multiprocessing"
-    ]
-@pytest.fixture(scope="module")
-def server(default_server_args):
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
-        yield remote_server
-@pytest.fixture(scope="module")
-def client(server):
-    return server.get_async_client()
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras, then test prompt adapters
-    "model_name,num_virtual_tokens",
-    [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
-     ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
-     ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
-)
-async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
-                                 num_virtual_tokens: int):
-    completion = await client.completions.create(model=model_name,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
-    assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 1
-    choice = completion.choices[0]
-    assert len(choice.text) >= 5
-    assert choice.finish_reason == "length"
-    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5,
-        prompt_tokens=6 + num_virtual_tokens,
-        total_tokens=11 + num_virtual_tokens)
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 1
-@pytest.mark.asyncio
-async def test_added_lora_tokens(client: openai.AsyncOpenAI):
-    # test using token IDs
-    completion = await client.completions.create(
-        model="zephyr-lora2",
-        prompt=[0, 0, 32000, 32001, 32002],
-        echo=True,
-        max_tokens=5,
-        temperature=0.0,
-    )
-    # Added tokens should appear in tokenized prompt
-    assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")
-@pytest.mark.asyncio
-async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=[0, 0, 32000, 32001, 32002],
-        echo=True,
-        max_tokens=5,
-        temperature=0.0,
-    )
-    # Added tokens should not appear in tokenized prompt
-    assert "vllm" not in completion.choices[0].text
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras, then test prompt adapters
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
-)
-async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=None,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is None
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # just test 1 lora and 1 pa hereafter
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
-)
-async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=0,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.token_logprobs is not None
-    assert choice.logprobs.top_logprobs is not None
-    assert len(choice.logprobs.top_logprobs[0]) == 1
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
-)
-async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=5,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.token_logprobs is not None
-    assert choice.logprobs.top_logprobs is not None
-    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
-)
-async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
-                                            model_name: str):
-    with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        await client.completions.create(
-            model=model_name,
-            prompt=[0, 0, 0, 0, 0],
-            max_tokens=5,
-            temperature=0.0,
-            # vLLM has higher default max_logprobs (20 instead of 5) to support
-            # both Completion API and Chat Completion API
-            logprobs=21,
-        )
-        ...
-    with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        stream = await client.completions.create(
-            model=model_name,
-            prompt=[0, 0, 0, 0, 0],
-            max_tokens=5,
-            temperature=0.0,
-            # vLLM has higher default max_logprobs (20 instead of 5) to support
-            # both Completion API and Chat Completion API
-            logprobs=30,
-            stream=True,
-        )
-        async for chunk in stream:
-            ...
-    # the server should still work afterwards
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 0
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
-)
-async def test_completion_streaming(client: openai.AsyncOpenAI,
-                                    model_name: str):
-    prompt = "What is an LLM?"
-    single_completion = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-    )
-    single_output = single_completion.choices[0].text
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True)
-    chunks: List[str] = []
-    finish_reason_count = 0
-    async for chunk in stream:
-        chunks.append(chunk.choices[0].text)
-        if chunk.choices[0].finish_reason is not None:
-            finish_reason_count += 1
-    # finish reason should only return in last block
-    assert finish_reason_count == 1
-    assert chunk.choices[0].finish_reason == "length"
-    assert chunk.choices[0].text
-    assert "".join(chunks) == single_output
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
-)
-async def test_completion_stream_options(client: openai.AsyncOpenAI,
-                                         model_name: str):
-    prompt = "What is the capital of France?"
-    # Test stream=True, stream_options=
-    #     {"include_usage": False, "continuous_usage_stats": False}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": False,
-                                                 "continuous_usage_stats":
-                                                 False,
-                                             })
-    async for chunk in stream:
-        assert chunk.usage is None
-    # Test stream=True, stream_options=
-    #     {"include_usage": False, "continuous_usage_stats": True}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": False,
-                                                 "continuous_usage_stats":
-                                                 True,
-                                             })
-    async for chunk in stream:
-        assert chunk.usage is None
-    # Test stream=True, stream_options=
-    #     {"include_usage": True, "continuous_usage_stats": False}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": True,
-                                                 "continuous_usage_stats":
-                                                 False,
-                                             })
-    async for chunk in stream:
-        if chunk.choices[0].finish_reason is None:
-            assert chunk.usage is None
-        else:
-            assert chunk.usage is None
-            final_chunk = await stream.__anext__()
-            assert final_chunk.usage is not None
-            assert final_chunk.usage.prompt_tokens > 0
-            assert final_chunk.usage.completion_tokens > 0
-            assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
-            assert final_chunk.choices == []
-    # Test stream=True, stream_options=
-    #     {"include_usage": True, "continuous_usage_stats": True}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": True,
-                                                 "continuous_usage_stats":
-                                                 True,
-                                             })
-    async for chunk in stream:
-        assert chunk.usage is not None
-        assert chunk.usage.prompt_tokens > 0
-        assert chunk.usage.completion_tokens > 0
-        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                            chunk.usage.completion_tokens)
-        if chunk.choices[0].finish_reason is not None:
-            final_chunk = await stream.__anext__()
-            assert final_chunk.usage is not None
-            assert final_chunk.usage.prompt_tokens > 0
-            assert final_chunk.usage.completion_tokens > 0
-            assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
-            assert final_chunk.choices == []
-    # Test stream=False, stream_options=
-    #     {"include_usage": None}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": None})
-    # Test stream=False, stream_options=
-    #    {"include_usage": True}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": True})
-    # Test stream=False, stream_options=
-    #     {"continuous_usage_stats": None}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"continuous_usage_stats": None})
-    # Test stream=False, stream_options=
-    #    {"continuous_usage_stats": True}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"continuous_usage_stats": True})
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
-)
-async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
-    # test both text and token IDs
-    for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2):
-        # test simple list
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            max_tokens=5,
-            temperature=0.0,
-        )
-        assert len(batch.choices) == 2
-        assert batch.choices[0].text == batch.choices[1].text
-        # test n = 2
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            n=2,
-            max_tokens=5,
-            temperature=0.0,
-            extra_body=dict(
-                # NOTE: this has to be true for n > 1 in vLLM, but not necessary
-                # for official client.
-                use_beam_search=True),
-        )
-        assert len(batch.choices) == 4
-        assert batch.choices[0].text != batch.choices[
-            1].text, "beam search should be different"
-        assert batch.choices[0].text == batch.choices[
-            2].text, "two copies of the same prompt should be the same"
-        assert batch.choices[1].text == batch.choices[
-            3].text, "two copies of the same prompt should be the same"
-        # test streaming
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            max_tokens=5,
-            temperature=0.0,
-            stream=True,
-        )
-        texts = [""] * 2
-        async for chunk in batch:
-            assert len(chunk.choices) == 1
-            choice = chunk.choices[0]
-            texts[choice.index] += choice.text
-        assert texts[0] == texts[1]
-@pytest.mark.asyncio
-async def test_logits_bias(client: openai.AsyncOpenAI):
-    prompt = "Hello, my name is"
-    max_tokens = 5
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    # Test exclusive selection
-    token_id = 1000
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        logit_bias={str(token_id): 100},
-        seed=42,
-    )
-    assert len(completion.choices[0].text) >= 5
-    response_tokens = tokenizer(completion.choices[0].text,
-                                add_special_tokens=False)["input_ids"]
-    expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
-                                add_special_tokens=False)["input_ids"]
-    assert all([
-        response == expected
-        for response, expected in zip(response_tokens, expected_tokens)
-    ])
-    # Test ban
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-    )
-    response_tokens = tokenizer(completion.choices[0].text,
-                                add_special_tokens=False)["input_ids"]
-    first_response = completion.choices[0].text
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        logit_bias={str(token): -100
-                    for token in response_tokens},
-    )
-    assert first_response != completion.choices[0].text
-@pytest.mark.asyncio
-async def test_allowed_token_ids(client: openai.AsyncOpenAI):
-    prompt = "Hello, my name is"
-    max_tokens = 1
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    # Test exclusive selection
-    allowed_ids = [21555, 21557, 21558]
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        seed=42,
-        extra_body=dict(allowed_token_ids=allowed_ids),
-        logprobs=1,
-    )
-    response_tokens = completion.choices[0].logprobs.tokens
-    assert len(response_tokens) == 1
-    assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_json_completion(client: openai.AsyncOpenAI,
-                                      guided_decoding_backend: str,
-                                      sample_json_schema):
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=f"Give an example JSON for an employee profile "
-        f"that fits this schema: {sample_json_schema}",
-        n=3,
-        temperature=1.0,
-        max_tokens=500,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
-    assert completion.id is not None
-    assert len(completion.choices) == 3
-    for i in range(3):
-        output_json = json.loads(completion.choices[i].text)
-        jsonschema.validate(instance=output_json, schema=sample_json_schema)
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_regex_completion(client: openai.AsyncOpenAI,
-                                       guided_decoding_backend: str,
-                                       sample_regex):
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
-        n=3,
-        temperature=1.0,
-        max_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
-    assert completion.id is not None
-    assert len(completion.choices) == 3
-    for i in range(3):
-        assert re.fullmatch(sample_regex,
-                            completion.choices[i].text) is not None
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_completion(client: openai.AsyncOpenAI,
-                                        guided_decoding_backend: str,
-                                        sample_guided_choice):
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt="The best language for type-safe systems programming is ",
-        n=2,
-        temperature=1.0,
-        max_tokens=10,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
-    assert completion.id is not None
-    assert len(completion.choices) == 2
-    for i in range(2):
-        assert completion.choices[i].text in sample_guided_choice
-@pytest.mark.asyncio
-async def test_guided_grammar(client: openai.AsyncOpenAI,
-                              sample_sql_statements):
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=("Generate a sql state that select col_1 from "
-                "table_1 where it is equals to 1"),
-        temperature=1.0,
-        max_tokens=500,
-        extra_body=dict(guided_grammar=sample_sql_statements))
-    content = completion.choices[0].text
-    # use Lark to parse the output, and make sure it's a valid parse tree
-    from lark import Lark
-    parser = Lark(sample_sql_statements)
-    parser.parse(content)
-    # remove spaces for comparison b/c we removed them in the grammar
-    ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
-    assert content.strip() == ground_truth
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
-)
-@pytest.mark.parametrize("logprobs_arg", [1, 0])
-async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
-                                       model_name: str, logprobs_arg: int):
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    # test using text and token IDs
-    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
-        completion = await client.completions.create(model=model_name,
-                                                     prompt=prompt,
-                                                     max_tokens=5,
-                                                     temperature=0.0,
-                                                     echo=True,
-                                                     logprobs=logprobs_arg)
-        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
-                                                             list) else prompt
-        assert re.search(r"^" + prompt_text, completion.choices[0].text)
-        logprobs = completion.choices[0].logprobs
-        assert logprobs is not None
-        assert len(logprobs.text_offset) > 5
-        assert (len(logprobs.token_logprobs) > 5
-                and logprobs.token_logprobs[0] is None)
-        assert (len(logprobs.top_logprobs) > 5
-                and logprobs.top_logprobs[0] is None)
-        for top_logprobs in logprobs.top_logprobs[1:]:
-            assert max(logprobs_arg,
-                       1) <= len(top_logprobs) <= logprobs_arg + 1
-        assert len(logprobs.tokens) > 5
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
-                                          guided_decoding_backend: str,
-                                          sample_json_schema, sample_regex):
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example JSON that fits this schema: 42",
-            extra_body=dict(guided_json=42,
-                            guided_decoding_backend=guided_decoding_backend))
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example string that fits this regex",
-            extra_body=dict(guided_regex=sample_regex,
-                            guided_json=sample_json_schema))
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -3,6 +3,7 @@ import base64
 import numpy as np
 import openai
 import pytest
+import pytest_asyncio
 from ...utils import RemoteOpenAIServer
@@ -24,10 +25,10 @@ def embedding_server():
        yield remote_server
-@pytest.mark.asyncio
+@pytest_asyncio.fixture
-@pytest.fixture(scope="module")
+async def embedding_client(embedding_server):
-def embedding_client(embedding_server):
+    async with embedding_server.get_async_client() as async_client:
-    return embedding_server.get_async_client()
+        yield async_client
 @pytest.mark.asyncio
@@ -128,9 +129,79 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
    for data in responses_base64.data:
        decoded_responses_base64_data.append(
            np.frombuffer(base64.b64decode(data.embedding),
-                          dtype="float").tolist())
+                          dtype="float32").tolist())
    assert responses_float.data[0].embedding == decoded_responses_base64_data[
        0]
    assert responses_float.data[1].embedding == decoded_responses_base64_data[
        1]
+    # Default response is float32 decoded from base64 by OpenAI Client
+    responses_default = await embedding_client.embeddings.create(
+        input=input_texts, model=model_name)
+    assert responses_float.data[0].embedding == responses_default.data[
+        0].embedding
+    assert responses_float.data[1].embedding == responses_default.data[
+        1].embedding
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [EMBEDDING_MODEL_NAME],
+)
+async def test_single_embedding_truncation(
+        embedding_client: openai.AsyncOpenAI, model_name: str):
+    input_texts = [
+        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
+    ]
+    # test single embedding
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        extra_body={"truncate_prompt_tokens": 10})
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 10
+    assert embeddings.usage.total_tokens == 10
+    input_tokens = [
+        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
+        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
+    ]
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_tokens,
+        extra_body={"truncate_prompt_tokens": 10})
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 10
+    assert embeddings.usage.total_tokens == 10
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [EMBEDDING_MODEL_NAME],
+)
+async def test_single_embedding_truncation_invalid(
+        embedding_client: openai.AsyncOpenAI, model_name: str):
+    input_texts = [
+        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
+    ]
+    with pytest.raises(openai.BadRequestError):
+        embeddings = await embedding_client.embeddings.create(
+            model=model_name,
+            input=input_texts,
+            extra_body={"truncate_prompt_tokens": 8193})
+        assert "error" in embeddings.object
+        assert "truncate_prompt_tokens value is greater than max_model_len. "\
+               "Please, select a smaller truncation size." in embeddings.message
--- a/tests/entrypoints/openai/test_encoder_decoder.py
+++ b/tests/entrypoints/openai/test_encoder_decoder.py
+import openai
+import pytest
+import pytest_asyncio
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "facebook/bart-base"
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--enforce-eager",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
+    completion = await client.completions.create(model=model_name,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    choice = completion.choices[0]
+    assert len(choice.text) >= 5
+    assert choice.finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=2, total_tokens=7)
+    # test using token IDs
+    completion = await client.completions.create(
+        model=model_name,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert len(completion.choices[0].text) >= 1
--- a/tests/entrypoints/openai/test_lora_lineage.py
+++ b/tests/entrypoints/openai/test_lora_lineage.py
+import json
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+from ...utils import RemoteOpenAIServer
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# generation quality here
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+@pytest.fixture(scope="module")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+@pytest.fixture(scope="module")
+def server_with_lora_modules_json(zephyr_lora_files):
+    # Define the json format LoRA module configurations
+    lora_module_1 = {
+        "name": "zephyr-lora",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME
+    }
+    lora_module_2 = {
+        "name": "zephyr-lora2",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME
+    }
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--lora-modules",
+        json.dumps(lora_module_1),
+        json.dumps(lora_module_2),
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "64",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client_for_lora_lineage(server_with_lora_modules_json):
+    async with server_with_lora_modules_json.get_async_client(
+    ) as async_client:
+        yield async_client
+@pytest.mark.asyncio
+async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
+                                  zephyr_lora_files):
+    models = await client_for_lora_lineage.models.list()
+    models = models.data
+    served_model = models[0]
+    lora_models = models[1:]
+    assert served_model.id == MODEL_NAME
+    assert served_model.root == MODEL_NAME
+    assert served_model.parent is None
+    assert all(lora_model.root == zephyr_lora_files
+               for lora_model in lora_models)
+    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
+    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[1].id == "zephyr-lora2"