Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

6d2051cc · zhuwenwen · 2c7f740a · a2c71c54 · 6d2051cc · 6d2051cc
Commit 6d2051cc authored Oct 21, 2024 by zhuwenwen
20 changed files
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -27,19 +27,16 @@ def schedule_and_update_computed_tokens(scheduler):
    return metas, out
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_simple():
-def test_simple(use_v2_block_manager: bool):
    """Verify basic scheduling works."""
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-        max_num_batched_tokens,
+                                       num_seq_group,
-        num_seq_group,
+                                       max_model_len,
-        max_model_len,
+                                       enable_chunked_prefill=True)
-        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -74,8 +71,7 @@ def test_simple(use_v2_block_manager: bool):
    assert len(seq_group_meta) == num_seq_group
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_chunk():
-def test_chunk(use_v2_block_manager: bool):
    """Verify prefills are chunked properly."""
    block_size = 4
    max_seqs = 60
@@ -86,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 32
    cache_config.num_gpu_blocks = 32
@@ -124,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool):
    assert out.num_batched_tokens == 57
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_complex():
-def test_complex(use_v2_block_manager: bool):
    block_size = 4
    max_seqs = 60
    max_model_len = 80
@@ -135,7 +130,7 @@ def test_complex(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 64
    cache_config.num_gpu_blocks = 64
@@ -194,8 +189,7 @@ def test_complex(use_v2_block_manager: bool):
    assert running[2].is_prefill()
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_maximal_decoding():
-def test_maximal_decoding(use_v2_block_manager: bool):
    """Verify decoding requests are prioritized."""
    block_size = 4
    max_seqs = 2
@@ -206,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -288,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
    assert out.num_batched_tokens == 2
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prompt_limit():
-def test_prompt_limit(use_v2_block_manager: bool):
    """Verify max_num_batched_tokens < max_model_len is possible."""
    block_size = 4
    max_seqs = 32
@@ -300,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
@@ -323,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
    assert out.num_batched_tokens == 32
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prompt_limit_exceed():
-def test_prompt_limit_exceed(use_v2_block_manager: bool):
    block_size = 4
    max_seqs = 64
    max_model_len = 32
@@ -349,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
    assert out.ignored_seq_groups[0] == seq_group
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_swap():
-def test_swap(use_v2_block_manager: bool):
    """Verify swapping works with chunked prefill requests"""
    block_size = 4
    max_seqs = 30
@@ -361,7 +352,7 @@ def test_swap(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
@@ -407,8 +398,7 @@ def test_swap(use_v2_block_manager: bool):
    assert out.blocks_to_swap_out == []
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_running_prefill_prioritized_over_swap():
-def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
    block_size = 4
    max_seqs = 30
    max_model_len = 200
@@ -418,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 32
    cache_config.num_gpu_blocks = 32
@@ -501,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
    assert out.blocks_to_swap_out == []
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_chunked_prefill_preempt():
-def test_chunked_prefill_preempt(use_v2_block_manager: bool):
    """Verify preempt works with chunked prefill requests"""
    block_size = 4
    max_seqs = 30
@@ -513,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
@@ -568,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
    assert out.num_batched_tokens == max_num_batched_tokens
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_chunked_prefill_max_seqs():
-def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
    block_size = 4
    max_seqs = 2
    max_model_len = 80
@@ -579,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 128
    cache_config.num_gpu_blocks = 128
@@ -622,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
    assert not running[1].is_prefill()
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_perfix_caching():
-def test_perfix_caching(use_v2_block_manager: bool):
    """Verify allocating full blocks when prefix caching is enabled."""
    block_size = 4
    max_seqs = 10
@@ -634,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size,
                               1.0,
                               1,

--- a/tests/core/test_num_computed_tokens_update.py
+++ b/tests/core/test_num_computed_tokens_update.py
+import pytest
+from tests.conftest import VllmRunner
+from tests.core.utils import create_dummy_prompt
+from vllm.engine.llm_engine import LLMEngine
+from vllm.platforms import current_platform
+from vllm.sequence import SequenceGroup
+MODEL = "JackFram/llama-160m"
+def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
+    scheduler = engine.scheduler[0]
+    scheduler.add_seq_group(seq_group)
+@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
+@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
+@pytest.mark.parametrize("enforce_eager", [False, True])
+def test_num_computed_tokens_update(num_scheduler_steps: int,
+                                    enable_chunked_prefill: bool,
+                                    enforce_eager: bool):
+    is_multi_step = num_scheduler_steps > 1
+    is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill
+    if is_multi_step_chunked_prefill and current_platform.is_rocm():
+        pytest.skip("Multi-step with Chunked-Prefill does not support "
+                    "rocm_flash_attn backend")
+    # Make a vllm engine
+    runner = VllmRunner(model_name=MODEL,
+                        gpu_memory_utilization=0.7,
+                        num_scheduler_steps=num_scheduler_steps,
+                        enable_chunked_prefill=enable_chunked_prefill,
+                        enforce_eager=enforce_eager)
+    engine: LLMEngine = runner.model.llm_engine
+    # In multi-step + chunked-prefill there is no separate single prompt step.
+    # What is scheduled will run for num_scheduler_steps always.
+    num_prompt_steps = num_scheduler_steps \
+        if is_multi_step_chunked_prefill else 1
+    num_output_tokens_list = [4, 8, 12, 15, 16, 17]
+    # Create sequence and add to engine
+    prompt_len = 10
+    for req_idx, num_output_tokens in enumerate(num_output_tokens_list):
+        seq, seq_group = create_dummy_prompt(request_id=str(req_idx),
+                                             prompt_length=prompt_len,
+                                             min_tokens=num_output_tokens,
+                                             max_tokens=num_output_tokens)
+        add_seq_group_to_engine(engine, seq_group)
+        assert seq.data.get_num_computed_tokens() == 0
+        for _ in range(num_prompt_steps):
+            # prompt steps
+            engine.step()
+        if not seq.is_finished():
+            prompt_num_computed_tokens = seq.data.get_num_computed_tokens()
+            # Test correctness of num_computed_tokens after the prompt steps
+            assert prompt_num_computed_tokens == \
+                        prompt_len + num_prompt_steps - 1
+            decode_step_counter = 0
+            while not seq.is_finished():
+                # Test correctness of num_computed_tokens after the decode steps
+                assert seq.data.get_num_computed_tokens(
+                ) == prompt_num_computed_tokens + decode_step_counter
+                for _ in range(num_scheduler_steps):
+                    # decode step
+                    engine.step()
+                    decode_step_counter += 1
+        # Test correctness of num_computed_tokens after the sequence finish.
+        assert seq.data.get_num_computed_tokens(
+        ) == prompt_len + num_output_tokens - 1
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -3,7 +3,7 @@ from collections import deque
 from typing import List, Set, Tuple
 from unittest.mock import MagicMock
-import pytest
+import pytest  # noqa
 from torch import Use  # noqa
 from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
@@ -17,11 +17,13 @@ from .utils import (append_new_token, append_new_token_seq_group,
                    schedule_and_update_computed_tokens)
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_add_seq_group():
-def test_scheduler_add_seq_group(use_v2_block_manager: bool):
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100, 64, 1, use_v2_block_manager=use_v2_block_manager)
+        100,
+        64,
+        1,
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
    cache_config.num_cpu_blocks = 4
    cache_config.num_gpu_blocks = 4
@@ -37,11 +39,13 @@ def test_scheduler_add_seq_group(use_v2_block_manager: bool):
        assert scheduler.get_num_unfinished_seq_groups() == i + 1
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_abort_seq_group():
-def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100, 64, 1, use_v2_block_manager=use_v2_block_manager)
+        100,
+        64,
+        1,
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 4
    cache_config.num_gpu_blocks = 4
@@ -61,8 +65,7 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
    assert scheduler.get_num_unfinished_seq_groups() == 0
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_schedule_simple():
-def test_scheduler_schedule_simple(use_v2_block_manager: bool):
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
@@ -70,7 +73,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
        64,
        num_seq_group,
        max_model_len,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -105,8 +108,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
    append_new_token(out, 1)
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_prefill_prioritized():
-def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
    """Verify running batched tokens are not applied to prefill requests."""
    block_size = 4
    max_model_len = 30
@@ -115,7 +117,7 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
        max_batched_num_tokens,
        2,
        max_model_len,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
@@ -139,12 +141,14 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
    assert get_sequence_groups(out) == [seq_group_b]
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_schedule_preempt_abort():
-def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
    block_size = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(
-        64, 2, max_model_len, use_v2_block_manager=use_v2_block_manager)
+        64,
+        2,
+        max_model_len,
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 2
    cache_config.num_gpu_blocks = 2
@@ -194,8 +198,7 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
    assert scheduler.get_num_unfinished_seq_groups() == 1
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_max_seqs():
-def test_scheduler_max_seqs(use_v2_block_manager: bool):
    block_size = 4
    num_seq_group = 4
    max_seq_group = 2
@@ -204,7 +207,7 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
        64,
        max_seq_group,
        max_model_len,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -242,15 +245,14 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
    assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_delay_factor():
-def test_scheduler_delay_factor(use_v2_block_manager: bool):
    block_size = 4
    scheduler_config = SchedulerConfig(
        100,
        64,
        16,
        delay_factor=0.5,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -287,12 +289,10 @@ def test_scheduler_delay_factor(use_v2_block_manager: bool):
    append_new_token(out, 1)
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_swapped_out_prioritized():
-def test_swapped_out_prioritized(use_v2_block_manager: bool):
    block_size = 4
    scheduler = initialize_scheduler(max_num_seqs=6,
                                     block_size=block_size,
-                                     use_v2_block_manager=use_v2_block_manager,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    # best_of=2 * 3 == 6 sequences.
@@ -344,7 +344,6 @@ def initialize_scheduler(
    max_token_budget=1000,
    max_model_len=1000,
    lora_config=None,
-    use_v2_block_manager=False,
    block_size=4,
    num_cpu_blocks=8,
    num_gpu_blocks=8,
@@ -354,7 +353,7 @@ def initialize_scheduler(
        max_token_budget,
        max_num_seqs,
        max_model_len,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = num_cpu_blocks
    cache_config.num_gpu_blocks = num_gpu_blocks
@@ -379,15 +378,12 @@ def add_token_budget(budget: SchedulingBudget,
    budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_max_prompt_len():
-def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
    """
    Test prompt longer than max_prompt_len is aborted.
    """
    block_size = 4
-    scheduler = initialize_scheduler(max_model_len=30,
+    scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
-                                     use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size)
    _, seq_group = create_dummy_prompt("0",
                                       prompt_length=60,
                                       block_size=block_size)
@@ -402,14 +398,12 @@ def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
    assert len(remaining_waiting) == 0
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_token_budget():
-def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
    """
    Test token budget respected.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    budget = create_token_budget(token_budget=0)
@@ -439,8 +433,7 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
    assert len(remaining_waiting) == 1
    # Test when current_batched_tokens respected.
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_cpu_blocks=16,
                                     num_gpu_blocks=16)
    budget = create_token_budget(token_budget=60)
@@ -467,14 +460,12 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
    assert len(remaining_waiting) == 0
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_max_seqs():
-def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
    """
    Test max seq respected.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    budget = create_token_budget(max_num_seqs=2)
@@ -508,15 +499,13 @@ def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
    assert len(remaining_waiting) == 1
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_max_lora():
-def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
    """
    Test max lora is respected and prioritized.
    """
    block_size = 4
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config,
-                                     use_v2_block_manager=use_v2_block_manager,
                                     block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
@@ -563,14 +552,12 @@ def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
    assert budget.num_batched_tokens == 60
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_no_block_manager_capacity():
-def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
    """
    Test sequence cannot be scheduled due to block manager has no capacity.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_gpu_blocks=128,
                                     num_cpu_blocks=128)
    budget = create_token_budget()
@@ -607,14 +594,12 @@ def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
    assert len(remaining_waiting) == 0
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_decode_schedule_preempted():
-def test_decode_schedule_preempted(use_v2_block_manager: bool):
    """
    Test decodes cannot be scheduled and preempted.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    curr_loras = None
@@ -653,14 +638,12 @@ def test_decode_schedule_preempted(use_v2_block_manager: bool):
    assert output.blocks_to_copy == []
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_decode_swap_beam_search():
-def test_decode_swap_beam_search(use_v2_block_manager: bool):
    """
    Test best_of > 1 swap out blocks
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_gpu_blocks=64,
                                     num_cpu_blocks=64)
    curr_loras = None
@@ -709,14 +692,12 @@ def test_decode_swap_beam_search(use_v2_block_manager: bool):
    assert output.blocks_to_copy == []
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_decode_blocks_to_copy_update():
-def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
    """
    Verify blocks_to_copy is updated.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=4,
-                                     block_size=4,
                                     num_cpu_blocks=16,
                                     num_gpu_blocks=16)
    _, seq_group = create_dummy_prompt("1",
@@ -747,11 +728,9 @@ def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
    assert output.blocks_to_copy == [(2, 3)]
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_simple():
-def test_schedule_swapped_simple(use_v2_block_manager: bool):
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size)
-                                     block_size=block_size)
    curr_loras = None
    blocks_to_swap_out: List[Tuple[int, int]] = []
    _, seq_group = create_dummy_prompt("1",
@@ -778,11 +757,9 @@ def test_schedule_swapped_simple(use_v2_block_manager: bool):
    assert blocks_to_swap_out == blocks_to_swap_in_reverse
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_max_token_budget():
-def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
@@ -815,11 +792,9 @@ def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
    assert len(output.prefill_seq_groups) == 0
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_max_seqs():
-def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    curr_loras = None
@@ -852,12 +827,10 @@ def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
    assert len(output.prefill_seq_groups) == 0
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_max_loras():
-def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
    block_size = 4
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config,
-                                     use_v2_block_manager=use_v2_block_manager,
                                     block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
@@ -887,11 +860,9 @@ def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
    assert len(curr_loras) == 1
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_cannot_swap_in():
-def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
@@ -920,11 +891,9 @@ def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
    assert len(output.prefill_seq_groups) == 0
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_infeasible_swap():
-def test_infeasible_swap(use_v2_block_manager: bool):
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
@@ -954,11 +923,9 @@ def test_infeasible_swap(use_v2_block_manager: bool):
    assert len(output.prefill_seq_groups) == 0
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_blocks_to_copy():
-def test_schedule_swapped_blocks_to_copy(use_v2_block_manager: bool):
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+    scheduler = initialize_scheduler(block_size=block_size,
-                                     block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None

--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -13,9 +13,10 @@ def create_dummy_prompt(
    prompt_length: int,
    block_size: Optional[int] = None,
    lora_request: Optional[LoRARequest] = None,
-    use_beam_search: bool = False,
    best_of: int = 1,
    prompt_tokens: Optional[List[int]] = None,
+    min_tokens: int = 0,
+    max_tokens: int = 16,
 ) -> Tuple[Sequence, SequenceGroup]:
    if not block_size:
        block_size = prompt_length
@@ -35,8 +36,9 @@ def create_dummy_prompt(
                              seqs=[prompt],
                              arrival_time=time.time(),
                              sampling_params=SamplingParams(
-                                  use_beam_search=use_beam_search,
+                                  best_of=best_of,
-                                  best_of=best_of),
+                                  max_tokens=max_tokens,
+                                  min_tokens=min_tokens),
                              lora_request=lora_request)
    return prompt, seq_group
@@ -48,7 +50,6 @@ def create_dummy_prompt_encoder_decoder(
    encoder_prompt_length: int,
    block_size: Optional[int] = None,
    lora_request: Optional[LoRARequest] = None,
-    use_beam_search: bool = False,
    best_of: int = 1,
 ) -> Tuple[Sequence, Sequence, SequenceGroup]:
    if not block_size:
@@ -81,9 +82,7 @@ def create_dummy_prompt_encoder_decoder(
                              from_decoder_prompt=False)
    seq_group = SequenceGroup(request_id=request_id,
                              seqs=[decoder_prompt],
-                              sampling_params=SamplingParams(
+                              sampling_params=SamplingParams(best_of=best_of),
-                                  use_beam_search=use_beam_search,
-                                  best_of=best_of),
                              arrival_time=time.time(),
                              lora_request=lora_request,
                              encoder_seq=encoder_prompt)

--- a/tests/data/test_config.yaml
+++ b/tests/data/test_config.yaml
 port: 12312
+served_model_name: mymodel
 tensor_parallel_size: 2
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -6,10 +6,10 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 to fail.
 """
 import os
+from dataclasses import dataclass
+from typing import List, Literal, NamedTuple, Optional
 import pytest
-from packaging import version
-from transformers import __version__ as transformers_version
 from vllm.logger import init_logger
@@ -20,52 +20,253 @@ logger = init_logger("test_pipeline_parallel")
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
-@pytest.mark.parametrize(
+class ParallelSetup(NamedTuple):
-    ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
+    tp_size: int
-     "MODEL_NAME, DIST_BACKEND"),
+    pp_size: int
-    [
+    eager_mode: bool
-        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+    chunked_prefill: bool
-        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-        (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-        (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+@dataclass
-        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+class PPTestSettings:
-        (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+    parallel_setups: List[ParallelSetup]
-        (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+    distributed_backends: List[str]
-        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+    trust_remote_code: bool
-        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+    tokenizer_mode: Optional[str]
-        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-        # NOTE: InternVL2 multi-node tests are flaky,
+    @staticmethod
-        # use mp backend to skip the multi-node tests
+    def detailed(
-        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
+        *,
-        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
+        tp_base: int = 1,
-        (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
+        pp_base: int = 2,
-        (1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp")
+        trust_remote_code: bool = False,
-    ],
+        tokenizer_mode: Optional[str] = None,
-)
+    ):
-@fork_new_process_for_each_test
+        return PPTestSettings(
-def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
+            parallel_setups=[
-                    TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND):
+                ParallelSetup(tp_size=tp_base,
-    if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
+                              pp_size=pp_base,
+                              eager_mode=False,
+                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=2 * pp_base,
+                              eager_mode=False,
+                              chunked_prefill=True),
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=2 * pp_base,
+                              eager_mode=True,
+                              chunked_prefill=False),
+                ParallelSetup(tp_size=2 * tp_base,
+                              pp_size=pp_base,
+                              eager_mode=False,
+                              chunked_prefill=True),
+                ParallelSetup(tp_size=2 * tp_base,
+                              pp_size=pp_base,
+                              eager_mode=True,
+                              chunked_prefill=False),
+            ],
+            distributed_backends=["mp", "ray"],
+            trust_remote_code=trust_remote_code,
+            tokenizer_mode=tokenizer_mode,
+        )
+    @staticmethod
+    def fast(
+        *,
+        tp_base: int = 1,
+        pp_base: int = 2,
+        trust_remote_code: bool = False,
+        tokenizer_mode: Optional[str] = None,
+    ):
+        return PPTestSettings(
+            parallel_setups=[
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=pp_base,
+                              eager_mode=True,
+                              chunked_prefill=False),
+            ],
+            distributed_backends=["mp"],
+            trust_remote_code=trust_remote_code,
+            tokenizer_mode=tokenizer_mode,
+        )
+    def iter_params(self, model_name: str):
+        for parallel_setup in self.parallel_setups:
+            for distributed_backend in self.distributed_backends:
+                yield (model_name, parallel_setup, distributed_backend,
+                       self.trust_remote_code, self.tokenizer_mode)
+# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
+# memory. The values displayed here are only a rough indicator of the size of
+# the model.
+# yapf: disable
+GENERATION_MODEL_SETTINGS = {
+    # [DETAILED TESTS]
+    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
+    # [FAST TESTS]
+    # Uses Llama
+    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
+    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True),  # noqa: E501
+    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
+    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "bigscience/bloomz-1b1": PPTestSettings.fast(),
+    "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
+    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
+    "databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
+    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
+    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
+    "tiiuae/falcon-7b": PPTestSettings.fast(),
+    "google/gemma-2b": PPTestSettings.fast(),
+    "google/gemma-2-9b": PPTestSettings.fast(),
+    "gpt2": PPTestSettings.fast(),
+    "bigcode/starcoder": PPTestSettings.fast(),
+    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
+    "EleutherAI/pythia-12b": PPTestSettings.fast(),
+    "ibm/PowerLM-3b": PPTestSettings.fast(),
+    "ibm/PowerMoE-3b": PPTestSettings.fast(),
+    # Uses Llama
+    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
+    "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
+    "core42/jais-13b-chat": PPTestSettings.fast(),
+    # TODO: Implement PP
+    # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
+    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
+    "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
+    # Uses Llama
+    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
+    "mosaicml/mpt-7b": PPTestSettings.fast(),
+    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
+    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
+    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
+    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
+    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
+    "microsoft/phi-2": PPTestSettings.fast(),
+    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
+    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    # FIXME: https://github.com/vllm-project/vllm/issues/8553
+    # "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "adept/persimmon-8b-chat": PPTestSettings.fast(),
+    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
+    "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
+    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
+    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
+    "bigcode/starcoder2-3b": PPTestSettings.fast(),
+    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
+    # FIXME: Cannot load tokenizer in latest transformers version
+    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
+}
+EMBEDDING_MODEL_SETTINGS = {  # type: ignore[var-annotated]
+    # [FAST TESTS]
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True),  # noqa: E501
+}
+MULTIMODAL_MODEL_SETTINGS = {
+    # [FAST TESTS]
+    "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
+    "facebook/chameleon-7b": PPTestSettings.fast(),
+    "adept/fuyu-8b": PPTestSettings.fast(),
+    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
+    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
+    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
+    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
+    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
+    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
+    # TODO: Implement PP
+    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
+    "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
+    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
+    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
+    "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
+}
+CONDITIONAL_GENERATION_MODEL_SETTINGS = {  # type: ignore[var-annotated]
+    # [FAST TESTS]
+    # TODO: Implement PP
+    # "facebook/bart-base": PPTestSettings.fast(),
+}
+# yapf: enable
+# NOTE: You can update this on your local machine to run specific tests
+TEST_MODELS = [
+    # [LANGUAGE GENERATION]
+    "meta-llama/Meta-Llama-3-8B",
+    "ibm/PowerLM-3b",
+    # [LANGUAGE EMBEDDING]
+    "intfloat/e5-mistral-7b-instruct",
+    "BAAI/bge-multilingual-gemma2",
+    # [MULTIMODAL GENERATION]
+    "OpenGVLab/InternVL2-1B",
+    "microsoft/Phi-3-vision-128k-instruct",
+    "fixie-ai/ultravox-v0_3",
+]
+def _compare_tp(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available: int,
+    *,
+    method: Literal["generate", "encode"] = "encode",
+):
+    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
+    if num_gpus_available < tp_size * pp_size:
+        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
+    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
-    # Skip tests that require transformers>=4.45.0
+    common_args = [
-    if "Qwen2-VL" in MODEL_NAME and version.parse(
-            transformers_version) < version.parse("4.45.0.dev0"):
-        pytest.skip("This test requires transformers>=4.45.0")
-    pp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
-        "8192",
+        "2048",
+        "--max-num-seqs",
+        "8",
+    ]
+    if chunked_prefill:
+        common_args.append("--enable-chunked-prefill")
+    if eager_mode:
+        common_args.append("--enforce-eager")
+    if trust_remote_code:
+        common_args.append("--trust-remote-code")
+    if tokenizer_mode:
+        common_args.extend(["--tokenizer-mode", tokenizer_mode])
+    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
+            and chunked_prefill):
+        # Test Ray ADAG for a subset of the tests
+        pp_env = {
+            "VLLM_USE_RAY_COMPILED_DAG": "1",
+            "VLLM_USE_RAY_SPMD_WORKER": "1",
+            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
+        }
+        # Temporary workaround: currently, when zeromq + SPMD is used, it does
+        # not terminate properly because of an aDAG issue.
+        common_args.append("--disable-frontend-multiprocessing")
+    else:
+        pp_env = None
+    pp_args = [
+        *common_args,
        "--pipeline-parallel-size",
-        str(PP_SIZE),
+        str(pp_size),
        "--tensor-parallel-size",
-        str(TP_SIZE),
+        str(tp_size),
        "--distributed-executor-backend",
-        DIST_BACKEND,
+        distributed_backend,
    ]
    # compare without pipeline parallelism
@@ -74,44 +275,103 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
-        # use half precision for speed and memory savings in CI environment
+        *common_args,
-        "--dtype",
-        "float16",
-        "--max-model-len",
-        "8192",
        "--tensor-parallel-size",
-        str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
+        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]
-    if CHUNKED_PREFILL:
-        pp_args.append("--enable-chunked-prefill")
-        tp_args.append("--enable-chunked-prefill")
-    if EAGER_MODE:
-        pp_args.append("--enforce-eager")
-        tp_args.append("--enforce-eager")
-    if TRUST_REMOTE_CODE:
-        pp_args.append("--trust-remote-code")
-        tp_args.append("--trust-remote-code")
-    pp_env = None
-    if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
-            and CHUNKED_PREFILL):
-        # Test Ray ADAG for a subset of the tests
-        pp_env = {
-            "VLLM_USE_RAY_COMPILED_DAG": "1",
-            "VLLM_USE_RAY_SPMD_WORKER": "1",
-            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
-        }
-        # Temporary. Currently when zeromq + SPMD is used, it does not properly
-        # terminate because of aDAG issue.
-        pp_args.append("--disable-frontend-multiprocessing")
-        tp_args.append("--disable-frontend-multiprocessing")
    try:
-        compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
+        compare_two_settings(model_name,
+                             pp_args,
+                             tp_args,
+                             pp_env,
+                             method=method)
    except Exception:
        if pp_env is None:
            raise
        else:
            # Ray ADAG tests are flaky, so we don't want to fail the test
            logger.exception("Ray ADAG tests failed")
+@pytest.mark.parametrize(
+    ("model_name", "parallel_setup", "distributed_backend",
+     "trust_remote_code", "tokenizer_mode"),
+    [
+        params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
+        for params in settings.iter_params(model_name)
+        if model_name in TEST_MODELS
+    ],
+)
+@fork_new_process_for_each_test
+def test_tp_language_generation(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available,
+):
+    _compare_tp(model_name,
+                parallel_setup,
+                distributed_backend,
+                trust_remote_code,
+                tokenizer_mode,
+                num_gpus_available,
+                method="generate")
+@pytest.mark.parametrize(
+    ("model_name", "parallel_setup", "distributed_backend",
+     "trust_remote_code", "tokenizer_mode"),
+    [
+        params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
+        for params in settings.iter_params(model_name)
+        if model_name in TEST_MODELS
+    ],
+)
+@fork_new_process_for_each_test
+def test_tp_language_embedding(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available,
+):
+    _compare_tp(model_name,
+                parallel_setup,
+                distributed_backend,
+                trust_remote_code,
+                tokenizer_mode,
+                num_gpus_available,
+                method="encode")
+@pytest.mark.parametrize(
+    ("model_name", "parallel_setup", "distributed_backend",
+     "trust_remote_code", "tokenizer_mode"),
+    [
+        params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
+        for params in settings.iter_params(model_name)
+        if model_name in TEST_MODELS
+    ],
+)
+@fork_new_process_for_each_test
+def test_tp_multimodal_generation(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available,
+):
+    _compare_tp(model_name,
+                parallel_setup,
+                distributed_backend,
+                trust_remote_code,
+                tokenizer_mode,
+                num_gpus_available,
+                method="generate")
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -42,22 +42,42 @@ def test_bad_nullable_kvs(arg):
        nullable_kvs(arg)
-@pytest.mark.parametrize(("arg", "expected"), [
+# yapf: disable
-    (None, None),
+@pytest.mark.parametrize(("arg", "expected", "option"), [
-    ("{}", {}),
+    (None, None, "mm-processor-kwargs"),
-    ('{"num_crops": 4}', {
+    ("{}", {}, "mm-processor-kwargs"),
-        "num_crops": 4
+    (
-    }),
+        '{"num_crops": 4}',
-    ('{"foo": {"bar": "baz"}}', {
+        {
-        "foo": {
+            "num_crops": 4
-            "bar": "baz"
+        },
-        }
+        "mm-processor-kwargs"
-    }),
+    ),
+    (
+        '{"foo": {"bar": "baz"}}',
+        {
+            "foo":
+            {
+                "bar": "baz"
+            }
+        },
+        "mm-processor-kwargs"
+    ),
+    (
+        '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
+        {
+            "cast_logits_dtype": "bfloat16",
+            "sequence_parallel_norm": True,
+            "sequence_parallel_norm_threshold": 2048,
+        },
+        "override-neuron-config"
+    ),
 ])
-def test_mm_processor_kwargs_prompt_parser(arg, expected):
+# yapf: enable
+def test_composite_arg_parser(arg, expected, option):
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    if arg is None:
        args = parser.parse_args([])
    else:
-        args = parser.parse_args(["--mm-processor-kwargs", arg])
+        args = parser.parse_args([f"--{option}", arg])
-    assert args.mm_processor_kwargs == expected
+    assert getattr(args, option.replace("-", "_")) == expected
--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-def test_custom_executor(model, tmpdir):
+def test_custom_executor(model, tmp_path):
    cwd = os.path.abspath(".")
-    os.chdir(tmpdir)
+    os.chdir(tmp_path)
    try:
        assert not os.path.exists(".marker")
@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-def test_custom_executor_async(model, tmpdir):
+def test_custom_executor_async(model, tmp_path):
    cwd = os.path.abspath(".")
-    os.chdir(tmpdir)
+    os.chdir(tmp_path)
    try:
        assert not os.path.exists(".marker")

--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -49,21 +49,6 @@ def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
    assert [o.outputs for o in o1] == [o.outputs for o in o2]
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize('prompt', PROMPTS)
-def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
-    pooling_params = PoolingParams()
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.encode(prompts=prompt, pooling_params=pooling_params)
-    v2_output = llm.encode(prompt, pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.encode({"prompt": prompt}, pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
 def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
@@ -79,25 +64,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
    assert_outputs_equal(v1_output, v2_output)
-@pytest.mark.skip_global_cleanup
-def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
-    pooling_params = PoolingParams()
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.encode(prompts=PROMPTS, pooling_params=pooling_params)
-    v2_output = llm.encode(PROMPTS, pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.encode(
-        [{
-            "prompt": p
-        } for p in PROMPTS],
-        pooling_params=pooling_params,
-    )
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
    pooling_params = PoolingParams()

--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -47,23 +47,6 @@ def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
    assert [o.outputs for o in o1] == [o.outputs for o in o2]
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize('prompt', PROMPTS)
-def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
-    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.generate(prompts=prompt,
-                                 sampling_params=sampling_params)
-    v2_output = llm.generate(prompt, sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.generate({"prompt": prompt},
-                             sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
 def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
@@ -79,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
    assert_outputs_equal(v1_output, v2_output)
-@pytest.mark.skip_global_cleanup
-def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
-    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
-    with pytest.warns(DeprecationWarning, match="'prompts'"):
-        v1_output = llm.generate(prompts=PROMPTS,
-                                 sampling_params=sampling_params)
-    v2_output = llm.generate(PROMPTS, sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
-    v2_output = llm.generate(
-        [{
-            "prompt": p
-        } for p in PROMPTS],
-        sampling_params=sampling_params,
-    )
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -7,7 +7,7 @@ import pytest
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from ...conftest import cleanup
@@ -31,14 +31,12 @@ def test_guided_regex(sample_regex, llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-    )
+        guided_decoding=GuidedDecodingParams(regex=sample_regex))
-    outputs = llm.generate(
+    outputs = llm.generate(prompts=[
-        prompts=[
+        f"Give an example IPv4 address with this regex: {sample_regex}"
-            f"Give an example IPv4 address with this regex: {sample_regex}"
+    ] * 2,
-        ] * 2,
+                           sampling_params=sampling_params,
-        sampling_params=sampling_params,
+                           use_tqdm=True)
-        use_tqdm=True,
-        guided_options_request=dict(guided_regex=sample_regex))
    assert outputs is not None
    for output in outputs:
@@ -57,15 +55,13 @@ def test_guided_json_completion(sample_json_schema, llm):
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
-    )
+        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
-    outputs = llm.generate(
+    outputs = llm.generate(prompts=[
-        prompts=[
+        f"Give an example JSON for an employee profile "
-            f"Give an example JSON for an employee profile "
+        f"that fits this schema: {sample_json_schema}"
-            f"that fits this schema: {sample_json_schema}"
+    ] * 2,
-        ] * 2,
+                           sampling_params=sampling_params,
-        sampling_params=sampling_params,
+                           use_tqdm=True)
-        use_tqdm=True,
-        guided_options_request=dict(guided_json=sample_json_schema))
    assert outputs is not None
@@ -86,12 +82,11 @@ def test_guided_choice_completion(sample_guided_choice, llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-    )
+        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
    outputs = llm.generate(
        prompts="The best language for type-safe systems programming is ",
        sampling_params=sampling_params,
-        use_tqdm=True,
+        use_tqdm=True)
-        guided_options_request=dict(guided_choice=sample_guided_choice))
    assert outputs is not None
    for output in outputs:
@@ -112,13 +107,13 @@ def test_guided_grammar(sample_sql_statements, llm):
        temperature=0.8,
        top_p=0.95,
        max_tokens=1000,
-    )
+        guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
    outputs = llm.generate(
        prompts=("Generate a sql state that select col_1 from "
                 "table_1 where it is equals to 1"),
        sampling_params=sampling_params,
        use_tqdm=True,
-        guided_options_request=dict(guided_grammar=sample_sql_statements))
+    )
    assert outputs is not None
    for output in outputs:
@@ -140,3 +135,28 @@ def test_guided_grammar(sample_sql_statements, llm):
        assert generated_text.strip() == ground_truth
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+@pytest.mark.skip_global_cleanup
+def test_guided_options_request_deprecation_warning(sample_regex, llm):
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    with pytest.warns(DeprecationWarning, match="guided_options_request"):
+        llm.generate(prompts="This should fail",
+                     sampling_params=sampling_params,
+                     use_tqdm=True,
+                     guided_options_request=dict(guided_regex=sample_regex))
+@pytest.mark.skip_global_cleanup
+def test_validation_against_both_guided_decoding_options(sample_regex, llm):
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        guided_decoding=GuidedDecodingParams(regex=sample_regex))
+    with pytest.raises(ValueError, match="Cannot set both"):
+        llm.generate(prompts="This should fail",
+                     sampling_params=sampling_params,
+                     use_tqdm=True,
+                     guided_options_request=dict(guided_regex=sample_regex))
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -21,7 +21,9 @@ def server():
        "--dtype",
        "bfloat16",
        "--max-model-len",
-        "4096",
+        "2048",
+        "--max-num-seqs",
+        "5",
        "--enforce-eager",
    ]

--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
 from http import HTTPStatus
+from typing import List
 import openai
 import pytest
@@ -12,8 +13,44 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+@pytest.fixture(scope='module')
+def server_args(request: pytest.FixtureRequest) -> List[str]:
+    """Provide extra arguments to the server via indirect parametrization.
+    Usage:
+    >>> @pytest.mark.parametrize(
+    >>>     "server_args",
+    >>>     [
+    >>>         ["--disable-frontend-multiprocessing"],
+    >>>         [
+    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
+    >>>             "--enable-auto-tool-choice",
+    >>>         ],
+    >>>     ],
+    >>>     indirect=True,
+    >>> )
+    >>> def test_foo(server, client):
+    >>>     ...
+    This will run `test_foo` twice, once against a server started with each of:
+    - `--disable-frontend-multiprocessing`
+    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
+    """
+    if not hasattr(request, "param"):
+        return []
+    val = request.param
+    if isinstance(val, str):
+        return [val]
+    return request.param
 @pytest.fixture(scope="module")
-def server():
+def server(server_args):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
@@ -23,6 +60,7 @@ def server():
        "--enforce-eager",
        "--max-num-seqs",
        "128",
+        *server_args,
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -35,6 +73,15 @@ async def client(server):
        yield async_client
+@pytest.mark.parametrize(
+    "server_args",
+    [
+        pytest.param([], id="default-frontend-multiprocessing"),
+        pytest.param(["--disable-frontend-multiprocessing"],
+                     id="disable-frontend-multiprocessing")
+    ],
+    indirect=True,
+)
 @pytest.mark.asyncio
 async def test_show_version(client: openai.AsyncOpenAI):
    base_url = str(client.base_url)[:-3].strip("/")
@@ -45,6 +92,15 @@ async def test_show_version(client: openai.AsyncOpenAI):
    assert response.json() == {"version": VLLM_VERSION}
+@pytest.mark.parametrize(
+    "server_args",
+    [
+        pytest.param([], id="default-frontend-multiprocessing"),
+        pytest.param(["--disable-frontend-multiprocessing"],
+                     id="disable-frontend-multiprocessing")
+    ],
+    indirect=True,
+)
 @pytest.mark.asyncio
 async def test_check_health(client: openai.AsyncOpenAI):
    base_url = str(client.base_url)[:-3].strip("/")

--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -433,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
        model=model_name,
        messages=messages,
        max_tokens=10,
+        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
-            "continuous_usage_stats": True
+            "continuous_usage_stats": True,
        },
    )
+    last_completion_tokens = 0
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
-        assert chunk.usage.completion_tokens >= 0
+        assert last_completion_tokens == 0 or \
+               chunk.usage.completion_tokens > last_completion_tokens or \
+               (
+                   not chunk.choices and
+                   chunk.usage.completion_tokens == last_completion_tokens
+               )
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
+        last_completion_tokens = chunk.usage.completion_tokens
+    assert last_completion_tokens == 10
 # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`

--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -12,7 +12,7 @@ assert chatml_jinja_path.exists()
 # Define models, templates, and their corresponding expected outputs
 MODEL_TEMPLATE_GENERATON_OUTPUT = [
-    ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
@@ -20,12 +20,20 @@ Hi there!<|im_end|>
 What is the capital of<|im_end|>
 <|im_start|>assistant
 """),
-    ("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, False, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
 <|im_start|>user
-What is the capital of""")
+What is the capital of"""),
+    ("facebook/opt-125m", chatml_jinja_path, False, True, """<|im_start|>user
+Hello<|im_end|>
+<|im_start|>assistant
+Hi there!<|im_end|>
+<|im_start|>user
+What is the capital of<|im_end|>
+<|im_start|>assistant
+The capital of"""),
 ]
 TEST_MESSAGES = [
@@ -42,6 +50,10 @@ TEST_MESSAGES = [
        'content': 'What is the capital of'
    },
 ]
+ASSISTANT_MESSAGE_TO_CONTINUE = {
+    'role': 'assistant',
+    'content': 'The capital of'
+}
 def test_load_chat_template():
@@ -73,10 +85,10 @@ def test_no_load_chat_template_literallike():
 @pytest.mark.parametrize(
-    "model,template,add_generation_prompt,expected_output",
+    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATON_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
-                        expected_output):
+                        continue_final_message, expected_output):
    # Initialize the tokenizer
    tokenizer = get_tokenizer(tokenizer_name=model)
    template_content = load_chat_template(chat_template=template)
@@ -84,8 +96,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(
        model=model,
-        messages=TEST_MESSAGES,
+        messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
-        add_generation_prompt=add_generation_prompt)
+        if continue_final_message else TEST_MESSAGES,
+        add_generation_prompt=add_generation_prompt,
+        continue_final_message=continue_final_message,
+    )
    # Call the function and get the result
    result = apply_hf_chat_template(
@@ -93,6 +108,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
        conversation=mock_request.messages,
        chat_template=mock_request.chat_template or template_content,
        add_generation_prompt=mock_request.add_generation_prompt,
+        continue_final_message=mock_request.continue_final_message,
    )
    # Test assertion

--- a/tests/entrypoints/openai/test_chunked_prompt.py
+++ b/tests/entrypoints/openai/test_chunked_prompt.py
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+from ...utils import RemoteOpenAIServer
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+@pytest.fixture(scope="module")
+def server():
+    """Start one RemoteOpenAIServer for MODEL_NAME, shared by this module.
+
+    The server is launched with chunked prefill enabled and is torn down
+    when the ``with`` block exits after the last test of the module.
+    """
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # scheduling flags: chunked prefill on, --max-num-batched-tokens
+        # capped at 1000 (no LoRA flags are passed in this file)
+        "--max-num-seqs",
+        "128",
+        "--enable-chunked-prefill",
+        "--max-num-batched-tokens",
+        "1000",
+        # large prompts create a lot of output
+        "--disable-log-requests",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    """Yield an async client obtained from the module's ``server`` fixture.
+
+    The client is used as an async context manager, so it is closed once
+    the requesting test has finished.
+    """
+    async with server.get_async_client() as async_client:
+        yield async_client
+@pytest.mark.asyncio
+async def test_completion_stream_options_and_logprobs_with_long_prompts(
+        client: openai.AsyncOpenAI):
+    """Stream a completion for a long prompt and check per-chunk usage.
+
+    Every streamed chunk must carry consistent usage numbers, and once a
+    finish reason has been seen the reported completion token count must
+    equal the number of text-bearing chunks received.
+    """
+    # Test stream with long prompt
+    prompt = "What is the capital of France?" * 400
+    stream = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True,
+        },
+        logprobs=5,
+    )
+    tokens_received = 0
+    finished = False
+    async for chunk in stream:
+        # usage is expected on every chunk (continuous_usage_stats is on)
+        assert chunk.usage.prompt_tokens >= 0
+        assert chunk.usage.completion_tokens >= 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+        if not finished:
+            # each chunk up to and including the one with a finish_reason
+            # is counted once and must contain non-empty text
+            tokens_received += 1
+            assert chunk.choices[0].text
+            if chunk.choices[0].finish_reason is not None:
+                finished = True
+        # deliberately a separate `if`: the finishing chunk itself, and any
+        # chunk after it, must report the final count
+        if finished:
+            assert chunk.usage.completion_tokens == tokens_received
+@pytest.mark.asyncio
+async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
+        client: openai.AsyncOpenAI):
+    """Stream a chat completion for a long prompt and check per-chunk usage.
+
+    Chunks with empty delta content must report zero completion tokens and
+    no logprobs, and at most one such chunk is tolerated. After a finish
+    reason has been seen, the reported completion token count must equal
+    the number of non-empty chunks received.
+    """
+    # Test stream with long prompt
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "What is the capital of France?" * 400
+    }]
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True,
+        },
+        logprobs=True,
+        top_logprobs=5,
+    )
+    tokens_received = 0
+    empty_chunks_received = 0
+    finished = False
+    async for chunk in stream:
+        # usage is expected on every chunk (continuous_usage_stats is on)
+        assert chunk.usage.prompt_tokens >= 0
+        assert chunk.usage.completion_tokens >= 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+        if not finished:
+            if chunk.choices[0].delta.content == "":
+                # no tokens have been generated yet
+                assert chunk.usage.completion_tokens == 0
+                assert chunk.choices[0].logprobs is None
+                empty_chunks_received += 1
+            else:
+                tokens_received += 1
+            if chunk.choices[0].finish_reason is not None:
+                finished = True
+        # deliberately a separate `if`: the finishing chunk itself, and any
+        # chunk after it, must report the final count
+        if finished:
+            assert chunk.usage.completion_tokens == tokens_received
+    # at most one content-less chunk may appear in the stream
+    assert empty_chunks_received <= 1
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
 import json
-import unittest
-from vllm.entrypoints.openai.cli_args import make_arg_parser
+import pytest
+from vllm.entrypoints.openai.cli_args import (make_arg_parser,
+                                              validate_parsed_serve_args)
 from vllm.entrypoints.openai.serving_engine import LoRAModulePath
 from vllm.utils import FlexibleArgumentParser
+from ...utils import VLLM_PATH
 LORA_MODULE = {
    "name": "module2",
    "path": "/path/to/module2",
    "base_model_name": "llama"
 }
+CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
+assert CHATML_JINJA_PATH.exists()
-class TestLoraParserAction(unittest.TestCase):
+@pytest.fixture
+def serve_parser():
+    """Return a fresh argument parser built by ``make_arg_parser``.
+
+    The fixture is function-scoped, so each test gets its own parser.
+    """
+    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
+    return make_arg_parser(parser)
-    def setUp(self):
-        # Setting up argparse parser for tests
-        parser = FlexibleArgumentParser(
-            description="vLLM's remote OpenAI server.")
-        self.parser = make_arg_parser(parser)
-    def test_valid_key_value_format(self):
+### Tests for Lora module parsing
-        # Test old format: name=path
+def test_valid_key_value_format(serve_parser):
-        args = self.parser.parse_args([
+    # Test old format: name=path
-            '--lora-modules',
+    args = serve_parser.parse_args([
-            'module1=/path/to/module1',
+        '--lora-modules',
+        'module1=/path/to/module1',
+    ])
+    expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
+    assert args.lora_modules == expected
+def test_valid_json_format(serve_parser):
+    """A JSON-encoded --lora-modules value parses into a LoRAModulePath."""
+    # Test valid JSON format input
+    args = serve_parser.parse_args([
+        '--lora-modules',
+        json.dumps(LORA_MODULE),
+    ])
+    # field values mirror the LORA_MODULE dict passed on the command line
+    expected = [
+        LoRAModulePath(name='module2',
+                       path='/path/to/module2',
+                       base_model_name='llama')
+    ]
+    assert args.lora_modules == expected
+def test_invalid_json_format(serve_parser):
+    # Test invalid JSON format input, missing closing brace
+    with pytest.raises(SystemExit):
+        serve_parser.parse_args([
+            '--lora-modules', '{"name": "module3", "path": "/path/to/module3"'
        ])
-        expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
-        self.assertEqual(args.lora_modules, expected)
-    def test_valid_json_format(self):
-        # Test valid JSON format input
+def test_invalid_type_error(serve_parser):
-        args = self.parser.parse_args([
+    # Test type error when values are not JSON or key=value
+    with pytest.raises(SystemExit):
+        serve_parser.parse_args([
            '--lora-modules',
-            json.dumps(LORA_MODULE),
+            'invalid_format'  # This is not JSON or key=value format
        ])
-        expected = [
-            LoRAModulePath(name='module2',
-                           path='/path/to/module2',
+def test_invalid_json_field(serve_parser):
-                           base_model_name='llama')
+    # Test valid JSON format but missing required fields
-        ]
+    with pytest.raises(SystemExit):
-        self.assertEqual(args.lora_modules, expected)
+        serve_parser.parse_args([
-    def test_invalid_json_format(self):
-        # Test invalid JSON format input, missing closing brace
-        with self.assertRaises(SystemExit):
-            self.parser.parse_args([
-                '--lora-modules',
-                '{"name": "module3", "path": "/path/to/module3"'
-            ])
-    def test_invalid_type_error(self):
-        # Test type error when values are not JSON or key=value
-        with self.assertRaises(SystemExit):
-            self.parser.parse_args([
-                '--lora-modules',
-                'invalid_format'  # This is not JSON or key=value format
-            ])
-    def test_invalid_json_field(self):
-        # Test valid JSON format but missing required fields
-        with self.assertRaises(SystemExit):
-            self.parser.parse_args([
-                '--lora-modules',
-                '{"name": "module4"}'  # Missing required 'path' field
-            ])
-    def test_empty_values(self):
-        # Test when no LoRA modules are provided
-        args = self.parser.parse_args(['--lora-modules', ''])
-        self.assertEqual(args.lora_modules, [])
-    def test_multiple_valid_inputs(self):
-        # Test multiple valid inputs (both old and JSON format)
-        args = self.parser.parse_args([
            '--lora-modules',
-            'module1=/path/to/module1',
+            '{"name": "module4"}'  # Missing required 'path' field
-            json.dumps(LORA_MODULE),
        ])
-        expected = [
-            LoRAModulePath(name='module1', path='/path/to/module1'),
-            LoRAModulePath(name='module2',
-                           path='/path/to/module2',
-                           base_model_name='llama')
-        ]
-        self.assertEqual(args.lora_modules, expected)
-if __name__ == '__main__':
+def test_empty_values(serve_parser):
-    unittest.main()
+    # Test when no LoRA modules are provided
+    args = serve_parser.parse_args(['--lora-modules', ''])
+    assert args.lora_modules == []
+def test_multiple_valid_inputs(serve_parser):
+    """name=path and JSON values can be mixed in a single --lora-modules."""
+    # Test multiple valid inputs (both old and JSON format)
+    args = serve_parser.parse_args([
+        '--lora-modules',
+        'module1=/path/to/module1',
+        json.dumps(LORA_MODULE),
+    ])
+    # the parsed list is expected in the same order as the inputs
+    expected = [
+        LoRAModulePath(name='module1', path='/path/to/module1'),
+        LoRAModulePath(name='module2',
+                       path='/path/to/module2',
+                       base_model_name='llama')
+    ]
+    assert args.lora_modules == expected
+### Tests for serve argument validation that run prior to loading
+def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
+    """Ensure validation fails if tool choice is enabled with no call parser"""
+    # NOTE(review): the function name says "passes", but this test expects
+    # validation to raise; consider renaming it to "..._fails_without_...".
+    # If we enable-auto-tool-choice, explode with no tool-call-parser
+    args = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
+    # parsing itself succeeds; the error comes from the validation step
+    with pytest.raises(TypeError):
+        validate_parsed_serve_args(args)
+def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
+    """Ensure validation passes with tool choice enabled with a call parser"""
+    args = serve_parser.parse_args(args=[
+        "--enable-auto-tool-choice",
+        "--tool-call-parser",
+        "mistral",
+    ])
+    # no assertion needed: the test passes as long as nothing is raised
+    validate_parsed_serve_args(args)
+def test_chat_template_validation_for_happy_paths(serve_parser):
+    """Ensure validation passes if the chat template exists"""
+    # the template path is passed as an absolute POSIX-style string
+    args = serve_parser.parse_args(
+        args=["--chat-template",
+              CHATML_JINJA_PATH.absolute().as_posix()])
+    # no assertion needed: the test passes as long as nothing is raised
+    validate_parsed_serve_args(args)
+def test_chat_template_validation_for_sad_paths(serve_parser):
+    """Ensure validation fails if the chat template doesn't exist"""
+    # parsing accepts the value; the ValueError is expected from validation
+    args = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
+    with pytest.raises(ValueError):
+        validate_parsed_serve_args(args)
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -503,8 +503,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
            max_tokens=5,
            temperature=0.0,
            extra_body=dict(
-                # NOTE: this has to be true for n > 1 in vLLM, but not necessary
+                # NOTE: this has to be true for n > 1 in vLLM, but
-                # for official client.
+                # not necessary for official client.
                use_beam_search=True),
        )
        assert len(batch.choices) == 4

--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -144,3 +144,64 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
        0].embedding
    assert responses_float.data[1].embedding == responses_default.data[
        1].embedding
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [EMBEDDING_MODEL_NAME],
+)
+async def test_single_embedding_truncation(
+        embedding_client: openai.AsyncOpenAI, model_name: str):
+    """Check that truncate_prompt_tokens limits the reported prompt size.
+
+    The request is made twice, once with a text input and once with a list
+    of token ids; both must report exactly 10 prompt tokens.
+    """
+    input_texts = [
+        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
+    ]
+    # test single embedding
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        extra_body={"truncate_prompt_tokens": 10})
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 10
+    assert embeddings.usage.total_tokens == 10
+    # same check with pre-tokenized input: 21 token ids, truncated to 10
+    input_tokens = [
+        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
+        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
+    ]
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_tokens,
+        extra_body={"truncate_prompt_tokens": 10})
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 10
+    assert embeddings.usage.total_tokens == 10
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [EMBEDDING_MODEL_NAME],
+)
+async def test_single_embedding_truncation_invalid(
+        embedding_client: openai.AsyncOpenAI, model_name: str):
+    """Check that an oversized truncate_prompt_tokens value is rejected.
+
+    The request must fail with ``openai.BadRequestError`` and the error
+    text must name the offending parameter.
+    """
+    input_texts = [
+        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
+    ]
+    # Only the request belongs inside the context manager: any statement
+    # placed after the raising call in this block would never execute.
+    with pytest.raises(openai.BadRequestError) as exc_info:
+        await embedding_client.embeddings.create(
+            model=model_name,
+            input=input_texts,
+            extra_body={"truncate_prompt_tokens": 8193})
+    # Inspect the captured exception after the block has exited.
+    # NOTE(review): assumes the client includes the server's error message
+    # in the exception text - confirm against the openai client in use.
+    assert ("truncate_prompt_tokens value is greater than max_model_len"
+            in str(exc_info.value))
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -70,7 +70,6 @@ EXPECTED_VALUES = {
    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
-    "vllm:request_params_best_of": [("_count", _NUM_REQUESTS)],
    "vllm:prompt_tokens": [("_total",
                            _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
    "vllm:generation_tokens":
@@ -151,9 +150,6 @@ EXPECTED_METRICS = [
    "vllm:request_params_n_sum",
    "vllm:request_params_n_bucket",
    "vllm:request_params_n_count",
-    "vllm:request_params_best_of_sum",
-    "vllm:request_params_best_of_bucket",
-    "vllm:request_params_best_of_count",
    "vllm:num_preemptions_total",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",