Merge branch 'v0.6.3.post1-dev'

ad385667 · zhuwenwen · be0967c1 · 903593d3 · ad385667 · ad385667
Commit ad385667 authored Oct 23, 2024 by zhuwenwen
20 changed files
--- a/tests/core/test_scheduler_encoder_decoder.py
+++ b/tests/core/test_scheduler_encoder_decoder.py
+from typing import List
+import pytest  # noqa
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.core.scheduler import Scheduler
+from vllm.sequence import SequenceGroup
+from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
+                    get_sequence_groups, schedule_and_update_computed_tokens)
+def test_scheduler_schedule_simple_encoder_decoder():
+    '''
+    Test basic scheduler functionality in the context
+    of an encoder/decoder model. Focus on testing
+    enc/dec-specific functionality sense tests already
+    exist for decoder-only functionality
+    Test behavior:
+    * Construct Scheduler
+    * Construct dummy encoder/decoder sequence groups
+    * Add dummy seq groups to scheduler backlog
+    * Schedule the next seq group & validate:
+        * Cross-attn block tables
+        * Updated states of seq groups
+        * Number of batched tokens
+        * Number of blocks to copy/swap-in/swap-out
+        * Number of scheduled seq groups
+    * Repeat for both prefill- and decode-phase
+    * Abort scheduled seq groups
+    * Assert that aborted seq groups no longer appear in
+      cross-attention block table
+    '''
+    block_size = 4
+    num_seq_group = 4
+    max_model_len = 16
+    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 16  # enc and dec prompts per seq_group
+    cache_config.num_gpu_blocks = 16  # enc and dec prompts per seq_group
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+    # Add seq groups to scheduler.
+    req_id_list = []
+    for i in range(num_seq_group):
+        req_id = str(i)
+        req_id_list.append(req_id)
+        _, _, seq_group = create_dummy_prompt_encoder_decoder(
+            req_id, block_size, block_size, block_size)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+    # Schedule seq groups prefill.
+    num_tokens = block_size * num_seq_group
+    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
+    # - Verify that sequence group cross-attention block tables are
+    #   registered with the block manager
+    assert all([(req_id in scheduler.block_manager.cross_block_tables)
+                for req_id in req_id_list])
+    # - Validate sequence-group status
+    assert set(get_sequence_groups(out)) == set(running)
+    # - Validate number of batched tokens
+    assert out.num_batched_tokens == num_tokens
+    # - Validate there are no remaining blocks to swap
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    # - Validate all seq groups were scheduled
+    assert len(seq_group_meta_list) == num_seq_group
+    append_new_token(out, 1)
+    # Schedule seq groups decode.
+    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
+    # - Verify that sequence group metadata includes encoder attention
+    #   and cross-attention metadata
+    assert all([
+        not ((seq_group_meta.encoder_seq_data is None) or
+             (seq_group_meta.cross_block_table is None))
+        for seq_group_meta in seq_group_meta_list
+    ])
+    # - Validate sequence-group status
+    assert set(get_sequence_groups(out)) == set(running)
+    # - Validate there is one batched token per seq group
+    assert out.num_batched_tokens == num_seq_group
+    # - Validate there are no remaining blocks to swap
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    # - Validate that all seq groups were scheduled
+    assert len(seq_group_meta_list) == num_seq_group
+    append_new_token(out, 1)
+    # Abort sequences
+    for req_id in req_id_list:
+        scheduler.abort_seq_group(req_id)
+        # - Verify that sequence group cross-attention block tables are
+        #   NO LONGER registered with the block manager
+        assert req_id not in scheduler.block_manager.cross_block_tables
--- a/tests/core/test_serialization.py
+++ b/tests/core/test_serialization.py
+import msgspec
+from vllm.executor.msgspec_utils import decode_hook, encode_hook
+from vllm.sequence import ExecuteModelRequest
+from ..spec_decode.utils import create_batch
+def test_msgspec_serialization():
+    num_lookahead_slots = 4
+    seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots)
+    execute_model_req = ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=num_lookahead_slots,
+        running_queue_size=4)
+    encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
+    decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
+                                      dec_hook=decode_hook)
+    req = decoder.decode(encoder.encode(execute_model_req))
+    expected = execute_model_req.seq_group_metadata_list
+    actual = req.seq_group_metadata_list
+    assert (len(expected) == len(actual))
+    expected = expected[0]
+    actual = actual[0]
+    assert expected.block_tables == actual.block_tables
+    assert expected.is_prompt == actual.is_prompt
+    assert expected.request_id == actual.request_id
+    assert (expected.seq_data[0].prompt_token_ids ==
+            actual.seq_data[0].prompt_token_ids)
+    assert (expected.seq_data[0].output_token_ids ==
+            actual.seq_data[0].output_token_ids)
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -13,15 +13,18 @@ def create_dummy_prompt(
    prompt_length: int,
    block_size: Optional[int] = None,
    lora_request: Optional[LoRARequest] = None,
-    use_beam_search: bool = False,
    best_of: int = 1,
+    prompt_tokens: Optional[List[int]] = None,
+    min_tokens: int = 0,
+    max_tokens: int = 16,
 ) -> Tuple[Sequence, SequenceGroup]:
    if not block_size:
        block_size = prompt_length
-    # Create dummy prompt sequence with tokens 0...block_size-1
+    if prompt_tokens is None:
-    # and prompt "0 ... block_size".
+        # Create dummy prompt sequence with tokens 0...block_size-1
-    prompt_tokens = list(range(prompt_length))
+        # and prompt "0 ... block_size".
+        prompt_tokens = list(range(prompt_length))
    prompt_str = " ".join([str(t) for t in prompt_tokens])
    prompt = Sequence(int(request_id),
                      inputs={
@@ -33,8 +36,9 @@ def create_dummy_prompt(
                              seqs=[prompt],
                              arrival_time=time.time(),
                              sampling_params=SamplingParams(
-                                  use_beam_search=use_beam_search,
+                                  best_of=best_of,
-                                  best_of=best_of),
+                                  max_tokens=max_tokens,
+                                  min_tokens=min_tokens),
                              lora_request=lora_request)
    return prompt, seq_group
@@ -46,39 +50,39 @@ def create_dummy_prompt_encoder_decoder(
    encoder_prompt_length: int,
    block_size: Optional[int] = None,
    lora_request: Optional[LoRARequest] = None,
-    use_beam_search: bool = False,
    best_of: int = 1,
 ) -> Tuple[Sequence, Sequence, SequenceGroup]:
    if not block_size:
        block_size = decoder_prompt_length
    # Create dummy prompt sequence with tokens 0...block_size-1
-    # and prompt "0 ... block_size".
+    # and prompt "0 ... block_size". Note that the prompt string
+    # doesn't actually match the tokens
    decoder_prompt_tokens = list(range(decoder_prompt_length))
    decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens])
+    encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length))))
+    encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens])
+    inputs = {
+        "prompt": decoder_prompt_str,
+        "prompt_token_ids": decoder_prompt_tokens,
+        "encoder_prompt": encoder_prompt_str,
+        "encoder_prompt_token_ids": encoder_prompt_tokens,
+        "multi_modal_data": None,
+    }
    decoder_prompt = Sequence(int(request_id),
-                              inputs={
+                              inputs=inputs,
-                                  "prompt": decoder_prompt_str,
+                              block_size=block_size,
-                                  "prompt_token_ids": decoder_prompt_tokens,
+                              from_decoder_prompt=True)
-                                  "multi_modal_data": None,
-                              },
-                              block_size=block_size)
-    encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length))))
-    encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens])
    encoder_prompt = Sequence(int(request_id),
-                              inputs={
+                              inputs=inputs,
-                                  "prompt": encoder_prompt_str,
+                              block_size=block_size,
-                                  "prompt_token_ids": encoder_prompt_tokens,
+                              from_decoder_prompt=False)
-                                  "multi_modal_data": None,
-                              },
-                              block_size=block_size)
    seq_group = SequenceGroup(request_id=request_id,
                              seqs=[decoder_prompt],
-                              sampling_params=SamplingParams(
+                              sampling_params=SamplingParams(best_of=best_of),
-                                  use_beam_search=use_beam_search,
-                                  best_of=best_of),
                              arrival_time=time.time(),
                              lora_request=lora_request,
                              encoder_seq=encoder_prompt)
@@ -139,17 +143,21 @@ def create_seq_group_encoder_decoder(
    prompt_token_ids = [0] * seq_prompt_len
+    inputs = {
+        "prompt": "",
+        "prompt_token_ids": prompt_token_ids,
+        "encoder_prompt": "",
+        "encoder_prompt_token_ids": prompt_token_ids,
+        "multi_modal_data": None,
+    }
    seqs = []
    for seq_id_offset, output_len in enumerate(seq_output_lens):
-        seq = Sequence(
+        # Construct decoder input sequences
-            seq_id=seq_id_start + seq_id_offset,
+        seq = Sequence(seq_id=seq_id_start + seq_id_offset,
-            inputs={
+                       inputs=inputs,
-                "prompt": "",
+                       block_size=16,
-                "prompt_token_ids": prompt_token_ids,
+                       from_decoder_prompt=True)
-                "multi_modal_data": None,
-            },
-            block_size=16,
-        )
        for i in range(output_len):
            seq.append_token_id(
@@ -158,16 +166,11 @@ def create_seq_group_encoder_decoder(
            )
        seqs.append(seq)
-    # Encoder sequence
+    # Encoder input sequence
-    encoder_seq = Sequence(
+    encoder_seq = Sequence(seq_id=seq_id_start + len(seq_output_lens),
-        seq_id=seq_id_start + len(seq_output_lens),
+                           inputs=inputs,
-        inputs={
+                           block_size=16,
-            "prompt": "",
+                           from_decoder_prompt=False)
-            "prompt_token_ids": prompt_token_ids,
-            "multi_modal_data": None,
-        },
-        block_size=16,
-    )
    return SequenceGroup(request_id=request_id,
                         seqs=seqs,
@@ -177,4 +180,31 @@ def create_seq_group_encoder_decoder(
 def round_up_to_next_block(seq_len: int, block_size: int) -> int:
    return (seq_len + block_size - 1) // block_size
\ No newline at end of file
+# Helper functions for scheduler tests
+def get_sequence_groups(scheduler_output):
+    return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
+def append_new_token(out, token_id: int):
+    seq_groups = get_sequence_groups(out)
+    for seq_group in seq_groups:
+        for seq in seq_group.get_seqs():
+            seq.append_token_id(token_id, {token_id: Logprob(token_id)})
+def schedule_and_update_computed_tokens(scheduler):
+    metas, out, _ = scheduler.schedule()
+    for s, meta in zip(out.scheduled_seq_groups, metas):
+        s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
+    return metas, out
+def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
+    seq_group.update_num_computed_tokens(token_chunk_size)
+    for seq in seq_group.get_seqs():
+        seq.append_token_id(token_id, {token_id: Logprob(token_id)})
--- a/tests/data/test_config.yaml
+++ b/tests/data/test_config.yaml
+port: 12312
+served_model_name: mymodel
+tensor_parallel_size: 2
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-Run:
-```sh
-cd $VLLM_PATH/tests
-pytest distributed/test_basic_distributed_correctness.py
-```
-"""
-import os
-import pytest
-from vllm.utils import cuda_device_count_stateless
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "model, distributed_executor_backend, attention_backend, test_suite", [
-        ("facebook/opt-125m", "ray", "", "L4"),
-        ("facebook/opt-125m", "mp", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
-        ("facebook/opt-125m", "ray", "", "A100"),
-        ("facebook/opt-125m", "mp", "", "A100"),
-        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
-    ])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-    attention_backend: str,
-    test_suite: str,
-) -> None:
-    if test_suite != TARGET_TEST_SUITE:
-        pytest.skip(f"Skip test for {test_suite}")
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test ray adag
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-    dtype = "half"
-    max_tokens = 5
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-Run:
-```sh
-pytest test_chunked_prefill_distributed.py
-```
-"""
-import pytest
-from vllm.utils import cuda_device_count_stateless
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("facebook/opt-125m", "ray"),
-    ("meta-llama/Llama-2-7b-hf", "ray"),
-    ("facebook/opt-125m", "mp"),
-    ("meta-llama/Llama-2-7b-hf", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-) -> None:
-    dtype = "half"
-    max_tokens = 5
-    chunked_prefill_token_size = 16
-    # Add a chunked prefill config.
-    max_num_seqs = min(chunked_prefill_token_size, 256)
-    assert chunked_prefill_token_size != -1
-    enable_chunked_prefill = True
-    max_num_batched_tokens = chunked_prefill_token_size
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            max_num_seqs=max_num_seqs,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -34,7 +34,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
    t = all_tensors[rank % tp_size]
    t = tensor_model_parallel_all_reduce(t)
-    assert torch.allclose(t, expected)
+    torch.testing.assert_close(t, expected)
 @ray.remote(num_gpus=1, max_calls=1)
@@ -62,7 +62,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
        expected = torch.cat(all_tensors, dim=all_gather_dimension)
        t = all_tensors[rank % tp_size]
        t = tensor_model_parallel_all_gather(t, all_gather_dimension)
-        assert torch.allclose(t, expected)
+        torch.testing.assert_close(t, expected)
 @ray.remote(num_gpus=1, max_calls=1)
@@ -96,12 +96,12 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
    else:
        recv_dict = broadcast_tensor_dict(src=0)
        assert len(recv_dict) == len(test_dict)
-        assert torch.allclose(recv_dict["a"], test_dict["a"])
+        torch.testing.assert_close(recv_dict["a"], test_dict["a"])
-        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        torch.testing.assert_close(recv_dict["b"], test_dict["b"])
        assert recv_dict["c"] == test_dict["c"]
        assert recv_dict["d"] == test_dict["d"]
        assert recv_dict["e"] == test_dict["e"]
-        assert torch.allclose(recv_dict["f"], test_dict["f"])
+        torch.testing.assert_close(recv_dict["f"], test_dict["f"])
 @ray.remote(num_gpus=1, max_calls=1)
@@ -136,12 +136,12 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
    if not get_pp_group().is_first_rank:
        assert len(recv_dict) == len(test_dict)
-        assert torch.allclose(recv_dict["a"], test_dict["a"])
+        torch.testing.assert_close(recv_dict["a"], test_dict["a"])
-        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        torch.testing.assert_close(recv_dict["b"], test_dict["b"])
        assert recv_dict["c"] == test_dict["c"]
        assert recv_dict["d"] == test_dict["d"]
        assert recv_dict["e"] == test_dict["e"]
-        assert torch.allclose(recv_dict["f"], test_dict["f"])
+        torch.testing.assert_close(recv_dict["f"], test_dict["f"])
 @ray.remote(num_gpus=1, max_calls=1)
@@ -163,7 +163,7 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
        get_pp_group().send(test_tensor)
    if not get_pp_group().is_first_rank:
-        assert torch.allclose(test_tensor, recv_tensor)
+        torch.testing.assert_close(test_tensor, recv_tensor)
 @pytest.mark.skipif(torch.cuda.device_count() < 2,

--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -72,8 +72,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
                        out2 = tensor_model_parallel_all_reduce(inp2)
                        dist.all_reduce(inp2, group=group)
            graph.replay()
-            assert torch.allclose(out1, inp1)
+            torch.testing.assert_close(out1, inp1)
-            assert torch.allclose(out2, inp2)
+            torch.testing.assert_close(out2, inp2)
 @ray.remote(num_gpus=1, max_calls=1)
@@ -96,13 +96,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
    out = inp
    for _ in range(num_communication):
        out = fa.all_reduce_unreg(out)
-    assert torch.allclose(out, inp * (tp_size**num_communication))
+    torch.testing.assert_close(out, inp * (tp_size**num_communication))
    inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
    out = inp
    for _ in range(num_communication):
        out = fa.all_reduce_unreg(out)
-    assert torch.allclose(out, inp * (tp_size**num_communication))
+    torch.testing.assert_close(out, inp * (tp_size**num_communication))
 @pytest.mark.parametrize("tp_size", [2])

--- a/tests/distributed/test_distributed_oot.py
+++ b/tests/distributed/test_distributed_oot.py
+from ..entrypoints.openai.test_oot_registration import (
+    run_and_test_dummy_opt_api_server)
+def test_distributed_oot(dummy_opt_path: str):
+    run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)
--- a/tests/distributed/test_multi_node_assignment.py
+++ b/tests/distributed/test_multi_node_assignment.py
+"""Make sure ray assigns GPU workers to the correct node.
+Run:
+```sh
+cd $VLLM_PATH/tests
+pytest distributed/test_multi_node_assignment.py
+```
+"""
+import os
+import pytest
+import ray
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from vllm import initialize_ray_cluster
+from vllm.config import ParallelConfig
+from vllm.executor.ray_utils import _wait_until_pg_removed
+from vllm.utils import get_ip
+VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
+@pytest.mark.skipif(not VLLM_MULTI_NODE,
+                    reason="Need at least 2 nodes to run the test.")
+def test_multi_node_assignment() -> None:
+    # NOTE: important to keep this class definition here
+    # to let ray use cloudpickle to serialize it.
+    class Actor:
+        def get_ip(self):
+            return get_ip()
+    for _ in range(10):
+        config = ParallelConfig(1, 2)
+        initialize_ray_cluster(config)
+        current_ip = get_ip()
+        workers = []
+        for bundle_id, bundle in enumerate(
+                config.placement_group.bundle_specs):
+            if not bundle.get("GPU", 0):
+                continue
+            scheduling_strategy = PlacementGroupSchedulingStrategy(
+                placement_group=config.placement_group,
+                placement_group_capture_child_tasks=True,
+                placement_group_bundle_index=bundle_id,
+            )
+            worker = ray.remote(
+                num_cpus=0,
+                num_gpus=1,
+                scheduling_strategy=scheduling_strategy,
+            )(Actor).remote()
+            worker_ip = ray.get(worker.get_ip.remote())
+            assert worker_ip == current_ip
+            workers.append(worker)
+        for worker in workers:
+            ray.kill(worker)
+        _wait_until_pg_removed(config.placement_group)
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -6,47 +6,267 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 to fail.
 """
 import os
+from dataclasses import dataclass
+from typing import List, Literal, NamedTuple, Optional
 import pytest
+from vllm.logger import init_logger
 from ..utils import compare_two_settings, fork_new_process_for_each_test
+logger = init_logger("test_pipeline_parallel")
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
-@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
+class ParallelSetup(NamedTuple):
-                          "MODEL_NAME, DIST_BACKEND"),
+    tp_size: int
-                         [
+    pp_size: int
-                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
+    eager_mode: bool
-                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+    chunked_prefill: bool
-                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
-                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+@dataclass
-                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
+class PPTestSettings:
-                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+    parallel_setups: List[ParallelSetup]
-                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+    distributed_backends: List[str]
-                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
+    trust_remote_code: bool
-                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+    tokenizer_mode: Optional[str]
-                         ])
-def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
+    @staticmethod
-                    DIST_BACKEND):
+    def detailed(
-    if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
+        *,
+        tp_base: int = 1,
+        pp_base: int = 2,
+        trust_remote_code: bool = False,
+        tokenizer_mode: Optional[str] = None,
+    ):
+        return PPTestSettings(
+            parallel_setups=[
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=pp_base,
+                              eager_mode=False,
+                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=2 * pp_base,
+                              eager_mode=False,
+                              chunked_prefill=True),
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=2 * pp_base,
+                              eager_mode=True,
+                              chunked_prefill=False),
+                ParallelSetup(tp_size=2 * tp_base,
+                              pp_size=pp_base,
+                              eager_mode=False,
+                              chunked_prefill=True),
+                ParallelSetup(tp_size=2 * tp_base,
+                              pp_size=pp_base,
+                              eager_mode=True,
+                              chunked_prefill=False),
+            ],
+            distributed_backends=["mp", "ray"],
+            trust_remote_code=trust_remote_code,
+            tokenizer_mode=tokenizer_mode,
+        )
+    @staticmethod
+    def fast(
+        *,
+        tp_base: int = 1,
+        pp_base: int = 2,
+        trust_remote_code: bool = False,
+        tokenizer_mode: Optional[str] = None,
+    ):
+        return PPTestSettings(
+            parallel_setups=[
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=pp_base,
+                              eager_mode=True,
+                              chunked_prefill=False),
+            ],
+            distributed_backends=["mp"],
+            trust_remote_code=trust_remote_code,
+            tokenizer_mode=tokenizer_mode,
+        )
+    def iter_params(self, model_name: str):
+        for parallel_setup in self.parallel_setups:
+            for distributed_backend in self.distributed_backends:
+                yield (model_name, parallel_setup, distributed_backend,
+                       self.trust_remote_code, self.tokenizer_mode)
+# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
+# The values displayed here are only a rough indicator of the size of the model
+# yapf: disable
+GENERATION_MODEL_SETTINGS = {
+    # [DETAILED TESTS]
+    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
+    # [FAST TESTS]
+    # Uses Llama
+    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
+    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True),  # noqa: E501
+    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
+    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "bigscience/bloomz-1b1": PPTestSettings.fast(),
+    "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
+    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
+    "databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
+    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
+    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
+    "tiiuae/falcon-7b": PPTestSettings.fast(),
+    "google/gemma-2b": PPTestSettings.fast(),
+    "google/gemma-2-9b": PPTestSettings.fast(),
+    "gpt2": PPTestSettings.fast(),
+    "bigcode/starcoder": PPTestSettings.fast(),
+    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
+    "EleutherAI/pythia-12b": PPTestSettings.fast(),
+    "ibm/PowerLM-3b": PPTestSettings.fast(),
+    "ibm/PowerMoE-3b": PPTestSettings.fast(),
+    # Uses Llama
+    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
+    "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
+    "core42/jais-13b-chat": PPTestSettings.fast(),
+    # TODO: Implement PP
+    # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
+    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
+    "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
+    # Uses Llama
+    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
+    "mosaicml/mpt-7b": PPTestSettings.fast(),
+    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
+    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
+    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
+    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
+    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
+    "microsoft/phi-2": PPTestSettings.fast(),
+    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
+    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    # FIXME: https://github.com/vllm-project/vllm/issues/8553
+    # "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "adept/persimmon-8b-chat": PPTestSettings.fast(),
+    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
+    "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
+    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
+    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
+    "bigcode/starcoder2-3b": PPTestSettings.fast(),
+    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
+    # FIXME: Cannot load tokenizer in latest transformers version
+    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
+}
+EMBEDDING_MODEL_SETTINGS = {  # type: ignore[var-annotated]
+    # [FAST TESTS]
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True),  # noqa: E501
+}
+MULTIMODAL_MODEL_SETTINGS = {
+    # [FAST TESTS]
+    "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
+    "facebook/chameleon-7b": PPTestSettings.fast(),
+    "adept/fuyu-8b": PPTestSettings.fast(),
+    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
+    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
+    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
+    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
+    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
+    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
+    # TODO: Implement PP
+    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
+    "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
+    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
+    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
+    "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
+}
+CONDITIONAL_GENERATION_MODEL_SETTINGS = {  # type: ignore[var-annotated]
+    # [FAST TESTS]
+    # TODO: Implement PP
+    # "facebook/bart-base": PPTestSettings.fast(),
+}
+# yapf: enable
+# NOTE: You can update this on your local machine to run specific tests
+TEST_MODELS = [
+    # [LANGUAGE GENERATION]
+    "meta-llama/Meta-Llama-3-8B",
+    "ibm/PowerLM-3b",
+    # [LANGUAGE EMBEDDING]
+    "intfloat/e5-mistral-7b-instruct",
+    "BAAI/bge-multilingual-gemma2",
+    # [MULTIMODAL GENERATION]
+    "OpenGVLab/InternVL2-1B",
+    "microsoft/Phi-3-vision-128k-instruct",
+    "fixie-ai/ultravox-v0_3",
+]
+def _compare_tp(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available: int,
+    *,
+    method: Literal["generate", "encode"] = "encode",
+):
+    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
+    if num_gpus_available < tp_size * pp_size:
+        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
+    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
-    USE_RAY_ADAG_NCCL = 0
+    common_args = [
-    USE_RAY_ADAG = 0
-    pp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "8",
+    ]
+    if chunked_prefill:
+        common_args.append("--enable-chunked-prefill")
+    if eager_mode:
+        common_args.append("--enforce-eager")
+    if trust_remote_code:
+        common_args.append("--trust-remote-code")
+    if tokenizer_mode:
+        common_args.extend(["--tokenizer-mode", tokenizer_mode])
+    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
+            and chunked_prefill):
+        # Test Ray ADAG for a subset of the tests
+        pp_env = {
+            "VLLM_USE_RAY_COMPILED_DAG": "1",
+            "VLLM_USE_RAY_SPMD_WORKER": "1",
+            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
+        }
+        # Temporary. Currently when zeromq + SPMD is used, it does not properly
+        # terminate because of aDAG issue.
+        common_args.append("--disable-frontend-multiprocessing")
+    else:
+        pp_env = None
+    pp_args = [
+        *common_args,
        "--pipeline-parallel-size",
-        str(PP_SIZE),
+        str(pp_size),
        "--tensor-parallel-size",
-        str(TP_SIZE),
+        str(tp_size),
        "--distributed-executor-backend",
-        DIST_BACKEND,
+        distributed_backend,
    ]
    # compare without pipeline parallelism
@@ -55,54 +275,103 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
-        # use half precision for speed and memory savings in CI environment
+        *common_args,
-        "--dtype",
-        "bfloat16",
        "--tensor-parallel-size",
-        str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
+        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]
-    if CHUNKED_PREFILL:
-        pp_args.append("--enable-chunked-prefill")
-        tp_args.append("--enable-chunked-prefill")
-    if EAGER_MODE:
-        pp_args.append("--enforce-eager")
-        tp_args.append("--enforce-eager")
-    pp_env = None
-    if USE_RAY_ADAG:
-        assert DIST_BACKEND == "ray", (
-            "Ray ADAG is only supported with Ray distributed backend")
-        pp_env = {
-            "VLLM_USE_RAY_COMPILED_DAG": "1",
-            "VLLM_USE_RAY_SPMD_WORKER": "1",
-            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
-            str(int(USE_RAY_ADAG_NCCL)),
-        }
-    compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
+    try:
+        compare_two_settings(model_name,
+                             pp_args,
+                             tp_args,
+                             pp_env,
+                             method=method)
+    except Exception:
+        if pp_env is None:
+            raise
+        else:
+            # Ray ADAG tests are flaky, so we don't want to fail the test
+            logger.exception("Ray ADAG tests failed")
-@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
+@pytest.mark.parametrize(
-    (2, "JackFram/llama-160m"),
+    ("model_name", "parallel_setup", "distributed_backend",
-])
+     "trust_remote_code", "tokenizer_mode"),
-@pytest.mark.parametrize("ATTN_BACKEND", [
+    [
-    "FLASH_ATTN",
+        params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
-    "FLASHINFER",
+        for params in settings.iter_params(model_name)
-])
+        if model_name in TEST_MODELS
+    ],
+)
 @fork_new_process_for_each_test
-def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
+def test_tp_language_generation(
-    cudagraph_args = [
+    model_name: str,
-        # use half precision for speed and memory savings in CI environment
+    parallel_setup: ParallelSetup,
-        "--dtype",
+    distributed_backend: str,
-        "float16",
+    trust_remote_code: bool,
-        "--pipeline-parallel-size",
+    tokenizer_mode: Optional[str],
-        str(PP_SIZE),
+    num_gpus_available,
-        "--distributed-executor-backend",
+):
-        "mp",
+    _compare_tp(model_name,
-    ]
+                parallel_setup,
-    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
+                distributed_backend,
+                trust_remote_code,
+                tokenizer_mode,
+                num_gpus_available,
+                method="generate")
-    eager_args = cudagraph_args + ["--enforce-eager"]
+@pytest.mark.parametrize(
+    ("model_name", "parallel_setup", "distributed_backend",
+     "trust_remote_code", "tokenizer_mode"),
+    [
+        params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
+        for params in settings.iter_params(model_name)
+        if model_name in TEST_MODELS
+    ],
+)
+@fork_new_process_for_each_test
+def test_tp_language_embedding(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available,
+):
+    _compare_tp(model_name,
+                parallel_setup,
+                distributed_backend,
+                trust_remote_code,
+                tokenizer_mode,
+                num_gpus_available,
+                method="encode")
-    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
+@pytest.mark.parametrize(
+    ("model_name", "parallel_setup", "distributed_backend",
+     "trust_remote_code", "tokenizer_mode"),
+    [
+        params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
+        for params in settings.iter_params(model_name)
+        if model_name in TEST_MODELS
+    ],
+)
+@fork_new_process_for_each_test
+def test_tp_multimodal_generation(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available,
+):
+    _compare_tp(model_name,
+                parallel_setup,
+                distributed_backend,
+                trust_remote_code,
+                tokenizer_mode,
+                num_gpus_available,
+                method="generate")
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
+import os
+import pytest
+from ..utils import compare_two_settings, fork_new_process_for_each_test
+@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
+    (2, "JackFram/llama-160m"),
+])
+@pytest.mark.parametrize("ATTN_BACKEND", [
+    "FLASH_ATTN",
+    "FLASHINFER",
+])
+@fork_new_process_for_each_test
+def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
+    cudagraph_args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--pipeline-parallel-size",
+        str(PP_SIZE),
+        "--distributed-executor-backend",
+        "mp",
+    ]
+    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
+    eager_args = cudagraph_args + ["--enforce-eager"]
+    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
 import os
-import torch
+import torch.distributed as dist
 from vllm.distributed.parallel_state import in_the_same_node_as
-torch.distributed.init_process_group(backend="gloo")
+if __name__ == "__main__":
-test_result = all(
+    dist.init_process_group(backend="gloo")
-    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
+    test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
-expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
+    expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
-assert test_result == expected, f"Expected {expected}, got {test_result}"
+    assert test_result == expected, f"Expected {expected}, got {test_result}"
-print("Same node test passed!")
+    print("Same node test passed!")
--- a/tests/encoder_decoder/__init__.py
+++ b/tests/encoder_decoder/__init__.py
--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
+"""E2E tests to verify the correctness of the encoder-decoder framework
+Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
+"""
+from typing import List, Optional, Tuple
+import pytest
+from transformers import AutoModelForSeq2SeqLM
+from vllm.sequence import SampleLogprobs
+from vllm.utils import is_cpu
+from ..conftest import DecoderPromptType
+from ..models.utils import check_logprobs_close
+def vllm_to_hf_output(
+    vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
+    decoder_prompt_type: DecoderPromptType,
+):
+    """Sanitize vllm output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+    hf_output_str = output_str + "</s>"
+    if decoder_prompt_type == DecoderPromptType.NONE:
+        hf_output_str = "<s>" + hf_output_str
+    return output_ids, hf_output_str, out_logprobs
+@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+@pytest.mark.parametrize("enforce_eager", [True, False])
+@pytest.mark.skipif(
+    is_cpu(),
+    reason="CPU backend is not currently supported with encoder/decoder models"
+)
+def test_encoder_decoder_e2e(
+    hf_runner,
+    vllm_runner,
+    example_encoder_decoder_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    decoder_prompt_type: DecoderPromptType,
+    enforce_eager: bool,
+) -> None:
+    '''
+    End-to-End (E2E) test for the encoder-decoder framework. 
+    This test evaluates the encoder-decoder functionality using the BART
+    model. We compare the outputs of the Hugging Face and vLLM
+    implementations to ensure that both implementations produce consistent
+    and correct results.
+    '''
+    test_case_prompts = example_encoder_decoder_prompts[decoder_prompt_type]
+    # Configuration settings for HF baseline
+    hf_kwargs = {
+        "top_k": None,
+        "num_beams": 1,
+        "repetition_penalty": 1.0,
+        "top_p": 1.0,
+        "length_penalty": 1.0,
+        "early_stopping": False,
+        "no_repeat_ngram_size": None,
+        "min_length": 0
+    }
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
+        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
+            test_case_prompts,
+            max_tokens,
+            num_logprobs,
+            **hf_kwargs,
+        ))
+    with vllm_runner(model, dtype=dtype,
+                     enforce_eager=enforce_eager) as vllm_model:
+        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+            test_case_prompts, max_tokens, num_logprobs)
+    hf_skip_tokens = (1
+                      if decoder_prompt_type == DecoderPromptType.NONE else 0)
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=[
+            vllm_to_hf_output(vllm_output, decoder_prompt_type)
+            for vllm_output in vllm_outputs
+        ],
+        name_0="hf",
+        name_1="vllm",
+        num_outputs_0_skip_tokens=hf_skip_tokens,
+    )
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
+from argparse import ArgumentTypeError
+import pytest
+from vllm.engine.arg_utils import EngineArgs, nullable_kvs
+from vllm.utils import FlexibleArgumentParser
+@pytest.mark.parametrize(("arg", "expected"), [
+    (None, None),
+    ("image=16", {
+        "image": 16
+    }),
+    ("image=16,video=2", {
+        "image": 16,
+        "video": 2
+    }),
+    ("Image=16, Video=2", {
+        "image": 16,
+        "video": 2
+    }),
+])
+def test_limit_mm_per_prompt_parser(arg, expected):
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    if arg is None:
+        args = parser.parse_args([])
+    else:
+        args = parser.parse_args(["--limit-mm-per-prompt", arg])
+    assert args.limit_mm_per_prompt == expected
+@pytest.mark.parametrize(
+    ("arg"),
+    [
+        "image",  # Missing =
+        "image=4,image=5",  # Conflicting values
+        "image=video=4"  # Too many = in tokenized arg
+    ])
+def test_bad_nullable_kvs(arg):
+    with pytest.raises(ArgumentTypeError):
+        nullable_kvs(arg)
+# yapf: disable
+@pytest.mark.parametrize(("arg", "expected", "option"), [
+    (None, None, "mm-processor-kwargs"),
+    ("{}", {}, "mm-processor-kwargs"),
+    (
+        '{"num_crops": 4}',
+        {
+            "num_crops": 4
+        },
+        "mm-processor-kwargs"
+    ),
+    (
+        '{"foo": {"bar": "baz"}}',
+        {
+            "foo":
+            {
+                "bar": "baz"
+            }
+        },
+        "mm-processor-kwargs"
+    ),
+    (
+        '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
+        {
+            "cast_logits_dtype": "bfloat16",
+            "sequence_parallel_norm": True,
+            "sequence_parallel_norm_threshold": 2048,
+        },
+        "override-neuron-config"
+    ),
+])
+# yapf: enable
+def test_composite_arg_parser(arg, expected, option):
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    if arg is None:
+        args = parser.parse_args([])
+    else:
+        args = parser.parse_args([f"--{option}", arg])
+    assert getattr(args, option.replace("-", "_")) == expected
--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-def test_custom_executor(model, tmpdir):
+def test_custom_executor(model, tmp_path):
    cwd = os.path.abspath(".")
-    os.chdir(tmpdir)
+    os.chdir(tmp_path)
    try:
        assert not os.path.exists(".marker")
@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-def test_custom_executor_async(model, tmpdir):
+def test_custom_executor_async(model, tmp_path):
    cwd = os.path.abspath(".")
-    os.chdir(tmpdir)
+    os.chdir(tmp_path)
    try:
        assert not os.path.exists(".marker")

--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -83,7 +83,7 @@ def test_local_workers() -> None:
    workers[3].process.kill()
    # Other workers should get shut down here
-    worker_monitor.join(2)
+    worker_monitor.join(20)
    # Ensure everything is stopped
    assert not worker_monitor.is_alive()
@@ -108,7 +108,7 @@ def test_local_workers_clean_shutdown() -> None:
    # Clean shutdown
    worker_monitor.close()
-    worker_monitor.join(5)
+    worker_monitor.join(20)
    # Ensure everything is stopped
    assert not worker_monitor.is_alive()
@@ -161,7 +161,7 @@ async def test_local_workers_async() -> None:
    workers[3].process.kill()
    # Other workers should get shut down here
-    worker_monitor.join(2)
+    worker_monitor.join(20)
    # Ensure everything is stopped
    assert not worker_monitor.is_alive()

--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
    # token ids.
    llm = LLM(model=model, skip_tokenizer_init=True)
    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
-    with pytest.raises(ValueError) as err:
+    with pytest.raises(ValueError, match="cannot pass text prompts when"):
        llm.generate("abc", sampling_params)
-    assert "prompts must be None if" in str(err.value)
    outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
                           sampling_params=sampling_params)
    assert len(outputs) > 0

--- a/tests/engine/test_stop_strings.py
+++ b/tests/engine/test_stop_strings.py
@@ -7,6 +7,8 @@ from vllm import CompletionOutput, LLMEngine, SamplingParams
 MODEL = "meta-llama/llama-2-7b-hf"
 MAX_TOKENS = 200
+IS_ASYNC = False
 @pytest.fixture(scope="session")
 def vllm_model(vllm_runner):
@@ -14,99 +16,148 @@ def vllm_model(vllm_runner):
        yield vllm_model
-@pytest.mark.skip_global_cleanup
+def _test_stopping(llm_engine: LLMEngine,
-def test_stop_basic(vllm_model):
+                   expected_output: str,
-    _test_stopping(vllm_model.model.llm_engine,
+                   expected_reason: Any,
+                   stop: Optional[List[str]] = None,
+                   stop_token_ids: Optional[List[int]] = None,
+                   include_in_output: bool = False,
+                   use_async_output_proc: bool = False) -> None:
+    llm_engine.add_request(
+        "id", "A story about vLLM:\n",
+        SamplingParams(
+            temperature=0.0,
+            max_tokens=MAX_TOKENS,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
+            include_stop_str_in_output=include_in_output,
+        ), None)
+    output: Optional[CompletionOutput] = None
+    output_text = ""
+    stop_reason = None
+    if use_async_output_proc:
+        llm_engine.step()
+    while llm_engine.has_unfinished_requests():
+        (request_output, ) = llm_engine.step()
+        (output, ) = request_output.outputs
+        # Ensure we don't backtrack
+        assert output.text.startswith(output_text)
+        output_text = output.text
+        stop_reason = output.stop_reason
+    assert output is not None
+    assert output_text == expected_output
+    assert stop_reason == expected_reason
+def _set_async_mode(llm_engine, is_async):
+    llm_engine.scheduler[0].use_async_output_proc = is_async
+def _stop_basic(llm_engine, is_async):
+    _test_stopping(llm_engine,
                   stop=["."],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=".")
+                   expected_reason=".",
+                   use_async_output_proc=is_async)
-    _test_stopping(vllm_model.model.llm_engine,
+    _test_stopping(llm_engine,
                   stop=["."],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization.",
-                   expected_reason=".")
+                   expected_reason=".",
+                   use_async_output_proc=is_async)
-@pytest.mark.skip_global_cleanup
+def _stop_multi_tokens(llm_engine, is_async):
-def test_stop_multi_tokens(vllm_model):
    _test_stopping(
-        vllm_model.model.llm_engine,
+        llm_engine,
        stop=["group of peo", "short"],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer organization. We are a ",
-        expected_reason="group of peo")
+        expected_reason="group of peo",
+        use_async_output_proc=is_async)
    _test_stopping(
-        vllm_model.model.llm_engine,
+        llm_engine,
        stop=["group of peo", "short"],
        include_in_output=True,
        expected_output=
        "VLLM is a 100% volunteer organization. We are a group of peo",
-        expected_reason="group of peo")
+        expected_reason="group of peo",
+        use_async_output_proc=is_async)
-@pytest.mark.skip_global_cleanup
+def _stop_partial_token(llm_engine, is_async):
-def test_stop_partial_token(vllm_model):
+    _test_stopping(llm_engine,
-    _test_stopping(vllm_model.model.llm_engine,
                   stop=["gani"],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer or",
-                   expected_reason="gani")
+                   expected_reason="gani",
+                   use_async_output_proc=is_async)
-    _test_stopping(vllm_model.model.llm_engine,
+    _test_stopping(llm_engine,
                   stop=["gani"],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organi",
-                   expected_reason="gani")
+                   expected_reason="gani",
+                   use_async_output_proc=is_async)
-@pytest.mark.skip_global_cleanup
+def _stop_token_id(llm_engine, is_async):
-def test_stop_token_id(vllm_model):
    # token id 13013 => " organization"
-    _test_stopping(vllm_model.model.llm_engine,
+    _test_stopping(llm_engine,
                   stop_token_ids=[13013],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer",
-                   expected_reason=13013)
+                   expected_reason=13013,
+                   use_async_output_proc=is_async)
-    _test_stopping(vllm_model.model.llm_engine,
+    _test_stopping(llm_engine,
                   stop_token_ids=[13013],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=13013)
+                   expected_reason=13013,
+                   use_async_output_proc=is_async)
-def _test_stopping(llm_engine: LLMEngine,
+@pytest.mark.skip_global_cleanup
-                   expected_output: str,
+def test_stop_basic(vllm_model):
-                   expected_reason: Any,
+    _set_async_mode(vllm_model.model.llm_engine, True)
-                   stop: Optional[List[str]] = None,
+    _stop_basic(vllm_model.model.llm_engine, is_async=True)
-                   stop_token_ids: Optional[List[int]] = None,
-                   include_in_output: bool = False) -> None:
-    llm_engine.add_request(
-        "id", "A story about vLLM:\n",
-        SamplingParams(
-            temperature=0.0,
-            max_tokens=MAX_TOKENS,
-            stop=stop,
-            stop_token_ids=stop_token_ids,
-            include_stop_str_in_output=include_in_output,
-        ), None)
-    output: Optional[CompletionOutput] = None
+    _set_async_mode(vllm_model.model.llm_engine, False)
-    output_text = ""
+    _stop_basic(vllm_model.model.llm_engine, is_async=False)
-    stop_reason = None
-    while llm_engine.has_unfinished_requests():
-        (request_output, ) = llm_engine.step()
-        (output, ) = request_output.outputs
-        # Ensure we don't backtrack
-        assert output.text.startswith(output_text)
-        output_text = output.text
-        stop_reason = output.stop_reason
-    assert output is not None
+@pytest.mark.skip_global_cleanup
-    assert output_text == expected_output
+def test_stop_multi_tokens(vllm_model):
-    assert stop_reason == expected_reason
+    _set_async_mode(vllm_model.model.llm_engine, True)
+    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=True)
+    _set_async_mode(vllm_model.model.llm_engine, False)
+    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=False)
+@pytest.mark.skip_global_cleanup
+def test_stop_partial_token(vllm_model):
+    _set_async_mode(vllm_model.model.llm_engine, True)
+    _stop_partial_token(vllm_model.model.llm_engine, is_async=True)
+    _set_async_mode(vllm_model.model.llm_engine, False)
+    _stop_partial_token(vllm_model.model.llm_engine, is_async=False)
+@pytest.mark.skip_global_cleanup
+def test_stop_token_id(vllm_model):
+    _set_async_mode(vllm_model.model.llm_engine, True)
+    _stop_token_id(vllm_model.model.llm_engine, is_async=True)
+    _set_async_mode(vllm_model.model.llm_engine, False)
+    _stop_token_id(vllm_model.model.llm_engine, is_async=False)