Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1

af7f4372 · zhuwenwen · 5e19cdef · 09c77926 · af7f4372 · af7f4372
Commit af7f4372 authored Sep 03, 2024 by zhuwenwen
20 changed files
--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -311,6 +311,68 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)


+@pytest.mark.parametrize("block_size", [8])
+@pytest.mark.parametrize("num_gpu_blocks", [4])
+@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
+@pytest.mark.parametrize("enable_caching", [True, False])
+def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
+                  enable_caching):
+    """ Verify the block manager can correctly determine if a sequence group
+        can be swapped in/out.
+    """
+    num_cpu_blocks = num_gpu_blocks
+    block_manager = BlockSpaceManagerV2(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0,
+                                        enable_caching=enable_caching)
+    prompt, seq_group = create_dummy_prompt(
+        "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
+    prompt.status = SequenceStatus.WAITING
+    block_manager.allocate(seq_group)
+    prompt.status = SequenceStatus.RUNNING
+
+    # Swap seq group from GPU -> CPU.
+    gpu_blocks = block_manager.get_block_table(prompt)
+    assert block_manager.can_swap_out(seq_group)
+    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    mapping = block_manager.swap_out(seq_group)
+    mapping_keys = [key for key, _ in mapping]
+    assert mapping_keys == gpu_blocks
+    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
+    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
+    prompt.status = SequenceStatus.SWAPPED
+
+    # At this moment, we still have enough free blocks to swap in the seq group.
+    if num_lookahead_slots <= block_size:
+        assert block_manager.can_swap_in(seq_group,
+                                         num_lookahead_slots) == AllocStatus.OK
+    else:
+        assert block_manager.can_swap_in(
+            seq_group, num_lookahead_slots) == AllocStatus.NEVER
+
+    # While prompt1 is swapped out, allocating prompt2 evicts 2 cached blocks
+    # from the GPU, so prompt1 can't be swapped in right away
+    prompt2_len = 2 * block_size - 1
+    prompt2, seq_group2 = create_dummy_prompt(
+        "2",
+        prompt_length=prompt2_len,
+        prompt_tokens=[10000 + i for i in range(prompt2_len)])
+    prompt2.status = SequenceStatus.WAITING
+    block_manager.allocate(seq_group2)
+
+    # Swap seq group from CPU -> GPU.
+    if num_lookahead_slots <= block_size:
+        assert block_manager.can_swap_in(
+            seq_group, num_lookahead_slots) == AllocStatus.LATER
+    else:
+        assert block_manager.can_swap_in(
+            seq_group, num_lookahead_slots) == AllocStatus.NEVER
+
+
 # TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.



--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -100,3 +100,45 @@ class TestNaiveBlockAllocator:
        for i, block in enumerate(blocks):
            assert allocator.get_num_free_blocks() == i
            allocator.free(block)
+
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [4])
+    @pytest.mark.parametrize("block_size", [8])
+    def test_naive_block_get_num_blocks_touched(num_blocks, block_size):
+        """ Verify the allocator can correctly return the number of
+        blocks touched, with different lookahead slots.
+        """
+        allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
+                                            num_blocks=num_blocks,
+                                            block_size=block_size)
+        allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock,
+                                            num_blocks=num_blocks,
+                                            block_size=block_size)
+
+        # Create a set of full immutable blocks in the src
+        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
+            "immutable",
+            allocator_src,
+            prev_block=None,
+            token_ids=list(range(block_size)))
+        src_blocks = [allocate_block() for _ in range(num_blocks - 1)]
+
+        # The dst is empty, so every src block counts as touched
+        assert allocator_dst.get_num_blocks_touched(
+            src_blocks) == num_blocks - 1
+
+        # Insert one non-full block in the src
+        allocate_non_full_block = \
+            TestNaiveBlockAllocator.create_allocate_lambda(
+                "mutable", allocator_src,
+                prev_block=src_blocks[-1],token_ids=[]
+            )
+        src_blocks.append(allocate_non_full_block())
+        src_blocks[-1].append_token_ids([0])
+
+        assert allocator_dst.get_num_blocks_touched(
+            src_blocks, num_lookahead_slots=1) == num_blocks
+        assert allocator_dst.get_num_blocks_touched(
+            src_blocks, num_lookahead_slots=block_size - 1) == num_blocks
+        assert allocator_dst.get_num_blocks_touched(
+            src_blocks, num_lookahead_slots=block_size) == (num_blocks + 1)
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -315,6 +315,60 @@ class TestPrefixCachingBlockAllocator:
                                                       i)
            allocator.free(block)

+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [4])
+    @pytest.mark.parametrize("block_size", [8])
+    def test_prefix_caching_block_get_num_blocks_touched(
+            num_blocks, block_size):
+        """ Verify the allocator can correctly return the number of
+        blocks touched, when there are cached prefixes and different
+        lookahead slots.
+        """
+        allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                    block_size=block_size)
+        allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                    block_size=block_size)
+
+        # Create token ids that will exhaust all blocks except the last
+        token_ids = list(range((num_blocks - 1) * block_size))
+
+        # Create a chain of cacheable blocks in the dst
+        cached_blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator_dst,
+        )
+
+        # Create a chain of the same blocks in the src
+        blocks_to_swap_in = \
+            TestPrefixCachingBlockAllocator.create_immutable_chain(
+                block_size=block_size,
+                token_ids=token_ids,
+                allocator=allocator_src,
+            )
+
+        # All blocks are cached
+        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 0
+
+        # Free the first block in the dst
+        allocator_dst.free(cached_blocks[0])
+
+        # Now the first block becomes dangling, the swapped blocks need
+        # to reclaim the first block in the dst
+        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 1
+
+        # Insert one non-full block in the src
+        non_full_block = allocator_src.allocate_mutable_block(
+            blocks_to_swap_in[-1])
+        non_full_block.append_token_ids([0])
+        blocks_to_swap_in.append(non_full_block)
+        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in,
+                                                    num_lookahead_slots=1) == 2
+        assert allocator_dst.get_num_blocks_touched(
+            blocks_to_swap_in, num_lookahead_slots=block_size - 1) == 2
+        assert allocator_dst.get_num_blocks_touched(
+            blocks_to_swap_in, num_lookahead_slots=block_size) == 3
+
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1024])
    @pytest.mark.parametrize("block_size", [16])
@@ -628,6 +682,32 @@ class TestPrefixCachingBlockAllocator:

        assert new_block[0].block_id == last_block_id

+    # Test case for cache metrics
+    @staticmethod
+    def test_metric():
+        block_size = 16
+        allocator = PrefixCachingBlockAllocator(num_blocks=4,
+                                                block_size=block_size)
+        # Test when no query (0/0)
+        assert allocator.get_prefix_cache_hit_rate() == 0.0
+
+        token_ids = list(range(block_size))
+        allocator.allocate_immutable_block(prev_block=None,
+                                           token_ids=token_ids)
+        # Test 0/1 hit rate
+        assert allocator.get_prefix_cache_hit_rate() == 0.0
+
+        allocator.allocate_immutable_block(prev_block=None,
+                                           token_ids=token_ids)
+        # Test 1/2 hit rate
+        assert allocator.get_prefix_cache_hit_rate() == 0.5
+
+        # Test more than one block
+        for _ in range(2, 1005):
+            allocator.allocate_immutable_block(prev_block=None,
+                                               token_ids=token_ids)
+        assert allocator.get_prefix_cache_hit_rate() > 0.99
+
    @staticmethod
    def create_immutable_chain(
        block_size: int,

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -9,33 +9,11 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
 from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler, SchedulingBudget
 from vllm.lora.request import LoRARequest
-from vllm.sequence import Logprob, SequenceGroup, SequenceStatus
+from vllm.sequence import SequenceGroup, SequenceStatus

-from .utils import create_dummy_prompt
-
-
-def get_sequence_groups(scheduler_output):
-    return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
-
-
-def append_new_token(out, token_id: int):
-    seq_groups = get_sequence_groups(out)
-    for seq_group in seq_groups:
-        for seq in seq_group.get_seqs():
-            seq.append_token_id(token_id, {token_id: Logprob(token_id)})
-
-
-def schedule_and_update_computed_tokens(scheduler):
-    metas, out = scheduler.schedule()
-    for s, meta in zip(out.scheduled_seq_groups, metas):
-        s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
-    return metas, out
-
-
-def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
-    seq_group.update_num_computed_tokens(token_chunk_size)
-    for seq in seq_group.get_seqs():
-        seq.append_token_id(token_id, {token_id: Logprob(token_id)})
+from .utils import (append_new_token, append_new_token_seq_group,
+                    create_dummy_prompt, get_sequence_groups,
+                    schedule_and_update_computed_tokens)


 def test_scheduler_add_seq_group():

--- a/tests/core/test_scheduler_encoder_decoder.py
+++ b/tests/core/test_scheduler_encoder_decoder.py
+from typing import List
+
+import pytest  # noqa
+
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.core.scheduler import Scheduler
+from vllm.sequence import SequenceGroup
+
+from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
+                    get_sequence_groups, schedule_and_update_computed_tokens)
+
+
+def test_scheduler_schedule_simple_encoder_decoder():
+    '''
+    Test basic scheduler functionality in the context
+    of an encoder/decoder model. Focus on testing
+    enc/dec-specific functionality since tests already
+    exist for decoder-only functionality
+
+    Test behavior:
+    * Construct Scheduler
+    * Construct dummy encoder/decoder sequence groups
+    * Add dummy seq groups to scheduler backlog
+    * Schedule the next seq group & validate:
+        * Cross-attn block tables
+        * Updated states of seq groups
+        * Number of batched tokens
+        * Number of blocks to copy/swap-in/swap-out
+        * Number of scheduled seq groups
+    * Repeat for both prefill- and decode-phase
+    * Abort scheduled seq groups
+    * Assert that aborted seq groups no longer appear in
+      cross-attention block table
+    '''
+
+    block_size = 4
+    num_seq_group = 4
+    max_model_len = 16
+    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 16  # enc and dec prompts per seq_group
+    cache_config.num_gpu_blocks = 16  # enc and dec prompts per seq_group
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+
+    # Add seq groups to scheduler.
+    req_id_list = []
+    for i in range(num_seq_group):
+        req_id = str(i)
+        req_id_list.append(req_id)
+        _, _, seq_group = create_dummy_prompt_encoder_decoder(
+            req_id, block_size, block_size, block_size)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+
+    # Schedule seq groups prefill.
+    num_tokens = block_size * num_seq_group
+    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
+    # - Verify that sequence group cross-attention block tables are
+    #   registered with the block manager
+    assert all([(req_id in scheduler.block_manager.cross_block_tables)
+                for req_id in req_id_list])
+    # - Validate sequence-group status
+    assert set(get_sequence_groups(out)) == set(running)
+    # - Validate number of batched tokens
+    assert out.num_batched_tokens == num_tokens
+    # - Validate there are no remaining blocks to swap
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    # - Validate all seq groups were scheduled
+    assert len(seq_group_meta_list) == num_seq_group
+    append_new_token(out, 1)
+
+    # Schedule seq groups decode.
+    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
+    # - Verify that sequence group metadata includes encoder attention
+    #   and cross-attention metadata
+    assert all([
+        not ((seq_group_meta.encoder_seq_data is None) or
+             (seq_group_meta.cross_block_table is None))
+        for seq_group_meta in seq_group_meta_list
+    ])
+    # - Validate sequence-group status
+    assert set(get_sequence_groups(out)) == set(running)
+    # - Validate there is one batched token per seq group
+    assert out.num_batched_tokens == num_seq_group
+    # - Validate there are no remaining blocks to swap
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    # - Validate that all seq groups were scheduled
+    assert len(seq_group_meta_list) == num_seq_group
+    append_new_token(out, 1)
+
+    # Abort sequences
+    for req_id in req_id_list:
+        scheduler.abort_seq_group(req_id)
+        # - Verify that sequence group cross-attention block tables are
+        #   NO LONGER registered with the block manager
+        assert req_id not in scheduler.block_manager.cross_block_tables
--- a/tests/core/test_serialization.py
+++ b/tests/core/test_serialization.py
+import msgspec
+
+from vllm.executor.msgspec_utils import decode_hook, encode_hook
+from vllm.sequence import ExecuteModelRequest
+
+from ..spec_decode.utils import create_batch
+
+
+def test_msgspec_serialization():
+    num_lookahead_slots = 4
+    seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots)
+    execute_model_req = ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=num_lookahead_slots,
+        running_queue_size=4)
+
+    encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
+    decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
+                                      dec_hook=decode_hook)
+    req = decoder.decode(encoder.encode(execute_model_req))
+    expected = execute_model_req.seq_group_metadata_list
+    actual = req.seq_group_metadata_list
+    assert (len(expected) == len(actual))
+    expected = expected[0]
+    actual = actual[0]
+
+    assert expected.block_tables == actual.block_tables
+    assert expected.is_prompt == actual.is_prompt
+    assert expected.request_id == actual.request_id
+    assert (expected.seq_data[0].prompt_token_ids ==
+            actual.seq_data[0].prompt_token_ids)
+    assert (expected.seq_data[0].output_token_ids ==
+            actual.seq_data[0].output_token_ids)
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -15,13 +15,15 @@ def create_dummy_prompt(
    lora_request: Optional[LoRARequest] = None,
    use_beam_search: bool = False,
    best_of: int = 1,
+    prompt_tokens: Optional[List[int]] = None,
 ) -> Tuple[Sequence, SequenceGroup]:
    if not block_size:
        block_size = prompt_length

-    # Create dummy prompt sequence with tokens 0...block_size-1
-    # and prompt "0 ... block_size".
-    prompt_tokens = list(range(prompt_length))
+    if prompt_tokens is None:
+        # Create dummy prompt sequence with tokens 0...block_size-1
+        # and prompt "0 ... block_size".
+        prompt_tokens = list(range(prompt_length))
    prompt_str = " ".join([str(t) for t in prompt_tokens])
    prompt = Sequence(int(request_id),
                      inputs={
@@ -53,27 +55,30 @@ def create_dummy_prompt_encoder_decoder(
        block_size = decoder_prompt_length

    # Create dummy prompt sequence with tokens 0...block_size-1
-    # and prompt "0 ... block_size".
+    # and prompt "0 ... block_size". Note that the prompt string
+    # doesn't actually match the tokens
    decoder_prompt_tokens = list(range(decoder_prompt_length))
    decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens])
+    encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length))))
+    encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens])
+
+    inputs = {
+        "prompt": decoder_prompt_str,
+        "prompt_token_ids": decoder_prompt_tokens,
+        "encoder_prompt": encoder_prompt_str,
+        "encoder_prompt_token_ids": encoder_prompt_tokens,
+        "multi_modal_data": None,
+    }

    decoder_prompt = Sequence(int(request_id),
-                              inputs={
-                                  "prompt": decoder_prompt_str,
-                                  "prompt_token_ids": decoder_prompt_tokens,
-                                  "multi_modal_data": None,
-                              },
-                              block_size=block_size)
+                              inputs=inputs,
+                              block_size=block_size,
+                              from_decoder_prompt=True)

-    encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length))))
-    encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens])
    encoder_prompt = Sequence(int(request_id),
-                              inputs={
-                                  "prompt": encoder_prompt_str,
-                                  "prompt_token_ids": encoder_prompt_tokens,
-                                  "multi_modal_data": None,
-                              },
-                              block_size=block_size)
+                              inputs=inputs,
+                              block_size=block_size,
+                              from_decoder_prompt=False)
    seq_group = SequenceGroup(request_id=request_id,
                              seqs=[decoder_prompt],
                              sampling_params=SamplingParams(
@@ -139,17 +144,21 @@ def create_seq_group_encoder_decoder(

    prompt_token_ids = [0] * seq_prompt_len

+    inputs = {
+        "prompt": "",
+        "prompt_token_ids": prompt_token_ids,
+        "encoder_prompt": "",
+        "encoder_prompt_token_ids": prompt_token_ids,
+        "multi_modal_data": None,
+    }
+
    seqs = []
    for seq_id_offset, output_len in enumerate(seq_output_lens):
-        seq = Sequence(
-            seq_id=seq_id_start + seq_id_offset,
-            inputs={
-                "prompt": "",
-                "prompt_token_ids": prompt_token_ids,
-                "multi_modal_data": None,
-            },
-            block_size=16,
-        )
+        # Construct decoder input sequences
+        seq = Sequence(seq_id=seq_id_start + seq_id_offset,
+                       inputs=inputs,
+                       block_size=16,
+                       from_decoder_prompt=True)

        for i in range(output_len):
            seq.append_token_id(
@@ -158,16 +167,11 @@ def create_seq_group_encoder_decoder(
            )
        seqs.append(seq)

-    # Encoder sequence
-    encoder_seq = Sequence(
-        seq_id=seq_id_start + len(seq_output_lens),
-        inputs={
-            "prompt": "",
-            "prompt_token_ids": prompt_token_ids,
-            "multi_modal_data": None,
-        },
-        block_size=16,
-    )
+    # Encoder input sequence
+    encoder_seq = Sequence(seq_id=seq_id_start + len(seq_output_lens),
+                           inputs=inputs,
+                           block_size=16,
+                           from_decoder_prompt=False)

    return SequenceGroup(request_id=request_id,
                         seqs=seqs,
@@ -177,4 +181,31 @@ def create_seq_group_encoder_decoder(


 def round_up_to_next_block(seq_len: int, block_size: int) -> int:
-    return (seq_len + block_size - 1) // block_size
\ No newline at end of file
+    return (seq_len + block_size - 1) // block_size
+
+
+# Helper functions for scheduler tests
+
+
+def get_sequence_groups(scheduler_output):
+    return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
+
+
+def append_new_token(out, token_id: int):
+    seq_groups = get_sequence_groups(out)
+    for seq_group in seq_groups:
+        for seq in seq_group.get_seqs():
+            seq.append_token_id(token_id, {token_id: Logprob(token_id)})
+
+
+def schedule_and_update_computed_tokens(scheduler):
+    metas, out = scheduler.schedule()
+    for s, meta in zip(out.scheduled_seq_groups, metas):
+        s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
+    return metas, out
+
+
+def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
+    seq_group.update_num_computed_tokens(token_chunk_size)
+    for seq in seq_group.get_seqs():
+        seq.append_token_id(token_id, {token_id: Logprob(token_id)})
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -22,7 +22,8 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
 @pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize(
-    "model, distributed_executor_backend, attention_backend, test_suite", [
+    "model, distributed_executor_backend, attention_backend, "
+    "test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),

--- a/tests/distributed/test_basic_distributed_correctness_enc_dec.py
+++ b/tests/distributed/test_basic_distributed_correctness_enc_dec.py
+"""For encoder/decoder models only:
+Compare the outputs of HF and distributed vLLM when using greedy sampling.
+
+Run:
+```sh
+cd $VLLM_PATH/tests
+
+pytest distributed/test_basic_distributed_correctness_enc_dec.py
+```
+"""
+
+import pytest
+from transformers import AutoModelForSeq2SeqLM
+
+from vllm.utils import cuda_device_count_stateless
+
+from ..conftest import DecoderPromptType
+from ..models.utils import check_logprobs_close
+from ..utils import fork_new_process_for_each_test
+
+
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("model, distributed_executor_backend", [
+    ("facebook/bart-large-cnn", "ray"),
+    ("facebook/bart-large-cnn", "mp"),
+])
+@fork_new_process_for_each_test
+def test_models(
+    model: str,
+    distributed_executor_backend: str,
+    hf_runner,
+    vllm_runner,
+    example_encoder_decoder_prompts,
+) -> None:
+    '''
+    Test vLLM BART inference on more than one GPU, comparing
+    outputs against HF as a baseline.
+
+    Fork a new process for each test, to prevent CUDA from
+    being re-initialized by successive tests within the same
+    process.
+
+    Arguments:
+
+    * model: the HF ID of the specific BART variant under test
+    * distributed_executor_backend
+    * hf_runner: HuggingFace (HF) test model runner
+    * vllm_runner: vLLM test model runner
+    * example_encoder_decoder_prompts: test fixture which provides a 
+                                        dictionary of dummy prompts
+    '''
+
+    dtype = "float"
+    max_tokens = 64
+    num_logprobs = 5
+
+    # Example inputs with non-trivial (i.e. not None/empty) encoder &
+    # decoder prompts.
+    test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
+            enforce_eager=True,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+            test_prompts, max_tokens, num_logprobs)
+
+    # Configuration settings for HF baseline
+    hf_kwargs = {
+        "top_k": None,
+        "num_beams": 1,
+        "repetition_penalty": 1.0,
+        "top_p": 1.0,
+        "length_penalty": 1.0,
+        "early_stopping": False,
+        "no_repeat_ngram_size": None,
+        "min_length": 0
+    }
+
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
+        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
+            test_prompts,
+            max_tokens,
+            num_logprobs,
+            **hf_kwargs,
+        ))
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -6,6 +6,8 @@ pytest test_chunked_prefill_distributed.py
 ```
 """

+import os
+
 import pytest

 from vllm.utils import cuda_device_count_stateless
@@ -30,6 +32,11 @@ def test_models(
    model: str,
    distributed_executor_backend: str,
 ) -> None:
+    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray":  # noqa
+        assert distributed_executor_backend == "ray"
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

    dtype = "half"
    max_tokens = 5

--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -34,7 +34,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
    t = all_tensors[rank % tp_size]
    t = tensor_model_parallel_all_reduce(t)
-    assert torch.allclose(t, expected)
+    torch.testing.assert_close(t, expected)


 @ray.remote(num_gpus=1, max_calls=1)
@@ -62,7 +62,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
        expected = torch.cat(all_tensors, dim=all_gather_dimension)
        t = all_tensors[rank % tp_size]
        t = tensor_model_parallel_all_gather(t, all_gather_dimension)
-        assert torch.allclose(t, expected)
+        torch.testing.assert_close(t, expected)


 @ray.remote(num_gpus=1, max_calls=1)
@@ -96,12 +96,12 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
    else:
        recv_dict = broadcast_tensor_dict(src=0)
        assert len(recv_dict) == len(test_dict)
-        assert torch.allclose(recv_dict["a"], test_dict["a"])
-        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        torch.testing.assert_close(recv_dict["a"], test_dict["a"])
+        torch.testing.assert_close(recv_dict["b"], test_dict["b"])
        assert recv_dict["c"] == test_dict["c"]
        assert recv_dict["d"] == test_dict["d"]
        assert recv_dict["e"] == test_dict["e"]
-        assert torch.allclose(recv_dict["f"], test_dict["f"])
+        torch.testing.assert_close(recv_dict["f"], test_dict["f"])


 @ray.remote(num_gpus=1, max_calls=1)
@@ -136,12 +136,12 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,

    if not get_pp_group().is_first_rank:
        assert len(recv_dict) == len(test_dict)
-        assert torch.allclose(recv_dict["a"], test_dict["a"])
-        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        torch.testing.assert_close(recv_dict["a"], test_dict["a"])
+        torch.testing.assert_close(recv_dict["b"], test_dict["b"])
        assert recv_dict["c"] == test_dict["c"]
        assert recv_dict["d"] == test_dict["d"]
        assert recv_dict["e"] == test_dict["e"]
-        assert torch.allclose(recv_dict["f"], test_dict["f"])
+        torch.testing.assert_close(recv_dict["f"], test_dict["f"])


 @ray.remote(num_gpus=1, max_calls=1)
@@ -163,7 +163,7 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
        get_pp_group().send(test_tensor)

    if not get_pp_group().is_first_rank:
-        assert torch.allclose(test_tensor, recv_tensor)
+        torch.testing.assert_close(test_tensor, recv_tensor)


 @pytest.mark.skipif(torch.cuda.device_count() < 2,

--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -72,8 +72,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
                        out2 = tensor_model_parallel_all_reduce(inp2)
                        dist.all_reduce(inp2, group=group)
            graph.replay()
-            assert torch.allclose(out1, inp1)
-            assert torch.allclose(out2, inp2)
+            torch.testing.assert_close(out1, inp1)
+            torch.testing.assert_close(out2, inp2)


 @ray.remote(num_gpus=1, max_calls=1)
@@ -96,13 +96,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
    out = inp
    for _ in range(num_communication):
        out = fa.all_reduce_unreg(out)
-    assert torch.allclose(out, inp * (tp_size**num_communication))
+    torch.testing.assert_close(out, inp * (tp_size**num_communication))

    inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
    out = inp
    for _ in range(num_communication):
        out = fa.all_reduce_unreg(out)
-    assert torch.allclose(out, inp * (tp_size**num_communication))
+    torch.testing.assert_close(out, inp * (tp_size**num_communication))


 @pytest.mark.parametrize("tp_size", [2])

--- a/tests/distributed/test_distributed_oot.py
+++ b/tests/distributed/test_distributed_oot.py
+from ..entrypoints.openai.test_oot_registration import (
+    run_and_test_dummy_opt_api_server)
+
+
+def test_distributed_oot(dummy_opt_path: str):
+    """Run the shared dummy-OPT API server check with ``tp=2``."""
+    tp_size = 2
+    run_and_test_dummy_opt_api_server(dummy_opt_path, tp=tp_size)
--- a/tests/distributed/test_multi_node_assignment.py
+++ b/tests/distributed/test_multi_node_assignment.py
+"""Make sure ray assigns GPU workers to the correct node.
+
+Run:
+```sh
+cd $VLLM_PATH/tests
+
+pytest distributed/test_multi_node_assignment.py
+```
+"""
+
+import os
+
+import pytest
+import ray
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+from vllm import initialize_ray_cluster
+from vllm.config import ParallelConfig
+from vllm.executor.ray_utils import _wait_until_pg_removed
+from vllm.utils import get_ip
+
+VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
+
+
+@pytest.mark.skipif(not VLLM_MULTI_NODE,
+                    reason="Need at least 2 nodes to run the test.")
+def test_multi_node_assignment() -> None:
+    """Check that GPU workers report the same IP as the test process.
+
+    Each of the ten rounds initializes a Ray cluster for a fresh
+    ``ParallelConfig``, starts one actor per GPU bundle of the config's
+    placement group, asserts the actor's IP equals the local one, then kills
+    the actors and calls ``_wait_until_pg_removed`` on the placement group.
+    """
+
+    # NOTE: important to keep this class definition here
+    # to let ray use cloudpickle to serialize it.
+    class Actor:
+
+        def get_ip(self):
+            return get_ip()
+
+    num_rounds = 10
+    for _ in range(num_rounds):
+        parallel_config = ParallelConfig(1, 2)
+        initialize_ray_cluster(parallel_config)
+
+        local_ip = get_ip()
+        actors = []
+        bundle_specs = parallel_config.placement_group.bundle_specs
+        for index, spec in enumerate(bundle_specs):
+            # Only bundles that reserve a GPU get a worker.
+            if spec.get("GPU", 0):
+                strategy = PlacementGroupSchedulingStrategy(
+                    placement_group=parallel_config.placement_group,
+                    placement_group_capture_child_tasks=True,
+                    placement_group_bundle_index=index,
+                )
+                remote_actor_cls = ray.remote(
+                    num_cpus=0,
+                    num_gpus=1,
+                    scheduling_strategy=strategy,
+                )(Actor)
+
+                actor = remote_actor_cls.remote()
+                actor_ip = ray.get(actor.get_ip.remote())
+                assert actor_ip == local_ip
+                actors.append(actor)
+
+        for actor in actors:
+            ray.kill(actor)
+
+        _wait_until_pg_removed(parallel_config.placement_group)
--- a/tests/distributed/test_multimodal_broadcast.py
+++ b/tests/distributed/test_multimodal_broadcast.py
@@ -18,8 +18,10 @@ from ..utils import fork_new_process_for_each_test
 @pytest.mark.parametrize("model, distributed_executor_backend", [
    ("llava-hf/llava-1.5-7b-hf", "ray"),
    ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
+    ("facebook/chameleon-7b", "ray"),
    ("llava-hf/llava-1.5-7b-hf", "mp"),
    ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
+    ("facebook/chameleon-7b", "mp"),
 ])
 @fork_new_process_for_each_test
 def test_models(hf_runner, vllm_runner, image_assets, model: str,
@@ -34,6 +36,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model: str,
        from ..models.test_llava import models, run_test
    elif model.startswith("llava-hf/llava-v1.6"):
        from ..models.test_llava_next import models, run_test
+    elif model.startswith("facebook/chameleon"):
+        from ..models.test_chameleon import models, run_test
    else:
        raise NotImplementedError(f"Unsupported model: {model}")


--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -9,34 +9,36 @@ import os

 import pytest

+from vllm.logger import init_logger
+
 from ..utils import compare_two_settings, fork_new_process_for_each_test

+logger = init_logger("test_pipeline_parallel")
+
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"


 @pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
                          "MODEL_NAME, DIST_BACKEND"),
                         [
-                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
-                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
-                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
+                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
                         ])
+@fork_new_process_for_each_test
 def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
                    DIST_BACKEND):
    if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")

-    USE_RAY_ADAG_NCCL = 0
-    USE_RAY_ADAG = 0
-
    pp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
@@ -70,39 +72,24 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
        pp_args.append("--enforce-eager")
        tp_args.append("--enforce-eager")
    pp_env = None
-    if USE_RAY_ADAG:
-        assert DIST_BACKEND == "ray", (
-            "Ray ADAG is only supported with Ray distributed backend")
+    if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
+            and CHUNKED_PREFILL):
+        # Test Ray ADAG for a subset of the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
-            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
-            str(int(USE_RAY_ADAG_NCCL)),
+            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
+        # Temporary. Currently when zeromq + SPMD is used, it does not properly
+        # terminate because of aDAG issue.
+        pp_args.append("--disable-frontend-multiprocessing")
+        tp_args.append("--disable-frontend-multiprocessing")

-    compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
-
-
-@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
-    (2, "JackFram/llama-160m"),
-])
-@pytest.mark.parametrize("ATTN_BACKEND", [
-    "FLASH_ATTN",
-    "FLASHINFER",
-])
-@fork_new_process_for_each_test
-def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
-    cudagraph_args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--pipeline-parallel-size",
-        str(PP_SIZE),
-        "--distributed-executor-backend",
-        "mp",
-    ]
-    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
-
-    eager_args = cudagraph_args + ["--enforce-eager"]
-
-    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
+    try:
+        compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
+    except Exception:
+        if pp_env is None:
+            raise
+        else:
+            # Ray ADAG tests are flaky, so we don't want to fail the test
+            logger.exception("Ray ADAG tests failed")
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
+import os
+
+import pytest
+
+from ..utils import compare_two_settings, fork_new_process_for_each_test
+
+
+@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
+    (2, "JackFram/llama-160m"),
+])
+@pytest.mark.parametrize("ATTN_BACKEND", ["FLASH_ATTN", "FLASHINFER"])
+@fork_new_process_for_each_test
+def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
+    """Compare runs with and without ``--enforce-eager`` for one backend.
+
+    Both settings share the same dtype, pipeline-parallel size and ``mp``
+    executor backend; only the ``--enforce-eager`` flag differs.
+    """
+    # The attention backend is selected through the environment.
+    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
+
+    # use half precision for speed and memory savings in CI environment
+    shared_args = ["--dtype", "float16"]
+    shared_args += ["--pipeline-parallel-size", str(PP_SIZE)]
+    shared_args += ["--distributed-executor-backend", "mp"]
+
+    compare_two_settings(MODEL_NAME, [*shared_args, "--enforce-eager"],
+                         shared_args)
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
+import pytest
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+@pytest.mark.parametrize(("arg", "expected"), [
+    (None, None),
+    ("image=16", dict(image=16)),
+    ("image=16,video=2", dict(image=16, video=2)),
+])
+def test_limit_mm_per_prompt_parser(arg, expected):
+    """Check how ``--limit-mm-per-prompt`` is parsed by the engine CLI parser.
+
+    ``arg`` is the raw flag value (``None`` means the flag is omitted) and
+    ``expected`` is the value ``args.limit_mm_per_prompt`` must equal.
+    """
+    argv = [] if arg is None else ["--limit-mm-per-prompt", arg]
+    cli_parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    parsed = cli_parser.parse_args(argv)
+
+    assert parsed.limit_mm_per_prompt == expected
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -140,3 +140,22 @@ def test_multiple_sampling_params(llm: LLM):
    # sampling_params is None, default params should be applied
    outputs = llm.generate(PROMPTS, sampling_params=None)
    assert len(PROMPTS) == len(outputs)
+
+
+def test_chat():
+    """Check that ``LLM.chat`` yields one output for one conversation.
+
+    The conversation is a system message followed by a single user turn.
+    """
+
+    system_turn = dict(
+        role="system",
+        content="You are a helpful assistant",
+    )
+    user_turn = dict(role="user", content="Explain the concept of entropy.")
+
+    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    outputs = llm.chat([system_turn, user_turn])
+
+    # A single conversation was submitted, so a single output is expected.
+    assert len(outputs) == 1
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
+import pytest
+
+from vllm import LLM
+
+
+def test_empty_prompt():
+    """Generating from an empty prompt string must raise ``ValueError``."""
+    empty_prompts = [""]
+    llm = LLM(model="gpt2")
+    with pytest.raises(ValueError, match='Prompt cannot be empty'):
+        llm.generate(empty_prompts)