Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

705f6a35 · zhuwenwen · af837396 · 4cf256ae · 705f6a35 · 705f6a35
Commit 705f6a35 authored Jul 16, 2024 by zhuwenwen
20 changed files
--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -14,11 +14,11 @@ class TestNaiveBlockAllocator:
                               prev_block: Optional[Block],
                               token_ids: List[int]):
        if allocate_type == "immutable":
-            allocate_block = lambda: allocator.allocate_immutable(
+            allocate_block = lambda: allocator.allocate_immutable_block(
                prev_block=prev_block, token_ids=token_ids)
        elif allocate_type == "mutable":
-            allocate_block = lambda: allocator.allocate_mutable(prev_block=
-                                                                prev_block)
+            allocate_block = lambda: allocator.allocate_mutable_block(
+                prev_block=prev_block)
        else:
            raise ValueError()


--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -26,11 +26,10 @@ class TestPrefixCachingBlock:
        token_ids = list(range(num_to_fill))
        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)

-        block_with_prev = PrefixCachingBlock(
-            prev_block=None,
-            token_ids=token_ids,
-            block_size=block_size,
-            prefix_caching_allocator=mock_allocator)
+        block_with_prev = PrefixCachingBlock(prev_block=None,
+                                             token_ids=token_ids,
+                                             block_size=block_size,
+                                             allocator=mock_allocator)

        if is_curr_block_full:
            # Expect hash since block is full.
@@ -71,7 +70,7 @@ class TestPrefixCachingBlock:
            prev_block=previous_block,
            token_ids=token_ids,
            block_size=block_size,
-            prefix_caching_allocator=mock_allocator,
+            allocator=mock_allocator,
        )

        if is_curr_block_full and prev_block_has_hash:
@@ -123,7 +122,7 @@ class TestPrefixCachingBlock:
                     num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
        """Helper method which creates a chain of blocks.
        """
-        blocks = []
+        blocks: List[PrefixCachingBlock] = []
        num_blocks = math.ceil(
            len(token_ids) / block_size) + num_empty_trailing_blocks

@@ -138,7 +137,7 @@ class TestPrefixCachingBlock:
                prev_block=prev_block,
                token_ids=[],
                block_size=block_size,
-                prefix_caching_allocator=allocator,
+                allocator=allocator,
            )

            tokens_to_append = token_ids[block_number *
@@ -159,11 +158,11 @@ class TestPrefixCachingBlockAllocator:
                               prev_block: Optional[Block],
                               token_ids: List[int]):
        if allocate_type == "immutable":
-            allocate_block = lambda: allocator.allocate_immutable(
+            allocate_block = lambda: allocator.allocate_immutable_block(
                prev_block=prev_block, token_ids=token_ids)
        elif allocate_type == "mutable":
-            allocate_block = lambda: allocator.allocate_mutable(prev_block=
-                                                                prev_block)
+            allocate_block = lambda: allocator.allocate_mutable_block(
+                prev_block=prev_block)
        else:
            raise ValueError()

@@ -233,12 +232,13 @@ class TestPrefixCachingBlockAllocator:

        # Expect allocation with unseen hash to fail.
        with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_immutable(prev_block=chain[-1],
-                                         token_ids=list(range(block_size)))
+            allocator.allocate_immutable_block(prev_block=chain[-1],
+                                               token_ids=list(
+                                                   range(block_size)))

        # Expect mutable allocation to fail.
        with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_mutable(prev_block=chain[-1])
+            allocator.allocate_mutable_block(prev_block=chain[-1])

        # Expect allocation of exact same chain to pass.
        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
@@ -270,7 +270,7 @@ class TestPrefixCachingBlockAllocator:

        # Expect mutable allocation to fail.
        with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_mutable(prev_block=None)
+            allocator.allocate_mutable_block(prev_block=None)

        block_to_free = chain[-1]

@@ -280,11 +280,11 @@ class TestPrefixCachingBlockAllocator:
            allocator.free(block_to_free)
            assert block_to_free.block_id is None, i

-            new_block = allocator.allocate_mutable(prev_block=None)
+            new_block = allocator.allocate_mutable_block(prev_block=None)
            assert new_block.block_id == block_id, i

            with pytest.raises(BlockAllocator.NoFreeBlocksError):
-                allocator.allocate_mutable(prev_block=None)
+                allocator.allocate_mutable_block(prev_block=None)

            block_to_free = new_block

@@ -376,7 +376,6 @@ class TestPrefixCachingBlockAllocator:

        # Create token ids that will exhaust all blocks.
        token_ids = list(range(num_blocks_to_consume * block_size))
-        blocks = list(range(num_blocks_to_consume))

        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
@@ -384,9 +383,6 @@ class TestPrefixCachingBlockAllocator:
            allocator=allocator,
        )

-        # mark all blocks in first chain as computed
-        allocator.mark_blocks_as_computed(blocks)
-
        # After zero_point, second_chain's token_ids would be set -1, which
        # make it different from here comparing with first_chain
        zero_point = random.randint(1, len(token_ids) - 1)
@@ -424,15 +420,16 @@ class TestPrefixCachingBlockAllocator:
                                                block_size=block_size)
        token_ids = list(range(block_size))

-        block = allocator.allocate_immutable(prev_block=None,
-                                             token_ids=token_ids)
+        block = allocator.allocate_immutable_block(prev_block=None,
+                                                   token_ids=token_ids)

        assert allocator._refcounter.get(block.block_id) == 1
-        m = allocator.allocate_mutable(prev_block=None)
+        m = allocator.allocate_mutable_block(prev_block=None)

        block_id = m.block_id
        for i in range(block_size):
            m.append_token_ids([i])
+
        # After block get promoted to immutable from mutable, if there is
        # already same content hash block, then it shall be released into
        # hashless_allocator
@@ -452,48 +449,79 @@ class TestPrefixCachingBlockAllocator:

        all_blocks_list = [i for i in range(num_blocks)]
        zero_ref = {i: 0 for i in range(num_blocks)}
+        one_ref = {i: 1 for i in range(num_blocks)}
        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
        token_ids = list(range(num_blocks * block_size))

-        # now we have num_blocks free blocks in hashless allocator
-        # with internal tracking list _blocks _cached_blocks and evictor
-        # empty and block's ref shall be 0
+        # Verify initial/pre-alloc state
+
+        # Ensure all blocks are free inside hashless allocator
        assert list(allocator._hashless_allocator._free_block_indices
                    ) == all_blocks_list
-        assert len(allocator._blocks.keys()) == 0
+        # Ensure no tracked blocks
+        assert len(allocator._block_tracker.keys()) == num_blocks
+        for block_id in range(num_blocks):
+            assert not allocator._block_tracker[block_id].active
+        # Ensure no cached blocks
        assert len(allocator._cached_blocks.values()) == 0
+        # Ensure no evicted blocks
        assert len(allocator.evictor.free_table.keys()) == 0
+        # Ensure 0s ref counts for all blocks
        assert allocator._refcounter._refcounts == zero_ref

        # Allocate immutable chains with only one block residuled in
        new_block = []
        for i in range(num_blocks):
-            block = allocator.allocate_immutable(
+            block = allocator.allocate_immutable_block(
                prev_block=None,
                token_ids=token_ids[block_size * i:block_size * (i + 1)])
            new_block.append(block)

+        # Verify post-alloc state
+
+        # Ensure no blocks are free inside hashless allocator
+        assert (len(allocator._hashless_allocator._free_block_indices) == 0)
+        # Ensure all blocks are tracked
+        assert len(allocator._block_tracker.keys()) == num_blocks
+        for block_id in range(num_blocks):
+            assert allocator._block_tracker[block_id].active
+        # Ensure all blocks are cached (all promoted)
+        assert len(allocator._cached_blocks.values()) == num_blocks
+        # Ensure no evicted blocks
+        assert len(allocator.evictor.free_table.keys()) == 0
+        # Ensure 1s ref counts for all blocks
+        assert allocator._refcounter._refcounts == one_ref
+
        # Free all blocks, and now all blocks shall be in the evictor
-        # there shall be no tracking data left in _blocks
+        # there shall be no tracking data left in _block_tracker
        # all blocks shall be tracked in _cached_blocks
        # all blocks' ref shall be zero
        for block in new_block:
            allocator.free(block)

-        assert len(allocator._blocks.keys()) == 0
+        # Verify post-free state
+
+        # Ensure no tracked blocks
+        assert len(allocator._block_tracker.keys()) == num_blocks
+        for block_id in range(num_blocks):
+            assert not allocator._block_tracker[block_id].active
+        # Ensure no blocks in hashless allocator (all promoted)
        assert len(allocator._hashless_allocator._free_block_indices) == 0
+        # Ensure all blocks are cached
        assert list(allocator._cached_blocks.values()) == all_blocks_list
+        # Ensure all blocks are inside the evictor
        assert list(allocator.evictor.free_table.keys()) == all_blocks_list
+        # Ensure 0s refcounts
        assert allocator._refcounter._refcounts == zero_ref

        # Allocate a mutable block, and the first block shall be evicted
        # and set its content hash into None, ref to 1
-        mutable = allocator.allocate_mutable(prev_block=None)
+        mutable = allocator.allocate_mutable_block(prev_block=None)

        assert mutable.block_id == 0
        assert mutable.content_hash is None
-        assert 0 in allocator._blocks
+        assert allocator._block_tracker[0].active
        assert allocator._refcounter.get(0) == 1
        assert 0 not in allocator._cached_blocks
        assert 0 not in allocator.evictor
@@ -502,27 +530,27 @@ class TestPrefixCachingBlockAllocator:
        # hashless allocator
        allocator.free(mutable)

-        assert len(allocator._blocks.keys()) == 0
+        assert not allocator._block_tracker[0].active
        assert allocator._refcounter._refcounts == zero_ref
        assert 0 not in allocator._cached_blocks
        assert 0 not in allocator.evictor
        assert 0 in allocator._hashless_allocator._free_block_indices

-        # when allocate immutable with first block_size tokens, we
+        # When allocate immutable with first block_size tokens, we
        # shall get free block from hashless allocator, thus no block left
        # in hashless
-        block = allocator.allocate_immutable(prev_block=None,
-                                             token_ids=token_ids[:block_size])
+        block = allocator.allocate_immutable_block(
+            prev_block=None, token_ids=token_ids[:block_size])

        assert block.block_id == 0
        assert len(allocator._hashless_allocator._free_block_indices) == 0
-        assert 0 in allocator._blocks
+        assert allocator._block_tracker[0].active
        assert 0 in allocator._cached_blocks.values()
        assert allocator._refcounter.get(0) == 1
        assert 0 not in allocator.evictor

        # allocate mutable block again, it shall be popped from evictor
-        mutable = allocator.allocate_mutable(prev_block=None)
+        mutable = allocator.allocate_mutable_block(prev_block=None)
        assert len(allocator._hashless_allocator._free_block_indices) == 0
        assert mutable.block_id not in allocator.evictor.free_table
        assert allocator._refcounter.get(mutable.block_id) == 1
@@ -608,7 +636,7 @@ class TestPrefixCachingBlockAllocator:
    ) -> List[PrefixCachingBlock]:
        """Helper method which creates a chain of blocks.
        """
-        blocks = []
+        blocks: List[Block] = []
        num_blocks = math.ceil(len(token_ids) / block_size)

        if num_blocks == 0:
@@ -619,7 +647,7 @@ class TestPrefixCachingBlockAllocator:
            block_token_ids = token_ids[block_number *
                                        block_size:(block_number + 1) *
                                        block_size]
-            prev_block = allocator.allocate_immutable(
+            prev_block = allocator.allocate_immutable_block(
                prev_block=prev_block, token_ids=block_token_ids)
            blocks.append(prev_block)


--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -149,7 +149,7 @@ def test_complex():
    # Only the first seq group has a new token appended.
    append_new_token(running[0], 1)

-    # Add 2 more requsets.
+    # Add 2 more requests.
    for i in range(2, 4):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
        scheduler.add_seq_group(seq_group)
@@ -483,11 +483,11 @@ def test_chunked_prefill_preempt():
    # The request should be preempted.
    scheduler.block_manager.can_append_slots = MagicMock()

-    def cannot_append_second_group(seq_group, num_lookahead_slots):
+    def cannot_append_second_group1(seq_group, num_lookahead_slots):
        return seq_group.request_id != "1"

    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
+        cannot_append_second_group1)

    # The running prefill is now preempted.
    _, out = schedule_and_update_computed_tokens(scheduler)
@@ -505,11 +505,11 @@ def test_chunked_prefill_preempt():
    assert seq_group.get_num_uncomputed_tokens() == 30

    # We should be able to run prefill twice as it is chunked.
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
+    def cannot_append_second_group2(seq_group, num_lookahead_slots):
        return True

    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
+        cannot_append_second_group2)
    _, out = schedule_and_update_computed_tokens(scheduler)
    assert len(out.scheduled_seq_groups) == 1
    assert out.num_prefill_groups == 1
@@ -530,7 +530,7 @@ def test_chunked_prefill_max_seqs():
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running = []
+    running: List[SequenceGroup] = []

    _, seq_group = create_dummy_prompt("1", prompt_length=65)
    scheduler.add_seq_group(seq_group)

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
 import time
 from collections import deque
-from typing import List
+from typing import Deque, List, Set, Tuple
 from unittest.mock import MagicMock

 import pytest  # noqa
@@ -65,7 +65,7 @@ def test_scheduler_abort_seq_group():

    # Add multiple seq groups to scheduler.
    num_seq_group = 4
-    request_ids = set()
+    request_ids: Set[str] = set()
    for i in range(num_seq_group):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        scheduler.add_seq_group(seq_group)
@@ -347,7 +347,7 @@ def test_prefill_schedule_max_prompt_len():
    Test prompt longer than max_prompt_len is aborted.
    """
    scheduler = initialize_scheduler(max_model_len=30)
-    _, seq_group = create_dummy_prompt(0, prompt_length=60)
+    _, seq_group = create_dummy_prompt("0", prompt_length=60)
    waiting = deque([seq_group])
    budget = create_token_budget()
    remaining_waiting, output = scheduler._schedule_prefills(
@@ -364,7 +364,7 @@ def test_prefill_schedule_token_budget():
    Test token budget respected.
    """
    scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
    budget = create_token_budget(token_budget=0)
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
@@ -419,7 +419,7 @@ def test_prefill_schedule_max_seqs():
    Test max seq respected.
    """
    scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
    budget = create_token_budget(max_num_seqs=2)
    for i in range(3):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
@@ -453,9 +453,9 @@ def test_prefill_schedule_max_lora():
    """
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config)
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
    budget = create_token_budget(token_budget=120)
-    curr_loras = set()
+    curr_loras: Set[int] = set()
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
@@ -499,7 +499,7 @@ def test_prefill_schedule_no_block_manager_capacity():
    Test sequence cannot be scheduled due to block manager has no capacity.
    """
    scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
    budget = create_token_budget()
    for i in range(3):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
@@ -536,7 +536,7 @@ def test_decode_schedule_preempted():
    Test decodes cannot be scheduled and preempted.
    """
    scheduler = initialize_scheduler()
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
    for i in range(3):
@@ -577,7 +577,7 @@ def test_decode_swap_beam_search():
    Test best_of > 1 swap out blocks
    """
    scheduler = initialize_scheduler()
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
    budget = create_token_budget()
@@ -628,7 +628,7 @@ def test_schedule_decode_blocks_to_copy_update():
    """
    scheduler = initialize_scheduler()
    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
    scheduler._allocate_and_set_running(seq_group)
@@ -656,10 +656,10 @@ def test_schedule_decode_blocks_to_copy_update():

 def test_schedule_swapped_simple():
    scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
@@ -683,10 +683,10 @@ def test_schedule_swapped_simple():

 def test_schedule_swapped_max_token_budget():
    scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
    for _ in range(2):
        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
        scheduler._allocate_and_set_running(seq_group)
@@ -717,10 +717,10 @@ def test_schedule_swapped_max_token_budget():

 def test_schedule_swapped_max_seqs():
    scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
    for i in range(4):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
        scheduler._allocate_and_set_running(seq_group)
@@ -750,10 +750,10 @@ def test_schedule_swapped_max_seqs():
 def test_schedule_swapped_max_loras():
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config)
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
-    curr_loras = set()
-    blocks_to_swap_out = []
+    curr_loras: Set[int] = set()
+    blocks_to_swap_out: List[Tuple[int, int]] = []
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
@@ -779,10 +779,10 @@ def test_schedule_swapped_max_loras():

 def test_schedule_swapped_cannot_swap_in():
    scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
    for _ in range(2):
        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
        scheduler._allocate_and_set_running(seq_group)
@@ -806,10 +806,10 @@ def test_schedule_swapped_cannot_swap_in():

 def test_infeasible_swap():
    scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
    for _ in range(2):
        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
        scheduler._allocate_and_set_running(seq_group)
@@ -834,13 +834,13 @@ def test_infeasible_swap():

 def test_schedule_swapped_blocks_to_copy():
    scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
    scheduler._swap_out(seq_group, blocks_to_swap_out)
    swapped.append(seq_group)


--- a/tests/core/utils.py
+++ b/tests/core/utils.py
 import time
-from typing import Iterable, Optional, Tuple
+from typing import List, Optional
+from typing import Sequence as GenericSequence
+from typing import Tuple

 from vllm import SamplingParams
 from vllm.lora.request import LoRARequest
@@ -46,7 +48,7 @@ def create_dummy_prompt_encoder_decoder(
    lora_request: Optional[LoRARequest] = None,
    use_beam_search: bool = False,
    best_of: int = 1,
-) -> Tuple[Sequence, SequenceGroup]:
+) -> Tuple[Sequence, Sequence, SequenceGroup]:
    if not block_size:
        block_size = decoder_prompt_length

@@ -86,7 +88,7 @@ def create_dummy_prompt_encoder_decoder(

 def create_seq_group(
        seq_prompt_len: int = 1024,
-        seq_output_lens: Iterable[int] = (128, ),
+        seq_output_lens: GenericSequence[int] = (128, ),
        request_id: str = '0',
        seq_id_start: int = 0,
        sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
@@ -98,7 +100,7 @@ def create_seq_group(

    prompt_token_ids = [0] * seq_prompt_len

-    seqs = []
+    seqs: List[Sequence] = []
    for seq_id_offset, output_len in enumerate(seq_output_lens):
        seq = Sequence(
            seq_id=seq_id_start + seq_id_offset,
@@ -125,7 +127,7 @@ def create_seq_group(

 def create_seq_group_encoder_decoder(
        seq_prompt_len: int = 1024,
-        seq_output_lens: Iterable[int] = (128, ),
+        seq_output_lens: GenericSequence[int] = (128, ),
        request_id: str = '0',
        seq_id_start: int = 0,
        sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:

--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -15,16 +15,18 @@ TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
 import os

 import pytest
-import torch
+
+from vllm.utils import cuda_device_count_stateless
+
+from ..models.utils import check_outputs_equal

 MODELS = [
    os.environ["TEST_DIST_MODEL"],
 ]
 DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"
-VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
@@ -39,24 +41,23 @@ def test_models(
 ) -> None:
    distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)

-    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
-    enforce_eager = backend_by_env_var == "FLASHINFER"
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model,
                     dtype=dtype,
                     tensor_parallel_size=2,
-                     enforce_eager=enforce_eager,
                     distributed_executor_backend=distributed_executor_backend
                     ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -14,7 +14,10 @@ TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
 import os

 import pytest
-import torch
+
+from vllm.utils import cuda_device_count_stateless
+
+from ..models.utils import check_outputs_equal

 MODELS = [
    os.environ["TEST_DIST_MODEL"],
@@ -22,7 +25,7 @@ MODELS = [
 DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
@@ -45,8 +48,10 @@ def test_models(
    enable_chunked_prefill = True
    max_num_batched_tokens = chunked_prefill_token_size

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).

    with vllm_runner(
            model,
@@ -59,10 +64,12 @@ def test_models(
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -8,12 +8,11 @@ import pytest
 import ray
 import torch

-from vllm.distributed import (broadcast_tensor_dict,
+from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
                              tensor_model_parallel_all_gather,
                              tensor_model_parallel_all_reduce)

-from ..utils import (init_test_distributed_environment,
-                     multi_process_tensor_parallel)
+from ..utils import init_test_distributed_environment, multi_process_parallel


 @ray.remote(num_gpus=1, max_calls=1)
@@ -33,7 +32,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
        (r + 1) for r in range(tp_size)
    ]
    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
-    t = all_tensors[rank]
+    t = all_tensors[rank % tp_size]
    t = tensor_model_parallel_all_reduce(t)
    assert torch.allclose(t, expected)

@@ -61,7 +60,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
            for r in range(tp_size)
        ]
        expected = torch.cat(all_tensors, dim=all_gather_dimension)
-        t = all_tensors[rank]
+        t = all_tensors[rank % tp_size]
        t = tensor_model_parallel_all_gather(t, all_gather_dimension)
        assert torch.allclose(t, expected)

@@ -92,7 +91,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
        "f": torch.tensor([], dtype=torch.float32, device="cuda"),
    }

-    if rank == 0:
+    if (rank % tp_size) == 0:
        broadcast_tensor_dict(test_dict, src=0)
    else:
        recv_dict = broadcast_tensor_dict(src=0)
@@ -105,6 +104,68 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
        assert torch.allclose(recv_dict["f"], test_dict["f"])


+@ray.remote(num_gpus=1, max_calls=1)
+def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
+                                      distributed_init_port: str):
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(tp_size, pp_size, rank,
+                                      distributed_init_port)
+
+    test_dict = {
+        # device tensor
+        "a": torch.arange(8, dtype=torch.float32, device="cuda"),
+        # CPU tensor
+        "b": torch.arange(16, dtype=torch.int8, device="cpu"),
+        "c": "test",
+        "d": [1, 2, 3],
+        "e": {
+            "a": 1,
+            "b": 2
+        },
+        # empty tensor
+        "f": torch.tensor([], dtype=torch.float32, device="cuda"),
+    }
+
+    if not get_pp_group().is_first_rank:
+        recv_dict = get_pp_group().recv_tensor_dict()
+
+    if not get_pp_group().is_last_rank:
+        get_pp_group().send_tensor_dict(test_dict)
+
+    if not get_pp_group().is_first_rank:
+        assert len(recv_dict) == len(test_dict)
+        assert torch.allclose(recv_dict["a"], test_dict["a"])
+        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        assert recv_dict["c"] == test_dict["c"]
+        assert recv_dict["d"] == test_dict["d"]
+        assert recv_dict["e"] == test_dict["e"]
+        assert torch.allclose(recv_dict["f"], test_dict["f"])
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
+                          distributed_init_port: str):
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(tp_size, pp_size, rank,
+                                      distributed_init_port)
+
+    size = 64
+    test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
+
+    if not get_pp_group().is_first_rank:
+        recv_tensor = get_pp_group().recv(size, dtype=torch.float32)
+
+    if not get_pp_group().is_last_rank:
+        get_pp_group().send(test_tensor)
+
+    if not get_pp_group().is_first_rank:
+        assert torch.allclose(test_tensor, recv_tensor)
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("tp_size", [2])
@@ -113,4 +174,27 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
    broadcast_tensor_dict_test_worker
 ])
 def test_multi_process_tensor_parallel(tp_size, test_target):
-    multi_process_tensor_parallel(tp_size, 1, test_target)
+    multi_process_parallel(tp_size, 1, test_target)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("pp_size", [2])
+@pytest.mark.parametrize(
+    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
+def test_multi_process_pipeline_parallel(pp_size, test_target):
+    multi_process_parallel(1, pp_size, test_target)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.parametrize("tp_size", [2])
+@pytest.mark.parametrize("pp_size", [2])
+@pytest.mark.parametrize("test_target", [
+    send_recv_test_worker, send_recv_tensor_dict_test_worker,
+    all_reduce_test_worker, all_gather_test_worker,
+    broadcast_tensor_dict_test_worker
+])
+def test_multi_process_tensor_parallel_pipeline_parallel(
+        tp_size, pp_size, test_target):
+    multi_process_parallel(tp_size, pp_size, test_target)
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -7,12 +7,12 @@ import torch
 import torch.distributed as dist

 from vllm.distributed.communication_op import (  # noqa
-    graph_capture, tensor_model_parallel_all_reduce)
+    tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
-                                             get_tp_ca_communicator)
+                                             get_tp_group, graph_capture)

-from ..utils import (init_test_distributed_environment,
-                     multi_process_tensor_parallel)
+from ..utils import (ensure_model_parallel_initialized,
+                     init_test_distributed_environment, multi_process_parallel)

 random.seed(42)
 test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
@@ -27,8 +27,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)
-
-    group = get_tensor_model_parallel_group()
+    ensure_model_parallel_initialized(tp_size, pp_size)
+    group = get_tensor_model_parallel_group().device_group

    # A small all_reduce for warmup.
    # this is needed because device communicators might be created lazily
@@ -91,7 +91,7 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
    # communicate independently
    num_communication = rank // tp_size + 1
    sz = 1024
-    fa = get_tp_ca_communicator()
+    fa = get_tp_group().ca_comm
    inp = torch.ones(sz, dtype=torch.float32, device=device)
    out = inp
    for _ in range(num_communication):
@@ -112,4 +112,4 @@ def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
-    multi_process_tensor_parallel(tp_size, pipeline_parallel_size, test_target)
+    multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
--- a/tests/distributed/test_multimodal_broadcast.py
+++ b/tests/distributed/test_multimodal_broadcast.py
+"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
+The second test will hang if more than one test is run per command, so we need
+to run the tests one by one. The solution is to pass arguments (model name) by
+environment variables.
+
+Run:
+```sh
+TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \
+    test_multimodal_broadcast.py
+TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct \
+    test_multimodal_broadcast.py
+```
+"""
+import os
+
+import pytest
+
+from vllm.utils import cuda_device_count_stateless
+
+model = os.environ["TEST_DIST_MODEL"]
+
+if model.startswith("llava-hf/llava"):
+    from ..models.test_llava import models, run_test
+elif model.startswith("microsoft/Phi-3-vision"):
+    from ..models.test_phi3v import models, run_test
+else:
+    raise NotImplementedError(f"Unsupported model: {model}")
+
+
+@pytest.mark.parametrize("tensor_parallel_size", [2])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(hf_runner, vllm_runner, image_assets,
+                tensor_parallel_size: int, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
+    if cuda_device_count_stateless() < tensor_parallel_size:
+        pytest.skip(
+            f"Need at least {tensor_parallel_size} GPUs to run the test.")
+
+    distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model=models[0],
+        size_factors=[1.0],
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+    )
--- a/tests/distributed/test_parallel_state.py
+++ b/tests/distributed/test_parallel_state.py
+from typing import Any, Dict
+
+import pytest
+import torch
+
+from vllm.distributed.parallel_state import (_split_tensor_dict,
+                                             _update_nested_dict)
+
+
+def test_split_tensor_dict():
+    test_dict = {
+        "key_a": "a",
+        "key_b": torch.arange(8, dtype=torch.float32),
+        "key_c": {
+            "key_1": torch.arange(5, dtype=torch.float32),
+            "key_2": torch.tensor([], dtype=torch.float32),
+            "key_3": 123,
+        },
+        "key_d": {},
+    }
+    metadata_list, tensor_list = _split_tensor_dict(test_dict)
+    assert len(metadata_list) == 6
+    assert torch.allclose(tensor_list[0], test_dict["key_b"])
+    assert torch.allclose(tensor_list[1], test_dict["key_c"]["key_1"])
+    assert torch.allclose(tensor_list[2], test_dict["key_c"]["key_2"])
+
+
+def test_split_tensor_dict_invalid_key():
+    test_dict = {
+        "a%b": "a",
+    }
+    with pytest.raises(AssertionError):
+        _split_tensor_dict(test_dict)
+
+
+def test_update_nested_dict():
+    flattened_keys_values = [("key1%key2%key3", "value1"),
+                             ("key1%key2%key4", "value2"),
+                             ("key1%key5", "value3"), ("key6%key7", "value4"),
+                             ("key8", "value5")]
+    res: Dict[str, Any] = {}
+
+    for flat_key, value in flattened_keys_values:
+        _update_nested_dict(res, flat_key, value)
+    assert res == {
+        "key1": {
+            "key2": {
+                "key3": "value1",
+                "key4": "value2"
+            },
+            "key5": "value3"
+        },
+        "key6": {
+            "key7": "value4"
+        },
+        "key8": "value5"
+    }
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
+import os
+
+import openai  # use the official client for correctness check
+import pytest
+
+from ..utils import RemoteOpenAIServer
+
+# downloading lora to test lora requests
+
+# any model with a chat template should work here
+MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
+EAGER_MODE = bool(int(os.getenv("EAGER_MODE", 0)))
+CHUNKED_PREFILL = bool(int(os.getenv("CHUNKED_PREFILL", 0)))
+TP_SIZE = int(os.getenv("TP_SIZE", 1))
+PP_SIZE = int(os.getenv("PP_SIZE", 1))
+
+pytestmark = pytest.mark.asyncio
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--model",
+        MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--pipeline-parallel-size",
+        str(PP_SIZE),
+        "--tensor-parallel-size",
+        str(TP_SIZE),
+        "--distributed-executor-backend",
+        "ray",
+    ]
+    if CHUNKED_PREFILL:
+        args += [
+            "--enable-chunked-prefill",
+        ]
+    if EAGER_MODE:
+        args += [
+            "--enforce-eager",
+        ]
+    with RemoteOpenAIServer(args) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="module")
+def client(server):
+    return server.get_async_client()
+
+
+async def test_check_models(server, client: openai.AsyncOpenAI):
+    models = await client.models.list()
+    models = models.data
+    served_model = models[0]
+    assert served_model.id == MODEL_NAME
+    assert all(model.root == MODEL_NAME for model in models)
+
+
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_single_completion(server, client: openai.AsyncOpenAI,
+                                 model_name: str):
+    completion = await client.completions.create(model=model_name,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+
+
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_batch_completions(server, client: openai.AsyncOpenAI,
+                                 model_name: str):
+    # test simple list
+    batch = await client.completions.create(
+        model=model_name,
+        prompt=["Hello, my name is", "Hello, my name is"],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert len(batch.choices) == 2
+    assert batch.choices[0].text == batch.choices[1].text
+
+    # test n = 2
+    batch = await client.completions.create(
+        model=model_name,
+        prompt=["Hello, my name is", "Hello, my name is"],
+        n=2,
+        max_tokens=5,
+        temperature=0.0,
+        extra_body=dict(
+            # NOTE: this has to be true for n > 1 in vLLM, but not necessary
+            # for official client.
+            use_beam_search=True),
+    )
+    assert len(batch.choices) == 4
+    assert batch.choices[0].text != batch.choices[
+        1].text, "beam search should be different"
+    assert batch.choices[0].text == batch.choices[
+        2].text, "two copies of the same prompt should be the same"
+    assert batch.choices[1].text == batch.choices[
+        3].text, "two copies of the same prompt should be the same"
+
+    # test streaming
+    batch = await client.completions.create(
+        model=model_name,
+        prompt=["Hello, my name is", "Hello, my name is"],
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+    )
+    texts = [""] * 2
+    async for chunk in batch:
+        assert len(chunk.choices) == 1
+        choice = chunk.choices[0]
+        texts[choice.index] += choice.text
+    assert texts[0] == texts[1]
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
 import multiprocessing
 import os
+from typing import Dict, List

 import pytest
 import torch
 import torch.distributed

 from vllm.distributed.communication_op import (  # noqa
-    graph_capture, tensor_model_parallel_all_reduce)
+    tensor_model_parallel_all_reduce)
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
 from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             get_world_group, graph_capture,
                                             init_distributed_environment)
 from vllm.utils import update_environment_variables


 def distributed_run(fn, world_size):
    number_of_processes = world_size
-    processes = []
+    processes: List[multiprocessing.Process] = []
    for i in range(number_of_processes):
-        env = {}
+        env: Dict[str, str] = {}
        env['RANK'] = str(i)
        env['LOCAL_RANK'] = str(i)
        env['WORLD_SIZE'] = str(number_of_processes)
@@ -53,7 +55,8 @@ def worker_fn_wrapper(fn):

 @worker_fn_wrapper
 def worker_fn():
-    pynccl_comm = PyNcclCommunicator()
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
    tensor = torch.ones(16, 1024, 1024,
                        dtype=torch.float32).cuda(pynccl_comm.rank)
    with pynccl_comm.change_state(enable=True):
@@ -129,7 +132,8 @@ def test_pynccl_multiple_allreduce_with_vllm():
 def worker_fn_with_cudagraph():
    with torch.no_grad():
        graph = torch.cuda.CUDAGraph()
-        pynccl_comm = PyNcclCommunicator()
+        pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                         device=get_world_group().device)
        # run something in the default stream to initialize torch engine
        a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
        torch.cuda.synchronize()
@@ -154,7 +158,8 @@ def test_pynccl_with_cudagraph():

 @worker_fn_wrapper
 def send_recv_worker_fn():
-    pynccl_comm = PyNcclCommunicator()
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
    if pynccl_comm.rank == 0:
        tensor = torch.ones(16, 1024, 1024,
                            dtype=torch.float32).cuda(pynccl_comm.rank)
@@ -163,9 +168,13 @@ def send_recv_worker_fn():
                             dtype=torch.float32).cuda(pynccl_comm.rank)
    with pynccl_comm.change_state(enable=True):
        if pynccl_comm.rank == 0:
-            pynccl_comm.send(tensor)
+            pynccl_comm.send(tensor,
+                             dst=(pynccl_comm.rank + 1) %
+                             pynccl_comm.world_size)
        else:
-            pynccl_comm.recv(tensor)
+            pynccl_comm.recv(tensor,
+                             src=(pynccl_comm.rank - 1) %
+                             pynccl_comm.world_size)
    result = tensor.mean().cpu().item()
    assert result == 1

@@ -198,9 +207,13 @@ def multiple_send_recv_worker_fn():
                             device=device)
    with pynccl_comm.change_state(enable=True):
        if torch.distributed.get_rank() in [0, 1]:
-            pynccl_comm.send(tensor)
+            pynccl_comm.send(tensor,
+                             dst=(pynccl_comm.rank + 1) %
+                             pynccl_comm.world_size)
        else:
-            pynccl_comm.recv(tensor)
+            pynccl_comm.recv(tensor,
+                             src=(pynccl_comm.rank - 1) %
+                             pynccl_comm.world_size)
    result = tensor.mean().cpu().item()
    if torch.distributed.get_rank() in [0, 2]:
        assert result == 1

--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -2,10 +2,12 @@ import os

 import torch

-from vllm.distributed.parallel_state import is_in_the_same_node
+from vllm.distributed.parallel_state import in_the_same_node_as

 torch.distributed.init_process_group(backend="gloo")
-test_result = is_in_the_same_node(torch.distributed.group.WORLD)
+test_result = all(
+    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))

 expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
 assert test_result == expected, f"Expected {expected}, got {test_result}"
+print("Same node test passed!")
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
+import multiprocessing
+import random
+import time
+from typing import List
+
+import numpy as np
+import torch.distributed as dist
+
+from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
+from vllm.utils import update_environment_variables
+
+
+def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]:
+    np.random.seed(seed)
+    sizes = np.random.randint(1, 10_000, n)
+    # on average, each array will have 5k elements
+    # with int64, each array will have 40kb
+    return [np.random.randint(1, 100, i) for i in sizes]
+
+
+def distributed_run(fn, world_size):
+    number_of_processes = world_size
+    processes = []
+    for i in range(number_of_processes):
+        env = {}
+        env['RANK'] = str(i)
+        env['LOCAL_RANK'] = str(i)
+        env['WORLD_SIZE'] = str(number_of_processes)
+        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
+        env['MASTER_ADDR'] = 'localhost'
+        env['MASTER_PORT'] = '12345'
+        p = multiprocessing.Process(target=fn, args=(env, ))
+        processes.append(p)
+        p.start()
+
+    for p in processes:
+        p.join()
+
+    for p in processes:
+        assert p.exitcode == 0
+
+
+def worker_fn_wrapper(fn):
+    # `multiprocessing.Process` cannot accept environment variables directly
+    # so we need to pass the environment variables as arguments
+    # and update the environment variables in the function
+    def wrapped_fn(env):
+        update_environment_variables(env)
+        dist.init_process_group(backend="gloo")
+        fn()
+
+    return wrapped_fn
+
+
+@worker_fn_wrapper
+def worker_fn():
+    writer_rank = 2
+    broadcaster = MessageQueue.create_from_process_group(
+        dist.group.WORLD, 40 * 1024, 2, writer_rank)
+    if dist.get_rank() == writer_rank:
+        seed = random.randint(0, 1000)
+        dist.broadcast_object_list([seed], writer_rank)
+    else:
+        recv = [None]
+        dist.broadcast_object_list(recv, writer_rank)
+        seed = recv[0]  # type: ignore
+    dist.barrier()
+    # in case we find a race condition
+    # print the seed so that we can reproduce the error
+    print(f"Rank {dist.get_rank()} got seed {seed}")
+    # test broadcasting with about 400MB of data
+    N = 10_000
+    if dist.get_rank() == writer_rank:
+        arrs = get_arrays(N, seed)
+        for x in arrs:
+            broadcaster.broadcast_object(x)
+            time.sleep(random.random() / 1000)
+    else:
+        arrs = get_arrays(N, seed)
+        for x in arrs:
+            y = broadcaster.broadcast_object(None)
+            assert np.array_equal(x, y)
+            time.sleep(random.random() / 1000)
+    dist.barrier()
+
+
+def test_shm_broadcast():
+    distributed_run(worker_fn, 4)
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
+import ray
+
+import vllm.envs as envs
+from vllm.utils import (cuda_device_count_stateless,
+                        update_environment_variables)
+
+
+@ray.remote
+class _CUDADeviceCountStatelessTestActor:
+
+    def get_count(self):
+        return cuda_device_count_stateless()
+
+    def set_cuda_visible_devices(self, cuda_visible_devices: str):
+        update_environment_variables(
+            {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    def get_cuda_visible_devices(self):
+        return envs.CUDA_VISIBLE_DEVICES
+
+
+def test_cuda_device_count_stateless():
+    """Test that cuda_device_count_stateless changes return value if
+    CUDA_VISIBLE_DEVICES is changed."""
+    actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
+        num_gpus=2).remote()
+    assert sorted(ray.get(
+        actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"]
+    assert ray.get(actor.get_count.remote()) == 2
+    ray.get(actor.set_cuda_visible_devices.remote("0"))
+    assert ray.get(actor.get_count.remote()) == 1
+    ray.get(actor.set_cuda_visible_devices.remote(""))
+    assert ray.get(actor.get_count.remote()) == 0
--- a/tests/engine/output_processor/test_multi_step.py
+++ b/tests/engine/output_processor/test_multi_step.py
@@ -32,7 +32,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):

    output_processor = MultiStepOutputProcessor(
        detokenizer=detokenizer,
-        scheduler=scheduler,
+        scheduler=[scheduler],
        seq_counter=seq_counter,
        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
        stop_checker=stop_checker,
@@ -86,7 +86,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,

    output_processor = MultiStepOutputProcessor(
        detokenizer=detokenizer,
-        scheduler=scheduler,
+        scheduler=[scheduler],
        seq_counter=seq_counter,
        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
        stop_checker=stop_checker,
@@ -148,7 +148,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,

    output_processor = MultiStepOutputProcessor(
        detokenizer=detokenizer,
-        scheduler=scheduler,
+        scheduler=[scheduler],
        seq_counter=seq_counter,
        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
        stop_checker=stop_checker,
@@ -215,7 +215,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,

    output_processor = MultiStepOutputProcessor(
        detokenizer=detokenizer,
-        scheduler=scheduler,
+        scheduler=[scheduler],
        seq_counter=seq_counter,
        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
        stop_checker=stop_checker,

--- a/tests/entrypoints/llm/__init__.py
+++ b/tests/entrypoints/llm/__init__.py
--- a/tests/entrypoints/test_llm_encode.py
+++ b/tests/entrypoints/test_llm_encode.py
@@ -5,7 +5,7 @@ import pytest

 from vllm import LLM, EmbeddingRequestOutput, PoolingParams

-from ..conftest import cleanup
+from ...conftest import cleanup

 MODEL_NAME = "intfloat/e5-mistral-7b-instruct"

@@ -25,8 +25,6 @@ TOKEN_IDS = [
    [1000, 1003, 1001, 1002],
 ]

-pytestmark = pytest.mark.llm
-

 @pytest.fixture(scope="module")
 def llm():

--- a/tests/entrypoints/test_llm_generate.py
+++ b/tests/entrypoints/test_llm_generate.py
@@ -5,7 +5,7 @@ import pytest

 from vllm import LLM, RequestOutput, SamplingParams

-from ..conftest import cleanup
+from ...conftest import cleanup

 MODEL_NAME = "facebook/opt-125m"

@@ -23,8 +23,6 @@ TOKEN_IDS = [
    [0, 3, 1, 2],
 ]

-pytestmark = pytest.mark.llm
-

 @pytest.fixture(scope="module")
 def llm():