merge v0.4.0

7c4f76e3 · zhuwenwen · 2da0dd3e · 51c31bc1 · 7c4f76e3 · 7c4f76e3
Commit 7c4f76e3 authored Apr 15, 2024 by zhuwenwen
20 changed files
--- a/tests/conftest.py
+++ b/tests/conftest.py
+import contextlib
+import gc
 import os
 from typing import List, Optional, Tuple
 import pytest
 import torch
-from transformers import AutoModelForCausalLM
+from PIL import Image
+from transformers import (AutoModelForCausalLM, AutoProcessor,
+                          LlavaForConditionalGeneration)
 from vllm import LLM, SamplingParams
+from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
+from vllm.model_executor.parallel_utils.parallel_state import (
+    destroy_model_parallel)
+from vllm.sequence import MultiModalData
 from vllm.transformers_utils.tokenizer import get_tokenizer
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
+# Multi modal related
+_PIXEL_VALUES_FILES = [
+    os.path.join(_TEST_DIR, "images", filename) for filename in
+    ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
+]
+_IMAGE_FEATURES_FILES = [
+    os.path.join(_TEST_DIR, "images", filename) for filename in
+    ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
+]
+_IMAGE_FILES = [
+    os.path.join(_TEST_DIR, "images", filename)
+    for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
+]
+_IMAGE_PROMPTS = [
+    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
+    "<image>\nUSER: What is the season?\nASSISTANT:"
+]
+assert len(_PIXEL_VALUES_FILES) == len(_IMAGE_FEATURES_FILES) == len(
+    _IMAGE_FILES) == len(_IMAGE_PROMPTS)
 def _read_prompts(filename: str) -> List[str]:
    with open(filename, "r") as f:
@@ -19,6 +47,53 @@ def _read_prompts(filename: str) -> List[str]:
        return prompts
+def cleanup():
+    destroy_model_parallel()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    gc.collect()
+    torch.cuda.empty_cache()
+@pytest.fixture(autouse=True)
+def cleanup_fixture():
+    yield
+    cleanup()
+@pytest.fixture(scope="session")
+def hf_image_prompts() -> List[str]:
+    return _IMAGE_PROMPTS
+@pytest.fixture(scope="session")
+def hf_images() -> List[Image.Image]:
+    return [Image.open(filename) for filename in _IMAGE_FILES]
+@pytest.fixture()
+def vllm_images(request) -> "torch.Tensor":
+    vision_language_config = request.getfixturevalue("model_and_config")[1]
+    all_images = []
+    if vision_language_config.image_input_type == (
+            VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
+        filenames = _IMAGE_FEATURES_FILES
+    else:
+        filenames = _PIXEL_VALUES_FILES
+    for filename in filenames:
+        all_images.append(torch.load(filename))
+    return torch.concat(all_images, dim=0)
+@pytest.fixture()
+def vllm_image_prompts(request) -> List[str]:
+    vision_language_config = request.getfixturevalue("model_and_config")[1]
+    return [
+        "<image>" * (vision_language_config.image_feature_size - 1) + p
+        for p in _IMAGE_PROMPTS
+    ]
 @pytest.fixture
 def example_prompts() -> List[str]:
    prompts = []
@@ -41,6 +116,10 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
    "float": torch.float,
 }
+_VISION_LANGUAGE_MODELS = {
+    "llava-hf/llava-1.5-7b-hf": LlavaForConditionalGeneration,
+}
 class HfRunner:
@@ -52,11 +131,24 @@ class HfRunner:
    ) -> None:
        assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
-        self.model = AutoModelForCausalLM.from_pretrained(
+        self.model_name = model_name
-            model_name,
+        if model_name not in _VISION_LANGUAGE_MODELS:
-            torch_dtype=torch_dtype,
+            self.model = AutoModelForCausalLM.from_pretrained(
-            trust_remote_code=True,
+                model_name,
-        ).cuda()
+                torch_dtype=torch_dtype,
+                trust_remote_code=True,
+            ).cuda()
+            self.processor = None
+        else:
+            self.model = _VISION_LANGUAGE_MODELS[model_name].from_pretrained(
+                model_name,
+                torch_dtype=torch_dtype,
+                trust_remote_code=True,
+            ).cuda()
+            self.processor = AutoProcessor.from_pretrained(
+                model_name,
+                torch_dtype=torch_dtype,
+            )
        if tokenizer_name is None:
            tokenizer_name = model_name
        self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)
@@ -64,13 +156,28 @@ class HfRunner:
    def generate(
        self,
        prompts: List[str],
+        images: Optional[List[Image.Image]] = None,
        **kwargs,
    ) -> List[Tuple[List[int], str]]:
        outputs: List[Tuple[List[int], str]] = []
-        for prompt in prompts:
+        if images:
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+            assert len(prompts) == len(images)
+        for i, prompt in enumerate(prompts):
+            if self.model_name not in _VISION_LANGUAGE_MODELS:
+                input_ids = self.tokenizer(prompt,
+                                           return_tensors="pt").input_ids
+                inputs = {"input_ids": input_ids.cuda()}
+            else:
+                image = images[i] if images else None
+                inputs = self.processor(text=prompt,
+                                        images=image,
+                                        return_tensors="pt")
+                inputs = {
+                    key: value.cuda() if value is not None else None
+                    for key, value in inputs.items()
+                }
            output_ids = self.model.generate(
-                input_ids.cuda(),
+                **inputs,
                use_cache=True,
                **kwargs,
            )
@@ -87,10 +194,12 @@ class HfRunner:
        self,
        prompts: List[str],
        max_tokens: int,
+        images: Optional["torch.Tensor"] = None,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
-                                max_new_tokens=max_tokens)
+                                max_new_tokens=max_tokens,
+                                images=images)
        for i in range(len(outputs)):
            output_ids, output_str = outputs[i]
            outputs[i] = (output_ids[0], output_str[0])
@@ -150,6 +259,10 @@ class HfRunner:
            all_logprobs.append(seq_logprobs)
        return all_logprobs
+    def __del__(self):
+        del self.model
+        cleanup()
 @pytest.fixture
 def hf_runner():
@@ -162,9 +275,14 @@ class VllmRunner:
        self,
        model_name: str,
        tokenizer_name: Optional[str] = None,
+        # Use smaller max model length, otherwise bigger model cannot run due
+        # to kv cache size limit.
+        max_model_len=1024,
        dtype: str = "half",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
+        block_size: int = 16,
+        enable_chunked_prefill: bool = False,
        **kwargs,
    ) -> None:
        self.model = LLM(
@@ -175,6 +293,9 @@ class VllmRunner:
            swap_space=0,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
+            max_model_len=max_model_len,
+            block_size=block_size,
+            enable_chunked_prefill=enable_chunked_prefill,
            **kwargs,
        )
@@ -182,9 +303,16 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
+        images: Optional["torch.Tensor"] = None,
    ) -> List[Tuple[List[int], str]]:
-        req_outputs = self.model.generate(prompts,
+        if images is not None:
-                                          sampling_params=sampling_params)
+            assert len(prompts) == images.shape[0]
+        req_outputs = self.model.generate(
+            prompts,
+            sampling_params=sampling_params,
+            multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE,
+                                            data=images)
+            if images is not None else None)
        outputs = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
@@ -221,9 +349,10 @@ class VllmRunner:
        self,
        prompts: List[str],
        max_tokens: int,
+        images: Optional[torch.Tensor] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        outputs = self.generate(prompts, greedy_params)
+        outputs = self.generate(prompts, greedy_params, images=images)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]
@@ -254,7 +383,21 @@ class VllmRunner:
        outputs = self.generate(prompts, beam_search_params)
        return outputs
+    def __del__(self):
+        del self.model
+        cleanup()
 @pytest.fixture
 def vllm_runner():
    return VllmRunner
+def get_tokenizer_pool_config(tokenizer_group_type):
+    if tokenizer_group_type is None:
+        return None
+    if tokenizer_group_type == "ray":
+        return TokenizerPoolConfig(pool_size=1,
+                                   pool_type="ray",
+                                   extra_config={})
+    raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
--- a/tests/worker/spec_decode/__init__.py
+++ b/tests/worker/spec_decode/__init__.py
--- a/vllm/model_executor/layers/triton_kernel/__init__.py
+++ b/vllm/model_executor/layers/triton_kernel/__init__.py
--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
+import contextlib
+import gc
+import pytest
+import ray
+import torch
+from vllm import LLM
+from vllm.model_executor.parallel_utils.parallel_state import (
+    destroy_model_parallel)
+from vllm.model_executor.utils import set_random_seed
+def cleanup():
+    destroy_model_parallel()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    gc.collect()
+    torch.cuda.empty_cache()
+    ray.shutdown()
+@pytest.fixture
+def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                           baseline_llm_kwargs, seed):
+    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                                baseline_llm_kwargs, seed)
+@pytest.fixture
+def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                       test_llm_kwargs, seed):
+    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                                test_llm_kwargs, seed)
+def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                         distinct_llm_kwargs, seed):
+    kwargs = {
+        **common_llm_kwargs,
+        **per_test_common_llm_kwargs,
+        **distinct_llm_kwargs,
+    }
+    def generator_inner():
+        llm = LLM(**kwargs)
+        set_random_seed(seed)
+        yield llm
+        del llm
+        cleanup()
+    for llm in generator_inner():
+        yield llm
+        del llm
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
+from itertools import cycle
+import pytest
+from vllm import SamplingParams
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+        # Allow only 5 sequences of ~1024 tokens in worst case.
+        "block_size": 16,
+        "forced_num_gpu_blocks": 5 * (64 + 1),
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "use_v2_block_manager": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("batch_size", [10])
+@pytest.mark.parametrize("seed", [1])
+def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
+                                               test_llm_generator, batch_size):
+    """Verify block manager v2 produces same outputs as block manager v1, even
+    when there is preemption.
+    This constructs two LLM, each with limited number of GPU blocks. The limit
+    is decided such that as the sequences in the batch grow, sequences must be
+    preempted and removed from cache.
+    If the output token ids are equivalent, then we have confidence that the KV
+    cache is not corrupted in the v2 block manager.
+    NOTE: We want a significant number of generated tokens so that any incorrect
+    KV mapping has time to build up error.
+    """
+    output_len = 1024
+    temperature = 0.0
+    # We want to ensure equality even with preemption.
+    # We force the total block size to be 1 + cdiv(output_len, block_size)
+    # so that only one sequence can fit at a time (once the sequences grow).
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+    print('Getting token ids from block manager v1')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+    print('Getting token ids from block manager v2')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+    assert baseline_token_ids == test_token_ids
+def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
+    for llm in llm_generator:
+        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
+        token_ids = [output.outputs[0].token_ids for output in outputs]
+        del llm
+    return token_ids
--- a/tests/core/block/test_block_space_manager.py
+++ b/tests/core/block/test_block_space_manager.py
+import pytest
+from vllm.core.block_manager_v2 import BlockSpaceManagerV2
+from vllm.core.interfaces import AllocStatus
+from ..utils import create_seq_group
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
+@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
+@pytest.mark.parametrize("watermark", [0.0, 0.5])
+def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
+                                num_gpu_blocks: int, watermark: float):
+    block_manager = BlockSpaceManagerV2(
+        block_size=block_size,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        watermark=watermark,
+    )
+    num_watermark_blocks = int(watermark * num_gpu_blocks)
+    num_output_blocks_per_seq = 1
+    # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
+    # the current implementation assumes all seqs are new prompts / don't have
+    # different output lens.
+    num_output_blocks = num_output_blocks_per_seq
+    for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
+        seq_group = create_seq_group(
+            seq_prompt_lens=block_size * num_prompt_blocks,
+            seq_output_lens=[
+                block_size * num_output_blocks_per_seq
+                for _ in range(num_seqs_per_group)
+            ],
+        )
+        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
+        can_allocate_result = block_manager.can_allocate(seq_group)
+        num_required_blocks = num_prompt_blocks + num_output_blocks
+        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
+            assert can_allocate_result == AllocStatus.NEVER
+        elif num_gpu_blocks >= num_required_blocks:
+            assert can_allocate_result == AllocStatus.OK
+        else:
+            assert can_allocate_result == AllocStatus.LATER
--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
+import pytest
+from vllm.core.block.block_table import BlockTable
+from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+from vllm.utils import Device, cdiv, chunk_list
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+def test_allocate_naive(block_size: int, sequence_len: int):
+    """Test the allocation of blocks using the naive allocator.
+    This test creates a CpuGpuBlockAllocator with the specified block size and
+    number of blocks. It then allocates multiple BlockTables with varying
+    sequence lengths and verifies that the number of free blocks decreases as
+    expected after each allocation.
+    """
+    assert block_size > 1
+    num_gpu_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type="naive",
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+    token_ids = list(range(sequence_len))
+    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
+    block_tables = []
+    for i in range(5):
+        assert allocator.get_num_free_blocks(
+            device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
+        block_tables.append(
+            BlockTable(
+                block_size=block_size,
+                block_allocator=allocator,
+            ))
+        block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+def test_allocate_prefix_caching(block_size: int, sequence_len: int):
+    """Test the allocation of blocks using the prefix caching allocator.
+    This test creates a CpuGpuBlockAllocator with the specified block size and
+    number of blocks, using the prefix caching allocator. It then allocates
+    multiple BlockTables with varying sequence lengths and verifies that the
+    number of free blocks decreases as expected after each allocation.
+    The test expects all sequences to share allocations, except for their last
+    block, which may be mutable. It calculates the expected number of immutable
+    and mutable blocks per allocation based on the sequence length and block
+    size.
+    """
+    assert block_size > 1
+    num_gpu_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type="prefix_caching",
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+    token_ids = list(range(sequence_len))
+    chunked_tokens = list(chunk_list(token_ids, block_size))
+    num_mutable_blocks_per_alloc = 0 if len(
+        chunked_tokens[-1]) == block_size else 1
+    num_immutable_blocks_per_alloc = len(
+        chunked_tokens) - num_mutable_blocks_per_alloc
+    block_tables = []
+    for alloc_i in range(1, 6):
+        block_tables.append(
+            BlockTable(
+                block_size=block_size,
+                block_allocator=allocator,
+            ))
+        block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
+        # Expect all sequences to share allocations, except for their last block
+        # (which may be mutable).
+        assert allocator.get_num_free_blocks(
+            device=Device.GPU) == num_gpu_blocks - (
+                num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc *
+                (alloc_i))
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+@pytest.mark.parametrize("device", ["cpu", "gpu"])
+def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
+                       device: str):
+    """Test the allocation and freeing of blocks using different allocators and
+    devices.
+    This test creates a CpuGpuBlockAllocator with the specified block size,
+    number of blocks, allocator type, and device. It then allocates a BlockTable
+    multiple times with the same sequence and verifies that the number of free
+    blocks remains consistent after each allocation and freeing.
+    """
+    device = Device[device.upper()]
+    num_device_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_device_blocks,
+        num_cpu_blocks=num_device_blocks,
+        block_size=block_size,
+    )
+    token_ids = list(range(sequence_len))
+    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+    for i in range(5):
+        block_table.allocate(token_ids=token_ids, device=device)
+        assert allocator.get_num_free_blocks(
+            device) == num_device_blocks - num_blocks_per_alloc
+        assert all(block_id is not None
+                   for block_id in block_table.physical_block_ids)
+        block_table.free()
+        assert allocator.get_num_free_blocks(device) == num_device_blocks
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("append_len", [1, 16, 129])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_append_token_ids_allocation(block_size: int, sequence_len: int,
+                                     append_len: int, allocator_type: str):
+    """Test the allocation behavior when appending token IDs to a BlockTable.
+    This test creates a CpuGpuBlockAllocator with the specified block size,
+    number of blocks, and allocator type. It then allocates a BlockTable with an
+    initial sequence and appends additional token IDs to it. The test verifies
+    that the number of allocated blocks before and after appending matches the
+    expected values.
+    """
+    num_gpu_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+    token_ids = list(range(sequence_len))
+    token_ids_to_append = list(range(append_len))
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+    num_expected_blocks_before_append = len(
+        list(chunk_list(token_ids, block_size)))
+    num_expected_appended_blocks = len(
+        list(chunk_list(token_ids + token_ids_to_append,
+                        block_size))) - num_expected_blocks_before_append
+    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    assert len(
+        block_table.physical_block_ids) == num_expected_blocks_before_append
+    block_table.append_token_ids(token_ids_to_append)
+    assert len(
+        block_table.physical_block_ids
+    ) == num_expected_blocks_before_append + num_expected_appended_blocks
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
+                                           num_empty_slots: int,
+                                           allocator_type: str):
+    """Test the allocation behavior when ensuring a certain number of empty
+    slots in a BlockTable.
+    This test creates a CpuGpuBlockAllocator with the specified block size,
+    number of blocks, and allocator type. It then allocates a BlockTable with an
+    initial sequence and ensures a certain number of empty slots. The test
+    verifies that the number of allocated blocks before and after ensuring empty
+    slots matches the expected values. It also checks that filling up the empty
+    slots does not consume additional blocks.
+    """
+    num_gpu_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+    token_ids = list(range(sequence_len))
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+    num_expected_blocks_before_append = len(
+        list(chunk_list(token_ids, block_size)))
+    num_expected_appended_blocks = len(
+        list(chunk_list(token_ids + [-1] * num_empty_slots,
+                        block_size))) - num_expected_blocks_before_append
+    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    # Assert that the empty slots consume the expected number of additional
+    # blocks.
+    assert len(
+        block_table.physical_block_ids) == num_expected_blocks_before_append
+    block_table.ensure_num_empty_slots(num_empty_slots)
+    assert len(
+        block_table.physical_block_ids
+    ) == num_expected_blocks_before_append + num_expected_appended_blocks
+    # Now, ensure no additional blocks consumed as we fill up the empty slots.
+    num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
+    block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
+    assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("sequence_len", [1, 9])
+@pytest.mark.parametrize("append_len", [1, 16, 129])
+@pytest.mark.parametrize("append_size", [1, 4, 129])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
+                                          append_len: int, allocator_type: str,
+                                          append_size: int):
+    """Verify token ids are correctly appended. Appends various amounts of
+    token ids in various append sizes, and verifies the final sequence is
+    correct.
+    """
+    num_gpu_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+    token_ids = list(range(sequence_len))
+    token_ids_to_append = list(range(append_len))
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    appended_so_far = []
+    for append in chunk_list(token_ids_to_append, append_size):
+        block_table.append_token_ids(append)
+        appended_so_far.extend(append)
+        assert block_table._get_all_token_ids() == token_ids + appended_so_far
+    assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
+@pytest.mark.parametrize("seq_len", [1, 9, 129])
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_fork(seq_len: int, block_size: int, allocator_type: str):
+    """Create a sequence using the specified allocator.
+        1. Assert that after forking the sequence, the free block count is the
+            same.
+        2. Assert that the forked sequence has the same physical mappings.
+        3. Then free the original sequence; verify that the free block count is
+            the same.
+        4. Finally, free the forked sequence and verify that the free block
+            count drops to zero.
+    """
+    num_gpu_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=0,
+        block_size=block_size,
+    )
+    token_ids = list(range(seq_len))
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+    block_table.allocate(token_ids)
+    num_free_blocks_before_fork = allocator.get_num_free_blocks(
+        device=Device.GPU)
+    forked_block_table = block_table.fork()
+    # Expect physical_block_ids and token_ids to match.
+    assert (block_table.physical_block_ids ==
+            forked_block_table.physical_block_ids)
+    assert block_table._get_all_token_ids(
+    ) == forked_block_table._get_all_token_ids()
+    # Do not expect any additional allocations.
+    assert allocator.get_num_free_blocks(
+        device=Device.GPU) == num_free_blocks_before_fork
+    # Free the original blocks. Assert num free blocks does not change, since
+    # refcount is nonzero.
+    block_table.free()
+    assert allocator.get_num_free_blocks(
+        device=Device.GPU) == num_free_blocks_before_fork
+    # Expect the forked block table to be unaffected by the free.
+    assert all(block_id is not None
+               for block_id in forked_block_table.physical_block_ids)
+    # Free the forked blocks. Assert num free blocks does change, since
+    # refcount is now zero.
+    forked_block_table.free()
+    assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
+@pytest.mark.parametrize("block_size", [8])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("append_len", [1, 16, 129])
+@pytest.mark.parametrize("appender", ["forked", "original"])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_cow(block_size: int, sequence_len: int, append_len: int,
+             allocator_type: str, appender: str):
+    """Fork a sequence; append to the forked sequence; verify there's a CoW.
+    """
+    num_gpu_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=0,
+        block_size=block_size,
+    )
+    token_ids = list(range(sequence_len))
+    token_ids_to_append = list(range(append_len))
+    original_block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+    num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
+    num_expected_cow_blocks = cdiv(sequence_len + append_len,
+                                   block_size) - (sequence_len // block_size)
+    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    original_block_ids = original_block_table.physical_block_ids
+    forked_block_table = original_block_table.fork()
+    # Expect no additional allocation (copy on _write_).
+    assert allocator.get_num_free_blocks(
+        Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)
+    if appender == "forked":
+        appender_block_table = forked_block_table
+        static_block_table = original_block_table
+    elif appender == "original":
+        appender_block_table = original_block_table
+        static_block_table = forked_block_table
+    else:
+        raise ValueError(f"unknown test config {appender=}")
+    # Write tokens.
+    appender_block_table.append_token_ids(token_ids_to_append)
+    # Expect the non-appending block table to have no change.
+    assert static_block_table.physical_block_ids == original_block_ids
+    assert appender_block_table.physical_block_ids != original_block_ids
+    # Expect the blocks changed during append to have a CoW.
+    assert allocator.get_num_free_blocks(
+        Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
+                                         num_expected_cow_blocks)
+    cows = allocator.clear_copy_on_writes()
+    if sequence_len % block_size > 0:
+        # If the last block in the sequence is not full, then when appending we
+        # expect a CoW.
+        assert cows
+        cow_block_id = sequence_len // block_size
+        expected_src = static_block_table.physical_block_ids[cow_block_id]
+        expected_dst = appender_block_table.physical_block_ids[cow_block_id]
+        assert expected_src in cows
+        assert expected_dst in cows[expected_src]
+    else:
+        # Otherwise, there should be no copy-on-write.
+        assert not cows
+    static_block_table.free()
+    appender_block_table.free()
+    # After free, expect all blocks to be freed.
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+@pytest.mark.parametrize("block_size", [8])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("append_len", [1, 16, 129])
+@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
+@pytest.mark.parametrize("appender", ["forked", "original"])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_cow_lookahead_simple(block_size: int, sequence_len: int,
+                              append_len: int, lookahead_slots: int,
+                              allocator_type: str, appender: str):
+    """Similar to test_cow, except with lookahead allocation. The assertions are
+    less rigorous due to the complexity of the property under test.
+    """
+    num_gpu_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=0,
+        block_size=block_size,
+    )
+    token_ids = list(range(sequence_len))
+    token_ids_to_append = list(range(append_len))
+    original_block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    # Allocate lookahead slots.
+    original_block_table.ensure_num_empty_slots(lookahead_slots)
+    original_block_ids = original_block_table.physical_block_ids
+    forked_block_table = original_block_table.fork()
+    if appender == "forked":
+        appender_block_table = forked_block_table
+        static_block_table = original_block_table
+    elif appender == "original":
+        appender_block_table = original_block_table
+        static_block_table = forked_block_table
+    else:
+        raise ValueError(f"unknown test config {appender=}")
+    # Write tokens.
+    appender_block_table.append_token_ids(token_ids_to_append)
+    # Expect the non-appending block table to have no change.
+    assert static_block_table.physical_block_ids == original_block_ids
+    assert appender_block_table.physical_block_ids != original_block_ids
+    cows = allocator.clear_copy_on_writes()
+    # Always expect copy-on-write
+    assert cows
+    if sequence_len % block_size > 0:
+        # If the last block in the sequence is not full, then when appending we
+        # expect a CoW.
+        assert cows
+        cow_block_id = sequence_len // block_size
+        expected_src = static_block_table.physical_block_ids[cow_block_id]
+        expected_dst = appender_block_table.physical_block_ids[cow_block_id]
+        assert expected_src in cows
+        assert expected_dst in cows[expected_src]
+    static_block_table.free()
+    appender_block_table.free()
+    # After free, expect all blocks to be freed.
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
--- a/tests/core/block/test_common.py
+++ b/tests/core/block/test_common.py
+import random
+import pytest
+from vllm.core.block.common import RefCounter
+@pytest.mark.parametrize("seed", list(range(20)))
+@pytest.mark.parametrize("num_incrs", [1, 100])
+@pytest.mark.parametrize("num_blocks", [1024])
+def test_incr(seed: int, num_incrs: int, num_blocks: int):
+    random.seed(seed)
+    all_block_indices = list(range(num_blocks))
+    counter = RefCounter(all_block_indices=all_block_indices)
+    block_id = random.randint(0, num_blocks - 1)
+    for i in range(num_incrs):
+        value = counter.incr(block_id)
+        assert value == i + 1
+@pytest.mark.parametrize("seed", list(range(20)))
+@pytest.mark.parametrize("num_incrs", [1, 100])
+@pytest.mark.parametrize("num_blocks", [1024])
+def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
+    random.seed(seed)
+    all_block_indices = list(range(num_blocks))
+    counter = RefCounter(all_block_indices=all_block_indices)
+    block_id = random.randint(0, num_blocks - 1)
+    for i in range(num_incrs):
+        value = counter.incr(block_id)
+        assert value == i + 1
+    for i in range(num_incrs):
+        value = counter.decr(block_id)
+        assert value == num_incrs - (i + 1)
+    with pytest.raises(AssertionError):
+        counter.decr(block_id)
--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
+import pytest
+from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+from vllm.utils import Device, chunk_list
+@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
+@pytest.mark.parametrize("num_gpu_blocks", [1024])
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
+                          block_size: int, allocator_type: str):
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=num_cpu_blocks,
+        block_size=block_size,
+    )
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+    cpu_blocks = [
+        allocator.allocate_mutable(prev_block=None, device=Device.CPU)
+        for _ in range(num_cpu_blocks)
+    ]
+    assert allocator.get_num_free_blocks(Device.CPU) == 0
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+    gpu_blocks = [
+        allocator.allocate_mutable(prev_block=None, device=Device.GPU)
+        for _ in range(num_gpu_blocks)
+    ]
+    assert allocator.get_num_free_blocks(Device.CPU) == 0
+    assert allocator.get_num_free_blocks(Device.GPU) == 0
+    _ = [allocator.free(block) for block in cpu_blocks]
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == 0
+    _ = [allocator.free(block) for block in gpu_blocks]
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
+@pytest.mark.parametrize("num_gpu_blocks", [1024])
+@pytest.mark.parametrize("block_size", [2])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
+                            block_size: int, allocator_type: str):
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=num_cpu_blocks,
+        block_size=block_size,
+    )
+    unique_token_ids = list(
+        range((num_cpu_blocks + num_gpu_blocks) * block_size))
+    gpu_token_ids = chunk_list(unique_token_ids[:num_gpu_blocks * block_size],
+                               block_size)
+    cpu_token_ids = chunk_list(unique_token_ids[num_gpu_blocks * block_size:],
+                               block_size)
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+    cpu_blocks = [
+        allocator.allocate_immutable(prev_block=None,
+                                     token_ids=token_ids,
+                                     device=Device.CPU)
+        for token_ids in cpu_token_ids
+    ]
+    assert allocator.get_num_free_blocks(Device.CPU) == 0
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+    gpu_blocks = [
+        allocator.allocate_immutable(prev_block=None,
+                                     token_ids=token_ids,
+                                     device=Device.GPU)
+        for token_ids in gpu_token_ids
+    ]
+    assert allocator.get_num_free_blocks(Device.CPU) == 0
+    assert allocator.get_num_free_blocks(Device.GPU) == 0
+    _ = [allocator.free(block) for block in cpu_blocks]
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == 0
+    _ = [allocator.free(block) for block in gpu_blocks]
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
+from typing import List, Optional
+import pytest
+from vllm.core.block.interfaces import Block, BlockAllocator
+from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
+class TestNaiveBlockAllocator:
+    @staticmethod
+    def create_allocate_lambda(allocate_type: str,
+                               allocator: NaiveBlockAllocator,
+                               prev_block: Optional[Block],
+                               token_ids: List[int]):
+        if allocate_type == "immutable":
+            allocate_block = lambda: allocator.allocate_immutable(
+                prev_block=prev_block, token_ids=token_ids)
+        elif allocate_type == "mutable":
+            allocate_block = lambda: allocator.allocate_mutable(prev_block=
+                                                                prev_block)
+        else:
+            raise ValueError()
+        return allocate_block
+    @staticmethod
+    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_allocate_ooms(allocate_type: str, num_blocks: int,
+                           block_size: int):
+        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
+                                        num_blocks=num_blocks,
+                                        block_size=block_size)
+        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
+            allocate_type,
+            allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)))
+        [allocate_block() for _ in range(num_blocks)]
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocate_block()
+    @staticmethod
+    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_free_prevents_oom(allocate_type: str, num_blocks: int,
+                               block_size: int):
+        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
+                                        num_blocks=num_blocks,
+                                        block_size=block_size)
+        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
+            allocate_type,
+            allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)))
+        blocks = [allocate_block() for _ in range(num_blocks)]
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocate_block()
+        block_to_free = blocks.pop()
+        for _ in range(100):
+            block_id = block_to_free.block_id
+            allocator.free(block_to_free)
+            assert block_to_free.block_id is None
+            new_block = allocate_block()
+            assert new_block.block_id == block_id
+            with pytest.raises(BlockAllocator.NoFreeBlocksError):
+                allocate_block()
+            block_to_free = new_block
+    @staticmethod
+    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
+                                 block_size: int):
+        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
+                                        num_blocks=num_blocks,
+                                        block_size=block_size)
+        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
+            allocate_type,
+            allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)))
+        assert allocator.get_num_free_blocks() == num_blocks
+        blocks = [allocate_block() for _ in range(num_blocks)]
+        for i, block in enumerate(blocks):
+            assert allocator.get_num_free_blocks() == i
+            allocator.free(block)
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
+import math
+import random
+from typing import List, Optional
+from unittest.mock import MagicMock
+import pytest
+from vllm.core.block.interfaces import Block, BlockAllocator
+from vllm.core.block.prefix_caching_block import (PrefixCachingBlock,
+                                                  PrefixCachingBlockAllocator)
+class TestPrefixCachingBlock:
+    @staticmethod
+    @pytest.mark.parametrize("seed", list(range(10)))
+    @pytest.mark.parametrize("block_size", [1, 16])
+    @pytest.mark.parametrize("is_curr_block_full", [True, False])
+    def test_first_block_has_correct_content_hash(seed: int, block_size: int,
+                                                  is_curr_block_full: bool):
+        """Verify a block which is first in the sequence has the correct hash.
+        """
+        random.seed(seed)
+        num_to_fill = block_size if is_curr_block_full else random.randint(
+            0, block_size - 1)
+        token_ids = list(range(num_to_fill))
+        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
+        block_with_prev = PrefixCachingBlock(
+            prev_block=None,
+            token_ids=token_ids,
+            block_size=block_size,
+            prefix_caching_allocator=mock_allocator)
+        if is_curr_block_full:
+            # Expect hash since block is full.
+            assert block_with_prev.content_hash == (
+                PrefixCachingBlock.hash_block_tokens(
+                    is_first_block=True,
+                    prev_block_hash=None,
+                    cur_block_token_ids=token_ids))
+        else:
+            # Do not expect hash since block is not full.
+            assert block_with_prev.content_hash is None
+    @staticmethod
+    @pytest.mark.parametrize("seed", list(range(10)))
+    @pytest.mark.parametrize("block_size", [1, 16])
+    @pytest.mark.parametrize("is_curr_block_full", [True, False])
+    @pytest.mark.parametrize("prev_block_has_hash", [True, False])
+    def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
+                                                is_curr_block_full: bool,
+                                                prev_block_has_hash: bool):
+        """Verify a block which is not first in the sequence has the correct
+        hash.
+        """
+        random.seed(seed)
+        previous_block = MagicMock(spec=PrefixCachingBlock)
+        prev_block_hash = random.randint(0, 1000)
+        previous_block.content_hash = (prev_block_hash
+                                       if prev_block_has_hash else None)
+        num_to_fill = block_size if is_curr_block_full else random.randint(
+            0, block_size - 1)
+        token_ids = list(range(num_to_fill))
+        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
+        block_with_prev = PrefixCachingBlock(
+            prev_block=previous_block,
+            token_ids=token_ids,
+            block_size=block_size,
+            prefix_caching_allocator=mock_allocator,
+        )
+        if is_curr_block_full and prev_block_has_hash:
+            # Expect hash since block is full and previous block has hash.
+            assert (block_with_prev.content_hash ==
+                    PrefixCachingBlock.hash_block_tokens(
+                        is_first_block=False,
+                        prev_block_hash=prev_block_hash,
+                        cur_block_token_ids=token_ids))
+        else:
+            # Do not expect hash since block is not full or the previous block
+            # does not have a hash.
+            assert block_with_prev.content_hash is None
+    @staticmethod
+    @pytest.mark.parametrize("block_size", [1, 2, 16])
+    @pytest.mark.parametrize("num_tokens", list(range(3)))
+    @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
+    def test_blocks_have_correct_hash_in_chain(block_size: int,
+                                               num_tokens: int,
+                                               num_empty_trailing_blocks: int):
+        """Create two chains of logical blocks with the same contents.
+        Assert the hashes are equal.
+        """
+        random.seed(0)
+        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
+        first_chain, second_chain = [
+            TestPrefixCachingBlock.create_chain(
+                block_size=block_size,
+                token_ids=token_ids,
+                num_empty_trailing_blocks=num_empty_trailing_blocks)
+            for _ in range(2)
+        ]
+        for first_chain_block, second_chain_block in zip(
+                first_chain, second_chain):
+            assert (first_chain_block.content_hash ==
+                    second_chain_block.content_hash)
+        if not first_chain or not second_chain:
+            assert first_chain == second_chain
+            assert num_tokens == 0
+    @staticmethod
+    def create_chain(block_size: int,
+                     token_ids: List[int],
+                     num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
+        """Helper method which creates a chain of blocks.
+        """
+        blocks = []
+        num_blocks = math.ceil(
+            len(token_ids) / block_size) + num_empty_trailing_blocks
+        if num_blocks == 0:
+            return []
+        allocator = MagicMock(spec=PrefixCachingBlockAllocator)
+        prev_block = None
+        for block_number in range(0, num_blocks):
+            prev_block = PrefixCachingBlock(
+                prev_block=prev_block,
+                token_ids=[],
+                block_size=block_size,
+                prefix_caching_allocator=allocator,
+            )
+            tokens_to_append = token_ids[block_number *
+                                         block_size:(block_number + 1) *
+                                         block_size]
+            if tokens_to_append:
+                prev_block.append_token_ids(tokens_to_append)
+            blocks.append(prev_block)
+        return blocks
+class TestPrefixCachingBlockAllocator:
+    @staticmethod
+    def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
+                               prev_block: Optional[Block],
+                               token_ids: List[int]):
+        if allocate_type == "immutable":
+            allocate_block = lambda: allocator.allocate_immutable(
+                prev_block=prev_block, token_ids=token_ids)
+        elif allocate_type == "mutable":
+            allocate_block = lambda: allocator.allocate_mutable(prev_block=
+                                                                prev_block)
+        else:
+            raise ValueError()
+        return allocate_block
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
+            allocate_type="mutable",
+            allocator=allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)),
+        )
+        [allocate_block() for _ in range(num_blocks)]
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocate_block()
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_allocate_immutable_does_not_oom_single_hash(
+            num_blocks: int, block_size: int):
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
+            allocate_type="immutable",
+            allocator=allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)),
+        )
+        blocks = [allocate_block() for _ in range(num_blocks)]
+        # Expect no OOM. If these were mutable blocks, this would OOM.
+        non_oom_block = allocate_block()
+        # Expect all blocks to have same physical block index.
+        for block in blocks:
+            assert (block.block_id == non_oom_block.block_id)
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_allocate_immutable_ooms_many_hash(num_blocks: int,
+                                               block_size: int):
+        """Consume all blocks using many different hashes/block content.
+        Do this by creating a sequence that is very long.
+        Expect next block to OOM.
+        """
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        # Create token ids that will exhaust all blocks.
+        token_ids = list(range(num_blocks * block_size))
+        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+        # Expect allocation with unseen hash to fail.
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocator.allocate_immutable(prev_block=chain[-1],
+                                         token_ids=list(range(block_size)))
+        # Expect mutable allocation to fail.
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocator.allocate_mutable(prev_block=chain[-1])
+        # Expect allocation of exact same chain to pass.
+        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+        # Expect physical block indices to be the same in both chains.
+        assert chain and second_chain
+        for first_chain_block, second_chain_block in zip(chain, second_chain):
+            assert (first_chain_block.block_id == second_chain_block.block_id)
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_free_prevents_oom(num_blocks: int, block_size: int):
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        # Create token ids that will exhaust all blocks.
+        token_ids = list(range(num_blocks * block_size))
+        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+        # Expect mutable allocation to fail.
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocator.allocate_mutable(prev_block=None)
+        block_to_free = chain[-1]
+        # Expect free/allocate loop to succeed many times.
+        for i in range(100):
+            block_id = block_to_free.block_id
+            allocator.free(block_to_free)
+            assert block_to_free.block_id is None, i
+            new_block = allocator.allocate_mutable(prev_block=None)
+            assert new_block.block_id == block_id, i
+            with pytest.raises(BlockAllocator.NoFreeBlocksError):
+                allocator.allocate_mutable(prev_block=None)
+            block_to_free = new_block
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(20)))
+    def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
+        random.seed(seed)
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        num_blocks_to_consume = random.randint(1, num_blocks - 1)
+        # Create token ids that will exhaust all blocks.
+        token_ids = list(range(num_blocks_to_consume * block_size))
+        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+        # Free each block in chain, assert num free blocks includes new free
+        # block.
+        for i, block in enumerate(chain):
+            assert allocator.get_num_free_blocks() == (num_blocks -
+                                                       num_blocks_to_consume +
+                                                       i)
+            allocator.free(block)
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(20)))
+    def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
+                                        seed: int):
+        """Verify sharing occurs by allocating two sequences that share prefixes
+        and incrementally freeing blocks.
+        """
+        random.seed(seed)
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        num_blocks_to_consume = random.randint(1, num_blocks - 1)
+        # Create token ids that will exhaust all blocks.
+        token_ids = list(range(num_blocks_to_consume * block_size))
+        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+        # Free each block in the first chain. Since all blocks are shared, the
+        # free count should stay constant.
+        for i, block in enumerate(first_chain):
+            assert allocator.get_num_free_blocks() == (num_blocks -
+                                                       num_blocks_to_consume)
+            allocator.free(block)
+        # Free each block in the second chain. Since the refcount is now zero,
+        # the free count should increment with each free.
+        for i, block in enumerate(second_chain):
+            assert allocator.get_num_free_blocks() == (num_blocks -
+                                                       num_blocks_to_consume +
+                                                       i)
+            allocator.free(block)
+    @staticmethod
+    def create_immutable_chain(
+        block_size: int,
+        token_ids: List[int],
+        allocator: PrefixCachingBlockAllocator,
+    ) -> List[PrefixCachingBlock]:
+        """Helper method which creates a chain of blocks.
+        """
+        blocks = []
+        num_blocks = math.ceil(len(token_ids) / block_size)
+        if num_blocks == 0:
+            return []
+        prev_block = None
+        for block_number in range(0, num_blocks):
+            block_token_ids = token_ids[block_number *
+                                        block_size:(block_number + 1) *
+                                        block_size]
+            prev_block = allocator.allocate_immutable(
+                prev_block=prev_block, token_ids=block_token_ids)
+            blocks.append(prev_block)
+        return blocks
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
+import time
+from typing import List
+import pytest
+from vllm import SamplingParams
+from vllm.block import PhysicalTokenBlock
+from vllm.core.block_manager_v1 import (BlockSpaceManagerV1,
+                                        UncachedBlockAllocator)
+from vllm.core.interfaces import AllocStatus
+from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
+from vllm.utils import Device
+from .utils import create_dummy_prompt
+def test_block_allocator_allocate():
+    block_size = 4
+    num_cpu_blocks = 4
+    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
+                                           num_cpu_blocks)
+    # Allocate all available cpu blocks.
+    num_free = num_cpu_blocks
+    assert cpu_allocator.get_num_free_blocks() == num_free
+    for _ in range(num_cpu_blocks):
+        block = cpu_allocator.allocate()
+        num_free -= 1
+        assert block not in cpu_allocator.free_blocks
+        assert cpu_allocator.get_num_free_blocks() == num_free
+    with pytest.raises(ValueError):
+        cpu_allocator.allocate()
+def test_block_allocator_free():
+    block_size = 4
+    num_cpu_blocks = 4
+    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
+                                           num_cpu_blocks)
+    # Allocate all available cpu blocks.
+    blocks: List[PhysicalTokenBlock] = []
+    for _ in range(num_cpu_blocks):
+        block = cpu_allocator.allocate()
+        blocks.append(block)
+        assert block not in cpu_allocator.free_blocks
+    # Free all allocated cpu blocks.
+    num_free = 0
+    assert cpu_allocator.get_num_free_blocks() == num_free
+    for block in blocks:
+        cpu_allocator.free(block)
+        num_free += 1
+        assert block in cpu_allocator.free_blocks
+        assert cpu_allocator.get_num_free_blocks() == num_free
+        with pytest.raises(ValueError):
+            cpu_allocator.free(block)
+def test_allocate():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+    # Allocate same sequence group to all available gpu blocks.
+    for i in range(num_gpu_blocks):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        assert block_manager.can_allocate(seq_group)
+        block_manager.allocate(seq_group)
+    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
+    # Allocate same sequence group to all available gpu blocks.
+    # Use watermark to reserve one gpu block.
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=1 / num_gpu_blocks)
+    for i in range(num_gpu_blocks - 1):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        assert block_manager.can_allocate(seq_group)
+        block_manager.allocate(seq_group)
+    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
+def test_append_slot_single_seq():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+    # Allocate single seq to gpu block.
+    prompt, seq_group = create_dummy_prompt("1", block_size)
+    block_manager.allocate(seq_group)
+    # Nothing to append. Sequence has no new logical blocks.
+    assert block_manager.can_append_slot(seq_group)
+    before_blocks = block_manager.get_num_free_gpu_blocks()
+    assert not block_manager.append_slot(prompt)
+    after_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_blocks == after_blocks
+    # Add block_size number of new tokens and append slot.
+    for i in range(block_size):
+        token_id = i + 5
+        prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
+    assert block_manager.can_append_slot(seq_group)
+    before_blocks = block_manager.get_num_free_gpu_blocks()
+    assert not block_manager.append_slot(prompt)
+    after_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_blocks - after_blocks == 1
+def test_append_slot_cow():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size=block_size,
+                                        num_cpu_blocks=num_cpu_blocks,
+                                        num_gpu_blocks=num_gpu_blocks,
+                                        watermark=0)
+    # Allocate prompt to gpu block. There is one slot left in the block.
+    prompt = Sequence(seq_id=1,
+                      prompt="one two three",
+                      prompt_token_ids=[1, 2, 3],
+                      block_size=block_size)
+    # Fork the sequence, such that a COW will be required when we append a new
+    # token id.
+    child = prompt.fork(new_seq_id=2)
+    # Allocate space for the sequence group.
+    seq_group = SequenceGroup("1", [prompt, child], SamplingParams(),
+                              time.time(), time.perf_counter)
+    block_manager.allocate(seq_group)
+    # Fork and append a new token id. We expect a COW to be scheduled.
+    token_id = 4
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.fork(prompt, child)
+    assert block_manager.can_append_slot(seq_group)
+    before_blocks = block_manager.get_num_free_gpu_blocks()
+    maybe_src_dst_block = block_manager.append_slot(child)
+    assert maybe_src_dst_block is not None
+    src_block, dst_block = maybe_src_dst_block
+    assert src_block != dst_block
+    after_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_blocks - after_blocks == 1
+def test_fork():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+    prompt, seq_group = create_dummy_prompt("1",
+                                            block_size - 1,
+                                            block_size=block_size)
+    block_manager.allocate(seq_group)
+    # Fork prompt and copy block tables.
+    child = prompt.fork(2)
+    block_manager.fork(prompt, child)
+    assert block_manager.get_block_table(
+        prompt) == block_manager.get_block_table(child)
+    token_id = 4
+    # Append token to child. Block is shared so copy on write occurs.
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slot(child)
+    assert block_manager.get_block_table(
+        prompt) != block_manager.get_block_table(child)
+def test_swap():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
+    prompt.status = SequenceStatus.WAITING
+    block_manager.allocate(seq_group)
+    # Emulate a forward pass by appending a single token.
+    # The block manager then knows how many unprocessed
+    # tokens will be written in the next forward pass.
+    token_id = 0
+    prompt.status = SequenceStatus.RUNNING
+    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
+    # Swap seq group from GPU -> CPU.
+    gpu_blocks = block_manager.get_block_table(prompt)
+    assert block_manager.can_swap_out(seq_group)
+    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    mapping = block_manager.swap_out(seq_group)
+    assert list(mapping.keys()) == gpu_blocks
+    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
+    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
+    prompt.status = SequenceStatus.SWAPPED
+    # Swap seq group from CPU -> GPU.
+    cpu_blocks = block_manager.get_block_table(prompt)
+    assert block_manager.can_swap_in(seq_group)
+    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    mapping = block_manager.swap_in(seq_group)
+    assert list(mapping.keys()) == cpu_blocks
+    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
+    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
+def test_free():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+    prompt, seq_group = create_dummy_prompt("1", block_size)
+    block_manager.allocate(seq_group)
+    # Free allocated seq.
+    prompt_blocks = len(block_manager.get_block_table(prompt))
+    before_blocks = block_manager.get_num_free_gpu_blocks()
+    block_manager.free(prompt)
+    after_blocks = block_manager.get_num_free_gpu_blocks()
+    assert after_blocks == before_blocks + prompt_blocks
+    # Block table for freed seq is deleted.
+    with pytest.raises(KeyError):
+        block_manager.get_block_table(prompt)
+def test_reset():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+    # Allocate same seq group on all available gpu blocks.
+    original_blocks = block_manager.get_num_free_gpu_blocks()
+    for i in range(num_gpu_blocks):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        block_manager.allocate(seq_group)
+    assert block_manager.get_num_free_gpu_blocks() == 0
+    # Resetting block manager frees all allocated blocks.
+    block_manager.reset()
+    assert block_manager.get_num_free_gpu_blocks() == original_blocks
+def test_sliding_window_multi_seq():
+    """
+    Tests that memory allocation and deallocation is handled
+    correctly with multiple sequences that exceed the sliding
+    window's capacity.
+    """
+    block_size = 1
+    num_cpu_blocks = 8
+    num_gpu_blocks = 8
+    sliding_window = 2
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        sliding_window=sliding_window,
+                                        watermark=0)
+    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
+    parent = Sequence(1, "one two three", [0, 1, 2], block_size)
+    seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(),
+                              None)
+    block_manager.allocate(seq_group)
+    # assert the number of blocks allocated is correct
+    # the parent seq has len 3, but since sliding_window is 2,
+    # we will use at most 2 blocks
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+    # Fork prompt and copy block tables.
+    child = parent.fork(2)
+    block_manager.fork(parent, child)
+    # assert the number of blocks allocated is correct
+    # forking does not increase memory consumption
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+    # assert both parent and child share all blocks
+    assert block_manager.get_block_table(
+        parent) == block_manager.get_block_table(child)
+    token_id = 4
+    # Append token to child. Block is shared so copy on write occurs.
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slot(child)
+    # assert the number of blocks allocated is correct
+    # we will use now one block more. Each seq will use 2 blocks,
+    # but only one can be shared
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window - 1
+    token_id = 5
+    parent.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slot(parent)
+    # assert the number of blocks allocated is correct
+    # no change, because both sequences are still just sharing one block
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window - 1
+    block_table_parent = block_manager.get_block_table(parent)
+    block_table_child = block_manager.get_block_table(child)
+    assert block_table_parent != block_table_child
+    # assert both blocks are sharing the second-last block
+    assert block_table_parent[-2] == block_table_child[-2]
+    # now let's clean up...
+    block_manager.free(parent)
+    # assert the number of blocks allocated is correct
+    # We have freed one seq, reducing the ref count of two blocks by one.
+    # One of the two was only used by the parent seq, so this is now free.
+    # The child seq still consumes sliding_window blocks
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+    # free all blocks
+    block_manager.free(child)
+    # assert all blocks are free now
+    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
+import time
+from typing import List
+import pytest  # noqa
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.core.scheduler import Scheduler
+from vllm.sequence import Logprob, SequenceGroup
+from .utils import create_dummy_prompt
+def get_sequence_groups(scheduler_output):
+    return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
+def test_scheduler_add_seq_group():
+    block_size = 4
+    scheduler_config = SchedulerConfig(100, 64, 1)
+    cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
+    cache_config.num_cpu_blocks = 4
+    cache_config.num_gpu_blocks = 4
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    # Add seq group to scheduler.
+    num_seq_group = 4
+    for i in range(num_seq_group):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        scheduler.add_seq_group(seq_group)
+        assert scheduler.get_num_unfinished_seq_groups() == i + 1
+def test_scheduler_abort_seq_group():
+    block_size = 4
+    scheduler_config = SchedulerConfig(100, 64, 1)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 4
+    cache_config.num_gpu_blocks = 4
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    # Add multiple seq groups to scheduler.
+    num_seq_group = 4
+    request_ids = set()
+    for i in range(num_seq_group):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        scheduler.add_seq_group(seq_group)
+        request_ids.add(str(i))
+    # Abort all added seq groups.
+    assert scheduler.get_num_unfinished_seq_groups() == num_seq_group
+    scheduler.abort_seq_group(request_ids)
+    assert scheduler.get_num_unfinished_seq_groups() == 0
+def test_scheduler_schedule_simple():
+    block_size = 4
+    num_seq_group = 4
+    max_model_len = 16
+    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+    # Add seq groups to scheduler.
+    for i in range(num_seq_group):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+    # Schedule seq groups prompts.
+    num_tokens = block_size * num_seq_group
+    seq_group_meta, out = scheduler.schedule()
+    assert set(get_sequence_groups(out)) == set(running)
+    assert out.num_batched_tokens == num_tokens
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == num_seq_group
+    # Schedule seq groups generation.
+    seq_group_meta, out = scheduler.schedule()
+    assert set(get_sequence_groups(out)) == set(running)
+    assert out.num_batched_tokens == num_seq_group
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == num_seq_group
+def test_scheduler_schedule_preempt_abort():
+    block_size = 4
+    max_model_len = 16
+    scheduler_config = SchedulerConfig(64, 2, max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 2
+    cache_config.num_gpu_blocks = 2
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    # Add seq groups to scheduler.
+    seq_a, seq_group_a = create_dummy_prompt("1", block_size)
+    seq_b, seq_group_b = create_dummy_prompt("2", block_size)
+    scheduler.add_seq_group(seq_group_a)
+    scheduler.add_seq_group(seq_group_b)
+    # Schedule seq groups prompts.
+    seq_group_meta, out = scheduler.schedule()
+    assert get_sequence_groups(out) == [seq_group_a, seq_group_b]
+    assert out.num_batched_tokens == block_size * 2  # seq_a and seq_b
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == 2
+    assert scheduler.get_num_unfinished_seq_groups() == 2
+    # Append "generated" tokens, allowing the sequence to mark prompt tokens as
+    # processed.
+    token_id = 0
+    seq_a.append_token_id(token_id, {token_id: Logprob(0.0)})
+    seq_b.append_token_id(token_id, {token_id: Logprob(0.0)})
+    # Schedule seq groups generation and preempt seq group b.
+    seq_group_meta, out = scheduler.schedule()
+    assert get_sequence_groups(out) == [seq_group_a]
+    assert out.num_batched_tokens == 1
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == 1
+    assert scheduler.get_num_unfinished_seq_groups() == 2
+    # Abort seq group a. Re-schedule seq group b prompt with recomputation.
+    scheduler.abort_seq_group("1")
+    seq_group_meta, out = scheduler.schedule()
+    assert get_sequence_groups(out) == [seq_group_b]
+    assert out.num_batched_tokens == 5  # 4 prompt + 1 generation.
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == 1
+    assert scheduler.get_num_unfinished_seq_groups() == 1
+def test_scheduler_max_seqs():
+    block_size = 4
+    num_seq_group = 4
+    max_seq_group = 2
+    max_model_len = 16
+    scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    all_seq_groups: List[SequenceGroup] = []
+    # Add seq groups to scheduler.
+    for i in range(num_seq_group):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
+        all_seq_groups.append(seq_group)
+    # Append 1 seq group
+    scheduler.add_seq_group(all_seq_groups[0])
+    # Schedule seq groups prompts.
+    _, out = scheduler.schedule()
+    assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
+    # Schedule seq groups generation.
+    _, out = scheduler.schedule()
+    assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
+    # Append 2 more seq group
+    scheduler.add_seq_group(all_seq_groups[1])
+    scheduler.add_seq_group(all_seq_groups[2])
+    # Schedule seq groups prompts.
+    # Only 1 seq group should be scheduled since max_seq_group is 2
+    # and one is prompting.
+    _, out = scheduler.schedule()
+    assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
+def test_scheduler_delay_factor():
+    block_size = 4
+    scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    # schedule first prompt
+    _, seq_group = create_dummy_prompt("0", prompt_length=block_size)
+    scheduler.add_seq_group(seq_group)
+    seq_group_meta, out = scheduler.schedule()
+    assert out.prompt_run
+    assert seq_group_meta[0].request_id == '0'
+    # wait for a second before scheduling next prompt
+    time.sleep(1)
+    _, seq_group = create_dummy_prompt("1", prompt_length=block_size)
+    scheduler.add_seq_group(seq_group)
+    # second prompt should *not* be scheduled
+    seq_group_meta, out = scheduler.schedule()
+    assert not out.prompt_run
+    assert seq_group_meta[0].request_id == '0'
+    # wait for more than 0.5 second and try again
+    time.sleep(0.6)
+    seq_group_meta, out = scheduler.schedule()
+    assert out.prompt_run
+    assert seq_group_meta[0].request_id == '1'
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
+import time
+from typing import Tuple
+from vllm import SamplingParams
+from vllm.sequence import Logprob, Sequence, SequenceGroup
+def create_dummy_prompt(
+        request_id: str,
+        prompt_length: int,
+        block_size: int = None) -> Tuple[Sequence, SequenceGroup]:
+    if not block_size:
+        block_size = prompt_length
+    # Create dummy prompt sequence with tokens 0...block_size-1
+    # and prompt "0 ... block_size".
+    prompt_tokens = list(range(prompt_length))
+    prompt_str = " ".join([str(t) for t in prompt_tokens])
+    prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
+    seq_group = SequenceGroup(request_id, [prompt], SamplingParams(),
+                              time.time(), None)
+    return prompt, seq_group
+def create_seq_group(
+    seq_prompt_lens=1024,
+    seq_output_lens=(128, ),
+    request_id='0',
+    seq_id_start=0,
+) -> SequenceGroup:
+    assert len(seq_output_lens) > 0
+    prompt_token_ids = [0] * seq_prompt_lens
+    seqs = []
+    for seq_id_offset, output_len in enumerate(seq_output_lens):
+        seq = Sequence(
+            seq_id=seq_id_start + seq_id_offset,
+            prompt="",
+            prompt_token_ids=prompt_token_ids,
+            block_size=16,
+        )
+        for i in range(output_len):
+            seq.append_token_id(
+                token_id=i,
+                logprobs={i: Logprob(0.0)},
+            )
+        seqs.append(seq)
+    seq_group = SequenceGroup(
+        request_id=request_id,
+        seqs=seqs,
+        sampling_params=SamplingParams(),
+        arrival_time=time.time(),
+    )
+    return seq_group
+def round_up_to_next_block(seq_len: int, block_size: int) -> int:
+    return (seq_len + block_size - 1) // block_size
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
 """Compare the outputs of HF and distributed vLLM when using greedy sampling.
+vLLM will allocate all the available memory, so we need to run the tests one
-Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`.
+by one. The solution is to pass arguments (model name) by environment
+variables.
+Run:
+```sh
+TEST_DIST_MODEL=facebook/opt-125m pytest \
+    test_basic_distributed_correctness.py
+TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
+    test_basic_distributed_correctness.py
+```
 """
+import os
 import pytest
 import torch
 MODELS = [
-    "facebook/opt-125m",
+    os.environ["TEST_DIST_MODEL"],
-    "meta-llama/Llama-2-7b-hf",
 ]

--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
 """Test the communication operators.
-Run `pytest tests/distributed/test_comm_ops.py --forked`.
+Run `pytest tests/distributed/test_comm_ops.py`.
 """
+import os
 import pytest
-import torch
 import ray
+import torch
 from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_reduce,
+    broadcast_tensor_dict, tensor_model_parallel_all_gather,
-    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce)
-    broadcast_tensor_dict,
-)
 from vllm.test_utils import (init_test_distributed_environment,
                             multi_process_tensor_parallel)
@@ -18,6 +18,12 @@ from vllm.test_utils import (init_test_distributed_environment,
 @ray.remote(num_gpus=1, max_calls=1)
 def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
                           distributed_init_port: str):
+    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
+    # so that each worker can see all the GPUs
+    # they will be able to set the device to the correct GPU
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
    init_test_distributed_environment(1, tensor_parallel_size, rank,
                                      distributed_init_port)
    num_elements = 8
@@ -34,6 +40,12 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def all_gather_test_worker(tensor_parallel_size: int, rank: int,
                           distributed_init_port: str):
+    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
+    # so that each worker can see all the GPUs
+    # they will be able to set the device to the correct GPU
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
    init_test_distributed_environment(1, tensor_parallel_size, rank,
                                      distributed_init_port)
    num_dimensions = 3
@@ -56,6 +68,12 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
                                      distributed_init_port: str):
+    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
+    # so that each worker can see all the GPUs
+    # they will be able to set the device to the correct GPU
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
    init_test_distributed_environment(1, tensor_parallel_size, rank,
                                      distributed_init_port)
    test_dict = {

--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
+import os
 import random
-import os
 import pytest
 import ray
 import torch

--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
+import multiprocessing
+import os
+import pytest
+import torch
+from vllm.model_executor.parallel_utils.pynccl import (NCCLCommunicator,
+                                                       ncclGetUniqueId)
+def distributed_run(fn, world_size):
+    number_of_processes = world_size
+    processes = []
+    for i in range(number_of_processes):
+        env = os.environ.copy()
+        env['RANK'] = str(i)
+        env['LOCAL_RANK'] = str(i)
+        env['WORLD_SIZE'] = str(number_of_processes)
+        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
+        env['MASTER_ADDR'] = 'localhost'
+        env['MASTER_PORT'] = '12345'
+        p = multiprocessing.Process(target=fn, args=(env, ))
+        processes.append(p)
+        p.start()
+    for p in processes:
+        p.join()
+def update_env(fn):
+    # `multiprocessing.Process` cannot accept environment variables directly
+    # so we need to pass the environment variables as arguments
+    # and update the environment variables in the function
+    def wrapper(env):
+        import os
+        os.environ.update(env)
+        fn()
+    return wrapper
+@update_env
+def worker_fn():
+    comm = NCCLCommunicator()
+    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
+    comm.all_reduce(tensor)
+    result = tensor.mean().cpu().item()
+    assert result == comm.world_size
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+def test_pynccl():
+    distributed_run(worker_fn, 2)
+@update_env
+def worker_fn_with_cudagraph():
+    with torch.no_grad():
+        graph = torch.cuda.CUDAGraph()
+        comm = NCCLCommunicator()
+        # run something in the default stream to initialize torch engine
+        a = torch.ones((4, 4), device=f'cuda:{comm.rank}')
+        torch.cuda.synchronize()
+        with torch.cuda.graph(graph, stream=comm.stream):
+            # operation during the graph capture is recorded but not executed
+            # see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
+            comm.all_reduce(a)
+        comm.stream.synchronize()
+        assert a.mean().cpu().item() == comm.world_size**0
+        graph.replay()
+        comm.stream.synchronize()
+        assert a.mean().cpu().item() == comm.world_size**1
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+def test_pynccl_with_cudagraph():
+    distributed_run(worker_fn_with_cudagraph, 2)
+def test_ncclGetUniqueId():
+    unique_id = ncclGetUniqueId()
+    # `list(unique_id.internal)` is something like this:
+    # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    # as long as the function doesn't raise an exception, we're good
+    assert unique_id is not None
--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
+import pytest
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.sampling_params import SamplingParams
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("block_size", [16])
+def test_computed_prefix_blocks(model: str, block_size: int):
+    # This test checks if we are able to run the engine to completion
+    # without triggering asserts.
+    # We are in a scenario where all blocks from the second request's prompt
+    # are full and already computed when the second request arrives.
+    prompt = (
+        "You are a helpful assistant. How do I build a car from cardboard and "
+        "paper clips? Is there an easy to follow video tutorial available "
+        "online for free?")
+    prompt2 = (
+        " Please recommend to me some resources where I can learn not only to "
+        "handle technical difficulties of building a car, but also "
+        "decoration.")
+    engine_args = EngineArgs(model=model,
+                             block_size=block_size,
+                             enable_prefix_caching=True)
+    engine = LLMEngine.from_engine_args(engine_args)
+    sampling_params = SamplingParams()
+    engine.add_request("0", prompt + prompt2, sampling_params)
+    engine.step()
+    engine.add_request("1", prompt, sampling_params)
+    engine.step()
--- a/tests/engine/test_detokenize.py
+++ b/tests/engine/test_detokenize.py
-import pytest
-from transformers import AutoTokenizer
-from vllm.transformers_utils.tokenizer import detokenize_incrementally
-TRUTH = [
-    "Hello here, this is a simple test",  # noqa: E501
-    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa: E501
-    "我很感谢你的热情"  # noqa: E501
-]
-TOKENIZERS = [
-    "facebook/opt-125m",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/gpt-j-6b",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",
-    "mosaicml/mpt-7b",
-    "tiiuae/falcon-7b",
-    "meta-llama/Llama-2-7b-hf",
-    "codellama/CodeLlama-7b-hf",
-]
-def _run_incremental_decode(tokenizer, all_input_ids,
-                            skip_special_tokens: bool):
-    decoded_text = ""
-    offset = 0
-    token_offset = 0
-    prev_tokens = None
-    for i in range(len(all_input_ids)):
-        new_tokens, text, offset, token_offset = detokenize_incrementally(
-            tokenizer,
-            all_input_ids[:i + 1],
-            prev_tokens,
-            offset,
-            token_offset,
-            skip_special_tokens=skip_special_tokens)
-        decoded_text += text
-        if prev_tokens is None:
-            prev_tokens = new_tokens
-        else:
-            prev_tokens += new_tokens
-    return decoded_text
-@pytest.mark.parametrize("truth", TRUTH)
-@pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", (True, False))
-def test_decode_streaming(tokenizer_id, truth, skip_special_tokens):
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
-    all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
-    if skip_special_tokens:
-        all_input_ids = ([tokenizer.bos_token_id]
-                         if tokenizer.bos_token_id is not None else
-                         []) + all_input_ids + [tokenizer.eos_token_id]
-    decoded_text = _run_incremental_decode(
-        tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens)
-    assert decoded_text == truth