[V0 Deprecation] Remove V0 Core tests (#25082)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V0 Deprecation] Remove V0 Core tests (#25082)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
4b946d69 · Woosuk Kwon · GitHub · 087c6ffc · 4b946d69 · 087c6ffc
Unverified commit 4b946d69 · authored Sep 17, 2025 by Woosuk Kwon · committed by GitHub on Sep 17, 2025
20 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -91,17 +91,6 @@ steps:
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 22min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: true
-  source_file_dependencies:
-  - vllm/core
-  - vllm/distributed
-  - tests/core
-  commands:
-  - pytest -v -s core
 - label: Entrypoints Unit Tests # 5min
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"

--- a/tests/core/__init__.py
+++ b/tests/core/__init__.py
--- a/tests/core/block/__init__.py
+++ b/tests/core/block/__init__.py
--- a/tests/core/block/conftest.py
+++ b/tests/core/block/conftest.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-@pytest.fixture()
-def should_do_global_cleanup_after_test() -> bool:
-    """Disable the global cleanup fixture for tests in this directory. This
-    provides a ~10x speedup for unit tests that don't load a model to GPU.
-    This requires that tests in this directory clean up after themselves if they
-    use the GPU.
-    """
-    return False
--- a/tests/core/block/e2e/__init__.py
+++ b/tests/core/block/e2e/__init__.py
--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable
-from typing import Callable, Optional
-import pytest
-from vllm import LLM
-from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.model_executor.utils import set_random_seed
-@pytest.fixture
-def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
-                           baseline_llm_kwargs, seed):
-    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
-                                baseline_llm_kwargs, seed)
-@pytest.fixture
-def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
-                       test_llm_kwargs, seed):
-    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
-                                test_llm_kwargs, seed)
-def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
-                         distinct_llm_kwargs, seed):
-    kwargs = {
-        **common_llm_kwargs,
-        **per_test_common_llm_kwargs,
-        **distinct_llm_kwargs,
-    }
-    def generator_inner():
-        llm = LLM(**kwargs)
-        set_random_seed(seed)
-        yield llm
-        del llm
-        cleanup_dist_env_and_memory()
-    for llm in generator_inner():
-        yield llm
-        del llm
-def get_text_from_llm_generator(llm_generator: Iterable[LLM],
-                                prompts,
-                                sampling_params,
-                                llm_cb: Optional[Callable[[LLM],
-                                                          None]] = None):
-    for llm in llm_generator:
-        if llm_cb:
-            llm_cb(llm)
-        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
-        text = [output.outputs[0].text for output in outputs]
-        del llm
-    return text
-def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
-    for llm in llm_generator:
-        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
-        token_ids = [output.outputs[0].token_ids for output in outputs]
-        del llm
-    return token_ids
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from itertools import cycle
-import pytest
-from vllm import SamplingParams
-from .conftest import get_token_ids_from_llm_generator
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "preemption_mode": "swap"
-}, {
-    "preemption_mode": "recompute"
-}])
-@pytest.mark.parametrize("batch_size", [10])
-@pytest.mark.parametrize("seed", [1])
-def test_block_manager_with_preemption(baseline_llm_generator,
-                                       test_llm_generator, batch_size):
-    """Verify block manager produces same outputs even when there is preemption.
-    This constructs two LLM, each with limited number of GPU blocks. The limit
-    is decided such that as the sequences in the batch grow, sequences must be
-    preempted and removed from cache.
-    If the output token ids are equivalent, then we have confidence that the KV
-    cache is not corrupted.
-    NOTE: We want a significant number of generated tokens so that any incorrect
-    KV mapping has time to build up error.
-    NOTE(Kuntai): Though we have removed block manager v1, this test is still
-    useful as it asserts the behavior of block manager v2 (now it is called 
-    SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we  
-    keep this test.
-    """
-    output_len = 1024
-    temperature = 0.0
-    # We want to ensure equality even with preemption.
-    # We force the total block size to be 1 + cdiv(output_len, block_size)
-    # so that only one sequence can fit at a time (once the sequences grow).
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-    baseline_token_ids = get_token_ids_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
-                                                    test_token_ids):
-        assert expected_token_ids == actual_token_ids
-    assert baseline_token_ids == test_token_ids
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        # Our prompts will generate 128 tokens; since the prompts themselves are
-        # small, we don't need much KV space beyond 128.
-        "max_model_len": 160,
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-    }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [
-        {
-            "block_size": 16,
-            # Allow only 2 sequences of ~128 tokens in worst case.
-            # Note 8 = 128/block_size
-            "num_gpu_blocks_override": 2 * (8 + 1),
-        },
-        {
-            "block_size": 8,
-            # Allow only 2 sequences of ~128 tokens in worst case.
-            # Note 16 = 128/block_size
-            "num_gpu_blocks_override": 2 * (16 + 2),
-        }
-    ])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "num_lookahead_slots": 0,
-}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        {
-            # We run one test with block_size < lookahead_slots, one test with
-            # block_size > lookahead_slots
-            "num_lookahead_slots": 10,
-            "preemption_mode": "swap",
-        },
-        {
-            "num_lookahead_slots": 10,
-            "preemption_mode": "recompute",
-        }
-    ])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seed", [1])
-def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
-                                                   test_llm_generator,
-                                                   batch_size):
-    """Verify vLLM produces the same output with greedy sampling, when lookahead
-    scheduling is used vs. not.
-    Lookahead scheduling is not expected to modify the output, as it simply
-    allocates empty slots ahead of the known token ids in a sliding fashion.
-    This test constrains the total number of blocks to force preemption. It also
-    varies the block size so that the lookahead size is less than and greater
-    than the block size.
-    """
-    output_len = 128
-    temperature = 0.0
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-    print('Getting token ids without lookahead scheduling')
-    baseline_token_ids = get_token_ids_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-    print('Getting token ids with lookahead scheduling')
-    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
-                                                    test_token_ids):
-        assert expected_token_ids == actual_token_ids
-    assert baseline_token_ids == test_token_ids
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [
-        {
-            # Use a small model for a fast test.
-            "model": "facebook/opt-125m",
-            # skip cuda graph creation for fast test.
-            "enforce_eager": True,
-            "enable_chunked_prefill": True,
-        },
-    ])
-@pytest.mark.parametrize("per_test_common_llm_kwargs",
-                         [{
-                             "block_size": 16,
-                             "max_num_batched_tokens": 2,
-                             "max_num_seqs": 2,
-                         }, {
-                             "block_size": 16,
-                             "max_num_batched_tokens": 3,
-                             "max_num_seqs": 2,
-                         }, {
-                             "block_size": 16,
-                             "max_num_batched_tokens": 256,
-                             "max_num_seqs": 10,
-                         }])
-@pytest.mark.parametrize("baseline_llm_kwargs", [
-    {},
-])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "num_lookahead_slots": 0,
-    },
-    {
-        "num_lookahead_slots": 5,
-    },
-])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seed", [1])
-def test_chunked_prefill_block_manager(baseline_llm_generator,
-                                       test_llm_generator, batch_size):
-    """Verify that chunked prefill works with SelfAttnBlockSpaceManager, 
-    with and without lookahead scheduling.
-    """
-    output_len = 32
-    temperature = 0.0
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        ("1 + " * 50) + " 1 = ",  # Longer prompt.
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-    print('Getting token ids with BlockManager')
-    baseline_token_ids = get_token_ids_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-    print('Getting token ids with BlockManager, with lookahead slots.')
-    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
-                                                    test_token_ids):
-        assert expected_token_ids == actual_token_ids
-    assert baseline_token_ids == test_token_ids
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-        # Enable prefill cache
-        "enable_prefix_caching": True,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "preemption_mode": "swap"
-}, {
-    "preemption_mode": "recompute"
-}])
-@pytest.mark.parametrize("batch_size", [10])
-@pytest.mark.parametrize("seed", [1])
-def test_block_manager_prefix_caching_enabled_with_preemption(
-        baseline_llm_generator, test_llm_generator, batch_size):
-    """Verify block manager produces same outputs even when there is preemption.
-    This constructs two LLM, each with limited number of GPU blocks. The limit
-    is decided such that as the sequences in the batch grow, sequences must be
-    preempted and removed from cache.
-    If the output token ids are equivalent, then we have confidence that the KV
-    cache is not corrupted.
-    NOTE: We want a significant number of generated tokens so that any incorrect
-    KV mapping has time to build up error.
-    NOTE(Kuntai): Though we have removed block manager v1, this test is still
-    useful as it asserts the behavior of block manager v2 (now it is called 
-    SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we  
-    keep this test.
-    """
-    output_len = 1024
-    temperature = 0.0
-    # We want to ensure equality even with preemption.
-    # We force the total block size to be 1 + cdiv(output_len, block_size)
-    # so that only one sequence can fit at a time (once the sequences grow).
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-    print('Getting token ids from block manager')
-    baseline_token_ids = get_token_ids_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-    print('Getting token ids from block manager, with preemption')
-    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
-                                                    test_token_ids):
-        assert expected_token_ids == actual_token_ids
-    assert baseline_token_ids == test_token_ids
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "enable_prefix_caching": False
-}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "enable_prefix_caching": True,
-    "preemption_mode": "swap"
-}, {
-    "enable_prefix_caching": True,
-    "preemption_mode": "recompute"
-}])
-@pytest.mark.parametrize("batch_size", [10])
-@pytest.mark.parametrize("seed", [1])
-def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
-                                             test_llm_generator, batch_size):
-    """Verify block manager v2 with auto prefix caching enabled produces same
-    outputs as auto prefix caching disabled, even when there is preemption.
-    This constructs two LLM, each with limited number of GPU blocks. The limit
-    is decided such that as the sequences in the batch grow, sequences must be
-    preempted and removed from cache.
-    If the output token ids are equivalent, then we have confidence that auto
-    prefix caching itself at least don't cause result error.
-    """
-    output_len = 1024
-    temperature = 0.0
-    # We want to ensure equality even with preemption.
-    # We force the total block size to be 1 + cdiv(output_len, block_size)
-    # so that only one sequence can fit at a time (once the sequences grow).
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-    print('Getting token ids with APC disabled')
-    baseline_token_ids = get_token_ids_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-    print('Getting token ids with APC enabled')
-    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
-                                                    test_token_ids):
-        assert expected_token_ids == actual_token_ids
-    assert baseline_token_ids == test_token_ids
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        # we keep the blocks small, so that hit eviction quickly
-        "max_model_len": 48,
-        "block_size": 16,
-        "num_gpu_blocks_override": 3,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "enable_prefix_caching": False
-}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "enable_prefix_caching": True,
-}])
-@pytest.mark.parametrize("seed", [1])
-def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
-                                                  test_llm_generator):
-    """Verify block manager v2 with auto prefix caching could work normally
-    even when eviction started.
-    With APC enabled, all blocks are held by native block at the beginning.
-    Then blocks are managed by evictor instead. If cache hit at the evictor's
-    block, then it could be reused, or we need to recompute its kv cache.
-    """
-    output_len = 10
-    temperature = 0.0
-    prompts = [
-        "You are a helpful assistant. Please answer truthfully and write "
-        "out your thinking step by step to be sure you get the right answer. "
-        "If you make a mistake, attempt to correct it. who are you?",
-        "You are a helpful assistant. Please answer truthfully and write out "
-        "your thinking step by step to be sure you get the right answer. You "
-        "are helpful and harmless and you follow ethical guidelines. "
-        "who are you?"
-    ]
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-    print('Getting token ids with APC disabled')
-    baseline_token_ids = get_token_ids_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-    print('Getting token ids with APC enabled')
-    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
-                                                    test_token_ids):
-        assert expected_token_ids == actual_token_ids
-    assert baseline_token_ids == test_token_ids
--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import random
-import pytest
-from tests.kernels.utils import override_backend_env_variable
-from vllm import LLM, SamplingParams
-from vllm.platforms import current_platform
-from .conftest import get_text_from_llm_generator
-# relatively small model with 4k sliding window
-MODEL = "bigcode/starcoder2-3b"
-BLOCK_SIZE = 16
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model": MODEL,
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        "block_size": BLOCK_SIZE,
-        # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("batch_size", [5])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
-def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
-                                  batch_size, seed, backend, monkeypatch):
-    """
-    The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
-    asks for value of one of them (which is outside the sliding window).
-    If we tell it upfront which we are going to be looking for, then
-    it answers correctly (mostly).
-    Additionally, we compare the results of the v1 and v2 managers.
-    """
-    if backend == "XFORMERS" and current_platform.is_rocm():
-        pytest.skip("Xformers does not support ROCm/HIP.")
-    override_backend_env_variable(monkeypatch, backend)
-    sampling_params = SamplingParams(
-        max_tokens=1024,
-        ignore_eos=True,
-        temperature=0.0,
-    )
-    prompts, answer, indices = prep_prompts(batch_size)
-    baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
-                                                 prompts,
-                                                 sampling_params,
-                                                 llm_cb=check_window(prompts))
-    check_answers(indices, answer, baseline_texts)
-    print('Getting token ids from block manager v2')
-    test_texts = get_text_from_llm_generator(test_llm_generator, prompts,
-                                             sampling_params)
-    check_answers(indices, answer, test_texts)
-    cmp = [
-        expected_text == actual_text
-        for expected_text, actual_text in zip(baseline_texts, test_texts)
-    ]
-    print(cmp)
-    # make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768
-    # however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290
-    # states that xformers and flash_attn have different ideas about the window
-    # size anyways
-    assert sum(cmp) > 0.7 * len(cmp)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model": MODEL,
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-        "block_size": BLOCK_SIZE,
-        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
-@pytest.mark.parametrize("batch_size", [5])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
-def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
-                                        backend, monkeypatch):
-    """
-    This is similar to test_sliding_window_retrieval, however, it doesn't
-    compare against the v1 block manager since v1 doesn't support
-    chunked prefill with sliding window.
-    The results with and without chunked prefill are not the same due to
-    numerical instabilities.
-    """
-    if backend == "XFORMERS" and current_platform.is_rocm():
-        pytest.skip("Xformers does not support ROCm/HIP.")
-    override_backend_env_variable(monkeypatch, backend)
-    sampling_params = SamplingParams(
-        max_tokens=10,
-        ignore_eos=True,
-        temperature=0.0,
-    )
-    prompts, answer, indices = prep_prompts(batch_size)
-    # We don't compare with the baseline model here, since the results
-    # slightly different due to different tailing in attention.
-    test_texts = get_text_from_llm_generator(test_llm_generator,
-                                             prompts,
-                                             sampling_params,
-                                             llm_cb=check_window(prompts))
-    check_answers(indices, answer, test_texts)
-def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)):
-    """
-    Generate prompts which a bunch of assignments,
-    then asking for the value of one of them.
-    The prompt is just under 10k tokens; sliding window is 4k
-    so the answer is outside sliding window, but should still be correct.
-    Args:
-        batch_size: number of prompts to generate
-        ln_range: an argument to control the length of the prompt
-    """
-    prompts: list[str] = []
-    answer: list[int] = []
-    indices: list[int] = []
-    random.seed(1)
-    for _ in range(batch_size):
-        idx = random.randint(30, 90)
-        indices.append(idx)
-        prompt = "```python\n# We set a number of variables, " + \
-                 f"x{idx} will be important later\n"
-        ln = random.randint(*ln_range)
-        for k in range(30, ln):
-            v = random.randint(10, 99)
-            if k == idx:
-                answer.append(v)
-            prompt += f"x{k} = {v}\n"
-        prompt += f"# Now, we check the value of x{idx}:\n"
-        prompt += f"assert x{idx} == "
-        prompts.append(prompt)
-    return prompts, answer, indices
-def check_answers(indices: list[int],
-                  answer: list[int],
-                  outputs: list[str],
-                  accept_rate: float = 0.7):
-    answer2 = [int(text[0:2].strip()) for text in outputs]
-    print(list(zip(indices, zip(answer, answer2))))
-    numok = 0
-    for a1, a2 in zip(answer, answer2):
-        if a1 == a2:
-            numok += 1
-    frac_ok = numok / len(answer)
-    print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
-    assert frac_ok >= accept_rate
-def check_window(prompts: list[str]):
-    def inner(llm: LLM):
-        sliding_window = llm.llm_engine.model_config.get_sliding_window()
-        assert sliding_window and sliding_window > 0
-        assert any(
-            len(llm.get_tokenizer().tokenize(prompt)) > sliding_window
-            for prompt in prompts)
-    return inner
--- a/tests/core/block/test_block_manager.py
+++ b/tests/core/block/test_block_manager.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from vllm.core.block_manager import SelfAttnBlockSpaceManager
-from vllm.core.interfaces import AllocStatus
-from vllm.sequence import Logprob, SequenceStatus
-from vllm.utils import chunk_list
-from ..utils import create_dummy_prompt, create_seq_group
-@pytest.mark.parametrize("block_size", [16])
-@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
-@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
-@pytest.mark.parametrize("watermark", [0.0, 0.5])
-def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
-                                num_gpu_blocks: int, watermark: float):
-    block_manager = SelfAttnBlockSpaceManager(
-        block_size=block_size,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=1024,
-        watermark=watermark,
-    )
-    num_watermark_blocks = int(watermark * num_gpu_blocks)
-    num_output_blocks_per_seq = 1
-    # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
-    # the current implementation assumes all seqs are new prompts / don't have
-    # different output lens.
-    num_output_blocks = num_output_blocks_per_seq
-    for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
-        seq_group = create_seq_group(
-            seq_prompt_len=block_size * num_prompt_blocks,
-            seq_output_lens=[
-                block_size * num_output_blocks_per_seq
-                for _ in range(num_seqs_per_group)
-            ],
-        )
-        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
-        can_allocate_result = block_manager.can_allocate(seq_group)
-        num_required_blocks = num_prompt_blocks + num_output_blocks
-        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
-            assert can_allocate_result == AllocStatus.NEVER
-        elif num_gpu_blocks >= num_required_blocks:
-            assert can_allocate_result == AllocStatus.OK
-        else:
-            assert can_allocate_result == AllocStatus.LATER
-@pytest.mark.parametrize("block_size", [1, 8])
-@pytest.mark.parametrize("prompt_len", [1, 7, 8])
-@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
-@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
-def test_append_slots(block_size, prompt_len, num_slots_to_append,
-                      num_lookahead_slots):
-    """Verify append_slots consumes the correct number of blocks from the block
-    table.
-    A prompt of ``prompt_len`` tokens is allocated, ``num_slots_to_append``
-    tokens are appended to the sequence, and ``append_slots`` is called with
-    ``num_lookahead_slots``. The drop in free GPU blocks must equal the number
-    of additional ``block_size``-sized chunks needed for the longer sequence.
-    """
-    num_gpu_blocks = 1024
-    watermark = 0.1
-    block_manager = SelfAttnBlockSpaceManager(
-        block_size=block_size,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=0,
-        watermark=watermark,
-    )
-    seq_group = create_seq_group(
-        seq_prompt_len=prompt_len,
-        seq_output_lens=[0],
-    )
-    # Allocate seq
-    # NOTE(review): other tests in this file compare the can_allocate result
-    # against AllocStatus members; this is only a truthiness check, which is
-    # presumably always true for enum members -- confirm.
-    assert block_manager.can_allocate(seq_group)
-    block_manager.allocate(seq_group)
-    # Set seq to RUNNING
-    seq = seq_group.get_seqs()[0]
-    seq.status = SequenceStatus.RUNNING
-    # Append tokens to the sequence
-    for token_id in range(num_slots_to_append):
-        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
-    # Append slots for new tokens and lookahead slots.
-    free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
-    block_manager.append_slots(seq, num_lookahead_slots)
-    num_consumed_blocks = (free_blocks_before_append -
-                           block_manager.get_num_free_gpu_blocks())
-    # Expect consumed blocks to be new blocks required to support the new slots:
-    # (chunks needed for prompt + appended + lookahead) minus (chunks needed
-    # for the prompt alone).
-    expected_consumed_blocks = len(
-        list(
-            chunk_list(
-                list(
-                    range(prompt_len + num_slots_to_append +
-                          num_lookahead_slots)),
-                block_size))) - len(
-                    list(chunk_list(list(range(prompt_len)), block_size)))
-    assert num_consumed_blocks == expected_consumed_blocks
-@pytest.mark.parametrize("block_size", [8])
-@pytest.mark.parametrize("num_cpu_blocks", [4])
-@pytest.mark.parametrize("num_gpu_blocks", [4])
-@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
-@pytest.mark.parametrize("enable_caching", [False, True])
-def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
-              enable_caching):
-    """Verify the free-block counts on the source and destination devices are
-    correct after swapping a sequence group out and back in (no missing or
-    extra blocks).
-    """
-    # NOTE(review): the block counts are passed positionally as (cpu, gpu),
-    # while the keyword-based constructions in this file list num_gpu_blocks
-    # before num_cpu_blocks -- confirm the parameter order. Both counts are 4
-    # in every parametrization, so a mix-up would go unnoticed here.
-    block_manager = SelfAttnBlockSpaceManager(block_size,
-                                              num_cpu_blocks,
-                                              num_gpu_blocks,
-                                              watermark=0,
-                                              enable_caching=enable_caching)
-    # Despite its name, `prompt` is used as the sequence object below (its
-    # status is set and tokens are appended to it).
-    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
-    prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-    # Emulate a forward pass by appending a single token.
-    # The block manager then knows how many unprocessed
-    # tokens will be written in the next forward pass.
-    token_id = 0
-    prompt.status = SequenceStatus.RUNNING
-    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-    # Swap seq group from GPU -> CPU.
-    gpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_out(seq_group)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_out(seq_group)
-    # The source side of the swap mapping must be exactly the blocks the
-    # sequence held before the swap.
-    mapping_keys = [key for key, _ in mapping]
-    assert mapping_keys == gpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    # Swapping out consumes CPU blocks and releases the same number of GPU
-    # blocks.
-    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
-    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
-    prompt.status = SequenceStatus.SWAPPED
-    # Swap seq group from CPU -> GPU.
-    # NOTE(review): test_can_swap compares can_swap_in against AllocStatus
-    # members; this is only a truthiness check -- confirm it can ever fail.
-    assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_in(seq_group)
-    # NOTE(review): this block table is read after swap_in, so it presumably
-    # holds GPU block ids despite the `cpu_blocks` name -- confirm.
-    cpu_blocks = block_manager.get_block_table(prompt)
-    mapping_keys = [key for key, _ in mapping]
-    assert mapping_keys == [cpu_blocks[0]]
-    # after_cpu_blocks is re-read here but not asserted on afterwards.
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    # Swapping in must consume as many GPU blocks as the block table holds.
-    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
-@pytest.mark.parametrize("block_size", [8])
-@pytest.mark.parametrize("num_gpu_blocks", [4])
-@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
-@pytest.mark.parametrize("enable_caching", [True, False])
-def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
-                  enable_caching):
-    """Verify the block manager can correctly determine if a sequence group
-    can be swapped in/out.
-    The test swaps a sequence group out, checks can_swap_in while the GPU is
-    otherwise empty, then allocates a second prompt on the GPU and checks
-    can_swap_in again.
-    """
-    num_cpu_blocks = num_gpu_blocks
-    # NOTE(review): the block counts are passed positionally as (cpu, gpu),
-    # while the keyword-based constructions in this file list num_gpu_blocks
-    # before num_cpu_blocks -- confirm the parameter order. The two counts are
-    # equal here, so a mix-up would go unnoticed.
-    block_manager = SelfAttnBlockSpaceManager(block_size,
-                                              num_cpu_blocks,
-                                              num_gpu_blocks,
-                                              watermark=0,
-                                              enable_caching=enable_caching)
-    # With the parametrized values the prompt is (4 - 1) * 8 - 1 = 23 tokens.
-    prompt, seq_group = create_dummy_prompt(
-        "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
-    prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-    prompt.status = SequenceStatus.RUNNING
-    # Swap seq group from GPU -> CPU.
-    gpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_out(seq_group)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_out(seq_group)
-    mapping_keys = [key for key, _ in mapping]
-    assert mapping_keys == gpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    # Swapping out consumes CPU blocks and releases the same number of GPU
-    # blocks.
-    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
-    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
-    prompt.status = SequenceStatus.SWAPPED
-    # At this moment, we still have enough free blocks to swap in the seq group.
-    # Only a lookahead larger than one block (10 > 8) is expected to be
-    # rejected outright.
-    if num_lookahead_slots <= block_size:
-        assert block_manager.can_swap_in(seq_group,
-                                         num_lookahead_slots) == AllocStatus.OK
-    else:
-        assert block_manager.can_swap_in(
-            seq_group, num_lookahead_slots) == AllocStatus.NEVER
-    # During Swapped out, 2 cached blocks were evicted from the GPU,
-    # so the prompt1 can't be swapped in
-    # The second prompt uses token ids starting at 10000 (15 tokens).
-    prompt2_len = 2 * block_size - 1
-    prompt2, seq_group2 = create_dummy_prompt(
-        "2",
-        prompt_length=prompt2_len,
-        prompt_tokens=[10000 + i for i in range(prompt2_len)])
-    prompt2.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group2)
-    # Swap seq group from CPU -> GPU.
-    # With the second prompt occupying GPU blocks, a swap-in that was OK before
-    # is now expected to be deferred (LATER) instead.
-    if num_lookahead_slots <= block_size:
-        assert block_manager.can_swap_in(
-            seq_group, num_lookahead_slots) == AllocStatus.LATER
-    else:
-        assert block_manager.can_swap_in(
-            seq_group, num_lookahead_slots) == AllocStatus.NEVER
-@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
-@pytest.mark.parametrize("enable_caching", [False, True])
-def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
-    """Verify that swapping in fails if there are not enough free blocks
-    to account for unseen tokens and lookahead slots.
-    The manager is built with a single CPU block and a single GPU block of
-    eight slots, so the outcome depends only on whether prompt tokens, the one
-    unseen token and the lookahead slots fit into eight slots.
-    """
-    block_size = 8
-    num_cpu_blocks = 1
-    num_gpu_blocks = 1
-    # NOTE(review): the block counts are passed positionally as (cpu, gpu),
-    # while the keyword-based constructions in this file list num_gpu_blocks
-    # before num_cpu_blocks -- confirm the parameter order. Both counts are 1
-    # here, so a mix-up would go unnoticed.
-    block_manager = SelfAttnBlockSpaceManager(block_size,
-                                              num_cpu_blocks,
-                                              num_gpu_blocks,
-                                              watermark=0,
-                                              enable_caching=enable_caching)
-    prompt_length = block_size - 3
-    assert prompt_length > 0
-    prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
-    prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-    # Emulate a forward pass by appending a single token.
-    # The block manager then knows how many unprocessed
-    # tokens will be written in the next forward pass.
-    token_id = 0
-    prompt.status = SequenceStatus.RUNNING
-    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-    # Swap seq group from GPU -> CPU.
-    assert block_manager.can_swap_out(seq_group)
-    block_manager.swap_out(seq_group)
-    prompt.status = SequenceStatus.SWAPPED
-    # Swap seq group from CPU -> GPU.
-    # The number of unseen tokens is 1. If the number of existing
-    # tokens plus the unseen ones and number of lookahead slots exceeds
-    # the total number of available GPU blocks then the swap
-    # should fail.
-    # With prompt_length = 5: lookahead 0 and 2 need 6 and 8 slots (fits in
-    # 8), lookahead 10 needs 16 slots (does not fit).
-    num_unseen_tokens = 1
-    if (num_lookahead_slots + num_unseen_tokens +
-            prompt_length) <= (block_size * num_gpu_blocks):
-        assert block_manager.can_swap_in(seq_group,
-                                         num_lookahead_slots) == AllocStatus.OK
-    else:
-        assert block_manager.can_swap_in(
-            seq_group, num_lookahead_slots) == AllocStatus.NEVER
-# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
-@pytest.mark.parametrize("block_size", [8, 16])
-@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
-@pytest.mark.parametrize("num_slots_to_append", [50])
-@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512])
-def test_sliding_window(block_size, prompt_len, num_slots_to_append,
-                        sliding_window):
-    """Verify the number of used GPU blocks stays within the expected bounds
-    when a sliding window is configured.
-    After allocating the prompt, tokens are appended one at a time and, after
-    every append_slots call, the number of used blocks is checked against
-    bounds derived from ``sliding_window`` and ``block_size``.
-    """
-    num_gpu_blocks = 1024
-    watermark = 0.1
-    block_manager = SelfAttnBlockSpaceManager(
-        block_size=block_size,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=0,
-        watermark=watermark,
-        sliding_window=sliding_window,
-    )
-    # Assert that the number of used GPU blocks lies in [min_n, max_n];
-    # with a single argument the count must equal min_n exactly.
-    def check_used(min_n, max_n=None):
-        if max_n is None:
-            max_n = min_n
-        used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
-        assert min_n <= used
-        assert used <= max_n
-    # Ceiling division: blocks needed to hold num_tokens tokens.
-    def num_blocks(num_tokens):
-        return (num_tokens + block_size - 1) // block_size
-    check_used(0)
-    seq_group = create_seq_group(
-        seq_prompt_len=prompt_len,
-        seq_output_lens=[0],
-    )
-    # Creating the sequence group alone must not consume any blocks.
-    check_used(0)
-    # Allocate seq
-    # NOTE(review): other tests in this file compare the can_allocate result
-    # against AllocStatus members; this is only a truthiness check -- confirm
-    # it can ever fail.
-    assert block_manager.can_allocate(seq_group)
-    block_manager.allocate(seq_group)
-    check_used(num_blocks(prompt_len))
-    # Set seq to RUNNING
-    seq = seq_group.get_seqs()[0]
-    seq.status = SequenceStatus.RUNNING
-    seq.data.update_num_computed_tokens(prompt_len)
-    check_used(num_blocks(prompt_len))
-    # this is how we compute it in SelfAttnBlockSpaceManager.__init__
-    sliding_blocks = (sliding_window // block_size) + 2
-    # plus one block for null block
-    sliding_blocks += 1
-    # Append tokens to the sequence
-    for token_id in range(num_slots_to_append):
-        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
-        seq.data.update_num_computed_tokens(1)
-        block_manager.append_slots(seq, num_lookahead_slots=0)
-        # For prompts shorter than sliding_window + 10 only the upper bound is
-        # enforced; otherwise usage must also reach sliding_blocks.
-        if prompt_len < sliding_window + 10:
-            check_used(0, sliding_blocks + 1)
-        else:
-            check_used(sliding_blocks, sliding_blocks + 1)
--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from vllm.core.block.block_table import BlockTable
-from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
-from vllm.utils import Device, cdiv, chunk_list
-@pytest.mark.parametrize("block_size", [16])
-@pytest.mark.parametrize("sequence_len", [1, 16, 129])
-def test_allocate_naive(block_size: int, sequence_len: int):
-    """Test the allocation of blocks using the naive allocator.
-    This test creates a CpuGpuBlockAllocator with the specified block size and
-    number of blocks. It then allocates five BlockTables, each with the same
-    ``sequence_len`` tokens, and verifies before each allocation that the
-    number of free GPU blocks has decreased by one allocation's worth of blocks
-    per table already allocated.
-    """
-    assert block_size > 1
-    num_gpu_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type="naive",
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=1024,
-        block_size=block_size,
-    )
-    token_ids = list(range(sequence_len))
-    # Number of block_size-sized chunks the token list splits into.
-    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
-    block_tables: list[BlockTable] = []
-    for i in range(5):
-        # Checked before the i-th allocation, so i tables are live here; the
-        # free count after the final allocation is not checked.
-        assert allocator.get_num_free_blocks(
-            device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
-        block_tables.append(
-            BlockTable(
-                block_size=block_size,
-                block_allocator=allocator,
-            ))
-        block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
-@pytest.mark.parametrize("block_size", [16])
-@pytest.mark.parametrize("sequence_len", [1, 16, 129])
-def test_allocate_prefix_caching(block_size: int, sequence_len: int):
-    """Test the allocation of blocks using the prefix caching allocator.
-    This test creates a CpuGpuBlockAllocator with the specified block size and
-    number of blocks, using the prefix caching allocator. It then allocates
-    five BlockTables with the same token ids and verifies that the number of
-    free blocks decreases as expected after each allocation.
-    The test expects all sequences to share allocations, except for their last
-    block, which may be mutable. It calculates the expected number of immutable
-    and mutable blocks per allocation based on the sequence length and block
-    size.
-    """
-    assert block_size > 1
-    num_gpu_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type="prefix_caching",
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=1024,
-        block_size=block_size,
-    )
-    token_ids = list(range(sequence_len))
-    chunked_tokens = list(chunk_list(token_ids, block_size))
-    # A partially filled last chunk counts as one mutable block; every full
-    # chunk counts as immutable.
-    num_mutable_blocks_per_alloc = 0 if len(
-        chunked_tokens[-1]) == block_size else 1
-    num_immutable_blocks_per_alloc = len(
-        chunked_tokens) - num_mutable_blocks_per_alloc
-    block_tables: list[BlockTable] = []
-    # alloc_i is the number of tables allocated so far (1..5).
-    for alloc_i in range(1, 6):
-        block_tables.append(
-            BlockTable(
-                block_size=block_size,
-                block_allocator=allocator,
-            ))
-        block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
-        # Expect all sequences to share allocations, except for their last block
-        # (which may be mutable): immutable blocks are counted once, mutable
-        # blocks once per table.
-        assert allocator.get_num_free_blocks(
-            device=Device.GPU) == num_gpu_blocks - (
-                num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc *
-                (alloc_i))
-@pytest.mark.parametrize("block_size", [16])
-@pytest.mark.parametrize("sequence_len", [1, 16, 129])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-@pytest.mark.parametrize("device", ["cpu", "gpu"])
-def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
-                       device: str):
-    """Test the allocation and freeing of blocks using different allocators and
-    devices.
-    This test creates a CpuGpuBlockAllocator with the specified block size,
-    number of blocks, allocator type, and device. It then allocates and frees
-    the same BlockTable five times with the same sequence and verifies that the
-    free-block count drops by the expected amount after each allocation and
-    returns to the full count after each free.
-    """
-    # Convert the parametrized string ("cpu"/"gpu") into the Device member of
-    # the same upper-cased name; `device` is rebound to that member.
-    device = Device[device.upper()]
-    num_device_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_device_blocks,
-        num_cpu_blocks=num_device_blocks,
-        block_size=block_size,
-    )
-    token_ids = list(range(sequence_len))
-    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
-    block_table = BlockTable(
-        block_size=block_size,
-        block_allocator=allocator,
-    )
-    for i in range(5):
-        block_table.allocate(token_ids=token_ids, device=device)
-        assert allocator.get_num_free_blocks(
-            device) == num_device_blocks - num_blocks_per_alloc
-        # Every allocated block must have a physical id assigned.
-        assert all(block_id is not None
-                   for block_id in block_table.physical_block_ids)
-        block_table.free()
-        assert allocator.get_num_free_blocks(device) == num_device_blocks
-@pytest.mark.parametrize("block_size", [1, 8])
-@pytest.mark.parametrize("sequence_len", [1, 16, 129])
-@pytest.mark.parametrize("append_len", [1, 16, 129])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_append_token_ids_allocation(block_size: int, sequence_len: int,
-                                     append_len: int, allocator_type: str):
-    """Test the allocation behavior when appending token IDs to a BlockTable.
-    This test creates a CpuGpuBlockAllocator with the specified block size,
-    number of blocks, and allocator type. It then allocates a BlockTable with an
-    initial sequence and appends additional token IDs to it. The test verifies
-    that the number of physical block ids before and after appending matches
-    the expected values.
-    """
-    num_gpu_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=1024,
-        block_size=block_size,
-    )
-    token_ids = list(range(sequence_len))
-    token_ids_to_append = list(range(append_len))
-    block_table = BlockTable(
-        block_size=block_size,
-        block_allocator=allocator,
-    )
-    # Expected counts are derived by chunking the token lists into
-    # block_size-sized pieces: first the initial sequence alone, then the
-    # additional chunks needed once the appended tokens are included.
-    num_expected_blocks_before_append = len(
-        list(chunk_list(token_ids, block_size)))
-    num_expected_appended_blocks = len(
-        list(chunk_list(token_ids + token_ids_to_append,
-                        block_size))) - num_expected_blocks_before_append
-    block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    assert len(
-        block_table.physical_block_ids) == num_expected_blocks_before_append
-    block_table.append_token_ids(token_ids_to_append)
-    assert len(
-        block_table.physical_block_ids
-    ) == num_expected_blocks_before_append + num_expected_appended_blocks
-@pytest.mark.parametrize("block_size", [1, 8])
-@pytest.mark.parametrize("sequence_len", [1, 16, 129])
-@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
-                                           num_empty_slots: int,
-                                           allocator_type: str):
-    """Test the allocation behavior when ensuring a certain number of empty
-    slots in a BlockTable.
-    This test creates a CpuGpuBlockAllocator with the specified block size,
-    number of blocks, and allocator type. It then allocates a BlockTable with an
-    initial sequence and ensures a certain number of empty slots. The test
-    verifies that the number of allocated blocks before and after ensuring empty
-    slots matches the expected values. It also checks that filling up the empty
-    slots does not consume additional blocks.
-    """
-    num_gpu_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=1024,
-        block_size=block_size,
-    )
-    token_ids = list(range(sequence_len))
-    block_table = BlockTable(
-        block_size=block_size,
-        block_allocator=allocator,
-    )
-    num_expected_blocks_before_append = len(
-        list(chunk_list(token_ids, block_size)))
-    # The empty slots are modelled as placeholder (-1) entries when computing
-    # how many extra chunks the longer sequence needs.
-    num_expected_appended_blocks = len(
-        list(chunk_list(token_ids + [-1] * num_empty_slots,
-                        block_size))) - num_expected_blocks_before_append
-    block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    # Assert that the empty slots consume the expected number of additional
-    # blocks.
-    assert len(
-        block_table.physical_block_ids) == num_expected_blocks_before_append
-    block_table.ensure_num_empty_slots(num_empty_slots)
-    assert len(
-        block_table.physical_block_ids
-    ) == num_expected_blocks_before_append + num_expected_appended_blocks
-    # Now, ensure no additional blocks consumed as we fill up the empty slots.
-    num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
-    block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
-    assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
-@pytest.mark.parametrize("block_size", [1, 8])
-@pytest.mark.parametrize("sequence_len", [1, 9])
-@pytest.mark.parametrize("append_len", [1, 16, 129])
-@pytest.mark.parametrize("append_size", [1, 4, 129])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
-                                          append_len: int, allocator_type: str,
-                                          append_size: int):
-    """Verify token ids are correctly appended. Appends ``append_len`` token
-    ids in pieces of at most ``append_size`` ids, and verifies the stored
-    sequence after every piece as well as the final sequence.
-    """
-    num_gpu_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=1024,
-        block_size=block_size,
-    )
-    token_ids = list(range(sequence_len))
-    token_ids_to_append = list(range(append_len))
-    block_table = BlockTable(
-        block_size=block_size,
-        block_allocator=allocator,
-    )
-    block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    # Mirror of everything appended so far, used as the expected value.
-    appended_so_far: list[int] = []
-    for append in chunk_list(token_ids_to_append, append_size):
-        block_table.append_token_ids(append)
-        appended_so_far.extend(append)
-        # Content must be correct after every partial append, not only at the
-        # end.
-        assert block_table._get_all_token_ids() == token_ids + appended_so_far
-    assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
-@pytest.mark.parametrize("seq_len", [1, 9, 129])
-@pytest.mark.parametrize("block_size", [1, 8])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_fork(seq_len: int, block_size: int, allocator_type: str):
-    """Create a sequence using the specified allocator.
-        1. Assert that after forking the sequence, the free block count is the
-            same.
-        2. Assert that the forked sequence has the same physical mappings.
-        3. Then free the original sequence; verify that the free block count is
-            the same.
-        4. Finally, free the forked sequence and verify that the free block
-            count returns to the total number of GPU blocks.
-    """
-    num_gpu_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=0,
-        block_size=block_size,
-    )
-    token_ids = list(range(seq_len))
-    block_table = BlockTable(
-        block_size=block_size,
-        block_allocator=allocator,
-    )
-    # No device is passed here, unlike the other tests in this file which pass
-    # device=Device.GPU; the free-block checks below query Device.GPU.
-    block_table.allocate(token_ids)
-    num_free_blocks_before_fork = allocator.get_num_free_blocks(
-        device=Device.GPU)
-    forked_block_table = block_table.fork()
-    # Expect physical_block_ids and token_ids to match.
-    assert (block_table.physical_block_ids ==
-            forked_block_table.physical_block_ids)
-    assert block_table._get_all_token_ids(
-    ) == forked_block_table._get_all_token_ids()
-    # Do not expect any additional allocations.
-    assert allocator.get_num_free_blocks(
-        device=Device.GPU) == num_free_blocks_before_fork
-    # Free the original blocks. Assert num free blocks does not change, since
-    # refcount is nonzero.
-    block_table.free()
-    assert allocator.get_num_free_blocks(
-        device=Device.GPU) == num_free_blocks_before_fork
-    # Expect the forked block table to be unaffected by the free.
-    assert all(block_id is not None
-               for block_id in forked_block_table.physical_block_ids)
-    # Free the forked blocks. Assert num free blocks does change, since
-    # refcount is now zero.
-    forked_block_table.free()
-    assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
-@pytest.mark.parametrize("block_size", [8])
-@pytest.mark.parametrize("sequence_len", [1, 16, 129])
-@pytest.mark.parametrize("append_len", [1, 16, 129])
-@pytest.mark.parametrize("appender", ["forked", "original"])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_cow(block_size: int, sequence_len: int, append_len: int,
-             allocator_type: str, appender: str):
-    """Fork a sequence; append to one of the two block tables; verify there's
-    a CoW.
-    ``appender`` selects whether the forked or the original block table
-    receives the appended tokens; the other one must stay unchanged.
-    """
-    num_gpu_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=0,
-        block_size=block_size,
-    )
-    token_ids = list(range(sequence_len))
-    token_ids_to_append = list(range(append_len))
-    original_block_table = BlockTable(
-        block_size=block_size,
-        block_allocator=allocator,
-    )
-    # Blocks held by the initial sequence (shared after the fork).
-    num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
-    # Blocks the appender needs on top of that: everything from the first
-    # non-full block of the original sequence up to the end of the extended
-    # sequence.
-    num_expected_cow_blocks = cdiv(sequence_len + append_len,
-                                   block_size) - (sequence_len // block_size)
-    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    # Snapshot (copy) of the block ids before any append.
-    original_block_ids = original_block_table.physical_block_ids[:]
-    # Debug aid only; not part of any assertion.
-    print("original_block_ids = {}".format(original_block_ids))
-    forked_block_table = original_block_table.fork()
-    # Expect no additional allocation (copy on _write_).
-    assert allocator.get_num_free_blocks(
-        Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)
-    if appender == "forked":
-        appender_block_table = forked_block_table
-        static_block_table = original_block_table
-    elif appender == "original":
-        appender_block_table = original_block_table
-        static_block_table = forked_block_table
-    else:
-        raise ValueError(f"unknown test config {appender=}")
-    # Write tokens.
-    appender_block_table.append_token_ids(token_ids_to_append)
-    # Expect the non-appending block table to have no change.
-    assert static_block_table.physical_block_ids == original_block_ids
-    assert appender_block_table.physical_block_ids != original_block_ids
-    # Expect the blocks changed during append to have a CoW.
-    assert allocator.get_num_free_blocks(
-        Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
-                                         num_expected_cow_blocks)
-    cows = allocator.clear_copy_on_writes()
-    if sequence_len % block_size > 0:
-        # If the last block in the sequence is not full, then when appending we
-        # expect a CoW.
-        assert cows
-        # Index of the partially filled block within the block table.
-        cow_block_id = sequence_len // block_size
-        expected_src = static_block_table.physical_block_ids[cow_block_id]
-        expected_dst = appender_block_table.physical_block_ids[cow_block_id]
-        assert (expected_src, expected_dst) in cows
-    else:
-        # Otherwise, there should be no copy-on-write.
-        assert not cows
-    static_block_table.free()
-    appender_block_table.free()
-    # After free, expect all blocks to be freed.
-    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
-@pytest.mark.parametrize("block_size", [8])
-@pytest.mark.parametrize("sequence_len", [1, 16, 129])
-@pytest.mark.parametrize("append_len", [1, 16, 129])
-@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
-@pytest.mark.parametrize("appender", ["forked", "original"])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_cow_lookahead_simple(block_size: int, sequence_len: int,
-                              append_len: int, lookahead_slots: int,
-                              allocator_type: str, appender: str):
-    """Similar to test_cow, except with lookahead allocation. The assertions are
-    less rigorous due to the complexity of the property under test.
-    Lookahead slots are reserved on the original block table before the fork,
-    so both block tables start from the same block ids.
-    """
-    num_gpu_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=0,
-        block_size=block_size,
-    )
-    token_ids = list(range(sequence_len))
-    token_ids_to_append = list(range(append_len))
-    original_block_table = BlockTable(
-        block_size=block_size,
-        block_allocator=allocator,
-    )
-    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    # Allocate lookahead slots.
-    original_block_table.ensure_num_empty_slots(lookahead_slots)
-    # Snapshot (copy) of the block ids, including the lookahead blocks.
-    original_block_ids = original_block_table.physical_block_ids[:]
-    forked_block_table = original_block_table.fork()
-    if appender == "forked":
-        appender_block_table = forked_block_table
-        static_block_table = original_block_table
-    elif appender == "original":
-        appender_block_table = original_block_table
-        static_block_table = forked_block_table
-    else:
-        raise ValueError(f"unknown test config {appender=}")
-    # Write tokens.
-    appender_block_table.append_token_ids(token_ids_to_append)
-    # Expect the non-appending block table to have no change.
-    assert static_block_table.physical_block_ids == original_block_ids
-    assert appender_block_table.physical_block_ids != original_block_ids
-    cows = allocator.clear_copy_on_writes()
-    # Always expect copy-on-write
-    assert cows
-    if sequence_len % block_size > 0:
-        # If the last block in the sequence is not full, then when appending we
-        # expect a CoW.
-        # (This assertion repeats the unconditional one just above.)
-        assert cows
-        # Index of the partially filled block within the block table.
-        cow_block_id = sequence_len // block_size
-        expected_src = static_block_table.physical_block_ids[cow_block_id]
-        expected_dst = appender_block_table.physical_block_ids[cow_block_id]
-        assert (expected_src, expected_dst) in cows
-    static_block_table.free()
-    appender_block_table.free()
-    # After free, expect all blocks to be freed.
-    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
-@pytest.mark.parametrize("block_size", [1, 8])
-@pytest.mark.parametrize("sequence_len", [1, 16, 129])
-@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
-@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
-                                            num_new_tokens: int,
-                                            num_lookahead_slots: int,
-                                            allocator_type: str):
-    """Verify correct calculation of get_num_blocks_touched_by_append_slots.
-    This is done by using copy-on-write, which requires any modified block to
-    be copied before write if the refcount > 1. We set the refcount>1 by forking
-    a sequence, then measure the free blocks before and after an append. If the
-    number of consumed blocks equals what
-    `get_num_blocks_touched_by_append_slots` returns, then the calculation is
-    correct.
-    """
-    num_gpu_blocks = 1024
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=0,
-        block_size=block_size,
-    )
-    token_ids = list(range(sequence_len))
-    token_ids_to_append = list(range(num_new_tokens))
-    block_table = BlockTable(
-        block_size=block_size,
-        block_allocator=allocator,
-    )
-    block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    # Add lookahead before fork so both sequences have the same lookahead
-    # blocks.
-    block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)
-    # Fork sequence so that every block has refcount > 1.
-    # The forked table itself is not used afterwards.
-    _ = block_table.fork()
-    # Determine how many blocks should be touched.
-    expected_num_touched_blocks = (
-        block_table.get_num_blocks_touched_by_append_slots(
-            token_ids=token_ids_to_append,
-            num_lookahead_slots=num_lookahead_slots))
-    # Measure how many blocks are touched by measuring num_free_blocks before
-    # and after the append.
-    #
-    # We expect append_token_ids to CoW all mutated blocks that have refcount>1.
-    num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU)
-    block_table.append_token_ids(token_ids_to_append, num_lookahead_slots)
-    num_consumed_blocks = (num_free_blocks_before_append -
-                           allocator.get_num_free_blocks(Device.GPU))
-    # TODO(cade) ensure equality when num_lookahead_slots > 0.
-    # The reason we have < is because lookahead blocks are not copied eagerly;
-    # they are copied on first write. This will cause issues for beam search +
-    # speculative decoding. This is acceptable for now as it is a large effort
-    # to combine the two. To fix this, we can ensure single sequence ownership
-    # of lookahead blocks by appending empty slots to each block, which will
-    # trigger the CoW.
-    #
-    # Until then, we can accept that the consumed blocks are <= the expected
-    # blocks when appending with lookahead.
-    #
-    # NOTE(review): every parametrized num_lookahead_slots value (1, 7, 8) is
-    # > 0, so the strict-equality branch below is never exercised.
-    if num_lookahead_slots > 0:
-        assert num_consumed_blocks <= expected_num_touched_blocks
-    else:
-        assert num_consumed_blocks == expected_num_touched_blocks
--- a/tests/core/block/test_common.py
+++ b/tests/core/block/test_common.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import random
-import pytest
-from vllm.core.block.common import RefCounter
-@pytest.mark.parametrize("seed", list(range(20)))
-@pytest.mark.parametrize("num_incrs", [1, 100])
-@pytest.mark.parametrize("num_blocks", [1024])
-def test_incr(seed: int, num_incrs: int, num_blocks: int):
-    random.seed(seed)
-    all_block_indices = list(range(num_blocks))
-    counter = RefCounter(all_block_indices=all_block_indices)
-    block_id = random.randint(0, num_blocks - 1)
-    for i in range(num_incrs):
-        value = counter.incr(block_id)
-        assert value == i + 1
-@pytest.mark.parametrize("seed", list(range(20)))
-@pytest.mark.parametrize("num_incrs", [1, 100])
-@pytest.mark.parametrize("num_blocks", [1024])
-def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
-    random.seed(seed)
-    all_block_indices = list(range(num_blocks))
-    counter = RefCounter(all_block_indices=all_block_indices)
-    block_id = random.randint(0, num_blocks - 1)
-    for i in range(num_incrs):
-        value = counter.incr(block_id)
-        assert value == i + 1
-    for i in range(num_incrs):
-        value = counter.decr(block_id)
-        assert value == num_incrs - (i + 1)
-    with pytest.raises(AssertionError):
-        counter.decr(block_id)
--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
-from vllm.utils import Device, chunk_list
-@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
-@pytest.mark.parametrize("num_gpu_blocks", [1024])
-@pytest.mark.parametrize("block_size", [16])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
-                                block_size: int, allocator_type: str):
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=num_cpu_blocks,
-        block_size=block_size,
-    )
-    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
-    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
-    cpu_blocks = [
-        allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
-        for _ in range(num_cpu_blocks)
-    ]
-    assert allocator.get_num_free_blocks(Device.CPU) == 0
-    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
-    gpu_blocks = [
-        allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
-        for _ in range(num_gpu_blocks)
-    ]
-    assert allocator.get_num_free_blocks(Device.CPU) == 0
-    assert allocator.get_num_free_blocks(Device.GPU) == 0
-    _ = [allocator.free(block) for block in cpu_blocks]
-    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
-    assert allocator.get_num_free_blocks(Device.GPU) == 0
-    _ = [allocator.free(block) for block in gpu_blocks]
-    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
-    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
-@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
-@pytest.mark.parametrize("num_gpu_blocks", [1024])
-@pytest.mark.parametrize("block_size", [2])
-@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
-                                  block_size: int, allocator_type: str):
-    allocator = CpuGpuBlockAllocator.create(
-        allocator_type=allocator_type,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=num_cpu_blocks,
-        block_size=block_size,
-    )
-    unique_token_ids = list(
-        range((num_cpu_blocks + num_gpu_blocks) * block_size))
-    gpu_token_ids = list(
-        chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
-    cpu_token_ids = list(
-        chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))
-    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
-    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
-    cpu_blocks = [
-        allocator.allocate_immutable_block(prev_block=None,
-                                           token_ids=token_ids,
-                                           device=Device.CPU)
-        for token_ids in cpu_token_ids
-    ]
-    assert allocator.get_num_free_blocks(Device.CPU) == 0
-    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
-    gpu_blocks = [
-        allocator.allocate_immutable_block(prev_block=None,
-                                           token_ids=token_ids,
-                                           device=Device.GPU)
-        for token_ids in gpu_token_ids
-    ]
-    assert allocator.get_num_free_blocks(Device.CPU) == 0
-    assert allocator.get_num_free_blocks(Device.GPU) == 0
-    _ = [allocator.free(block) for block in cpu_blocks]
-    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
-    assert allocator.get_num_free_blocks(Device.GPU) == 0
-    _ = [allocator.free(block) for block in gpu_blocks]
-    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
-    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
-import pytest
-from vllm.core.block.interfaces import Block, BlockAllocator
-from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
-class TestNaiveBlockAllocator:
-    @staticmethod
-    def create_allocate_lambda(allocate_type: str,
-                               allocator: NaiveBlockAllocator,
-                               prev_block: Optional[Block],
-                               token_ids: list[int]):
-        if allocate_type == "immutable":
-            allocate_block = lambda: allocator.allocate_immutable_block(
-                prev_block=prev_block, token_ids=token_ids)
-        elif allocate_type == "mutable":
-            allocate_block = lambda: allocator.allocate_mutable_block(
-                prev_block=prev_block)
-        else:
-            raise ValueError()
-        return allocate_block
-    @staticmethod
-    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
-    @pytest.mark.parametrize("num_blocks", [1, 1024])
-    @pytest.mark.parametrize("block_size", [1, 16])
-    def test_allocate_ooms(allocate_type: str, num_blocks: int,
-                           block_size: int):
-        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
-                                        num_blocks=num_blocks,
-                                        block_size=block_size)
-        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
-            allocate_type,
-            allocator,
-            prev_block=None,
-            token_ids=list(range(block_size)))
-        [allocate_block() for _ in range(num_blocks)]
-        with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocate_block()
-    @staticmethod
-    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
-    @pytest.mark.parametrize("num_blocks", [1, 1024])
-    @pytest.mark.parametrize("block_size", [1, 16])
-    def test_free_prevents_oom(allocate_type: str, num_blocks: int,
-                               block_size: int):
-        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
-                                        num_blocks=num_blocks,
-                                        block_size=block_size)
-        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
-            allocate_type,
-            allocator,
-            prev_block=None,
-            token_ids=list(range(block_size)))
-        blocks = [allocate_block() for _ in range(num_blocks)]
-        with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocate_block()
-        block_to_free = blocks.pop()
-        for _ in range(100):
-            block_id = block_to_free.block_id
-            allocator.free(block_to_free)
-            assert block_to_free.block_id is None
-            new_block = allocate_block()
-            assert new_block.block_id == block_id
-            with pytest.raises(BlockAllocator.NoFreeBlocksError):
-                allocate_block()
-            block_to_free = new_block
-    @staticmethod
-    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
-    @pytest.mark.parametrize("num_blocks", [1024])
-    @pytest.mark.parametrize("block_size", [16])
-    def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
-                                 block_size: int):
-        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
-                                        num_blocks=num_blocks,
-                                        block_size=block_size)
-        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
-            allocate_type,
-            allocator,
-            prev_block=None,
-            token_ids=list(range(block_size)))
-        assert allocator.get_num_free_blocks() == num_blocks
-        blocks = [allocate_block() for _ in range(num_blocks)]
-        for i, block in enumerate(blocks):
-            assert allocator.get_num_free_blocks() == i
-            allocator.free(block)
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [4])
-    @pytest.mark.parametrize("block_size", [8])
-    def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
-        """ Verify the allocator can correctly return the number of
-        full blocks touched.
-        """
-        allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
-                                            num_blocks=num_blocks,
-                                            block_size=block_size)
-        allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock,
-                                            num_blocks=num_blocks,
-                                            block_size=block_size)
-        # Create a chain of cacheable blocks in the dst
-        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
-            "immutable",
-            allocator_src,
-            prev_block=None,
-            token_ids=list(range(block_size)))
-        src_blocks = [allocate_block() for _ in range(num_blocks - 1)]
-        # All blocks are cached
-        assert allocator_dst.get_num_full_blocks_touched(
-            src_blocks) == num_blocks - 1
-        # Insert one non-full block in the src
-        allocate_non_full_block = \
-            TestNaiveBlockAllocator.create_allocate_lambda(
-                "mutable", allocator_src,
-                prev_block=src_blocks[-1],token_ids=[]
-            )
-        src_blocks.append(allocate_non_full_block())
-        src_blocks[-1].append_token_ids([0])
-        assert allocator_dst.get_num_full_blocks_touched(
-            src_blocks) == num_blocks - 1
-        # Fill up the last source block and then invoke
-        # get_num_blocks_touched
-        src_blocks[-1].append_token_ids([0] * (block_size - 1))
-        assert allocator_dst.get_num_full_blocks_touched(
-            src_blocks) == num_blocks
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import math
-import random
-from typing import Optional
-from unittest.mock import MagicMock
-import pytest
-from tests.core.utils import create_dummy_lora_sequence, create_dummy_sequence
-from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
-from vllm.core.block.interfaces import Block, BlockAllocator
-from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
-                                                  PrefixCachingBlock,
-                                                  PrefixCachingBlockAllocator)
-from vllm.sequence import Logprob
-from vllm.utils import Device
-class TestPrefixCachingBlock:
-    @staticmethod
-    @pytest.mark.parametrize("seed", list(range(10)))
-    @pytest.mark.parametrize("block_size", [1, 16])
-    @pytest.mark.parametrize("is_curr_block_full", [True, False])
-    def test_first_block_has_correct_content_hash(seed: int, block_size: int,
-                                                  is_curr_block_full: bool):
-        """Verify a block which is first in the sequence has the correct hash.
-        """
-        random.seed(seed)
-        num_to_fill = block_size if is_curr_block_full else random.randint(
-            0, block_size - 1)
-        token_ids = list(range(num_to_fill))
-        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
-        block_with_prev = PrefixCachingBlock(prev_block=None,
-                                             token_ids=token_ids,
-                                             block_size=block_size,
-                                             allocator=mock_allocator)
-        if is_curr_block_full:
-            # Expect hash since block is full.
-            assert block_with_prev.content_hash == (
-                PrefixCachingBlock.hash_block_tokens(
-                    is_first_block=True,
-                    prev_block_hash=None,
-                    cur_block_token_ids=token_ids))
-        else:
-            # Do not expect hash since block is not full.
-            assert block_with_prev.content_hash is None
-    @staticmethod
-    @pytest.mark.parametrize("seed", list(range(10)))
-    @pytest.mark.parametrize("block_size", [1, 16])
-    @pytest.mark.parametrize("is_curr_block_full", [True, False])
-    @pytest.mark.parametrize("prev_block_has_hash", [True, False])
-    def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
-                                                is_curr_block_full: bool,
-                                                prev_block_has_hash: bool):
-        """Verify a block which is not first in the sequence has the correct
-        hash.
-        """
-        random.seed(seed)
-        previous_block = MagicMock(spec=PrefixCachingBlock)
-        prev_block_hash = random.randint(0, 1000)
-        previous_block.content_hash = (prev_block_hash if prev_block_has_hash
-                                       else hash('None'))
-        num_to_fill = block_size if is_curr_block_full else random.randint(
-            0, block_size - 1)
-        token_ids = list(range(num_to_fill))
-        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
-        block_with_prev = PrefixCachingBlock(
-            prev_block=previous_block,
-            token_ids=token_ids,
-            block_size=block_size,
-            allocator=mock_allocator,
-        )
-        if is_curr_block_full and prev_block_has_hash:
-            # Expect hash since block is full and previous block has hash.
-            assert (block_with_prev.content_hash ==
-                    PrefixCachingBlock.hash_block_tokens(
-                        is_first_block=False,
-                        prev_block_hash=prev_block_hash,
-                        cur_block_token_ids=token_ids))
-        else:
-            # Do not expect hash since block is not full or the previous block
-            # does not have a hash.
-            assert block_with_prev.content_hash is None
-    @staticmethod
-    @pytest.mark.parametrize("block_size", [1, 2, 16])
-    @pytest.mark.parametrize("num_tokens", list(range(3)))
-    @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
-    def test_blocks_have_correct_hash_in_chain(block_size: int,
-                                               num_tokens: int,
-                                               num_empty_trailing_blocks: int):
-        """Create two chains of logical blocks with the same contents.
-        Assert the hashes are equal.
-        """
-        random.seed(0)
-        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
-        first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            num_empty_trailing_blocks=num_empty_trailing_blocks)
-                                     for _ in range(2))
-        for first_chain_block, second_chain_block in zip(
-                first_chain, second_chain):
-            assert (first_chain_block.content_hash ==
-                    second_chain_block.content_hash)
-        if not first_chain or not second_chain:
-            assert first_chain == second_chain
-            assert num_tokens == 0
-    @staticmethod
-    def create_chain(block_size: int,
-                     token_ids: list[int],
-                     num_empty_trailing_blocks=0) -> list[PrefixCachingBlock]:
-        """Helper method which creates a chain of blocks.
-        """
-        blocks: list[PrefixCachingBlock] = []
-        num_blocks = math.ceil(
-            len(token_ids) / block_size) + num_empty_trailing_blocks
-        if num_blocks == 0:
-            return []
-        allocator = MagicMock(spec=PrefixCachingBlockAllocator)
-        prev_block = None
-        for block_number in range(0, num_blocks):
-            prev_block = PrefixCachingBlock(
-                prev_block=prev_block,
-                token_ids=[],
-                block_size=block_size,
-                allocator=allocator,
-            )
-            tokens_to_append = token_ids[block_number *
-                                         block_size:(block_number + 1) *
-                                         block_size]
-            if tokens_to_append:
-                prev_block.append_token_ids(tokens_to_append)
-            blocks.append(prev_block)
-        return blocks
-class TestPrefixCachingBlockAllocator:
-    @staticmethod
-    def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
-                               prev_block: Optional[Block],
-                               token_ids: list[int]):
-        if allocate_type == "immutable":
-            allocate_block = lambda: allocator.allocate_immutable_block(
-                prev_block=prev_block, token_ids=token_ids)
-        elif allocate_type == "mutable":
-            allocate_block = lambda: allocator.allocate_mutable_block(
-                prev_block=prev_block)
-        else:
-            raise ValueError()
-        return allocate_block
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [1, 1024])
-    @pytest.mark.parametrize("block_size", [1, 16])
-    def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
-            allocate_type="mutable",
-            allocator=allocator,
-            prev_block=None,
-            token_ids=list(range(block_size)),
-        )
-        [allocate_block() for _ in range(num_blocks)]
-        with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocate_block()
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [1, 1024])
-    @pytest.mark.parametrize("block_size", [1, 16])
-    def test_allocate_immutable_does_not_oom_single_hash(
-            num_blocks: int, block_size: int):
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
-            allocate_type="immutable",
-            allocator=allocator,
-            prev_block=None,
-            token_ids=list(range(block_size)),
-        )
-        blocks = [allocate_block() for _ in range(num_blocks)]
-        # Expect no OOM. If these were mutable blocks, this would OOM.
-        non_oom_block = allocate_block()
-        # Expect all blocks to have same physical block index.
-        for block in blocks:
-            assert (block.block_id == non_oom_block.block_id)
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [1, 1024])
-    @pytest.mark.parametrize("block_size", [1, 16])
-    def test_allocate_immutable_ooms_many_hash(num_blocks: int,
-                                               block_size: int):
-        """Consume all blocks using many different hashes/block content.
-        Do this by creating a sequence that is very long.
-        Expect next block to OOM.
-        """
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        # Create token ids that will exhaust all blocks.
-        token_ids = list(range(num_blocks * block_size))
-        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        # Expect allocation with unseen hash to fail.
-        with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_immutable_block(prev_block=chain[-1],
-                                               token_ids=list(
-                                                   range(block_size)))
-        # Expect mutable allocation to fail.
-        with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_mutable_block(prev_block=chain[-1])
-        # Expect allocation of exact same chain to pass.
-        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        # Expect physical block indices to be the same in both chains.
-        assert chain and second_chain
-        for first_chain_block, second_chain_block in zip(chain, second_chain):
-            assert (first_chain_block.block_id == second_chain_block.block_id)
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [1, 1024])
-    @pytest.mark.parametrize("block_size", [1, 16])
-    def test_free_prevents_oom(num_blocks: int, block_size: int):
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        # Create token ids that will exhaust all blocks.
-        token_ids = list(range(num_blocks * block_size))
-        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        # Expect mutable allocation to fail.
-        with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_mutable_block(prev_block=None)
-        block_to_free = chain[-1]
-        # Expect free/allocate loop to succeed many times.
-        for i in range(100):
-            block_id = block_to_free.block_id
-            allocator.free(block_to_free)
-            assert block_to_free.block_id is None, i
-            new_block = allocator.allocate_mutable_block(prev_block=None)
-            assert new_block.block_id == block_id, i
-            with pytest.raises(BlockAllocator.NoFreeBlocksError):
-                allocator.allocate_mutable_block(prev_block=None)
-            block_to_free = new_block
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [1024])
-    @pytest.mark.parametrize("block_size", [16])
-    @pytest.mark.parametrize("seed", list(range(20)))
-    def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
-        random.seed(seed)
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        num_blocks_to_consume = random.randint(1, num_blocks - 1)
-        # Create token ids that will exhaust all blocks.
-        token_ids = list(range(num_blocks_to_consume * block_size))
-        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        # Free each block in chain, assert num free blocks includes new free
-        # block.
-        for i, block in enumerate(chain):
-            assert allocator.get_num_free_blocks() == (num_blocks -
-                                                       num_blocks_to_consume +
-                                                       i)
-            allocator.free(block)
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [4])
-    @pytest.mark.parametrize("block_size", [8])
-    def test_prefix_caching_block_get_num_full_blocks_touched(
-            num_blocks, block_size):
-        """ Verify the allocator can correctly return the number of
-        blocks touched, when there are cached prefixes.
-        """
-        allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                    block_size=block_size)
-        allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                    block_size=block_size)
-        # Create token ids that will exhaust all blocks except the last
-        token_ids = list(range((num_blocks - 1) * block_size))
-        # Create a chain of cacheable blocks in the dst
-        cached_blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator_dst,
-        )
-        # Create a chain of the same blocks in the src
-        blocks_to_swap_in = \
-            TestPrefixCachingBlockAllocator.create_immutable_chain(
-                block_size=block_size,
-                token_ids=token_ids,
-                allocator=allocator_src,
-            )
-        # All blocks are cached
-        assert allocator_dst.get_num_full_blocks_touched(
-            blocks_to_swap_in) == 0
-        # Free the first block in the dst
-        allocator_dst.free(cached_blocks[0])
-        # Now the first block becomes dangling, the swapped blocks need
-        # to reclaim the first block in the dst
-        assert allocator_dst.get_num_full_blocks_touched(
-            blocks_to_swap_in) == 1
-        # Insert one non-full block in the src
-        non_full_block = allocator_src.allocate_mutable_block(
-            blocks_to_swap_in[-1])
-        non_full_block.append_token_ids([0])
-        blocks_to_swap_in.append(non_full_block)
-        assert allocator_dst.get_num_full_blocks_touched(
-            blocks_to_swap_in) == 1
-        # Fill up the last mutable block and invoke get_num_blocks_touched.
-        # Note: The last block is not cached so it will be touched.
-        non_full_block.append_token_ids([0] * (block_size - 1))
-        assert allocator_dst.get_num_full_blocks_touched(
-            blocks_to_swap_in) == 2
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [1024])
-    @pytest.mark.parametrize("block_size", [16])
-    @pytest.mark.parametrize("seed", list(range(20)))
-    def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
-                                        seed: int):
-        """Verify sharing occurs by allocating two sequences that share prefixes
-        and incrementally freeing blocks.
-        """
-        random.seed(seed)
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        num_blocks_to_consume = random.randint(1, num_blocks - 1)
-        # Create token ids that will exhaust all blocks.
-        token_ids = list(range(num_blocks_to_consume * block_size))
-        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        # Free each block in the first chain. Since all blocks are shared, the
-        # free count should stay constant.
-        for i, block in enumerate(first_chain):
-            assert allocator.get_num_free_blocks() == (num_blocks -
-                                                       num_blocks_to_consume)
-            allocator.free(block)
-        # Free each block in the second chain. Since the refcount is now zero,
-        # the free count should increment with each free.
-        for i, block in enumerate(second_chain):
-            assert allocator.get_num_free_blocks() == (num_blocks -
-                                                       num_blocks_to_consume +
-                                                       i)
-            allocator.free(block)
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [1024])
-    @pytest.mark.parametrize("block_size", [16])
-    @pytest.mark.parametrize("seed", list(range(20)))
-    def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
-                                           seed: int):
-        """Verify get_common_computed_block_ids could get correct result
-        by create two immutable chain sharing prefix at specified pos,
-        and compare whether we also could get right result
-        from get_common_computed_block_ids.
-        """
-        random.seed(seed)
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2,
-                                                block_size=block_size)
-        num_blocks_to_consume = random.randint(1, num_blocks - 1)
-        # Create token ids that will exhaust all blocks.
-        token_ids = list(range(num_blocks_to_consume * block_size))
-        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        # After zero_point, second_chain's token_ids would be set -1, which
-        # make it different from here comparing with first_chain
-        zero_point = random.randint(1, len(token_ids) - 1)
-        zero_point_blocks = zero_point // block_size
-        token_ids[zero_point:] = [-1] * (len(token_ids) - zero_point)
-        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        first_computed_ids = [
-            first_chain[i].block_id for i in range(num_blocks_to_consume)
-        ]
-        second_computed_ids = [
-            second_chain[i].block_id for i in range(num_blocks_to_consume)
-        ]
-        res = allocator.get_common_computed_block_ids(
-            [first_computed_ids, second_computed_ids])
-        assert (len(res) == zero_point_blocks)
-    # Test case that assumes a block promoted after the first immutable block
-    # is freed into the hashless allocator, while the first immutable block
-    # gets its refcount increased.
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [3])
-    @pytest.mark.parametrize("block_size", [16])
-    @pytest.mark.parametrize("seed", list(range(10)))
-    def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
-        random.seed(seed)
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        token_ids = list(range(block_size))
-        block = allocator.allocate_immutable_block(prev_block=None,
-                                                   token_ids=token_ids)
-        assert allocator._refcounter.get(block.block_id) == 1
-        m = allocator.allocate_mutable_block(prev_block=None)
-        block_id = m.block_id
-        for i in range(block_size):
-            m.append_token_ids([i])
-        # After the block is promoted from mutable to immutable, if a block
-        # with the same content hash already exists, the promoted block shall
-        # be released into the hashless allocator,
-        # and the first immutable block's refcount is increased by 1.
-        assert m.block_id == block.block_id
-        assert block_id in allocator._hashless_allocator._free_block_indices
-        assert allocator._refcounter.get(block.block_id) == 2
-    # Test case when eviction and allocation are mixed,
-    # make sure they work as expected
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [3])
-    @pytest.mark.parametrize("block_size", [16])
-    @pytest.mark.parametrize("seed", list(range(10)))
-    def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
-        random.seed(seed)
-        all_blocks_list = [i for i in range(num_blocks)]
-        zero_ref = {i: 0 for i in range(num_blocks)}
-        one_ref = {i: 1 for i in range(num_blocks)}
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        token_ids = list(range(num_blocks * block_size))
-        # Verify initial/pre-alloc state
-        # Ensure all blocks are free inside hashless allocator
-        assert list(allocator._hashless_allocator._free_block_indices
-                    ) == all_blocks_list
-        # Ensure every block has a tracker entry, but none is active
-        assert len(allocator._block_tracker.keys()) == num_blocks
-        for block_id in range(num_blocks):
-            assert not allocator._block_tracker[block_id].active
-        # Ensure no cached blocks
-        assert len(allocator._cached_blocks.values()) == 0
-        # Ensure no evicted blocks
-        assert len(allocator.evictor.free_table.keys()) == 0
-        # Ensure 0s ref counts for all blocks
-        assert allocator._refcounter._refcounts == zero_ref
-        # Allocate immutable chains, each consisting of a single block
-        new_block = []
-        for i in range(num_blocks):
-            block = allocator.allocate_immutable_block(
-                prev_block=None,
-                token_ids=token_ids[block_size * i:block_size * (i + 1)])
-            new_block.append(block)
-        # Verify post-alloc state
-        # Ensure no blocks are free inside hashless allocator
-        assert (len(allocator._hashless_allocator._free_block_indices) == 0)
-        # Ensure all blocks are tracked
-        assert len(allocator._block_tracker.keys()) == num_blocks
-        for block_id in range(num_blocks):
-            assert allocator._block_tracker[block_id].active
-        # Ensure all blocks are cached (all promoted)
-        assert len(allocator._cached_blocks.values()) == num_blocks
-        # Ensure no evicted blocks
-        assert len(allocator.evictor.free_table.keys()) == 0
-        # Ensure 1s ref counts for all blocks
-        assert allocator._refcounter._refcounts == one_ref
-        # Free all blocks, and now all blocks shall be in the evictor
-        # no block shall remain active in _block_tracker
-        # all blocks shall be tracked in _cached_blocks
-        # all blocks' ref shall be zero
-        for block in new_block:
-            allocator.free(block)
-        # Verify post-free state
-        # Ensure every block has a tracker entry, but none is active
-        assert len(allocator._block_tracker.keys()) == num_blocks
-        for block_id in range(num_blocks):
-            assert not allocator._block_tracker[block_id].active
-        # Ensure no blocks in hashless allocator (all promoted)
-        assert len(allocator._hashless_allocator._free_block_indices) == 0
-        # Ensure all blocks are cached
-        assert list(allocator._cached_blocks.values()) == all_blocks_list
-        # Ensure all blocks are inside the evictor
-        assert list(allocator.evictor.free_table.keys()) == all_blocks_list
-        # Ensure 0s refcounts
-        assert allocator._refcounter._refcounts == zero_ref
-        # Allocate a mutable block: the first block shall be evicted, its
-        # content hash reset to None, and its refcount set to 1
-        mutable = allocator.allocate_mutable_block(prev_block=None)
-        assert mutable.block_id == 0
-        assert mutable.content_hash is None
-        assert allocator._block_tracker[0].active
-        assert allocator._refcounter.get(0) == 1
-        assert 0 not in allocator._cached_blocks
-        assert 0 not in allocator.evictor
-        # Since this mutable block has no hash yet, it shall be released into
-        # hashless allocator
-        allocator.free(mutable)
-        assert not allocator._block_tracker[0].active
-        assert allocator._refcounter._refcounts == zero_ref
-        assert 0 not in allocator._cached_blocks
-        assert 0 not in allocator.evictor
-        assert 0 in allocator._hashless_allocator._free_block_indices
-        # When allocate immutable with first block_size tokens, we
-        # shall get free block from hashless allocator, thus no block left
-        # in hashless
-        block = allocator.allocate_immutable_block(
-            prev_block=None, token_ids=token_ids[:block_size])
-        assert block.block_id == 0
-        assert len(allocator._hashless_allocator._free_block_indices) == 0
-        assert allocator._block_tracker[0].active
-        assert 0 in allocator._cached_blocks.values()
-        assert allocator._refcounter.get(0) == 1
-        assert 0 not in allocator.evictor
-        # allocate mutable block again, it shall be popped from evictor
-        mutable = allocator.allocate_mutable_block(prev_block=None)
-        assert len(allocator._hashless_allocator._free_block_indices) == 0
-        assert mutable.block_id not in allocator.evictor.free_table
-        assert allocator._refcounter.get(mutable.block_id) == 1
-    # Test case where two last accessed times are equal
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [1024])
-    @pytest.mark.parametrize("block_size", [16])
-    @pytest.mark.parametrize("seed", list(range(20)))
-    def test_eviction_order(num_blocks: int, block_size: int, seed: int):
-        """This test case simulate the two chain created and free in order,
-        and together they exhaust all of the initially free blocks.
-        The next block allocated after those two chains shall therefore reuse
-        a block from the first chain, as it has the oldest last-accessed time.
-        Since the first chain has two blocks, the last one shall be picked,
-        as it has the larger number of tokens.
-        """
-        random.seed(seed)
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        num_blocks_to_consume = num_blocks + 1
-        token_ids = list(range(num_blocks_to_consume * block_size))
-        num_blocks_in_first_chain = 2
-        num_tokens_in_first_chain = block_size * num_blocks_in_first_chain
-        # First chain takes the first two blocks
-        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids[:num_tokens_in_first_chain],
-            allocator=allocator,
-        )
-        # Only the first chain's blocks should be allocated at this point
-        assert allocator.get_num_free_blocks() == (num_blocks -
-                                                   num_blocks_in_first_chain)
-        # Set the last accessed time of the first chain's blocks to 1
-        blocks_ids = [block.block_id for block in first_chain]
-        allocator.mark_blocks_as_accessed(blocks_ids, 1)
-        # Second chain takes the rest of the blocks
-        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids[num_tokens_in_first_chain:-block_size],
-            allocator=allocator,
-        )
-        # There shouldn't be any blocks left at this point
-        assert allocator.get_num_free_blocks() == (0)
-        assert len(first_chain) == num_blocks_in_first_chain
-        last_block_id = first_chain[-1].block_id
-        # Free each block in the first chain.
-        for i, block in enumerate(first_chain):
-            allocator.free(block)
-        # Set the last accessed time on all of the blocks in the second chain
-        # to 2
-        blocks_ids = [block.block_id for block in second_chain]
-        allocator.mark_blocks_as_accessed(blocks_ids, 2)
-        # Free each block in the second chain.
-        for i, block in enumerate(second_chain):
-            allocator.free(block)
-        # Allocate a new block and check that it's the least recently used block
-        # from the first chain.
-        new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids[-block_size:],
-            allocator=allocator,
-        )
-        assert new_block[0].block_id == last_block_id
-    # Test case for cache metrics
-    @staticmethod
-    def test_metric():
-        block_size = 16
-        allocator = PrefixCachingBlockAllocator(num_blocks=4,
-                                                block_size=block_size)
-        # Test when no query (0/0)
-        assert allocator.get_prefix_cache_hit_rate() == 0.0
-        token_ids = list(range(block_size))
-        allocator.allocate_immutable_block(prev_block=None,
-                                           token_ids=token_ids)
-        # Test 0/1 hit rate
-        assert allocator.get_prefix_cache_hit_rate() == 0.0
-        allocator.allocate_immutable_block(prev_block=None,
-                                           token_ids=token_ids)
-        # Test 1/2 hit rate
-        assert allocator.get_prefix_cache_hit_rate() == 0.5
-        # Test more than one block
-        for _ in range(2, 1005):
-            allocator.allocate_immutable_block(prev_block=None,
-                                               token_ids=token_ids)
-        assert allocator.get_prefix_cache_hit_rate() > 0.99
-    # Test case for marking cache hit blocks as computed right after
-    # a batch of prefill sequences are scheduled.
-    @staticmethod
-    def test_touch_block():
-        block_size = 16
-        common_blocks = 4
-        allocator = PrefixCachingBlockAllocator(num_blocks=8,
-                                                block_size=block_size)
-        common_token_ids = list(range(block_size * common_blocks))
-        # Mimic the behavior of allocating the same block chain
-        # (i.e., common prefix) for a batch of 3 different prefill sequences.
-        for _ in range(3):
-            blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
-                block_size=block_size,
-                token_ids=common_token_ids,
-                allocator=allocator,
-            )
-            block_hashes = [block.content_hash for block in blocks]
-            # The allocated blocks should be marked as touched
-            # but not computed.
-            computed_block_ids = allocator.find_cached_blocks_prefix(
-                block_hashes)
-            assert len(computed_block_ids) == 0
-        allocator.mark_blocks_as_computed([])
-        computed_block_ids = allocator.find_cached_blocks_prefix(
-            block_hashes=block_hashes)
-        assert len(computed_block_ids) == common_blocks
-    @staticmethod
-    def test_find_cached_blocks_prefix():
-        """
-        This test verifies the behavior of find_cached_blocks_prefix.
-        """
-        block_size = 4
-        num_blocks = 8
-        total_test_blocks = 12
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        token_ids = list(range(total_test_blocks * block_size))
-        block_tokens_seq1 = token_ids[:num_blocks * block_size]
-        blocks_seq1 = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=block_tokens_seq1,
-            allocator=allocator,
-        )
-        block_hashes_seq1 = [block.content_hash for block in blocks_seq1]
-        allocator.mark_blocks_as_computed([])
-        # All blocks should be cached.
-        cached_blocks_seq1 = allocator.find_cached_blocks_prefix(
-            block_hashes=block_hashes_seq1)
-        assert len(cached_blocks_seq1) == num_blocks
-        # Free the first sequence.
-        for block in blocks_seq1:
-            allocator.free(block)
-        # All blocks should still be cached if not required to be allocated.
-        cached_blocks = allocator.find_cached_blocks_prefix(
-            block_hashes=block_hashes_seq1)
-        assert len(cached_blocks) == num_blocks
-        block_tokens_seq2 = token_ids[num_blocks * block_size:]
-        blocks_seq2 = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=block_tokens_seq2,
-            allocator=allocator,
-        )
-        block_hashes_seq2 = [block.content_hash for block in blocks_seq2]
-        allocator.mark_blocks_as_computed([])
-        cached_blocks = allocator.find_cached_blocks_prefix(
-            block_hashes=block_hashes_seq2)
-        assert len(cached_blocks) == len(blocks_seq2)
-        # Half of the blocks from seq1 should still be cached.
-        num_evicted_blocks = len(blocks_seq2)
-        cached_blocks = allocator.find_cached_blocks_prefix(
-            block_hashes=block_hashes_seq1)
-        assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks
-    # Test reset prefix cache
-    @staticmethod
-    @pytest.mark.parametrize("num_blocks", [10])
-    @pytest.mark.parametrize("block_size", [16])
-    def test_reset_prefix_cache(num_blocks: int, block_size: int):
-        """This test case simulates the case of resetting the prefix cache."""
-        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
-                                                block_size=block_size)
-        token_ids = list(range(3 * block_size))
-        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=token_ids,
-            allocator=allocator,
-        )
-        # Free each block in the first chain.
-        for block in first_chain:
-            allocator.free(block)
-        # Failed to reset prefix cache because some blocks are not freed yet.
-        assert not allocator.reset_prefix_cache()
-        assert allocator.get_prefix_cache_hit_rate() > 0.0
-        # Free each block in the second chain.
-        for block in second_chain:
-            allocator.free(block)
-        # Reset prefix cache.
-        assert allocator.reset_prefix_cache()
-        assert allocator.get_prefix_cache_hit_rate() == 0.0
-    @staticmethod
-    def create_immutable_chain(
-        block_size: int,
-        token_ids: list[int],
-        allocator: PrefixCachingBlockAllocator,
-        extra_hash: Optional[int] = None,
-    ) -> list[PrefixCachingBlock]:
-        """Helper method which creates a chain of blocks.
-        """
-        blocks: list[Block] = []
-        num_blocks = math.ceil(len(token_ids) / block_size)
-        if num_blocks == 0:
-            return []
-        prev_block = None
-        for block_number in range(0, num_blocks):
-            block_token_ids = token_ids[block_number *
-                                        block_size:(block_number + 1) *
-                                        block_size]
-            prev_block = allocator.allocate_immutable_block(
-                prev_block=prev_block,
-                token_ids=block_token_ids,
-                extra_hash=extra_hash)
-            blocks.append(prev_block)
-        return blocks
-class TestComputedBlocksTracker:
-    @staticmethod
-    def _get_mock_allocator():
-        return MagicMock(spec=PrefixCachingBlockAllocator)
-    @staticmethod
-    def test_get_num_cached_tokens():
-        """
-        Test it correctly computes the number of cached tokens for a given
-        sequence:
-        - The cache token count is derived from the number of cached blocks.
-        - The cache token count is updated when the allocator is updated.
-        - When a sequence is removed, the cache token count should be updated
-        accordingly.
-        # TODO(rickyx): This behaviour for prefill sequence is a hack until
-        we fix the computed blocks tracking.
-        - The cache token count for prefill sequence doesn't change while
-        the sequence is in continuous prefill (chunked prefill).
-        """
-        block_size = 4
-        mock_allocator = TestComputedBlocksTracker._get_mock_allocator()
-        tracker = ComputedBlocksTracker(
-            allocator=mock_allocator,
-            block_size=block_size,
-            enable_caching=True,
-        )
-        # Not yet allocated.
-        tokens = [0, 1, 2, 3, 4, 5]
-        seq1 = create_dummy_sequence(request_id=0,
-                                     token_ids=tokens,
-                                     block_size=block_size)
-        mock_allocator.find_cached_blocks_prefix.return_value = []
-        assert tracker.get_num_cached_tokens(seq1) == 0
-        mock_allocator.find_cached_blocks_prefix.return_value = [
-            None
-        ]  # 1 block cached.
-        # Result is cached for prefill sequence.
-        assert tracker.get_num_cached_tokens(seq1) == 0
-        # Mark the sequence as non-prefill.
-        seq1.data.update_num_computed_tokens(len(tokens))  # 6 tokens computed.
-        assert not seq1.is_prefill()
-        # Recomputes for decoding sequence.
-        assert tracker.get_num_cached_tokens(seq1) == 4
-        # Append new tokens to the sequence.
-        num_new_tokens = 3
-        for i in range(num_new_tokens):
-            seq1.append_token_id(i, {i: Logprob(logprob=0.0)})
-        assert tracker.get_num_cached_tokens(seq1) == 4
-        # Update the allocator.
-        mock_allocator.find_cached_blocks_prefix.return_value = [
-            None
-        ] * 2  # 2 blocks cached.
-        assert tracker.get_num_cached_tokens(seq1) == 8
-        # Remove the sequence.
-        tracker.remove_seq(seq1.seq_id)
-        # Re-create the sequence with the same request id to simulate recompute.
-        seq1 = create_dummy_sequence(request_id=0,
-                                     token_ids=tokens,
-                                     block_size=block_size)
-        mock_allocator.find_cached_blocks_prefix.return_value = [
-        ]  # no cached block
-        assert tracker.get_num_cached_tokens(seq1) == 0
-    @staticmethod
-    def test_correct_block_hash():
-        """
-        Test that the block hash is correctly computed for a sequence (should
-        match the underlying block allocator's block hash). So the number of
-        cached tokens is correctly retrieved.
-        """
-        block_size = 4
-        allocator = CpuGpuBlockAllocator.create(
-            allocator_type="prefix_caching",
-            num_gpu_blocks=16,
-            num_cpu_blocks=16,
-            block_size=block_size,
-        )
-        gpu_allocator = allocator._allocators[Device.GPU]
-        tracker = ComputedBlocksTracker(
-            allocator=allocator,
-            block_size=block_size,
-            enable_caching=True,
-        )
-        tokens = list(range(block_size * 4))  # 4 blocks.
-        seq = create_dummy_sequence(request_id=0,
-                                    token_ids=tokens,
-                                    block_size=block_size)
-        _ = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=tokens,
-            allocator=gpu_allocator,
-        )
-        allocator.mark_blocks_as_computed([])
-        assert tracker.get_num_cached_tokens(seq) == len(tokens)
-    @staticmethod
-    def test_correct_extra_hash():
-        """
-        Test that the block hash is correctly computed based on the extra hash,
-        ensuring it matches the allocator's block hash, specifically for the
-        LoRA case, and that the correct number of cached tokens is retrieved.
-        """
-        block_size = 4
-        allocator = CpuGpuBlockAllocator.create(
-            allocator_type="prefix_caching",
-            num_gpu_blocks=16,
-            num_cpu_blocks=16,
-            block_size=block_size,
-        )
-        gpu_allocator = allocator._allocators[Device.GPU]
-        tracker = ComputedBlocksTracker(
-            allocator=allocator,
-            block_size=block_size,
-            enable_caching=True,
-        )
-        tokens = list(range(block_size * 4))
-        # Create a dummy LoRA sequence with a specific LoRA ID.
-        lora_seq = create_dummy_lora_sequence(request_id=0,
-                                              token_ids=tokens,
-                                              block_size=block_size,
-                                              lora_int_id=1)
-        _ = TestPrefixCachingBlockAllocator.create_immutable_chain(
-            block_size=block_size,
-            token_ids=tokens,
-            allocator=gpu_allocator,
-            extra_hash=lora_seq.extra_hash(),
-        )
-        allocator.mark_blocks_as_computed([])
-        # Create different dummy sequences that have the same token IDs
-        # but different LoRA IDs.
-        seq = create_dummy_sequence(request_id=1,
-                                    token_ids=tokens,
-                                    block_size=block_size)
-        different_lora_seq = create_dummy_lora_sequence(request_id=2,
-                                                        token_ids=tokens,
-                                                        block_size=block_size,
-                                                        lora_int_id=2)
-        # Due to the different LoRA IDs, corresponding blocks are not cached.
-        assert tracker.get_num_cached_tokens(seq) == 0
-        assert tracker.get_num_cached_tokens(different_lora_seq) == 0
-        # The number of cached tokens matches the length of the tokens
-        # for the cached LoRA sequence.
-        assert tracker.get_num_cached_tokens(lora_seq) == len(tokens)
--- a/tests/core/conftest.py
+++ b/tests/core/conftest.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from unittest.mock import MagicMock
-import pytest  # noqa
-from vllm.config import CacheConfig, SchedulerConfig
-from vllm.core.scheduler import Scheduler
-from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.llm_engine import LLMEngine
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import Logprob, SequenceGroup
-from .utils import create_dummy_prompt
-def get_sequence_groups(scheduler_output):
-    return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
-def append_new_token(seq_group: SequenceGroup, token_id: int):
-    for seq in seq_group.get_seqs():
-        seq.append_token_id(token_id, {token_id: Logprob(token_id)})
-def schedule_and_update_computed_tokens(scheduler):
-    metas, out, _ = scheduler.schedule()
-    for s, meta in zip(out.scheduled_seq_groups, metas):
-        s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
-    return metas, out
-def test_simple():
-    """Verify basic scheduling works."""
-    block_size = 4
-    num_seq_group = 4
-    max_model_len = 16
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig("generate",
-                                       max_num_batched_tokens,
-                                       num_seq_group,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    # Add seq groups to scheduler.
-    for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=block_size,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-    # Schedule seq groups prompts.
-    num_tokens = block_size * num_seq_group
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert out.num_batched_tokens == num_tokens
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == num_seq_group
-    for s in running:
-        append_new_token(s, 1)
-    # Schedule seq groups generation.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert out.num_batched_tokens == num_seq_group
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == num_seq_group
-def test_chunk():
-    """Verify prefills are chunked properly."""
-    block_size = 4
-    max_seqs = 60
-    max_model_len = 80
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 32
-    cache_config.num_gpu_blocks = 32
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-    # Verify the second request is chunked.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    print()
-    assert set(get_sequence_groups(out)) == set(running)
-    assert seq_group_meta[0].token_chunk_size == 60
-    # Verify it is chunked.
-    assert seq_group_meta[1].token_chunk_size == 4
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 64
-    # Only the first seq group has a new token appended.
-    append_new_token(running[0], 1)
-    # One chunked prefill, and one decoding.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    # The first one is prefill. Scheduler guarantees ordering.
-    assert seq_group_meta[0].token_chunk_size == 56
-    # The second one is the decode of a single token.
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 57
-def test_concurrent_chunking():
-    """Verify prefills are chunked properly when 
-    --max-num-partial-prefills is > 1"""
-    block_size = 4
-    max_seqs = 60
-    max_model_len = 2000
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-        max_num_partial_prefills=2,  # Up to 2 partial prefills at a time
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 32
-    cache_config.num_gpu_blocks = 32
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-    # Verify both requests are chunked with half of max_num_batched_tokens each
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert seq_group_meta[0].token_chunk_size == 32
-    assert seq_group_meta[1].token_chunk_size == 32
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 64
-    # After one iteration, both should have 60 - 32 = 28 tokens left to prefill
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert seq_group_meta[0].token_chunk_size == 28
-    assert seq_group_meta[1].token_chunk_size == 28
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 56
-def test_concurrent_chunking_large_requests():
-    """Verify large prefill requests are run one at a time"""
-    block_size = 4
-    max_seqs = 60
-    max_model_len = 2000
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-        max_num_partial_prefills=2,  # Up to 2 partial prefills at a time
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 3200  # large KV cache size for large requests
-    cache_config.num_gpu_blocks = 3200
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(
-            str(i),
-            prompt_length=1200,  # Very large prompt
-            block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-    # Verify only a single request is chunked, and it gets all 64 tokens
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 1
-    assert seq_group_meta[0].token_chunk_size == 64
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 64
-def test_short_prompts_jump_long_prompts_in_queue():
-    """Verify large prefill requests are punted behind smaller ones if 
-    another large prefill request is already running"""
-    block_size = 4
-    max_seqs = 60
-    max_model_len = 2000
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-        max_num_partial_prefills=2,  # Up to 2 partial prefills at a time
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 3200  # large KV cache size for large requests
-    cache_config.num_gpu_blocks = 3200
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    long_seqs: list[SequenceGroup] = []
-    short_seqs: list[SequenceGroup] = []
-    # Add 2 large seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(
-            str(i),
-            prompt_length=1200,  # Very large prompt
-            block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        long_seqs.append(seq_group)
-        assert seq_group.is_prefill()
-    # Add 2 small seq groups behind them
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(
-            str(i + 2),
-            prompt_length=40,  # Very small prompt
-            block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        short_seqs.append(seq_group)
-        assert seq_group.is_prefill()
-    # Verify one large req and 1 small req chunked
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert seq_group_meta[0].token_chunk_size == 32  # large req gets 32 tokens
-    assert seq_group_meta[1].token_chunk_size == 32  # small req gets 32 tokens
-    # all 4 are prefilling
-    assert long_seqs[0].is_prefill()
-    assert long_seqs[1].is_prefill()
-    assert short_seqs[0].is_prefill()
-    assert short_seqs[1].is_prefill()
-    # First short and first long sequences have been scheduled
-    assert long_seqs[0].first_seq.get_num_computed_tokens() == 32
-    assert long_seqs[1].first_seq.get_num_computed_tokens() == 0
-    assert short_seqs[0].first_seq.get_num_computed_tokens() == 32
-    assert short_seqs[1].first_seq.get_num_computed_tokens() == 0
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 64
-    # in the second iteration,
-    # the first small request had only 8 tokens left
-    # so it went to decode
-    # The other small req is scheduled
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    # the new small req got 64 - (32+8) tokens
-    assert seq_group_meta[0].token_chunk_size == 24
-    assert seq_group_meta[1].token_chunk_size == 32  # large req still got 32
-    # the other small request had only 8 tokens left
-    assert seq_group_meta[2].token_chunk_size == 8  # 40-32
-    # The first small request got to decode now
-    assert long_seqs[0].is_prefill()
-    assert long_seqs[1].is_prefill()
-    assert not short_seqs[0].is_prefill()
-    assert short_seqs[1].is_prefill()
-    # Both small requests have started in front of the second long request
-    assert long_seqs[0].first_seq.get_num_computed_tokens() == 64
-    assert long_seqs[1].first_seq.get_num_computed_tokens() == 0
-    assert short_seqs[0].first_seq.get_num_computed_tokens() == 40
-    assert short_seqs[1].first_seq.get_num_computed_tokens() == 24
-    assert out.num_prefill_groups == 3
-    assert out.num_batched_tokens == 64
-    # the first small seq group has a new token appended.
-    append_new_token(short_seqs[0], 1)
-    # in the third iteration,
-    # the first small request is already decoding
-    # the second small request only has 16 tokens left and will enter decoding
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert seq_group_meta[0].token_chunk_size == 32  # large still got 32
-    # small req finished prefilling 40-24=16 tokens
-    assert seq_group_meta[1].token_chunk_size == 16
-    assert seq_group_meta[2].token_chunk_size == 1  # decode
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 49  # (32+16+1 decode)
-    # both small requests have now reached decode
-    assert long_seqs[0].is_prefill()
-    assert long_seqs[1].is_prefill()
-    assert not short_seqs[0].is_prefill()
-    assert not short_seqs[1].is_prefill()
-    assert long_seqs[0].first_seq.get_num_computed_tokens() == 96
-    assert long_seqs[1].first_seq.get_num_computed_tokens() == 0
-    assert short_seqs[0].first_seq.get_num_computed_tokens() == 41
-    assert short_seqs[1].first_seq.get_num_computed_tokens() == 40
-    # both the small seq groups have a new token appended
-    append_new_token(short_seqs[0], 1)
-    append_new_token(short_seqs[1], 1)
-    # in the fourth iteration, both small requests are decoding
-    # so large request gets all the budget
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    # large req gets 62 tokens (minus 2 for decode)
-    assert seq_group_meta[0].token_chunk_size == 62
-    assert seq_group_meta[1].token_chunk_size == 1  # decode
-    assert seq_group_meta[2].token_chunk_size == 1  # decode
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 64
-    assert long_seqs[0].first_seq.get_num_computed_tokens() == 158
-    # assert long_seqs[0].is_prefill()
-    # assert long_seqs[1].is_prefill()
-    # assert not short_seqs[0].is_prefill()
-    # assert not short_seqs[1].is_prefill()
-    # # both the small seq groups have a new token appended
-    # append_new_token(short_seqs[0], 1)
-    # append_new_token(short_seqs[1], 1)
-    # # in the fifth iteration, large request gets all the budget
-    # # while both small requests are decoding
-    # seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    # assert seq_group_meta[0].token_chunk_size == 62
-    # assert seq_group_meta[1].token_chunk_size == 1  # decode
-    # assert seq_group_meta[2].token_chunk_size == 1  # decode
-    # assert out.num_prefill_groups == 1
-    # assert out.num_batched_tokens == 64
-def test_complex():
-    block_size = 4
-    max_seqs = 60
-    max_model_len = 80
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 64
-    cache_config.num_gpu_blocks = 64
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-        assert seq_group.is_prefill()
-    # Verify the second request is chunked.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert seq_group_meta[0].token_chunk_size == 60
-    # Verify it is chunked.
-    assert seq_group_meta[1].token_chunk_size == 4
-    assert not running[0].is_prefill()
-    assert running[1].is_prefill()
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 64
-    # Only the first seq group has a new token appended.
-    append_new_token(running[0], 1)
-    # Add 2 more requests.
-    for i in range(2, 4):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-    # Decoding & chunked prefill & first chunk of 3rd request is scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 3
-    # The first one is the first chunked prefill.
-    assert seq_group_meta[0].token_chunk_size == 7
-    # The second one is the second new chunked prefill.
-    assert seq_group_meta[1].token_chunk_size == 56
-    # The last one is decode.
-    assert seq_group_meta[2].token_chunk_size == 1
-    # Two of them are in chunked prefill.
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 64
-    # The first 2 requests are now in decodine phase.
-    append_new_token(running[0], 1)
-    assert not running[0].is_prefill()
-    append_new_token(running[1], 1)
-    assert not running[1].is_prefill()
-    # The third request is still in prefill stage.
-    assert running[2].is_prefill()
-def test_maximal_decoding():
-    """Verify decoding requests are prioritized."""
-    block_size = 4
-    max_seqs = 2
-    max_model_len = 8
-    max_num_batched_tokens = 2
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=2,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-        assert seq_group.is_prefill()
-    # The first prefill is scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 1
-    assert seq_group_meta[0].token_chunk_size == 2
-    assert not running[0].is_prefill()
-    assert running[1].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 2
-    # Only the first seq group has a new token appended.
-    append_new_token(running[0], 1)
-    # Create one more seq_group.
-    _, seq_group = create_dummy_prompt("3",
-                                       prompt_length=2,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    running.append(seq_group)
-    assert seq_group.is_prefill()
-    # The first decoding + second chunk is scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 2
-    assert seq_group_meta[0].token_chunk_size == 1
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert not running[0].is_prefill()
-    assert running[1].is_prefill()
-    assert running[2].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 2
-    append_new_token(running[0], 1)
-    # Decoding + running prefill is prioritized.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 2
-    assert seq_group_meta[0].token_chunk_size == 1
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert not running[0].is_prefill()
-    assert not running[1].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 2
-    append_new_token(running[0], 1)
-    append_new_token(running[1], 1)
-    # Only decoding is prioritized.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 2
-    assert seq_group_meta[0].token_chunk_size == 1
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert not running[0].is_prefill()
-    assert not running[1].is_prefill()
-    assert out.num_prefill_groups == 0
-    assert out.num_batched_tokens == 2
-    append_new_token(running[0], 1)
-    append_new_token(running[1], 1)
-    # After aborting the decoding request, the fcfs new prefill is prioritized.
-    scheduler.abort_seq_group(running[0].request_id)
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 2
-    assert seq_group_meta[0].token_chunk_size == 1
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert not running[1].is_prefill()
-    assert running[2].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 2
-def test_prompt_limit():
-    """Verify max_num_batched_tokens < max_model_len is possible."""
-    block_size = 4
-    max_seqs = 32
-    max_model_len = 64
-    max_num_batched_tokens = 32
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=48,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    running.append(seq_group)
-    assert seq_group.is_prefill()
-    # The prompt length > max_num_batched_tokens should be still scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 1
-    assert seq_group_meta[0].token_chunk_size == 32
-    assert running[0].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 32
-def test_prompt_limit_exceed():
-    block_size = 4
-    max_seqs = 64
-    max_model_len = 32
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig("generate",
-                                       max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    _, seq_group = create_dummy_prompt("2",
-                                       prompt_length=48,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    running.append(seq_group)
-    assert seq_group.is_prefill()
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.ignored_seq_groups) == 1
-    assert out.ignored_seq_groups[0] == seq_group
-def test_chunked_prefill_preempt():
-    """Verify preempt works with chunked prefill requests"""
-    block_size = 4
-    max_seqs = 30
-    max_model_len = 200
-    max_num_batched_tokens = 30
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=60,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    # The request is chunked.
-    # prefill scheduled now.
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert seq_group.is_prefill()
-    assert out.num_batched_tokens == max_num_batched_tokens
-    # The request should be preempted.
-    scheduler.block_manager.can_append_slots = MagicMock()
-    def cannot_append_second_group1(seq_group, num_lookahead_slots):
-        return seq_group.request_id != "1"
-    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group1)
-    # The running prefill is now preempted.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 0
-    assert out.num_batched_tokens == 0
-    assert out.blocks_to_swap_out == []
-    assert out.blocks_to_swap_in == []
-    # Make sure we can reschedule preempted request.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert seq_group.is_prefill()
-    assert out.num_batched_tokens == max_num_batched_tokens
-    assert seq_group.get_num_uncomputed_tokens() == 30
-    # We should be able to run prefill twice as it is chunked.
-    def cannot_append_second_group2(seq_group, num_lookahead_slots):
-        return True
-    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group2)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert not seq_group.is_prefill()
-    assert out.num_batched_tokens == max_num_batched_tokens
-def test_chunked_prefill_spec_prefill():
-    """Verify that the num_lookahead_slots is set appropriately for an all"""
-    """prefill batch."""
-    block_size = 4
-    max_seqs = 30
-    max_model_len = 200
-    max_num_batched_tokens = 30
-    num_lookahead_slots = 4
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-        num_lookahead_slots=num_lookahead_slots,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=30,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    # The request is chunked.
-    # prefill scheduled now.
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == max_num_batched_tokens
-    print(out.num_lookahead_slots)
-    assert out.num_lookahead_slots == 0
-def test_chunked_prefill_max_seqs():
-    block_size = 4
-    max_seqs = 2
-    max_model_len = 80
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 128
-    cache_config.num_gpu_blocks = 128
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=65,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    running.append(seq_group)
-    # The first prefill is chunked.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert seq_group_meta[0].token_chunk_size == max_num_batched_tokens
-    assert len(get_sequence_groups(out)) == 1
-    # Add new requests.
-    for i in range(4):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=65,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-    # Make sure only 2 requests are scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert out.num_batched_tokens == max_num_batched_tokens
-    assert len(get_sequence_groups(out)) == 2
-    assert not running[0].is_prefill()
-    assert running[1].is_prefill()
-    append_new_token(running[0], 1)
-    # Although we have enough token budget, we can only schedule max_seqs.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert seq_group_meta[0].token_chunk_size == 2
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert out.num_batched_tokens == 3
-    assert len(get_sequence_groups(out)) == max_seqs
-    assert not running[0].is_prefill()
-    assert not running[1].is_prefill()
-def test_prefix_caching():
-    """Verify allocating full blocks when prefix caching is enabled."""
-    block_size = 4
-    max_seqs = 10
-    max_model_len = 80
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size,
-                               1.0,
-                               1,
-                               "auto",
-                               enable_prefix_caching=True)
-    cache_config.num_cpu_blocks = 0
-    cache_config.num_gpu_blocks = 32
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           block_size=block_size,
-                                           prompt_length=50)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert seq_group_meta[0].token_chunk_size == 50
-    # Verify it is chunked. Note that although the budget is 64-50=14,
-    # we only allocate full blocks for prefix caching, so only 4*(14//4)=12
-    # tokens are allocated.
-    assert seq_group_meta[1].token_chunk_size == 12
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 62
-def test_prefix_caching_with_concurrent_partial_prefills():
-    """Verify allocating full blocks when prefix caching is enabled with 
-    --max-num-partial-prefills > 1."""
-    block_size = 4
-    max_seqs = 10
-    max_model_len = 8000
-    max_num_batched_tokens = 60  # With two slots, each slot will get 30 tokens
-    scheduler_config = SchedulerConfig("generate",
-                                       max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True,
-                                       max_num_partial_prefills=2)
-    cache_config = CacheConfig(block_size,
-                               1.0,
-                               1,
-                               "auto",
-                               enable_prefix_caching=True)
-    cache_config.num_cpu_blocks = 0
-    cache_config.num_gpu_blocks = 32
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           block_size=block_size,
-                                           prompt_length=50)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    # To partially prefill both sequences, both can chunk up to 30 tokens
-    # But the next lowest multiple of the block size (4) is 28
-    assert seq_group_meta[0].token_chunk_size == 28
-    assert seq_group_meta[1].token_chunk_size == 28
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 56
-    # On the next iteration, both sequences should finish prefill
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    # Both sequences have 50 - 28 = 22 tokens left to prefill.
-    # This is not a multiple of the block size, but we don't care since we don't
-    # cache the final partial block of prefix sequences
-    assert seq_group_meta[0].token_chunk_size == 22
-    assert seq_group_meta[1].token_chunk_size == 22
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 44
-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
-@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
-def test_chunked_prefill_with_actual_engine(model: str,
-                                            max_num_partial_prefills: int):
-    """Make sure the model can actually sample with concurrent 
-    partial prefills
-    """
-    prompt = "hello" * 40
-    engine_args = EngineArgs(
-        model=model,
-        max_num_partial_prefills=max_num_partial_prefills,
-        max_num_batched_tokens=40,
-        max_num_seqs=8,
-        enable_chunked_prefill=True,
-        gpu_memory_utilization=0.8,
-    )
-    engine = LLMEngine.from_engine_args(engine_args)
-    sampling_params = SamplingParams(temperature=0)
-    for req_num in range(max_num_partial_prefills):
-        engine.add_request(f"{req_num}", prompt, sampling_params)
-    # first step
-    request_outputs = engine.step()
-    # means all are prefilling
-    assert len(request_outputs) == 0
-    assert len(engine.scheduler[0].running) == max_num_partial_prefills
--- a/tests/core/test_num_computed_tokens_update.py
+++ b/tests/core/test_num_computed_tokens_update.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from tests.conftest import VllmRunner
-from tests.core.utils import create_dummy_prompt
-from vllm.engine.llm_engine import LLMEngine
-from vllm.sequence import SequenceGroup
-MODEL = "JackFram/llama-160m"
-def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
-    scheduler = engine.scheduler[0]
-    scheduler.add_seq_group(seq_group)
-@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
-@pytest.mark.parametrize("enforce_eager", [False, True])
-def test_num_computed_tokens_update(enable_chunked_prefill: bool,
-                                    enforce_eager: bool):
-    # Make a vllm engine
-    runner = VllmRunner(model_name=MODEL,
-                        gpu_memory_utilization=0.7,
-                        enable_chunked_prefill=enable_chunked_prefill,
-                        enforce_eager=enforce_eager)
-    engine: LLMEngine = runner.llm.llm_engine
-    num_prompt_steps = 1
-    num_output_tokens_list = [4, 8, 12, 15, 16, 17]
-    # Create sequence and add to engine
-    prompt_len = 10
-    for req_idx, num_output_tokens in enumerate(num_output_tokens_list):
-        seq, seq_group = create_dummy_prompt(request_id=str(req_idx),
-                                             prompt_length=prompt_len,
-                                             min_tokens=num_output_tokens,
-                                             max_tokens=num_output_tokens)
-        add_seq_group_to_engine(engine, seq_group)
-        assert seq.data.get_num_computed_tokens() == 0
-        for _ in range(num_prompt_steps):
-            # prompt steps
-            engine.step()
-        if not seq.is_finished():
-            prompt_num_computed_tokens = seq.data.get_num_computed_tokens()
-            # Test correctness of num_computed_tokens after the prompt steps
-            assert prompt_num_computed_tokens == \
-                        prompt_len + num_prompt_steps - 1
-            decode_step_counter = 0
-            while not seq.is_finished():
-                # Test correctness of num_computed_tokens after the decode steps
-                assert seq.data.get_num_computed_tokens(
-                ) == prompt_num_computed_tokens + decode_step_counter
-                engine.step()
-                decode_step_counter += 1
-        # Test correctness of num_computed_tokens after the sequence finish.
-        assert seq.data.get_num_computed_tokens(
-        ) == prompt_len + num_output_tokens - 1
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
-from collections import deque
-from typing import Optional
-from unittest.mock import MagicMock
-import pytest  # noqa
-import torch
-from torch import Use  # noqa
-from vllm.config import CacheConfig, SchedulerConfig
-from vllm.config.lora import LoRAConfig
-from vllm.core.interfaces import AllocStatus
-from vllm.core.scheduler import Scheduler, SchedulingBudget
-from vllm.lora.request import LoRARequest
-from vllm.sequence import SequenceGroup, SequenceStatus
-from .utils import (append_new_token, append_new_token_seq,
-                    append_new_token_seq_group, create_dummy_prompt,
-                    get_sequence_groups, schedule_and_update_computed_tokens)
-def test_scheduler_add_seq_group():
-    block_size = 4
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=100,
-        max_num_seqs=64,
-        max_model_len=1,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
-    cache_config.num_cpu_blocks = 4
-    cache_config.num_gpu_blocks = 4
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    # Add seq group to scheduler.
-    num_seq_group = 4
-    for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           block_size,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        assert scheduler.get_num_unfinished_seq_groups() == i + 1
-def test_scheduler_abort_seq_group():
-    block_size = 4
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=100,
-        max_num_seqs=64,
-        max_model_len=1,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 4
-    cache_config.num_gpu_blocks = 4
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    # Add multiple seq groups to scheduler.
-    num_seq_group = 4
-    request_ids: set[str] = set()
-    for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i), block_size)
-        scheduler.add_seq_group(seq_group)
-        request_ids.add(str(i))
-    # Abort all added seq groups.
-    assert scheduler.get_num_unfinished_seq_groups() == num_seq_group
-    scheduler.abort_seq_group(request_ids)
-    assert scheduler.get_num_unfinished_seq_groups() == 0
-def test_scheduler_schedule_simple():
-    """Schedule four prompts in one step, then one decode token for each."""
-    block_size = 4
-    num_seq_group = 4
-    max_model_len = 16
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=64,
-        max_num_seqs=num_seq_group,
-        max_model_len=max_model_len,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    # Queue one block-sized prompt per seq group.
-    running: list[SequenceGroup] = []
-    for idx in range(num_seq_group):
-        _, group = create_dummy_prompt(str(idx),
-                                       prompt_length=block_size,
-                                       block_size=block_size)
-        scheduler.add_seq_group(group)
-        running.append(group)
-    # Prefill step: every prompt token of every group is batched.
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert out.num_batched_tokens == block_size * num_seq_group
-    assert not out.blocks_to_copy
-    assert not out.blocks_to_swap_in
-    assert not out.blocks_to_swap_out
-    assert len(metas) == num_seq_group
-    append_new_token(out, 1)
-    # Decode step: exactly one token per group.
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert out.num_batched_tokens == num_seq_group
-    assert not out.blocks_to_copy
-    assert not out.blocks_to_swap_in
-    assert not out.blocks_to_swap_out
-    assert len(metas) == num_seq_group
-    append_new_token(out, 1)
-def test_scheduler_prefill_prioritized():
-    """Verify running batched tokens are not applied to prefill requests."""
-    block_size = 4
-    max_model_len = 30
-    max_batched_num_tokens = 30
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=max_batched_num_tokens,
-        max_num_seqs=2,
-        max_model_len=max_model_len,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    # Add request A (a 1-token prompt) to the scheduler.
-    _, seq_group_a = create_dummy_prompt("1", 1, block_size=block_size)
-    scheduler.add_seq_group(seq_group_a)
-    # Schedule A's prompt.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_a]
-    # Add a new prefill request B whose 30-token prompt equals the whole
-    # max_batched_num_tokens budget.
-    _, seq_group_b = create_dummy_prompt("2", 30, block_size=block_size)
-    scheduler.add_seq_group(seq_group_b)
-    # Verify prefill requests are prioritized: B's prompt needs the full
-    # 30-token budget, so this step must schedule B and nothing else.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_b]
-def test_scheduler_schedule_preempt_abort():
-    """Prefill two requests, see one preempted on decode, then abort the other.
-    Only two GPU blocks are configured and each prompt is exactly one block
-    long, so both prompts can be prefilled together but the decode step is
-    expected to preempt request "2".
-    """
-    block_size = 4
-    max_model_len = 16
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=64,
-        max_num_seqs=2,
-        max_model_len=max_model_len,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 2
-    cache_config.num_gpu_blocks = 2
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    # Add seq groups to scheduler.
-    seq_a, seq_group_a = create_dummy_prompt("1",
-                                             block_size,
-                                             block_size=block_size)
-    seq_b, seq_group_b = create_dummy_prompt("2",
-                                             block_size,
-                                             block_size=block_size)
-    scheduler.add_seq_group(seq_group_a)
-    scheduler.add_seq_group(seq_group_b)
-    # Schedule seq groups prompts.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_a, seq_group_b]
-    assert out.num_batched_tokens == block_size * 2  # seq_a and seq_b
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == 2
-    assert scheduler.get_num_unfinished_seq_groups() == 2
-    # Append "generated" tokens, allowing the sequence to mark prompt tokens as
-    # processed.
-    append_new_token(out, 1)
-    # Schedule seq groups generation and preempt seq group b.
-    # NOTE(review): presumably each 5-token sequence now needs a second block
-    # and only two GPU blocks exist in total - confirm against the block
-    # manager.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_a]
-    assert out.num_batched_tokens == 1
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == 1
-    # The preempted group still counts as unfinished.
-    assert scheduler.get_num_unfinished_seq_groups() == 2
-    assert out.preempted == 1
-    # Abort seq group a. Re-schedule seq group b prompt with recomputation.
-    scheduler.abort_seq_group("1")
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_b]
-    assert out.num_batched_tokens == 5  # 4 prompt + 1 generation.
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == 1
-    assert scheduler.get_num_unfinished_seq_groups() == 1
-def test_scheduler_max_seqs():
-    """max_num_seqs caps how many seq groups can be scheduled together."""
-    block_size = 4
-    num_seq_group = 4
-    max_seq_group = 2
-    max_model_len = 16
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=64,
-        max_num_seqs=max_seq_group,
-        max_model_len=max_model_len,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    # Build all seq groups up front; they are queued in stages below.
-    all_seq_groups: list[SequenceGroup] = [
-        create_dummy_prompt(str(idx),
-                            prompt_length=block_size,
-                            block_size=block_size)[1]
-        for idx in range(num_seq_group)
-    ]
-    first, second, third = all_seq_groups[:3]
-    # Stage 1: a single request goes through prefill and one decode step.
-    scheduler.add_seq_group(first)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == {first}
-    append_new_token(out, 1)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == {first}
-    append_new_token(out, 1)
-    # Stage 2: two more requests arrive while the first is still running.
-    scheduler.add_seq_group(second)
-    scheduler.add_seq_group(third)
-    # With max_num_seqs == 2 and one group already running, only one of the
-    # new prompts may be scheduled.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == {second}
-def test_scheduler_delay_factor():
-    """With delay_factor=0.5, a new prompt is held back before being prefilled.
-    The second prompt is not scheduled on the first attempt after it arrives,
-    but is scheduled after a further 0.6 second wait.
-    """
-    block_size = 4
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=100,
-        max_num_seqs=64,
-        max_model_len=16,
-        delay_factor=0.5,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    # The first prompt is prefilled immediately.
-    _, first_group = create_dummy_prompt("0",
-                                         prompt_length=block_size,
-                                         block_size=block_size)
-    scheduler.add_seq_group(first_group)
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert out.num_prefill_groups > 0
-    assert metas[0].request_id == '0'
-    append_new_token(out, 1)
-    # Let a second pass before the next prompt arrives.
-    time.sleep(1)
-    _, second_group = create_dummy_prompt("1",
-                                          prompt_length=block_size,
-                                          block_size=block_size)
-    scheduler.add_seq_group(second_group)
-    # The new prompt must *not* be prefilled yet; request "0" keeps decoding.
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert out.num_prefill_groups == 0
-    assert metas[0].request_id == '0'
-    append_new_token(out, 1)
-    # After waiting longer than 0.5 second the second prompt is prefilled.
-    time.sleep(0.6)
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert out.num_prefill_groups > 0
-    assert metas[0].request_id == '1'
-    append_new_token(out, 1)
-def initialize_scheduler(
-    *,
-    max_num_seqs=1000,
-    max_token_budget=1000,
-    max_model_len=1000,
-    lora_config=None,
-    block_size=4,
-    num_cpu_blocks=8,
-    num_gpu_blocks=8,
-    enable_prefix_caching=False,
-    enable_chunked_prefill=False,
-):
-    """Build a Scheduler for tests from keyword-only sizing options.
-    ``max_token_budget`` is forwarded to SchedulerConfig as
-    ``max_num_batched_tokens``. ``num_cpu_blocks`` and ``num_gpu_blocks`` are
-    assigned on the CacheConfig after it is constructed. ``lora_config`` is
-    passed to Scheduler unchanged.
-    """
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=max_token_budget,
-        max_num_seqs=max_num_seqs,
-        max_model_len=max_model_len,
-        enable_chunked_prefill=enable_chunked_prefill,
-    )
-    cache_config = CacheConfig(
-        block_size,
-        1.0,
-        1,
-        "auto",
-        enable_prefix_caching=enable_prefix_caching,
-    )
-    # The block counts are set as attributes rather than constructor args.
-    cache_config.num_cpu_blocks = num_cpu_blocks
-    cache_config.num_gpu_blocks = num_gpu_blocks
-    return Scheduler(scheduler_config, cache_config, lora_config)
-def create_token_budget(token_budget: int = 10000,
-                        max_num_seqs: int = 10000) -> SchedulingBudget:
-    """Return a new SchedulingBudget with the given token and sequence limits."""
-    limits = {"token_budget": token_budget, "max_num_seqs": max_num_seqs}
-    return SchedulingBudget(**limits)
-def add_token_budget(budget: SchedulingBudget,
-                     num_batched_tokens: int = 0,
-                     num_curr_seqs: int = 0):
-    """Charge tokens and sequences to ``budget`` under a placeholder request.
-    The charge is attributed to the request id of a throwaway dummy prompt.
-    """
-    _, placeholder_group = create_dummy_prompt('10', prompt_length=60)
-    request_id = placeholder_group.request_id
-    budget.add_num_batched_tokens(request_id, num_batched_tokens)
-    budget.add_num_seqs(request_id, num_curr_seqs)
-def test_prefill_schedule_max_prompt_len():
-    """
-    A prompt longer than max_model_len is ignored instead of being scheduled.
-    """
-    block_size = 4
-    scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
-    # 60 prompt tokens against a max_model_len of 30.
-    _, oversized_group = create_dummy_prompt("0",
-                                             prompt_length=60,
-                                             block_size=block_size)
-    scheduler.add_seq_group(oversized_group)
-    budget = create_token_budget()
-    output = scheduler._schedule_prefills(budget, None)
-    # The request is reported as ignored and removed from the waiting queue.
-    assert len(output.ignored_seq_groups) == 1
-    assert len(output.seq_groups) == 0
-    assert len(scheduler.waiting) == 0
-    # Nothing was charged to the budget.
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 0
-def test_prefill_schedule_token_budget():
-    """
-    Test that _schedule_prefills respects the token budget, including tokens
-    already charged to the budget before scheduling starts.
-    """
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=64,
-                                     num_gpu_blocks=64)
-    budget = create_token_budget(token_budget=0)
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-    # 0 token budget == nothing is scheduled.
-    output = scheduler._schedule_prefills(budget, None)
-    remaining_waiting = scheduler.waiting
-    assert len(output.ignored_seq_groups) == 0
-    assert len(output.seq_groups) == 0
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 0
-    assert len(remaining_waiting) == 2
-    # 60 token budget == 1 request scheduled.
-    budget = create_token_budget(token_budget=60)
-    output = scheduler._schedule_prefills(budget, None)
-    remaining_waiting = scheduler.waiting
-    assert len(output.ignored_seq_groups) == 0
-    assert len(output.seq_groups) == 1
-    assert budget.num_batched_tokens == 60
-    assert budget.num_curr_seqs == 1
-    assert len(remaining_waiting) == 1
-    # Test that tokens already charged to the budget are respected.
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=16,
-                                     num_gpu_blocks=16)
-    budget = create_token_budget(token_budget=60)
-    add_token_budget(budget, 30, 0)
-    # Use an explicit request id rather than the loop variable left over from
-    # the loop above ("1" is the value str(i) had at this point).
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=60,
-                                       block_size=block_size)
-    # Cannot schedule a prompt that doesn't fit the budget: 30 tokens are
-    # already used, so a 60-token prompt exceeds the 60-token limit.
-    scheduler.add_seq_group(seq_group)
-    output = scheduler._schedule_prefills(budget, None)
-    remaining_waiting = scheduler.waiting
-    assert len(output.ignored_seq_groups) == 0
-    assert len(output.seq_groups) == 0
-    assert budget.num_batched_tokens == 30
-    assert budget.num_curr_seqs == 0
-    assert len(remaining_waiting) == 1
-    # With a 90-token limit the same prompt fits next to the 30 used tokens.
-    budget = create_token_budget(token_budget=90)
-    add_token_budget(budget, 30, 0)
-    output = scheduler._schedule_prefills(budget, None)
-    remaining_waiting = scheduler.waiting
-    assert len(output.seq_groups) == 1
-    assert budget.num_batched_tokens == 90
-    assert budget.num_curr_seqs == 1
-    assert len(remaining_waiting) == 0
-def test_prefill_schedule_max_seqs():
-    """
-    Test that _schedule_prefills respects max_num_seqs, including sequences
-    already charged to the budget before scheduling starts.
-    """
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=64,
-                                     num_gpu_blocks=64)
-    budget = create_token_budget(max_num_seqs=2)
-    for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-    # Only two of the three waiting requests fit under max_num_seqs=2.
-    output = scheduler._schedule_prefills(budget, None)
-    remaining_waiting = scheduler.waiting
-    assert len(output.ignored_seq_groups) == 0
-    assert len(output.seq_groups) == 2
-    assert budget.num_batched_tokens == 120
-    assert budget.num_curr_seqs == 2
-    assert len(remaining_waiting) == 1
-    # Verify curr_num_seqs respected: start from an empty waiting queue and a
-    # budget whose two sequence slots are already taken.
-    scheduler.waiting = deque()
-    budget = create_token_budget(max_num_seqs=2)
-    add_token_budget(budget, 0, 2)
-    # Use an explicit request id rather than the loop variable left over from
-    # the loop above ("2" is the value str(i) had at this point).
-    _, seq_group = create_dummy_prompt("2",
-                                       prompt_length=60,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    output = scheduler._schedule_prefills(budget, None)
-    remaining_waiting = scheduler.waiting
-    assert len(output.ignored_seq_groups) == 0
-    assert len(output.seq_groups) == 0
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 2
-    assert len(remaining_waiting) == 1
-def test_prefill_schedule_max_lora():
-    """
-    Test max lora is respected and prioritized.
-    """
-    block_size = 4
-    # max_loras=1: at most one distinct LoRA may be active at a time.
-    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
-    scheduler = initialize_scheduler(lora_config=lora_config,
-                                     block_size=block_size,
-                                     num_cpu_blocks=64,
-                                     num_gpu_blocks=64)
-    # The 120-token budget has room for exactly two 60-token prompts.
-    budget = create_token_budget(token_budget=120)
-    curr_loras: set[int] = set()
-    # Requests "0" and "1" each use their own LoRA (ids 1 and 2).
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size,
-                                           lora_request=LoRARequest(
-                                               lora_name=str(i),
-                                               lora_int_id=i + 1,
-                                               lora_path="abc"))
-        scheduler.add_seq_group(seq_group)
-    # Add two more requests to verify lora is prioritized.
-    # 0: LoRA, 1: LoRA, 2: regular, 3: regular
-    # In the first iteration, index 0, 2 is scheduled.
-    # If a request is not scheduled because it hits max lora, it is
-    # prioritized. Verify that.
-    for i in range(2, 4):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-    # Schedule 2 requests (0 and 2)
-    output = scheduler._schedule_prefills(budget, curr_loras)
-    remaining_waiting = scheduler.waiting
-    assert len(output.ignored_seq_groups) == 0
-    assert len(output.seq_groups) == 2
-    assert budget.num_batched_tokens == 120
-    assert budget.num_curr_seqs == 2
-    assert len(remaining_waiting) == 2
-    assert len(curr_loras) == 1
-    # The second lora request is scheduled next as FCFS policy.
-    # Reset curr_loras so that it can be scheduled.
-    curr_loras = set()
-    budget = create_token_budget(token_budget=60)
-    output = scheduler._schedule_prefills(budget, curr_loras)
-    remaining_waiting = scheduler.waiting
-    assert len(output.seq_groups) == 1
-    assert output.seq_groups[0].seq_group.request_id == "1"
-    assert len(remaining_waiting) == 1
-    assert len(curr_loras) == 1
-    assert budget.num_batched_tokens == 60
-def test_prefill_schedule_no_block_manager_capacity():
-    """
-    Prefills are not scheduled when the block manager refuses to allocate:
-    LATER keeps the requests waiting, NEVER moves them to the ignored list.
-    """
-    block_size = 4
-    def schedule_with_alloc_status(scheduler, alloc_status):
-        # Queue three 60-token prompts, force can_allocate to answer with
-        # alloc_status, then run one prefill scheduling pass.
-        budget = create_token_budget()
-        for idx in range(3):
-            _, group = create_dummy_prompt(str(idx),
-                                           prompt_length=60,
-                                           block_size=block_size)
-            scheduler.add_seq_group(group)
-        scheduler.block_manager.can_allocate = MagicMock(
-            return_value=alloc_status)
-        return scheduler._schedule_prefills(budget, None), budget
-    # LATER: nothing is scheduled or ignored; all three requests keep waiting.
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_gpu_blocks=128,
-                                     num_cpu_blocks=128)
-    output, budget = schedule_with_alloc_status(scheduler, AllocStatus.LATER)
-    assert len(output.ignored_seq_groups) == 0
-    assert len(output.seq_groups) == 0
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 0
-    assert len(scheduler.waiting) == 3
-    # NEVER: all three requests are ignored and leave the waiting queue.
-    scheduler = initialize_scheduler()
-    output, budget = schedule_with_alloc_status(scheduler, AllocStatus.NEVER)
-    assert len(output.ignored_seq_groups) == 3
-    assert len(output.seq_groups) == 0
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 0
-    assert len(scheduler.waiting) == 0
-def test_decode_schedule_preempted():
-    """
-    Test decodes cannot be scheduled and preempted.
-    """
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=64,
-                                     num_gpu_blocks=64)
-    curr_loras = None
-    # Put three requests ("0", "1", "2") directly into the running queue, each
-    # with one generated token appended after its 60-token prompt.
-    for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler._allocate_and_set_running(seq_group)
-        append_new_token_seq_group(60, seq_group, 1)
-        scheduler._add_seq_group_to_running(seq_group)
-    scheduler.block_manager.can_append_slots = MagicMock()
-    # Mocked can_append_slots: refuses only request "1".
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
-        return seq_group.request_id != "1"
-    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
-    # 1 cannot be scheduled, and the lowest priority (request 2)
-    # should be preempted. 1 will also be preempted.
-    budget = create_token_budget()
-    output = scheduler._schedule_running(budget, curr_loras)
-    remaining_running = scheduler.running
-    # Only request "0" is scheduled; the running queue is drained.
-    assert len(remaining_running) == 0
-    assert len(output.decode_seq_groups) == 1
-    assert len(output.prefill_seq_groups) == 0
-    assert output.decode_seq_groups[0].seq_group.request_id == "0"
-    assert len(output.preempted) == 2
-    # Verify budgets are updated.
-    assert budget.num_batched_tokens == 1
-    # NOTE: When enable_chunk is False, num_seqs budget is not updated.
-    # assert budget.num_curr_seqs == 1
-    # Both should be preempted, not swapped.
-    assert output.blocks_to_swap_out == []
-    # Nothing is copied.
-    assert output.blocks_to_copy == []
-def test_schedule_decode_blocks_to_copy_update():
-    """
-    Verify that block copies reported by append_slots end up in
-    blocks_to_copy of the _schedule_running output.
-    """
-    block_size = 4
-    # Pass the local block_size so the scheduler and the prompt cannot drift
-    # apart if the value is changed.
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=16,
-                                     num_gpu_blocks=16)
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=60,
-                                       block_size=block_size)
-    curr_loras = None
-    # Put the request into the running queue with one generated token.
-    scheduler._allocate_and_set_running(seq_group)
-    append_new_token_seq_group(60, seq_group, 1)
-    scheduler._add_seq_group_to_running(seq_group)
-    # Mock append_slots so that it reports a single block copy.
-    scheduler.block_manager.append_slots = MagicMock()
-    scheduler.block_manager.append_slots.return_value = [(2, 3)]
-    budget = create_token_budget()
-    output = scheduler._schedule_running(budget, curr_loras)
-    remaining_running = scheduler.running
-    assert len(remaining_running) == 0
-    assert len(output.decode_seq_groups) == 1
-    assert len(output.prefill_seq_groups) == 0
-    # Nothing is preempted or swapped out.
-    assert len(output.preempted) == 0
-    assert len(output.swapped_out) == 0
-    assert output.blocks_to_swap_out == []
-    # The pair returned by append_slots must be passed through unchanged.
-    assert output.blocks_to_copy == [(2, 3)]
-def test_schedule_swapped_max_loras():
-    """With max_loras=1, only one of two swapped LoRA requests is swapped in."""
-    block_size = 4
-    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
-    scheduler = initialize_scheduler(lora_config=lora_config,
-                                     block_size=block_size,
-                                     num_cpu_blocks=32,
-                                     num_gpu_blocks=32)
-    curr_loras: set[int] = set()
-    blocks_to_swap_out: list[tuple[int, int]] = []
-    # Create two requests with distinct LoRAs and move both to the swapped
-    # queue.
-    for idx in range(2):
-        lora_request = LoRARequest(lora_name=str(idx),
-                                   lora_int_id=idx + 1,
-                                   lora_path="abc")
-        _, group = create_dummy_prompt(str(idx),
-                                       prompt_length=60,
-                                       block_size=block_size,
-                                       lora_request=lora_request)
-        scheduler._allocate_and_set_running(group)
-        append_new_token_seq_group(60, group, 1)
-        scheduler._swap_out(group, blocks_to_swap_out)
-        scheduler._add_seq_group_to_swapped(group)
-    budget = create_token_budget()
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    # One request is swapped in as a decode; the other stays swapped.
-    assert len(scheduler.swapped) == 1
-    assert budget.num_batched_tokens == 1
-    assert budget.num_curr_seqs == 1
-    assert len(output.decode_seq_groups) == 1
-    assert len(output.prefill_seq_groups) == 0
-    assert len(curr_loras) == 1
-def test_schedule_swapped_cannot_swap_in():
-    """Swapped requests stay swapped while can_swap_in reports LATER."""
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=32,
-                                     num_gpu_blocks=32)
-    curr_loras = None
-    blocks_to_swap_out: list[tuple[int, int]] = []
-    # Create two requests and move both to the swapped queue.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler._allocate_and_set_running(seq_group)
-        append_new_token_seq_group(60, seq_group, 1)
-        scheduler._swap_out(seq_group, blocks_to_swap_out)
-        scheduler._add_seq_group_to_swapped(seq_group)
-    # Make the block manager answer AllocStatus.LATER to every swap-in check.
-    scheduler.block_manager.can_swap_in = MagicMock()
-    scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
-    # Since we cannot swap in, none of the requests are swapped in.
-    budget = create_token_budget()
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 2
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 0
-    assert len(output.decode_seq_groups) == 0
-    assert len(output.prefill_seq_groups) == 0
-def test_infeasible_swap():
-    """Swapped requests are reported infeasible when can_swap_in says NEVER."""
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=32,
-                                     num_gpu_blocks=32)
-    curr_loras = None
-    blocks_to_swap_out: list[tuple[int, int]] = []
-    # Create two requests and move both to the swapped queue.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler._allocate_and_set_running(seq_group)
-        append_new_token_seq_group(60, seq_group, 1)
-        scheduler._swap_out(seq_group, blocks_to_swap_out)
-        scheduler._add_seq_group_to_swapped(seq_group)
-    # Make the block manager answer AllocStatus.NEVER to every swap-in check.
-    scheduler.block_manager.can_swap_in = MagicMock()
-    scheduler.block_manager.can_swap_in.return_value = AllocStatus.NEVER
-    # With NEVER, both requests leave the swapped queue and are reported as
-    # infeasible; nothing is scheduled or charged to the budget.
-    budget = create_token_budget()
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 0
-    assert len(output.infeasible_seq_groups) == 2
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 0
-    assert len(output.decode_seq_groups) == 0
-    assert len(output.prefill_seq_groups) == 0
-def test_schedule_swapped_blocks_to_copy():
-    """Block copies from append_slots are reported when a request swaps in."""
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=32,
-                                     num_gpu_blocks=32)
-    curr_loras = None
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=60,
-                                       block_size=block_size)
-    # Run the request once, then move it to the swapped queue.
-    scheduler._allocate_and_set_running(seq_group)
-    append_new_token_seq_group(60, seq_group, 1)
-    blocks_to_swap_out: list[tuple[int, int]] = []
-    scheduler._swap_out(seq_group, blocks_to_swap_out)
-    scheduler._add_seq_group_to_swapped(seq_group)
-    # Mock append_slots so that it reports the single pair (2, 3).
-    scheduler.block_manager.append_slots = MagicMock()
-    scheduler.block_manager.append_slots.return_value = [(2, 3)]
-    budget = create_token_budget()
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    # The request is swapped back in as a decode and the pair is passed on.
-    assert len(remaining_swapped) == 0
-    assert len(output.decode_seq_groups) == 1
-    assert len(output.prefill_seq_groups) == 0
-    assert output.blocks_to_copy == [(2, 3)]
-def test_scheduling_budget():
-    """Exercise SchedulingBudget limits and its per-request accounting."""
-    TOKEN_BUDGET = 4
-    MAX_SEQS = 4
-    budget = SchedulingBudget(token_budget=TOKEN_BUDGET, max_num_seqs=MAX_SEQS)
-    # (num_new_tokens, num_new_seqs, fits?) against an empty budget.
-    empty_budget_cases = [
-        (1, 1, True),
-        (4, 4, True),
-        (1, 5, False),
-        (5, 1, False),
-        (5, 5, False),
-    ]
-    for new_tokens, new_seqs, fits in empty_budget_cases:
-        verdict = budget.can_schedule(num_new_tokens=new_tokens,
-                                      num_new_seqs=new_seqs)
-        assert bool(verdict) == fits
-    assert budget.remaining_token_budget() == TOKEN_BUDGET
-    # Token accounting: charge two tokens to one request.
-    _, seq_group = create_dummy_prompt("1", 3)
-    request_id = seq_group.request_id
-    budget.add_num_batched_tokens(request_id, 2)
-    assert budget.remaining_token_budget() == 2
-    assert budget.num_batched_tokens == 2
-    assert budget.can_schedule(num_new_tokens=2, num_new_seqs=1)
-    assert not budget.can_schedule(num_new_tokens=3, num_new_seqs=1)
-    # Charging the same request id again must not change the budget.
-    budget.add_num_batched_tokens(request_id, 2)
-    assert budget.remaining_token_budget() == 2
-    assert budget.num_batched_tokens == 2
-    # Releasing the charge restores the budget; a second release is a no-op.
-    for _ in range(2):
-        budget.subtract_num_batched_tokens(request_id, 2)
-        assert budget.remaining_token_budget() == 4
-        assert budget.num_batched_tokens == 0
-    # Sequence accounting follows the same rules.
-    _, seq_group = create_dummy_prompt("1", 3)
-    request_id = seq_group.request_id
-    budget.add_num_seqs(request_id, 2)
-    assert budget.can_schedule(num_new_tokens=1, num_new_seqs=2)
-    assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=3)
-    assert budget.num_curr_seqs == 2
-    # Charging the same request id again must not change the count.
-    budget.add_num_seqs(request_id, 2)
-    assert budget.num_curr_seqs == 2
-    # Releasing resets the count; a second release is a no-op.
-    for _ in range(2):
-        budget.subtract_num_seqs(request_id, 2)
-        assert budget.num_curr_seqs == 0
-@pytest.mark.parametrize("enable_prefix_caching", [True, False])
-def test_prefix_caching_aware_prefills(enable_prefix_caching):
-    """
-    Three prompts (A, B, C) share their first block as a common prefix.
-    After A has been prefilled and decoded once, B and C (8 tokens each) are
-    queued against a 12-token budget. With prefix caching both are prefilled
-    in a single scheduling round; without it only one of them is scheduled.
-    """
-    block_size = 4
-    max_num_batched_tokens = 12
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=16,
-        num_gpu_blocks=16,
-        max_token_budget=max_num_batched_tokens,
-        max_num_seqs=max_seq_group,
-        max_model_len=max_num_batched_tokens,
-        enable_prefix_caching=enable_prefix_caching,
-    )
-    # B and C reuse A's first four tokens and then diverge.
-    num_shared_tokens = 4
-    seqA_tokens = list(range(8))
-    shared_prefix = seqA_tokens[:num_shared_tokens]
-    seqB_tokens = shared_prefix + list(range(12, 16))
-    seqC_tokens = shared_prefix + list(range(16, 20))
-    seqA, seqA_group = create_dummy_prompt("0",
-                                           prompt_tokens=seqA_tokens,
-                                           block_size=block_size)
-    seqB, seqB_group = create_dummy_prompt("1",
-                                           prompt_tokens=seqB_tokens,
-                                           block_size=block_size)
-    seqC, seqC_group = create_dummy_prompt("2",
-                                           prompt_tokens=seqC_tokens,
-                                           block_size=block_size)
-    # Round 1: A's whole prompt is prefilled.
-    scheduler.add_seq_group(seqA_group)
-    metas, out, _ = scheduler.schedule()
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.scheduled_seq_groups[0].seq_group == seqA_group
-    assert out.scheduled_seq_groups[0].token_chunk_size == len(seqA_tokens)
-    # Round 2: A decodes a single token.
-    append_new_token_seq_group(len(seqA_tokens), seqA_group, 999)
-    metas, out, _ = scheduler.schedule()
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.scheduled_seq_groups[0].seq_group == seqA_group
-    assert out.scheduled_seq_groups[0].token_chunk_size == 1
-    # Round 3: B and C arrive together.
-    scheduler.add_seq_group(seqB_group)
-    scheduler.add_seq_group(seqC_group)
-    metas, out, _ = scheduler.schedule()
-    if not enable_prefix_caching:
-        # Without caching only one 8-token prefill is scheduled.
-        assert len(out.scheduled_seq_groups) == 1
-        assert len(metas) == 1
-        assert metas[0].token_chunk_size == 8
-        assert len(metas[0].computed_block_nums) == 0  # No blocks computed.
-        return
-    # With caching both prefills are scheduled in the same round.
-    assert len(out.scheduled_seq_groups) == 2
-    scheduled_groups = {
-        scheduled.seq_group
-        for scheduled in out.scheduled_seq_groups
-    }
-    assert scheduled_groups == {seqB_group, seqC_group}
-    assert len(metas) == 2
-    # The shared prefix accounts for one already-computed block per request.
-    expected_cached_blocks = num_shared_tokens // block_size
-    for meta in metas:
-        assert meta.token_chunk_size == 8
-        assert len(meta.computed_block_nums) == expected_cached_blocks
-def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching(
-):
-    """
-    This test verifies that we don't schedule new prefills if there's already
-    a continuous prefill in progress even though the new prefills with shared
-    prefix can fit in the token budget:
-    - SeqA is being chunked prefill.
-    - SeqB with the same prompt shouldn't be scheduled for prefill even though
-    there's enough token budget to prefill the cached tokens.
-    - Neither should seqC be scheduled.
-    - When seqA is in decoding phase, seqB and seqC can be scheduled.
-        - Entire seqB should be prefilled since it's a full prefix cache hit.
-        - SeqC would be partially prefilled with the prefix shared, and the
-        remaining unique tokens would be prefilled (rounded down to be
-        block-size aligned).
-    """
-    block_size = 2
-    max_num_batched_tokens = 4
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=16,
-        num_gpu_blocks=16,
-        max_token_budget=max_num_batched_tokens,
-        max_num_seqs=max_seq_group,
-        max_model_len=100,
-        enable_prefix_caching=True,
-        enable_chunked_prefill=True,
-    )
-    seqA_tokens = list(range(8))
-    # seqB is an exact copy of seqA's prompt; seqC shares only the first 4
-    # tokens and then continues with 8 tokens of its own.
-    seqB_tokens = seqA_tokens
-    seqC_shared_prefix_len = 4
-    seqC_tokens = seqA_tokens[:seqC_shared_prefix_len] + list(range(12, 20))
-    seqA, seqA_group = create_dummy_prompt("0",
-                                           prompt_tokens=seqA_tokens,
-                                           block_size=block_size)
-    seqB, seqB_group = create_dummy_prompt("1",
-                                           prompt_tokens=seqB_tokens,
-                                           block_size=block_size)
-    # Chunked prefill seqA: first 4 of its 8 tokens (the whole budget).
-    scheduler.add_seq_group(seqA_group)
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.scheduled_seq_groups[0].seq_group == seqA_group
-    assert out.scheduled_seq_groups[0].token_chunk_size == 4
-    # seqB should not be scheduled with ongoing prefills; seqA gets its
-    # second 4-token chunk instead.
-    scheduler.add_seq_group(seqB_group)
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.scheduled_seq_groups[0].seq_group == seqA_group
-    assert out.scheduled_seq_groups[0].token_chunk_size == 4
-    # Both seqB and seqC can be scheduled now that seqA's prefill is over
-    # and seqA is in its decoding phase.
-    append_new_token_seq(seqA, 999)
-    seqC, seqC_group = create_dummy_prompt("2",
-                                           prompt_tokens=seqC_tokens,
-                                           block_size=block_size)
-    scheduler.add_seq_group(seqC_group)
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 3
-    metas = {meta.request_id: meta for meta in metas}
-    assert metas[seqA_group.request_id].token_chunk_size == 1  # Decode
-    assert (metas[seqB_group.request_id].token_chunk_size == 8
-            )  # Fully cached prefill
-    # The message is parenthesised so that all three literals are joined by
-    # implicit concatenation; without the parentheses only the first literal
-    # belongs to the assert and the other two are no-op statements.
-    assert metas[seqC_group.request_id].token_chunk_size == 6, (
-        "A partial prefix of C (4 tokens) should be prefilled, with the "
-        "remaining tokens fit into 3 token budget (4-1 from the seqA). It will "
-        "then be rounded down to 2 tokens on block size, thus 6 tokens in total."
-    )
-def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
-    """
-    Verify that a scheduled batch never mixes token-id prompts with
-    prompt-embedding prompts.
-    """
-    block_size = 2
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=16,
-        num_gpu_blocks=16,
-        max_num_seqs=max_seq_group,
-        max_model_len=100,
-        enable_prefix_caching=True,
-    )
-    seq_length = 7
-    embedding_size = 5
-    num_seqs = 11
-    # Odd-indexed requests carry real token ids and no embeddings;
-    # even-indexed requests carry a random embedding tensor together with
-    # all-zero placeholder token ids.
-    prompt_specs: list[tuple[list[int], Optional[torch.Tensor]]] = []
-    for idx in range(num_seqs):
-        if idx % 2 == 0:
-            prompt_specs.append(
-                ([0] * seq_length, torch.rand(embedding_size)))
-        else:
-            prompt_specs.append((list(range(seq_length)), None))
-    seq_and_seq_groups = [
-        create_dummy_prompt(str(idx),
-                            prompt_tokens=tokens,
-                            prompt_embeds=embeds,
-                            block_size=block_size)
-        for idx, (tokens, embeds) in enumerate(prompt_specs)
-    ]
-    for _, group in seq_and_seq_groups:
-        scheduler.add_seq_group(group)
-    # Keep scheduling until every sequence has been marked finished.
-    while any(not seq.is_finished() for seq, _ in seq_and_seq_groups):
-        pending_groups = [
-            group for _, group in seq_and_seq_groups
-            if not group.is_finished()
-        ]
-        _, out = schedule_and_update_computed_tokens(scheduler)
-        scheduled = out.scheduled_seq_groups
-        assert len(scheduled) > 0
-        # The first scheduled group decides which kind this batch must be.
-        batch_is_prompt_embeds = scheduled[0].seq_group.uses_prompt_embeds()
-        same_kind_groups = [
-            group for group in pending_groups
-            if group.uses_prompt_embeds() == batch_is_prompt_embeds
-        ]
-        # The batch should be as large as allowed without mixing kinds.
-        assert len(scheduled) == min(max_seq_group, len(same_kind_groups))
-        assert all(
-            item.seq_group.uses_prompt_embeds() == batch_is_prompt_embeds
-            for item in scheduled)
-        # Mark everything that was scheduled as finished and release it.
-        for item in scheduled:
-            for seq in item.seq_group.seqs:
-                seq.status = SequenceStatus.FINISHED_STOPPED
-        scheduler.free_finished_seq_groups()
-def test_remove_seq_from_computed_blocks_tracker():
-    """
-    Test that computed_blocks_tracker correctly removes stale sequences
-    during scheduling.
-    Every scenario below drives one scheduling entry point on a scheduler
-    with prefix caching enabled and then asserts that a chosen seq id has no
-    entry in
-    block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.
-    The test covers 9 scheduling branches where stale seqs are removed:
-    - 1 in _schedule_swapped
-    - 1 in _schedule_priority_preemption
-    - 7 in _schedule_prefill
-    Each branch is tested to ensure proper cleanup of
-    _seq_id_to_num_tokens_computed.
-    """
-    # Scenario: swapped queue. Three 16-token prompts are allocated, swapped
-    # out and queued as swapped; the token budget (15) is smaller than a
-    # single prompt.
-    block_size = 2
-    max_seq_group = 3
-    seq_tokens_with_swapped: list[list[int]] = []
-    blocks_to_swap_out: list[tuple[int, int]] = []
-    curr_loras: set[int] = set()
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=64,
-        num_gpu_blocks=16,
-        max_num_seqs=max_seq_group,
-        enable_prefix_caching=True,
-    )
-    budget = create_token_budget(token_budget=15)
-    seq_length = 16
-    num_seqs = 3
-    for i in range(num_seqs):
-        seq_tokens_with_swapped.append([i] * seq_length)
-    seq_and_seq_groups = [
-        create_dummy_prompt(f"{i}",
-                            prompt_tokens=seq_tokens_with_swapped[i],
-                            block_size=block_size)
-        for i in range(len(seq_tokens_with_swapped))
-    ]
-    for _, seq_group in seq_and_seq_groups:
-        scheduler._allocate_and_set_running(seq_group)
-        scheduler._swap_out(seq_group, blocks_to_swap_out)
-        scheduler._add_seq_group_to_swapped(seq_group)
-    scheduler._schedule_swapped(budget, curr_loras)
-    seq_id_to_num_tokens_computed = (
-        scheduler.block_manager._computed_blocks_tracker.
-        _seq_id_to_num_tokens_computed.get(1))
-    assert seq_id_to_num_tokens_computed is None
-    # Scenario: LoRA limit during prefill. max_loras=1 while the two queued
-    # requests carry different lora_int_ids (1 and 2), so the prefill
-    # schedule has no room for a second LoRA and that request is ignored for
-    # now.
-    # NOTE: seq_length is still 16 from the previous scenario, and
-    # curr_loras is the same set object that was passed to
-    # _schedule_swapped above.
-    block_size = 4
-    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
-    scheduler = initialize_scheduler(lora_config=lora_config,
-                                     block_size=block_size,
-                                     num_cpu_blocks=64,
-                                     num_gpu_blocks=64,
-                                     enable_prefix_caching=True)
-    budget = create_token_budget(token_budget=120)
-    num_seqs = 2
-    for i in range(num_seqs):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=seq_length,
-                                           block_size=block_size,
-                                           lora_request=LoRARequest(
-                                               lora_name=str(i),
-                                               lora_int_id=i + 1,
-                                               lora_path="abc"))
-        scheduler.add_seq_group(seq_group)
-    scheduler._schedule_prefills(budget, curr_loras)
-    seq_id_to_num_tokens_computed = (
-        scheduler.block_manager._computed_blocks_tracker.
-        _seq_id_to_num_tokens_computed.get(1))
-    assert seq_id_to_num_tokens_computed is None
-    # Scenario: priority preemption, run on the same scheduler and budget as
-    # the LoRA scenario above.
-    scheduler._schedule_priority_preemption(budget)
-    seq_id_to_num_tokens_computed = (
-        scheduler.block_manager._computed_blocks_tracker.
-        _seq_id_to_num_tokens_computed.get(1))
-    assert seq_id_to_num_tokens_computed is None
-    # Scenario: mixed prompt kinds. Seq 0 is a token-id prompt and seq 1 is a
-    # prompt-embeds prompt; the prefill scheduler does not schedule batches
-    # with prompt tokens and prompt embeddings co-mingled.
-    block_size = 2
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=16,
-        num_gpu_blocks=16,
-        max_num_seqs=max_seq_group,
-        max_model_len=100,
-        enable_prefix_caching=True,
-    )
-    seq_length = 7
-    embedding_size = 5
-    seq_tokens_with_embedding: list[list[int]] = []
-    seq_embeds: list[Optional[torch.Tensor]] = []
-    seq_tokens_with_embedding.append(list(range(seq_length)))
-    seq_embeds.append(None)
-    seq_tokens_with_embedding.append([0] * seq_length)
-    seq_embeds.append(torch.rand(embedding_size))
-    seq_and_seq_groups = [
-        create_dummy_prompt(f"{i}",
-                            prompt_tokens=seq_tokens_with_embedding[i],
-                            prompt_embeds=seq_embeds[i],
-                            block_size=block_size)
-        for i in range(len(seq_tokens_with_embedding))
-    ]
-    for _, seq_group in seq_and_seq_groups:
-        scheduler.add_seq_group(seq_group)
-    scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = (
-        scheduler.block_manager._computed_blocks_tracker.
-        _seq_id_to_num_tokens_computed.get(1))
-    assert seq_id_to_num_tokens_computed is None
-    # Scenario: prefill budget num_batched_tokens >= scheduler_config
-    # max_num_batched_tokens. max_token_budget is 8 and three 4-token prompts
-    # are queued; seq id 2 (the third prompt) is the one checked.
-    block_size = 2
-    max_seq_group = 3
-    seq_tokens_prefill_budget: list[list[int]] = []
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        max_token_budget=8,
-        num_cpu_blocks=16,
-        num_gpu_blocks=16,
-        max_num_seqs=max_seq_group,
-        max_model_len=5,
-        enable_prefix_caching=True,
-    )
-    seq_length = 4
-    num_seqs = 3
-    for i in range(num_seqs):
-        seq_tokens_prefill_budget.append([i] * seq_length)
-    seq_and_seq_groups = [
-        create_dummy_prompt(f"{i}",
-                            prompt_tokens=seq_tokens_prefill_budget[i],
-                            block_size=block_size)
-        for i in range(len(seq_tokens_prefill_budget))
-    ]
-    for _, seq_group in seq_and_seq_groups:
-        scheduler.add_seq_group(seq_group)
-    scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = (
-        scheduler.block_manager._computed_blocks_tracker.
-        _seq_id_to_num_tokens_computed.get(2))
-    assert seq_id_to_num_tokens_computed is None
-    # Scenario: the budget cannot schedule a waiting request.
-    # max_token_budget is 30 and three identical 16-token prompts are queued.
-    block_size = 2
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        max_token_budget=30,
-        num_cpu_blocks=16,
-        num_gpu_blocks=16,
-        max_num_seqs=max_seq_group,
-        max_model_len=30,
-        enable_prefix_caching=True,
-    )
-    seq_length = 16
-    num_seqs = 3
-    seq_tokens_prefill_budget_waiting: list[list[int]] = []
-    for i in range(num_seqs):
-        seq_tokens_prefill_budget_waiting.append(list(range(seq_length)))
-    seq_and_seq_groups = [
-        create_dummy_prompt(f"{i}",
-                            prompt_tokens=seq_tokens_prefill_budget_waiting[i],
-                            block_size=block_size)
-        for i in range(len(seq_tokens_prefill_budget_waiting))
-    ]
-    for _, seq_group in seq_and_seq_groups:
-        scheduler.add_seq_group(seq_group)
-    scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = (
-        scheduler.block_manager._computed_blocks_tracker.
-        _seq_id_to_num_tokens_computed.get(1))
-    assert seq_id_to_num_tokens_computed is None
-    # Scenario: sequence num_new_tokens > prompt_limit, marked
-    # FINISHED_IGNORED. A single 31-token prompt is queued against
-    # max_model_len=30.
-    block_size = 2
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=16,
-        num_gpu_blocks=16,
-        max_num_seqs=max_seq_group,
-        max_model_len=30,
-        enable_prefix_caching=True,
-    )
-    seq_length = 31
-    seq_tokens_prompt_limit: list[list[int]] = []
-    seq_tokens_prompt_limit.append(list(range(seq_length)))
-    seq_and_seq_groups = [
-        create_dummy_prompt("0",
-                            prompt_tokens=seq_tokens_prompt_limit[0],
-                            block_size=block_size)
-    ]
-    for _, seq_group in seq_and_seq_groups:
-        scheduler.add_seq_group(seq_group)
-    scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = (
-        scheduler.block_manager._computed_blocks_tracker.
-        _seq_id_to_num_tokens_computed.get(0))
-    assert seq_id_to_num_tokens_computed is None
-    # Scenario: allocation is impossible (AllocStatus.NEVER), marked
-    # FINISHED_IGNORED. One 320-token prompt with block_size 2 is queued on a
-    # scheduler with 160 GPU blocks.
-    block_size = 2
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=160,
-        num_gpu_blocks=160,
-        max_num_seqs=max_seq_group,
-        max_model_len=320,
-        enable_prefix_caching=True,
-    )
-    seq_length = 320
-    num_seqs = 1
-    seq_tokens_never: list[list[int]] = []
-    for i in range(num_seqs):
-        seq_tokens_never.append(list(range(seq_length)))
-    seq_and_seq_groups = [
-        create_dummy_prompt(f"{i}",
-                            prompt_tokens=seq_tokens_never[i],
-                            block_size=block_size)
-        for i in range(len(seq_tokens_never))
-    ]
-    for _, seq_group in seq_and_seq_groups:
-        scheduler.add_seq_group(seq_group)
-    scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = (
-        scheduler.block_manager._computed_blocks_tracker.
-        _seq_id_to_num_tokens_computed.get(0))
-    assert seq_id_to_num_tokens_computed is None
-    # Scenario: allocation has to wait (AllocStatus.LATER). Two identical
-    # 160-token prompts with block_size 2 are queued on a scheduler with 160
-    # GPU blocks; seq id 1 (the second prompt) is the one checked.
-    block_size = 2
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=160,
-        num_gpu_blocks=160,
-        max_num_seqs=max_seq_group,
-        max_model_len=320,
-        enable_prefix_caching=True,
-    )
-    seq_length = 160
-    num_seqs = 2
-    seq_tokens_later: list[list[int]] = []
-    for i in range(num_seqs):
-        seq_tokens_later.append(list(range(seq_length)))
-    seq_and_seq_groups = [
-        create_dummy_prompt(f"{i}",
-                            prompt_tokens=seq_tokens_later[i],
-                            block_size=block_size)
-        for i in range(len(seq_tokens_later))
-    ]
-    for _, seq_group in seq_and_seq_groups:
-        scheduler.add_seq_group(seq_group)
-    scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = (
-        scheduler.block_manager._computed_blocks_tracker.
-        _seq_id_to_num_tokens_computed.get(1))
-    assert seq_id_to_num_tokens_computed is None
--- a/tests/core/test_serialization.py
+++ b/tests/core/test_serialization.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import msgspec
-from vllm.executor.msgspec_utils import decode_hook, encode_hook
-from vllm.sequence import ExecuteModelRequest
-from .utils import create_batch
-def test_msgspec_serialization():
-    """Round-trip an ExecuteModelRequest through msgpack and compare the
-    first sequence-group metadata entry field by field."""
-    lookahead = 4
-    metadata_list, _, _ = create_batch(16, lookahead)
-    request = ExecuteModelRequest(seq_group_metadata_list=metadata_list,
-                                  num_lookahead_slots=lookahead,
-                                  running_queue_size=4)
-    encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
-    decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
-                                      dec_hook=decode_hook)
-    payload = encoder.encode(request)
-    restored = decoder.decode(payload)
-    sent_list = request.seq_group_metadata_list
-    received_list = restored.seq_group_metadata_list
-    assert len(sent_list) == len(received_list)
-    sent, received = sent_list[0], received_list[0]
-    # Top-level metadata fields must survive the round trip unchanged.
-    for field in ("block_tables", "is_prompt", "request_id"):
-        assert getattr(sent, field) == getattr(received, field)
-    # So must the token ids stored for sequence 0.
-    for field in ("prompt_token_ids", "output_token_ids"):
-        assert (getattr(sent.seq_data[0], field) == getattr(
-            received.seq_data[0], field))
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
-from collections import defaultdict
-from collections.abc import Sequence as GenericSequence
-from itertools import count
-from typing import Any, Optional, Union
-import torch
-from vllm.core.scheduler import Scheduler, SchedulerOutputs
-from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs
-from vllm.lora.request import LoRARequest
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import (Logprob, Sequence, SequenceData, SequenceGroup,
-                           SequenceGroupMetadata)
-def create_dummy_prompt(
-    request_id: str,
-    prompt_length: int = -1,
-    block_size: Optional[int] = None,
-    lora_request: Optional[LoRARequest] = None,
-    prompt_tokens: Optional[list[int]] = None,
-    prompt_embeds: Optional[torch.Tensor] = None,
-    min_tokens: int = 0,
-    max_tokens: int = 16,
-) -> tuple[Sequence, SequenceGroup]:
-    """Build a single-sequence SequenceGroup for scheduler tests.
-    Args:
-        request_id: Request id; int(request_id) is used as the sequence id.
-        prompt_length: Number of dummy tokens (0..prompt_length-1) generated
-            when prompt_tokens is not given. Also the fallback block size.
-        block_size: Block size of the sequence; a falsy value (None or 0)
-            falls back to prompt_length.
-        lora_request: Optional LoRA request attached to the group.
-        prompt_tokens: Explicit prompt token ids.
-        prompt_embeds: When given, the inputs are built from the embeddings
-            instead of from the token ids.
-        min_tokens: Forwarded to SamplingParams.
-        max_tokens: Forwarded to SamplingParams.
-    Returns:
-        The (sequence, sequence_group) pair.
-    """
-    block_size = block_size or prompt_length
-    if prompt_tokens is None:
-        # Dummy prompt made of the token ids 0 .. prompt_length - 1.
-        prompt_tokens = list(range(prompt_length))
-    prompt_str = " ".join(map(str, prompt_tokens))
-    if prompt_embeds is None:
-        inputs = token_inputs(prompt_token_ids=prompt_tokens,
-                              prompt=prompt_str)
-    else:
-        inputs = embeds_inputs(prompt_embeds=prompt_embeds)
-    seq = Sequence(int(request_id), inputs=inputs, block_size=block_size)
-    group = SequenceGroup(
-        request_id=request_id,
-        seqs=[seq],
-        arrival_time=time.time(),
-        sampling_params=SamplingParams(max_tokens=max_tokens,
-                                       min_tokens=min_tokens),
-        lora_request=lora_request,
-    )
-    return seq, group
-def create_dummy_lora_sequence(request_id: int, token_ids: list[int],
-                               block_size: int, lora_int_id: int) -> Sequence:
-    """Return a Sequence for token_ids that carries a placeholder LoRA
-    request (name "dummy", path "/dummy") with the given lora_int_id."""
-    dummy_lora = LoRARequest(lora_name="dummy",
-                             lora_path="/dummy",
-                             lora_int_id=lora_int_id)
-    return Sequence(seq_id=request_id,
-                    inputs=token_inputs(token_ids),
-                    block_size=block_size,
-                    lora_request=dummy_lora)
-def create_dummy_sequence(request_id: int, token_ids: list[int],
-                          block_size: int) -> Sequence:
-    """Return a plain Sequence (no LoRA) for token_ids, using request_id as
-    the sequence id."""
-    seq_inputs = token_inputs(token_ids)
-    return Sequence(seq_id=request_id,
-                    inputs=seq_inputs,
-                    block_size=block_size)
-def create_dummy_prompt_encoder_decoder(
-    request_id: str,
-    decoder_prompt_length: int,
-    encoder_prompt_length: int,
-    block_size: Optional[int] = None,
-    lora_request: Optional[LoRARequest] = None,
-) -> tuple[Sequence, Sequence, SequenceGroup]:
-    """Build an encoder/decoder SequenceGroup with dummy prompts.
-    The decoder prompt is the token ids 0..decoder_prompt_length-1; the
-    encoder prompt is 0..encoder_prompt_length-1 in reverse order. Both
-    sequences use int(request_id) as their sequence id. A falsy block_size
-    falls back to decoder_prompt_length.
-    Returns:
-        The (decoder_sequence, encoder_sequence, sequence_group) triple.
-    """
-    block_size = block_size or decoder_prompt_length
-    seq_id = int(request_id)
-    decoder_prompt_tokens = list(range(decoder_prompt_length))
-    decoder_prompt_str = " ".join(map(str, decoder_prompt_tokens))
-    encoder_prompt_tokens = list(range(encoder_prompt_length))[::-1]
-    encoder_prompt_str = " ".join(map(str, encoder_prompt_tokens))
-    decoder_inputs = token_inputs(decoder_prompt_tokens,
-                                  prompt=decoder_prompt_str)
-    encoder_inputs = token_inputs(encoder_prompt_tokens,
-                                  prompt=encoder_prompt_str)
-    decoder_prompt = Sequence(seq_id,
-                              inputs=decoder_inputs,
-                              block_size=block_size)
-    encoder_prompt = Sequence(seq_id,
-                              inputs=encoder_inputs,
-                              block_size=block_size)
-    # Only the decoder sequence goes into seqs; the encoder sequence is
-    # attached separately through encoder_seq.
-    group = SequenceGroup(request_id=request_id,
-                          seqs=[decoder_prompt],
-                          arrival_time=time.time(),
-                          lora_request=lora_request,
-                          encoder_seq=encoder_prompt)
-    return decoder_prompt, encoder_prompt, group
-def create_seq_group(
-        seq_prompt_len: int = 1024,
-        seq_output_lens: GenericSequence[int] = (128, ),
-        request_id: str = '0',
-        seq_id_start: int = 0,
-        sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
-    """Build a SequenceGroup with one sequence per entry of seq_output_lens.
-    Every sequence has an all-zero prompt of seq_prompt_len tokens, a block
-    size of 16 and a sequence id counted up from seq_id_start. Entry n of
-    seq_output_lens is the number of output tokens (ids 0..n-1, logprob 0.0)
-    appended to the corresponding sequence. Default SamplingParams are used
-    when none are given.
-    """
-    assert len(seq_output_lens) > 0
-    params = SamplingParams() if sampling_params is None else sampling_params
-    prompt_token_ids = [0] * seq_prompt_len
-    seqs: list[Sequence] = []
-    for offset, num_outputs in enumerate(seq_output_lens):
-        new_seq = Sequence(
-            seq_id=seq_id_start + offset,
-            inputs=token_inputs(prompt_token_ids),
-            block_size=16,
-        )
-        for token_id in range(num_outputs):
-            new_seq.append_token_id(
-                token_id=token_id,
-                logprobs={token_id: Logprob(0.0)},
-            )
-        seqs.append(new_seq)
-    return SequenceGroup(
-        request_id=request_id,
-        seqs=seqs,
-        sampling_params=params,
-        arrival_time=time.time(),
-    )
-def create_seq_group_encoder_decoder(
-        seq_prompt_len: int = 1024,
-        seq_output_lens: GenericSequence[int] = (128, ),
-        request_id: str = '0',
-        seq_id_start: int = 0,
-        sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
-    """Encoder/decoder variant of create_seq_group.
-    One decoder sequence is created per entry of seq_output_lens (all-zero
-    prompt, block size 16, ids counted up from seq_id_start, with n output
-    tokens appended). A single encoder sequence with the same all-zero prompt
-    gets the next free id, seq_id_start + len(seq_output_lens).
-    """
-    assert len(seq_output_lens) > 0
-    params = SamplingParams() if sampling_params is None else sampling_params
-    prompt_token_ids = [0] * seq_prompt_len
-    # The decoder inputs object is shared by all decoder sequences.
-    decoder_inputs = token_inputs(prompt_token_ids)
-    encoder_inputs = token_inputs(prompt_token_ids)
-    decoder_seqs = []
-    for offset, num_outputs in enumerate(seq_output_lens):
-        decoder_seq = Sequence(
-            seq_id=seq_id_start + offset,
-            inputs=decoder_inputs,
-            block_size=16,
-        )
-        for token_id in range(num_outputs):
-            decoder_seq.append_token_id(
-                token_id=token_id,
-                logprobs={token_id: Logprob(0.0)},
-            )
-        decoder_seqs.append(decoder_seq)
-    encoder_seq = Sequence(
-        seq_id=seq_id_start + len(seq_output_lens),
-        inputs=encoder_inputs,
-        block_size=16,
-    )
-    return SequenceGroup(request_id=request_id,
-                         seqs=decoder_seqs,
-                         sampling_params=params,
-                         arrival_time=time.time(),
-                         encoder_seq=encoder_seq)
-def round_up_to_next_block(seq_len: int, block_size: int) -> int:
-    """Return how many blocks of block_size are needed to hold seq_len
-    tokens (a count of blocks, not a token count)."""
-    # Adding block_size - 1 before the floor division rounds partial
-    # blocks up.
-    padded_len = seq_len + block_size - 1
-    return padded_len // block_size
-# Helper functions for scheduler tests
-def get_sequence_groups(scheduler_output):
-    """Return the seq_group of every entry in
-    scheduler_output.scheduled_seq_groups, in order."""
-    groups = []
-    for scheduled in scheduler_output.scheduled_seq_groups:
-        groups.append(scheduled.seq_group)
-    return groups
-def append_new_token(out, token_id: int):
-    """Append token_id to every sequence of every sequence group scheduled
-    in out; the token id is also used as the Logprob value."""
-    for group in get_sequence_groups(out):
-        for seq in group.get_seqs():
-            logprobs = {token_id: Logprob(token_id)}
-            seq.append_token_id(token_id, logprobs)
-def schedule_and_update_computed_tokens(scheduler):
-    """Run one scheduler.schedule() step and mark each scheduled group's
-    token chunk as computed.
-    Returns:
-        The first two values returned by scheduler.schedule(); the third is
-        discarded.
-    """
-    metas, out, _ = scheduler.schedule()
-    for scheduled in out.scheduled_seq_groups:
-        chunk_size = scheduled.token_chunk_size
-        scheduled.seq_group.update_num_computed_tokens(chunk_size)
-    return metas, out
-def append_new_token_seq(seq: Sequence, token_id: int):
-    """Append token_id to seq; the token id is also used as the Logprob
-    value."""
-    logprobs = {token_id: Logprob(token_id)}
-    seq.append_token_id(token_id, logprobs)
-def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
-    """Mark token_chunk_size tokens of seq_group as computed, then append
-    token_id to each of its sequences."""
-    seq_group.update_num_computed_tokens(token_chunk_size)
-    for member in seq_group.get_seqs():
-        logprobs = {token_id: Logprob(token_id)}
-        member.append_token_id(token_id, logprobs)
-class SchedulerProxy:
-    """
-    Forwards method calls to a wrapped scheduler and records them.
-    Each forwarded call is stored in call_history under the method name as
-    an (args, kwargs, result) tuple.
-    """
-    def __init__(self, scheduler: Scheduler):
-        self.scheduler_ = scheduler
-        self.call_history: dict[str, list[Any]] = defaultdict(list)
-    def __getattr__(self, name: str) -> Any:
-        # Only reached for names not found on the proxy itself. The lookup on
-        # the wrapped scheduler is deferred until the wrapper is called.
-        def wrapper(*args, **kwargs):
-            target = getattr(self.scheduler_, name)
-            outcome = target(*args, **kwargs)
-            self.call_history[name].append((args, kwargs, outcome))
-            return outcome
-        return wrapper
-    def last_schedule_ret(
-        self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]:
-        """Return the result of the most recent recorded schedule() call."""
-        _args, _kwargs, last_result = self.call_history["schedule"][-1]
-        return last_result
-def create_seq_group_metadata_from_prompts(
-    prompts: list[list[int]],
-    num_gpu_blocks: int,
-    block_size: int,
-    final_prompt_lens: list[int],
-    continuations: Optional[list[list[int]]] = None,
-    seq_ids: Optional[list[int]] = None,
-) -> list[SequenceGroupMetadata]:
-    """Build one SequenceGroupMetadata per prompt.
-    Blocks are handed out from the end of range(num_gpu_blocks), enough to
-    hold final_prompt_lens[i] tokens for prompt i. Each entry is keyed by the
-    prompt's position i (as request id, sequence id and block-table key) and
-    has all tokens but the last marked as computed. An entry counts as a
-    prompt only when its continuation is empty.
-    NOTE: seq_ids is accepted and defaulted, but the keys used below are the
-    prompt positions, not the values in seq_ids.
-    """
-    if continuations is None:
-        continuations = [[] for _ in prompts]
-    if seq_ids is None:
-        seq_ids = list(range(len(prompts)))
-    free_gpu_blocks = list(range(num_gpu_blocks))
-    # Allocate in prompt order so that block numbers are deterministic.
-    block_allocations = {}
-    for idx, final_len in enumerate(final_prompt_lens):
-        blocks_needed = round_up_to_next_block(final_len, block_size)
-        block_allocations[idx] = [
-            free_gpu_blocks.pop() for _ in range(blocks_needed)
-        ]
-    metadata_list = []
-    for idx, (prompt_ids, cont_ids) in enumerate(zip(prompts,
-                                                     continuations)):
-        data = SequenceData.from_seqs(prompt_ids, cont_ids)
-        data.update_num_computed_tokens(len(prompt_ids) + len(cont_ids) - 1)
-        metadata = SequenceGroupMetadata(
-            request_id=str(idx),
-            is_prompt=len(cont_ids) == 0,
-            seq_data={idx: data},
-            sampling_params=SamplingParams(temperature=0.0),
-            block_tables={idx: block_allocations[idx][:]},
-        )
-        metadata_list.append(metadata)
-    return metadata_list
-def create_chunked_seq_group_metadata_from_prompt(
-        prompt: list[int],
-        num_gpu_blocks: int,
-        chunk_size: int,
-        block_size: int,
-        seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]:
-    """Split one prompt into prefill chunks of at most chunk_size tokens and
-    return one SequenceGroupMetadata per chunk.
-    All chunks share the request id str(seq_id) (seq_id defaults to 0) and
-    the same block list. seq_data and block_tables are keyed by the chunk
-    index. Only the chunk that reaches the end of the prompt has do_sample
-    set.
-    """
-    if seq_id is None:
-        seq_id = 0
-    free_gpu_blocks = list(range(num_gpu_blocks))
-    blocks_needed = round_up_to_next_block(len(prompt), block_size)
-    block_allocations = [free_gpu_blocks.pop() for _ in range(blocks_needed)]
-    prompt_len = len(prompt)
-    chunk_starts = range(0, prompt_len, chunk_size)
-    seq_group_metadata_list = []
-    for chunk_idx, start in enumerate(chunk_starts):
-        chunk_ids = prompt[start:start + chunk_size]
-        # Every chunk carries the full prompt, with the tokens before this
-        # chunk marked as already computed.
-        data = SequenceData.from_seqs(prompt)
-        data.update_num_computed_tokens(start)
-        is_terminal_chunk = start + chunk_size >= prompt_len
-        metadata = SequenceGroupMetadata(
-            request_id=str(seq_id),
-            is_prompt=True,
-            do_sample=is_terminal_chunk,
-            seq_data={chunk_idx: data},
-            sampling_params=SamplingParams(temperature=0.0),
-            block_tables={chunk_idx: block_allocations},
-            token_chunk_size=len(chunk_ids))
-        seq_group_metadata_list.append(metadata)
-    return seq_group_metadata_list
-def create_batch(batch_size,
-                 k,
-                 prompt_len: Union[int, list[int]] = 10,
-                 prev_output_token_len: int = 10,
-                 seq_ids: Optional[list[int]] = None,
-                 num_gpu_blocks: Optional[int] = None,
-                 block_size: Optional[int] = None,
-                 prefill_chunk_size: Optional[int] = None):
-    """Create a batch of SequenceGroupMetadata with unique token ids.
-    Token ids come from one shared counter, prompts first. Without
-    prefill_chunk_size every prompt also gets prev_output_token_len previous
-    output tokens, and blocks are reserved for prompt + outputs + k + 1
-    tokens. With prefill_chunk_size the prompts are split into prefill
-    chunks, the list is cut to batch_size entries and there are no previous
-    output tokens. block_size defaults to 8 and num_gpu_blocks to
-    2048 // block_size.
-    Returns:
-        (seq_group_metadata_list, prompts, prev_output_tokens).
-    """
-    if block_size is None:
-        block_size = 8
-    if num_gpu_blocks is None:
-        num_gpu_blocks = 2048 // block_size
-    token_source = count()
-    if isinstance(prompt_len, int):
-        prompt_lens = [prompt_len] * batch_size
-    else:
-        prompt_lens = prompt_len
-    prompts = [[next(token_source) for _ in range(length)]
-               for length in prompt_lens]
-    if not prefill_chunk_size:
-        prev_output_tokens = [[
-            next(token_source) for _ in range(prev_output_token_len)
-        ] for _ in range(batch_size)]
-        final_prompt_lens = [
-            len(prompt) + len(outputs) + k + 1
-            for prompt, outputs in zip(prompts, prev_output_tokens)
-        ]
-        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
-            prompts, num_gpu_blocks, block_size, final_prompt_lens,
-            prev_output_tokens, seq_ids)
-        return seq_group_metadata_list, prompts, prev_output_tokens
-    # Chunked-prefill batch: one metadata entry per prompt chunk.
-    if not seq_ids:
-        seq_ids = list(range(len(prompts)))
-    seq_group_metadata_list = []
-    for prompt, sid in zip(prompts, seq_ids):
-        seq_group_metadata_list.extend(
-            create_chunked_seq_group_metadata_from_prompt(
-                prompt, num_gpu_blocks, prefill_chunk_size, block_size, sid))
-    seq_group_metadata_list = seq_group_metadata_list[:batch_size]
-    prev_output_tokens = []
-    return seq_group_metadata_list, prompts, prev_output_tokens