[Core] Deprecating block manager v1 and make block manager v2 default (#8704)

This removes block manager v1 and is the first step toward a prefix-caching-centric design. To achieve that design, we need to simplify the code path so that only the v2 block manager is used (it has much higher performance on prefix caching).

[Core] Deprecating block manager v1 and make block manager v2 default (#8704)
This removes block manager v1 and is the first step toward a prefix-caching-centric design. To achieve that design, we need to simplify the code path so that only the v2 block manager is used (it has much higher performance on prefix caching).
81ede99c · Kuntai Du · GitHub · 5eda21e7 · 81ede99c · 81ede99c
Unverified Commit 81ede99c authored Oct 17, 2024 by Kuntai Du Committed by GitHub Oct 17, 2024
20 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -77,8 +77,8 @@ steps:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill
  commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py

 - label: Core Test # 10min
  mirror_hardwares: [amd]
@@ -88,11 +88,7 @@ steps:
  - vllm/distributed
  - tests/core
  commands:
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1  pytest -v -s core/test_scheduler.py
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1  pytest -v -s core core/test_chunked_prefill_scheduler.py
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1  pytest -v -s core core/block/e2e/test_correctness.py
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1  pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
-  - pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
+  - pytest -v -s core

 - label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
@@ -192,8 +188,7 @@ steps:
  - vllm/
  - tests/prefix_caching
  commands:
-    - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
-    - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
+    - pytest -v -s prefix_caching

 - label: Samplers Test # 36min
  source_file_dependencies:
@@ -217,8 +212,7 @@ steps:
  - tests/spec_decode
  commands:
    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-    - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
-    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
+    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

 - label: LoRA Test %N # 15min each
  mirror_hardwares: [amd]
@@ -405,7 +399,7 @@ steps:
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1  pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus

--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -38,7 +38,6 @@ def main(args: argparse.Namespace):
        quantization_param_path=args.quantization_param_path,
        device=args.device,
        ray_workers_use_nsight=args.ray_workers_use_nsight,
-        use_v2_block_manager=args.use_v2_block_manager,
        enable_chunked_prefill=args.enable_chunked_prefill,
        download_dir=args.download_dir,
        block_size=args.block_size,
@@ -221,9 +220,6 @@ if __name__ == '__main__':
    parser.add_argument("--enable-prefix-caching",
                        action='store_true',
                        help="Enable automatic prefix caching")
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        default=EngineArgs.use_v2_block_manager)
    parser.add_argument(
        "--ray-workers-use-nsight",
        action='store_true',

--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -33,7 +33,6 @@ from typing import List, Optional, Tuple
 from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser

 try:
@@ -134,7 +133,6 @@ def main(args):
              tokenizer_mode='auto',
              trust_remote_code=True,
              enforce_eager=True,
-              use_v2_block_manager=args.use_v2_block_manager,
              tensor_parallel_size=args.tensor_parallel_size,
              enable_prefix_caching=args.enable_prefix_caching)

@@ -176,10 +174,6 @@ if __name__ == "__main__":
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        default=EngineArgs.use_v2_block_manager,
-                        help='Use BlockSpaceMangerV2')
    parser.add_argument('--num-prompts',
                        type=int,
                        default=1,

--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -86,7 +86,6 @@ def run_vllm(
    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    num_scheduler_steps: int = 1,
-    use_v2_block_manager: bool = False,
    download_dir: Optional[str] = None,
    load_format: str = EngineArgs.load_format,
    disable_async_output_proc: bool = False,
@@ -113,7 +112,6 @@ def run_vllm(
        distributed_executor_backend=distributed_executor_backend,
        load_format=load_format,
        num_scheduler_steps=num_scheduler_steps,
-        use_v2_block_manager=use_v2_block_manager,
        disable_async_output_proc=disable_async_output_proc,
    )

@@ -176,7 +174,6 @@ async def run_vllm_async(
    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    num_scheduler_steps: int = 1,
-    use_v2_block_manager: bool = False,
    download_dir: Optional[str] = None,
    load_format: str = EngineArgs.load_format,
    disable_async_output_proc: bool = False,
@@ -204,7 +201,6 @@ async def run_vllm_async(
        distributed_executor_backend=distributed_executor_backend,
        load_format=load_format,
        num_scheduler_steps=num_scheduler_steps,
-        use_v2_block_manager=use_v2_block_manager,
        disable_async_output_proc=disable_async_output_proc,
        worker_use_ray=False,
        disable_log_requests=True,
@@ -341,8 +337,7 @@ def main(args: argparse.Namespace):
            args.enable_prefix_caching, args.enable_chunked_prefill,
            args.max_num_batched_tokens, args.distributed_executor_backend,
            args.gpu_memory_utilization, args.num_scheduler_steps,
-            args.use_v2_block_manager, args.download_dir, args.load_format,
-            args.disable_async_output_proc
+            args.download_dir, args.load_format, args.disable_async_output_proc
        ]

        if args.async_engine:
@@ -471,10 +466,6 @@ if __name__ == "__main__":
        type=int,
        default=1,
        help="Maximum number of forward steps per scheduler call.")
-    parser.add_argument("--use-v2-block-manager",
-                        action='store_true',
-                        default=EngineArgs.use_v2_block_manager,
-                        help="Enable block manager v2.")
    parser.add_argument(
        "--enable-prefix-caching",
        action='store_true',

--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -16,7 +16,6 @@ def main(args):
        enforce_eager=True,
        enable_prefix_caching=True,
        tensor_parallel_size=args.tensor_parallel_size,
-        use_v2_block_manager=args.use_v2_block_manager,
    )

    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -56,8 +55,5 @@ if __name__ == "__main__":
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        help='Use BlockSpaceMangerV2')
    args = parser.parse_args()
    main(args)
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/models/spec_decode.rst
@@ -30,7 +30,6 @@ The following code configures vLLM in an offline mode to use speculative decodin
        tensor_parallel_size=1,
        speculative_model="facebook/opt-125m",
        num_speculative_tokens=5,
-        use_v2_block_manager=True,
    )
    outputs = llm.generate(prompts, sampling_params)

@@ -104,7 +103,6 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
        speculative_model="[ngram]",
        num_speculative_tokens=5,
        ngram_prompt_lookup_max=4,
-        use_v2_block_manager=True,
    )
    outputs = llm.generate(prompts, sampling_params)

@@ -135,7 +133,6 @@ For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-
        tensor_parallel_size=4,
        speculative_model="ibm-fms/llama3-70b-accelerator",
        speculative_draft_tensor_parallel_size=1,
-        use_v2_block_manager=True,
    )
    outputs = llm.generate(prompts, sampling_params)


--- a/examples/offline_inference_mlpspeculator.py
+++ b/examples/offline_inference_mlpspeculator.py
@@ -50,8 +50,6 @@ if __name__ == "__main__":
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
        speculative_model="ibm-fms/llama-13b-accelerator",
-        # These are currently required for MLPSpeculator decoding
-        use_v2_block_manager=True,
    )

    print("With speculation")

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -12,7 +12,7 @@ from contextlib import nullcontext
 import pytest

 from ..models.utils import check_logprobs_close, check_outputs_equal
-from ..utils import check_deprecated_block_manager_usage, multi_gpu_test
+from ..utils import multi_gpu_test

 MODELS = [
    "facebook/opt-125m",
@@ -20,12 +20,6 @@ MODELS = [
 ]


-@pytest.fixture(scope="module", autouse=True)
-def check_deprecated_block_manager():
-    check_deprecated_block_manager_usage(
-        'tests/basic_correctness/test_chunked_prefill.py')
-
-
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
@@ -197,7 +191,6 @@ def test_models_with_fp8_kv_cache(
 @pytest.mark.parametrize("max_tokens", [16])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("chunk_size", [30, 32])
-@pytest.mark.parametrize("use_v2_block_manager", [False, True])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
@@ -206,7 +199,6 @@ def test_with_prefix_caching(
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
-    use_v2_block_manager: bool,
    tensor_parallel_size: int,
 ) -> None:
    """
@@ -234,7 +226,6 @@ def test_with_prefix_caching(
                enable_chunked_prefill=True,
                enable_prefix_caching=enable,
                tensor_parallel_size=tensor_parallel_size,
-                use_v2_block_manager=use_v2_block_manager,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as vllm_model:

--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -2,18 +2,11 @@ from itertools import cycle

 import pytest

-from tests.utils import check_deprecated_block_manager_usage
 from vllm import SamplingParams

 from .conftest import get_token_ids_from_llm_generator


-@pytest.fixture(scope="module", autouse=True)
-def check_deprecated_block_manager():
-    check_deprecated_block_manager_usage(
-        'tests/core/block/e2e/test_correctness.py')
-
-
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
@@ -28,32 +21,32 @@ def check_deprecated_block_manager():
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
    "preemption_mode": "swap"
 }, {
-    "use_v2_block_manager": True,
    "preemption_mode": "recompute"
 }])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
-def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
-                                               test_llm_generator, batch_size):
-    """Verify block manager v2 produces same outputs as block manager v1, even
-    when there is preemption.
+def test_block_manager_with_preemption(baseline_llm_generator,
+                                       test_llm_generator, batch_size):
+    """Verify block manager produces same outputs even when there is preemption.

    This constructs two LLM, each with limited number of GPU blocks. The limit
    is decided such that as the sequences in the batch grow, sequences must be
    preempted and removed from cache.

    If the output token ids are equivalent, then we have confidence that the KV
-    cache is not corrupted in the v2 block manager.
+    cache is not corrupted.

    NOTE: We want a significant number of generated tokens so that any incorrect
    KV mapping has time to build up error.
+
+    NOTE(Kuntai): Though we have removed block manager v1, this test is still
+    useful as it asserts the behavior of block manager v2 (now it is called 
+    SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we  
+    keep this test.
    """
    output_len = 1024
    temperature = 0.0
@@ -77,11 +70,9 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
        temperature=temperature,
    )

-    print('Getting token ids from block manager v1')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

-    print('Getting token ids from block manager v2')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

@@ -104,9 +95,6 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,

        # skip cuda graph creation for fast test.
        "enforce_eager": True,
-
-        # Lookahead scheduling only supported in v2 block manager.
-        "use_v2_block_manager": True,
    }])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
@@ -218,26 +206,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
                             "max_num_seqs": 10,
                         }])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
-    {
-        "use_v2_block_manager": False,
-    },
+    {},
 ])
 @pytest.mark.parametrize("test_llm_kwargs", [
    {
-        "use_v2_block_manager": True,
        "num_lookahead_slots": 0,
    },
    {
-        "use_v2_block_manager": True,
        "num_lookahead_slots": 5,
    },
 ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
-def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
-                                          test_llm_generator, batch_size):
-    """Verify that chunked prefill works with BlockManagerV2, with and without
-    lookahead scheduling.
+def test_chunked_prefill_block_manager(baseline_llm_generator,
+                                       test_llm_generator, batch_size):
+    """Verify that chunked prefill works with SelfAttnBlockSpaceManager, 
+    with and without lookahead scheduling.
    """
    output_len = 32
    temperature = 0.0
@@ -258,11 +242,11 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
        temperature=temperature,
    )

-    print('Getting token ids with BlockManagerV1')
+    print('Getting token ids with BlockManager')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

-    print('Getting token ids with BlockManagerV2')
+    print('Getting token ids with BlockManager, with lookahead slots.')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

@@ -290,32 +274,32 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
        "enable_prefix_caching": True,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
    "preemption_mode": "swap"
 }, {
-    "use_v2_block_manager": True,
    "preemption_mode": "recompute"
 }])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
-def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
+def test_block_manager_prefix_caching_enabled_with_preemption(
        baseline_llm_generator, test_llm_generator, batch_size):
-    """Verify block manager v2 produces same outputs as block manager v1, even
-    when there is preemption.
+    """Verify block manager produces same outputs even when there is preemption.

    This constructs two LLM, each with limited number of GPU blocks. The limit
    is decided such that as the sequences in the batch grow, sequences must be
    preempted and removed from cache.

    If the output token ids are equivalent, then we have confidence that the KV
-    cache is not corrupted in the v2 block manager.
+    cache is not corrupted.

    NOTE: We want a significant number of generated tokens so that any incorrect
    KV mapping has time to build up error.
+
+    NOTE(Kuntai): Though we have removed block manager v1, this test is still
+    useful as it asserts the behavior of block manager v2 (now it is called 
+    SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we  
+    keep this test.
    """
    output_len = 1024
    temperature = 0.0
@@ -339,11 +323,11 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
        temperature=temperature,
    )

-    print('Getting token ids from block manager v1')
+    print('Getting token ids from block manager')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

-    print('Getting token ids from block manager v2')
+    print('Getting token ids from block manager, with preemption')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

@@ -366,9 +350,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
        # Allow only 5 sequences of ~1024 tokens in worst case.
        "block_size": 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
-
-        # Test APC in v2 block
-        "use_v2_block_manager": True,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
@@ -444,9 +425,6 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
        "max_model_len": 48,
        "block_size": 16,
        "num_gpu_blocks_override": 3,
-
-        # Test APC in v2 block
-        "use_v2_block_manager": True,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{

--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -3,7 +3,6 @@ from typing import List

 import pytest

-from tests.utils import check_deprecated_block_manager_usage
 from vllm import LLM, SamplingParams

 from .conftest import get_text_from_llm_generator
@@ -13,12 +12,6 @@ MODEL = "bigcode/starcoder2-3b"
 BLOCK_SIZE = 16


-@pytest.fixture(scope="module", autouse=True)
-def check_deprecated_block_manager():
-    check_deprecated_block_manager_usage(
-        'tests/core/block/e2e/test_correctness_sliding_window.py')
-
-
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
@@ -31,10 +24,8 @@ def check_deprecated_block_manager():
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
-@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
 def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
@@ -55,7 +46,6 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,

    prompts, answer, indices = prep_prompts(batch_size)

-    print('Getting token ids from block manager v1')
    baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
                                                 prompts,
                                                 sampling_params,
@@ -91,10 +81,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
-    "enable_chunked_prefill": True
-}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
 def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):

--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -2,7 +2,7 @@ import pytest

 from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
                                   STR_NOT_IMPL_ENC_DEC_SWA)
-from vllm.core.block_manager_v2 import BlockSpaceManagerV2
+from vllm.core.block_manager import SelfAttnBlockSpaceManager
 from vllm.core.interfaces import AllocStatus
 from vllm.sequence import Logprob, SequenceStatus
 from vllm.utils import chunk_list
@@ -17,7 +17,7 @@ from ..utils import (create_dummy_prompt, create_seq_group,
 @pytest.mark.parametrize("watermark", [0.0, 0.5])
 def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
                                num_gpu_blocks: int, watermark: float):
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
@@ -63,7 +63,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int,
                                                num_seqs_per_group: int,
                                                num_gpu_blocks: int,
                                                watermark: float):
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
@@ -117,16 +117,16 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
    '''
    SWA short for Sliding Window Attention.

-    At time of writing block manager v2 does not support SWA.
+    At time of writing block manager does not support SWA.

-    However even when SWA is implemented for block manager v2,
+    However even when SWA is implemented for block manager,
    there will still most likely be a separate workstream required
    to enable SWA for encoder/decoder models.

    Therefore this test enforces that one of the following cases
    hold true:
-    1. Block manager v2 does not support SWA at all (true at time of writing)
-    2. Block manager v2 fails with NotImplementError when SWA is enabled
+    1. Block manager does not support SWA at all (true at time of writing)
+    2. Block manager fails with NotImplementError when SWA is enabled
       AND a SequenceGroup with an encoder sequence (i.e. in support of an
       encoder/decoder model) is passed into can_allocate() as an argument

@@ -135,7 +135,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
    '''

    with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
-        block_manager = BlockSpaceManagerV2(
+        block_manager = SelfAttnBlockSpaceManager(
            block_size=block_size,
            num_gpu_blocks=num_gpu_blocks,
            num_cpu_blocks=1024,
@@ -158,7 +158,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
        block_manager.can_allocate(seq_group)

    # Assert that either
-    # 1. Block manager v2 constructor fails with assertion that sliding window
+    # 1. Block manager constructor fails with assertion that sliding window
    #    is not yet supported (most likely near-term outcome at time of
    #    writing), or
    # 2. can_allocate() fails with NotImplementedError due to combination of
@@ -177,7 +177,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
        block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
        watermark: float):

-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
@@ -217,7 +217,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,

    num_gpu_blocks = 1024
    watermark = 0.1
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
@@ -269,14 +269,15 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
    """Verify blocks number on src/desc device is correct after swapping in/out
        sequence group (not missing or extra blocks).
    """
-    block_manager = BlockSpaceManagerV2(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        enable_caching=enable_caching)
+    block_manager = SelfAttnBlockSpaceManager(block_size,
+                                              num_cpu_blocks,
+                                              num_gpu_blocks,
+                                              watermark=0,
+                                              enable_caching=enable_caching)
    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)
+
    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
@@ -321,11 +322,11 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
        can be swapped in/out.
    """
    num_cpu_blocks = num_gpu_blocks
-    block_manager = BlockSpaceManagerV2(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        enable_caching=enable_caching)
+    block_manager = SelfAttnBlockSpaceManager(block_size,
+                                              num_cpu_blocks,
+                                              num_gpu_blocks,
+                                              watermark=0,
+                                              enable_caching=enable_caching)
    prompt, seq_group = create_dummy_prompt(
        "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
    prompt.status = SequenceStatus.WAITING
@@ -382,11 +383,11 @@ def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
    block_size = 8
    num_cpu_blocks = 1
    num_gpu_blocks = 1
-    block_manager = BlockSpaceManagerV2(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        enable_caching=enable_caching)
+    block_manager = SelfAttnBlockSpaceManager(block_size,
+                                              num_cpu_blocks,
+                                              num_gpu_blocks,
+                                              watermark=0,
+                                              enable_caching=enable_caching)
    prompt_length = block_size - 3
    assert prompt_length > 0
    prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
@@ -434,7 +435,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,

    num_gpu_blocks = 1024
    watermark = 0.1
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
@@ -474,7 +475,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
    seq.data.update_num_computed_tokens(prompt_len)
    check_used(num_blocks(prompt_len))

-    # this is how we compute it in BlockSpaceManagerV2.__init__
+    # this is how we compute it in SelfAttnBlockSpaceManager.__init__
    sliding_blocks = (sliding_window // block_size) + 2
    # plus one block for null block
    sliding_blocks += 1

--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -8,7 +8,6 @@ from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler
 from vllm.sequence import Logprob, SequenceGroup

-from ..utils import check_deprecated_block_manager_usage
 from .utils import create_dummy_prompt


@@ -28,25 +27,16 @@ def schedule_and_update_computed_tokens(scheduler):
    return metas, out


-@pytest.fixture(scope="module", autouse=True)
-def check_deprecated_block_manager():
-    check_deprecated_block_manager_usage(
-        'tests/core/test_chunked_prefill_scheduler.py')
-
-
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_simple(use_v2_block_manager: bool):
+def test_simple():
    """Verify basic scheduling works."""
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        max_num_batched_tokens,
-        num_seq_group,
-        max_model_len,
-        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       num_seq_group,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -81,8 +71,7 @@ def test_simple(use_v2_block_manager: bool):
    assert len(seq_group_meta) == num_seq_group


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_chunk(use_v2_block_manager: bool):
+def test_chunk():
    """Verify prefills are chunked properly."""
    block_size = 4
    max_seqs = 60
@@ -93,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 32
    cache_config.num_gpu_blocks = 32
@@ -131,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool):
    assert out.num_batched_tokens == 57


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_complex(use_v2_block_manager: bool):
+def test_complex():
    block_size = 4
    max_seqs = 60
    max_model_len = 80
@@ -142,7 +130,7 @@ def test_complex(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 64
    cache_config.num_gpu_blocks = 64
@@ -201,8 +189,7 @@ def test_complex(use_v2_block_manager: bool):
    assert running[2].is_prefill()


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_maximal_decoding(use_v2_block_manager: bool):
+def test_maximal_decoding():
    """Verify decoding requests are prioritized."""
    block_size = 4
    max_seqs = 2
@@ -213,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -295,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
    assert out.num_batched_tokens == 2


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_prompt_limit(use_v2_block_manager: bool):
+def test_prompt_limit():
    """Verify max_num_batched_tokens < max_model_len is possible."""
    block_size = 4
    max_seqs = 32
@@ -307,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
@@ -330,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
    assert out.num_batched_tokens == 32


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_prompt_limit_exceed(use_v2_block_manager: bool):
+def test_prompt_limit_exceed():
    block_size = 4
    max_seqs = 64
    max_model_len = 32
@@ -356,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
    assert out.ignored_seq_groups[0] == seq_group


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_swap(use_v2_block_manager: bool):
+def test_swap():
    """Verify swapping works with chunked prefill requests"""
    block_size = 4
    max_seqs = 30
@@ -368,7 +352,7 @@ def test_swap(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
@@ -414,8 +398,7 @@ def test_swap(use_v2_block_manager: bool):
    assert out.blocks_to_swap_out == []


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
+def test_running_prefill_prioritized_over_swap():
    block_size = 4
    max_seqs = 30
    max_model_len = 200
@@ -425,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 32
    cache_config.num_gpu_blocks = 32
@@ -508,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
    assert out.blocks_to_swap_out == []


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_chunked_prefill_preempt(use_v2_block_manager: bool):
+def test_chunked_prefill_preempt():
    """Verify preempt works with chunked prefill requests"""
    block_size = 4
    max_seqs = 30
@@ -520,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
@@ -575,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
    assert out.num_batched_tokens == max_num_batched_tokens


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
+def test_chunked_prefill_max_seqs():
    block_size = 4
    max_seqs = 2
    max_model_len = 80
@@ -586,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 128
    cache_config.num_gpu_blocks = 128
@@ -629,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
    assert not running[1].is_prefill()


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_perfix_caching(use_v2_block_manager: bool):
+def test_perfix_caching():
    """Verify allocating full blocks when prefix caching is enabled."""
    block_size = 4
    max_seqs = 10
@@ -641,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size,
                               1.0,
                               1,

--- a/tests/core/test_num_computed_tokens_update.py
+++ b/tests/core/test_num_computed_tokens_update.py
@@ -31,7 +31,6 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
    # Make a vllm engine
    runner = VllmRunner(model_name=MODEL,
                        gpu_memory_utilization=0.7,
-                        use_v2_block_manager=True,
                        num_scheduler_steps=num_scheduler_steps,
                        enable_chunked_prefill=enable_chunked_prefill,
                        enforce_eager=enforce_eager)

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -3,7 +3,7 @@ from collections import deque
 from typing import List, Set, Tuple
 from unittest.mock import MagicMock

-import pytest
+import pytest  # noqa
 from torch import Use  # noqa

 from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
@@ -12,23 +12,18 @@ from vllm.core.scheduler import Scheduler, SchedulingBudget
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SequenceGroup, SequenceStatus

-from ..utils import check_deprecated_block_manager_usage
 from .utils import (append_new_token, append_new_token_seq_group,
                    create_dummy_prompt, get_sequence_groups,
                    schedule_and_update_computed_tokens)


-@pytest.fixture(scope="module", autouse=True)
-def check_deprecated_block_manager():
-    check_deprecated_block_manager_usage(
-        "tests/core/test_chunked_prefill_scheduler.py")
-
-
-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_scheduler_add_seq_group(use_v2_block_manager: bool):
+def test_scheduler_add_seq_group():
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100, 64, 1, use_v2_block_manager=use_v2_block_manager)
+        100,
+        64,
+        1,
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
    cache_config.num_cpu_blocks = 4
    cache_config.num_gpu_blocks = 4
@@ -44,11 +39,13 @@ def test_scheduler_add_seq_group(use_v2_block_manager: bool):
        assert scheduler.get_num_unfinished_seq_groups() == i + 1


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
+def test_scheduler_abort_seq_group():
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100, 64, 1, use_v2_block_manager=use_v2_block_manager)
+        100,
+        64,
+        1,
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 4
    cache_config.num_gpu_blocks = 4
@@ -68,8 +65,7 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
    assert scheduler.get_num_unfinished_seq_groups() == 0


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_scheduler_schedule_simple(use_v2_block_manager: bool):
+def test_scheduler_schedule_simple():
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
@@ -77,7 +73,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
        64,
        num_seq_group,
        max_model_len,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -112,8 +108,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
    append_new_token(out, 1)


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
+def test_scheduler_prefill_prioritized():
    """Verify running batched tokens are not applied to prefill requests."""
    block_size = 4
    max_model_len = 30
@@ -122,7 +117,7 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
        max_batched_num_tokens,
        2,
        max_model_len,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
@@ -146,12 +141,14 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
    assert get_sequence_groups(out) == [seq_group_b]


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
+def test_scheduler_schedule_preempt_abort():
    block_size = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(
-        64, 2, max_model_len, use_v2_block_manager=use_v2_block_manager)
+        64,
+        2,
+        max_model_len,
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 2
    cache_config.num_gpu_blocks = 2
@@ -201,8 +198,7 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
    assert scheduler.get_num_unfinished_seq_groups() == 1


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_scheduler_max_seqs(use_v2_block_manager: bool):
+def test_scheduler_max_seqs():
    block_size = 4
    num_seq_group = 4
    max_seq_group = 2
@@ -211,7 +207,7 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
        64,
        max_seq_group,
        max_model_len,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -249,15 +245,14 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
    assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_scheduler_delay_factor(use_v2_block_manager: bool):
+def test_scheduler_delay_factor():
    block_size = 4
    scheduler_config = SchedulerConfig(
        100,
        64,
        16,
        delay_factor=0.5,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
@@ -294,12 +289,10 @@ def test_scheduler_delay_factor(use_v2_block_manager: bool):
    append_new_token(out, 1)


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_swapped_out_prioritized(use_v2_block_manager: bool):
+def test_swapped_out_prioritized():
    block_size = 4
    scheduler = initialize_scheduler(max_num_seqs=6,
                                     block_size=block_size,
-                                     use_v2_block_manager=use_v2_block_manager,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    # best_of=2 * 3 == 6 sequences.
@@ -351,7 +344,6 @@ def initialize_scheduler(
    max_token_budget=1000,
    max_model_len=1000,
    lora_config=None,
-    use_v2_block_manager=False,
    block_size=4,
    num_cpu_blocks=8,
    num_gpu_blocks=8,
@@ -361,7 +353,7 @@ def initialize_scheduler(
        max_token_budget,
        max_num_seqs,
        max_model_len,
-        use_v2_block_manager=use_v2_block_manager)
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = num_cpu_blocks
    cache_config.num_gpu_blocks = num_gpu_blocks
@@ -386,15 +378,12 @@ def add_token_budget(budget: SchedulingBudget,
    budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
+def test_prefill_schedule_max_prompt_len():
    """
    Test prompt longer than max_prompt_len is aborted.
    """
    block_size = 4
-    scheduler = initialize_scheduler(max_model_len=30,
-                                     use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size)
+    scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
    _, seq_group = create_dummy_prompt("0",
                                       prompt_length=60,
                                       block_size=block_size)
@@ -409,14 +398,12 @@ def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
    assert len(remaining_waiting) == 0


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
+def test_prefill_schedule_token_budget():
    """
    Test token budget respected.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    budget = create_token_budget(token_budget=0)
@@ -446,8 +433,7 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
    assert len(remaining_waiting) == 1

    # Test when current_batched_tokens respected.
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=16,
                                     num_gpu_blocks=16)
    budget = create_token_budget(token_budget=60)
@@ -474,14 +460,12 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
    assert len(remaining_waiting) == 0


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
+def test_prefill_schedule_max_seqs():
    """
    Test max seq respected.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    budget = create_token_budget(max_num_seqs=2)
@@ -515,15 +499,13 @@ def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
    assert len(remaining_waiting) == 1


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
+def test_prefill_schedule_max_lora():
    """
    Test max lora is respected and prioritized.
    """
    block_size = 4
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config,
-                                     use_v2_block_manager=use_v2_block_manager,
                                     block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
@@ -570,14 +552,12 @@ def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
    assert budget.num_batched_tokens == 60


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
+def test_prefill_schedule_no_block_manager_capacity():
    """
    Test sequence cannot be scheduled due to block manager has no capacity.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_gpu_blocks=128,
                                     num_cpu_blocks=128)
    budget = create_token_budget()
@@ -614,14 +594,12 @@ def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
    assert len(remaining_waiting) == 0


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_decode_schedule_preempted(use_v2_block_manager: bool):
+def test_decode_schedule_preempted():
    """
    Test decodes cannot be scheduled and preempted.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    curr_loras = None
@@ -660,14 +638,12 @@ def test_decode_schedule_preempted(use_v2_block_manager: bool):
    assert output.blocks_to_copy == []


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_decode_swap_beam_search(use_v2_block_manager: bool):
+def test_decode_swap_beam_search():
    """
    Test best_of > 1 swap out blocks
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_gpu_blocks=64,
                                     num_cpu_blocks=64)
    curr_loras = None
@@ -716,14 +692,12 @@ def test_decode_swap_beam_search(use_v2_block_manager: bool):
    assert output.blocks_to_copy == []


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
+def test_schedule_decode_blocks_to_copy_update():
    """
    Verify blocks_to_copy is updated.
    """
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=4,
+    scheduler = initialize_scheduler(block_size=4,
                                     num_cpu_blocks=16,
                                     num_gpu_blocks=16)
    _, seq_group = create_dummy_prompt("1",
@@ -754,11 +728,9 @@ def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
    assert output.blocks_to_copy == [(2, 3)]


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_schedule_swapped_simple(use_v2_block_manager: bool):
+def test_schedule_swapped_simple():
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size)
+    scheduler = initialize_scheduler(block_size=block_size)
    curr_loras = None
    blocks_to_swap_out: List[Tuple[int, int]] = []
    _, seq_group = create_dummy_prompt("1",
@@ -785,11 +757,9 @@ def test_schedule_swapped_simple(use_v2_block_manager: bool):
    assert blocks_to_swap_out == blocks_to_swap_in_reverse


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
+def test_schedule_swapped_max_token_budget():
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
@@ -822,11 +792,9 @@ def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
    assert len(output.prefill_seq_groups) == 0


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
+def test_schedule_swapped_max_seqs():
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    curr_loras = None
@@ -859,12 +827,10 @@ def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
    assert len(output.prefill_seq_groups) == 0


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
+def test_schedule_swapped_max_loras():
    block_size = 4
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config,
-                                     use_v2_block_manager=use_v2_block_manager,
                                     block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
@@ -894,11 +860,9 @@ def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
    assert len(curr_loras) == 1


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
+def test_schedule_swapped_cannot_swap_in():
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
@@ -927,11 +891,9 @@ def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
    assert len(output.prefill_seq_groups) == 0


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_infeasible_swap(use_v2_block_manager: bool):
+def test_infeasible_swap():
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
@@ -961,11 +923,9 @@ def test_infeasible_swap(use_v2_block_manager: bool):
    assert len(output.prefill_seq_groups) == 0


-@pytest.mark.parametrize('use_v2_block_manager', [True, False])
-def test_schedule_swapped_blocks_to_copy(use_v2_block_manager: bool):
+def test_schedule_swapped_blocks_to_copy():
    block_size = 4
-    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
-                                     block_size=block_size,
+    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -185,13 +185,14 @@ def test_metric_spec_decode(
 ) -> None:
    k = 5

-    with vllm_runner(model,
-                     dtype=dtype,
-                     disable_log_stats=False,
-                     gpu_memory_utilization=0.4,
-                     speculative_model=model,
-                     num_speculative_tokens=k,
-                     use_v2_block_manager=True) as vllm_model:
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            disable_log_stats=False,
+            gpu_memory_utilization=0.4,
+            speculative_model=model,
+            num_speculative_tokens=k,
+    ) as vllm_model:

        # Force log interval to be 0 to catch all metrics.
        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
@@ -242,7 +243,6 @@ def test_metric_spec_decode_interval(
                             gpu_memory_utilization=0.4,
                             speculative_model=model,
                             num_speculative_tokens=k,
-                             use_v2_block_manager=True,
                             enforce_eager=True)

    engine = LLMEngine.from_engine_args(engine_args)

--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -17,7 +17,6 @@ NUM_PROMPTS = [10]

 DEFAULT_SERVER_ARGS: List[str] = [
    "--disable-log-requests",
-    "--use-v2-block-manager",
    "--worker-use-ray",
    "--gpu-memory-utilization",
    "0.85",

--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -76,7 +76,6 @@ def test_multi_step_llm(
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
-            use_v2_block_manager=True,
            enable_chunked_prefill=enable_chunked_prefill,
            num_scheduler_steps=num_scheduler_steps,
    ) as vllm_model:
@@ -169,7 +168,6 @@ def test_multi_step_llm_w_prompt_logprobs(
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
-            use_v2_block_manager=True,
            num_scheduler_steps=num_scheduler_steps,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
@@ -305,7 +303,6 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
-            use_v2_block_manager=True,
            num_scheduler_steps=num_scheduler_steps,
            max_model_len=48,
            max_num_batched_tokens=48,
@@ -324,7 +321,6 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
-            use_v2_block_manager=True,
            enable_chunked_prefill=True,
            enable_prefix_caching=True,
            num_scheduler_steps=num_scheduler_steps,

--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -2,15 +2,9 @@

 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 """
-from typing import List
-
 import pytest

 from tests.kernels.utils import override_backend_env_variable
-from tests.utils import check_deprecated_block_manager_usage
-from vllm.block import PhysicalTokenBlock
-from vllm.core.block_manager_v1 import CachedBlockAllocator
-from vllm.utils import Device

 from ..models.utils import check_outputs_equal

@@ -19,92 +13,11 @@ MODELS = [
 ]


-@pytest.fixture(scope="module", autouse=True)
-def check_deprecated_block_manager():
-    check_deprecated_block_manager_usage(
-        'tests/prefix_caching/test_prefix_caching.py')
-
-
-@pytest.mark.parametrize("block_size", [16])
-@pytest.mark.parametrize("num_blocks", [16])
-def test_block_allocator(
-    block_size: int,
-    num_blocks: int,
-):
-    block_hash = 1
-    block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
-
-    # Allocate two PysicalTokenBlocks with the same hash and check
-    # that they are the same PhysicalTokenBlock
-    first_block = block_allocator.allocate(block_hash, 0)
-    second_block = block_allocator.allocate(block_hash, 0)
-    assert (first_block == second_block)
-    assert (second_block.ref_count == 2)
-
-    # Check metric: 1 hit of 2 queries
-    assert block_allocator.get_prefix_cache_hit_rate() == 0.5
-
-    # Free the first_block and confirm that the ref_count is correctly
-    # decremented on the second block
-    block_allocator.free(first_block)
-    assert (second_block.ref_count == 1)
-
-    # Free the second block
-    block_allocator.free(second_block)
-
-    # Reallocate the first block and confirm that, even after the block
-    # had its ref_count go to 0, we still get the same block back
-    first_block = block_allocator.allocate(block_hash, 0)
-    assert (first_block == second_block)
-    assert (first_block.block_hash == block_hash)
-
-    # Allocate one more time to get 3/4 hit rate for easy checking
-    block_allocator.allocate(block_hash, 0)
-    assert block_allocator.get_prefix_cache_hit_rate() == 0.75
-
-
-@pytest.mark.parametrize("num_blocks", [16])
-def test_eviction(num_blocks: int, ):
-    block_size = 16
-    block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
-    blocks: List[PhysicalTokenBlock] = []
-
-    for i in range(num_blocks):
-        # use i as the block_hash
-        blocks.append(block_allocator.allocate(i, 0))
-
-    #Free all blocks
-    for block in blocks:
-        block_allocator.free(block)
-
-    # Allocate a new block and confirm that it's the first block freed.
-    # I.E The Least Recently Used block
-    new_block_hash = block_size
-    new_block = block_allocator.allocate(new_block_hash, 0)
-    assert (new_block == blocks[0])
-    assert (new_block.block_hash == new_block_hash)
-
-    # Reallocate the second in blocks to remove it from the free list
-    realloc_block_hash = 1
-    realloc_block = block_allocator.allocate(realloc_block_hash, 0)
-    assert (realloc_block == blocks[realloc_block_hash])
-    assert (realloc_block.block_hash == realloc_block_hash)
-
-    # Allocate a new block and confirm that it's not the realloc_block,
-    # since the realloc_block shouldn't be in the free list
-    new_block_hash = block_size + 1
-    new_block = block_allocator.allocate(new_block_hash, 0)
-    assert (realloc_block != new_block)
-    assert (new_block.block_hash == new_block_hash)
-    assert (new_block.block_number == 2)
-
-
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("cached_position", [0, 1])
-@pytest.mark.parametrize("use_v2_block_manager", [False, True])
 def test_mixed_requests(
    hf_runner,
    vllm_runner,
@@ -114,7 +27,6 @@ def test_mixed_requests(
    dtype: str,
    max_tokens: int,
    cached_position: int,
-    use_v2_block_manager: bool,
    monkeypatch,
 ) -> None:
    """
@@ -132,7 +44,6 @@ def test_mixed_requests(
            model,
            dtype=dtype,
            enable_prefix_caching=True,
-            use_v2_block_manager=use_v2_block_manager,
    ) as vllm_model:
        # Run the first prompt so the cache is populated
        vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)

--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
 import pytest

-from tests.utils import check_deprecated_block_manager_usage
 from vllm import SamplingParams

 from .conftest import get_output_from_llm_generator


-@pytest.fixture(scope="module", autouse=True)
-def check_deprecated_block_manager():
-    check_deprecated_block_manager_usage(
-        'tests/spec_decode/e2e/test_compatibility.py')
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model": "JackFram/llama-68m",
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-
-        # Required for spec decode.
-        "use_v2_block_manager": True
-    }])
+@pytest.mark.parametrize("common_llm_kwargs", [{
+    "model": "JackFram/llama-68m",
+    "speculative_model": "JackFram/llama-68m",
+    "num_speculative_tokens": 5,
+}])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [
    {
        "enable_chunked_prefill": True,
@@ -51,16 +39,11 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator):
                                      sampling_params)


-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model": "meta-llama/Llama-2-7b-chat-hf",
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-
-        # Required for spec decode.
-        "use_v2_block_manager": True
-    }])
+@pytest.mark.parametrize("common_llm_kwargs", [{
+    "model": "meta-llama/Llama-2-7b-chat-hf",
+    "speculative_model": "JackFram/llama-68m",
+    "num_speculative_tokens": 5,
+}])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
@@ -101,34 +84,3 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
    with pytest.raises(ValueError, match="cannot be larger than"):
        get_output_from_llm_generator(test_llm_generator, prompts,
                                      sampling_params)
-
-
-@pytest.mark.parametrize("common_llm_kwargs", [{
-    "model": "JackFram/llama-68m",
-    "speculative_model": "JackFram/llama-68m",
-    "num_speculative_tokens": 5,
-    "use_v2_block_manager": False,
-}])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_xfail_block_manager_v1(test_llm_generator):
-    """Verify that speculative decoding with block manager v1 fails.
-    """
-    output_len = 128
-    temperature = 0.0
-
-    prompts = [
-        "Hello, my name is",
-    ]
-
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-
-    with pytest.raises(ValueError,
-                       match="Speculative decoding requires usage of the V2"):
-        get_output_from_llm_generator(test_llm_generator, prompts,
-                                      sampling_params)