Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

6d2051cc · zhuwenwen · 2c7f740a · a2c71c54 · 6d2051cc · 6d2051cc
Commit 6d2051cc authored Oct 21, 2024 by zhuwenwen
20 changed files
--- a/requirements-common.txt
+++ b/requirements-common.txt
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0.0
-requests
+requests >= 2.26.0
 tqdm
 py-cpuinfo
-transformers >= 4.45.0  # Required for Llama 3.2.
+transformers >= 4.45.2  # Required for Llama 3.2 and Qwen2-VL.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
-fastapi < 0.113.0; python_version < '3.9'
-fastapi >= 0.114.1; python_version >= '3.9'
+fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
+fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
 aiohttp
 openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
 uvicorn[standard]
@@ -26,8 +26,9 @@ pyzmq
 msgspec
 gguf == 0.10.0
 importlib_metadata
-mistral_common >= 1.4.3
+mistral_common[opencv] >= 1.4.4
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
+compressed-tensors == 0.6.0 # required for compressed-tensors
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
 # Common dependencies
 -r requirements-common.txt

-# OpenVINO dependencies
-torch >= 2.1.2
-openvino ~= 2024.3.0
-optimum-intel[openvino] >= 1.18.2
+torch == 2.4.0 #  should be aligned with "common" vLLM torch version
+openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
+
+optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
+optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -10,18 +10,18 @@ pytest-shard
 awscli
 einops # required for MPT, qwen-vl and Mamba
 httpx
-librosa # required for audio test
-opencv-python # required for video test
+librosa # required for audio tests
+opencv-python # required for video tests
 peft
 requests
 ray[adag]==2.35
 sentence-transformers # required for embedding
 soundfile # required for audio test
-compressed-tensors==0.4.0 # required for compressed-tensors
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 datamodel_code_generator # required for minicpm3 test
+lm-eval[api]==0.4.4 # required for model evaluation test

 # TODO: Add this after fully implementing llava(mantis)
 # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
 # Common dependencies
 -r requirements-common.txt

-setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
-
 ray >= 2.9
+cmake>=3.26
+ninja
+packaging
+setuptools-scm>=8
+wheel
+jinja2
 # Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 torch == 2.3.1+cxx11.abi
 intel-extension-for-pytorch == 2.3.110+xpu

--- a/setup.py
+++ b/setup.py
@@ -339,7 +339,7 @@ def get_neuronxcc_version():
        # Return the version string
        return match.group(1)
    else:
-        raise RuntimeError("Could not find HIP version in the output")
+        raise RuntimeError("Could not find Neuron version in the output")


 def get_nvcc_cuda_version() -> Version:
@@ -397,9 +397,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
    
    new_version_content = f"""
 try:
-    __version__ = "0.6.2"
-    __version_tuple__ = (0, 6, 2)
-    __dcu_version__ = f'0.6.2+{version}' 
+    __version__ = "0.6.3.post1"
+    __version_tuple__ = (0, 6, 3)
+    __dcu_version__ = f'0.6.3.post1+{version}' 
    
    from vllm.version import __version__, __version_tuple__, __dcu_version__
 except Exception as e:
@@ -408,6 +408,8 @@ except Exception as e:
    warnings.warn(f"Failed to read commit hash:\\n + str(e)",
                  RuntimeWarning,
                  stacklevel=2)
+    __version__ = "dev"
+    __version_tuple__ = (0, 0, __version__)
 """
    
    with open(add_version_path, encoding="utf-8",mode="w") as file:
@@ -487,6 +489,8 @@ def get_requirements() -> List[str]:
        for line in requirements:
            if line.startswith("-r "):
                resolved_requirements += _read_requirements(line.split()[1])
+            elif line.startswith("--"):
+                continue
            else:
                resolved_requirements.append(line)
        return resolved_requirements
@@ -573,7 +577,11 @@ setup(
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "License :: OSI Approved :: Apache Software License",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Information Technology",
+        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Scientific/Engineering :: Information Analysis",
    ],
    packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
                                    "tests*")),
@@ -582,7 +590,6 @@ setup(
    ext_modules=ext_modules,
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
-        "video": ["opencv-python"],  # Required for video processing
        "audio": ["librosa", "soundfile"]  # Required for audio processing
    },
    cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -86,17 +86,19 @@ class MockAsyncLLMEngine(AsyncLLMEngine):

 @pytest.mark.asyncio
 async def test_new_requests_event():
+    params = SamplingParams()
+
    engine = MockAsyncLLMEngine()
    engine.start_background_loop()
    await asyncio.sleep(0.01)
    assert engine.engine.step_calls == 0

-    await engine.add_request("1", "", None)
+    await engine.add_request("1", "", params)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 1
    assert engine.engine.step_calls == 1

-    await engine.add_request("2", "", None)
+    await engine.add_request("2", "", params)
    engine.engine.generate("2")
    await asyncio.sleep(0)
    await asyncio.sleep(0)
@@ -111,7 +113,7 @@ async def test_new_requests_event():
    await asyncio.sleep(0.001)
    assert engine.engine.step_calls == old_step_calls

-    await engine.add_request("3", "", None)
+    await engine.add_request("3", "", params)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -191,7 +191,6 @@ def test_models_with_fp8_kv_cache(
 @pytest.mark.parametrize("max_tokens", [16])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("chunk_size", [30, 32])
-@pytest.mark.parametrize("use_v2_block_manager", [False, True])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
@@ -200,7 +199,6 @@ def test_with_prefix_caching(
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
-    use_v2_block_manager: bool,
    tensor_parallel_size: int,
 ) -> None:
    """
@@ -228,7 +226,6 @@ def test_with_prefix_caching(
                enable_chunked_prefill=True,
                enable_prefix_caching=enable,
                tensor_parallel_size=tensor_parallel_size,
-                use_v2_block_manager=use_v2_block_manager,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as vllm_model:

--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -23,9 +23,9 @@ MODELS = [
 @pytest.fixture(scope="module", autouse=True)
 def check_settings():
    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
-        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
-        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
-        "tests/basic_correctness/test_preemption.py`")
+        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1."
+        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
+        "pytest tests/basic_correctness/test_preemption.py`")


 @pytest.fixture
@@ -83,8 +83,7 @@ def test_chunked_prefill_recompute(


 @pytest.mark.parametrize("model", MODELS)
-# @pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [96])
 def test_preemption(
    caplog_vllm,
@@ -137,115 +136,7 @@ def test_preemption(


 @pytest.mark.parametrize("model", MODELS)
-# @pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [96])
-@pytest.mark.parametrize("beam_width", [4])
-def test_swap(
-    caplog_vllm,
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    beam_width: int,
-    worker_use_ray: bool,
-) -> None:
-    """Use beam search enables swapping."""
-    example_prompts = example_prompts[:1]
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            swap_space=10,
-            disable_log_stats=False,
-            worker_use_ray=worker_use_ray,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
-                                                       beam_width, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-                < ARTIFICIAL_PREEMPTION_MAX_CNT)
-        total_preemption = (
-            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
-
-    for i in range(len(example_prompts)):
-        hf_output_ids, _ = hf_outputs[i]
-        vllm_output_ids, _ = vllm_outputs[i]
-        assert len(hf_output_ids) == len(vllm_output_ids)
-        for j in range(len(hf_output_ids)):
-            assert hf_output_ids[j] == vllm_output_ids[j], (
-                f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
-                f"vLLM: {vllm_output_ids}")
-
-    assert ("is preempted by PreemptionMode.SWAP mode because there "
-            "is not enough KV cache space." in caplog_vllm.text)
-    # Ensure the count bucket of request-level histogram metrics matches
-    # the number of requests as a simple sanity check to ensure metrics are
-    # generated
-    preemption_metrics = None
-    for m in REGISTRY.collect():
-        if m.name == "vllm:num_preemptions":
-            preemption_metrics = m
-    assert preemption_metrics is not None
-    total_recorded_preemption = 0
-    for sample in preemption_metrics.samples:
-        total_recorded_preemption += sample.value
-    assert total_preemption == total_recorded_preemption
-
-
-@pytest.mark.parametrize("model", MODELS)
-# @pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [96])
-@pytest.mark.parametrize("beam_width", [4])
-def test_swap_infeasible(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    beam_width: int,
-    worker_use_ray: bool,
-) -> None:
-    """Verify infeasible swap request will be ignored."""
-    BLOCK_SIZE = 16
-    prefill_blocks = 2
-    decode_blocks = max_tokens // BLOCK_SIZE
-    example_prompts = example_prompts[:1]
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            swap_space=10,
-            block_size=BLOCK_SIZE,
-            # Since beam search have more than 1 sequence, prefill +
-            # decode blocks are not enough to finish.
-            num_gpu_blocks_override=prefill_blocks + decode_blocks,
-            max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
-            worker_use_ray=worker_use_ray,
-    ) as vllm_model:
-        sampling_params = SamplingParams(n=beam_width,
-                                         use_beam_search=True,
-                                         temperature=0.0,
-                                         max_tokens=max_tokens,
-                                         ignore_eos=True)
-        req_outputs = vllm_model.model.generate(
-            example_prompts,
-            sampling_params=sampling_params,
-        )
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-                < ARTIFICIAL_PREEMPTION_MAX_CNT)
-
-    # Verify the request is ignored and not hang.
-    assert req_outputs[0].outputs[0].finish_reason == "length"
-
-
-@pytest.mark.parametrize("model", MODELS)
-# @pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [96])
 def test_preemption_infeasible(
    vllm_runner,
@@ -284,4 +175,4 @@ def test_preemption_infeasible(
    for req_output in req_outputs:
        outputs = req_output.outputs
        assert len(outputs) == 1
-        assert outputs[0].finish_reason == "length"
+        assert outputs[0].finish_reason == "length"
\ No newline at end of file
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
+from typing import Dict, List, Optional
+
+import pytest
+
+from vllm.compilation.levels import CompilationLevel
+from vllm.utils import cuda_device_count_stateless
+
+from ..utils import compare_all_settings
+
+
+# we cannot afford testing the full Cartesian product
+# of all models and all levels
+@pytest.mark.parametrize(
+    "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
+    [
+        ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate",
+         True),
+        ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
+         ["--quantization", "compressed-tensors"
+          ], 1, 1, "FLASH_ATTN", "generate", True),
+        ("google/gemma-2-2b-it", [], 1, 2, "FLASHINFER", "generate", True),
+        # TODO: add multi-modality test for llava
+        ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False)
+    ])
+def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend,
+                             method, fullgraph):
+    # this test is run under multiple suites, with different GPUs.
+    # make sure we only run the test with correct CUDA devices.
+    # don't use "<", as it will duplicate the tests.
+    if cuda_device_count_stateless() != pp_size * tp_size:
+        pytest.skip("Not correct CUDA devices for the test.")
+    import os
+    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
+    if not fullgraph:
+        os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"
+    all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"]
+                + ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3
+    # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case
+    # inductor will change the output, so we cannot compare them.
+    all_envs: List[Optional[Dict[str, str]]] = [{
+        "VLLM_TORCH_COMPILE_LEVEL":
+        str(level)
+    } for level in [
+        CompilationLevel.NO_COMPILATION,
+        CompilationLevel.DYNAMO_AS_IS,
+        CompilationLevel.DYNAMO_ONCE,
+    ]]
+    compare_all_settings(model, all_args, all_envs, method=method)
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
 import pytest

-from vllm.compilation.backends import vllm_backend
+from vllm.compilation.levels import CompilationLevel

+from ..utils import fork_new_process_for_each_test
 from .utils import TEST_MODELS, check_full_graph_support


 @pytest.mark.parametrize("model_info", TEST_MODELS)
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-def test_full_graph(model_info, backend):
+@pytest.mark.parametrize(
+    "optimization_level",
+    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR])
+@fork_new_process_for_each_test
+def test_full_graph(model_info, optimization_level):
    model = model_info[0]
    model_kwargs = model_info[1]
-    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
+    check_full_graph_support(model,
+                             model_kwargs,
+                             optimization_level,
+                             tp_size=1)
--- a/tests/compile/test_full_graph_multi_gpu.py
+++ b/tests/compile/test_full_graph_multi_gpu.py
-import pytest
-
-from vllm.compilation.backends import vllm_backend
-from vllm.utils import cuda_device_count_stateless
-
-from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS_SMOKE, check_full_graph_support
-
-
-@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
-@pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-@fork_new_process_for_each_test
-def test_full_graph_multi_gpu(model_info, tp_size, backend):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-
-    # Skip the test if there are not enough CUDA devices.
-    if cuda_device_count_stateless() < tp_size:
-        pytest.skip("Not enough CUDA devices for the test.")
-
-    check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size)
--- a/tests/compile/test_full_graph_smoke.py
+++ b/tests/compile/test_full_graph_smoke.py
-import pytest
-
-from vllm.compilation.backends import vllm_backend
-
-from .utils import TEST_MODELS_SMOKE, check_full_graph_support
-
-
-@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-def test_full_graph(model_info, backend):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -4,16 +4,9 @@ import torch

 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.plugins import set_torch_compile_backend
+from vllm.compilation.levels import CompilationLevel
 from vllm.utils import is_hip

-TEST_MODELS_SMOKE = [
-    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
-        "quantization": "compressed-tensors"
-    }),
-    ("meta-llama/Meta-Llama-3-8B", {}),
-]
-
 TEST_MODELS = [
    ("facebook/opt-125m", {}),
    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
@@ -68,20 +61,21 @@ if not is_hip() and is_quant_method_supported("awq"):
    }))


-def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
+def check_full_graph_support(model,
+                             model_kwargs,
+                             optimization_level,
+                             tp_size=1):
    # make sure these models can be captured in full graph mode
-    if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ:
-        os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
-        os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
+    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
+    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

    # Inductor doesn't support fp8/gptq_marlin_24 yet.
    quantization = model_kwargs.get("quantization")
    if (quantization == "fp8" or quantization == "gptq_marlin"
-            or quantization == "gptq_marlin_24") and backend != "eager":
+            or quantization == "gptq_marlin_24"
+        ) and optimization_level >= CompilationLevel.INDUCTOR:
        return

-    set_torch_compile_backend(backend)
-
    prompts = [
        "Hello, my name is",
        "The president of the United States is",

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -35,6 +35,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
                        identity, is_cpu)

@@ -246,17 +247,14 @@ _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)

 class HfRunner:

-    def wrap_device(self, input: _T) -> _T:
-        if not is_cpu():
-            # Check if the input is already on the GPU
-            if hasattr(input, 'device') and input.device.type == "cuda":
-                return input  # Already on GPU, no need to move
-            return input.to("cuda")
-        else:
-            # Check if the input is already on the CPU
-            if hasattr(input, 'device') and input.device.type == "cpu":
-                return input  # Already on CPU, no need to move
-            return input.to("cpu")
+    def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
+        if device is None:
+            return self.wrap_device(input, "cpu" if is_cpu() else "cuda")
+
+        if hasattr(input, "device") and input.device.type == device:
+            return input
+
+        return input.to(device)

    def __init__(
        self,
@@ -264,7 +262,7 @@ class HfRunner:
        dtype: str = "half",
        *,
        model_kwargs: Optional[Dict[str, Any]] = None,
-        is_embedding_model: bool = False,
+        is_sentence_transformer: bool = False,
        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
        postprocess_inputs: Callable[[BatchEncoding],
                                     BatchEncoding] = identity,
@@ -273,13 +271,14 @@ class HfRunner:

        self.model_name = model_name

-        if is_embedding_model:
+        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer
            self.model = self.wrap_device(
                SentenceTransformer(
                    model_name,
                    device="cpu",
+                    trust_remote_code=True,
                ).to(dtype=torch_dtype))
        else:
            model_kwargs = model_kwargs if model_kwargs is not None else {}
@@ -308,17 +307,23 @@ class HfRunner:

        self.postprocess_inputs = postprocess_inputs

-    def generate(
+    def get_inputs(
        self,
        prompts: List[str],
        images: Optional[PromptImageInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
-        **kwargs: Any,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
-        if images:
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
+    ) -> List[BatchEncoding]:
+        if images is not None:
            assert len(prompts) == len(images)

-        outputs: List[Tuple[List[List[int]], List[str]]] = []
+        if videos is not None:
+            assert len(prompts) == len(videos)
+
+        if audios is not None:
+            assert len(prompts) == len(audios)
+
+        all_inputs: List[BatchEncoding] = []
        for i, prompt in enumerate(prompts):
            processor_kwargs: Dict[str, Any] = {
                "text": prompt,
@@ -328,12 +333,35 @@ class HfRunner:
                processor_kwargs["images"] = images[i]
            if videos is not None and videos[i] is not None:
                processor_kwargs["videos"] = videos[i]
+            if audios is not None and audios[i] is not None:
+                audio, sr = audios[i]
+                processor_kwargs["audio"] = audio
+                processor_kwargs["sampling_rate"] = sr

            inputs = self.processor(**processor_kwargs)
            inputs = self.postprocess_inputs(inputs)

+            all_inputs.append(inputs)
+
+        return all_inputs
+
+    def generate(
+        self,
+        prompts: List[str],
+        images: Optional[PromptImageInput] = None,
+        videos: Optional[List[np.ndarray]] = None,
+        audios: Optional[PromptAudioInput] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[List[List[int]], List[str]]]:
+        all_inputs = self.get_inputs(prompts,
+                                     images=images,
+                                     videos=videos,
+                                     audios=audios)
+
+        outputs: List[Tuple[List[List[int]], List[str]]] = []
+        for inputs in all_inputs:
            output_ids = self.model.generate(
-                **self.wrap_device(inputs),
+                **self.wrap_device(inputs, device=self.model.device.type),
                use_cache=True,
                **kwargs,
            )
@@ -351,12 +379,16 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
+        videos: Optional[List[np.ndarray]] = None,
+        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                images=images,
+                                videos=videos,
+                                audios=audios,
                                **kwargs)

        return [(output_ids[0], output_str[0])
@@ -389,24 +421,18 @@ class HfRunner:
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[List[np.ndarray]] = None,
+        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[List[torch.Tensor]]:
-        all_logprobs: List[List[torch.Tensor]] = []
-        for i, prompt in enumerate(prompts):
-            processor_kwargs: Dict[str, Any] = {
-                "text": prompt,
-                "return_tensors": "pt",
-            }
-            if images is not None and images[i] is not None:
-                processor_kwargs["images"] = images[i]
-            if videos is not None and videos[i] is not None:
-                processor_kwargs["videos"] = videos[i]
-
-            inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs)
+        all_inputs = self.get_inputs(prompts,
+                                     images=images,
+                                     videos=videos,
+                                     audios=audios)

+        all_logprobs: List[List[torch.Tensor]] = []
+        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs),
+                **self.wrap_device(inputs, device=self.model.device.type),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@@ -414,40 +440,39 @@ class HfRunner:
                return_dict_in_generate=True,
                **kwargs,
            )
-            seq_logprobs: List[torch.Tensor] = []
-            for hidden_states in output.hidden_states:
-                last_hidden_states = hidden_states[-1][0]
-                logits = torch.matmul(
-                    last_hidden_states,
-                    self.model.get_output_embeddings().weight.t(),
-                )
-                if self.model.get_output_embeddings().bias is not None:
-                    logits += self.model.get_output_embeddings(
-                    ).bias.unsqueeze(0)
-                logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
-                seq_logprobs.append(logprobs)
+            seq_logprobs = self._hidden_states_to_seq_logprobs(
+                output.hidden_states)
            all_logprobs.append(seq_logprobs)
        return all_logprobs

-    def _hidden_states_to_logprobs(
+    def _hidden_states_to_seq_logprobs(
        self,
-        hidden_states,
-        num_logprobs,
-    ) -> Tuple[List[Dict[int, float]], int]:
+        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
+    ) -> List[torch.Tensor]:
+        output_embeddings = self.model.get_output_embeddings()
+
        seq_logprobs: List[torch.Tensor] = []
-        output_len = len(hidden_states)
        for _, hidden_state in enumerate(hidden_states):
            last_hidden_states = hidden_state[-1][0]
            logits = torch.matmul(
-                last_hidden_states,
-                self.model.get_output_embeddings().weight.t(),
+                last_hidden_states.to(output_embeddings.weight.device),
+                output_embeddings.weight.t(),
            )
-            if getattr(self.model.get_output_embeddings(), "bias",
-                       None) is not None:
-                logits += self.model.get_output_embeddings().bias.unsqueeze(0)
+            if getattr(output_embeddings, "bias", None) is not None:
+                logits += output_embeddings.bias.unsqueeze(0)
            logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
            seq_logprobs.append(logprobs)

+        return seq_logprobs
+
+    def _hidden_states_to_logprobs(
+        self,
+        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
+        num_logprobs: int,
+    ) -> Tuple[List[Dict[int, float]], int]:
+        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
+        output_len = len(hidden_states)
+
        # convert to dict
        seq_logprobs_lst: List[Dict[int, float]] = []
        for tok_idx, tok_logprobs in enumerate(seq_logprobs):
@@ -477,30 +502,18 @@ class HfRunner:
        videos: Optional[List[np.ndarray]] = None,
        **kwargs: Any,
    ) -> List[TokensTextLogprobs]:
+        all_inputs = self.get_inputs(prompts,
+                                     images=images,
+                                     videos=videos,
+                                     audios=audios)
+
        all_logprobs: List[List[Dict[int, float]]] = []
        all_output_ids: List[List[int]] = []
        all_output_strs: List[str] = []

-        for i, prompt in enumerate(prompts):
-            processor_kwargs: Dict[str, Any] = {
-                "text": prompt,
-                "return_tensors": "pt",
-            }
-            if images is not None and images[i] is not None:
-                processor_kwargs["images"] = images[i]
-
-            if audios is not None:
-                audio, sr = audios[i]
-                processor_kwargs["audio"] = audio
-                processor_kwargs["sampling_rate"] = sr
-
-            if videos is not None:
-                processor_kwargs["videos"] = videos[i]
-            inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs)
-
+        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs),
+                **self.wrap_device(inputs, device=self.model.device.type),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@@ -543,12 +556,20 @@ class HfRunner:

        for (encoder_prompt,
             decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts):
+
            encoder_input_ids = self.wrap_device(
-                self.tokenizer(encoder_prompt, return_tensors="pt").input_ids)
-            decoder_input_ids = (
-                None if decoder_prompt is None else self.wrap_device(
+                self.tokenizer(encoder_prompt, return_tensors="pt").input_ids,
+                device=self.model.device.type,
+            )
+
+            if decoder_prompt is None:
+                decoder_input_ids = None
+            else:
+                decoder_input_ids = self.wrap_device(
                    self.tokenizer(decoder_prompt,
-                                   return_tensors="pt").input_ids))
+                                   return_tensors="pt").input_ids,
+                    device=self.model.device.type,
+                )

            output = self.model.generate(
                encoder_input_ids,
@@ -626,20 +647,50 @@ class VllmRunner:
            **kwargs,
        )

-    def generate(
+    def get_inputs(
        self,
        prompts: List[str],
-        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
+    ) -> List[TextPrompt]:
        if images is not None:
            assert len(prompts) == len(images)

+        if videos is not None:
+            assert len(prompts) == len(videos)
+
+        if audios is not None:
+            assert len(prompts) == len(audios)
+
        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
        if images is not None:
            for i, image in enumerate(images):
                inputs[i]["multi_modal_data"] = {"image": image}

+        if videos is not None:
+            for i, video in enumerate(videos):
+                inputs[i]["multi_modal_data"] = {"video": video}
+
+        if audios is not None:
+            for i, audio in enumerate(audios):
+                inputs[i]["multi_modal_data"] = {"audio": audio}
+
+        return inputs
+
+    def generate(
+        self,
+        prompts: List[str],
+        sampling_params: SamplingParams,
+        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
+    ) -> List[Tuple[List[List[int]], List[str]]]:
+        inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)
+
        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)

@@ -681,25 +732,10 @@ class VllmRunner:
        videos: Optional[PromptVideoInput] = None,
    ) -> Union[List[TokensTextLogprobs],
               List[TokensTextLogprobsPromptLogprobs]]:
-        if images is not None:
-            assert len(prompts) == len(images)
-
-        if videos is not None:
-            assert len(prompts) == len(videos)
-
-        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
-        if images is not None:
-            for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = {"image": image}
-
-        if audios is not None:
-            for i, audio in enumerate(audios):
-                inputs[i]["multi_modal_data"] = {"audio": audio}
-
-        if videos is not None:
-            for i, video in enumerate(videos):
-                inputs[i]["multi_modal_data"] = {"video": video}
-        print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
+        inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)
@@ -736,9 +772,15 @@ class VllmRunner:
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        outputs = self.generate(prompts, greedy_params, images=images)
+        outputs = self.generate(prompts,
+                                greedy_params,
+                                images=images,
+                                videos=videos,
+                                audios=audios)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

@@ -777,7 +819,6 @@ class VllmRunner:
               List[TokensTextLogprobsPromptLogprobs]]:
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
-            use_beam_search=False,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=(num_prompt_logprobs),
@@ -790,25 +831,14 @@ class VllmRunner:
            encoder_decoder_prompts, greedy_logprobs_params)

    def generate_beam_search(
-        self,
-        prompts: List[str],
-        beam_width: int,
-        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
-        beam_search_params = SamplingParams(n=beam_width,
-                                            use_beam_search=True,
-                                            temperature=0.0,
-                                            max_tokens=max_tokens)
-        outputs = self.generate(prompts, beam_search_params)
-        return outputs
-
-    def generate_beam_search_new(
        self,
        prompts: Union[List[str], List[List[int]]],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[List[int]], List[str]]]:
-        outputs = self.model.beam_search(prompts, beam_width, max_tokens)
+        outputs = self.model.beam_search(
+            prompts,
+            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
        returned_outputs = []
        for output in outputs:
            token_ids = [x.tokens for x in output.sequences]
@@ -876,15 +906,17 @@ def num_gpus_available():


 temp_dir = tempfile.gettempdir()
-_dummy_path = os.path.join(temp_dir, "dummy_opt")
+_dummy_opt_path = os.path.join(temp_dir, "dummy_opt")
+_dummy_llava_path = os.path.join(temp_dir, "dummy_llava")
+_dummy_gemma2_embedding_path = os.path.join(temp_dir, "dummy_gemma2_embedding")


 @pytest.fixture
 def dummy_opt_path():
-    json_path = os.path.join(_dummy_path, "config.json")
-    if not os.path.exists(_dummy_path):
+    json_path = os.path.join(_dummy_opt_path, "config.json")
+    if not os.path.exists(_dummy_opt_path):
        snapshot_download(repo_id="facebook/opt-125m",
-                          local_dir=_dummy_path,
+                          local_dir=_dummy_opt_path,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
@@ -895,4 +927,42 @@ def dummy_opt_path():
        config["architectures"] = ["MyOPTForCausalLM"]
        with open(json_path, "w") as f:
            json.dump(config, f)
-    return _dummy_path
+    return _dummy_opt_path
+
+
+@pytest.fixture
+def dummy_llava_path():
+    json_path = os.path.join(_dummy_llava_path, "config.json")
+    if not os.path.exists(_dummy_llava_path):
+        snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf",
+                          local_dir=_dummy_llava_path,
+                          ignore_patterns=[
+                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
+                              "*.msgpack"
+                          ])
+        assert os.path.exists(json_path)
+        with open(json_path, "r") as f:
+            config = json.load(f)
+        config["architectures"] = ["MyLlava"]
+        with open(json_path, "w") as f:
+            json.dump(config, f)
+    return _dummy_llava_path
+
+
+@pytest.fixture
+def dummy_gemma2_embedding_path():
+    json_path = os.path.join(_dummy_gemma2_embedding_path, "config.json")
+    if not os.path.exists(_dummy_gemma2_embedding_path):
+        snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
+                          local_dir=_dummy_gemma2_embedding_path,
+                          ignore_patterns=[
+                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
+                              "*.msgpack"
+                          ])
+        assert os.path.exists(json_path)
+        with open(json_path, "r") as f:
+            config = json.load(f)
+        config["architectures"] = ["MyGemma2Embedding"]
+        with open(json_path, "w") as f:
+            json.dump(config, f)
+    return _dummy_gemma2_embedding_path
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -21,32 +21,32 @@ from .conftest import get_token_ids_from_llm_generator
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
    "preemption_mode": "swap"
 }, {
-    "use_v2_block_manager": True,
    "preemption_mode": "recompute"
 }])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
-def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
-                                               test_llm_generator, batch_size):
-    """Verify block manager v2 produces same outputs as block manager v1, even
-    when there is preemption.
+def test_block_manager_with_preemption(baseline_llm_generator,
+                                       test_llm_generator, batch_size):
+    """Verify block manager produces same outputs even when there is preemption.

    This constructs two LLM, each with limited number of GPU blocks. The limit
    is decided such that as the sequences in the batch grow, sequences must be
    preempted and removed from cache.

    If the output token ids are equivalent, then we have confidence that the KV
-    cache is not corrupted in the v2 block manager.
+    cache is not corrupted.

    NOTE: We want a significant number of generated tokens so that any incorrect
    KV mapping has time to build up error.
+
+    NOTE(Kuntai): Though we have removed block manager v1, this test is still
+    useful as it asserts that the behavior of block manager v2 (now called
+    SelfAttnBlockSpaceManager) is the same under swapping / preemption, so we
+    keep this test.
    """
    output_len = 1024
    temperature = 0.0
@@ -70,78 +70,9 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
        temperature=temperature,
    )

-    print('Getting token ids from block manager v1')
-    baseline_token_ids = get_token_ids_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-
-    print('Getting token ids from block manager v2')
-    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-
-    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
-                                                    test_token_ids):
-        assert expected_token_ids == actual_token_ids
-
-    assert baseline_token_ids == test_token_ids
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-
-        # Use a large block size to trigger more copy-on-writes.
-        "block_size": 32,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
-    "preemption_mode": "swap"
-}, {
-    "use_v2_block_manager": True,
-    "preemption_mode": "recompute"
-}])
-@pytest.mark.parametrize("batch_size", [10])
-@pytest.mark.parametrize("seed", [1])
-def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
-                                        test_llm_generator, batch_size):
-    """Verify beam search equality with block manager v1 and v2.
-
-    This requires copy-on-writes; if the v1 and v2 output is the same, then
-    we have some confidence cow is working.
-    """
-    output_len = 128
-    temperature = 0.0
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-        use_beam_search=True,
-        best_of=2,
-    )
-
-    print('Getting token ids from block manager v1')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

-    print('Getting token ids from block manager v2')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

@@ -164,9 +95,6 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,

        # skip cuda graph creation for fast test.
        "enforce_eager": True,
-
-        # Lookahead scheduling only supported in v2 block manager.
-        "use_v2_block_manager": True,
    }])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
@@ -278,26 +206,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
                             "max_num_seqs": 10,
                         }])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
-    {
-        "use_v2_block_manager": False,
-    },
+    {},
 ])
 @pytest.mark.parametrize("test_llm_kwargs", [
    {
-        "use_v2_block_manager": True,
        "num_lookahead_slots": 0,
    },
    {
-        "use_v2_block_manager": True,
        "num_lookahead_slots": 5,
    },
 ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
-def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
-                                          test_llm_generator, batch_size):
-    """Verify that chunked prefill works with BlockManagerV2, with and without
-    lookahead scheduling.
+def test_chunked_prefill_block_manager(baseline_llm_generator,
+                                       test_llm_generator, batch_size):
+    """Verify that chunked prefill works with SelfAttnBlockSpaceManager, 
+    with and without lookahead scheduling.
    """
    output_len = 32
    temperature = 0.0
@@ -318,11 +242,11 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
        temperature=temperature,
    )

-    print('Getting token ids with BlockManagerV1')
+    print('Getting token ids with BlockManager')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

-    print('Getting token ids with BlockManagerV2')
+    print('Getting token ids with BlockManager, with lookahead slots.')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

@@ -350,32 +274,32 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
        "enable_prefix_caching": True,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
    "preemption_mode": "swap"
 }, {
-    "use_v2_block_manager": True,
    "preemption_mode": "recompute"
 }])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
-def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
+def test_block_manager_prefix_caching_enabled_with_preemption(
        baseline_llm_generator, test_llm_generator, batch_size):
-    """Verify block manager v2 produces same outputs as block manager v1, even
-    when there is preemption.
+    """Verify block manager produces same outputs even when there is preemption.

    This constructs two LLM, each with limited number of GPU blocks. The limit
    is decided such that as the sequences in the batch grow, sequences must be
    preempted and removed from cache.

    If the output token ids are equivalent, then we have confidence that the KV
-    cache is not corrupted in the v2 block manager.
+    cache is not corrupted.

    NOTE: We want a significant number of generated tokens so that any incorrect
    KV mapping has time to build up error.
+
+    NOTE(Kuntai): Though we have removed block manager v1, this test is still
+    useful as it asserts that the behavior of block manager v2 (now called
+    SelfAttnBlockSpaceManager) is the same under swapping / preemption, so we
+    keep this test.
    """
    output_len = 1024
    temperature = 0.0
@@ -399,11 +323,11 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
        temperature=temperature,
    )

-    print('Getting token ids from block manager v1')
+    print('Getting token ids from block manager')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

-    print('Getting token ids from block manager v2')
+    print('Getting token ids from block manager, with preemption')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

@@ -426,9 +350,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
        # Allow only 5 sequences of ~1024 tokens in worst case.
        "block_size": 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
-
-        # Test APC in v2 block
-        "use_v2_block_manager": True,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
@@ -504,9 +425,6 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
        "max_model_len": 48,
        "block_size": 16,
        "num_gpu_blocks_override": 3,
-
-        # Test APC in v2 block
-        "use_v2_block_manager": True,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{

--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -24,10 +24,8 @@ BLOCK_SIZE = 16
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
-@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
 def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
@@ -48,7 +46,6 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,

    prompts, answer, indices = prep_prompts(batch_size)

-    print('Getting token ids from block manager v1')
    baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
                                                 prompts,
                                                 sampling_params,
@@ -84,10 +81,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
-    "enable_chunked_prefill": True
-}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
 def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):

--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -2,7 +2,7 @@ import pytest

 from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
                                   STR_NOT_IMPL_ENC_DEC_SWA)
-from vllm.core.block_manager_v2 import BlockSpaceManagerV2
+from vllm.core.block_manager import SelfAttnBlockSpaceManager
 from vllm.core.interfaces import AllocStatus
 from vllm.sequence import Logprob, SequenceStatus
 from vllm.utils import chunk_list
@@ -17,7 +17,7 @@ from ..utils import (create_dummy_prompt, create_seq_group,
 @pytest.mark.parametrize("watermark", [0.0, 0.5])
 def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
                                num_gpu_blocks: int, watermark: float):
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
@@ -63,7 +63,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int,
                                                num_seqs_per_group: int,
                                                num_gpu_blocks: int,
                                                watermark: float):
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
@@ -117,16 +117,16 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
    '''
    SWA short for Sliding Window Attention.

-    At time of writing block manager v2 does not support SWA.
+    At time of writing block manager does not support SWA.

-    However even when SWA is implemented for block manager v2,
+    However even when SWA is implemented for block manager,
    there will still most likely be a separate workstream required
    to enable SWA for encoder/decoder models.

    Therefore this test enforces that one of the following cases
    hold true:
-    1. Block manager v2 does not support SWA at all (true at time of writing)
-    2. Block manager v2 fails with NotImplementError when SWA is enabled
+    1. Block manager does not support SWA at all (true at time of writing)
+    2. Block manager fails with NotImplementedError when SWA is enabled
       AND a SequenceGroup with an encoder sequence (i.e. in support of an
       encoder/decoder model) is passed into can_allocate() as an argument

@@ -135,7 +135,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
    '''

    with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
-        block_manager = BlockSpaceManagerV2(
+        block_manager = SelfAttnBlockSpaceManager(
            block_size=block_size,
            num_gpu_blocks=num_gpu_blocks,
            num_cpu_blocks=1024,
@@ -158,7 +158,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
        block_manager.can_allocate(seq_group)

    # Assert that either
-    # 1. Block manager v2 constructor fails with assertion that sliding window
+    # 1. Block manager constructor fails with assertion that sliding window
    #    is not yet supported (most likely near-term outcome at time of
    #    writing), or
    # 2. can_allocate() fails with NotImplementedError due to combination of
@@ -177,7 +177,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
        block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
        watermark: float):

-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
@@ -217,7 +217,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,

    num_gpu_blocks = 1024
    watermark = 0.1
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
@@ -269,14 +269,15 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
    """Verify blocks number on src/desc device is correct after swapping in/out
        sequence group (not missing or extra blocks).
    """
-    block_manager = BlockSpaceManagerV2(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        enable_caching=enable_caching)
+    block_manager = SelfAttnBlockSpaceManager(block_size,
+                                              num_cpu_blocks,
+                                              num_gpu_blocks,
+                                              watermark=0,
+                                              enable_caching=enable_caching)
    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)
+
    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
@@ -321,11 +322,11 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
        can be swapped in/out.
    """
    num_cpu_blocks = num_gpu_blocks
-    block_manager = BlockSpaceManagerV2(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        enable_caching=enable_caching)
+    block_manager = SelfAttnBlockSpaceManager(block_size,
+                                              num_cpu_blocks,
+                                              num_gpu_blocks,
+                                              watermark=0,
+                                              enable_caching=enable_caching)
    prompt, seq_group = create_dummy_prompt(
        "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
    prompt.status = SequenceStatus.WAITING
@@ -373,6 +374,52 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
            seq_group, num_lookahead_slots) == AllocStatus.NEVER


+@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
+@pytest.mark.parametrize("enable_caching", [False, True])
+def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
+    """Verifies that swapping fails if there are not enough free blocks
+    to account for unseen tokens and lookahead_slots.
+    """
+    block_size = 8
+    num_cpu_blocks = 1
+    num_gpu_blocks = 1
+    block_manager = SelfAttnBlockSpaceManager(block_size,
+                                              num_cpu_blocks,
+                                              num_gpu_blocks,
+                                              watermark=0,
+                                              enable_caching=enable_caching)
+    prompt_length = block_size - 3
+    assert prompt_length > 0
+    prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
+    prompt.status = SequenceStatus.WAITING
+    block_manager.allocate(seq_group)
+    # Emulate a forward pass by appending a single token.
+    # The block manager then knows how many unprocessed
+    # tokens will be written in the next forward pass.
+    token_id = 0
+    prompt.status = SequenceStatus.RUNNING
+    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
+
+    # Swap seq group from GPU -> CPU.
+    assert block_manager.can_swap_out(seq_group)
+    block_manager.swap_out(seq_group)
+    prompt.status = SequenceStatus.SWAPPED
+
+    # Swap seq group from CPU -> GPU.
+    # The number of unseen tokens is 1. If the number of existing
+    # tokens plus the unseen ones and number of lookahead slots exceeds
+    # the total number of available GPU blocks then the swap
+    # should fail.
+    num_unseen_tokens = 1
+    if (num_lookahead_slots + num_unseen_tokens +
+            prompt_length) <= (block_size * num_gpu_blocks):
+        assert block_manager.can_swap_in(seq_group,
+                                         num_lookahead_slots) == AllocStatus.OK
+    else:
+        assert block_manager.can_swap_in(
+            seq_group, num_lookahead_slots) == AllocStatus.NEVER
+
+
 # TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.


@@ -388,7 +435,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,

    num_gpu_blocks = 1024
    watermark = 0.1
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
@@ -400,7 +447,6 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
        if max_n is None:
            max_n = min_n
        used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
-        #print("check", min_n, used, max_n)
        assert min_n <= used
        assert used <= max_n

@@ -429,7 +475,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
    seq.data.update_num_computed_tokens(prompt_len)
    check_used(num_blocks(prompt_len))

-    # this is how we compute it in BlockSpaceManagerV2.__init__
+    # this is how we compute it in SelfAttnBlockSpaceManager.__init__
    sliding_blocks = (sliding_window // block_size) + 2
    # plus one block for null block
    sliding_blocks += 1

--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -104,9 +104,9 @@ class TestNaiveBlockAllocator:
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [4])
    @pytest.mark.parametrize("block_size", [8])
-    def test_naive_block_get_num_blocks_touched(num_blocks, block_size):
+    def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
        """ Verify the allocator can correctly return the number of
-        blocks touched, with different lookahead slots.
+        full blocks touched.
        """
        allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
                                            num_blocks=num_blocks,
@@ -124,7 +124,7 @@ class TestNaiveBlockAllocator:
        src_blocks = [allocate_block() for _ in range(num_blocks - 1)]

        # All blocks are cached
-        assert allocator_dst.get_num_blocks_touched(
+        assert allocator_dst.get_num_full_blocks_touched(
            src_blocks) == num_blocks - 1

        # Insert one non-full block in the src
@@ -136,9 +136,10 @@ class TestNaiveBlockAllocator:
        src_blocks.append(allocate_non_full_block())
        src_blocks[-1].append_token_ids([0])

-        assert allocator_dst.get_num_blocks_touched(
-            src_blocks, num_lookahead_slots=1) == num_blocks
-        assert allocator_dst.get_num_blocks_touched(
-            src_blocks, num_lookahead_slots=block_size - 1) == num_blocks
-        assert allocator_dst.get_num_blocks_touched(
-            src_blocks, num_lookahead_slots=block_size) == (num_blocks + 1)
+        assert allocator_dst.get_num_full_blocks_touched(
+            src_blocks) == num_blocks - 1
+        # Fill up the last source block and then invoke
+        # get_num_full_blocks_touched
+        src_blocks[-1].append_token_ids([0] * (block_size - 1))
+        assert allocator_dst.get_num_full_blocks_touched(
+            src_blocks) == num_blocks
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -318,11 +318,10 @@ class TestPrefixCachingBlockAllocator:
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [4])
    @pytest.mark.parametrize("block_size", [8])
-    def test_prefix_caching_block_get_num_blocks_touched(
+    def test_prefix_caching_block_get_num_full_blocks_touched(
            num_blocks, block_size):
        """ Verify the allocator can correctly return the number of
-        blocks touched, when there are cached prefixes and different
-        lookahead slots.
+        full blocks touched, when there are cached prefixes.
        """
        allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                    block_size=block_size)
@@ -346,28 +345,30 @@ class TestPrefixCachingBlockAllocator:
                token_ids=token_ids,
                allocator=allocator_src,
            )
-
        # All blocks are cached
-        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 0
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 0

        # Free the first block in the dst
        allocator_dst.free(cached_blocks[0])

        # Now the first block becomes dangling, the swapped blocks need
        # to reclaim the first block in the dst
-        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 1
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 1

        # Insert one non-full block in the src
        non_full_block = allocator_src.allocate_mutable_block(
            blocks_to_swap_in[-1])
        non_full_block.append_token_ids([0])
        blocks_to_swap_in.append(non_full_block)
-        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in,
-                                                    num_lookahead_slots=1) == 2
-        assert allocator_dst.get_num_blocks_touched(
-            blocks_to_swap_in, num_lookahead_slots=block_size - 1) == 2
-        assert allocator_dst.get_num_blocks_touched(
-            blocks_to_swap_in, num_lookahead_slots=block_size) == 3
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 1
+        # Fill the last mutable block and invoke get_num_full_blocks_touched.
+        # Note: The last block is not cached so it will be touched.
+        non_full_block.append_token_ids([0] * (block_size - 1))
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 2

    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1024])

--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
-import time
-from collections import defaultdict
-from typing import List
-
-import pytest
-
-from vllm import SamplingParams
-from vllm.block import PhysicalTokenBlock
-from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
-                                   STR_NOT_IMPL_ENC_DEC_SWA)
-from vllm.core.block_manager_v1 import (BlockSpaceManagerV1,
-                                        UncachedBlockAllocator)
-from vllm.core.interfaces import AllocStatus
-from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
-from vllm.utils import Device
-
-from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder
-
-
-def test_block_allocator_allocate():
-    block_size = 4
-    num_cpu_blocks = 4
-    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
-                                           num_cpu_blocks)
-
-    # Allocate all available cpu blocks.
-    num_free = num_cpu_blocks
-    assert cpu_allocator.get_num_free_blocks() == num_free
-    for _ in range(num_cpu_blocks):
-        block = cpu_allocator.allocate()
-        num_free -= 1
-
-        assert block not in cpu_allocator.free_blocks
-        assert cpu_allocator.get_num_free_blocks() == num_free
-
-    with pytest.raises(ValueError):
-        cpu_allocator.allocate()
-
-
-def test_block_allocator_free():
-    block_size = 4
-    num_cpu_blocks = 4
-    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
-                                           num_cpu_blocks)
-
-    # Allocate all available cpu blocks.
-    blocks: List[PhysicalTokenBlock] = []
-    for _ in range(num_cpu_blocks):
-        block = cpu_allocator.allocate()
-        blocks.append(block)
-        assert block not in cpu_allocator.free_blocks
-
-    # Free all allocated cpu blocks.
-    num_free = 0
-    assert cpu_allocator.get_num_free_blocks() == num_free
-    for block in blocks:
-        cpu_allocator.free(block)
-        num_free += 1
-        assert block in cpu_allocator.free_blocks
-        assert cpu_allocator.get_num_free_blocks() == num_free
-
-        with pytest.raises(ValueError):
-            cpu_allocator.free(block)
-
-
-def test_allocate():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate same sequence group to all available gpu blocks.
-    for i in range(num_gpu_blocks):
-        _, seq_group = create_dummy_prompt(str(i), block_size)
-        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
-        block_manager.allocate(seq_group)
-    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
-
-    # Allocate same sequence group to all available gpu blocks.
-    # Use watermark to reserve one gpu block.
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=1 / num_gpu_blocks)
-    for i in range(num_gpu_blocks - 1):
-        _, seq_group = create_dummy_prompt(str(i), block_size)
-        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
-        block_manager.allocate(seq_group)
-    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
-
-
-def test_allocate_encoder_decoder():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_req_per_seq_group = 2
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate same sequence group to all available gpu blocks.
-    for i in range(num_gpu_blocks // block_req_per_seq_group):
-        _, _, seq_group = create_dummy_prompt_encoder_decoder(
-            str(i),
-            decoder_prompt_length=block_size,
-            encoder_prompt_length=block_size)
-        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
-        block_manager.allocate(seq_group)
-    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
-
-    # Allocate same sequence group to all available gpu blocks.
-    # Use watermark to reserve one gpu block.
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=1 / num_gpu_blocks)
-    for i in range((num_gpu_blocks - 1) // block_req_per_seq_group):
-        _, _, seq_group = create_dummy_prompt_encoder_decoder(
-            str(i),
-            decoder_prompt_length=block_size,
-            encoder_prompt_length=block_size)
-        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
-        block_manager.allocate(seq_group)
-    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
-
-
-def test_allocate_encoder_decoder_fails_with_swa():
-    # SWA short for sliding window attention
-
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        sliding_window=5)  # swa
-
-    # Allocate same sequence group to all available gpu blocks.
-    _, _, seq_group = create_dummy_prompt_encoder_decoder(
-        "0",
-        decoder_prompt_length=block_size,
-        encoder_prompt_length=block_size)
-
-    # Assert that can_allocate() fails due to SWA
-    with pytest.raises(NotImplementedError) as exc_info:
-        block_manager.can_allocate(seq_group)
-
-    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
-
-    # Assert that allocate() fails due to SWA
-    with pytest.raises(NotImplementedError) as exc_info:
-        block_manager.allocate(seq_group)
-
-    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
-
-
-def test_allocate_encoder_decoder_fails_with_prefix_caching():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        enable_caching=True)  # Prefix cache
-
-    # Allocate same sequence group to all available gpu blocks.
-    _, _, seq_group = create_dummy_prompt_encoder_decoder(
-        "0",
-        decoder_prompt_length=block_size,
-        encoder_prompt_length=block_size)
-
-    # Assert that can_allocate() fails due to prefix caching
-    with pytest.raises(NotImplementedError) as exc_info:
-        block_manager.can_allocate(seq_group)
-
-    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
-
-    # Assert that allocate() fails due to prefix caching
-    with pytest.raises(NotImplementedError) as exc_info:
-        block_manager.allocate(seq_group)
-
-    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
-
-
-def test_append_slot_single_seq():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate single seq to gpu block.
-    prompt, seq_group = create_dummy_prompt("1", block_size)
-    block_manager.allocate(seq_group)
-
-    # Nothing to append. Sequence has no new logical blocks.
-    assert block_manager.can_append_slots(seq_group)
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-    assert not block_manager.append_slots(prompt)
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_blocks == after_blocks
-
-    # Add block_size number of new tokens and append slot.
-    for i in range(block_size):
-        token_id = i + 5
-        prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-
-    assert block_manager.can_append_slots(seq_group)
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-    assert not block_manager.append_slots(prompt)
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_blocks - after_blocks == 1
-
-
-def test_append_slot_cow():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size=block_size,
-                                        num_cpu_blocks=num_cpu_blocks,
-                                        num_gpu_blocks=num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate prompt to gpu block. There is one slot left in the block.
-    prompt = Sequence(seq_id=1,
-                      inputs={
-                          "prompt": "one two three",
-                          "prompt_token_ids": [1, 2, 3],
-                      },
-                      block_size=block_size)
-
-    # Fork the sequence, such that a COW will be required when we append a new
-    # token id.
-    child = prompt.fork(new_seq_id=2)
-
-    # Allocate space for the sequence group.
-    seq_group = SequenceGroup(request_id="1",
-                              seqs=[prompt, child],
-                              arrival_time=time.time(),
-                              sampling_params=SamplingParams())
-    block_manager.allocate(seq_group)
-
-    # Fork and append a new token id. We expect a COW to be scheduled.
-    token_id = 4
-    child.append_token_id(token_id, {token_id: Logprob(0.0)})
-    block_manager.fork(prompt, child)
-
-    assert block_manager.can_append_slots(seq_group)
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-
-    cows = block_manager.append_slots(child)
-    assert cows
-    dict_cows = defaultdict(list)
-    for src_block, dst_block in cows:
-        dict_cows[src_block].append(dst_block)
-    for src_block, dst_blocks in dict_cows.items():
-        assert src_block not in dst_blocks
-
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_blocks - after_blocks == 1
-
-
-def test_fork():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    prompt, seq_group = create_dummy_prompt("1",
-                                            block_size - 1,
-                                            block_size=block_size)
-    block_manager.allocate(seq_group)
-
-    # Fork prompt and copy block tables.
-    child = prompt.fork(2)
-    block_manager.fork(prompt, child)
-    assert block_manager.get_block_table(
-        prompt) == block_manager.get_block_table(child)
-    token_id = 4
-    # Append token to child. Block is shared so copy on write occurs.
-    child.append_token_id(token_id, {token_id: Logprob(0.0)})
-    block_manager.append_slots(child)
-    assert block_manager.get_block_table(
-        prompt) != block_manager.get_block_table(child)
-
-
-def test_swap():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
-    prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-
-    # Emulate a forward pass by appending a single token.
-    # The block manager then knows how many unprocessed
-    # tokens will be written in the next forward pass.
-    token_id = 0
-    prompt.status = SequenceStatus.RUNNING
-    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-
-    # Swap seq group from GPU -> CPU.
-    gpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_out(seq_group)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_out(seq_group)
-    assert [x[0] for x in mapping] == gpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
-    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
-    prompt.status = SequenceStatus.SWAPPED
-
-    # Swap seq group from CPU -> GPU.
-    cpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_in(seq_group)
-    assert [x[0] for x in mapping] == cpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
-    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
-
-
-def test_swap_encoder_decoder():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    decoder_prompt, encoder_prompt, seq_group = \
-        create_dummy_prompt_encoder_decoder(
-        "1",
-        decoder_prompt_length=block_size,
-        encoder_prompt_length=block_size)
-    decoder_prompt.status = SequenceStatus.WAITING
-    encoder_prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-
-    # Emulate a forward pass by appending a single token.
-    # The block manager then knows how many unprocessed
-    # tokens will be written in the next forward pass.
-    token_id = 0
-    decoder_prompt.status = SequenceStatus.RUNNING
-    decoder_prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-
-    # Swap encoder/decoder seq group from GPU -> CPU.
-    decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt)
-    cross_gpu_blocks = block_manager.get_cross_block_table(seq_group)
-    gpu_blocks = decoder_gpu_blocks + cross_gpu_blocks
-    assert block_manager.can_swap_out(seq_group)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_out(seq_group)
-    assert [x[0] for x in mapping] == gpu_blocks
-    #assert list(mapping.keys()) == gpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
-    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
-    decoder_prompt.status = SequenceStatus.SWAPPED
-
-    # Swap encoder/decoder seq group from CPU -> GPU.
-    decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt)
-    cross_cpu_blocks = block_manager.get_cross_block_table(seq_group)
-    cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks
-    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_in(seq_group)
-    assert [x[0] for x in mapping] == cpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
-    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
-
-
-def test_free():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    prompt, seq_group = create_dummy_prompt("1", block_size)
-    block_manager.allocate(seq_group)
-
-    # Free allocated seq.
-    prompt_blocks = len(block_manager.get_block_table(prompt))
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-    block_manager.free(prompt)
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert after_blocks == before_blocks + prompt_blocks
-
-    # Block table for freed seq is deleted.
-    with pytest.raises(KeyError):
-        block_manager.get_block_table(prompt)
-
-
-def test_free_encoder_decoder():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    decoder_prompt, encoder_prompt, seq_group = \
-        create_dummy_prompt_encoder_decoder(
-        "1",
-        decoder_prompt_length=block_size,
-        encoder_prompt_length=block_size)
-    block_manager.allocate(seq_group)
-
-    # Free allocated seq.
-    decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt))
-    encoder_prompt_blocks = len(block_manager.get_cross_block_table(seq_group))
-    prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-    block_manager.free(decoder_prompt)
-    block_manager.free_cross(seq_group)
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert after_blocks == before_blocks + prompt_blocks
-
-    # Block table for freed encoder & decoder seq's are deleted.
-    with pytest.raises(KeyError):
-        block_manager.get_block_table(decoder_prompt)
-
-    # Block table for freed encoder & decoder seq's are deleted.
-    with pytest.raises(KeyError):
-        block_manager.get_block_table(encoder_prompt)
-
-
-def test_reset():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate same seq group on all available gpu blocks.
-    original_blocks = block_manager.get_num_free_gpu_blocks()
-    for i in range(num_gpu_blocks):
-        _, seq_group = create_dummy_prompt(str(i), block_size)
-        block_manager.allocate(seq_group)
-    assert block_manager.get_num_free_gpu_blocks() == 0
-
-    # Resetting block manager frees all allocated blocks.
-    block_manager.reset()
-    assert block_manager.get_num_free_gpu_blocks() == original_blocks
-
-
-def test_reset_encoder_decoder():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_req_per_seq_group = 2
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate same seq group on all available gpu blocks.
-    original_blocks = block_manager.get_num_free_gpu_blocks()
-    for i in range(num_gpu_blocks // block_req_per_seq_group):
-        _, _, seq_group = create_dummy_prompt_encoder_decoder(
-            f"{i}",
-            decoder_prompt_length=block_size,
-            encoder_prompt_length=block_size)
-        block_manager.allocate(seq_group)
-    assert block_manager.get_num_free_gpu_blocks() == 0
-
-    # Resetting block manager frees all allocated blocks.
-    block_manager.reset()
-    assert block_manager.get_num_free_gpu_blocks() == original_blocks
-
-
-def test_sliding_window_multi_seq():
-    """
-    Tests that memory allocation and deallocation is handled
-    correctly with multiple sequences that exceed the sliding
-    window's capacity.
-    """
-    block_size = 1
-    num_cpu_blocks = 8
-    num_gpu_blocks = 8
-    sliding_window = 2
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        sliding_window=sliding_window,
-                                        watermark=0)
-
-    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
-
-    parent = Sequence(seq_id=1,
-                      inputs={
-                          "prompt": "one two three",
-                          "prompt_token_ids": [0, 1, 2],
-                      },
-                      block_size=block_size)
-    seq_group = SequenceGroup(request_id="1",
-                              seqs=[parent],
-                              arrival_time=time.time(),
-                              sampling_params=SamplingParams(),
-                              lora_request=None)
-    block_manager.allocate(seq_group)
-
-    # assert the number of blocks allocated is correct
-    # the parent seq has len 3, but since sliding_window is 2,
-    # we will use at most 2 blocks
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window
-
-    # Fork prompt and copy block tables.
-    child = parent.fork(2)
-    block_manager.fork(parent, child)
-
-    # assert the number of blocks allocated is correct
-    # forking does not increase memory consumption
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window
-
-    # assert both parent and child share all blocks
-    assert block_manager.get_block_table(
-        parent) == block_manager.get_block_table(child)
-
-    token_id = 4
-    # Append token to child. Block is shared so copy on write occurs.
-    child.append_token_id(token_id, {token_id: Logprob(0.0)})
-    block_manager.append_slots(child)
-
-    # assert the number of blocks allocated is correct
-    # we will use now one block more. Each seq will use 2 blocks,
-    # but only one can be shared
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window - 1
-
-    token_id = 5
-    parent.append_token_id(token_id, {token_id: Logprob(0.0)})
-    block_manager.append_slots(parent)
-
-    # assert the number of blocks allocated is correct
-    # no change, because both sequences are still just sharing one block
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window - 1
-
-    block_table_parent = block_manager.get_block_table(parent)
-    block_table_child = block_manager.get_block_table(child)
-
-    assert block_table_parent != block_table_child
-
-    # assert both blocks are sharing the second-last block
-    assert block_table_parent[-2] == block_table_child[-2]
-
-    # now let's clean up...
-    block_manager.free(parent)
-
-    # assert the number of blocks allocated is correct
-    # We have freed one seq, reducing the ref count of two blocks by one.
-    # One of the two was only used by the parent seq, so this is now free.
-    # The child seq still consumes sliding_window blocks
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window
-
-    # free all blocks
-    block_manager.free(child)
-
-    # assert all blocks are free now
-    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
-
-
-def test_mark_blocks_as_computed_with_prefix_cache_and_chunked_prefill():
-    """When prefix cache and chunked prefill are enabled, the block manager
-    should only mark a chunk of blocks as computed instead of all blocks.
-    """
-
-    block_size = 4
-    num_cpu_blocks = 0
-    num_gpu_blocks = 16
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_gpu_blocks,
-                                        num_cpu_blocks,
-                                        watermark=0,
-                                        enable_caching=True)
-
-    # Set prompt size to have num_gpu_blocks - 1 full blocks.
-    prompt_length = block_size * num_gpu_blocks - 1
-
-    # Allocate (reserve) all blocks.
-    _, seq_group = create_dummy_prompt("0",
-                                       prompt_length,
-                                       block_size=block_size)
-    block_manager.allocate(seq_group)
-    assert seq_group.seqs[0].n_blocks == num_gpu_blocks
-
-    # 1st chunk: Compute 2 and half blocks. Should mark 2 blocks as computed.
-    token_chunk_size = int(block_size * 2.5)
-    block_manager.mark_blocks_as_computed(seq_group, token_chunk_size)
-    computed_blocks = block_manager.get_all_computed_blocks(seq_group.seqs[0])
-    assert len(computed_blocks) == 2
-
-    # Actual computed tokens.
-    seq_group.seqs[0].data.update_num_computed_tokens(token_chunk_size)
-
-    # 2nd chunk: Complete 3rd block and additional 4 blocks.
-    token_chunk_size = int(block_size * 4.5)
-    block_manager.mark_blocks_as_computed(seq_group, token_chunk_size)
-    computed_blocks = block_manager.get_all_computed_blocks(seq_group.seqs[0])
-    assert len(computed_blocks) == 7