Merge tag 'v0.6.5' into v0.6.5-dev

4d3a2c28 · zhuwenwen · 92ec5d8e · 2d1b9baa · 4d3a2c28 · 4d3a2c28
Commit 4d3a2c28 authored Dec 30, 2024 by zhuwenwen
20 changed files
--- a/tests/compile/piecewise/__init__.py
+++ b/tests/compile/piecewise/__init__.py
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
+"""
+Test the piecewise compilation with a simple model so that we
+can exactly calculate the expected output and side effects.
+"""
+
+import torch
+from torch import nn
+from torch.library import Library
+
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+                         set_current_vllm_config)
+from vllm.utils import direct_register_custom_op
+
+# number of times `silly_attention` has run; the test resets it to 0 before
+# the call whose invocation count it asserts
+global_counter = 0
+
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT")  # noqa
+
+
+def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    out: torch.Tensor) -> None:
+    """Toy stand-in for attention that writes its result into `out`.
+
+    Increments and prints the module-level `global_counter`, copies `q`
+    into `out`, then adds 1 to `out[0]`. `k` and `v` are accepted but unused.
+    """
+    global global_counter
+    global_counter += 1
+    print(f"{global_counter=}")
+    out.copy_(q)
+    out[0] += 1
+
+
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
+    """No-op with the same signature as `silly_attention`; touches nothing."""
+    return
+
+
+# register `silly_attention` as the op "attention" in `silly_lib`, with
+# `silly_attention_fake` as its fake implementation; `out` is declared as
+# the only argument the op mutates
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
+
+
+@support_torch_compile
+class SillyModel(nn.Module):
+    """Parameter-free model mixing elementwise math with two custom-op calls.
+
+    The arithmetic is chosen so the output and the number of
+    `silly.attention` invocations can be computed by hand.
+    """
+
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = '',
+                 **kwargs) -> None:
+        # `vllm_config`, `prefix` and `kwargs` are accepted but not stored
+        super().__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Overall effect:
+        x += 1
+        x[0] += 2
+        global_counter += 2
+        """
+        # first arithmetic segment: x + 3
+        x = x + 1
+        x = x + 2
+        # first custom-op call: copies x into `out` and bumps out[0] by 1
+        out = torch.empty_like(x)
+        torch.ops.silly.attention(x, x, x, out)
+        x = out
+        # second arithmetic segment: undoes the +3 above
+        x = x - 2
+        x = x - 1
+        # second custom-op call: bumps element 0 by 1 again
+        out = torch.empty_like(x)
+        torch.ops.silly.attention(x, x, x, out)
+        x = out
+        # final arithmetic segment
+        x = x + 1
+        return x
+
+
+def test_simple_piecewise_compile():
+    """Compile `SillyModel` piecewise and check counters, side effects, output.
+
+    The model is called with sizes 100, 2 and 1, and then once more with a
+    zero tensor of size 2. For that last call the test asserts that
+    `silly_attention` ran exactly twice and that the output equals
+    `[3., 1.]` (x + 1 everywhere, plus 2 more on element 0).
+    """
+    # declared up front: the counter is reset below and bumped by the custom op
+    global global_counter
+
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_cudagraph=True,
+        splitting_ops=["silly.attention"],
+        cudagraph_copy_inputs=True,
+        cudagraph_capture_sizes=[1, 2],
+    ))
+    with set_current_vllm_config(vllm_config):
+        model = SillyModel(vllm_config=vllm_config, prefix='')
+
+    inputs = torch.randn(100).cuda()
+
+    with compilation_counter.expect(
+            num_graphs_seen=1,  # one graph for the model
+            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
+            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
+            num_inductor_compilations=3,  # num_piecewise_capturable_graphs_seen
+            num_cudagraph_caputured=
+            6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    ):
+
+        # first call uses a size that is not in `cudagraph_capture_sizes`
+        model(inputs)
+
+        # one call for each size listed in `cudagraph_capture_sizes`
+        model(torch.randn(2).cuda())
+        model(torch.randn(1).cuda())
+
+        # named `zero_input` so the builtin `input` is not shadowed
+        zero_input = torch.zeros(2).cuda()
+        global_counter = 0
+        output = model(zero_input)
+        assert global_counter == 2
+        assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
+"""
+Test the piecewise compilation with a simple model, comparing the output
+with and without the piecewise compilation.
+
+This is a tractable model: when the config `tractable_init` is set to True,
+the weights and computation are specially designed. Otherwise, the weights
+are initialized randomly with a fixed seed.
+"""
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+from torch.library import Library
+
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+                         set_current_vllm_config)
+from vllm.utils import direct_register_custom_op
+
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT")  # noqa
+
+
+def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    out: torch.Tensor) -> None:
+    """Toy stand-in for attention: writes `q + k + v` into `out` in place."""
+    out.copy_(q)
+    out += k
+    out += v
+
+
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
+    """No-op with the same signature as `silly_attention`; touches nothing."""
+    return
+
+
+# register `silly_attention` as the op "attention" in `silly_lib`, with
+# `silly_attention_fake` as its fake implementation; `out` is declared as
+# the only argument the op mutates
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
+
+
+@dataclass
+class LlamaConfig:
+    """Hyperparameters and initialization options for the toy Llama model."""
+    # width of hidden states; also the embedding dimension
+    hidden_size: int = 128
+    # intermediate width of the MLP (must be >= hidden_size)
+    mlp_size: int = 256
+    # number of rows in the embedding table
+    vocab_size: int = 128
+    # number of decoder layers
+    num_layers: int = 2
+    # every embedding weight is filled with this value
+    init_value: float = 1.0
+    # True: identity-initialized projections; False: seeded Xavier-normal init
+    tractable_init: bool = False
+    # seed for the generators used by the non-tractable initialization
+    random_seed: int = 0
+
+    def __post_init__(self):
+        # reject configs whose MLP is narrower than the hidden state
+        assert self.mlp_size >= self.hidden_size
+
+
+class LlamaMLP(nn.Module):
+    """Bias-free gated MLP: fused gate/up projection, gating, down projection.
+
+    The fused projection output is split in half along dim 1; the first half
+    is multiplied by the ReLU of the second half before the down projection.
+    """
+
+    def __init__(self, config: LlamaConfig) -> None:
+        super().__init__()
+        # a single linear layer produces both halves (2 * mlp_size features)
+        self.gate_up_projection = nn.Linear(
+            in_features=config.hidden_size,
+            out_features=config.mlp_size * 2,
+            bias=False,
+        )
+        self.down_projection = nn.Linear(
+            in_features=config.mlp_size,
+            out_features=config.hidden_size,
+            bias=False,
+        )
+
+        if config.tractable_init:
+            # identity pattern in each half of the fused weight and in the
+            # down projection
+            nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size])
+            nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:])
+            nn.init.eye_(self.down_projection.weight.data)
+        else:
+            # each weight gets a fresh generator seeded with the same value
+            nn.init.xavier_normal_(self.gate_up_projection.weight.data,
+                                   generator=torch.Generator().manual_seed(
+                                       config.random_seed),
+                                   gain=0.001)
+            nn.init.xavier_normal_(self.down_projection.weight.data,
+                                   generator=torch.Generator().manual_seed(
+                                       config.random_seed),
+                                   gain=0.001)
+
+    def forward(self, x):
+        # for tractable_init and positive input, this is
+        # essentially an elementwise-square
+        x = self.gate_up_projection(x)
+        # first half of dim 1 times relu(second half of dim 1)
+        x = x[:, :x.size(1) // 2] * torch.nn.functional.relu(
+            x[:, x.size(1) // 2:])
+        x = self.down_projection(x)
+        return x
+
+
+class LlamaAttention(nn.Module):
+    """Toy attention block built around the `silly.attention` custom op.
+
+    A fused bias-free linear layer produces q, k and v; `positions` is added
+    to q and k; the custom op combines them; an output projection follows.
+    """
+
+    def __init__(self, config: LlamaConfig) -> None:
+        super().__init__()
+        # one linear layer produces q, k and v (3 * hidden_size features)
+        self.qkv_projection = nn.Linear(
+            in_features=config.hidden_size,
+            out_features=config.hidden_size * 3,
+            bias=False,
+        )
+
+        self.output_projection = nn.Linear(
+            in_features=config.hidden_size,
+            out_features=config.hidden_size,
+            bias=False,
+        )
+
+        if config.tractable_init:
+            # identity pattern in each of the q, k, v row-blocks of the fused
+            # weight, and in the output projection
+            nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size])
+            nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 *
+                                                         config.hidden_size])
+            nn.init.eye_(self.qkv_projection.weight.data[2 *
+                                                         config.hidden_size:])
+            nn.init.eye_(self.output_projection.weight.data)
+        else:
+            # each weight gets a fresh generator seeded with the same value
+            nn.init.xavier_normal_(self.qkv_projection.weight.data,
+                                   generator=torch.Generator().manual_seed(
+                                       config.random_seed),
+                                   gain=0.001)
+            nn.init.xavier_normal_(self.output_projection.weight.data,
+                                   generator=torch.Generator().manual_seed(
+                                       config.random_seed),
+                                   gain=0.001)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        # for tractable_init, this is:
+        # output = (hidden_states * 3 + positions * 2)
+        qkv = self.qkv_projection(hidden_states)
+        # the last dim holds q, k and v back to back, each one third wide
+        hidden_size = qkv.size(-1) // 3
+        q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1)
+
+        # `unsqueeze(1)` lets positions broadcast across the feature dimension
+        q = q + positions.unsqueeze(1)
+        k = k + positions.unsqueeze(1)
+
+        # the custom op writes its result into the preallocated buffer
+        attn_output = torch.empty_like(q)
+        torch.ops.silly.attention(q, k, v, attn_output)
+
+        output = self.output_projection(attn_output)
+        return output
+
+
+class LlamaDecoderLayer(nn.Module):
+    """One decoder layer: attention then MLP, each preceded by `+ 1`.
+
+    The `+ 1` steps sit where a normalization layer would usually be
+    (presumably a deliberate simplification to keep the math tractable).
+    """
+
+    def __init__(self, config: LlamaConfig) -> None:
+        super().__init__()
+        self.self_attention = LlamaAttention(config)
+        self.mlp = LlamaMLP(config)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        For tractable computation:
+        - if residual is None, the outputs are:
+            - residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3
+            - hidden_states = (residual + 1) ** 2
+        - if residual is not None, the outputs are:
+            - residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3
+            - hidden_states = (residual + 1) ** 2
+        """ # noqa
+        if residual is None:
+            # first layer: the input itself is the residual
+            residual = hidden_states
+            hidden_states = hidden_states + 1
+        else:
+            # later layers: fold the incoming residual in first
+            hidden_states = hidden_states + residual
+            residual = hidden_states
+            hidden_states = hidden_states + 1
+
+        hidden_states = self.self_attention(positions=positions,
+                                            hidden_states=hidden_states)
+
+        # add the residual back; the sum is the residual returned to the caller
+        hidden_states = hidden_states + residual
+        residual = hidden_states
+        hidden_states = hidden_states + 1
+        hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class LlamaModel(nn.Module):
+    """Toy Llama: an embedding table followed by `num_layers` decoder layers."""
+
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 config: LlamaConfig,
+                 prefix: str = '',
+                 **kwargs) -> None:
+        # `vllm_config`, `prefix` and `kwargs` are accepted but not stored
+        super().__init__()
+        self.embedding_tokens = nn.Embedding(
+            num_embeddings=config.vocab_size,
+            embedding_dim=config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [LlamaDecoderLayer(config) for _ in range(config.num_layers)])
+
+        # this is the initial value of the hidden states
+        self.embedding_tokens.weight.data.fill_(config.init_value)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+    ) -> torch.Tensor:
+        """Embed `input_ids` and run every decoder layer in order.
+
+        Returns the final hidden states; the last residual is discarded.
+        """
+        hidden_states = self.embedding_tokens(input_ids)
+        # the first layer receives no residual and creates one
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(positions, hidden_states, residual)
+        return hidden_states
+
+
+def tractable_computation(input_ids: torch.Tensor,
+                          positions: torch.Tensor,
+                          config: LlamaConfig,
+                          init_value: Optional[float] = None) -> torch.Tensor:
+    """Compute by hand what a `tractable_init` model is expected to output.
+
+    Args:
+        input_ids: only its length, device and dtype are used.
+        positions: position values, broadcast across the hidden dimension.
+        config: supplies `hidden_size`, `num_layers` and the default
+            `init_value`.
+        init_value: starting value of every hidden-state element. When
+            omitted, `config.init_value` is used, which is the value the
+            model fills its embedding table with.
+
+    Returns:
+        The expected hidden states after `config.num_layers` layers.
+    """
+    if init_value is None:
+        # stay in sync with the value the model fills its embedding with
+        init_value = config.init_value
+
+    hidden_states = torch.ones(input_ids.size(0),
+                               config.hidden_size,
+                               device=input_ids.device,
+                               dtype=input_ids.dtype) * init_value
+
+    # first layer
+    residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
+    hidden_states = (residual + 1)**2
+
+    # following layers
+    for _ in range(config.num_layers - 1):
+        hidden_states = hidden_states + residual
+        residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
+        hidden_states = (residual + 1)**2
+
+    return hidden_states
+
+
+@torch.inference_mode
+def run_model(llama_config,
+              use_compile: bool,
+              split_attn: bool = False) -> torch.Tensor:
+    """Build a `LlamaModel`, run it on CUDA and return the final output.
+
+    Args:
+        llama_config: the `LlamaConfig` describing the model.
+        use_compile: True selects `CompilationLevel.PIECEWISE` with CUDA
+            graphs for sizes 1 and 2; False selects `NO_COMPILATION`.
+        split_attn: when compiling, additionally sets `splitting_ops` to
+            `["silly.attention"]`.
+
+    Returns:
+        The CPU output for the first two tokens after their ids are zeroed.
+        For `tractable_init` configs the output is also asserted to match
+        `tractable_computation`.
+    """
+
+    if use_compile:
+        compilation_config = CompilationConfig(
+            level=CompilationLevel.PIECEWISE,
+            use_cudagraph=True,
+            cudagraph_capture_sizes=[1, 2],
+        )
+        if split_attn:
+            compilation_config.splitting_ops = ["silly.attention"]
+    else:
+        compilation_config = CompilationConfig(
+            level=CompilationLevel.NO_COMPILATION, )
+
+    vllm_config = VllmConfig(compilation_config=compilation_config)
+    with set_current_vllm_config(vllm_config):
+        model = LlamaModel(config=llama_config,
+                           vllm_config=vllm_config,
+                           prefix="").eval().cuda()
+
+    B = 16  # max batch size
+    input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
+    positions = torch.arange(B).cuda()
+
+    # full batch first, then each size listed in `cudagraph_capture_sizes`
+    model(input_ids, positions)
+    model(input_ids[:2], positions[:2])
+    model(input_ids[:1], positions[:1])
+
+    # mutate the inputs in place before the call whose output is checked
+    input_ids[:2].zero_()
+    output = model(input_ids[:2], positions[:2]).cpu()
+
+    if llama_config.tractable_init:
+        expected_output = tractable_computation(input_ids[:2], positions[:2],
+                                                llama_config).cpu()
+
+        assert torch.allclose(output, expected_output)
+
+    # returned on every path so the annotated return type always holds
+    return output
+
+
+def test_toy_llama():
+    """Check the toy Llama gives matching output in three compilation modes.
+
+    The modes are: no compilation, piecewise compilation without splitting,
+    and piecewise compilation split at `silly.attention`. Each mode runs the
+    randomly initialized 12-layer config under `compilation_counter.expect`
+    and then the 2-layer tractable config, which `run_model` checks against
+    the hand-computed result.
+    """
+    # compare output with and without piecewise compilation
+
+    llama_config = LlamaConfig(hidden_size=128,
+                               mlp_size=256,
+                               vocab_size=128,
+                               num_layers=12)
+
+    tractable_config = LlamaConfig(hidden_size=128,
+                                   mlp_size=256,
+                                   vocab_size=128,
+                                   num_layers=2,
+                                   tractable_init=True)
+
+    outputs = []
+    # no compilation: every counter must stay at zero
+    with compilation_counter.expect(
+            num_graphs_seen=0,
+            num_piecewise_graphs_seen=0,
+            num_piecewise_capturable_graphs_seen=0,
+            num_inductor_compilations=0,
+            num_cudagraph_caputured=0,
+    ):
+        outputs.append(run_model(llama_config, use_compile=False))
+    run_model(tractable_config, use_compile=False)
+
+    # piecewise level without splitting ops: a single piece
+    with compilation_counter.expect(
+            num_graphs_seen=1,  # one graph for the model
+            num_piecewise_graphs_seen=1,
+            num_piecewise_capturable_graphs_seen=1,
+            num_inductor_compilations=1,  # num_piecewise_capturable_graphs_seen
+            num_cudagraph_caputured=
+            2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    ):
+        outputs.append(run_model(llama_config, use_compile=True))
+    run_model(tractable_config, use_compile=True)
+
+    # piecewise level split at every `silly.attention` call
+    with compilation_counter.expect(
+            num_graphs_seen=1,  # one graph for the model
+            num_piecewise_graphs_seen=2 * llama_config.num_layers +
+            1,  # 2 * num_layers + 1
+            num_piecewise_capturable_graphs_seen=1 +
+            llama_config.num_layers,  # 1 + num_layers
+            num_inductor_compilations=1 +
+            llama_config.num_layers,  # num_piecewise_capturable_graphs_seen
+            num_cudagraph_caputured=2 *
+        (1 + llama_config.num_layers
+         ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    ):
+        outputs.append(
+            run_model(llama_config, use_compile=True, split_attn=True))
+    run_model(tractable_config, use_compile=True, split_attn=True)
+
+    # every compiled output must match the uncompiled one
+    for i in range(1, len(outputs)):
+        assert torch.allclose(outputs[0], outputs[i])
+
+
+@torch.inference_mode
+def benchmark():
+    """Time the toy Llama per batch size and print a tab-separated table.
+
+    Two passes are made. In the non-piecewise pass, each batch size is
+    captured manually into a `torch.cuda.CUDAGraph`; graph replay and a
+    direct model call are both timed. In the piecewise pass, the model is
+    built with CUDA graphs and `silly.attention` as splitting op, and direct
+    model calls are timed.
+    """
+    from triton.testing import do_bench
+
+    # similar to llama 3.1-8B
+    # NOTE: this config is overwritten by the tiny one right below
+    llama_config = LlamaConfig(hidden_size=4096,
+                               mlp_size=14336,
+                               vocab_size=128 * 1024,
+                               num_layers=32)
+
+    # a tiny model to measure the overhead
+    # of piecewise cudagraph
+    llama_config = LlamaConfig(hidden_size=40,
+                               mlp_size=80,
+                               vocab_size=128,
+                               num_layers=2)
+
+    cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)]
+
+    # timings keyed by batch size
+    eager_time = {}
+    full_cudagraph_time = {}
+    piecewise_cudagraph_time = {}
+
+    # one pool handle passed to every manual graph capture
+    pool = torch.cuda.graph_pool_handle()
+
+    for piecewise in [False, True]:
+        if piecewise:
+            compilation_config = CompilationConfig(
+                level=CompilationLevel.PIECEWISE,
+                use_cudagraph=True,
+                splitting_ops=["silly.attention"],
+                cudagraph_capture_sizes=cudagraph_sizes,
+            )
+        else:
+            compilation_config = CompilationConfig(
+                level=CompilationLevel.PIECEWISE,
+                cudagraph_capture_sizes=cudagraph_sizes,
+            )
+
+        vllm_config = VllmConfig(compilation_config=compilation_config)
+        with set_current_vllm_config(vllm_config):
+            model = LlamaModel(config=llama_config,
+                               vllm_config=vllm_config,
+                               prefix="").eval().cuda().to(torch.bfloat16)
+
+        B = 256  # max batch size
+        input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
+        positions = torch.arange(B).cuda().to(torch.bfloat16)
+
+        # batch size -> (graph or model, output of the run at that size)
+        graphs = {}
+
+        model(input_ids, positions)
+        # iterate from the largest batch size down to the smallest
+        for b in cudagraph_sizes[::-1]:
+            if not piecewise:
+                graph = torch.cuda.CUDAGraph()
+                with torch.cuda.graph(graph, pool=pool):
+                    output = model(input_ids[:b], positions[:b])
+                graphs[b] = (graph, output)
+            else:
+                output = model(input_ids[:b], positions[:b])
+                graphs[b] = (model, output)
+        for b in cudagraph_sizes:
+            if piecewise:
+                # noqa is for `Function definition does not bind loop variable`
+                # it will be problematic if we save the created lambda function
+                # and use it later, because it will look up the name `b` in the
+                # enclosing scope, and the value of `b` will always be 256.
+                # it is fine here, because we only use the lambda function once.
+                runtime = do_bench(lambda: graphs[b][0]  # noqa
+                                   (input_ids[:b], positions[:b]))  # noqa
+                piecewise_cudagraph_time[b] = runtime
+            else:
+                runtime = do_bench(lambda: graphs[b][0].replay())  # noqa
+                eager_runtime = do_bench(
+                    lambda: model(input_ids[:b], positions[:b]))  # noqa
+                full_cudagraph_time[b] = runtime
+                eager_time[b] = eager_runtime
+
+    # print in tabular format
+    print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
+    for b in cudagraph_sizes:
+        print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
+              f"\t{piecewise_cudagraph_time[b]:.3f}")
+
+# running this file as a script runs the benchmark rather than the tests
+if __name__ == "__main__":
+    benchmark()
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
+import dataclasses
+from typing import Dict, List, Optional
+
+import pytest
+import os
+
+from vllm.config import CompilationLevel
+from vllm.utils import cuda_device_count_stateless
+
+from ..utils import compare_all_settings, models_path_prefix
+
+
+@dataclasses.dataclass
+class TestSetting:
+    """One model/launch configuration for `test_compile_correctness`."""
+    # model path handed to `compare_all_settings`
+    model: str
+    # extra command-line arguments inserted after "--enforce-eager"
+    model_args: List[str]
+    # value passed with "-pp"
+    pp_size: int
+    # value passed with "-tp"
+    tp_size: int
+    # value written to the VLLM_ATTENTION_BACKEND environment variable
+    attn_backend: str
+    # `method` passed to `compare_all_settings`
+    method: str
+    # when False, VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE is set to "0" for the
+    # levels other than DYNAMO_ONCE
+    fullgraph: bool
+
+
+# representative settings for testing; each entry is run only when the number
+# of visible CUDA devices equals pp_size * tp_size
+test_settings = [
+    # basic llama model
+    TestSetting(
+        model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
+        model_args=[],
+        pp_size=2,
+        tp_size=2,
+        attn_backend="FLASHINFER",
+        method="generate",
+        fullgraph=True,
+    ),
+    # llama model with quantization
+    TestSetting(
+        model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
+        model_args=["--quantization", "gptq"],
+        pp_size=1,
+        tp_size=1,
+        attn_backend="FLASH_ATTN",
+        method="generate",
+        fullgraph=True,
+    ),
+    # MoE model
+    TestSetting(
+        model=os.path.join(models_path_prefix, "ibm/PowerMoE-3b"),
+        model_args=[],
+        pp_size=1,
+        tp_size=2,
+        attn_backend="FLASH_ATTN",
+        method="generate",
+        fullgraph=True,
+    ),
+    # embedding model
+    TestSetting(
+        model=os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
+        model_args=["--task", "embed"],
+        pp_size=1,
+        tp_size=1,
+        attn_backend="FLASHINFER",
+        method="encode",
+        fullgraph=True,
+    ),
+    # encoder-based embedding model (BERT)
+    TestSetting(
+        model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
+        model_args=["--task", "embed"],
+        pp_size=1,
+        tp_size=1,
+        attn_backend="XFORMERS",
+        method="encode",
+        fullgraph=True,
+    ),
+    # vision language model
+    TestSetting(
+        model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
+        model_args=["--trust-remote-code", "--max-model-len", "2048"],
+        pp_size=2,
+        tp_size=1,
+        attn_backend="FLASH_ATTN",
+        method="generate_with_image",
+        fullgraph=False,
+    ),
+]
+
+
+# we cannot afford testing the full Cartesian product
+# of all models and all levels
+@pytest.mark.parametrize("test_setting", test_settings)
+def test_compile_correctness(test_setting: TestSetting):
+    """Compare one model's results across compilation levels.
+
+    Two comparisons are made through `compare_all_settings`:
+    NO_COMPILATION vs. PIECEWISE (using "generate_close" in place of
+    "generate"), then NO_COMPILATION vs. DYNAMO_AS_IS vs. DYNAMO_ONCE with
+    the setting's own method.
+    """
+    # this test is run under multiple suites, with different GPUs.
+    # make sure we only run the test with correct CUDA devices.
+    # don't use "<", as it will duplicate the tests.
+    model = test_setting.model
+    model_args = test_setting.model_args
+    pp_size = test_setting.pp_size
+    tp_size = test_setting.tp_size
+    attn_backend = test_setting.attn_backend
+    method = test_setting.method
+    fullgraph = test_setting.fullgraph
+    if cuda_device_count_stateless() != pp_size * tp_size:
+        pytest.skip("Not correct CUDA devices for the test.")
+    # `os` is already imported at module level
+    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
+    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
+                ["-tp", str(tp_size)]
+
+    all_args: List[List[str]] = []
+    all_envs: List[Optional[Dict[str, str]]] = []
+
+    for level in [
+            CompilationLevel.NO_COMPILATION,
+            CompilationLevel.PIECEWISE,
+    ]:
+        all_args.append(final_args + [f"-O{level}"])
+        all_envs.append({})
+
+    # inductor will change the output, so we only compare if the output
+    # is close, not exactly the same.
+    compare_all_settings(
+        model,
+        all_args,
+        all_envs,
+        method=method if method != "generate" else "generate_close")
+    # reuse the same two lists for the second comparison
+    all_envs.clear()
+    all_args.clear()
+
+    for level in [
+            CompilationLevel.NO_COMPILATION,
+            CompilationLevel.DYNAMO_AS_IS,
+            CompilationLevel.DYNAMO_ONCE,
+    ]:
+        all_args.append(final_args + [f"-O{level}"])
+        all_envs.append({})
+        if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
+            # "DYNAMO_ONCE" will always use fullgraph
+            all_envs[-1][
+                "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore
+
+    # one argument list per environment dict, so the two lists pair up 1:1
+    compare_all_settings(model, all_args, all_envs, method=method)
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
 import pytest

-from vllm.compilation.backends import vllm_backend
+from vllm.config import CompilationLevel

+from ..utils import fork_new_process_for_each_test
 from .utils import TEST_MODELS, check_full_graph_support


 @pytest.mark.parametrize("model_info", TEST_MODELS)
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-def test_full_graph(model_info, backend):
+@pytest.mark.parametrize(
+    "optimization_level",
+    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
+@fork_new_process_for_each_test
+def test_full_graph(model_info, optimization_level):
    model = model_info[0]
    model_kwargs = model_info[1]
-    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
+    check_full_graph_support(model,
+                             model_kwargs,
+                             optimization_level,
+                             tp_size=1)
--- a/tests/compile/test_full_graph_multi_gpu.py
+++ b/tests/compile/test_full_graph_multi_gpu.py
-import pytest
-
-from vllm.compilation.backends import vllm_backend
-from vllm.utils import cuda_device_count_stateless
-
-from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS_SMOKE, check_full_graph_support
-
-
-@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
-@pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-@fork_new_process_for_each_test
-def test_full_graph_multi_gpu(model_info, tp_size, backend):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-
-    # Skip the test if there are not enough CUDA devices.
-    if cuda_device_count_stateless() < tp_size:
-        pytest.skip("Not enough CUDA devices for the test.")
-
-    check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size)
--- a/tests/compile/test_full_graph_smoke.py
+++ b/tests/compile/test_full_graph_smoke.py
-import pytest
-
-from vllm.compilation.backends import vllm_backend
-
-from .utils import TEST_MODELS_SMOKE, check_full_graph_support
-
-
-@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-def test_full_graph(model_info, backend):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
+import os
+import pytest
+import torch
+
+import vllm.envs as envs
+from vllm import LLM, SamplingParams
+from vllm.compilation.fix_functionalization import FixFunctionalizationPass
+from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
+                                     kFp8DynamicTokenSym, kFp8StaticTensorSym)
+from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
+from vllm.compilation.reshapes import RedundantReshapesPass
+from vllm.config import CompilationConfig
+
+from .backend import TestBackend
+from ..utils import models_path_prefix
+
+# custom ops the test expects to find in the compiled graph regardless of
+# whether fusion runs
+OPS_IN_MODEL = [
+    torch.ops._C.rotary_embedding.default,
+    torch.ops._C.fused_add_rms_norm.default,
+    torch.ops._C.silu_and_mul.default,
+]
+
+# unfused RMS norm op; checked only when fusion is disabled
+RMS_OP = torch.ops._C.rms_norm.default
+
+# fused RMS-norm + quant ops keyed by scheme
+# NOTE(review): not referenced by the test visible in this file; the test
+# looks the fused ops up in FUSED_OPS instead - confirm whether still needed
+RMS_QUANT_OPS = {
+    "static_fp8": [
+        torch.ops._C.rms_norm_static_fp8_quant.default,
+        torch.ops._C.fused_add_rms_norm_static_fp8_quant.default
+    ],
+}
+
+# prompts generated with both compiled variants of the model
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+
+@pytest.mark.parametrize(
+    "model, quant_key",
+    [(os.path.join(models_path_prefix, "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"), kFp8StaticTensorSym),
+     (os.path.join(models_path_prefix, "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8_DYNAMIC-e2e"),
+      kFp8DynamicTokenSym)])
+@pytest.mark.parametrize("do_fusion", [True, False])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
+                    reason="Only test on CUDA")
+def test_fix_functionalization(model: str, quant_key: QuantKey,
+                               do_fusion: bool):
+    """Check that `FixFunctionalizationPass` changes the graph, not the text.
+
+    The same model is compiled twice, with and without the pass. The test
+    asserts that both variants generate identical text, that the expected
+    ops appear as auto-functionalized nodes only without the pass, and that
+    with the pass each of them appears as a direct call.
+    """
+    torch.set_default_device("cuda")
+
+    config = CompilationConfig.PassConfig(enable_fusion=do_fusion,
+                                          enable_reshape=True)
+    reshape_pass = RedundantReshapesPass(config)
+    fusion_pass = FusionPass.instance(config)
+
+    passes = [reshape_pass, fusion_pass] if do_fusion else [reshape_pass]
+    func_pass = FixFunctionalizationPass(config)
+    backend_func = TestBackend(*passes, func_pass)
+    backend_no_func = TestBackend(*passes)
+
+    # instantiate a full engine and manually compile the model 2x
+    # (with and without FixFunctionalizationPass)
+    llm = LLM(model=model, enforce_eager=True)
+    model_runner = llm.llm_engine.model_executor.driver_worker.model_runner
+    orig_model = model_runner.model
+    # TODO mark inputs dynamic? (currently torch.compile is triggered 4x)
+    # Can only do that by using the decorator but then we'd have to instantiate
+    # 2 LLM instances.
+
+    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
+    model_runner.model = torch.compile(orig_model,
+                                       fullgraph=True,
+                                       backend=backend_func)
+    gen_func = llm.generate(prompts, sampling_params)
+
+    model_runner.model = torch.compile(orig_model,
+                                       fullgraph=True,
+                                       backend=backend_no_func)
+    gen_no_func = llm.generate(prompts, sampling_params)
+
+    for output_func, output_no_func in zip(gen_func, gen_no_func):
+        assert output_func.outputs[0].text == output_no_func.outputs[0].text
+
+    # OPS_IN_MODEL always appear. RMS_OP is fused away if we run fusion,
+    # and replaced by fused quantized ops in RMS_QUANT_OPS.
+    rms_ops = [FUSED_OPS[(quant_key, True)], FUSED_OPS[(quant_key, False)]
+               ] if do_fusion else [RMS_OP]
+    ops = OPS_IN_MODEL + rms_ops
+
+    for op in ops:
+        find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
+        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
+                                  op) is None  # noqa: E501
+
+    # make sure the ops were all de-functionalized
+    found = set()
+    for node in backend_func.graph_post_pass.nodes:
+        for op in ops:
+            if is_func(node, op):
+                found.add(op)
+    # report the missing ops by name instead of failing with a bare KeyError
+    missing = [op for op in ops if op not in found]
+    assert not missing, f"ops not de-functionalized: {missing}"
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
+import pytest
+import torch
+from compressed_tensors.quantization import FP8_DTYPE
+
+import vllm.envs as envs
+from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
+                                     FusionPass, QuantKey)
+from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe
+from vllm.compilation.reshapes import RedundantReshapesPass
+from vllm.config import CompilationConfig
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    apply_fp8_linear)
+
+from .backend import TestBackend
+
+
+class TestModel(torch.nn.Module):
+    """Three RMSNorm layers interleaved with two `apply_fp8_linear` calls.
+
+    Norms, weights and scales are kept in plain Python lists. With
+    `static=True` each linear gets a random input scale; otherwise the
+    input scale is None.
+    """
+
+    def __init__(self, hidden_size: int, eps: float, static: bool, *args,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
+        # one weight scale per linear
+        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
+        if static:
+            self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
+        else:
+            self.scale = [None for _ in range(2)]
+        # random square weights cast to FP8 and transposed
+        self.w = [
+            torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
+            for _ in range(2)
+        ]
+
+    def forward(self, x):
+        resid = torch.sqrt(x)
+        # first norm is called without a residual
+        y = self.norm[0](x)
+
+        x2 = apply_fp8_linear(y,
+                              self.w[0],
+                              self.wscale[0],
+                              self.scale[0],
+                              use_per_token_if_dynamic=True)
+        # make sure resid is used for replacement to work
+        y2, resid = self.norm[1](x2, resid)
+
+        x3 = apply_fp8_linear(y2,
+                              self.w[1],
+                              self.wscale[1],
+                              self.scale[1],
+                              use_per_token_if_dynamic=True)
+        y3, resid = self.norm[2](x3, resid)  # use resid here
+        return y3
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("hidden_size", [64, 3392, 4096])
+@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
+@pytest.mark.parametrize("eps", [1e-5, 1e-6])
+@pytest.mark.parametrize("static", [True, False])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
+                    reason="Only test on CUDA")
+def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static):
+    """Compile TestModel with the reshape and fusion passes and check that
+    (a) eager and compiled outputs agree within tolerance, and
+    (b) the standalone fp8 quant op present before the passes is replaced
+        by the fused RMSNorm+quant ops after them.
+    """
+    torch.set_default_device("cuda")
+    torch.set_default_dtype(dtype)
+    torch.manual_seed(1)
+
+    # Reshape pass is needed for the fusion pass to work
+    pass_config = CompilationConfig.PassConfig(enable_fusion=True,
+                                               enable_reshape=True)
+    backend = TestBackend(RedundantReshapesPass(pass_config),
+                          FusionPass.instance(pass_config))
+    model = TestModel(hidden_size, eps, static)
+
+    # First dimension dynamic
+    x = torch.rand(num_tokens, hidden_size)
+    torch._dynamo.mark_dynamic(x, 0)
+
+    eager_out = model(x)
+    compiled_out = torch.compile(model, backend=backend)(x)
+
+    # Higher tol for dynamic, even higher for bfloat16
+    if static:
+        tol = 1e-3
+    elif dtype == torch.float16:
+        tol = 2e-3
+    else:
+        tol = 1e-2
+    torch.testing.assert_close(eager_out, compiled_out, atol=tol, rtol=tol)
+
+    # Check substitution worked
+    nodes_before = backend.graph_pre_pass.nodes
+    nodes_after = backend.graph_post_pass.nodes
+
+    # static is per-tensor, dynamic is per-token
+    quant_key = QuantKey(dtype=FP8_DTYPE,
+                         static=static,
+                         per_tensor=static,
+                         symmetric=True)
+    # Fused kernels without and with the fused add, in that order.
+    fused_ops = [
+        FUSED_OPS[FusedRMSQuantKey(quant_key, fused_add)]
+        for fused_add in (False, True)
+    ]
+    plain_quant = QUANT_OPS[quant_key]
+
+    # In pre-nodes, fp8 quant should be present and fused kernels should not
+    for fused_op in fused_ops:
+        assert find_auto_fn_maybe(nodes_before, fused_op) is None
+    find_auto_fn(nodes_before, plain_quant)
+
+    # In post-nodes, fused kernels should be present and fp8 quant should not
+    for fused_op in fused_ops:
+        find_auto_fn(nodes_after, fused_op)
+    assert find_auto_fn_maybe(nodes_after, plain_quant) is None
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
+import pickle
+
+import pytest
+import torch
+from torch._inductor.codecache import BypassFxGraphCache
+
+from vllm.compilation.config import CompilationConfig
+from vllm.compilation.inductor_pass import (CallableInductorPass,
+                                            as_inductor_pass)
+from vllm.compilation.pass_manager import PostGradPassManager
+
+
+def simple_callable(graph: torch.fx.Graph):
+    """No-op graph pass written as a plain, undecorated function."""
+    return None
+
+
+@as_inductor_pass(files=(__file__, ))
+def callable_decorated(graph: torch.fx.Graph):
+    """No-op graph pass wrapped with ``as_inductor_pass``."""
+    return None
+
+
+@pytest.mark.parametrize(
+    "works, callable",
+    [(False, simple_callable), (True, callable_decorated),
+     (True, CallableInductorPass(simple_callable, "simple_callable"))])
+def test_pass_manager(works: bool, callable):
+    """Pickle a PostGradPassManager holding ``callable``.
+
+    When ``works`` is False, pickling must raise BypassFxGraphCache;
+    otherwise it must complete without raising.
+    """
+    pass_config = CompilationConfig().pass_config
+    manager = PostGradPassManager([callable])
+    manager.configure(pass_config)  # Adds default passes
+
+    if not works:
+        with pytest.raises(BypassFxGraphCache):
+            pickle.dumps(manager)
+        return
+
+    pickle.dumps(manager)
--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -3,6 +3,7 @@ from typing import Optional
 import torch

 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
+from vllm.config import CompilationLevel


 class MyMod(torch.nn.Module):
@@ -18,7 +19,8 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
    def __init__(self, model):
        self.model = model
        compiled_callable = torch.compile(self.forward, backend="eager")
-        super().__init__(compiled_callable)
+        super().__init__(compiled_callable,
+                         compilation_level=CompilationLevel.DYNAMO_ONCE)

    def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
        # this is the function to be compiled

--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -4,18 +4,11 @@ import torch

 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.plugins import set_torch_compile_backend
-from vllm.utils import is_hip
+from vllm.config import CompilationLevel
+from vllm.platforms import current_platform
 import os
 from ..utils import models_path_prefix

-TEST_MODELS_SMOKE = [
-    (os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
-        "quantization": "compressed-tensors"
-    }),
-    (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), {}),
-]
-
 TEST_MODELS = [
    (os.path.join(models_path_prefix, "facebook/opt-125m"), {}),
    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), {
@@ -32,13 +25,12 @@ TEST_MODELS = [
    (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), {}),
 ]

-# TODO: enable in pytorch 2.5
-if False and is_quant_method_supported("aqlm"):  # noqa: SIM223
+if is_quant_method_supported("aqlm"):
    TEST_MODELS.append((os.path.join(models_path_prefix, "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"), {
        "quantization": "aqlm"
    }))

-# TODO: enable in pytorch 2.5
+# TODO: figure out why this fails.
 if False and is_quant_method_supported("gguf"):  # noqa: SIM223
    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"), {
        "quantization": "gguf"
@@ -64,25 +56,26 @@ if is_quant_method_supported("gptq"):
 #         "quantization": "marlin"
 #     }))

-if not is_hip() and is_quant_method_supported("awq"):
+
+if not current_platform.is_rocm() and is_quant_method_supported("awq"):
    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"), {
        "quantization": "AWQ"
    }))


-def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
+def check_full_graph_support(model,
+                             model_kwargs,
+                             optimization_level,
+                             tp_size=1):
    # make sure these models can be captured in full graph mode
-    if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ:
-        os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

-    # Inductor doesn't support fp8/gptq_marlin_24 yet.
-    quantization = model_kwargs.get("quantization")
-    if (quantization == "fp8" or quantization == "gptq_marlin"
-            or quantization == "gptq_marlin_24") and backend != "eager":
+    # The base meta llama uses too much memory.
+    if (model == "meta-llama/Meta-Llama-3-8B"
+            and optimization_level >= CompilationLevel.PIECEWISE):
        return

-    set_torch_compile_backend(backend)
+    print(f"MODEL={model}")

    prompts = [
        "Hello, my name is",
@@ -95,6 +88,7 @@ def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
              enforce_eager=True,
              tensor_parallel_size=tp_size,
              disable_custom_all_reduce=True,
+              compilation_config=optimization_level,
              **model_kwargs)

    outputs = llm.generate(prompts, sampling_params)

--- a/tests/conftest.py
+++ b/tests/conftest.py
-import contextlib
-import gc
 import json
 import os
-import sys
 import tempfile
 from collections import UserList
 from enum import Enum
@@ -27,18 +24,19 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
-from vllm.distributed import (destroy_distributed_environment,
-                              destroy_model_parallel,
+from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
                              initialize_model_parallel)
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
+from vllm.platforms import current_platform
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity, is_cpu)
+                        identity)
 from .utils import models_path_prefix

 logger = init_logger(__name__)
@@ -47,14 +45,16 @@ _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

-PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
-PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
-                         List[List[Tuple[np.ndarray, int]]]]
-PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
+_M = TypeVar("_M")
+_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
+
+PromptImageInput = _PromptMultiModalInput[Image.Image]
+PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
+PromptVideoInput = _PromptMultiModalInput[np.ndarray]


 def _read_prompts(filename: str) -> List[str]:
-    with open(filename, "r") as f:
+    with open(filename) as f:
        prompts = f.readlines()
        return prompts

@@ -64,13 +64,7 @@ class _ImageAssetPrompts(TypedDict):
    cherry_blossom: str


-if sys.version_info < (3, 9):
-    # UserList cannot be subscripted
-    class _ImageAssetsBase(UserList):
-        pass
-else:
-
-    class _ImageAssetsBase(UserList[ImageAsset]):
+class _ImageAssetsBase(UserList[ImageAsset]):
    pass


@@ -96,13 +90,7 @@ class _VideoAssetPrompts(TypedDict):
    sample_demo_1: str


-if sys.version_info < (3, 9):
-    # UserList cannot be subscripted
-    class _VideoAssetsBase(UserList):
-        pass
-else:
-
-    class _VideoAssetsBase(UserList[VideoAsset]):
+class _VideoAssetsBase(UserList[VideoAsset]):
    pass


@@ -123,6 +111,23 @@ VIDEO_ASSETS = _VideoAssets()
 """Singleton instance of :class:`_VideoAssets`."""


+@pytest.fixture(params=[True, False])
+def run_with_both_engines(request, monkeypatch):
+    # Automatically runs tests twice, once with V1 and once without
+    use_v1 = request.param
+    # Tests decorated with `@skip_v1` are only run without v1
+    skip_v1 = request.node.get_closest_marker("skip_v1")
+
+    if use_v1:
+        if skip_v1:
+            pytest.skip("Skipping test on vllm V1")
+        monkeypatch.setenv('VLLM_USE_V1', '1')
+    else:
+        monkeypatch.setenv('VLLM_USE_V1', '0')
+
+    yield
+
+
 @pytest.fixture(autouse=True)
 def init_test_http_connection():
    # pytest_asyncio may use a different event loop per test
@@ -142,17 +147,7 @@ def dist_init():
    )
    initialize_model_parallel(1, 1)
    yield
-    cleanup()
-
-
-def cleanup():
-    destroy_model_parallel()
-    destroy_distributed_environment()
-    with contextlib.suppress(AssertionError):
-        torch.distributed.destroy_process_group()
-    gc.collect()
-    if not is_cpu():
-        torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()


 @pytest.fixture()
@@ -169,7 +164,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
 def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    yield
    if should_do_global_cleanup_after_test:
-        cleanup()
+        cleanup_dist_env_and_memory()


 @pytest.fixture(autouse=True)
@@ -244,22 +239,25 @@ def video_assets() -> _VideoAssets:
    return VIDEO_ASSETS


-_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
+_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)


 class HfRunner:

-    def wrap_device(self, input: _T) -> _T:
-        if not is_cpu():
-            # Check if the input is already on the GPU
-            if hasattr(input, 'device') and input.device.type == "cuda":
-                return input  # Already on GPU, no need to move
-            return input.to("cuda")
-        else:
-            # Check if the input is already on the CPU
-            if hasattr(input, 'device') and input.device.type == "cpu":
-                return input  # Already on CPU, no need to move
-            return input.to("cpu")
+    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+        if x is None or isinstance(x, (bool, )):
+            return x
+
+        if device is None:
+            device = "cpu" if current_platform.is_cpu() else "cuda"
+
+        if isinstance(x, dict):
+            return {k: self.wrap_device(v, device) for k, v in x.items()}
+
+        if hasattr(x, "device") and x.device.type == device:
+            return x
+
+        return x.to(device)

    def __init__(
        self,
@@ -267,23 +265,33 @@ class HfRunner:
        dtype: str = "half",
        *,
        model_kwargs: Optional[Dict[str, Any]] = None,
-        is_embedding_model: bool = False,
+        is_sentence_transformer: bool = False,
+        is_cross_encoder: bool = False,
+        skip_tokenizer_init: bool = False,
        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
-        postprocess_inputs: Callable[[BatchEncoding],
-                                     BatchEncoding] = identity,
+        postprocess_inputs: Callable[..., BatchEncoding] = identity,
    ) -> None:
        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

        self.model_name = model_name

-        if is_embedding_model:
+        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer
            self.model = self.wrap_device(
                SentenceTransformer(
                    model_name,
                    device="cpu",
+                    trust_remote_code=True,
                ).to(dtype=torch_dtype))
+        elif is_cross_encoder:
+            # Lazy init required for AMD CI
+            from sentence_transformers import CrossEncoder
+            self.model = CrossEncoder(model_name,
+                                      device="cpu",
+                                      trust_remote_code=True)
+            self.model.model = self.wrap_device(self.model.model)\
+                .to(dtype=torch_dtype)
        else:
            model_kwargs = model_kwargs if model_kwargs is not None else {}
            self.model = self.wrap_device(
@@ -294,6 +302,7 @@ class HfRunner:
                    **model_kwargs,
                ))

+        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
@@ -308,35 +317,78 @@ class HfRunner:
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        )
+        if skip_tokenizer_init:
+            self.tokenizer = self.processor.tokenizer

+        self.dtype = dtype
        self.postprocess_inputs = postprocess_inputs

-    def generate(
+    def get_inputs(
        self,
        prompts: List[str],
        images: Optional[PromptImageInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
-        **kwargs: Any,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
-        if images:
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
+    ) -> List[BatchEncoding]:
+        if images is not None:
            assert len(prompts) == len(images)

-        outputs: List[Tuple[List[List[int]], List[str]]] = []
+        if videos is not None:
+            assert len(prompts) == len(videos)
+
+        if audios is not None:
+            assert len(prompts) == len(audios)
+
+        all_inputs: List[BatchEncoding] = []
        for i, prompt in enumerate(prompts):
            processor_kwargs: Dict[str, Any] = {
                "text": prompt,
                "return_tensors": "pt",
            }
-            if images is not None and images[i] is not None:
-                processor_kwargs["images"] = images[i]
-            if videos is not None and videos[i] is not None:
-                processor_kwargs["videos"] = videos[i]
+            if images is not None and (image := images[i]) is not None:
+                processor_kwargs["images"] = image
+            if videos is not None and (video := videos[i]) is not None:
+                processor_kwargs["videos"] = video
+            if audios is not None and (audio_tuple := audios[i]) is not None:
+                audio, sr = audio_tuple
+                processor_kwargs["audio"] = audio
+                processor_kwargs["sampling_rate"] = sr

            inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs)
+            inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
+
+            all_inputs.append(inputs)
+
+        return all_inputs
+
+    def classify(self, prompts: List[str]) -> List[str]:
+        # output is final logits
+        all_inputs = self.get_inputs(prompts)
+        outputs = []
+        for inputs in all_inputs:
+            output = self.model(**self.wrap_device(inputs))
+            logits = output.logits.softmax(dim=-1)[0].tolist()
+            outputs.append(logits)

+        return outputs
+
+    def generate(
+        self,
+        prompts: List[str],
+        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[List[List[int]], List[str]]]:
+        all_inputs = self.get_inputs(prompts,
+                                     images=images,
+                                     videos=videos,
+                                     audios=audios)
+
+        outputs: List[Tuple[List[List[int]], List[str]]] = []
+        for inputs in all_inputs:
            output_ids = self.model.generate(
-                **self.wrap_device(inputs),
+                **self.wrap_device(inputs, device=self.model.device.type),
                use_cache=True,
                **kwargs,
            )
@@ -354,12 +406,16 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                images=images,
+                                videos=videos,
+                                audios=audios,
                                **kwargs)

        return [(output_ids[0], output_str[0])
@@ -391,25 +447,19 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[List[torch.Tensor]]:
-        all_logprobs: List[List[torch.Tensor]] = []
-        for i, prompt in enumerate(prompts):
-            processor_kwargs: Dict[str, Any] = {
-                "text": prompt,
-                "return_tensors": "pt",
-            }
-            if images is not None and images[i] is not None:
-                processor_kwargs["images"] = images[i]
-            if videos is not None and videos[i] is not None:
-                processor_kwargs["videos"] = videos[i]
-
-            inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs)
+        all_inputs = self.get_inputs(prompts,
+                                     images=images,
+                                     videos=videos,
+                                     audios=audios)

+        all_logprobs: List[List[torch.Tensor]] = []
+        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs),
+                **self.wrap_device(inputs, device=self.model.device.type),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@@ -417,40 +467,39 @@ class HfRunner:
                return_dict_in_generate=True,
                **kwargs,
            )
-            seq_logprobs: List[torch.Tensor] = []
-            for hidden_states in output.hidden_states:
-                last_hidden_states = hidden_states[-1][0]
-                logits = torch.matmul(
-                    last_hidden_states,
-                    self.model.get_output_embeddings().weight.t(),
-                )
-                if self.model.get_output_embeddings().bias is not None:
-                    logits += self.model.get_output_embeddings(
-                    ).bias.unsqueeze(0)
-                logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
-                seq_logprobs.append(logprobs)
+            seq_logprobs = self._hidden_states_to_seq_logprobs(
+                output.hidden_states)
            all_logprobs.append(seq_logprobs)
        return all_logprobs

-    def _hidden_states_to_logprobs(
+    def _hidden_states_to_seq_logprobs(
        self,
-        hidden_states,
-        num_logprobs,
-    ) -> Tuple[List[Dict[int, float]], int]:
+        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
+    ) -> List[torch.Tensor]:
+        output_embeddings = self.model.get_output_embeddings()
+
        seq_logprobs: List[torch.Tensor] = []
-        output_len = len(hidden_states)
        for _, hidden_state in enumerate(hidden_states):
            last_hidden_states = hidden_state[-1][0]
            logits = torch.matmul(
-                last_hidden_states,
-                self.model.get_output_embeddings().weight.t(),
+                last_hidden_states.to(output_embeddings.weight.device),
+                output_embeddings.weight.t(),
            )
-            if getattr(self.model.get_output_embeddings(), "bias",
-                       None) is not None:
-                logits += self.model.get_output_embeddings().bias.unsqueeze(0)
+            if getattr(output_embeddings, "bias", None) is not None:
+                logits += output_embeddings.bias.unsqueeze(0)
            logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
            seq_logprobs.append(logprobs)

+        return seq_logprobs
+
+    def _hidden_states_to_logprobs(
+        self,
+        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
+        num_logprobs: int,
+    ) -> Tuple[List[Dict[int, float]], int]:
+        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
+        output_len = len(hidden_states)
+
        # convert to dict
        seq_logprobs_lst: List[Dict[int, float]] = []
        for tok_idx, tok_logprobs in enumerate(seq_logprobs):
@@ -477,33 +526,21 @@ class HfRunner:
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> List[TokensTextLogprobs]:
+        all_inputs = self.get_inputs(prompts,
+                                     images=images,
+                                     videos=videos,
+                                     audios=audios)
+
        all_logprobs: List[List[Dict[int, float]]] = []
        all_output_ids: List[List[int]] = []
        all_output_strs: List[str] = []

-        for i, prompt in enumerate(prompts):
-            processor_kwargs: Dict[str, Any] = {
-                "text": prompt,
-                "return_tensors": "pt",
-            }
-            if images is not None and images[i] is not None:
-                processor_kwargs["images"] = images[i]
-
-            if audios is not None:
-                audio, sr = audios[i]
-                processor_kwargs["audio"] = audio
-                processor_kwargs["sampling_rate"] = sr
-
-            if videos is not None:
-                processor_kwargs["videos"] = videos[i]
-            inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs)
-
+        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs),
+                **self.wrap_device(inputs, device=self.model.device.type),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@@ -534,6 +571,7 @@ class HfRunner:
        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
+        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> List[TokensTextLogprobs]:
        '''
@@ -544,14 +582,28 @@ class HfRunner:
        all_output_ids: List[List[int]] = []
        all_output_strs: List[str] = []

-        for (encoder_prompt,
-             decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts):
+        for i, (encoder_prompt, decoder_prompt) in enumerate(
+                to_enc_dec_tuple_list(encoder_decoder_prompts)):
+            processor_kwargs: Dict[str, Any] = {
+                "text": encoder_prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+
            encoder_input_ids = self.wrap_device(
-                self.tokenizer(encoder_prompt, return_tensors="pt").input_ids)
-            decoder_input_ids = (
-                None if decoder_prompt is None else self.wrap_device(
+                self.processor(**processor_kwargs).input_ids,
+                device=self.model.device.type,
+            )
+
+            if decoder_prompt is None:
+                decoder_input_ids = None
+            else:
+                decoder_input_ids = self.wrap_device(
                    self.tokenizer(decoder_prompt,
-                                   return_tensors="pt").input_ids))
+                                   return_tensors="pt").input_ids,
+                    device=self.model.device.type,
+                )

            output = self.model.generate(
                encoder_input_ids,
@@ -583,12 +635,15 @@ class HfRunner:
    def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
        return self.model.encode(prompts)

+    def predict(self, prompts: List[List[str]]) -> torch.Tensor:
+        return self.model.predict(prompts, convert_to_tensor=True)
+
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        del self.model
-        cleanup()
+        cleanup_dist_env_and_memory()


 @pytest.fixture(scope="session")
@@ -601,7 +656,9 @@ class VllmRunner:
    def __init__(
        self,
        model_name: str,
+        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
+        tokenizer_mode: str = "auto",
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
        max_model_len: int = 1024,
@@ -616,7 +673,9 @@ class VllmRunner:
    ) -> None:
        self.model = LLM(
            model=model_name,
+            task=task,
            tokenizer=tokenizer_name,
+            tokenizer_mode=tokenizer_mode,
            trust_remote_code=True,
            dtype=dtype,
            swap_space=swap_space,
@@ -629,20 +688,53 @@ class VllmRunner:
            **kwargs,
        )

-    def generate(
+    def get_inputs(
        self,
        prompts: List[str],
-        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
+    ) -> List[TextPrompt]:
        if images is not None:
            assert len(prompts) == len(images)

+        if videos is not None:
+            assert len(prompts) == len(videos)
+
+        if audios is not None:
+            assert len(prompts) == len(audios)
+
        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
        if images is not None:
            for i, image in enumerate(images):
+                if image is not None:
                    inputs[i]["multi_modal_data"] = {"image": image}

+        if videos is not None:
+            for i, video in enumerate(videos):
+                if video is not None:
+                    inputs[i]["multi_modal_data"] = {"video": video}
+
+        if audios is not None:
+            for i, audio in enumerate(audios):
+                if audio is not None:
+                    inputs[i]["multi_modal_data"] = {"audio": audio}
+
+        return inputs
+
+    def generate(
+        self,
+        prompts: List[str],
+        sampling_params: SamplingParams,
+        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
+    ) -> List[Tuple[List[List[int]], List[str]]]:
+        inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)
+
        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)

@@ -684,25 +776,10 @@ class VllmRunner:
        videos: Optional[PromptVideoInput] = None,
    ) -> Union[List[TokensTextLogprobs],
               List[TokensTextLogprobsPromptLogprobs]]:
-        if images is not None:
-            assert len(prompts) == len(images)
-
-        if videos is not None:
-            assert len(prompts) == len(videos)
-
-        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
-        if images is not None:
-            for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = {"image": image}
-
-        if audios is not None:
-            for i, audio in enumerate(audios):
-                inputs[i]["multi_modal_data"] = {"audio": audio}
-
-        if videos is not None:
-            for i, video in enumerate(videos):
-                inputs[i]["multi_modal_data"] = {"video": video}
-        print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
+        inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)
@@ -739,9 +816,15 @@ class VllmRunner:
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        outputs = self.generate(prompts, greedy_params, images=images)
+        outputs = self.generate(prompts,
+                                greedy_params,
+                                images=images,
+                                videos=videos,
+                                audios=audios)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

@@ -755,6 +838,7 @@ class VllmRunner:
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[List[int]] = None,
+        stop: Optional[List[str]] = None,
    ) -> Union[List[TokensTextLogprobs],
               List[TokensTextLogprobsPromptLogprobs]]:
        greedy_logprobs_params = SamplingParams(
@@ -762,7 +846,8 @@ class VllmRunner:
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
-            stop_token_ids=stop_token_ids)
+            stop_token_ids=stop_token_ids,
+            stop=stop)

        return self.generate_w_logprobs(prompts,
                                        greedy_logprobs_params,
@@ -780,7 +865,6 @@ class VllmRunner:
               List[TokensTextLogprobsPromptLogprobs]]:
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
-            use_beam_search=False,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=(num_prompt_logprobs),
@@ -793,25 +877,14 @@ class VllmRunner:
            encoder_decoder_prompts, greedy_logprobs_params)

    def generate_beam_search(
-        self,
-        prompts: List[str],
-        beam_width: int,
-        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
-        beam_search_params = SamplingParams(n=beam_width,
-                                            use_beam_search=True,
-                                            temperature=0.0,
-                                            max_tokens=max_tokens)
-        outputs = self.generate(prompts, beam_search_params)
-        return outputs
-
-    def generate_beam_search_new(
        self,
        prompts: Union[List[str], List[List[int]]],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[List[int]], List[str]]]:
-        outputs = self.model.beam_search(prompts, beam_width, max_tokens)
+        outputs = self.model.beam_search(
+            prompts,
+            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
        returned_outputs = []
        for output in outputs:
            token_ids = [x.tokens for x in output.sequences]
@@ -819,20 +892,39 @@ class VllmRunner:
            returned_outputs.append((token_ids, texts))
        return returned_outputs

-    def encode(self, prompts: List[str]) -> List[List[float]]:
-        req_outputs = self.model.encode(prompts)
-        outputs = []
-        for req_output in req_outputs:
-            embedding = req_output.outputs.embedding
-            outputs.append(embedding)
-        return outputs
+    def classify(self, prompts: List[str]) -> List[List[float]]:
+        req_outputs = self.model.classify(prompts)
+        return [req_output.outputs.probs for req_output in req_outputs]
+
+    def encode(
+        self,
+        prompts: List[str],
+        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
+    ) -> List[List[float]]:
+        inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)
+
+        req_outputs = self.model.embed(inputs)
+        return [req_output.outputs.embedding for req_output in req_outputs]
+
+    def score(
+        self,
+        text_1: Union[str, List[str]],
+        text_2: Union[str, List[str]],
+    ) -> List[float]:
+        req_outputs = self.model.score(text_1, text_2)
+        return [req_output.outputs.score for req_output in req_outputs]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        del self.model
-        cleanup()
+        cleanup_dist_env_and_memory()


 @pytest.fixture(scope="session")
@@ -879,27 +971,30 @@ def num_gpus_available():


 # temp_dir = tempfile.gettempdir()
-# _dummy_path = os.path.join(temp_dir, "dummy_opt")
-_dummy_path = os.path.join(models_path_prefix, "facebook/opt-125m") 
+_dummy_opt_path = os.path.join(models_path_prefix, "dummy_opt")
+_dummy_llava_path = os.path.join(models_path_prefix, "dummy_llava")
+_dummy_gemma2_embedding_path = os.path.join(models_path_prefix, "dummy_gemma2_embedding")
+


 @pytest.fixture
 def dummy_opt_path():
-    json_path = os.path.join(_dummy_path, "config.json")
-    if not os.path.exists(_dummy_path):
+    json_path = os.path.join(_dummy_opt_path, "config.json")
+    if not os.path.exists(_dummy_opt_path):
        snapshot_download(repo_id="facebook/opt-125m",
-                          local_dir=_dummy_path,
+                          local_dir=_dummy_opt_path,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyOPTForCausalLM"]
        with open(json_path, "w") as f:
            json.dump(config, f)
-    return _dummy_path
+    return _dummy_opt_path
+

 # Define a pytest hook that generates the report after a test runs
 @pytest.hookimpl(tryfirst=True, hookwrapper=True)
@@ -918,3 +1013,60 @@ def pytest_runtest_makereport(item, call):
            # If the test result has an `extra` attribute, attach the screenshot
            if hasattr(result, "extra"):
                result.extra.append(pytest_html.extras.image(screenshot_path))
+
+
+@pytest.fixture
+def dummy_llava_path():
+    json_path = os.path.join(_dummy_llava_path, "config.json")
+    if not os.path.exists(_dummy_llava_path):
+        snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf",
+                          local_dir=_dummy_llava_path,
+                          ignore_patterns=[
+                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
+                              "*.msgpack"
+                          ])
+        assert os.path.exists(json_path)
+        with open(json_path) as f:
+            config = json.load(f)
+        config["architectures"] = ["MyLlava"]
+        with open(json_path, "w") as f:
+            json.dump(config, f)
+    return _dummy_llava_path
+
+
+@pytest.fixture
+def dummy_gemma2_embedding_path():
+    json_path = os.path.join(_dummy_gemma2_embedding_path, "config.json")
+    if not os.path.exists(_dummy_gemma2_embedding_path):
+        snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
+                          local_dir=_dummy_gemma2_embedding_path,
+                          ignore_patterns=[
+                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
+                              "*.msgpack"
+                          ])
+        assert os.path.exists(json_path)
+        with open(json_path) as f:
+            config = json.load(f)
+        config["architectures"] = ["MyGemma2Embedding"]
+        with open(json_path, "w") as f:
+            json.dump(config, f)
+    return _dummy_gemma2_embedding_path
+
+
+# Add the `--optional` flag to allow running tests
+# that are marked with @pytest.mark.optional
+def pytest_addoption(parser):
+    parser.addoption("--optional",
+                     action="store_true",
+                     default=False,
+                     help="run optional test")
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--optional"):
+        # --optional given in cli: do not skip optional tests
+        return
+    skip_optional = pytest.mark.skip(reason="need --optional option to run")
+    for item in items:
+        if "optional" in item.keywords:
+            item.add_marker(skip_optional)
--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
@@ -3,10 +3,9 @@ from typing import Callable, Iterable, Optional
 import pytest

 from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.utils import set_random_seed

-from ....conftest import cleanup
-

 @pytest.fixture
 def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
@@ -37,7 +36,7 @@ def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,

        yield llm
        del llm
-        cleanup()
+        cleanup_dist_env_and_memory()

    for llm in generator_inner():
        yield llm

--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -23,32 +23,32 @@ from ....utils import models_path_prefix
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
    "preemption_mode": "swap"
 }, {
-    "use_v2_block_manager": True,
    "preemption_mode": "recompute"
 }])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
-def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
+def test_block_manager_with_preemption(baseline_llm_generator,
                                       test_llm_generator, batch_size):
-    """Verify block manager v2 produces same outputs as block manager v1, even
-    when there is preemption.
+    """Verify block manager produces same outputs even when there is preemption.

    This constructs two LLMs, each with a limited number of GPU blocks. The limit
    is decided such that as the sequences in the batch grow, sequences must be
    preempted and removed from cache.

    If the output token ids are equivalent, then we have confidence that the KV
-    cache is not corrupted in the v2 block manager.
+    cache is not corrupted.

    NOTE: We want a significant number of generated tokens so that any incorrect
    KV mapping has time to build up error.
+
+    NOTE(Kuntai): Though we have removed block manager v1, this test is still
+    useful as it asserts that the behavior of block manager v2 (now called
+    SelfAttnBlockSpaceManager) is unchanged when sequences are swapped /
+    preempted, so we keep this test.
    """
    output_len = 1024
    temperature = 0.0
@@ -72,78 +72,9 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
        temperature=temperature,
    )

-    print('Getting token ids from block manager v1')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

-    print('Getting token ids from block manager v2')
-    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-
-    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
-                                                    test_token_ids):
-        assert expected_token_ids == actual_token_ids
-
-    assert baseline_token_ids == test_token_ids
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
-
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-
-        # Use a large block size to trigger more copy-on-writes.
-        "block_size": 32,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
-    "preemption_mode": "swap"
-}, {
-    "use_v2_block_manager": True,
-    "preemption_mode": "recompute"
-}])
-@pytest.mark.parametrize("batch_size", [10])
-@pytest.mark.parametrize("seed", [1])
-def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
-                                        test_llm_generator, batch_size):
-    """Verify beam search equality with block manager v1 and v2.
-
-    This requires copy-on-writes; if the v1 and v2 output is the same, then
-    we have some confidence cow is working.
-    """
-    output_len = 128
-    temperature = 0.0
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-        use_beam_search=True,
-        best_of=2,
-    )
-
-    print('Getting token ids from block manager v1')
-    baseline_token_ids = get_token_ids_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-
-    print('Getting token ids from block manager v2')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

@@ -166,9 +97,6 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,

        # skip cuda graph creation for fast test.
        "enforce_eager": True,
-
-        # Lookahead scheduling only supported in v2 block manager.
-        "use_v2_block_manager": True,
    }])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
@@ -280,26 +208,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
                             "max_num_seqs": 10,
                         }])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
-    {
-        "use_v2_block_manager": False,
-    },
+    {},
 ])
 @pytest.mark.parametrize("test_llm_kwargs", [
    {
-        "use_v2_block_manager": True,
        "num_lookahead_slots": 0,
    },
    {
-        "use_v2_block_manager": True,
        "num_lookahead_slots": 5,
    },
 ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
-def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
+def test_chunked_prefill_block_manager(baseline_llm_generator,
                                       test_llm_generator, batch_size):
-    """Verify that chunked prefill works with BlockManagerV2, with and without
-    lookahead scheduling.
+    """Verify that chunked prefill works with SelfAttnBlockSpaceManager, 
+    with and without lookahead scheduling.
    """
    output_len = 32
    temperature = 0.0
@@ -320,11 +244,11 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
        temperature=temperature,
    )

-    print('Getting token ids with BlockManagerV1')
+    print('Getting token ids with BlockManager')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

-    print('Getting token ids with BlockManagerV2')
+    print('Getting token ids with BlockManager, with lookahead slots.')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

@@ -352,32 +276,32 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
        "enable_prefix_caching": True,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
    "preemption_mode": "swap"
 }, {
-    "use_v2_block_manager": True,
    "preemption_mode": "recompute"
 }])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
-def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
+def test_block_manager_prefix_caching_enabled_with_preemption(
        baseline_llm_generator, test_llm_generator, batch_size):
-    """Verify block manager v2 produces same outputs as block manager v1, even
-    when there is preemption.
+    """Verify block manager produces same outputs even when there is preemption.

    This constructs two LLMs, each with a limited number of GPU blocks. The limit
    is decided such that as the sequences in the batch grow, sequences must be
    preempted and removed from cache.

    If the output token ids are equivalent, then we have confidence that the KV
-    cache is not corrupted in the v2 block manager.
+    cache is not corrupted.

    NOTE: We want a significant number of generated tokens so that any incorrect
    KV mapping has time to build up error.
+
+    NOTE(Kuntai): Though we have removed block manager v1, this test is still
+    useful as it asserts that the behavior of block manager v2 (now called
+    SelfAttnBlockSpaceManager) is unchanged when sequences are swapped /
+    preempted, so we keep this test.
    """
    output_len = 1024
    temperature = 0.0
@@ -401,11 +325,11 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
        temperature=temperature,
    )

-    print('Getting token ids from block manager v1')
+    print('Getting token ids from block manager')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

-    print('Getting token ids from block manager v2')
+    print('Getting token ids from block manager, with preemption')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

@@ -428,9 +352,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
        # Allow only 5 sequences of ~1024 tokens in worst case.
        "block_size": 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
-
-        # Test APC in v2 block
-        "use_v2_block_manager": True,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
@@ -506,9 +427,6 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
        "max_model_len": 48,
        "block_size": 16,
        "num_gpu_blocks_override": 3,
-
-        # Test APC in v2 block
-        "use_v2_block_manager": True,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{

--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -4,6 +4,7 @@ from typing import List
 import pytest
 import os

+from tests.kernels.utils import override_backend_env_variable
 from vllm import LLM, SamplingParams

 from .conftest import get_text_from_llm_generator
@@ -26,14 +27,13 @@ BLOCK_SIZE = 16
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{
-    "use_v2_block_manager": False
-}])
-@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
 def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
-                                 batch_size, seed):
+                                 batch_size, seed, backend, monkeypatch):
    """
    The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
    asks for value of one of them (which is outside the sliding window).
@@ -42,6 +42,8 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,

    Additionally, we compare the results of the baseline and test LLMs.
    """
+    override_backend_env_variable(monkeypatch, backend)
+
    sampling_params = SamplingParams(
        max_tokens=1024,
        ignore_eos=True,
@@ -50,7 +52,6 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,

    prompts, answer, indices = prep_prompts(batch_size)

-    print('Getting token ids from block manager v1')
    baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
                                                 prompts,
                                                 sampling_params,
@@ -86,13 +87,12 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
-    "enable_chunked_prefill": True
-}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
-def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
+def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
+                                        backend, monkeypatch):
    """
    This is similar to test_sliding_window_retrival, however, it doesn't
    compare against the v1 block manager since v1 doesn't support
@@ -101,6 +101,8 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
    The results with and without chunked prefill are not the same due to
    numerical instabilities.
    """
+    override_backend_env_variable(monkeypatch, backend)
+
    sampling_params = SamplingParams(
        max_tokens=10,
        ignore_eos=True,

--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -2,7 +2,7 @@ import pytest

 from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
                                   STR_NOT_IMPL_ENC_DEC_SWA)
-from vllm.core.block_manager_v2 import BlockSpaceManagerV2
+from vllm.core.block_manager import SelfAttnBlockSpaceManager
 from vllm.core.interfaces import AllocStatus
 from vllm.sequence import Logprob, SequenceStatus
 from vllm.utils import chunk_list
@@ -17,7 +17,7 @@ from ..utils import (create_dummy_prompt, create_seq_group,
 @pytest.mark.parametrize("watermark", [0.0, 0.5])
 def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
                                num_gpu_blocks: int, watermark: float):
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
@@ -63,7 +63,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int,
                                                num_seqs_per_group: int,
                                                num_gpu_blocks: int,
                                                watermark: float):
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
@@ -117,16 +117,16 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
    '''
    SWA short for Sliding Window Attention.

-    At time of writing block manager v2 does not support SWA.
+    At time of writing block manager does not support SWA.

-    However even when SWA is implemented for block manager v2,
+    However even when SWA is implemented for block manager,
    there will still most likely be a separate workstream required
    to enable SWA for encoder/decoder models.

    Therefore this test enforces that one of the following cases
    hold true:
-    1. Block manager v2 does not support SWA at all (true at time of writing)
-    2. Block manager v2 fails with NotImplementError when SWA is enabled
+    1. Block manager does not support SWA at all (true at time of writing)
+    2. Block manager fails with NotImplementedError when SWA is enabled
       AND a SequenceGroup with an encoder sequence (i.e. in support of an
       encoder/decoder model) is passed into can_allocate() as an argument

@@ -135,7 +135,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
    '''

    with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
-        block_manager = BlockSpaceManagerV2(
+        block_manager = SelfAttnBlockSpaceManager(
            block_size=block_size,
            num_gpu_blocks=num_gpu_blocks,
            num_cpu_blocks=1024,
@@ -158,7 +158,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
        block_manager.can_allocate(seq_group)

    # Assert that either
-    # 1. Block manager v2 constructor fails with assertion that sliding window
+    # 1. Block manager constructor fails with assertion that sliding window
    #    is not yet supported (most likely near-term outcome at time of
    #    writing), or
    # 2. can_allocate() fails with NotImplementedError due to combination of
@@ -177,7 +177,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
        block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
        watermark: float):

-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
@@ -217,7 +217,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,

    num_gpu_blocks = 1024
    watermark = 0.1
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
@@ -269,7 +269,7 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
    """Verify blocks number on src/desc device is correct after swapping in/out
        sequence group (not missing or extra blocks).
    """
-    block_manager = BlockSpaceManagerV2(block_size,
+    block_manager = SelfAttnBlockSpaceManager(block_size,
                                              num_cpu_blocks,
                                              num_gpu_blocks,
                                              watermark=0,
@@ -277,6 +277,7 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)
+
    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
@@ -321,7 +322,7 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
        can be swapped in/out.
    """
    num_cpu_blocks = num_gpu_blocks
-    block_manager = BlockSpaceManagerV2(block_size,
+    block_manager = SelfAttnBlockSpaceManager(block_size,
                                              num_cpu_blocks,
                                              num_gpu_blocks,
                                              watermark=0,
@@ -373,6 +374,52 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
            seq_group, num_lookahead_slots) == AllocStatus.NEVER


+@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
+@pytest.mark.parametrize("enable_caching", [False, True])
+def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
+    """Verifies that swapping fails if there are not enough free blocks
+    to account for unseen tokens and lookahead_slots.
+    """
+    block_size = 8
+    num_cpu_blocks = 1
+    num_gpu_blocks = 1
+    block_manager = SelfAttnBlockSpaceManager(block_size,
+                                              num_cpu_blocks,
+                                              num_gpu_blocks,
+                                              watermark=0,
+                                              enable_caching=enable_caching)
+    prompt_length = block_size - 3
+    assert prompt_length > 0
+    prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
+    prompt.status = SequenceStatus.WAITING
+    block_manager.allocate(seq_group)
+    # Emulate a forward pass by appending a single token.
+    # The block manager then knows how many unprocessed
+    # tokens will be written in the next forward pass.
+    token_id = 0
+    prompt.status = SequenceStatus.RUNNING
+    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
+
+    # Swap seq group from GPU -> CPU.
+    assert block_manager.can_swap_out(seq_group)
+    block_manager.swap_out(seq_group)
+    prompt.status = SequenceStatus.SWAPPED
+
+    # Swap seq group from CPU -> GPU.
+    # The number of unseen tokens is 1. If the number of existing
+    # tokens plus the unseen ones and number of lookahead slots exceeds
+    # the total number of available GPU blocks then the swap
+    # should fail.
+    num_unseen_tokens = 1
+    if (num_lookahead_slots + num_unseen_tokens +
+            prompt_length) <= (block_size * num_gpu_blocks):
+        assert block_manager.can_swap_in(seq_group,
+                                         num_lookahead_slots) == AllocStatus.OK
+    else:
+        assert block_manager.can_swap_in(
+            seq_group, num_lookahead_slots) == AllocStatus.NEVER
+
+
 # TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.


@@ -388,7 +435,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,

    num_gpu_blocks = 1024
    watermark = 0.1
-    block_manager = BlockSpaceManagerV2(
+    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
@@ -400,7 +447,6 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
        if max_n is None:
            max_n = min_n
        used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
-        #print("check", min_n, used, max_n)
        assert min_n <= used
        assert used <= max_n

@@ -429,7 +475,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
    seq.data.update_num_computed_tokens(prompt_len)
    check_used(num_blocks(prompt_len))

-    # this is how we compute it in BlockSpaceManagerV2.__init__
+    # this is how we compute it in SelfAttnBlockSpaceManager.__init__
    sliding_blocks = (sliding_window // block_size) + 2
    # plus one block for null block
    sliding_blocks += 1

--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -104,9 +104,9 @@ class TestNaiveBlockAllocator:
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [4])
    @pytest.mark.parametrize("block_size", [8])
-    def test_naive_block_get_num_blocks_touched(num_blocks, block_size):
+    def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
        """ Verify the allocator can correctly return the number of
-        blocks touched, with different lookahead slots.
+        full blocks touched.
        """
        allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
                                            num_blocks=num_blocks,
@@ -124,7 +124,7 @@ class TestNaiveBlockAllocator:
        src_blocks = [allocate_block() for _ in range(num_blocks - 1)]

        # All blocks are cached
-        assert allocator_dst.get_num_blocks_touched(
+        assert allocator_dst.get_num_full_blocks_touched(
            src_blocks) == num_blocks - 1

        # Insert one non-full block in the src
@@ -136,9 +136,10 @@ class TestNaiveBlockAllocator:
        src_blocks.append(allocate_non_full_block())
        src_blocks[-1].append_token_ids([0])

-        assert allocator_dst.get_num_blocks_touched(
-            src_blocks, num_lookahead_slots=1) == num_blocks
-        assert allocator_dst.get_num_blocks_touched(
-            src_blocks, num_lookahead_slots=block_size - 1) == num_blocks
-        assert allocator_dst.get_num_blocks_touched(
-            src_blocks, num_lookahead_slots=block_size) == (num_blocks + 1)
+        assert allocator_dst.get_num_full_blocks_touched(
+            src_blocks) == num_blocks - 1
+        # Fill up the last source block and then invoke
+        # get_num_full_blocks_touched
+        src_blocks[-1].append_token_ids([0] * (block_size - 1))
+        assert allocator_dst.get_num_full_blocks_touched(
+            src_blocks) == num_blocks
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -5,9 +5,14 @@ from unittest.mock import MagicMock

 import pytest

+from tests.core.utils import create_dummy_lora_sequence, create_dummy_sequence
+from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
 from vllm.core.block.interfaces import Block, BlockAllocator
-from vllm.core.block.prefix_caching_block import (PrefixCachingBlock,
+from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
+                                                  PrefixCachingBlock,
                                                  PrefixCachingBlockAllocator)
+from vllm.sequence import Logprob
+from vllm.utils import Device


 class TestPrefixCachingBlock:
@@ -99,13 +104,11 @@ class TestPrefixCachingBlock:

        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]

-        first_chain, second_chain = [
-            TestPrefixCachingBlock.create_chain(
+        first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
            block_size=block_size,
            token_ids=token_ids,
            num_empty_trailing_blocks=num_empty_trailing_blocks)
-            for _ in range(2)
-        ]
+                                     for _ in range(2))

        for first_chain_block, second_chain_block in zip(
                first_chain, second_chain):
@@ -318,11 +321,10 @@ class TestPrefixCachingBlockAllocator:
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [4])
    @pytest.mark.parametrize("block_size", [8])
-    def test_prefix_caching_block_get_num_blocks_touched(
+    def test_prefix_caching_block_get_num_full_blocks_touched(
            num_blocks, block_size):
        """ Verify the allocator can correctly return the number of
-        blocks touched, when there are cached prefixes and different
-        lookahead slots.
+        full blocks touched, when there are cached prefixes.
        """
        allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                    block_size=block_size)
@@ -346,28 +348,30 @@ class TestPrefixCachingBlockAllocator:
                token_ids=token_ids,
                allocator=allocator_src,
            )
-
        # All blocks are cached
-        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 0
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 0

        # Free the first block in the dst
        allocator_dst.free(cached_blocks[0])

        # Now the first block becomes dangling, the swapped blocks need
        # to reclaim the first block in the dst
-        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 1
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 1

        # Insert one non-full block in the src
        non_full_block = allocator_src.allocate_mutable_block(
            blocks_to_swap_in[-1])
        non_full_block.append_token_ids([0])
        blocks_to_swap_in.append(non_full_block)
-        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in,
-                                                    num_lookahead_slots=1) == 2
-        assert allocator_dst.get_num_blocks_touched(
-            blocks_to_swap_in, num_lookahead_slots=block_size - 1) == 2
-        assert allocator_dst.get_num_blocks_touched(
-            blocks_to_swap_in, num_lookahead_slots=block_size) == 3
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 1
+        # Fill the last mutable block, then call get_num_full_blocks_touched.
+        # Note: The last block is not cached so it will be touched.
+        non_full_block.append_token_ids([0] * (block_size - 1))
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 2

    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1024])
@@ -727,23 +731,77 @@ class TestPrefixCachingBlockAllocator:
                token_ids=common_token_ids,
                allocator=allocator,
            )
-            block_ids = [block.block_id for block in blocks]
+            block_hashes = [block.content_hash for block in blocks]
            # The allocated blocks should  be marked as touched
            # but not computed.
-            computed_block_ids = allocator.get_computed_block_ids(
-                [], block_ids, skip_last_block_id=False)
+            computed_block_ids = allocator.find_cached_blocks_prefix(
+                block_hashes)
            assert len(computed_block_ids) == 0

        allocator.mark_blocks_as_computed([])
-        computed_block_ids = allocator.get_computed_block_ids(
-            [], block_ids, skip_last_block_id=False)
+        computed_block_ids = allocator.find_cached_blocks_prefix(
+            block_hashes=block_hashes)
        assert len(computed_block_ids) == common_blocks

+    @staticmethod
+    def test_find_cached_blocks_prefix():
+        """
+        This test verifies the behavior of find_cached_blocks_prefix.
+        """
+        block_size = 4
+        num_blocks = 8
+        total_test_blocks = 12
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+
+        token_ids = list(range(total_test_blocks * block_size))
+        block_tokens_seq1 = token_ids[:num_blocks * block_size]
+        blocks_seq1 = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=block_tokens_seq1,
+            allocator=allocator,
+        )
+        block_hashes_seq1 = [block.content_hash for block in blocks_seq1]
+        allocator.mark_blocks_as_computed([])
+
+        # All blocks should be cached.
+        cached_blocks_seq1 = allocator.find_cached_blocks_prefix(
+            block_hashes=block_hashes_seq1)
+        assert len(cached_blocks_seq1) == num_blocks
+
+        # Free the first sequence.
+        for block in blocks_seq1:
+            allocator.free(block)
+
+        # All blocks should still be cached if not required to be allocated.
+        cached_blocks = allocator.find_cached_blocks_prefix(
+            block_hashes=block_hashes_seq1)
+        assert len(cached_blocks) == num_blocks
+
+        block_tokens_seq2 = token_ids[num_blocks * block_size:]
+        blocks_seq2 = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=block_tokens_seq2,
+            allocator=allocator,
+        )
+        block_hashes_seq2 = [block.content_hash for block in blocks_seq2]
+        allocator.mark_blocks_as_computed([])
+        cached_blocks = allocator.find_cached_blocks_prefix(
+            block_hashes=block_hashes_seq2)
+        assert len(cached_blocks) == len(blocks_seq2)
+
+        # Half of the blocks from seq1 should still be cached.
+        num_evicted_blocks = len(blocks_seq2)
+        cached_blocks = allocator.find_cached_blocks_prefix(
+            block_hashes=block_hashes_seq1)
+        assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks
+
    @staticmethod
    def create_immutable_chain(
        block_size: int,
        token_ids: List[int],
        allocator: PrefixCachingBlockAllocator,
+        extra_hash: Optional[int] = None,
    ) -> List[PrefixCachingBlock]:
        """Helper method which creates a chain of blocks.
        """
@@ -759,7 +817,178 @@ class TestPrefixCachingBlockAllocator:
                                        block_size:(block_number + 1) *
                                        block_size]
            prev_block = allocator.allocate_immutable_block(
-                prev_block=prev_block, token_ids=block_token_ids)
+                prev_block=prev_block,
+                token_ids=block_token_ids,
+                extra_hash=extra_hash)
            blocks.append(prev_block)

        return blocks
+
+
+class TestComputedBlocksTracker:
+
+    @staticmethod
+    def _get_mock_allocator():
+        return MagicMock(spec=PrefixCachingBlockAllocator)
+
+    @staticmethod
+    def test_get_num_cached_tokens():
+        """
+        Test it correctly computes the number of cached tokens for a given
+        sequence:
+
+        - The cache token count is derived from the number of cached blocks.
+        - The cache token count is updated when the allocator is updated.
+        - When a sequence is removed, the cache token count should be updated
+        accordingly.
+
+        # TODO(rickyx): This behaviour for prefill sequence is a hack until
+        we fix the computed blocks tracking.
+        - The cache token count for prefill sequence doesn't change while
+        the sequence is in continuous prefill (chunked prefill).
+        """
+        block_size = 4
+        mock_allocator = TestComputedBlocksTracker._get_mock_allocator()
+        tracker = ComputedBlocksTracker(
+            allocator=mock_allocator,
+            block_size=block_size,
+            enable_caching=True,
+        )
+
+        # Not yet allocated.
+        tokens = [0, 1, 2, 3, 4, 5]
+        seq1 = create_dummy_sequence(request_id=0,
+                                     token_ids=tokens,
+                                     block_size=block_size)
+        mock_allocator.find_cached_blocks_prefix.return_value = []
+        assert tracker.get_num_cached_tokens(seq1) == 0
+
+        mock_allocator.find_cached_blocks_prefix.return_value = [
+            None
+        ]  # 1 block cached.
+        # Result is cached for prefill sequence.
+        assert tracker.get_num_cached_tokens(seq1) == 0
+
+        # Mark the sequence as non-prefill.
+        seq1.data.update_num_computed_tokens(len(tokens))  # 6 tokens computed.
+        assert not seq1.is_prefill()
+
+        # Recomputes for decoding sequence.
+        assert tracker.get_num_cached_tokens(seq1) == 4
+
+        # Append new tokens to the sequence.
+        num_new_tokens = 3
+        for i in range(num_new_tokens):
+            seq1.append_token_id(i, {i: Logprob(logprob=0.0)})
+
+        assert tracker.get_num_cached_tokens(seq1) == 4
+
+        # Update the allocator.
+        mock_allocator.find_cached_blocks_prefix.return_value = [
+            None
+        ] * 2  # 2 blocks cached.
+        assert tracker.get_num_cached_tokens(seq1) == 8
+
+        # Remove the sequence.
+        tracker.remove_seq(seq1.seq_id)
+
+        # Re-create the sequence with the same request id to simulate recompute.
+        seq1 = create_dummy_sequence(request_id=0,
+                                     token_ids=tokens,
+                                     block_size=block_size)
+        mock_allocator.find_cached_blocks_prefix.return_value = [
+        ]  # no cached block
+        assert tracker.get_num_cached_tokens(seq1) == 0
+
+    @staticmethod
+    def test_correct_block_hash():
+        """
+        Test that the block hash is correctly computed for a sequence (should
+        match the underlying block allocator's block hash), so that the number
+        of cached tokens is correctly retrieved.
+        """
+        block_size = 4
+        allocator = CpuGpuBlockAllocator.create(
+            allocator_type="prefix_caching",
+            num_gpu_blocks=16,
+            num_cpu_blocks=16,
+            block_size=block_size,
+        )
+        gpu_allocator = allocator._allocators[Device.GPU]
+
+        tracker = ComputedBlocksTracker(
+            allocator=allocator,
+            block_size=block_size,
+            enable_caching=True,
+        )
+
+        tokens = list(range(block_size * 4))  # 4 blocks.
+        seq = create_dummy_sequence(request_id=0,
+                                    token_ids=tokens,
+                                    block_size=block_size)
+        _ = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=tokens,
+            allocator=gpu_allocator,
+        )
+        allocator.mark_blocks_as_computed([])
+
+        assert tracker.get_num_cached_tokens(seq) == len(tokens)
+
+    @staticmethod
+    def test_correct_extra_hash():
+        """
+        Test that the block hash is correctly computed based on the extra hash,
+        ensuring it matches the allocator's block hash, specifically for the
+        LoRA case, and that the correct number of cached tokens is retrieved.
+        """
+        block_size = 4
+        allocator = CpuGpuBlockAllocator.create(
+            allocator_type="prefix_caching",
+            num_gpu_blocks=16,
+            num_cpu_blocks=16,
+            block_size=block_size,
+        )
+        gpu_allocator = allocator._allocators[Device.GPU]
+
+        tracker = ComputedBlocksTracker(
+            allocator=allocator,
+            block_size=block_size,
+            enable_caching=True,
+        )
+
+        tokens = list(range(block_size * 4))
+
+        # Create a dummy LoRA sequence with a specific LoRA ID.
+        lora_seq = create_dummy_lora_sequence(request_id=0,
+                                              token_ids=tokens,
+                                              block_size=block_size,
+                                              lora_int_id=1)
+
+        _ = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=tokens,
+            allocator=gpu_allocator,
+            extra_hash=lora_seq.extra_hash(),
+        )
+
+        allocator.mark_blocks_as_computed([])
+
+        # Create different dummy sequences that have the same token IDs
+        # but different LoRA IDs.
+        seq = create_dummy_sequence(request_id=1,
+                                    token_ids=tokens,
+                                    block_size=block_size)
+
+        different_lora_seq = create_dummy_lora_sequence(request_id=2,
+                                                        token_ids=tokens,
+                                                        block_size=block_size,
+                                                        lora_int_id=2)
+
+        # Due to the different LoRA IDs, corresponding blocks are not cached.
+        assert tracker.get_num_cached_tokens(seq) == 0
+        assert tracker.get_num_cached_tokens(different_lora_seq) == 0
+
+        # The number of cached tokens matches the length of the tokens
+        # for the cached LoRA sequence.
+        assert tracker.get_num_cached_tokens(lora_seq) == len(tokens)
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
-import time
-from collections import defaultdict
-from typing import List
-
-import pytest
-
-from vllm import SamplingParams
-from vllm.block import PhysicalTokenBlock
-from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
-                                   STR_NOT_IMPL_ENC_DEC_SWA)
-from vllm.core.block_manager_v1 import (BlockSpaceManagerV1,
-                                        UncachedBlockAllocator)
-from vllm.core.interfaces import AllocStatus
-from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
-from vllm.utils import Device
-
-from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder
-
-
-def test_block_allocator_allocate():
-    block_size = 4
-    num_cpu_blocks = 4
-    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
-                                           num_cpu_blocks)
-
-    # Allocate all available cpu blocks.
-    num_free = num_cpu_blocks
-    assert cpu_allocator.get_num_free_blocks() == num_free
-    for _ in range(num_cpu_blocks):
-        block = cpu_allocator.allocate()
-        num_free -= 1
-
-        assert block not in cpu_allocator.free_blocks
-        assert cpu_allocator.get_num_free_blocks() == num_free
-
-    with pytest.raises(ValueError):
-        cpu_allocator.allocate()
-
-
-def test_block_allocator_free():
-    block_size = 4
-    num_cpu_blocks = 4
-    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
-                                           num_cpu_blocks)
-
-    # Allocate all available cpu blocks.
-    blocks: List[PhysicalTokenBlock] = []
-    for _ in range(num_cpu_blocks):
-        block = cpu_allocator.allocate()
-        blocks.append(block)
-        assert block not in cpu_allocator.free_blocks
-
-    # Free all allocated cpu blocks.
-    num_free = 0
-    assert cpu_allocator.get_num_free_blocks() == num_free
-    for block in blocks:
-        cpu_allocator.free(block)
-        num_free += 1
-        assert block in cpu_allocator.free_blocks
-        assert cpu_allocator.get_num_free_blocks() == num_free
-
-        with pytest.raises(ValueError):
-            cpu_allocator.free(block)
-
-
-def test_allocate():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate same sequence group to all available gpu blocks.
-    for i in range(num_gpu_blocks):
-        _, seq_group = create_dummy_prompt(str(i), block_size)
-        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
-        block_manager.allocate(seq_group)
-    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
-
-    # Allocate same sequence group to all available gpu blocks.
-    # Use watermark to reserve one gpu block.
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=1 / num_gpu_blocks)
-    for i in range(num_gpu_blocks - 1):
-        _, seq_group = create_dummy_prompt(str(i), block_size)
-        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
-        block_manager.allocate(seq_group)
-    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
-
-
-def test_allocate_encoder_decoder():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_req_per_seq_group = 2
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate same sequence group to all available gpu blocks.
-    for i in range(num_gpu_blocks // block_req_per_seq_group):
-        _, _, seq_group = create_dummy_prompt_encoder_decoder(
-            str(i),
-            decoder_prompt_length=block_size,
-            encoder_prompt_length=block_size)
-        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
-        block_manager.allocate(seq_group)
-    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
-
-    # Allocate same sequence group to all available gpu blocks.
-    # Use watermark to reserve one gpu block.
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=1 / num_gpu_blocks)
-    for i in range((num_gpu_blocks - 1) // block_req_per_seq_group):
-        _, _, seq_group = create_dummy_prompt_encoder_decoder(
-            str(i),
-            decoder_prompt_length=block_size,
-            encoder_prompt_length=block_size)
-        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
-        block_manager.allocate(seq_group)
-    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
-
-
-def test_allocate_encoder_decoder_fails_with_swa():
-    # SWA short for sliding window attention
-
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        sliding_window=5)  # swa
-
-    # Allocate same sequence group to all available gpu blocks.
-    _, _, seq_group = create_dummy_prompt_encoder_decoder(
-        "0",
-        decoder_prompt_length=block_size,
-        encoder_prompt_length=block_size)
-
-    # Assert that can_allocate() fails due to SWA
-    with pytest.raises(NotImplementedError) as exc_info:
-        block_manager.can_allocate(seq_group)
-
-    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
-
-    # Assert that allocate() fails due to SWA
-    with pytest.raises(NotImplementedError) as exc_info:
-        block_manager.allocate(seq_group)
-
-    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
-
-
-def test_allocate_encoder_decoder_fails_with_prefix_caching():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        enable_caching=True)  # Prefix cache
-
-    # Allocate same sequence group to all available gpu blocks.
-    _, _, seq_group = create_dummy_prompt_encoder_decoder(
-        "0",
-        decoder_prompt_length=block_size,
-        encoder_prompt_length=block_size)
-
-    # Assert that can_allocate() fails due to prefix caching
-    with pytest.raises(NotImplementedError) as exc_info:
-        block_manager.can_allocate(seq_group)
-
-    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
-
-    # Assert that allocate() fails due to prefix caching
-    with pytest.raises(NotImplementedError) as exc_info:
-        block_manager.allocate(seq_group)
-
-    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
-
-
-def test_append_slot_single_seq():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate single seq to gpu block.
-    prompt, seq_group = create_dummy_prompt("1", block_size)
-    block_manager.allocate(seq_group)
-
-    # Nothing to append. Sequence has no new logical blocks.
-    assert block_manager.can_append_slots(seq_group)
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-    assert not block_manager.append_slots(prompt)
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_blocks == after_blocks
-
-    # Add block_size number of new tokens and append slot.
-    for i in range(block_size):
-        token_id = i + 5
-        prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-
-    assert block_manager.can_append_slots(seq_group)
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-    assert not block_manager.append_slots(prompt)
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_blocks - after_blocks == 1
-
-
-def test_append_slot_cow():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size=block_size,
-                                        num_cpu_blocks=num_cpu_blocks,
-                                        num_gpu_blocks=num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate prompt to gpu block. There is one slot left in the block.
-    prompt = Sequence(seq_id=1,
-                      inputs={
-                          "prompt": "one two three",
-                          "prompt_token_ids": [1, 2, 3],
-                      },
-                      block_size=block_size)
-
-    # Fork the sequence, such that a COW will be required when we append a new
-    # token id.
-    child = prompt.fork(new_seq_id=2)
-
-    # Allocate space for the sequence group.
-    seq_group = SequenceGroup(request_id="1",
-                              seqs=[prompt, child],
-                              arrival_time=time.time(),
-                              sampling_params=SamplingParams())
-    block_manager.allocate(seq_group)
-
-    # Fork and append a new token id. We expect a COW to be scheduled.
-    token_id = 4
-    child.append_token_id(token_id, {token_id: Logprob(0.0)})
-    block_manager.fork(prompt, child)
-
-    assert block_manager.can_append_slots(seq_group)
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-
-    cows = block_manager.append_slots(child)
-    assert cows
-    dict_cows = defaultdict(list)
-    for src_block, dst_block in cows:
-        dict_cows[src_block].append(dst_block)
-    for src_block, dst_blocks in dict_cows.items():
-        assert src_block not in dst_blocks
-
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_blocks - after_blocks == 1
-
-
-def test_fork():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    prompt, seq_group = create_dummy_prompt("1",
-                                            block_size - 1,
-                                            block_size=block_size)
-    block_manager.allocate(seq_group)
-
-    # Fork prompt and copy block tables.
-    child = prompt.fork(2)
-    block_manager.fork(prompt, child)
-    assert block_manager.get_block_table(
-        prompt) == block_manager.get_block_table(child)
-    token_id = 4
-    # Append token to child. Block is shared so copy on write occurs.
-    child.append_token_id(token_id, {token_id: Logprob(0.0)})
-    block_manager.append_slots(child)
-    assert block_manager.get_block_table(
-        prompt) != block_manager.get_block_table(child)
-
-
-def test_swap():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
-    prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-
-    # Emulate a forward pass by appending a single token.
-    # The block manager then knows how many unprocessed
-    # tokens will be written in the next forward pass.
-    token_id = 0
-    prompt.status = SequenceStatus.RUNNING
-    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-
-    # Swap seq group from GPU -> CPU.
-    gpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_out(seq_group)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_out(seq_group)
-    assert [x[0] for x in mapping] == gpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
-    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
-    prompt.status = SequenceStatus.SWAPPED
-
-    # Swap seq group from CPU -> GPU.
-    cpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_in(seq_group)
-    assert [x[0] for x in mapping] == cpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
-    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
-
-
-def test_swap_encoder_decoder():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    decoder_prompt, encoder_prompt, seq_group = \
-        create_dummy_prompt_encoder_decoder(
-        "1",
-        decoder_prompt_length=block_size,
-        encoder_prompt_length=block_size)
-    decoder_prompt.status = SequenceStatus.WAITING
-    encoder_prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-
-    # Emulate a forward pass by appending a single token.
-    # The block manager then knows how many unprocessed
-    # tokens will be written in the next forward pass.
-    token_id = 0
-    decoder_prompt.status = SequenceStatus.RUNNING
-    decoder_prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-
-    # Swap encoder/decoder seq group from GPU -> CPU.
-    decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt)
-    cross_gpu_blocks = block_manager.get_cross_block_table(seq_group)
-    gpu_blocks = decoder_gpu_blocks + cross_gpu_blocks
-    assert block_manager.can_swap_out(seq_group)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_out(seq_group)
-    assert [x[0] for x in mapping] == gpu_blocks
-    #assert list(mapping.keys()) == gpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
-    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
-    decoder_prompt.status = SequenceStatus.SWAPPED
-
-    # Swap encoder/decoder seq group from CPU -> GPU.
-    decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt)
-    cross_cpu_blocks = block_manager.get_cross_block_table(seq_group)
-    cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks
-    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_in(seq_group)
-    assert [x[0] for x in mapping] == cpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
-    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
-
-
-def test_free():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    prompt, seq_group = create_dummy_prompt("1", block_size)
-    block_manager.allocate(seq_group)
-
-    # Free allocated seq.
-    prompt_blocks = len(block_manager.get_block_table(prompt))
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-    block_manager.free(prompt)
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert after_blocks == before_blocks + prompt_blocks
-
-    # Block table for freed seq is deleted.
-    with pytest.raises(KeyError):
-        block_manager.get_block_table(prompt)
-
-
-def test_free_encoder_decoder():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    decoder_prompt, encoder_prompt, seq_group = \
-        create_dummy_prompt_encoder_decoder(
-        "1",
-        decoder_prompt_length=block_size,
-        encoder_prompt_length=block_size)
-    block_manager.allocate(seq_group)
-
-    # Free allocated seq.
-    decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt))
-    encoder_prompt_blocks = len(block_manager.get_cross_block_table(seq_group))
-    prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks
-    before_blocks = block_manager.get_num_free_gpu_blocks()
-    block_manager.free(decoder_prompt)
-    block_manager.free_cross(seq_group)
-    after_blocks = block_manager.get_num_free_gpu_blocks()
-    assert after_blocks == before_blocks + prompt_blocks
-
-    # Block table for freed encoder & decoder seq's are deleted.
-    with pytest.raises(KeyError):
-        block_manager.get_block_table(decoder_prompt)
-
-    # Block table for freed encoder & decoder seq's are deleted.
-    with pytest.raises(KeyError):
-        block_manager.get_block_table(encoder_prompt)
-
-
-def test_reset():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate same seq group on all available gpu blocks.
-    original_blocks = block_manager.get_num_free_gpu_blocks()
-    for i in range(num_gpu_blocks):
-        _, seq_group = create_dummy_prompt(str(i), block_size)
-        block_manager.allocate(seq_group)
-    assert block_manager.get_num_free_gpu_blocks() == 0
-
-    # Resetting block manager frees all allocated blocks.
-    block_manager.reset()
-    assert block_manager.get_num_free_gpu_blocks() == original_blocks
-
-
-def test_reset_encoder_decoder():
-    block_size = 4
-    num_cpu_blocks = 4
-    num_gpu_blocks = 4
-    block_req_per_seq_group = 2
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0)
-
-    # Allocate same seq group on all available gpu blocks.
-    original_blocks = block_manager.get_num_free_gpu_blocks()
-    for i in range(num_gpu_blocks // block_req_per_seq_group):
-        _, _, seq_group = create_dummy_prompt_encoder_decoder(
-            f"{i}",
-            decoder_prompt_length=block_size,
-            encoder_prompt_length=block_size)
-        block_manager.allocate(seq_group)
-    assert block_manager.get_num_free_gpu_blocks() == 0
-
-    # Resetting block manager frees all allocated blocks.
-    block_manager.reset()
-    assert block_manager.get_num_free_gpu_blocks() == original_blocks
-
-
-def test_sliding_window_multi_seq():
-    """
-    Tests that memory allocation and deallocation is handled
-    correctly with multiple sequences that exceed the sliding
-    window's capacity.
-    """
-    block_size = 1
-    num_cpu_blocks = 8
-    num_gpu_blocks = 8
-    sliding_window = 2
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        sliding_window=sliding_window,
-                                        watermark=0)
-
-    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
-
-    parent = Sequence(seq_id=1,
-                      inputs={
-                          "prompt": "one two three",
-                          "prompt_token_ids": [0, 1, 2],
-                      },
-                      block_size=block_size)
-    seq_group = SequenceGroup(request_id="1",
-                              seqs=[parent],
-                              arrival_time=time.time(),
-                              sampling_params=SamplingParams(),
-                              lora_request=None)
-    block_manager.allocate(seq_group)
-
-    # assert the number of blocks allocated is correct
-    # the parent seq has len 3, but since sliding_window is 2,
-    # we will use at most 2 blocks
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window
-
-    # Fork prompt and copy block tables.
-    child = parent.fork(2)
-    block_manager.fork(parent, child)
-
-    # assert the number of blocks allocated is correct
-    # forking does not increase memory consumption
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window
-
-    # assert both parent and child share all blocks
-    assert block_manager.get_block_table(
-        parent) == block_manager.get_block_table(child)
-
-    token_id = 4
-    # Append token to child. Block is shared so copy on write occurs.
-    child.append_token_id(token_id, {token_id: Logprob(0.0)})
-    block_manager.append_slots(child)
-
-    # assert the number of blocks allocated is correct
-    # we will use now one block more. Each seq will use 2 blocks,
-    # but only one can be shared
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window - 1
-
-    token_id = 5
-    parent.append_token_id(token_id, {token_id: Logprob(0.0)})
-    block_manager.append_slots(parent)
-
-    # assert the number of blocks allocated is correct
-    # no change, because both sequences are still just sharing one block
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window - 1
-
-    block_table_parent = block_manager.get_block_table(parent)
-    block_table_child = block_manager.get_block_table(child)
-
-    assert block_table_parent != block_table_child
-
-    # assert both blocks are sharing the second-last block
-    assert block_table_parent[-2] == block_table_child[-2]
-
-    # now let's clean up...
-    block_manager.free(parent)
-
-    # assert the number of blocks allocated is correct
-    # We have freed one seq, reducing the ref count of two blocks by one.
-    # One of the two was only used by the parent seq, so this is now free.
-    # The child seq still consumes sliding_window blocks
-    assert block_manager.get_num_free_gpu_blocks(
-    ) == num_gpu_blocks - sliding_window
-
-    # free all blocks
-    block_manager.free(child)
-
-    # assert all blocks are free now
-    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
-
-
-def test_mark_blocks_as_computed_with_prefix_cache_and_chunked_prefill():
-    """When prefix cache and chunked prefill are enabled, the block manager
-    should only mark a chunk of blocks as computed instead of all blocks.
-    """
-
-    block_size = 4
-    num_cpu_blocks = 0
-    num_gpu_blocks = 16
-    block_manager = BlockSpaceManagerV1(block_size,
-                                        num_gpu_blocks,
-                                        num_cpu_blocks,
-                                        watermark=0,
-                                        enable_caching=True)
-
-    # Set prompt size to have num_gpu_blocks - 1 full blocks.
-    prompt_length = block_size * num_gpu_blocks - 1
-
-    # Allocate (reserve) all blocks.
-    _, seq_group = create_dummy_prompt("0",
-                                       prompt_length,
-                                       block_size=block_size)
-    block_manager.allocate(seq_group)
-    assert seq_group.seqs[0].n_blocks == num_gpu_blocks
-
-    # 1st chunk: Compute 2 and half blocks. Should mark 2 blocks as computed.
-    token_chunk_size = int(block_size * 2.5)
-    block_manager.mark_blocks_as_computed(seq_group, token_chunk_size)
-    computed_blocks = block_manager.get_all_computed_blocks(seq_group.seqs[0])
-    assert len(computed_blocks) == 2
-
-    # Actual computed tokens.
-    seq_group.seqs[0].data.update_num_computed_tokens(token_chunk_size)
-
-    # 2nd chunk: Complete 3rd block and additional 4 blocks.
-    token_chunk_size = int(block_size * 4.5)
-    block_manager.mark_blocks_as_computed(seq_group, token_chunk_size)
-    computed_blocks = block_manager.get_all_computed_blocks(seq_group.seqs[0])
-    assert len(computed_blocks) == 7