Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

705f6a35 · zhuwenwen · af837396 · 4cf256ae · 705f6a35 · 705f6a35
Commit 705f6a35 authored Jul 16, 2024 by zhuwenwen
20 changed files
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,6 +4,8 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py # for pynvml package
-torch == 2.3.0
+torch == 2.3.1
-xformers == 0.0.26.post1  # Requires PyTorch 2.3.0
+# These must be updated alongside torch
-vllm-flash-attn == 2.5.9  # Requires PyTorch 2.3.0
+torchvision == 0.18.1   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.27  # Requires PyTorch 2.3.1
+vllm-flash-attn == 2.5.9.post1  # Requires PyTorch 2.3.1
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
-# formatting
+-r requirements-lint.txt
-yapf==0.32.0
+-r requirements-test.txt
-toml==0.10.2
-tomli==2.0.1
-ruff==0.1.5
-codespell==2.2.6
-isort==5.13.2
-clang-format==18.1.5
-# type checking
+# Avoid adding requirements directly to this file.
-mypy==1.9.0
+# Instead, modify the two files referenced above.
-types-PyYAML
-types-requests
-types-setuptools
-# testing
-pytest
-tensorizer>=2.9.0
-pytest-forked
-pytest-asyncio
-pytest-rerunfailures
-pytest-shard
-# testing utils
-awscli
-einops # required for MPT
-httpx
-peft
-requests
-ray
-sentence-transformers # required for embedding
-# Benchmarking
-aiohttp
-# quantization
-bitsandbytes==0.42.0
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
+# formatting
+yapf==0.32.0
+toml==0.10.2
+tomli==2.0.1
+ruff==0.1.5
+codespell==2.3.0
+isort==5.13.2
+clang-format==18.1.5
+# type checking
+mypy==1.9.0
+types-PyYAML
+types-requests
+types-setuptools
--- a/requirements-mamba.txt
+++ b/requirements-mamba.txt
+# Mamba dependencies
+mamba-ssm>=1.2.2
+causal-conv1d>=1.2.0
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
+# Common dependencies
+-r requirements-common.txt
+# OpenVINO dependencies
+torch >= 2.1.2
+openvino ~= 2024.3.0.dev
+optimum-intel[openvino] >= 1.18.1
+triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -2,6 +2,5 @@
 -r requirements-common.txt
 # Dependencies for AMD GPUs
-ray == 2.9.1
+ray >= 2.10.0
-# ray >= 2.10.0
 pytest-asyncio
--- a/requirements-test.txt
+++ b/requirements-test.txt
+# testing
+pytest
+tensorizer>=2.9.0
+pytest-forked
+pytest-asyncio
+pytest-rerunfailures
+pytest-shard
+# testing utils
+awscli
+einops # required for MPT
+httpx
+peft
+requests
+ray
+sentence-transformers # required for embedding
+sparseml==1.8.0 # required for compressed-tensors
+compressed-tensors==0.4.0 # required for compressed-tensors
+# Benchmarking
+aiohttp
+# quantization
+bitsandbytes==0.42.0
\ No newline at end of file
--- a/requirements-tpu.txt
+++ b/requirements-tpu.txt
+# Common dependencies
+-r requirements-common.txt
+# Dependencies for TPU
+# Currently, the TPU backend uses a nightly version of PyTorch XLA.
+# You can install the dependencies in Dockerfile.tpu.
+triton  # To avoid import errors
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
+# Common dependencies
+-r requirements-common.txt
+setuptools < 70.0.0 # IPEX's torch depends on setuptools < 70; to be removed.
+torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
+intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
+triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
--- a/setup.py
+++ b/setup.py
@@ -5,6 +5,7 @@ import os
 import re
 import subprocess
 import sys
+import warnings
 from shutil import which
 from typing import Dict, List
@@ -30,6 +31,34 @@ def load_module_from_path(module_name, path):
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
+def embed_commit_hash():
+    try:
+        if "BUILDKITE_COMMIT" in os.environ:
+            # ci build
+            commit_id = os.environ["BUILDKITE_COMMIT"]
+        else:
+            commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
+                                                encoding="utf-8").strip()
+        commit_contents = f'__commit__ = "{commit_id}"\n'
+        version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
+        with open(version_file, "w", encoding="utf-8") as f:
+            f.write(commit_contents)
+    except subprocess.CalledProcessError as e:
+        warnings.warn(f"Failed to get commit hash:\n{e}",
+                      RuntimeWarning,
+                      stacklevel=2)
+    except Exception as e:
+        warnings.warn(f"Failed to embed commit hash:\n{e}",
+                      RuntimeWarning,
+                      stacklevel=2)
+embed_commit_hash()
 # cannot import envs directly because it depends on vllm,
 #  which is not installed yet
 envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
@@ -144,6 +173,7 @@ class cmake_build_ext(build_ext):
            cmake_args += [
                '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
                '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
+                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
            ]
        elif is_ccache_available():
            cmake_args += [
@@ -175,7 +205,6 @@ class cmake_build_ext(build_ext):
        else:
            # Default build tool to whatever cmake picks.
            build_tool = []
        subprocess.check_call(
            ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
            cwd=self.build_temp)
@@ -210,9 +239,9 @@ class cmake_build_ext(build_ext):
 def _is_cuda() -> bool:
-    return VLLM_TARGET_DEVICE == "cuda" \
+    has_cuda = torch.version.cuda is not None
-            and torch.version.cuda is not None \
+    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
-            and not _is_neuron()
+            and not (_is_neuron() or _is_tpu()))
 def _is_hip() -> bool:
@@ -229,10 +258,26 @@ def _is_neuron() -> bool:
    return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron"
+def _is_tpu() -> bool:
+    return VLLM_TARGET_DEVICE == "tpu"
 def _is_cpu() -> bool:
    return VLLM_TARGET_DEVICE == "cpu"
+def _is_openvino() -> bool:
+    return VLLM_TARGET_DEVICE == "openvino"
+def _is_xpu() -> bool:
+    return VLLM_TARGET_DEVICE == "xpu"
+def _build_custom_ops() -> bool:
+    return _is_cuda() or _is_hip() or _is_cpu()
 def _install_punica() -> bool:
    return envs.VLLM_INSTALL_PUNICA_KERNELS
@@ -350,8 +395,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
    version += ".torch" + torch.__version__[:5]
    with open(add_version_path, encoding="utf-8",mode="w") as file:
-        file.write("__version__='0.5.0'\n")
+        file.write("__version__='0.5.2'\n")
-        file.write("__dcu_version__='0.5.0+{}'\n".format(version))
+        file.write("__dcu_version__='0.5.2+{}'\n".format(version))
    file.close()
@@ -364,7 +409,7 @@ def get_version():
 def get_vllm_version() -> str:
-    version = find_version(get_path("vllm", "__init__.py"))
+    # version = find_version(get_path("vllm", "version.py"))
    if _is_cuda():
        cuda_version = str(get_nvcc_cuda_version())
@@ -384,8 +429,14 @@ def get_vllm_version() -> str:
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"+neuron{neuron_version_str}"
+    elif _is_openvino():
+        version += "+openvino"
+    elif _is_tpu():
+        version += "+tpu"
    elif _is_cpu():
        version += "+cpu"
+    elif _is_xpu():
+        version += "+xpu"
    else:
        raise RuntimeError("Unknown runtime environment")
@@ -431,11 +482,18 @@ def get_requirements() -> List[str]:
        requirements = _read_requirements("requirements-rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("requirements-neuron.txt")
+    elif _is_openvino():
+        requirements = _read_requirements("requirements-openvino.txt")
+    elif _is_tpu():
+        requirements = _read_requirements("requirements-tpu.txt")
    elif _is_cpu():
        requirements = _read_requirements("requirements-cpu.txt")
+    elif _is_xpu():
+        requirements = _read_requirements("requirements-xpu.txt")
    else:
        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
+            "Unsupported platform, please use CUDA, ROCm, Neuron, "
+            "OpenVINO, or CPU.")
    return requirements
@@ -444,7 +502,7 @@ ext_modules = []
 if _is_cuda() or _is_hip():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))
-if not _is_neuron():
+if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))
    if _install_punica():
@@ -487,6 +545,11 @@ setup(
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
    },
-    cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
+    cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {},
    package_data=package_data,
+    entry_points={
+        "console_scripts": [
+            "vllm=vllm.scripts:main",
+        ],
+    },
 )
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
 """vllm.entrypoints.api_server with some extra logging for testing."""
-import argparse
 from typing import Any, Dict
 import uvicorn
@@ -8,6 +7,7 @@ from fastapi.responses import JSONResponse, Response
 import vllm.entrypoints.api_server
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.utils import FlexibleArgumentParser
 app = vllm.entrypoints.api_server.app
@@ -33,7 +33,7 @@ def stats() -> Response:
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = FlexibleArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser = AsyncEngineArgs.add_cli_args(parser)

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -2,8 +2,13 @@ import asyncio
 from dataclasses import dataclass
 import pytest
+import torch
-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm import SamplingParams
+from vllm.config import ParallelConfig
+from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
+from ..utils import wait_for_gpu_memory_to_clear
 @dataclass
@@ -19,8 +24,11 @@ class MockEngine:
        self.add_request_calls = 0
        self.abort_request_calls = 0
        self.request_id = None
+        # Ugly, remove dependency when possible
+        self.parallel_config = ParallelConfig(1, 1, False)
-    async def step_async(self):
+    async def step_async(self, virtual_engine):
+        # PP size is 1, ignore virtual engine
        self.step_calls += 1
        return [RequestOutput(
            request_id=self.request_id)] if self.request_id else []
@@ -28,6 +36,9 @@ class MockEngine:
    async def process_model_inputs_async(self, *args, **kwargs):
        pass
+    async def stop_remote_worker_execution_loop_async(self):
+        pass
    def generate(self, request_id):
        self.request_id = request_id
@@ -37,6 +48,7 @@ class MockEngine:
    def add_request(self, **kwargs):
        del kwargs  # Unused
        self.add_request_calls += 1
+        print(f'Request calls: {self.add_request_calls}')
    async def add_request_async(self, **kwargs):
        self.add_request_calls += 1
@@ -49,6 +61,9 @@ class MockEngine:
    def has_unfinished_requests(self):
        return self.request_id is not None
+    def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
+        return self.request_id is not None
 class MockAsyncLLMEngine(AsyncLLMEngine):
@@ -72,6 +87,7 @@ async def test_new_requests_event():
    engine.engine.generate("2")
    await asyncio.sleep(0)
    await asyncio.sleep(0)
+    await asyncio.sleep(0)
    assert engine.engine.add_request_calls == 2
    assert engine.engine.step_calls >= 2
    await asyncio.sleep(0.001)
@@ -94,3 +110,35 @@ async def test_new_requests_event():
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None
+def test_asyncio_run():
+    wait_for_gpu_memory_to_clear(
+        devices=list(range(torch.cuda.device_count())),
+        threshold_bytes=2 * 2**30,
+        timeout_s=60,
+    )
+    engine = AsyncLLMEngine.from_engine_args(
+        AsyncEngineArgs(model="facebook/opt-125m"))
+    async def run(prompt: str):
+        sampling_params = SamplingParams(
+            temperature=0,
+            max_tokens=32,
+        )
+        async for output in engine.generate(prompt,
+                                            sampling_params,
+                                            request_id=prompt):
+            final_output = output
+        return final_output
+    async def generate():
+        return await asyncio.gather(
+            run("test0"),
+            run("test1"),
+        )
+    results = asyncio.run(generate())
+    assert len(results) == 2
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
-from ..utils import ServerRunner
+from ..utils import RemoteOpenAIServer
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
@@ -12,34 +9,27 @@ MODEL_NAME = "facebook/opt-125m"
 @pytest.fixture(scope="module")
 def server():
-    ray.init()
+    with RemoteOpenAIServer([
-    server_runner = ServerRunner.remote([
+            "--model",
-        "--model",
+            MODEL_NAME,
-        MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
-        # use half precision for speed and memory savings in CI environment
+            "--dtype",
-        "--dtype",
+            "float16",
-        "float16",
+            "--max-model-len",
-        "--max-model-len",
+            "2048",
-        "2048",
+            "--enforce-eager",
-        "--enforce-eager",
+            "--engine-use-ray"
-        "--engine-use-ray"
+    ]) as remote_server:
-    ])
+        yield remote_server
-    ray.get(server_runner.ready.remote())
-    yield server_runner
-    ray.shutdown()
 @pytest.fixture(scope="module")
-def client():
+def client(server):
-    client = openai.AsyncOpenAI(
+    return server.get_async_client()
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
-    yield client
 @pytest.mark.asyncio
-async def test_check_models(server, client: openai.AsyncOpenAI):
+async def test_check_models(client: openai.AsyncOpenAI):
    models = await client.models.list()
    models = models.data
    served_model = models[0]
@@ -48,7 +38,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-async def test_single_completion(server, client: openai.AsyncOpenAI):
+async def test_single_completion(client: openai.AsyncOpenAI):
    completion = await client.completions.create(model=MODEL_NAME,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
@@ -72,7 +62,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-async def test_single_chat_session(server, client: openai.AsyncOpenAI):
+async def test_single_chat_session(client: openai.AsyncOpenAI):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -8,12 +8,14 @@ import weakref
 import pytest
 from vllm import LLM
+from vllm.utils import is_hip
+from ..models.utils import check_outputs_equal
 MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
 ]
-VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
 def test_vllm_gc_ed():
@@ -27,6 +29,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False, True])
@@ -35,13 +38,16 @@ def test_models(
    vllm_runner,
    example_prompts,
    model: str,
+    backend: str,
    dtype: str,
    max_tokens: int,
    enforce_eager: bool,
 ) -> None:
-    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
-    if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
+    if backend == "FLASHINFER" and is_hip():
-        pytest.skip("Skipping non-eager test for FlashInferBackend.")
+        pytest.skip("Flashinfer does not support ROCm/HIP.")
+    os.environ["VLLM_ATTENTION_BACKEND"] = backend
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@@ -52,10 +58,9 @@ def test_models(
                     gpu_memory_utilization=0.7) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    for i in range(len(example_prompts)):
+    check_outputs_equal(
-        hf_output_ids, hf_output_str = hf_outputs[i]
+        outputs_0_lst=hf_outputs,
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        outputs_1_lst=vllm_outputs,
-        assert hf_output_str == vllm_output_str, (
+        name_0="hf",
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        name_1="vllm",
-        assert hf_output_ids == vllm_output_ids, (
+    )
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -8,6 +8,8 @@ Run `pytest tests/models/test_chunked_prefill.py`.
 """
 import pytest
+from ..models.utils import check_outputs_equal
 MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
@@ -54,10 +56,9 @@ def test_models(
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    for i in range(len(example_prompts)):
+    check_outputs_equal(
-        hf_output_ids, hf_output_str = hf_outputs[i]
+        outputs_0_lst=hf_outputs,
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        outputs_1_lst=vllm_outputs,
-        assert hf_output_str == vllm_output_str, (
+        name_0="hf",
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        name_1="vllm",
-        assert hf_output_ids == vllm_output_ids, (
+    )
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -12,6 +12,8 @@ from vllm import SamplingParams
 from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
                                 ENABLE_ARTIFICIAL_PREEMPT)
+from ..models.utils import check_outputs_equal
 MODELS = [
    "facebook/opt-125m",
 ]
@@ -54,8 +56,8 @@ def test_chunked_prefill_recompute(
            max_num_seqs=max_num_seqs,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@@ -89,18 +91,18 @@ def test_preemption(
            disable_log_stats=False,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
        total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
    assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
            "is not enough KV cache space." in caplog_vllm.text)
    # Ensure the count bucket of request-level histogram metrics matches
@@ -145,10 +147,10 @@ def test_swap(
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
                                                       beam_width, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
        total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
    for i in range(len(example_prompts)):
        hf_output_ids, _ = hf_outputs[i]
@@ -212,8 +214,8 @@ def test_swap_infeasible(
            example_prompts,
            sampling_params=sampling_params,
        )
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
    # Verify the request is ignored and not hang.
    assert req_outputs[0].outputs[0].finish_reason == "length"
@@ -250,8 +252,8 @@ def test_preemption_infeasible(
            sampling_params=sampling_params,
        )
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
    # Verify the request is ignored and not hang.
    for req_output in req_outputs:

--- a/tests/conftest.py
+++ b/tests/conftest.py
 import contextlib
 import gc
 import os
-import subprocess
 import sys
-from typing import Any, Dict, List, Optional, Tuple, TypeVar
+from collections import UserList
+from dataclasses import dataclass
+from functools import cached_property
+from pathlib import Path
+from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
+                    TypeVar)
 import pytest
 import torch
@@ -11,17 +15,17 @@ import torch.nn as nn
 import torch.nn.functional as F
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
-                          AutoProcessor, AutoTokenizer, BatchEncoding)
+                          AutoTokenizer, BatchEncoding)
 from vllm import LLM, SamplingParams
-from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
+from vllm.config import TokenizerPoolConfig
-from vllm.distributed import destroy_model_parallel
+from vllm.distributed import (destroy_distributed_environment,
+                              destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalData
+from vllm.multimodal.utils import fetch_image
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu
+from vllm.utils import cuda_device_count_stateless, is_cpu
 logger = init_logger(__name__)
@@ -29,21 +33,8 @@ _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
-# Multi modal related
+_IMAGE_DIR = Path(_TEST_DIR) / "images"
-# You can use `.buildkite/download-images.sh` to download the assets
+"""You can use `.buildkite/download-images.sh` to download the assets."""
-PIXEL_VALUES_FILES = [
-    os.path.join(_TEST_DIR, "images", filename) for filename in
-    ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
-]
-IMAGE_FEATURES_FILES = [
-    os.path.join(_TEST_DIR, "images", filename) for filename in
-    ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
-]
-IMAGE_FILES = [
-    os.path.join(_TEST_DIR, "images", filename)
-    for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
-]
-assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES)
 def _read_prompts(filename: str) -> List[str]:
@@ -52,8 +43,65 @@ def _read_prompts(filename: str) -> List[str]:
        return prompts
+@dataclass(frozen=True)
+class ImageAsset:
+    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
+    @cached_property
+    def pil_image(self) -> Image.Image:
+        if self.name == "boardwalk":
+            return fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+            )
+        return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
+class _ImageAssetPrompts(TypedDict):
+    stop_sign: str
+    cherry_blossom: str
+    boardwalk: str
+if sys.version_info < (3, 9):
+    # UserList cannot be subscripted before Python 3.9
+    class _ImageAssetsBase(UserList):
+        pass
+else:
+    class _ImageAssetsBase(UserList[ImageAsset]):
+        pass
+class _ImageAssets(_ImageAssetsBase):
+    def __init__(self) -> None:
+        super().__init__([
+            ImageAsset("stop_sign"),
+            ImageAsset("cherry_blossom"),
+            ImageAsset("boardwalk")
+        ])
+    def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
+        """
+        Convenience method to define the prompt for each test image.
+        The order of the returned prompts matches the order of the
+        assets when iterating through this object.
+        """
+        return [
+            prompts["stop_sign"], prompts["cherry_blossom"],
+            prompts["boardwalk"]
+        ]
+IMAGE_ASSETS = _ImageAssets()
+"""Singleton instance of :class:`_ImageAssets`."""
 def cleanup():
    destroy_model_parallel()
+    destroy_distributed_environment()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
@@ -81,31 +129,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
        cleanup()
-@pytest.fixture(scope="session")
-def hf_images() -> List[Image.Image]:
-    return [Image.open(filename) for filename in IMAGE_FILES]
-@pytest.fixture()
-def vllm_images(request) -> List[MultiModalData]:
-    vision_language_config = request.getfixturevalue("model_and_config")[1]
-    if vision_language_config.image_input_type == (
-            VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
-        return [
-            ImageFeatureData(torch.load(filename))
-            for filename in IMAGE_FEATURES_FILES
-        ]
-    else:
-        return [
-            ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES
-        ]
-@pytest.fixture()
-def vllm_image_tensors(request) -> List[torch.Tensor]:
-    return [torch.load(filename) for filename in PIXEL_VALUES_FILES]
 @pytest.fixture
 def example_prompts() -> List[str]:
    prompts = []
@@ -122,6 +145,11 @@ def example_long_prompts() -> List[str]:
    return prompts
+@pytest.fixture(scope="session")
+def image_assets() -> _ImageAssets:
+    return IMAGE_ASSETS
 _STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.half,
    "bfloat16": torch.bfloat16,
@@ -144,8 +172,10 @@ class HfRunner:
        model_name: str,
        dtype: str = "half",
        *,
+        model_kwargs: Optional[Dict[str, Any]] = None,
        is_embedding_model: bool = False,
        is_vision_model: bool = False,
+        is_sparseml_model: bool = False,
    ) -> None:
        assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
@@ -163,14 +193,19 @@ class HfRunner:
        else:
            if is_vision_model:
                auto_cls = AutoModelForVision2Seq
+            elif is_sparseml_model:
+                from sparseml.transformers import SparseAutoModelForCausalLM
+                auto_cls = SparseAutoModelForCausalLM
            else:
                auto_cls = AutoModelForCausalLM
+            model_kwargs = model_kwargs if model_kwargs is not None else {}
            self.model = self.wrap_device(
                auto_cls.from_pretrained(
                    model_name,
                    torch_dtype=torch_dtype,
                    trust_remote_code=True,
+                    **model_kwargs,
                ))
        self.tokenizer = AutoTokenizer.from_pretrained(
@@ -180,6 +215,9 @@ class HfRunner:
        )
        try:
+            # don't put this import at the top level
+            # it will call torch.cuda.device_count()
+            from transformers import AutoProcessor  # noqa: F401
            self.processor = AutoProcessor.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
@@ -195,7 +233,7 @@ class HfRunner:
        self,
        prompts: List[str],
        images: Optional[List[Image.Image]] = None,
-        **kwargs,
+        **kwargs: Any,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        if images:
            assert len(prompts) == len(images)
@@ -230,11 +268,13 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
-                                images=images)
+                                images=images,
+                                **kwargs)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]
@@ -264,19 +304,30 @@ class HfRunner:
        self,
        prompts: List[str],
        max_tokens: int,
+        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
    ) -> List[List[torch.Tensor]]:
-        all_logprobs = []
+        all_logprobs: List[List[torch.Tensor]] = []
-        for prompt in prompts:
+        for i, prompt in enumerate(prompts):
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+            processor_kwargs: Dict[str, Any] = {
+                "text": prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+            inputs = self.processor(**processor_kwargs)
            output = self.model.generate(
-                self.wrap_device(input_ids),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
+                **kwargs,
            )
-            seq_logprobs = []
+            seq_logprobs: List[torch.Tensor] = []
            for hidden_states in output.hidden_states:
                last_hidden_states = hidden_states[-1][0]
                logits = torch.matmul(
@@ -296,20 +347,32 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
+        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
    ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
        all_logprobs: List[List[Dict[int, float]]] = []
        all_output_ids: List[List[int]] = []
        all_output_strs: List[str] = []
-        for prompt in prompts:
+        for i, prompt in enumerate(prompts):
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+            processor_kwargs: Dict[str, Any] = {
+                "text": prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+            inputs = self.processor(**processor_kwargs)
+            input_ids = inputs.input_ids
            output = self.model.generate(
-                self.wrap_device(input_ids),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
+                **kwargs,
            )
            seq_logprobs: List[torch.Tensor] = []
@@ -362,7 +425,7 @@ class HfRunner:
        cleanup()
-@pytest.fixture
+@pytest.fixture(scope="session")
 def hf_runner():
    return HfRunner
@@ -382,6 +445,7 @@ class VllmRunner:
        block_size: int = 16,
        enable_chunked_prefill: bool = False,
        swap_space: int = 4,
+        enforce_eager: bool = False,
        **kwargs,
    ) -> None:
        self.model = LLM(
@@ -390,6 +454,7 @@ class VllmRunner:
            trust_remote_code=True,
            dtype=dtype,
            swap_space=swap_space,
+            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
@@ -402,7 +467,7 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List[Image.Image]] = None,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        if images is not None:
            assert len(prompts) == len(images)
@@ -410,7 +475,7 @@ class VllmRunner:
        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
        if images is not None:
            for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = image
+                inputs[i]["multi_modal_data"] = {"image": image}
        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)
@@ -423,7 +488,7 @@ class VllmRunner:
            req_sample_output_strs: List[str] = []
            for sample in req_output.outputs:
                output_str = sample.text
-                output_ids = sample.token_ids
+                output_ids = list(sample.token_ids)
                req_sample_output_ids.append(prompt_ids + output_ids)
                req_sample_output_strs.append(prompt_str + output_str)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
@@ -433,10 +498,19 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
+        images: Optional[List[Image.Image]] = None,
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        assert sampling_params.logprobs is not None
-        req_outputs = self.model.generate(prompts,
+        if images is not None:
+            assert len(prompts) == len(images)
+        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
+        if images is not None:
+            for i, image in enumerate(images):
+                inputs[i]["multi_modal_data"] = {"image": image}
+        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)
        outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
        for req_output in req_outputs:
@@ -451,7 +525,7 @@ class VllmRunner:
        self,
        prompts: List[str],
        max_tokens: int,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List[Image.Image]] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params, images=images)
@@ -463,11 +537,14 @@ class VllmRunner:
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
+        images: Optional[List[Image.Image]] = None,
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        greedy_logprobs_params = SamplingParams(temperature=0.0,
                                                max_tokens=max_tokens,
                                                logprobs=num_logprobs)
-        outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)
+        outputs = self.generate_w_logprobs(prompts,
+                                           greedy_logprobs_params,
+                                           images=images)
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]
@@ -537,15 +614,4 @@ def num_gpus_available():
    """Get number of GPUs without initializing the CUDA context
    in current process."""
-    try:
+    return cuda_device_count_stateless()
-        out = subprocess.run([
-            sys.executable, "-c",
-            "import torch; print(torch.cuda.device_count())"
-        ],
-                             capture_output=True,
-                             check=True,
-                             text=True)
-    except subprocess.CalledProcessError as e:
-        logger.warning("Failed to get number of GPUs.", exc_info=e)
-        return 0
-    return int(out.stdout.strip())
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -477,3 +477,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
        assert expected_token_ids == actual_token_ids
    assert baseline_token_ids == test_token_ids
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+        # Keep the number of blocks small so that eviction is hit quickly.
+        "max_model_len": 48,
+        "block_size": 16,
+        "num_gpu_blocks_override": 3,
+        # Test APC in v2 block
+        "use_v2_block_manager": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "enable_prefix_caching": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "enable_prefix_caching": True,
+}])
+@pytest.mark.parametrize("seed", [1])
+def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
+                                                 test_llm_generator):
+    """Verify that block manager v2 with auto prefix caching (APC) works
+    correctly even after eviction has started.
+    With APC enabled, all blocks are held by the native block at the
+    beginning. After that, blocks are managed by the evictor instead. If a
+    cache hit lands on one of the evictor's blocks, that block can be reused;
+    otherwise its KV cache has to be recomputed.
+    """
+    output_len = 10
+    temperature = 0.0
+    prompts = [
+        "You are a helpful assistant. Please answer truthfully and write "
+        "out your thinking step by step to be sure you get the right answer. "
+        "If you make a mistake, attempt to correct it. who are you?",
+        "You are a helpful assistant. Please answer truthfully and write out "
+        "your thinking step by step to be sure you get the right answer. You "
+        "are helpful and harmless and you follow ethical guidelines. "
+        "who are you?"
+    ]
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+    print('Getting token ids with APC disabled')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+    print('Getting token ids with APC enabled')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+    assert baseline_token_ids == test_token_ids
--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
+from typing import List
 import pytest
 from vllm.core.block.block_table import BlockTable
@@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
    token_ids = list(range(sequence_len))
    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
-    block_tables = []
+    block_tables: List[BlockTable] = []
    for i in range(5):
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
@@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
    num_immutable_blocks_per_alloc = len(
        chunked_tokens) - num_mutable_blocks_per_alloc
-    block_tables = []
+    block_tables: List[BlockTable] = []
    for alloc_i in range(1, 6):
        block_tables.append(
@@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    appended_so_far = []
+    appended_so_far: List[int] = []
    for append in chunk_list(token_ids_to_append, append_size):
        block_table.append_token_ids(append)
        appended_so_far.extend(append)
@@ -371,8 +373,9 @@ def test_cow(block_size: int, sequence_len: int, append_len: int,
                                   block_size) - (sequence_len // block_size)
    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    original_block_ids = original_block_table.physical_block_ids
+    original_block_ids = original_block_table.physical_block_ids[:]
+    print("original_block_ids = {}".format(original_block_ids))
    forked_block_table = original_block_table.fork()
    # Expect no additional allocation (copy on _write_).
@@ -455,7 +458,7 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
    # Allocate lookahead slots.
    original_block_table.ensure_num_empty_slots(lookahead_slots)
-    original_block_ids = original_block_table.physical_block_ids
+    original_block_ids = original_block_table.physical_block_ids[:]
    forked_block_table = original_block_table.fork()

--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
@@ -8,8 +8,8 @@ from vllm.utils import Device, chunk_list
 @pytest.mark.parametrize("num_gpu_blocks", [1024])
 @pytest.mark.parametrize("block_size", [16])
 @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
+def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
-                          block_size: int, allocator_type: str):
+                                block_size: int, allocator_type: str):
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
@@ -21,14 +21,14 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
    cpu_blocks = [
-        allocator.allocate_mutable(prev_block=None, device=Device.CPU)
+        allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
        for _ in range(num_cpu_blocks)
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
    gpu_blocks = [
-        allocator.allocate_mutable(prev_block=None, device=Device.GPU)
+        allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
        for _ in range(num_gpu_blocks)
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
@@ -47,8 +47,8 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
 @pytest.mark.parametrize("num_gpu_blocks", [1024])
 @pytest.mark.parametrize("block_size", [2])
 @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
+def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
-                            block_size: int, allocator_type: str):
+                                  block_size: int, allocator_type: str):
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
@@ -67,18 +67,18 @@ def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
    cpu_blocks = [
-        allocator.allocate_immutable(prev_block=None,
+        allocator.allocate_immutable_block(prev_block=None,
-                                     token_ids=token_ids,
+                                           token_ids=token_ids,
-                                     device=Device.CPU)
+                                           device=Device.CPU)
        for token_ids in cpu_token_ids
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
    gpu_blocks = [
-        allocator.allocate_immutable(prev_block=None,
+        allocator.allocate_immutable_block(prev_block=None,
-                                     token_ids=token_ids,
+                                           token_ids=token_ids,
-                                     device=Device.GPU)
+                                           device=Device.GPU)
        for token_ids in gpu_token_ids
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0