Merge branch 'v0.5.4-dtk24.04.1'

e7c1b7f3 · zhuwenwen · 7462218e · 04c62b93 · e7c1b7f3 · e7c1b7f3
Commit e7c1b7f3 authored Sep 06, 2024 by zhuwenwen
20 changed files
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -2,10 +2,11 @@ cmake >= 3.21
 ninja  # For faster builds.
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
-numpy
+numpy < 2.0.0
 requests
+tqdm
 py-cpuinfo
-transformers >= 4.40.0  # Required for StarCoder2 & Llava, Llama 3.
+transformers >= 4.43.2  # Required for Chameleon and Llama 3.1 hotfix.
 tokenizers >= 0.19.1  # Required for Llama 3.
 fastapi
 aiohttp
@@ -16,7 +17,8 @@ pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.10.1
-outlines >= 0.0.43 # Requires torch >= 2.1.0
+lm-format-enforcer == 0.10.3
+outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+pyzmq
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,5 +2,5 @@
 -r requirements-common.txt

 # Dependencies for x86_64 CPUs
-torch == 2.3.1+cpu
-triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
+torch == 2.4.0+cpu; platform_machine != "ppc64le"
+torchvision; platform_machine != "ppc64le"   # required for the image processor of phi3v, this must be updated alongside torch
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,6 +4,8 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py # for pynvml package
-torch == 2.3.0
-xformers == 0.0.26.post1  # Requires PyTorch 2.3.0
-vllm-flash-attn == 2.5.9  # Requires PyTorch 2.3.0
+torch == 2.4.0
+# These must be updated alongside torch
+torchvision == 0.19   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.27.post2  # Requires PyTorch 2.4.0
+vllm-flash-attn == 2.6.1  # Requires PyTorch 2.4.0
--- a/requirements-mamba.txt
+++ b/requirements-mamba.txt
+# Mamba dependencies
+mamba-ssm>=1.2.2
+causal-conv1d>=1.2.0
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
+# Common dependencies
+# -r requirements-common.txt
+# TODO: remove temporary copy of all common dependencies once Optimum Intel supports Transformers >= 4.43.2
+cmake >= 3.21
+ninja  # For faster builds.
+psutil
+sentencepiece  # Required for LLaMA tokenizer.
+numpy < 2.0.0
+requests
+tqdm
+py-cpuinfo
+transformers < 4.43
+tokenizers >= 0.19.1  # Required for Llama 3.
+fastapi
+aiohttp
+openai
+uvicorn[standard]
+pydantic >= 2.0  # Required for OpenAI server.
+pillow  # Required for image processing
+prometheus_client >= 0.18.0
+prometheus-fastapi-instrumentator >= 7.0.0
+tiktoken >= 0.6.0  # Required for DBRX tokenizer
+lm-format-enforcer == 0.10.3
+outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+typing_extensions
+filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+pyzmq
+
+# OpenVINO dependencies
+torch >= 2.1.2
+openvino ~= 2024.3.0.dev
+openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
+optimum-intel[openvino] >= 1.18.1
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -2,5 +2,9 @@
 -r requirements-common.txt

 # Dependencies for AMD GPUs
+awscli
+boto3
+botocore
 ray >= 2.10.0
+peft
 pytest-asyncio
--- a/requirements-test.txt
+++ b/requirements-test.txt
+# Needed for Ray accelerated DAG tests
+-r requirements-adag.txt
+
 # testing
 pytest
 tensorizer>=2.9.0
@@ -14,9 +17,11 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
+compressed-tensors==0.4.0 # required for compressed-tensors
+timm # required for internvl test

 # Benchmarking
 aiohttp

 # quantization
-bitsandbytes==0.42.0
+bitsandbytes==0.42.0
\ No newline at end of file
--- a/requirements-tpu.txt
+++ b/requirements-tpu.txt
@@ -4,4 +4,4 @@
 # Dependencies for TPU
 # Currently, the TPU backend uses a nightly version of PyTorch XLA.
 # You can install the dependencies in Dockerfile.tpu.
-triton  # To avoid import errors
+ray
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
+# Common dependencies
+-r requirements-common.txt
+
+setuptools < 70.0.0 # IPEX's torch has some dependency on this; to be removed.
+
+torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
+intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
+
+triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+
--- a/rocm_patch/rocm_bf16.patch
+++ b/rocm_patch/rocm_bf16.patch
--- amd_hip_bf16.h	2024-02-06 18:28:58.268699142 +0000
-+++ amd_hip_bf16.h.new	2024-02-06 18:28:31.988647133 +0000
-@@ -90,10 +90,10 @@
- #include "math_fwd.h"              // ocml device functions
- 
- #if defined(__HIPCC_RTC__)
-#define __HOST_DEVICE__ __device__
-+#define __HOST_DEVICE__ __device__ static
- #else
- #include <climits>
-#define __HOST_DEVICE__ __host__ __device__
-+#define __HOST_DEVICE__ __host__ __device__ static inline
- #endif
- 
- // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on
--- a/setup.py
+++ b/setup.py
@@ -5,6 +5,7 @@ import os
 import re
 import subprocess
 import sys
+import warnings
 from shutil import which
 from typing import Dict, List

@@ -33,6 +34,34 @@ def load_module_from_path(module_name, path):
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)

+
+def embed_commit_hash():
+    """Write the current commit id to ``vllm/commit_id.py``.
+
+    The id comes from the ``BUILDKITE_COMMIT`` environment variable when it
+    is set, otherwise from ``git rev-parse HEAD``. The generated file holds
+    a single ``__commit__ = "<id>"`` assignment.
+
+    Any ``Exception`` raised along the way is caught and reported as a
+    ``RuntimeWarning`` instead of being propagated.
+    """
+    try:
+        if "BUILDKITE_COMMIT" in os.environ:
+            # ci build
+            commit_id = os.environ["BUILDKITE_COMMIT"]
+        else:
+            commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
+                                                encoding="utf-8").strip()
+
+        commit_contents = f'__commit__ = "{commit_id}"\n'
+
+        # The file is (re)written under ROOT_DIR/vllm on every call.
+        version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
+        with open(version_file, "w", encoding="utf-8") as f:
+            f.write(commit_contents)
+
+    except subprocess.CalledProcessError as e:
+        # `git rev-parse` exited with a non-zero status.
+        warnings.warn(f"Failed to get commit hash:\n{e}",
+                      RuntimeWarning,
+                      stacklevel=2)
+    except Exception as e:
+        # Anything else, e.g. the file could not be written.
+        warnings.warn(f"Failed to embed commit hash:\n{e}",
+                      RuntimeWarning,
+                      stacklevel=2)
+
+
+embed_commit_hash()
+
 # cannot import envs directly because it depends on vllm,
 #  which is not installed yet
 envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
@@ -159,9 +188,6 @@ class cmake_build_ext(build_ext):
        # match.
        cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]

-        if _install_punica():
-            cmake_args += ['-DVLLM_INSTALL_PUNICA_KERNELS=ON']
-
        #
        # Setup parallelism and build tool
        #
@@ -240,12 +266,20 @@ def _is_cpu() -> bool:
    return VLLM_TARGET_DEVICE == "cpu"


+def _is_openvino() -> bool:
+    """Return True when ``VLLM_TARGET_DEVICE`` equals ``"openvino"``."""
+    return VLLM_TARGET_DEVICE == "openvino"
+
+
+def _is_xpu() -> bool:
+    """Return True when ``VLLM_TARGET_DEVICE`` equals ``"xpu"``."""
+    return VLLM_TARGET_DEVICE == "xpu"
+
+
 def _build_custom_ops() -> bool:
    return _is_cuda() or _is_hip() or _is_cpu()


-def _install_punica() -> bool:
-    return envs.VLLM_INSTALL_PUNICA_KERNELS
+def _build_core_ext() -> bool:
+    """Return True unless ``_is_neuron()`` or ``_is_tpu()`` is true."""
+    return not _is_neuron() and not _is_tpu()


 def get_hipcc_rocm_version():
@@ -346,10 +380,26 @@ def get_version_add(sha: Optional[str] = None) -> str:
            lines = file.readlines()
        rocm_version=lines[0].replace(".", "")
        version += ".dtk" + rocm_version
-
+    
+    new_version_content = f"""
+import warnings
+
+try:
+    import vllm.commit_id
+    __commit__ = vllm.commit_id.__commit__
+except Exception as e:
+    warnings.warn(f"Failed to read commit hash:\\n + str(e)",
+                  RuntimeWarning,
+                  stacklevel=2)
+    __commit__ = "COMMIT_HASH_PLACEHOLDER"
+
+__version__ = "0.5.4"
+__dcu_version__ = f'0.5.4+{version}' 
+
+"""
+    
    with open(add_version_path, encoding="utf-8",mode="w") as file:
-        file.write("__version__='0.5.0.post1'\n")
-        file.write("__dcu_version__='0.5.0.post1+{}'\n".format(version))
+        file.write(new_version_content)
    file.close()
    
    
@@ -382,10 +432,14 @@ def get_vllm_version() -> str:
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"+neuron{neuron_version_str}"
+    elif _is_openvino():
+        version += "+openvino"
    elif _is_tpu():
        version += "+tpu"
    elif _is_cpu():
        version += "+cpu"
+    elif _is_xpu():
+        version += "+xpu"
    else:
        raise RuntimeError("Unknown runtime environment")

@@ -431,27 +485,32 @@ def get_requirements() -> List[str]:
        requirements = _read_requirements("requirements-rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("requirements-neuron.txt")
+    elif _is_openvino():
+        requirements = _read_requirements("requirements-openvino.txt")
    elif _is_tpu():
        requirements = _read_requirements("requirements-tpu.txt")
    elif _is_cpu():
        requirements = _read_requirements("requirements-cpu.txt")
+    elif _is_xpu():
+        requirements = _read_requirements("requirements-xpu.txt")
    else:
        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
+            "Unsupported platform, please use CUDA, ROCm, Neuron, "
+            "OpenVINO, or CPU.")
    return requirements


 ext_modules = []

+if _build_core_ext():
+    ext_modules.append(CMakeExtension(name="vllm._core_C"))
+
 if _is_cuda() or _is_hip():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))

 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))

-    if _install_punica():
-        ext_modules.append(CMakeExtension(name="vllm._punica_C"))
-
 package_data = {
    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }
@@ -478,6 +537,7 @@ setup(
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
@@ -489,6 +549,11 @@ setup(
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
    },
-    cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {},
+    cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
    package_data=package_data,
-)
+    entry_points={
+        "console_scripts": [
+            "vllm=vllm.scripts:main",
+        ],
+    },
+)
\ No newline at end of file
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
 """vllm.entrypoints.api_server with some extra logging for testing."""
-import argparse
 from typing import Any, Dict

 import uvicorn
@@ -8,6 +7,7 @@ from fastapi.responses import JSONResponse, Response
 import vllm.entrypoints.api_server
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.utils import FlexibleArgumentParser

 app = vllm.entrypoints.api_server.app

@@ -33,7 +33,7 @@ def stats() -> Response:


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = FlexibleArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser = AsyncEngineArgs.add_cli_args(parser)

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -2,8 +2,13 @@ import asyncio
 from dataclasses import dataclass

 import pytest
+import torch

-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm import SamplingParams
+from vllm.config import ParallelConfig
+from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
+
+from ..utils import wait_for_gpu_memory_to_clear


 @dataclass
@@ -19,8 +24,11 @@ class MockEngine:
        self.add_request_calls = 0
        self.abort_request_calls = 0
        self.request_id = None
+        # Ugly, remove dependency when possible
+        self.parallel_config = ParallelConfig(1, 1, False)

-    async def step_async(self):
+    async def step_async(self, virtual_engine):
+        # PP size is 1, ignore virtual engine
        self.step_calls += 1
        return [RequestOutput(
            request_id=self.request_id)] if self.request_id else []
@@ -28,6 +36,9 @@ class MockEngine:
    async def process_model_inputs_async(self, *args, **kwargs):
        pass

+    async def stop_remote_worker_execution_loop_async(self):
+        pass
+
    def generate(self, request_id):
        self.request_id = request_id

@@ -37,6 +48,7 @@ class MockEngine:
    def add_request(self, **kwargs):
        del kwargs  # Unused
        self.add_request_calls += 1
+        print(f'Request calls: {self.add_request_calls}')

    async def add_request_async(self, **kwargs):
        self.add_request_calls += 1
@@ -49,6 +61,9 @@ class MockEngine:
    def has_unfinished_requests(self):
        return self.request_id is not None

+    def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
+        return self.request_id is not None
+

 class MockAsyncLLMEngine(AsyncLLMEngine):

@@ -72,6 +87,7 @@ async def test_new_requests_event():
    engine.engine.generate("2")
    await asyncio.sleep(0)
    await asyncio.sleep(0)
+    await asyncio.sleep(0)
    assert engine.engine.add_request_calls == 2
    assert engine.engine.step_calls >= 2
    await asyncio.sleep(0.001)
@@ -94,3 +110,35 @@ async def test_new_requests_event():
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None
+
+
+def test_asyncio_run():
+    """Drive two concurrent ``engine.generate`` streams under ``asyncio.run``.
+
+    Builds an ``AsyncLLMEngine`` for ``facebook/opt-125m``, gathers two
+    prompts concurrently and asserts that two results come back.
+    """
+    # NOTE(review): presumably waits until every visible CUDA device is below
+    # the 2 GiB threshold, giving up after 60 s -- confirm against the helper
+    # imported from `..utils`.
+    wait_for_gpu_memory_to_clear(
+        devices=list(range(torch.cuda.device_count())),
+        threshold_bytes=2 * 2**30,
+        timeout_s=60,
+    )
+
+    engine = AsyncLLMEngine.from_engine_args(
+        AsyncEngineArgs(model="facebook/opt-125m"))
+
+    async def run(prompt: str):
+        sampling_params = SamplingParams(
+            temperature=0,
+            max_tokens=32,
+        )
+
+        # The prompt string is reused as the request id.
+        async for output in engine.generate(prompt,
+                                            sampling_params,
+                                            request_id=prompt):
+            # Keep only the last item yielded by the stream.
+            final_output = output
+        return final_output
+
+    async def generate():
+        return await asyncio.gather(
+            run("test0"),
+            run("test1"),
+        )
+
+    results = asyncio.run(generate())
+    assert len(results) == 2
--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
 import os
 import pathlib
-from dataclasses import dataclass

 import pytest

+from vllm.entrypoints.chat_utils import load_chat_template
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.transformers_utils.tokenizer import get_tokenizer

 chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
@@ -50,24 +49,9 @@ TEST_MESSAGES = [
 ]


-@dataclass
-class MockTokenizer:
-    chat_template = None
-
-
-@dataclass
-class MockServingChat:
-    tokenizer: MockTokenizer
-
-
 def test_load_chat_template():
    # Testing chatml template
-    tokenizer = MockTokenizer()
-    mock_serving_chat = MockServingChat(tokenizer)
-    OpenAIServingChat._load_chat_template(mock_serving_chat,
-                                          chat_template=chatml_jinja_path)
-
-    template_content = tokenizer.chat_template
+    template_content = load_chat_template(chat_template=chatml_jinja_path)

    # Test assertions
    assert template_content is not None
@@ -79,24 +63,16 @@ def test_load_chat_template():
 def test_no_load_chat_template_filelike():
    # Testing chatml template
    template = "../../examples/does_not_exist"
-    tokenizer = MockTokenizer()
-
-    mock_serving_chat = MockServingChat(tokenizer)

    with pytest.raises(ValueError, match="looks like a file path"):
-        OpenAIServingChat._load_chat_template(mock_serving_chat,
-                                              chat_template=template)
+        load_chat_template(chat_template=template)


 def test_no_load_chat_template_literallike():
    # Testing chatml template
    template = "{{ messages }}"
-    tokenizer = MockTokenizer()

-    mock_serving_chat = MockServingChat(tokenizer)
-    OpenAIServingChat._load_chat_template(mock_serving_chat,
-                                          chat_template=template)
-    template_content = tokenizer.chat_template
+    template_content = load_chat_template(chat_template=template)

    assert template_content == template

@@ -108,9 +84,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
                        expected_output):
    # Initialize the tokenizer
    tokenizer = get_tokenizer(tokenizer_name=model)
-    mock_serving_chat = MockServingChat(tokenizer)
-    OpenAIServingChat._load_chat_template(mock_serving_chat,
-                                          chat_template=template)
+    template_content = load_chat_template(chat_template=template)

    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(
@@ -122,7 +96,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
    result = tokenizer.apply_chat_template(
        conversation=mock_request.messages,
        tokenize=False,
-        add_generation_prompt=mock_request.add_generation_prompt)
+        add_generation_prompt=mock_request.add_generation_prompt,
+        chat_template=mock_request.chat_template or template_content)

    # Test assertion
    assert result == expected_output, (

--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray

-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer

 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"


 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+def server():
+    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
@@ -29,7 +17,10 @@ def server(ray_ctx):
        "2048",
        "--enforce-eager",
        "--engine-use-ray"
-    ])
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server


 @pytest.fixture(scope="module")

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -8,12 +8,14 @@ import weakref
 import pytest

 from vllm import LLM
+from vllm.utils import is_hip
+
+from ..models.utils import check_outputs_equal

 MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
 ]
-VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"


 def test_vllm_gc_ed():
@@ -27,6 +29,7 @@ def test_vllm_gc_ed():


 @pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False, True])
@@ -35,13 +38,16 @@ def test_models(
    vllm_runner,
    example_prompts,
    model: str,
+    backend: str,
    dtype: str,
    max_tokens: int,
    enforce_eager: bool,
 ) -> None:
-    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
-    if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
-        pytest.skip("Skipping non-eager test for FlashInferBackend.")
+
+    if backend == "FLASHINFER" and is_hip():
+        pytest.skip("Flashinfer does not support ROCm/HIP.")
+
+    os.environ["VLLM_ATTENTION_BACKEND"] = backend

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@@ -52,10 +58,9 @@ def test_models(
                     gpu_memory_utilization=0.7) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -8,6 +8,8 @@ Run `pytest tests/models/test_chunked_prefill.py`.
 """
 import pytest

+from ..models.utils import check_outputs_equal
+
 MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
@@ -54,10 +56,9 @@ def test_models(
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
+import pytest
+
+from tests.quantization.utils import is_quant_method_supported
+
+from ..utils import compare_two_settings
+
+
+def test_cpu_offload():
+    """Compare Llama-2-7b-hf with no extra args against ``--cpu-offload-gb 4``.
+
+    NOTE(review): ``compare_two_settings`` is imported from ``..utils`` and is
+    not visible here; presumably it checks that both argument sets produce
+    matching results -- confirm.
+    """
+    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
+                         ["--cpu-offload-gb", "4"])
+
+
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="fp8 is not supported on this GPU type.")
+def test_cpu_offload_fp8():
+    """Compare fp8 runs with and without ``--cpu-offload-gb 2``.
+
+    Skipped when ``is_quant_method_supported("fp8")`` is false.
+    """
+    # Test quantization of an unquantized checkpoint
+    compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct",
+                         ["--quantization", "fp8"],
+                         ["--quantization", "fp8", "--cpu-offload-gb", "2"])
+    # Test loading a quantized checkpoint
+    compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
+                         ["--cpu-offload-gb", "2"])
+
+
+@pytest.mark.skipif(not is_quant_method_supported("awq"),
+                    reason="awq is not supported on this GPU type.")
+def test_cpu_offload_awq():
+    """Compare an AWQ checkpoint with and without ``--cpu-offload-gb 2``.
+
+    Skipped when ``is_quant_method_supported("awq")`` is false.
+    """
+    compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
+                         ["--cpu-offload-gb", "2"])
+
+
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_compressed_tensors():
+    """Compare three nm-testing checkpoints with and without CPU offload.
+
+    Each checkpoint is run with no extra args and with ``--cpu-offload-gb 1``.
+    Skipped when ``is_quant_method_supported("gptq_marlin")`` is false.
+    """
+    # Test wNa16
+    compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test w4a16_marlin24
+    compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+                         [], ["--cpu-offload-gb", "1"])
+    # Test w8a8
+    compare_two_settings(
+        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [],
+        ["--cpu-offload-gb", "1"])
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -12,6 +12,8 @@ from vllm import SamplingParams
 from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
                                 ENABLE_ARTIFICIAL_PREEMPT)

+from ..models.utils import check_outputs_equal
+
 MODELS = [
    "facebook/opt-125m",
 ]
@@ -54,8 +56,8 @@ def test_chunked_prefill_recompute(
            max_num_seqs=max_num_seqs,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@@ -90,18 +92,18 @@ def test_preemption(
            disable_log_stats=False,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
        total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )

-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
    assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
            "is not enough KV cache space." in caplog_vllm.text)
    # Ensure the count bucket of request-level histogram metrics matches
@@ -147,10 +149,10 @@ def test_swap(
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
                                                       beam_width, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
        total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)

    for i in range(len(example_prompts)):
        hf_output_ids, _ = hf_outputs[i]
@@ -215,8 +217,8 @@ def test_swap_infeasible(
            example_prompts,
            sampling_params=sampling_params,
        )
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)

    # Verify the request is ignored and not hang.
    assert req_outputs[0].outputs[0].finish_reason == "length"
@@ -254,8 +256,8 @@ def test_preemption_infeasible(
            sampling_params=sampling_params,
        )

-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)

    # Verify the request is ignored and not hang.
    for req_output in req_outputs:

--- a/tests/conftest.py
+++ b/tests/conftest.py
 import contextlib
 import gc
 import os
-from typing import Any, Dict, List, Optional, Tuple, TypeVar
+import sys
+from collections import UserList
+from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar, Union

 import pytest
 import torch
@@ -9,18 +11,19 @@ import torch.nn as nn
 import torch.nn.functional as F
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
-                          AutoProcessor, AutoTokenizer, BatchEncoding)
+                          AutoTokenizer, BatchEncoding, BatchFeature)

 from vllm import LLM, SamplingParams
-from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
+from vllm.assets.image import ImageAsset
+from vllm.config import TokenizerPoolConfig
+from vllm.connections import global_http_connection
 from vllm.distributed import (destroy_distributed_environment,
                              destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalData
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SampleLogprobs
-from vllm.utils import cuda_device_count_stateless, is_cpu
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
+                        is_cpu)

 logger = init_logger(__name__)

@@ -28,22 +31,6 @@ _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

-# Multi modal related
-# You can use `.buildkite/download-images.sh` to download the assets
-PIXEL_VALUES_FILES = [
-    os.path.join(_TEST_DIR, "images", filename) for filename in
-    ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
-]
-IMAGE_FEATURES_FILES = [
-    os.path.join(_TEST_DIR, "images", filename) for filename in
-    ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
-]
-IMAGE_FILES = [
-    os.path.join(_TEST_DIR, "images", filename)
-    for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
-]
-assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES)
-

 def _read_prompts(filename: str) -> List[str]:
    with open(filename, "r") as f:
@@ -51,6 +38,50 @@ def _read_prompts(filename: str) -> List[str]:
        return prompts


+class _ImageAssetPrompts(TypedDict):
+    # One prompt string per test image, keyed by the image asset's name.
+    stop_sign: str
+    cherry_blossom: str
+
+
+# Base class for `_ImageAssets`: a `UserList` parametrized with `ImageAsset`
+# on Python 3.9+, and a plain `UserList` on older interpreters.
+if sys.version_info < (3, 9):
+    # UserList cannot be subscripted
+    class _ImageAssetsBase(UserList):
+        pass
+else:
+
+    class _ImageAssetsBase(UserList[ImageAsset]):
+        pass
+
+
+class _ImageAssets(_ImageAssetsBase):
+    """List of the test image assets: ``stop_sign``, then ``cherry_blossom``."""
+
+    def __init__(self) -> None:
+        # The order here fixes the order returned by `prompts()` below.
+        super().__init__([
+            ImageAsset("stop_sign"),
+            ImageAsset("cherry_blossom"),
+        ])
+
+    def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
+        """
+        Convenience method to define the prompt for each test image.
+
+        The order of the returned prompts matches the order of the
+        assets when iterating through this object.
+        """
+        return [prompts["stop_sign"], prompts["cherry_blossom"]]
+
+
+IMAGE_ASSETS = _ImageAssets()
+"""Singleton instance of :class:`_ImageAssets`."""
+
+
+@pytest.fixture(autouse=True)
+def init_test_http_connection():
+    """Autouse fixture that turns off ``global_http_connection.reuse_client``."""
+    # pytest_asyncio may use a different event loop per test
+    # so we need to make sure the async client is created anew
+    global_http_connection.reuse_client = False
+
+
 def cleanup():
    destroy_model_parallel()
    destroy_distributed_environment()
@@ -81,31 +112,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
        cleanup()


-@pytest.fixture(scope="session")
-def hf_images() -> List[Image.Image]:
-    return [Image.open(filename) for filename in IMAGE_FILES]
-
-
-@pytest.fixture()
-def vllm_images(request) -> List[MultiModalData]:
-    vision_language_config = request.getfixturevalue("model_and_config")[1]
-    if vision_language_config.image_input_type == (
-            VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
-        return [
-            ImageFeatureData(torch.load(filename))
-            for filename in IMAGE_FEATURES_FILES
-        ]
-    else:
-        return [
-            ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES
-        ]
-
-
-@pytest.fixture()
-def vllm_image_tensors(request) -> List[torch.Tensor]:
-    return [torch.load(filename) for filename in PIXEL_VALUES_FILES]
-
-
 @pytest.fixture
 def example_prompts() -> List[str]:
    prompts = []
@@ -122,13 +128,12 @@ def example_long_prompts() -> List[str]:
    return prompts


-_STR_DTYPE_TO_TORCH_DTYPE = {
-    "half": torch.half,
-    "bfloat16": torch.bfloat16,
-    "float": torch.float,
-}
+@pytest.fixture(scope="session")
+def image_assets() -> _ImageAssets:
+    return IMAGE_ASSETS
+

-_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding)
+_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)


 class HfRunner:
@@ -144,11 +149,11 @@ class HfRunner:
        model_name: str,
        dtype: str = "half",
        *,
+        model_kwargs: Optional[Dict[str, Any]] = None,
        is_embedding_model: bool = False,
        is_vision_model: bool = False,
    ) -> None:
-        assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
-        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

        self.model_name = model_name

@@ -166,11 +171,13 @@ class HfRunner:
            else:
                auto_cls = AutoModelForCausalLM

+            model_kwargs = model_kwargs if model_kwargs is not None else {}
            self.model = self.wrap_device(
                auto_cls.from_pretrained(
                    model_name,
                    torch_dtype=torch_dtype,
                    trust_remote_code=True,
+                    **model_kwargs,
                ))

        self.tokenizer = AutoTokenizer.from_pretrained(
@@ -180,6 +187,9 @@ class HfRunner:
        )

        try:
+            # Keep this import local: importing AutoProcessor at the top
+            # level of this module would call torch.cuda.device_count()
+            from transformers import AutoProcessor  # noqa: F401
            self.processor = AutoProcessor.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
@@ -195,7 +205,7 @@ class HfRunner:
        self,
        prompts: List[str],
        images: Optional[List[Image.Image]] = None,
-        **kwargs,
+        **kwargs: Any,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        if images:
            assert len(prompts) == len(images)
@@ -230,11 +240,13 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
-                                images=images)
+                                images=images,
+                                **kwargs)

        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]
@@ -264,19 +276,30 @@ class HfRunner:
        self,
        prompts: List[str],
        max_tokens: int,
+        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
    ) -> List[List[torch.Tensor]]:
-        all_logprobs = []
-        for prompt in prompts:
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+        all_logprobs: List[List[torch.Tensor]] = []
+        for i, prompt in enumerate(prompts):
+            processor_kwargs: Dict[str, Any] = {
+                "text": prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+
+            inputs = self.processor(**processor_kwargs)
+
            output = self.model.generate(
-                self.wrap_device(input_ids),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
+                **kwargs,
            )
-            seq_logprobs = []
+            seq_logprobs: List[torch.Tensor] = []
            for hidden_states in output.hidden_states:
                last_hidden_states = hidden_states[-1][0]
                logits = torch.matmul(
@@ -296,20 +319,31 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
+        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
    ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
        all_logprobs: List[List[Dict[int, float]]] = []
        all_output_ids: List[List[int]] = []
        all_output_strs: List[str] = []

-        for prompt in prompts:
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+        for i, prompt in enumerate(prompts):
+            processor_kwargs: Dict[str, Any] = {
+                "text": prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+
+            inputs = self.processor(**processor_kwargs)
+
            output = self.model.generate(
-                self.wrap_device(input_ids),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
+                **kwargs,
            )

            seq_logprobs: List[torch.Tensor] = []
@@ -342,7 +376,7 @@ class HfRunner:

            all_logprobs.append(seq_logprobs_lst)
            seq_ids = output.sequences[0]
-            output_len = seq_ids.shape[0] - input_ids.shape[1]
+            output_len = len(seq_logprobs_lst)
            output_ids = seq_ids[-output_len:]
            all_output_ids.append(output_ids.tolist())
            all_output_strs.append(self.tokenizer.decode(output_ids))
@@ -362,7 +396,7 @@ class HfRunner:
        cleanup()


-@pytest.fixture
+@pytest.fixture(scope="session")
 def hf_runner():
    return HfRunner

@@ -382,6 +416,7 @@ class VllmRunner:
        block_size: int = 16,
        enable_chunked_prefill: bool = False,
        swap_space: int = 4,
+        enforce_eager: bool = False,
        **kwargs,
    ) -> None:
        self.model = LLM(
@@ -390,6 +425,7 @@ class VllmRunner:
            trust_remote_code=True,
            dtype=dtype,
            swap_space=swap_space,
+            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
@@ -402,7 +438,7 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List[Image.Image]] = None,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        if images is not None:
            assert len(prompts) == len(images)
@@ -410,7 +446,7 @@ class VllmRunner:
        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
        if images is not None:
            for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = image
+                inputs[i]["multi_modal_data"] = {"image": image}

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)
@@ -423,7 +459,7 @@ class VllmRunner:
            req_sample_output_strs: List[str] = []
            for sample in req_output.outputs:
                output_str = sample.text
-                output_ids = sample.token_ids
+                output_ids = list(sample.token_ids)
                req_sample_output_ids.append(prompt_ids + output_ids)
                req_sample_output_strs.append(prompt_str + output_str)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
@@ -433,10 +469,19 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
+        images: Optional[List[Image.Image]] = None,
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        assert sampling_params.logprobs is not None

-        req_outputs = self.model.generate(prompts,
+        if images is not None:
+            assert len(prompts) == len(images)
+
+        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
+        if images is not None:
+            for i, image in enumerate(images):
+                inputs[i]["multi_modal_data"] = {"image": image}
+
+        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)
        outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
        for req_output in req_outputs:
@@ -451,7 +496,7 @@ class VllmRunner:
        self,
        prompts: List[str],
        max_tokens: int,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List[Image.Image]] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params, images=images)
@@ -463,11 +508,17 @@ class VllmRunner:
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
+        images: Optional[Union[List[Image.Image],
+                               List[List[Image.Image]]]] = None,
+        stop_token_ids: Optional[List[int]] = None,
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        greedy_logprobs_params = SamplingParams(temperature=0.0,
                                                max_tokens=max_tokens,
-                                                logprobs=num_logprobs)
-        outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)
+                                                logprobs=num_logprobs,
+                                                stop_token_ids=stop_token_ids)
+        outputs = self.generate_w_logprobs(prompts,
+                                           greedy_logprobs_params,
+                                           images=images)

        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]
@@ -513,6 +564,10 @@ def get_tokenizer_pool_config(tokenizer_group_type):
        return TokenizerPoolConfig(pool_size=1,
                                   pool_type="ray",
                                   extra_config={})
+    if isinstance(tokenizer_group_type, type):
+        return TokenizerPoolConfig(pool_size=1,
+                                   pool_type=tokenizer_group_type,
+                                   extra_config={})
    raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")