Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1

af7f4372 · zhuwenwen · 5e19cdef · 09c77926 · af7f4372 · af7f4372
Commit af7f4372 authored Sep 03, 2024 by zhuwenwen
20 changed files
--- a/pyproject.toml
+++ b/pyproject.toml
 [build-system]
 # Should be mirrored in requirements-build.txt
 requires = [
-    "cmake>=3.21",
+    "cmake>=3.26",
    "ninja",
    "packaging",
    "setuptools >= 49.4.0",
    "torch == 2.4.0",
    "wheel",
+    "jinja2",
 ]
 build-backend = "setuptools.build_meta"
@@ -56,6 +57,7 @@ files = [
    "vllm/*.py",
    "vllm/adapter_commons",
    "vllm/assets",
+    "vllm/entrypoints",
    "vllm/inputs",
    "vllm/logging",
    "vllm/multimodal",

--- a/requirements-build.txt
+++ b/requirements-build.txt
 # Should be mirrored in pyproject.toml
-cmake>=3.21
+cmake>=3.26
 ninja
 packaging
 setuptools>=49.4.0
 torch==2.4.0
 wheel
+jinja2
--- a/requirements-common.txt
+++ b/requirements-common.txt
-cmake >= 3.21
-ninja  # For faster builds.
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0.0
@@ -8,17 +6,23 @@ tqdm
 py-cpuinfo
 transformers >= 4.43.2  # Required for Chameleon and Llama 3.1 hotfix.
 tokenizers >= 0.19.1  # Required for Llama 3.
+protobuf # Required by LlamaTokenizer.
 fastapi
 aiohttp
-openai
+openai >= 1.0 # Ensure modern openai package (ensure types module present)
 uvicorn[standard]
-pydantic >= 2.0  # Required for OpenAI server.
+pydantic >= 2.8  # Required for OpenAI server.
 pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.10.3
+lm-format-enforcer == 0.10.6
 outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
-typing_extensions
+typing_extensions >= 4.10
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
 pyzmq
+msgspec
+librosa # Required for audio processing
+soundfile # Required for audio processing
+gguf == 0.9.1
+importlib_metadata
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -7,5 +7,5 @@ nvidia-ml-py # for pynvml package
 torch == 2.4.0
 # These must be updated alongside torch
 torchvision == 0.19   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.27.post2  # Requires PyTorch 2.4.0
+xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
-vllm-flash-attn == 2.6.1  # Requires PyTorch 2.4.0
+vllm-flash-attn == 2.6.1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@@ -8,7 +8,7 @@ isort==5.13.2
 clang-format==18.1.5
 # type checking
-mypy==1.9.0
+mypy==1.11.1
 types-PyYAML
 types-requests
 types-setuptools
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
 # Common dependencies
-# -r requirements-common.txt
+-r requirements-common.txt
-# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
-cmake >= 3.21
-ninja  # For faster builds.
-psutil
-sentencepiece  # Required for LLaMA tokenizer.
-numpy < 2.0.0
-requests
-tqdm
-py-cpuinfo
-transformers < 4.43
-tokenizers >= 0.19.1  # Required for Llama 3.
-fastapi
-aiohttp
-openai
-uvicorn[standard]
-pydantic >= 2.0  # Required for OpenAI server.
-pillow  # Required for image processing
-prometheus_client >= 0.18.0
-prometheus-fastapi-instrumentator >= 7.0.0
-tiktoken >= 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.10.3
-outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
-typing_extensions
-filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
-pyzmq
 # OpenVINO dependencies
 torch >= 2.1.2
-openvino ~= 2024.3.0.dev
+openvino ~= 2024.3.0
-openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
+optimum-intel[openvino] >= 1.18.2
-optimum-intel[openvino] >= 1.18.1
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -11,7 +11,7 @@ pytest-shard
 # testing utils
 awscli
-einops # required for MPT
+einops # required for MPT and qwen-vl
 httpx
 peft
 requests
@@ -19,9 +19,15 @@ ray
 sentence-transformers # required for embedding
 compressed-tensors==0.4.0 # required for compressed-tensors
 timm # required for internvl test
+transformers_stream_generator # required for qwen-vl test
+matplotlib # required for qwen-vl test
+# TODO: Add this after fully implementing llava(mantis)
+# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
 # Benchmarking
 aiohttp
 # quantization
 bitsandbytes==0.42.0
\ No newline at end of file
+buildkite-test-collector==0.1.8
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -68,9 +68,12 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
-# vLLM only supports Linux platform
+if not sys.platform.startswith("linux"):
-assert sys.platform.startswith(
+    logger.warning(
-    "linux"), "vLLM only supports Linux platform (including WSL)."
+        "vLLM only supports Linux platform (including WSL). "
+        "Building on %s, "
+        "so vLLM may not be able to run correctly", sys.platform)
+    VLLM_TARGET_DEVICE = "empty"
 MAIN_CUDA_VERSION = "12.1"
@@ -188,6 +191,10 @@ class cmake_build_ext(build_ext):
        # match.
        cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
+        # Pass the python path to cmake so it can reuse the build dependencies
+        # on subsequent calls to python.
+        cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
        #
        # Setup parallelism and build tool
        #
@@ -238,6 +245,10 @@ class cmake_build_ext(build_ext):
        subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
+def _no_device() -> bool:
+    return VLLM_TARGET_DEVICE == "empty"
 def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
@@ -279,7 +290,7 @@ def _build_custom_ops() -> bool:
 def _build_core_ext() -> bool:
-    return not _is_neuron() and not _is_tpu()
+    return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu())
 def get_hipcc_rocm_version():
@@ -398,13 +409,13 @@ try:
    import vllm.commit_id
    __commit__ = vllm.commit_id.__commit__
 except Exception as e:
-    warnings.warn(f"Failed to read commit hash:\\n + str(e)",
+    warnings.warn(f"Failed to read commit hash:\n{e}",
                  RuntimeWarning,
                  stacklevel=2)
    __commit__ = "COMMIT_HASH_PLACEHOLDER"
-__version__ = "0.5.4"
+__version__ = "0.5.5"
-__dcu_version__ = f'0.5.4+{version}' 
+__dcu_version__ = f'0.5.5+{version}' 
 """
@@ -424,7 +435,9 @@ def get_version():
 def get_vllm_version() -> str:
    # version = find_version(get_path("vllm", "version.py"))
-    if _is_cuda():
+    if _no_device():
+        version += "+empty"
+    elif _is_cuda():
        cuda_version = str(get_nvcc_cuda_version())
        if cuda_version != MAIN_CUDA_VERSION:
            cuda_version_str = cuda_version.replace(".", "")[:3]
@@ -479,7 +492,9 @@ def get_requirements() -> List[str]:
                resolved_requirements.append(line)
        return resolved_requirements
-    if _is_cuda():
+    if _no_device():
+        requirements = _read_requirements("requirements-cuda.txt")
+    elif _is_cuda():
        requirements = _read_requirements("requirements-cuda.txt")
        cuda_major, cuda_minor = torch.version.cuda.split(".")
        modified_requirements = []
@@ -528,6 +543,9 @@ if envs.VLLM_USE_PRECOMPILED:
    ext_modules = []
    package_data["vllm"].append("*.so")
+if _no_device():
+    ext_modules = []
 setup(
    name="vllm",
    version=get_vllm_version(),

--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
 """vllm.entrypoints.api_server with some extra logging for testing."""
-from typing import Any, Dict
+from typing import Any, Dict, Iterable
 import uvicorn
 from fastapi.responses import JSONResponse, Response
@@ -18,9 +18,10 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
        super().__init__(*args, **kwargs)
        self._num_aborts = 0
-    async def abort(self, request_id: str) -> None:
+    async def _engine_abort(self, request_ids: Iterable[str]):
-        await super().abort(request_id)
+        ids = list(request_ids)
-        self._num_aborts += 1
+        self._num_aborts += len(ids)
+        await super()._engine_abort(ids)
    def testing_stats(self) -> Dict[str, Any]:
        return {"num_aborted_requests": self._num_aborts}

--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
+import os
 import subprocess
 import sys
 import time
@@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
        "127.0.0.1", "--tokenizer-pool-size",
        str(tokenizer_pool_size)
    ]
+    # Copy the environment variables and add `VLLM_ALLOW_ENGINE_USE_RAY=1`
+    # to keep the deprecated `--engine-use-ray` flag from raising an exception
+    env_vars = os.environ.copy()
+    env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
    if engine_use_ray:
        commands.append("--engine-use-ray")
    if worker_use_ray:
        commands.append("--worker-use-ray")
-    uvicorn_process = subprocess.Popen(commands)
+    uvicorn_process = subprocess.Popen(commands, env=env_vars)
    yield
    uvicorn_process.terminate()

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
 import asyncio
+import os
+from asyncio import CancelledError
 from dataclasses import dataclass
+from typing import Optional
 import pytest
+import pytest_asyncio
 import torch
 from vllm import SamplingParams
 from vllm.config import ParallelConfig
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
+from vllm.outputs import RequestOutput as RealRequestOutput
+from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
@@ -106,21 +112,49 @@ async def test_new_requests_event():
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1
+    # Allow deprecated engine_use_ray to not raise exception
+    os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
    engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None
+    os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
-def test_asyncio_run():
+def start_engine():
    wait_for_gpu_memory_to_clear(
        devices=list(range(torch.cuda.device_count())),
        threshold_bytes=2 * 2**30,
        timeout_s=60,
    )
-    engine = AsyncLLMEngine.from_engine_args(
+    return AsyncLLMEngine.from_engine_args(
-        AsyncEngineArgs(model="facebook/opt-125m"))
+        AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
+@pytest_asyncio.fixture(scope="module")
+async def async_engine():
+    engine = await asyncio.get_event_loop().run_in_executor(executor=None,
+                                                            func=start_engine)
+    try:
+        yield engine
+    finally:
+        engine.shutdown_background_loop()
+        del engine
+        await asyncio.sleep(0.1)
+        cleanup()
+@pytest.fixture()
+def should_do_global_cleanup_after_test(request) -> bool:
+    # So we can share the async engine fixture between these tests
+    return False
+@pytest.mark.asyncio(scope="module")
+async def test_asyncio_run(async_engine):
    async def run(prompt: str):
        sampling_params = SamplingParams(
@@ -128,17 +162,64 @@ def test_asyncio_run():
            max_tokens=32,
        )
-        async for output in engine.generate(prompt,
+        async for output in async_engine.generate(prompt,
-                                            sampling_params,
+                                                  sampling_params,
-                                            request_id=prompt):
+                                                  request_id=prompt):
            final_output = output
        return final_output
-    async def generate():
+    results = await asyncio.gather(
-        return await asyncio.gather(
+        run("test0"),
-            run("test0"),
+        run("test1"),
-            run("test1"),
+    )
-        )
-    results = asyncio.run(generate())
    assert len(results) == 2
+@pytest.mark.asyncio(scope="module")
+async def test_cancellation(async_engine):
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=10,
+        max_tokens=10,
+    )
+    i = 0
+    with pytest.raises(CancelledError):
+        async for output in async_engine.generate("test2",
+                                                  sampling_params,
+                                                  request_id="test2"):
+            assert not output.finished
+            i += 1
+            if i == 5:
+                await async_engine.abort("test2")
+    assert i == 5
+@pytest.mark.asyncio(scope="module")
+async def test_delayed_generator(async_engine):
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=10,
+        max_tokens=10,
+    )
+    stream = async_engine.generate("test3",
+                                   sampling_params,
+                                   request_id="test3")
+    i = 0
+    final_output: Optional[RealRequestOutput] = None
+    async for output in stream:
+        final_output = output
+        if i == 0:
+            # wait for generation to complete before consuming
+            # the remaining messages
+            await asyncio.sleep(1)
+        if i < 9:
+            assert not output.finished
+        i += 1
+    assert i == 10
+    assert final_output is not None
+    assert len(final_output.outputs[0].token_ids) == 10
+    assert final_output.finished
--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
-import os
-import pathlib
 import pytest
-from vllm.entrypoints.chat_utils import load_chat_template
+from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer
-chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
+from ..utils import VLLM_PATH
-    __file__))).parent.parent / "examples/template_chatml.jinja"
+chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
 # Define models, templates, and their corresponding expected outputs
 MODEL_TEMPLATE_GENERATON_OUTPUT = [
-    ("facebook/opt-125m", None, True,
-     "Hello</s>Hi there!</s>What is the capital of</s>"),
-    ("facebook/opt-125m", None, False,
-     "Hello</s>Hi there!</s>What is the capital of</s>"),
    ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
@@ -93,11 +87,12 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
        add_generation_prompt=add_generation_prompt)
    # Call the function and get the result
-    result = tokenizer.apply_chat_template(
+    result = apply_chat_template(
+        tokenizer,
        conversation=mock_request.messages,
-        tokenize=False,
+        chat_template=mock_request.chat_template or template_content,
        add_generation_prompt=mock_request.add_generation_prompt,
-        chat_template=mock_request.chat_template or template_content)
+    )
    # Test assertion
    assert result == expected_output, (

--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
 import openai  # use the official client for correctness check
 import pytest
-from ..utils import RemoteOpenAIServer
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
+chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
+assert chatml_jinja_path.exists()
 @pytest.fixture(scope="module")
@@ -16,10 +18,16 @@ def server():
        "--max-model-len",
        "2048",
        "--enforce-eager",
-        "--engine-use-ray"
+        "--engine-use-ray",
+        "--chat-template",
+        str(chatml_jinja_path),
    ]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # Allow `--engine-use-ray`; otherwise launching the server raises an
+    # error because it tries to use a deprecated feature
+    env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
+    with RemoteOpenAIServer(MODEL_NAME, args,
+                            env_dict=env_dict) as remote_server:
        yield remote_server
@@ -83,7 +91,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI):
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=13, total_tokens=23)
+        completion_tokens=10, prompt_tokens=55, total_tokens=65)
    message = choice.message
    assert message.content is not None and len(message.content) >= 10

--- a/tests/async_engine/test_request_tracker.py
+++ b/tests/async_engine/test_request_tracker.py
@@ -10,23 +10,23 @@ async def test_request_tracker():
    stream_1 = tracker.add_request("1")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
-    new, finished = tracker.get_new_and_finished_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert len(new) == 1
    assert new[0]["request_id"] == "1"
-    assert not finished
+    assert not aborted
    assert not stream_1.finished
    stream_2 = tracker.add_request("2")
    stream_3 = tracker.add_request("3")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
-    new, finished = tracker.get_new_and_finished_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert len(new) == 2
    assert new[0]["request_id"] == "2"
    assert new[1]["request_id"] == "3"
-    assert not finished
+    assert not aborted
    assert not stream_2.finished
    assert not stream_3.finished
@@ -36,9 +36,9 @@ async def test_request_tracker():
    assert not tracker.new_requests_event.is_set()
    tracker.abort_request("1")
-    new, finished = tracker.get_new_and_finished_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
-    assert len(finished) == 1
+    assert len(aborted) == 1
-    assert "1" in finished
+    assert "1" in aborted
    assert not new
    assert stream_1.finished
@@ -46,9 +46,11 @@ async def test_request_tracker():
    tracker.abort_request("4")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
-    new, finished = tracker.get_new_and_finished_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
-    assert len(finished) == 1
+    # aborted new requests will cancel each other out -
-    assert "4" in finished
+    # there's no need for them to propagate into the
+    # engine
+    assert not aborted
    assert not new
    assert stream_4.finished
@@ -57,10 +59,9 @@ async def test_request_tracker():
    tracker.process_request_output(
        RequestOutput("2", "output", [], [], [], finished=True))
    await tracker.wait_for_new_requests()
-    new, finished = tracker.get_new_and_finished_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
-    assert len(finished) == 1
+    assert not aborted
-    assert "2" in finished
    assert len(new) == 1
    assert new[0]["request_id"] == "5"
    assert stream_2.finished

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -6,14 +6,27 @@ prefill requests are chunked.
 Run `pytest tests/basic_correctness/test_chunked_prefill.py`.
 """
 import pytest
-from ..models.utils import check_outputs_equal
+from ..models.utils import check_logprobs_close, check_outputs_equal
 MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
 ]
+E5M2_KV_MODELS = [
+    "facebook/opt-125m",
+    "meta-llama/Llama-2-7b-chat-hf",
+]
+E4M3_KV_MODELS = [
+    "meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
+]
+KV_CACHE_QUANTIZATION_PATHS = {
+    "meta-llama/Llama-2-7b-chat-hf":
+    "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
+}
 @pytest.mark.parametrize("model", MODELS)
@@ -35,12 +48,12 @@ def test_models(
    enforce_eager: bool,
    tensor_parallel_size: int,
 ) -> None:
-    max_num_seqs = min(chunked_prefill_token_size, 256)
+    """
-    enable_chunked_prefill = False
+    Checks exact match decode between huggingface model and vllm runner with
-    max_num_batched_tokens = None
+    chunked prefill.
-    if chunked_prefill_token_size != -1:
+    """
-        enable_chunked_prefill = True
+    max_num_seqs = chunked_prefill_token_size
-        max_num_batched_tokens = chunked_prefill_token_size
+    max_num_batched_tokens = chunked_prefill_token_size
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@@ -49,7 +62,7 @@ def test_models(
            model,
            dtype=dtype,
            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=enable_chunked_prefill,
+            enable_chunked_prefill=True,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
@@ -62,3 +75,78 @@ def test_models(
        name_0="hf",
        name_1="vllm",
    )
+@pytest.mark.parametrize("kv_cache_dtype,model",
+                         [("fp8_e5m2", m)
+                          for m in E5M2_KV_MODELS] + [("fp8_e4m3", m)
+                                                      for m in E4M3_KV_MODELS])
+# Due to low-precision numerical divergence, we only test logprob of 4 tokens
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
+@pytest.mark.parametrize("enforce_eager", [False, True])
+# NOTE: Increasing this in this suite will fail CI because we currently cannot
+# reset distributed env properly. Use a value > 1 just when you test.
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_models_with_fp8_kv_cache(
+    vllm_runner,
+    example_prompts,
+    kv_cache_dtype: str,
+    model: str,
+    max_tokens: int,
+    chunked_prefill_token_size: int,
+    enforce_eager: bool,
+    tensor_parallel_size: int,
+) -> None:
+    """
+    Only checks log probs match between chunked-prefill and
+    non-chunked-prefill version of vLLM model runner.
+    This test is used when there is discrepancy in kernels
+    / numerics (e.g. when using lower-precision types like FP8).
+    """
+    NUM_LOG_PROBS = 8
+    if model == "facebook/opt-125m":
+        pytest.skip(
+            "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
+        )
+    max_num_seqs = chunked_prefill_token_size
+    max_num_batched_tokens = chunked_prefill_token_size
+    extra_kwargs = {}
+    if model in KV_CACHE_QUANTIZATION_PATHS:
+        extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[
+            model]
+    with vllm_runner(
+            model,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            max_num_seqs=max_num_seqs,
+            kv_cache_dtype=kv_cache_dtype,
+            **extra_kwargs,
+    ) as vllm_model:
+        no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)
+    with vllm_runner(
+            model,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=True,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            max_num_seqs=max_num_seqs,
+            kv_cache_dtype=kv_cache_dtype,
+            **extra_kwargs,
+    ) as vllm_model:
+        chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)
+    check_logprobs_close(
+        outputs_0_lst=no_chunked_prefill_outputs,
+        outputs_1_lst=chunked_prefill_outputs,
+        name_0="no_chunked_prefill",
+        name_1="chunked_prefill",
+    )
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
-import pytest
-from tests.quantization.utils import is_quant_method_supported
 from ..utils import compare_two_settings
 def test_cpu_offload():
    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
                         ["--cpu-offload-gb", "4"])
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="fp8 is not supported on this GPU type.")
-def test_cpu_offload_fp8():
-    # Test quantization of an unquantized checkpoint
-    compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct",
-                         ["--quantization", "fp8"],
-                         ["--quantization", "fp8", "--cpu-offload-gb", "2"])
-    # Test loading a quantized checkpoint
-    compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
-                         ["--cpu-offload-gb", "2"])
-@pytest.mark.skipif(not is_quant_method_supported("awq"),
-                    reason="awq is not supported on this GPU type.")
-def test_cpu_offload_awq():
-    compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
-                         ["--cpu-offload-gb", "2"])
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
-                    reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_compressed_tensors():
-    # Test wNa16
-    compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
-                         ["--cpu-offload-gb", "1"])
-    # Test w4a16_marlin24
-    compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-                         [], ["--cpu-offload-gb", "1"])
-    # Test w8a8
-    compare_two_settings(
-        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [],
-        ["--cpu-offload-gb", "1"])
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -8,6 +8,7 @@ pytest tests/basic_correctness/test_preemption.py`.
 import pytest
 from prometheus_client import REGISTRY
+import vllm.envs as envs
 from vllm import SamplingParams
 from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
                                 ENABLE_ARTIFICIAL_PREEMPT)
@@ -24,6 +25,13 @@ assert ENABLE_ARTIFICIAL_PREEMPT is True, (
    "tests/basic_correctness/test_preemption.py`")
+@pytest.fixture
+def worker_use_ray() -> bool:
+    # When SPMD worker is used, use worker_use_ray=True
+    # to test that delta input optimization works with preemption.
+    return envs.VLLM_USE_RAY_SPMD_WORKER
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [96])
@@ -36,6 +44,7 @@ def test_chunked_prefill_recompute(
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
+    worker_use_ray: bool,
 ) -> None:
    """Ensure that chunked prefill works with preemption."""
    max_num_seqs = min(chunked_prefill_token_size, 256)
@@ -54,6 +63,7 @@ def test_chunked_prefill_recompute(
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_seqs=max_num_seqs,
+            worker_use_ray=worker_use_ray,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
@@ -80,6 +90,7 @@ def test_preemption(
    model: str,
    dtype: str,
    max_tokens: int,
+    worker_use_ray: bool,
 ) -> None:
    """By default, recompute preemption is enabled"""
@@ -90,6 +101,7 @@ def test_preemption(
            model,
            dtype=dtype,
            disable_log_stats=False,
+            worker_use_ray=worker_use_ray,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
@@ -134,6 +146,7 @@ def test_swap(
    dtype: str,
    max_tokens: int,
    beam_width: int,
+    worker_use_ray: bool,
 ) -> None:
    """Use beam search enables swapping."""
    example_prompts = example_prompts[:1]
@@ -146,6 +159,7 @@ def test_swap(
            dtype=dtype,
            swap_space=10,
            disable_log_stats=False,
+            worker_use_ray=worker_use_ray,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
                                                       beam_width, max_tokens)
@@ -191,6 +205,7 @@ def test_swap_infeasible(
    dtype: str,
    max_tokens: int,
    beam_width: int,
+    worker_use_ray: bool,
 ) -> None:
    """Verify infeasible swap request will be ignored."""
    BLOCK_SIZE = 16
@@ -207,6 +222,7 @@ def test_swap_infeasible(
            # decode blocks are not enough to finish.
            num_gpu_blocks_override=prefill_blocks + decode_blocks,
            max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
+            worker_use_ray=worker_use_ray,
    ) as vllm_model:
        sampling_params = SamplingParams(n=beam_width,
                                         use_beam_search=True,
@@ -234,6 +250,7 @@ def test_preemption_infeasible(
    model: str,
    dtype: str,
    max_tokens: int,
+    worker_use_ray: bool,
 ) -> None:
    """Verify infeasible preemption request will be ignored."""
    BLOCK_SIZE = 16
@@ -248,6 +265,7 @@ def test_preemption_infeasible(
            # ignored instead of hanging forever.
            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
+            worker_use_ray=worker_use_ray,
    ) as vllm_model:
        sampling_params = SamplingParams(max_tokens=max_tokens,
                                         ignore_eos=True)

--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
+import os
+import pytest
+@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
+def test_full_graph(model):
+    """Check that generation with ``model`` works in full graph mode."""
+    # make sure these models can be captured in full graph mode
+    # NOTE(review): the variable is set before vllm is imported below,
+    # presumably because vllm reads it at import time -- confirm.
+    os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
+    from vllm import LLM, SamplingParams
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0)
+    # Use the parametrized model rather than a hard-coded name so that every
+    # entry added to the parametrize list is actually exercised.
+    llm = LLM(model=model)
+    llm.generate(prompts, sampling_params)
--- a/tests/conftest.py
+++ b/tests/conftest.py
 import contextlib
 import gc
+import json
 import os
 import sys
+import tempfile
 from collections import UserList
-from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar, Union
+from enum import Enum
+from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
+                    TypeVar, Union)
+import numpy as np
 import pytest
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
+from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
-                          AutoTokenizer, BatchEncoding, BatchFeature)
+                          BatchFeature)
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.config import TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (destroy_distributed_environment,
-                              destroy_model_parallel)
+                              destroy_model_parallel,
-from vllm.inputs import TextPrompt
+                              init_distributed_environment,
+                              initialize_model_parallel)
+from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
+                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
 from vllm.sequence import SampleLogprobs
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        is_cpu)
+                        identity, is_cpu)
 logger = init_logger(__name__)
@@ -82,6 +92,21 @@ def init_test_http_connection():
    global_http_connection.reuse_client = False
+@pytest.fixture
+def dist_init():
+    """Set up a one-process distributed environment for a single test.
+    Initializes the distributed environment with world size 1 / rank 0 on the
+    NCCL backend using a file-based rendezvous, then the model-parallel
+    groups with sizes (1, 1) (presumably tensor- and pipeline-parallel
+    sizes; confirm against initialize_model_parallel). After the test
+    finishes, cleanup() is called.
+    """
+    # mkstemp() returns an *open* OS-level descriptor together with the path.
+    # Only the path is needed for the rendezvous, so close the descriptor
+    # right away instead of leaking one per fixture use.
+    fd, temp_file = tempfile.mkstemp()
+    os.close(fd)
+    init_distributed_environment(
+        world_size=1,
+        rank=0,
+        distributed_init_method=f"file://{temp_file}",
+        local_rank=0,
+        backend="nccl",
+    )
+    initialize_model_parallel(1, 1)
+    yield
+    cleanup()
 def cleanup():
    destroy_model_parallel()
    destroy_distributed_environment()
@@ -120,6 +145,46 @@ def example_prompts() -> List[str]:
    return prompts
+class DecoderPromptType(Enum):
+    """For encoder/decoder models only."""
+    # Used by the example_encoder_decoder_prompts fixture as the key for
+    # prompts paired with an explicit decoder prompt (there: the encoder
+    # prompt list reversed).
+    CUSTOM = 1
+    # Key for prompts whose decoder prompt is None.
+    NONE = 2
+    # Key for prompts whose decoder prompt is the empty string.
+    EMPTY_STR = 3
+@pytest.fixture
+def example_encoder_decoder_prompts(
+) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]:
+    '''
+    Build encoder/decoder prompt pairs for every DecoderPromptType.
+    The encoder prompts are read from all files listed in _TEST_PROMPTS.
+    Each is paired, by index, with a decoder prompt that depends on the
+    prompt type:
+    * NONE: decoder prompt is None
+    * EMPTY_STR: decoder prompt is the empty string
+    * CUSTOM: decoder prompt is taken from the reversed encoder prompt list
+    Returns:
+    * Dict mapping each DecoderPromptType to the result of
+      zip_enc_dec_prompts() for that pairing
+    '''
+    encoder_prompts = [
+        prompt for filename in _TEST_PROMPTS
+        for prompt in _read_prompts(filename)
+    ]
+    num_prompts = len(encoder_prompts)
+    # Insertion order (NONE, EMPTY_STR, CUSTOM) is kept in the returned dict.
+    decoder_prompts_by_type = {
+        DecoderPromptType.NONE: [None] * num_prompts,
+        DecoderPromptType.EMPTY_STR: [""] * num_prompts,
+        DecoderPromptType.CUSTOM: encoder_prompts[::-1],
+    }
+    return {
+        prompt_type: zip_enc_dec_prompts(encoder_prompts, decoder_prompts)
+        for prompt_type, decoder_prompts in decoder_prompts_by_type.items()
+    }
 @pytest.fixture
 def example_long_prompts() -> List[str]:
    prompts = []
@@ -151,7 +216,9 @@ class HfRunner:
        *,
        model_kwargs: Optional[Dict[str, Any]] = None,
        is_embedding_model: bool = False,
-        is_vision_model: bool = False,
+        auto_cls=AutoModelForCausalLM,
+        postprocess_inputs: Callable[[BatchEncoding],
+                                     BatchEncoding] = identity,
    ) -> None:
        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
@@ -166,11 +233,6 @@ class HfRunner:
                    device="cpu",
                ).to(dtype=torch_dtype))
        else:
-            if is_vision_model:
-                auto_cls = AutoModelForVision2Seq
-            else:
-                auto_cls = AutoModelForCausalLM
            model_kwargs = model_kwargs if model_kwargs is not None else {}
            self.model = self.wrap_device(
                auto_cls.from_pretrained(
@@ -195,12 +257,14 @@ class HfRunner:
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            )
-        except Exception:
+        except Exception as exc:
            logger.warning(
-                "Unable to auto-load processor from HuggingFace for "
+                "Unable to auto-load HuggingFace processor for model (%s). "
-                "model %s. Using tokenizer instead.", model_name)
+                "Using tokenizer instead. Reason: %s", model_name, exc)
            self.processor = self.tokenizer
+        self.postprocess_inputs = postprocess_inputs
    def generate(
        self,
        prompts: List[str],
@@ -220,6 +284,7 @@ class HfRunner:
                processor_kwargs["images"] = images[i]
            inputs = self.processor(**processor_kwargs)
+            inputs = self.postprocess_inputs(inputs)
            output_ids = self.model.generate(
                **self.wrap_device(inputs),
@@ -289,6 +354,7 @@ class HfRunner:
                processor_kwargs["images"] = images[i]
            inputs = self.processor(**processor_kwargs)
+            inputs = self.postprocess_inputs(inputs)
            output = self.model.generate(
                **self.wrap_device(inputs),
@@ -314,12 +380,51 @@ class HfRunner:
            all_logprobs.append(seq_logprobs)
        return all_logprobs
+    def _hidden_states_to_logprobs(
+        self,
+        hidden_states,
+        num_logprobs,
+    ) -> Tuple[List[Dict[int, float]], int]:
+        '''
+        Turn per-step hidden states into top-k logprob dicts.
+        Args:
+        * hidden_states: one entry per generation step; callers pass the
+          `hidden_states` / `decoder_hidden_states` of a generate() output.
+          For each step the last element is indexed with [0] (presumably
+          last layer, first batch item; confirm against the HF output).
+        * num_logprobs: how many top entries to keep per step.
+        Returns:
+        * List with one {token id: logprob} dict per step
+        * Number of steps, i.e. len(hidden_states)
+        '''
+        lm_head = self.model.get_output_embeddings()
+        head_bias = getattr(lm_head, "bias", None)
+        topk_per_step: List[Dict[int, float]] = []
+        for step, step_states in enumerate(hidden_states):
+            final_layer = step_states[-1][0]
+            logits = torch.matmul(final_layer, lm_head.weight.t())
+            if head_bias is not None:
+                logits += head_bias.unsqueeze(0)
+            step_logprobs = F.log_softmax(logits,
+                                          dim=-1,
+                                          dtype=torch.float32)
+            if step == 0:
+                # drop prompt logprobs: keep only the final row of step 0
+                step_logprobs = step_logprobs[-1, :].reshape(1, -1)
+            best = step_logprobs.topk(num_logprobs)
+            topk_per_step.append({
+                token_id.item(): logprob.item()
+                for token_id, logprob in zip(best.indices[0], best.values[0])
+            })
+        return topk_per_step, len(hidden_states)
    def generate_greedy_logprobs_limit(
        self,
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[List[Image.Image]] = None,
+        audios: Optional[List[Tuple[np.ndarray, int]]] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
        all_logprobs: List[List[Dict[int, float]]] = []
@@ -334,7 +439,13 @@ class HfRunner:
            if images is not None and images[i] is not None:
                processor_kwargs["images"] = images[i]
+            if audios is not None:
+                audio, sr = audios[i]
+                processor_kwargs["audio"] = audio
+                processor_kwargs["sampling_rate"] = sr
            inputs = self.processor(**processor_kwargs)
+            inputs = self.postprocess_inputs(inputs)
            output = self.model.generate(
                **self.wrap_device(inputs),
@@ -346,37 +457,66 @@ class HfRunner:
                **kwargs,
            )
-            seq_logprobs: List[torch.Tensor] = []
+            (
-            for _, hidden_states in enumerate(output.hidden_states):
+                seq_logprobs_lst,
-                last_hidden_states = hidden_states[-1][0]
+                output_len,
-                logits = torch.matmul(
+            ) = self._hidden_states_to_logprobs(output.hidden_states,
-                    last_hidden_states,
+                                                num_logprobs)
-                    self.model.get_output_embeddings().weight.t(),
-                )
-                if getattr(self.model.get_output_embeddings(), "bias",
-                           None) is not None:
-                    logits += self.model.get_output_embeddings(
-                    ).bias.unsqueeze(0)
-                logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
-                seq_logprobs.append(logprobs)
-            # convert to dict
+            all_logprobs.append(seq_logprobs_lst)
-            seq_logprobs_lst: List[Dict[int, float]] = []
+            seq_ids = output.sequences[0]
-            for tok_idx, tok_logprobs in enumerate(seq_logprobs):
+            output_len = len(seq_logprobs_lst)
-                # drop prompt logprobs
+            output_ids = seq_ids[-output_len:]
-                if tok_idx == 0:
+            all_output_ids.append(output_ids.tolist())
-                    tok_logprobs = tok_logprobs[-1, :].reshape(1, -1)
+            all_output_strs.append(self.tokenizer.decode(output_ids))
-                topk = tok_logprobs.topk(num_logprobs)
-                tok_logprobs_dct = {}
+        outputs = zip(all_output_ids, all_output_strs, all_logprobs)
-                for token_id, logprob in zip(topk.indices[0], topk.values[0]):
+        return [(output_ids, output_str, output_logprobs)
-                    tok_logprobs_dct[token_id.item()] = logprob.item()
+                for output_ids, output_str, output_logprobs in outputs]
-                seq_logprobs_lst.append(tok_logprobs_dct)
+    def generate_encoder_decoder_greedy_logprobs_limit(
+        self,
+        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        max_tokens: int,
+        num_logprobs: int,
+        **kwargs: Any,
+    ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
+        '''
+        Greedy logprobs generation for vLLM encoder/decoder models
+        '''
+        all_logprobs: List[List[Dict[int, float]]] = []
+        all_output_ids: List[List[int]] = []
+        all_output_strs: List[str] = []
+        for (encoder_prompt,
+             decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts):
+            encoder_input_ids = self.wrap_device(
+                self.tokenizer(encoder_prompt, return_tensors="pt").input_ids)
+            decoder_input_ids = (
+                None if decoder_prompt is None else self.wrap_device(
+                    self.tokenizer(decoder_prompt,
+                                   return_tensors="pt").input_ids))
+            output = self.model.generate(
+                encoder_input_ids,
+                decoder_input_ids=decoder_input_ids,
+                use_cache=True,
+                do_sample=False,
+                max_new_tokens=max_tokens,
+                output_hidden_states=True,
+                return_dict_in_generate=True,
+                **kwargs,
+            )
+            (
+                seq_logprobs_lst,
+                output_len,
+            ) = self._hidden_states_to_logprobs(output.decoder_hidden_states,
+                                                num_logprobs)
            all_logprobs.append(seq_logprobs_lst)
            seq_ids = output.sequences[0]
-            output_len = len(seq_logprobs_lst)
            output_ids = seq_ids[-output_len:]
            all_output_ids.append(output_ids.tolist())
            all_output_strs.append(self.tokenizer.decode(output_ids))
@@ -416,7 +556,7 @@ class VllmRunner:
        block_size: int = 16,
        enable_chunked_prefill: bool = False,
        swap_space: int = 4,
-        enforce_eager: bool = False,
+        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        self.model = LLM(
@@ -438,7 +578,8 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
-        images: Optional[List[Image.Image]] = None,
+        images: Optional[Union[List[Image.Image],
+                               List[List[Image.Image]]]] = None,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        if images is not None:
            assert len(prompts) == len(images)
@@ -465,11 +606,27 @@ class VllmRunner:
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs
+    def _final_steps_generate_w_logprobs(
+        self,
+        req_outputs: List[RequestOutput],
+    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
+        '''
+        Convert request outputs into (token ids, text, logprobs) tuples.
+        Exactly one tuple is produced per request. When a request carries
+        several samples, only the last one is reported.
+        Args:
+        * req_outputs: outputs returned by self.model.generate()
+        Returns:
+        * One (output token id list, output text, sample logprobs) tuple
+          per request, in request order
+        Raises:
+        * ValueError if a request contains no samples
+        '''
+        outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
+        for req_output in req_outputs:
+            samples = list(req_output.outputs)
+            if not samples:
+                # Without this check an empty request would either hit an
+                # unbound local or silently repeat the previous request.
+                raise ValueError("Request output contains no samples")
+            last_sample = samples[-1]
+            outputs.append((
+                list(last_sample.token_ids),
+                last_sample.text,
+                last_sample.logprobs,
+            ))
+        return outputs
    def generate_w_logprobs(
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
-        images: Optional[List[Image.Image]] = None,
+        images: Optional[Union[List[Image.Image],
+                               List[List[Image.Image]]]] = None,
+        audios: Optional[Union[List[Tuple[np.ndarray, int]],
+                               List[List[Tuple[np.ndarray, int]]]]] = None
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        assert sampling_params.logprobs is not None
@@ -481,16 +638,27 @@ class VllmRunner:
            for i, image in enumerate(images):
                inputs[i]["multi_modal_data"] = {"image": image}
+        if audios is not None:
+            for i, audio in enumerate(audios):
+                inputs[i]["multi_modal_data"] = {"audio": audio}
        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)
-        outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
+        return self._final_steps_generate_w_logprobs(req_outputs)
-        for req_output in req_outputs:
-            for sample in req_output.outputs:
+    def generate_encoder_decoder_w_logprobs(
-                output_str = sample.text
+        self,
-                output_ids = sample.token_ids
+        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
-                output_logprobs = sample.logprobs
+        sampling_params: SamplingParams,
-            outputs.append((output_ids, output_str, output_logprobs))
+    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
-        return outputs
+        '''
+        Logprobs generation for vLLM encoder/decoder models
+        '''
+        assert sampling_params.logprobs is not None
+        req_outputs = self.model.generate(encoder_decoder_prompts,
+                                          sampling_params=sampling_params)
+        return self._final_steps_generate_w_logprobs(req_outputs)
    def generate_greedy(
        self,
@@ -510,6 +678,8 @@ class VllmRunner:
        num_logprobs: int,
        images: Optional[Union[List[Image.Image],
                               List[List[Image.Image]]]] = None,
+        audios: Optional[Union[List[Tuple[np.ndarray, int]],
+                               List[List[Tuple[np.ndarray, int]]]]] = None,
        stop_token_ids: Optional[List[int]] = None,
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        greedy_logprobs_params = SamplingParams(temperature=0.0,
@@ -518,7 +688,28 @@ class VllmRunner:
                                                stop_token_ids=stop_token_ids)
        outputs = self.generate_w_logprobs(prompts,
                                           greedy_logprobs_params,
-                                           images=images)
+                                           images=images,
+                                           audios=audios)
+        return [(output_ids, output_str, output_logprobs)
+                for output_ids, output_str, output_logprobs in outputs]
+    def generate_encoder_decoder_greedy_logprobs(
+        self,
+        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        max_tokens: int,
+        num_logprobs: int,
+    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
+        '''
+        Greedy logprobs generation for vLLM encoder/decoder models
+        Args:
+        * encoder_decoder_prompts: explicit encoder/decoder prompt pairs
+        * max_tokens: maximum number of tokens to generate per prompt
+        * num_logprobs: number of logprobs requested per generated token
+        Returns:
+        * One (output token ids, output text, logprobs) tuple per prompt,
+          as produced by generate_encoder_decoder_w_logprobs()
+        '''
+        # temperature=0.0 with beam search disabled gives greedy decoding.
+        greedy_logprobs_params = SamplingParams(temperature=0.0,
+                                                use_beam_search=False,
+                                                max_tokens=max_tokens,
+                                                logprobs=num_logprobs)
+        outputs = self.generate_encoder_decoder_w_logprobs(
+            encoder_decoder_prompts, greedy_logprobs_params)
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]
@@ -593,3 +784,26 @@ def num_gpus_available():
    in current process."""
    return cuda_device_count_stateless()
+# Location of the locally cached, patched copy of facebook/opt-125m.
+temp_dir = tempfile.gettempdir()
+_dummy_path = os.path.join(temp_dir, "dummy_opt")
+@pytest.fixture
+def dummy_opt_path():
+    """Return a local facebook/opt-125m snapshot with a patched architecture.
+    The snapshot is downloaded into the system temp directory, skipping the
+    file patterns listed in ignore_patterns below, and its config.json is
+    rewritten so that "architectures" is ["MyOPTForCausalLM"].
+    Returns:
+        Path of the directory holding the patched snapshot.
+    """
+    json_path = os.path.join(_dummy_path, "config.json")
+    # Key the download on config.json rather than on the directory: a
+    # directory left behind by an interrupted download must not be treated
+    # as a complete snapshot.
+    if not os.path.exists(json_path):
+        snapshot_download(repo_id="facebook/opt-125m",
+                          local_dir=_dummy_path,
+                          ignore_patterns=[
+                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
+                              "*.msgpack"
+                          ])
+        assert os.path.exists(json_path)
+    with open(json_path, "r", encoding="utf-8") as f:
+        config = json.load(f)
+    dummy_architectures = ["MyOPTForCausalLM"]
+    # Patch idempotently so a run that stopped between download and patch
+    # is repaired on the next use instead of staying unpatched forever.
+    if config.get("architectures") != dummy_architectures:
+        config["architectures"] = dummy_architectures
+        with open(json_path, "w", encoding="utf-8") as f:
+            json.dump(config, f)
+    return _dummy_path
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
            # skip cuda graph creation for fast test.
            "enforce_eager": True,
            "enable_chunked_prefill": True,
-            "max_num_batched_tokens": 2,
-            "max_num_seqs": 2,
        },
    ])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("per_test_common_llm_kwargs",
+                         [{
+                             "block_size": 8,
+                             "max_num_batched_tokens": 2,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 3,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 256,
+                             "max_num_seqs": 10,
+                         }])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
    {
        "use_v2_block_manager": False,
@@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
+        ("1 + " * 50) + " 1 = ",  # Longer prompt.
        "The capital of France is",
        "The future of AI is",
    ]