Merge tag 'v0.8.1' into v0.8.1-ori

ca796e19 · zhuwenwen · e983c804 · 61c7a1b8 · e983c804 · ca796e19
Commit ca796e19 authored Mar 21, 2025 by zhuwenwen
20 changed files
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
-#!/bin/bash
-
-set -e
-
-# Build the docker image.
-docker build -f Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
-trap remove_docker_container EXIT
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# For HF_TOKEN.
-source /etc/environment
-# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
-    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
-    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
--- a/.buildkite/run-tpu-v1-test.sh
+++ b/.buildkite/run-tpu-v1-test.sh
@@ -15,13 +15,22 @@ remove_docker_container
 source /etc/environment
 # Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest \
    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
-    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
-    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
-    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
-    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
+    && echo TEST_1 \
+    && VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && echo TEST_2 \
+    && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+    && echo TEST_3 \
+    && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+    && echo TEST_4 \
+    && VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && echo TEST_5 \
+    && VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py" \
+    
+
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -12,10 +12,11 @@ docker build -t ${image_name} -f Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() { 
-  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
+  docker rm -f "${container_name}" || true; 
+  docker image rm -f "${image_name}" || true;
+  docker system prune -f || true;
 }
 trap remove_docker_container EXIT
-remove_docker_container

 # Run the image and test offline inference/tensor parallel
 docker run \
@@ -25,6 +26,6 @@ docker run \
    --name "${container_name}" \
    "${image_name}" \
    sh -c '
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
-FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
+# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
+FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base

-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+RUN rm /etc/apt/sources.list.d/intel-graphics.list

 RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
@@ -21,8 +17,6 @@ RUN apt-get update -y && \
    python3 \
    python3-dev \
    python3-pip \
-    libze-intel-gpu-dev \
-    libze-intel-gpu1 \
    wget

 WORKDIR /workspace/vllm

--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Easy, fast, and cheap LLM serving for everyone

 *Latest News* 🔥

- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -732,8 +732,11 @@ def main(args: argparse.Namespace):
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"

-    tokenizer = get_tokenizer(tokenizer_id,
-                              trust_remote_code=args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        tokenizer_id,
+        trust_remote_code=args.trust_remote_code,
+        tokenizer_mode=args.tokenizer_mode,
+    )

    if args.dataset == 'grammar':
        args.structure_type = 'guided_grammar'
@@ -876,6 +879,13 @@ if __name__ == "__main__":
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
    parser.add_argument(
        "--num-prompts",
        type=int,

--- a/benchmarks/run_structured_output_benchmark.sh
+++ b/benchmarks/run_structured_output_benchmark.sh
@@ -54,6 +54,7 @@ for qps in "${QPS_VALUES[@]}"; do
  python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
    --request-rate $qps \
    --result-filename "$FILENAME" \
+    --tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
    --port ${PORT:-8000}

  echo "Completed benchmark with QPS: $qps"

--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -350,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(

 }  // namespace vllm

-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)               \
  vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>             \
@@ -393,8 +393,8 @@ void reshape_and_cache(
                             CALL_RESHAPE_AND_CACHE)
 }

-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE)         \
  vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE>       \
@@ -446,8 +446,8 @@ void reshape_and_cache_flash(
                             CALL_RESHAPE_AND_CACHE_FLASH);
 }

-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)              \
  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>            \

--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
 create a custom Dockerfile on top of the base image with an extra layer that installs them:

 ```Dockerfile
-FROM vllm/vllm-openai:v0.7.3
+FROM vllm/vllm-openai:v0.8.0

 # e.g. install the `audio` and `video` optional dependencies
 # NOTE: Make sure the version of vLLM matches the base image!
-RUN uv pip install --system vllm[audio,video]==0.7.3
+RUN uv pip install vllm[audio,video]==0.8.0
 ```

 :::
@@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
 ```Dockerfile
 FROM vllm/vllm-openai:latest

-RUN uv pip install --system git+https://github.com/huggingface/transformers.git
+RUN uv pip install git+https://github.com/huggingface/transformers.git
 ```

 :::

--- a/docs/source/design/v1/prefix_caching.md
+++ b/docs/source/design/v1/prefix_caching.md
@@ -191,7 +191,7 @@ When the head block (least recently used block) of the free queue is cached, we

 In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.

-**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
+**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.

 :::{image} /assets/design/v1/prefix_caching/example-time-1.png
 :alt: Example Time 1
@@ -203,7 +203,7 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
 :alt: Example Time 3
 :::

-**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
+**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.

 :::{image} /assets/design/v1/prefix_caching/example-time-4.png
 :alt: Example Time 4

--- a/docs/source/getting_started/v1_user_guide.md
+++ b/docs/source/getting_started/v1_user_guide.md
@@ -2,6 +2,8 @@

 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).

+To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
+
 ## Why vLLM V1?

 vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.

--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -768,7 +768,7 @@ See [this page](#generative-models) for more information on how to use generativ
  * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
  * ✅︎
  * ✅︎
-  *
+  * ⚠️
 - * `GLM4VForCausalLM`<sup>^</sup>
  * GLM-4V
  * T + I
@@ -884,7 +884,7 @@ See [this page](#generative-models) for more information on how to use generativ
 - * `PixtralForConditionalGeneration`
  * Pixtral
  * T + I<sup>+</sup>
-  * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b`, etc.
+  * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.
  *
  * ✅︎
  * ✅︎
@@ -951,13 +951,10 @@ V0 correctly implements the model's attention pattern:

 V1 currently uses a simplified attention pattern:
 - Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
+- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
 - Will be updated in the future to support the correct behavior
- Does not support `"do_pan_and_scan": True`

 This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
-
-For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
 :::

 :::{note}

--- a/examples/offline_inference/pixtral.py
+++ b/examples/offline_inference/pixtral.py
@@ -6,14 +6,14 @@ import argparse
 from vllm import LLM
 from vllm.sampling_params import SamplingParams

-# This script is an offline demo for running Pixtral.
+# This script is an offline demo for running Mistral-Small-3
 #
 # If you want to run a server/client setup, please follow this code:
 #
 # - Server:
 #
 # ```bash
-# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
 # ```
 #
 # - Client:
@@ -23,7 +23,7 @@ from vllm.sampling_params import SamplingParams
 # --header 'Content-Type: application/json' \
 # --header 'Authorization: Bearer token' \
 # --data '{
-#     "model": "mistralai/Pixtral-12B-2409",
+#     "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
 #     "messages": [
 #       {
 #         "role": "user",
@@ -44,7 +44,7 @@ from vllm.sampling_params import SamplingParams


 def run_simple_demo(args: argparse.Namespace):
-    model_name = "mistralai/Pixtral-12B-2409"
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    sampling_params = SamplingParams(max_tokens=8192)

    # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
@@ -83,7 +83,7 @@ def run_simple_demo(args: argparse.Namespace):


 def run_advanced_demo(args: argparse.Namespace):
-    model_name = "mistralai/Pixtral-12B-2409"
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    max_img_per_msg = 5
    max_tokens_per_img = 4096


--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -60,7 +60,7 @@ class TestSetting:
        # embedding model
        TestSetting(
            model="BAAI/bge-multilingual-gemma2",
-            model_args=["--task", "embed"],
+            model_args=["--task", "embed", "--dtype", "bfloat16"],
            pp_size=1,
            tp_size=1,
            attn_backend="FLASH_ATTN",

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -14,8 +14,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
-                          BatchFeature)
+from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
+                          BatchEncoding, BatchFeature)
 from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from tests.models.utils import (TokensTextLogprobs,
@@ -23,7 +23,7 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
@@ -34,8 +34,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity, is_list_of)
+from vllm.utils import cuda_device_count_stateless, is_list_of

 logger = init_logger(__name__)

@@ -271,14 +270,18 @@ _R = TypeVar("_R")

 class HfRunner:

-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+    def get_default_device(self):
        from vllm.platforms import current_platform
+
+        return ("cpu" if current_platform.is_cpu()
+                or current_platform.is_openvino() else "cuda")
+
+    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        if x is None or isinstance(x, (bool, )):
            return x

        if device is None:
-            device = "cpu" if current_platform.is_cpu(
-            ) or current_platform.is_openvino() else "cuda"
+            device = self.device

        if isinstance(x, dict):
            return {k: self.wrap_device(v, device) for k, v in x.items()}
@@ -291,45 +294,59 @@ class HfRunner:
    def __init__(
        self,
        model_name: str,
-        dtype: str = "half",
+        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
-        postprocess_inputs: Callable[..., BatchEncoding] = identity,
    ) -> None:
-        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
        self.model_name = model_name

+        self.config = AutoConfig.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+        )
+        self.device = self.get_default_device()
+        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
+
+        model_kwargs = model_kwargs if model_kwargs is not None else {}
+        model_kwargs.setdefault("torch_dtype", torch_dtype)
+
        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer
-            self.model = self.wrap_device(
-                SentenceTransformer(
-                    model_name,
-                    device="cpu",
-                    trust_remote_code=True,
-                ).to(dtype=torch_dtype))
+
+            self.model = SentenceTransformer(
+                model_name,
+                device=self.device,
+                model_kwargs=model_kwargs,
+                trust_remote_code=True,
+            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder
-            self.model = CrossEncoder(model_name,
-                                      device="cpu",
-                                      trust_remote_code=True)
-            self.model.model = self.wrap_device(self.model.model)\
-                .to(dtype=torch_dtype)
+
+            self.model = CrossEncoder(
+                model_name,
+                device=self.device,
+                automodel_args=model_kwargs,
+                trust_remote_code=True,
+            )
        else:
-            model_kwargs = model_kwargs if model_kwargs is not None else {}
-            self.model = self.wrap_device(
-                auto_cls.from_pretrained(
-                    model_name,
-                    torch_dtype=torch_dtype,
-                    trust_remote_code=True,
-                    **model_kwargs,
-                ))
+            model = auto_cls.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                **model_kwargs,
+            )
+
+            if (getattr(model, "quantization_method", None) != "bitsandbytes"
+                    and len({p.device
+                             for p in model.parameters()}) < 2):
+                model = model.to(self.device)
+
+            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
@@ -349,16 +366,13 @@ class HfRunner:
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer

-        self.dtype = dtype
-        self.postprocess_inputs = postprocess_inputs
-
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
-    ) -> list[BatchEncoding]:
+    ) -> list[Union[BatchFeature, BatchEncoding]]:
        if images is not None:
            assert len(prompts) == len(images)

@@ -368,7 +382,7 @@ class HfRunner:
        if audios is not None:
            assert len(prompts) == len(audios)

-        all_inputs: list[BatchEncoding] = []
+        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
        for i, prompt in enumerate(prompts):
            processor_kwargs: dict[str, Any] = {
                "text": prompt,
@@ -384,7 +398,8 @@ class HfRunner:
                processor_kwargs["sampling_rate"] = sr

            inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
+            if isinstance(inputs, BatchFeature):
+                inputs = inputs.to(dtype=self.dtype)

            all_inputs.append(inputs)

@@ -417,7 +432,7 @@ class HfRunner:
        outputs: list[tuple[list[list[int]], list[str]]] = []
        for inputs in all_inputs:
            output_ids = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                use_cache=True,
                **kwargs,
            )
@@ -488,7 +503,7 @@ class HfRunner:
        all_logprobs: list[list[torch.Tensor]] = []
        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@@ -569,7 +584,7 @@ class HfRunner:

        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@@ -620,19 +635,15 @@ class HfRunner:
            if images is not None and images[i] is not None:
                processor_kwargs["images"] = images[i]

-            encoder_inputs = self.wrap_device(
-                self.processor(**processor_kwargs),
-                device=self.model.device.type,
-            )
+            encoder_inputs = self.processor(**processor_kwargs)
+            encoder_inputs = self.wrap_device(encoder_inputs)

            if decoder_prompt is None:
                decoder_input_ids = None
            else:
-                decoder_input_ids = self.wrap_device(
-                    self.tokenizer(decoder_prompt,
-                                   return_tensors="pt").input_ids,
-                    device=self.model.device.type,
-                )
+                decoder_inputs = self.tokenizer(decoder_prompt,
+                                                return_tensors="pt")
+                decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)

            output = self.model.generate(
                decoder_input_ids=decoder_input_ids,
@@ -684,6 +695,7 @@ class VllmRunner:
    """
    The default value of some arguments have been modified from
    :class:`~vllm.LLM` as follows:
+
    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
@@ -701,10 +713,8 @@ class VllmRunner:
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
-        # Use smaller max model length, otherwise bigger model cannot run due
-        # to kv cache size limit.
        max_model_len: int = 1024,
-        dtype: str = "half",
+        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
@@ -1110,4 +1120,4 @@ def pytest_collection_modifyitems(config, items):
    skip_optional = pytest.mark.skip(reason="need --optional option to run")
    for item in items:
        if "optional" in item.keywords:
-            item.add_marker(skip_optional)
\ No newline at end of file
+            item.add_marker(skip_optional)
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -64,7 +64,6 @@ def test_multi_chat():
 def test_chat_multi_image(image_urls: list[str]):
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
-        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,

--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -18,8 +18,6 @@ TEST_AUDIO_URLS = [
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--dtype",
-        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",

--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -24,8 +24,6 @@ def server():
    args = [
        "--task",
        "generate",
-        "--dtype",
-        "bfloat16",
        "--max-model-len",
        "32768",
        "--max-num-seqs",

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -25,8 +25,6 @@ def server():
    args = [
        "--task",
        "generate",
-        "--dtype",
-        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",

--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -28,8 +28,6 @@ def server():
    args = [
        "--task",
        "embed",
-        "--dtype",
-        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",