Update to transformers v5 (#30566)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: khluu <khluu000@gmail.com>
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: khluu <khluu000@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: jiang1.li <jiang1.li@intel.com>

Update to transformers v5 (#30566)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: khluu <khluu000@gmail.com> Signed-off-by: Kevin H. Luu <khluu000@gmail.com> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: khluu <khluu000@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: jiang1.li <jiang1.li@intel.com>
03f8d3a5 · Harry Mellor · GitHub · 6dc94914 · 03f8d3a5 · 03f8d3a5
Unverified Commit 03f8d3a5 authored Apr 16, 2026 by Harry Mellor Committed by GitHub Apr 15, 2026
20 changed files
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
 docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 -e VLLM_CPU_ATTN_SPLIT_KV=0 --shm-size=4g "$IMAGE_NAME" \
        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -4,7 +4,6 @@ depends_on:
 steps:
 - label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
-  device: h200_18gb
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -73,3 +72,18 @@ steps:
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+
+- label: Transformers Backward Compatibility Models Test
+  working_dir: "/vllm-workspace/"
+  optional: true
+  soft_fail: true
+  commands:
+    - pip install transformers==4.57.5
+    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -642,7 +642,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    else \
        BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
    fi; \
-    uv pip install --system accelerate hf_transfer modelscope \
+    uv pip install --system accelerate modelscope \
        "bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"

 # ============================================================
@@ -756,9 +756,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils

 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
+ENV HF_XET_HIGH_PERFORMANCE=1
+
+# increase timeout (in seconds) for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT=60

 # Copy in the v1 package for testing (it isn't distributed yet)
 COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -197,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e tests/vllm_test_utils

+# enable fast downloads from hf (for testing)
+ENV HF_XET_HIGH_PERFORMANCE=1
+
+# increase timeout (in seconds) for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT=60
+
 ######################### RELEASE IMAGE #########################
 FROM base AS vllm-openai


--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -272,9 +272,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils

 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
+ENV HF_XET_HIGH_PERFORMANCE=1
+
+# increase timeout (in seconds) for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT=60

 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/test/nightly-torch.txt

--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -365,9 +365,10 @@ RUN cd /vllm-workspace \
    && python3 -m pip install pytest-shard

 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
+ENV HF_XET_HIGH_PERFORMANCE=1
+
+# increase timeout (in seconds) for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT=60

 # install audio decode package `torchcodec` from source (required due to 
 # ROCm and torch version mismatch) for tests with datasets package

--- a/docs/getting_started/installation/gpu.rocm.inc.md
+++ b/docs/getting_started/installation/gpu.rocm.inc.md
@@ -240,7 +240,7 @@ uv pip install vllm==${VLLM_VERSION} \
        # Install dependencies
        pip install --upgrade numba \
            scipy \
-            huggingface-hub[cli,hf_transfer] \
+            huggingface-hub[cli] \
            setuptools_scm
        pip install -r requirements/rocm.txt


--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.56.0, < 5
+transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.14.0.1 # required for compressed-tensors
+compressed-tensors == 0.15.0.1 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files

--- a/requirements/test/cuda.in
+++ b/requirements/test/cuda.in
@@ -18,7 +18,7 @@ httpx
 librosa # required for audio tests
 vector_quantize_pytorch # required for minicpmo_26 test
 vocos # required for minicpmo_26 test
-peft>=0.15.0 # required for phi-4-mm test
+peft>=0.18.1 # required for phi-4-mm test
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
 resampy # required for audio tests
@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
-tokenizers==0.22.0
+transformers==5.5.3
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes==0.49.2

--- a/requirements/test/cuda.txt
+++ b/requirements/test/cuda.txt
@@ -4,7 +4,7 @@ absl-py==2.1.0
    # via
    #   rouge-score
    #   tensorboard
-accelerate==1.0.1
+accelerate==1.13.0
    # via peft
 aenum==3.1.16
    # via lightly
@@ -248,7 +248,6 @@ filelock==3.16.1
    #   huggingface-hub
    #   ray
    #   torch
-    #   transformers
    #   virtualenv
 fiona==1.10.1
    # via torchgeo
@@ -331,7 +330,7 @@ h5py==3.13.0
    # via terratorch
 harfile==0.3.0
    # via schemathesis
-hf-xet==1.1.7
+hf-xet==1.4.3
    # via huggingface-hub
 hiredis==3.0.0
    # via tensorizer
@@ -345,9 +344,10 @@ httpx==0.27.2
    # via
    #   -r requirements/test/cuda.in
    #   diffusers
+    #   huggingface-hub
    #   perceptron
    #   schemathesis
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
    # via
    #   accelerate
    #   datasets
@@ -756,7 +756,7 @@ pathvalidate==3.2.1
    # via pytablewriter
 patsy==1.0.1
    # via statsmodels
-peft==0.16.0
+peft==0.18.1
    # via -r requirements/test/cuda.in
 perceptron==0.1.4
    # via -r requirements/test/cuda.in
@@ -982,7 +982,7 @@ referencing==0.35.1
    # via
    #   jsonschema
    #   jsonschema-specifications
-regex==2024.9.11
+regex==2026.2.28
    # via
    #   diffusers
    #   nltk
@@ -1002,7 +1002,6 @@ requests==2.32.3
    #   google-api-core
    #   google-cloud-storage
    #   gpt-oss
-    #   huggingface-hub
    #   lightly
    #   lm-eval
    #   mistral-common
@@ -1015,7 +1014,6 @@ requests==2.32.3
    #   starlette-testclient
    #   tacoreader
    #   tiktoken
-    #   transformers
    #   wandb
 resampy==0.4.3
    # via -r requirements/test/cuda.in
@@ -1216,7 +1214,7 @@ timm==1.0.17
    #   segmentation-models-pytorch
    #   terratorch
    #   torchgeo
-tokenizers==0.22.0
+tokenizers==0.22.2
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/cuda.in
@@ -1295,7 +1293,7 @@ tqdm==4.67.3
    #   tacoreader
    #   terratorch
    #   transformers
-transformers==4.57.5
+transformers==5.5.3
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/cuda.in
@@ -1317,7 +1315,9 @@ typepy==1.3.2
 typer==0.15.2
    # via
    #   fastsafetensors
+    #   huggingface-hub
    #   perceptron
+    #   transformers
 types-python-dateutil==2.9.0.20241206
    # via arrow
 typeshed-client==2.8.2

--- a/requirements/test/nightly-torch.txt
+++ b/requirements/test/nightly-torch.txt
@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
-tokenizers==0.22.0
+transformers==5.5.3
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes>=0.49.2

--- a/requirements/test/rocm.in
+++ b/requirements/test/rocm.in
@@ -38,8 +38,8 @@ opencv-python-headless>=4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
-tokenizers==0.22.0
+transformers==5.5.3
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test
 # quantization
 bitsandbytes==0.49.2
@@ -82,4 +82,3 @@ plotly # required for perf comparison html report
 rapidfuzz
 torchgeo==0.7.0
 multiprocess==0.70.16
-huggingface-hub==0.36.2
--- a/requirements/test/rocm.txt
+++ b/requirements/test/rocm.txt
@@ -39,7 +39,7 @@ annotated-doc==0.0.4
    #   typer
 annotated-types==0.7.0
    # via pydantic
-anthropic==0.89.0
+anthropic==0.93.0
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -172,7 +172,7 @@ colorful==0.5.8
    # via ray
 colorlog==6.10.1
    # via optuna
-compressed-tensors==0.14.0.1
+compressed-tensors==0.15.0.1
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -269,9 +269,9 @@ fastapi==0.135.2
    #   model-hosting-container-standards
 fastapi-cli==0.0.24
    # via fastapi
-fastapi-cloud-cli==0.15.1
+fastapi-cloud-cli==0.16.1
    # via fastapi-cli
-fastar==0.9.0
+fastar==0.10.0
    # via fastapi-cloud-cli
 fastparquet==2026.3.0
    # via genai-perf
@@ -290,7 +290,6 @@ filelock==3.25.2
    #   python-discovery
    #   ray
    #   torch
-    #   transformers
    #   virtualenv
 fiona==1.10.1
    # via torchgeo
@@ -384,7 +383,7 @@ h5py==3.16.0
    # via terratorch
 harfile==0.4.0
    # via schemathesis
-hf-xet==1.4.2
+hf-xet==1.4.3
    # via huggingface-hub
 hiredis==3.3.1
    # via tensorizer
@@ -403,6 +402,7 @@ httpx==0.27.2
    #   diffusers
    #   fastapi
    #   fastapi-cloud-cli
+    #   huggingface-hub
    #   mcp
    #   model-hosting-container-standards
    #   openai
@@ -410,9 +410,8 @@ httpx==0.27.2
    #   schemathesis
 httpx-sse==0.4.3
    # via mcp
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
    # via
-    #   -r requirements/test/rocm.in
    #   accelerate
    #   datasets
    #   diffusers
@@ -484,7 +483,7 @@ jinja2==3.1.6
    #   genai-perf
    #   lm-eval
    #   torch
-jiter==0.13.0
+jiter==0.14.0
    # via
    #   anthropic
    #   openai
@@ -631,7 +630,7 @@ msgpack==1.1.2
    # via
    #   librosa
    #   ray
-msgspec==0.20.0
+msgspec==0.21.0
    # via -r requirements/test/../common.txt
 mteb==2.11.5
    # via -r requirements/test/rocm.in
@@ -742,7 +741,7 @@ omegaconf==2.3.0
    #   lightning
 open-clip-torch==2.32.0
    # via -r requirements/test/rocm.in
-openai==2.30.0
+openai==2.31.0
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -1093,7 +1092,7 @@ python-dotenv==1.2.2
    #   uvicorn
 python-json-logger==4.1.0
    # via -r requirements/test/../common.txt
-python-multipart==0.0.22
+python-multipart==0.0.26
    # via
    #   fastapi
    #   mcp
@@ -1180,7 +1179,6 @@ requests==2.32.5
    #   google-api-core
    #   google-cloud-storage
    #   gpt-oss
-    #   huggingface-hub
    #   lightly
    #   lm-eval
    #   mistral-common
@@ -1194,7 +1192,6 @@ requests==2.32.5
    #   starlette-testclient
    #   tacoreader
    #   tiktoken
-    #   transformers
    #   wandb
 resampy==0.4.3
    # via -r requirements/test/rocm.in
@@ -1428,7 +1425,7 @@ timm==1.0.17
    #   segmentation-models-pytorch
    #   terratorch
    #   torchgeo
-tokenizers==0.22.0
+tokenizers==0.22.2
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -1471,7 +1468,7 @@ tqdm==4.67.3
    #   tacoreader
    #   terratorch
    #   transformers
-transformers==4.57.5
+transformers==5.5.3
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -1498,7 +1495,9 @@ typer==0.24.1
    #   fastapi-cli
    #   fastapi-cloud-cli
    #   fastsafetensors
+    #   huggingface-hub
    #   perceptron
+    #   transformers
 typeshed-client==2.9.0
    # via jsonargparse
 typing-extensions==4.15.0

--- a/requirements/test/xpu.in
+++ b/requirements/test/xpu.in
@@ -13,7 +13,6 @@ pytest-shard
 absl-py
 accelerate
 arctic-inference
-hf_transfer
 lm_eval[api]
 modelscope


--- a/requirements/test/xpu.txt
+++ b/requirements/test/xpu.txt
@@ -19,7 +19,9 @@ aiosignal==1.4.0
 albumentations==1.4.6
    # via -r requirements/test/xpu.in
 annotated-doc==0.0.4
-    # via fastapi
+    # via
+    #   fastapi
+    #   typer
 annotated-types==0.7.0
    # via pydantic
 anyio==4.13.0
@@ -64,6 +66,7 @@ click==8.3.1
    #   jiwer
    #   nltk
    #   schemathesis
+    #   typer
    #   uvicorn
 colorama==0.4.6
    # via sacrebleu
@@ -112,7 +115,6 @@ filelock==3.25.2
    #   huggingface-hub
    #   modelscope
    #   torch
-    #   transformers
 frozenlist==1.8.0
    # via
    #   aiohttp
@@ -133,9 +135,7 @@ h11==0.16.0
    #   uvicorn
 harfile==0.4.0
    # via schemathesis
-hf-transfer==0.1.9
-    # via -r requirements/test/xpu.in
-hf-xet==1.4.2
+hf-xet==1.4.3
    # via huggingface-hub
 html2text==2025.4.15
    # via gpt-oss
@@ -144,8 +144,9 @@ httpcore==1.0.9
 httpx==0.28.1
    # via
    #   datasets
+    #   huggingface-hub
    #   schemathesis
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
    # via
    #   accelerate
    #   datasets
@@ -515,7 +516,6 @@ requests==2.33.1
    #   docker
    #   evaluate
    #   gpt-oss
-    #   huggingface-hub
    #   lm-eval
    #   mistral-common
    #   modelscope
@@ -524,11 +524,11 @@ requests==2.33.1
    #   schemathesis
    #   starlette-testclient
    #   tiktoken
-    #   transformers
 rich==14.3.3
    # via
    #   mteb
    #   schemathesis
+    #   typer
 rouge-score==0.1.2
    # via lm-eval
 rpds-py==0.30.0
@@ -572,6 +572,8 @@ setuptools==80.10.2
    #   modelscope
    #   pytablewriter
    #   torch
+shellingham==1.5.4
+    # via typer
 six==1.17.0
    # via
    #   -c requirements/common.txt
@@ -665,7 +667,7 @@ tqdm==4.67.3
    #   pqdm
    #   sentence-transformers
    #   transformers
-transformers==4.57.6
+transformers==5.5.3
    # via
    #   -c requirements/common.txt
    #   sentence-transformers
@@ -676,6 +678,10 @@ typepy==1.3.4
    #   dataproperty
    #   pytablewriter
    #   tabledata
+typer==0.24.1
+    # via
+    #   huggingface-hub
+    #   transformers
 typing-extensions==4.15.0
    # via
    #   -c requirements/common.txt

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -410,6 +410,15 @@ class HfRunner:
            model_name,
            trust_remote_code=trust_remote_code,
        )
+        # HF runner should use the HF config so that it's consistent with the HF model
+        if self.config.__module__.startswith("vllm.transformers_utils.configs"):
+            from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+
+            del CONFIG_MAPPING._extra_content[self.config.model_type]
+            self.config = AutoConfig.from_pretrained(
+                model_name,
+                trust_remote_code=trust_remote_code,
+            )
        self.device = self.get_default_device()
        self.dtype = dtype = _get_and_verify_dtype(
            self.model_name,

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -3,6 +3,7 @@

 import tempfile
 from collections import OrderedDict
+from importlib import reload
 from unittest.mock import MagicMock

 import pytest
@@ -47,6 +48,11 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
 def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
    if current_platform.is_cuda():
        monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
+        import vllm.lora.layers.base_linear
+
+        if not hasattr(vllm.lora.layers.base_linear, "lora_linear_async"):
+            # Reload the module to ensure the environment variable takes effect.
+            reload(vllm.lora.layers.base_linear)
    yield



--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from importlib.metadata import version
+
 import pytest
+from packaging.version import Version

 import vllm
 from vllm.assets.image import ImageAsset
@@ -10,6 +13,14 @@ from vllm.platforms import current_platform

 from ..utils import multi_gpu_test

+pytestmark = pytest.mark.skipif(
+    Version("5.0") <= Version(version("transformers")),
+    reason=(
+        "MiniCPMV custom processor uses tokenizer.im_start_id which is not "
+        "available on TokenizersBackend in transformers v5.0+"
+    ),
+)
+
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"

 PROMPT_TEMPLATE = (

--- a/tests/model_executor/test_weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import os
 import tempfile

 import huggingface_hub.constants
@@ -10,26 +9,10 @@ from huggingface_hub.utils import LocalEntryNotFoundError

 from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf,
-    enable_hf_transfer,
    maybe_remap_kv_scale_name,
 )


-def test_hf_transfer_auto_activation():
-    if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
-        # in case it is already set, we can't test the auto activation
-        pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
-    enable_hf_transfer()
-    try:
-        # enable hf hub transfer if available
-        import hf_transfer  # type: ignore # noqa
-
-        HF_TRANSFER_ACTIVE = True
-    except ImportError:
-        HF_TRANSFER_ACTIVE = False
-    assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == HF_TRANSFER_ACTIVE
-
-
 def test_download_weights_from_hf():
    with tempfile.TemporaryDirectory() as tmpdir:
        # assert LocalEntryNotFoundError error is thrown
@@ -178,5 +161,4 @@ class TestMaybeRemapKvScaleName:


 if __name__ == "__main__":
-    test_hf_transfer_auto_activation()
    test_download_weights_from_hf()
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -143,6 +143,11 @@ def test_models(
        # in parts of the operators
        pytest.skip(f"Skipping '{model}' model test with AITER kernel.")

+    if current_platform.is_cpu() and model == "TitanML/tiny-mixtral":
+        # This untrained model is sensitive to the rounding error
+        # Fuse ops to reduce bfloat16 rounding
+        monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
+
    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs