Unverified commit 03f8d3a5, authored by Harry Mellor and committed by GitHub
Browse files

Update to transformers v5 (#30566)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: khluu <khluu000@gmail.com>
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: khluu <khluu000@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: jiang1.li <jiang1.li@intel.com>
parent 6dc94914
......@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 -e VLLM_CPU_ATTN_SPLIT_KV=0 --shm-size=4g "$IMAGE_NAME" \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
......@@ -4,7 +4,6 @@ depends_on:
steps:
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
device: h200_18gb
torch_nightly: true
source_file_dependencies:
- vllm/
......@@ -73,3 +72,18 @@ steps:
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Transformers Backward Compatibility Models Test
working_dir: "/vllm-workspace/"
optional: true
soft_fail: true
commands:
- pip install transformers==4.57.5
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
......@@ -642,7 +642,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
else \
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
fi; \
uv pip install --system accelerate hf_transfer modelscope \
uv pip install --system accelerate modelscope \
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
# ============================================================
......@@ -756,9 +756,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
ENV HF_XET_HIGH_PERFORMANCE 1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
......
......@@ -197,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
ENV HF_XET_HIGH_PERFORMANCE 1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
######################### RELEASE IMAGE #########################
FROM base AS vllm-openai
......
......@@ -272,9 +272,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
ENV HF_XET_HIGH_PERFORMANCE 1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/test/nightly-torch.txt
......
......@@ -365,9 +365,10 @@ RUN cd /vllm-workspace \
&& python3 -m pip install pytest-shard
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV HF_XET_HIGH_PERFORMANCE=1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
# install audio decode package `torchcodec` from source (required due to
# ROCm and torch version mismatch) for tests with datasets package
......
......@@ -240,7 +240,7 @@ uv pip install vllm==${VLLM_VERSION} \
# Install dependencies
pip install --upgrade numba \
scipy \
huggingface-hub[cli,hf_transfer] \
huggingface-hub[cli] \
setuptools_scm
pip install -r requirements/rocm.txt
......
......@@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.56.0, < 5
transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
......@@ -37,7 +37,7 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.14.0.1 # required for compressed-tensors
compressed-tensors == 0.15.0.1 # required for compressed-tensors
depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
......
......@@ -18,7 +18,7 @@ httpx
librosa # required for audio tests
vector_quantize_pytorch # required for minicpmo_26 test
vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test
peft>=0.18.1 # required for phi-4-mm test
pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests
......@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5
tokenizers==0.22.0
transformers==5.5.3
tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes==0.49.2
......
......@@ -4,7 +4,7 @@ absl-py==2.1.0
# via
# rouge-score
# tensorboard
accelerate==1.0.1
accelerate==1.13.0
# via peft
aenum==3.1.16
# via lightly
......@@ -248,7 +248,6 @@ filelock==3.16.1
# huggingface-hub
# ray
# torch
# transformers
# virtualenv
fiona==1.10.1
# via torchgeo
......@@ -331,7 +330,7 @@ h5py==3.13.0
# via terratorch
harfile==0.3.0
# via schemathesis
hf-xet==1.1.7
hf-xet==1.4.3
# via huggingface-hub
hiredis==3.0.0
# via tensorizer
......@@ -345,9 +344,10 @@ httpx==0.27.2
# via
# -r requirements/test/cuda.in
# diffusers
# huggingface-hub
# perceptron
# schemathesis
huggingface-hub==0.36.2
huggingface-hub==1.10.2
# via
# accelerate
# datasets
......@@ -756,7 +756,7 @@ pathvalidate==3.2.1
# via pytablewriter
patsy==1.0.1
# via statsmodels
peft==0.16.0
peft==0.18.1
# via -r requirements/test/cuda.in
perceptron==0.1.4
# via -r requirements/test/cuda.in
......@@ -982,7 +982,7 @@ referencing==0.35.1
# via
# jsonschema
# jsonschema-specifications
regex==2024.9.11
regex==2026.2.28
# via
# diffusers
# nltk
......@@ -1002,7 +1002,6 @@ requests==2.32.3
# google-api-core
# google-cloud-storage
# gpt-oss
# huggingface-hub
# lightly
# lm-eval
# mistral-common
......@@ -1015,7 +1014,6 @@ requests==2.32.3
# starlette-testclient
# tacoreader
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test/cuda.in
......@@ -1216,7 +1214,7 @@ timm==1.0.17
# segmentation-models-pytorch
# terratorch
# torchgeo
tokenizers==0.22.0
tokenizers==0.22.2
# via
# -c requirements/common.txt
# -r requirements/test/cuda.in
......@@ -1295,7 +1293,7 @@ tqdm==4.67.3
# tacoreader
# terratorch
# transformers
transformers==4.57.5
transformers==5.5.3
# via
# -c requirements/common.txt
# -r requirements/test/cuda.in
......@@ -1317,7 +1315,9 @@ typepy==1.3.2
typer==0.15.2
# via
# fastsafetensors
# huggingface-hub
# perceptron
# transformers
types-python-dateutil==2.9.0.20241206
# via arrow
typeshed-client==2.8.2
......
......@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5
tokenizers==0.22.0
transformers==5.5.3
tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.49.2
......
......@@ -38,8 +38,8 @@ opencv-python-headless>=4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5
tokenizers==0.22.0
transformers==5.5.3
tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test
# quantization
bitsandbytes==0.49.2
......@@ -82,4 +82,3 @@ plotly # required for perf comparison html report
rapidfuzz
torchgeo==0.7.0
multiprocess==0.70.16
huggingface-hub==0.36.2
......@@ -39,7 +39,7 @@ annotated-doc==0.0.4
# typer
annotated-types==0.7.0
# via pydantic
anthropic==0.89.0
anthropic==0.93.0
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
......@@ -172,7 +172,7 @@ colorful==0.5.8
# via ray
colorlog==6.10.1
# via optuna
compressed-tensors==0.14.0.1
compressed-tensors==0.15.0.1
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
......@@ -269,9 +269,9 @@ fastapi==0.135.2
# model-hosting-container-standards
fastapi-cli==0.0.24
# via fastapi
fastapi-cloud-cli==0.15.1
fastapi-cloud-cli==0.16.1
# via fastapi-cli
fastar==0.9.0
fastar==0.10.0
# via fastapi-cloud-cli
fastparquet==2026.3.0
# via genai-perf
......@@ -290,7 +290,6 @@ filelock==3.25.2
# python-discovery
# ray
# torch
# transformers
# virtualenv
fiona==1.10.1
# via torchgeo
......@@ -384,7 +383,7 @@ h5py==3.16.0
# via terratorch
harfile==0.4.0
# via schemathesis
hf-xet==1.4.2
hf-xet==1.4.3
# via huggingface-hub
hiredis==3.3.1
# via tensorizer
......@@ -403,6 +402,7 @@ httpx==0.27.2
# diffusers
# fastapi
# fastapi-cloud-cli
# huggingface-hub
# mcp
# model-hosting-container-standards
# openai
......@@ -410,9 +410,8 @@ httpx==0.27.2
# schemathesis
httpx-sse==0.4.3
# via mcp
huggingface-hub==0.36.2
huggingface-hub==1.10.2
# via
# -r requirements/test/rocm.in
# accelerate
# datasets
# diffusers
......@@ -484,7 +483,7 @@ jinja2==3.1.6
# genai-perf
# lm-eval
# torch
jiter==0.13.0
jiter==0.14.0
# via
# anthropic
# openai
......@@ -631,7 +630,7 @@ msgpack==1.1.2
# via
# librosa
# ray
msgspec==0.20.0
msgspec==0.21.0
# via -r requirements/test/../common.txt
mteb==2.11.5
# via -r requirements/test/rocm.in
......@@ -742,7 +741,7 @@ omegaconf==2.3.0
# lightning
open-clip-torch==2.32.0
# via -r requirements/test/rocm.in
openai==2.30.0
openai==2.31.0
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
......@@ -1093,7 +1092,7 @@ python-dotenv==1.2.2
# uvicorn
python-json-logger==4.1.0
# via -r requirements/test/../common.txt
python-multipart==0.0.22
python-multipart==0.0.26
# via
# fastapi
# mcp
......@@ -1180,7 +1179,6 @@ requests==2.32.5
# google-api-core
# google-cloud-storage
# gpt-oss
# huggingface-hub
# lightly
# lm-eval
# mistral-common
......@@ -1194,7 +1192,6 @@ requests==2.32.5
# starlette-testclient
# tacoreader
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test/rocm.in
......@@ -1428,7 +1425,7 @@ timm==1.0.17
# segmentation-models-pytorch
# terratorch
# torchgeo
tokenizers==0.22.0
tokenizers==0.22.2
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
......@@ -1471,7 +1468,7 @@ tqdm==4.67.3
# tacoreader
# terratorch
# transformers
transformers==4.57.5
transformers==5.5.3
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
......@@ -1498,7 +1495,9 @@ typer==0.24.1
# fastapi-cli
# fastapi-cloud-cli
# fastsafetensors
# huggingface-hub
# perceptron
# transformers
typeshed-client==2.9.0
# via jsonargparse
typing-extensions==4.15.0
......
......@@ -13,7 +13,6 @@ pytest-shard
absl-py
accelerate
arctic-inference
hf_transfer
lm_eval[api]
modelscope
......
......@@ -19,7 +19,9 @@ aiosignal==1.4.0
albumentations==1.4.6
# via -r requirements/test/xpu.in
annotated-doc==0.0.4
# via fastapi
# via
# fastapi
# typer
annotated-types==0.7.0
# via pydantic
anyio==4.13.0
......@@ -64,6 +66,7 @@ click==8.3.1
# jiwer
# nltk
# schemathesis
# typer
# uvicorn
colorama==0.4.6
# via sacrebleu
......@@ -112,7 +115,6 @@ filelock==3.25.2
# huggingface-hub
# modelscope
# torch
# transformers
frozenlist==1.8.0
# via
# aiohttp
......@@ -133,9 +135,7 @@ h11==0.16.0
# uvicorn
harfile==0.4.0
# via schemathesis
hf-transfer==0.1.9
# via -r requirements/test/xpu.in
hf-xet==1.4.2
hf-xet==1.4.3
# via huggingface-hub
html2text==2025.4.15
# via gpt-oss
......@@ -144,8 +144,9 @@ httpcore==1.0.9
httpx==0.28.1
# via
# datasets
# huggingface-hub
# schemathesis
huggingface-hub==0.36.2
huggingface-hub==1.10.2
# via
# accelerate
# datasets
......@@ -515,7 +516,6 @@ requests==2.33.1
# docker
# evaluate
# gpt-oss
# huggingface-hub
# lm-eval
# mistral-common
# modelscope
......@@ -524,11 +524,11 @@ requests==2.33.1
# schemathesis
# starlette-testclient
# tiktoken
# transformers
rich==14.3.3
# via
# mteb
# schemathesis
# typer
rouge-score==0.1.2
# via lm-eval
rpds-py==0.30.0
......@@ -572,6 +572,8 @@ setuptools==80.10.2
# modelscope
# pytablewriter
# torch
shellingham==1.5.4
# via typer
six==1.17.0
# via
# -c requirements/common.txt
......@@ -665,7 +667,7 @@ tqdm==4.67.3
# pqdm
# sentence-transformers
# transformers
transformers==4.57.6
transformers==5.5.3
# via
# -c requirements/common.txt
# sentence-transformers
......@@ -676,6 +678,10 @@ typepy==1.3.4
# dataproperty
# pytablewriter
# tabledata
typer==0.24.1
# via
# huggingface-hub
# transformers
typing-extensions==4.15.0
# via
# -c requirements/common.txt
......
......@@ -410,6 +410,15 @@ class HfRunner:
model_name,
trust_remote_code=trust_remote_code,
)
# HF runner should use the HF config so that it's consistent with the HF model
if self.config.__module__.startswith("vllm.transformers_utils.configs"):
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
del CONFIG_MAPPING._extra_content[self.config.model_type]
self.config = AutoConfig.from_pretrained(
model_name,
trust_remote_code=trust_remote_code,
)
self.device = self.get_default_device()
self.dtype = dtype = _get_and_verify_dtype(
self.model_name,
......
......@@ -3,6 +3,7 @@
import tempfile
from collections import OrderedDict
from importlib import reload
from unittest.mock import MagicMock
import pytest
......@@ -47,6 +48,11 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
if current_platform.is_cuda():
monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
import vllm.lora.layers.base_linear
if not hasattr(vllm.lora.layers.base_linear, "lora_linear_async"):
# Reload the module to ensure the environment variable takes effect.
reload(vllm.lora.layers.base_linear)
yield
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from importlib.metadata import version
import pytest
from packaging.version import Version
import vllm
from vllm.assets.image import ImageAsset
......@@ -10,6 +13,14 @@ from vllm.platforms import current_platform
from ..utils import multi_gpu_test
# Module-wide marker: skip every test in this file when the installed
# transformers release is v5.0 or newer.
pytestmark = pytest.mark.skipif(
    Version(version("transformers")) >= Version("5.0"),
    reason=(
        "MiniCPMV custom processor uses tokenizer.im_start_id which is not "
        "available on TokenizersBackend in transformers v5.0+"
    ),
)
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
PROMPT_TEMPLATE = (
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import tempfile
import huggingface_hub.constants
......@@ -10,26 +9,10 @@ from huggingface_hub.utils import LocalEntryNotFoundError
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf,
enable_hf_transfer,
maybe_remap_kv_scale_name,
)
def test_hf_transfer_auto_activation():
    """Check that the hub's hf_transfer flag matches package availability.

    After ``enable_hf_transfer()`` runs, the flag
    ``huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER`` must equal
    whether the ``hf_transfer`` package can be imported.
    """
    # A pre-set HF_HUB_ENABLE_HF_TRANSFER means auto activation cannot be
    # observed, so the test is skipped.
    if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
        pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")

    enable_hf_transfer()

    # The expected flag value mirrors whether hf_transfer is importable.
    try:
        import hf_transfer  # type: ignore # noqa
    except ImportError:
        expected_flag = False
    else:
        expected_flag = True

    assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == expected_flag
def test_download_weights_from_hf():
with tempfile.TemporaryDirectory() as tmpdir:
# assert LocalEntryNotFoundError error is thrown
......@@ -178,5 +161,4 @@ class TestMaybeRemapKvScaleName:
if __name__ == "__main__":
test_hf_transfer_auto_activation()
test_download_weights_from_hf()
......@@ -143,6 +143,11 @@ def test_models(
# in parts of the operators
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
if current_platform.is_cpu() and model == "TitanML/tiny-mixtral":
# This untrained model is sensitive to the rounding error
# Fuse ops to reduce bfloat16 rounding
monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment