Commit fc67613a authored by zhuwenwen
Browse files

Merge tag 'v0.19.1' into v0.19.0

parents 31aec25b b1388b1f
......@@ -4,7 +4,7 @@ absl-py==2.1.0
# via
# rouge-score
# tensorboard
accelerate==1.0.1
accelerate==1.13.0
# via peft
aenum==3.1.16
# via lightly
......@@ -240,7 +240,6 @@ filelock==3.16.1
# huggingface-hub
# ray
# torch
# transformers
# virtualenv
fiona==1.10.1
# via torchgeo
......@@ -323,7 +322,7 @@ h5py==3.13.0
# via terratorch
harfile==0.3.0
# via schemathesis
hf-xet==1.1.7
hf-xet==1.4.3
# via huggingface-hub
hiredis==3.0.0
# via tensorizer
......@@ -337,9 +336,10 @@ httpx==0.27.2
# via
# -r requirements/test.in
# diffusers
# huggingface-hub
# perceptron
# schemathesis
huggingface-hub==0.36.2
huggingface-hub==1.10.2
# via
# accelerate
# datasets
......@@ -740,7 +740,7 @@ pathvalidate==3.2.1
# via pytablewriter
patsy==1.0.1
# via statsmodels
peft==0.16.0
peft==0.18.1
# via -r requirements/test.in
perceptron==0.1.4
# via -r requirements/test.in
......@@ -963,7 +963,7 @@ referencing==0.35.1
# via
# jsonschema
# jsonschema-specifications
regex==2024.9.11
regex==2026.2.28
# via
# diffusers
# nltk
......@@ -982,7 +982,6 @@ requests==2.32.3
# google-api-core
# google-cloud-storage
# gpt-oss
# huggingface-hub
# lightly
# lm-eval
# mistral-common
......@@ -995,7 +994,6 @@ requests==2.32.3
# starlette-testclient
# tacoreader
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test.in
......@@ -1193,7 +1191,7 @@ timm==1.0.17
# segmentation-models-pytorch
# terratorch
# torchgeo
tokenizers==0.22.0
tokenizers==0.22.2
# via
# -r requirements/test.in
# transformers
......@@ -1269,7 +1267,7 @@ tqdm==4.67.3
# tacoreader
# terratorch
# transformers
transformers==4.57.5
transformers==5.5.3
# via
# -r requirements/test.in
# genai-perf
......@@ -1290,7 +1288,9 @@ typepy==1.3.2
typer==0.15.2
# via
# fastsafetensors
# huggingface-hub
# perceptron
# transformers
types-python-dateutil==2.9.0.20241206
# via arrow
typeshed-client==2.8.2
......
# This file was autogenerated by uv via the following command:
# uv pip compile requirements/test/xpu.in -c requirements/xpu.txt -o requirements/test/xpu.txt --index-strategy unsafe-best-match --torch-backend xpu --python-platform x86_64-manylinux_2_39 --python-version 3.12
absl-py==2.4.0
# via
# -r requirements/test/xpu.in
# rouge-score
accelerate==1.13.0
# via -r requirements/test/xpu.in
aiohappyeyeballs==2.6.1
# via aiohttp
aiohttp==3.13.4
# via
# -c requirements/common.txt
# fsspec
# gpt-oss
# lm-eval
aiosignal==1.4.0
# via aiohttp
albumentations==1.4.6
# via -r requirements/test/xpu.in
annotated-doc==0.0.4
# via
# fastapi
# typer
annotated-types==0.7.0
# via pydantic
anyio==4.13.0
# via
# httpx
# starlette
arctic-inference==0.1.1
# via -r requirements/test/xpu.in
attrs==26.1.0
# via
# aiohttp
# jsonlines
# jsonschema
# referencing
audioread==3.0.1
# via
# -r requirements/test/xpu.in
# librosa
blobfile==3.0.0
# via -r requirements/test/xpu.in
bm25s==0.2.13
# via
# -r requirements/test/xpu.in
# mteb
bounded-pool-executor==0.0.3
# via pqdm
certifi==2026.2.25
# via
# httpcore
# httpx
# requests
cffi==2.0.0
# via soundfile
chardet==5.2.0
# via mbstrdecoder
charset-normalizer==3.4.6
# via requests
chz==0.4.0
# via gpt-oss
click==8.3.1
# via
# jiwer
# nltk
# schemathesis
# typer
# uvicorn
colorama==0.4.6
# via sacrebleu
coverage==7.13.5
# via pytest-cov
dataproperty==1.1.0
# via
# pytablewriter
# tabledata
datasets==4.8.4
# via
# evaluate
# lm-eval
# mteb
decorator==5.2.1
# via librosa
dill==0.4.1
# via
# datasets
# evaluate
# lm-eval
# multiprocess
docker==7.1.0
# via gpt-oss
docopt==0.6.2
# via num2words
dpcpp-cpp-rt==2025.3.1
# via
# onemkl-sycl-blas
# onemkl-sycl-dft
# onemkl-sycl-lapack
# onemkl-sycl-rng
# onemkl-sycl-sparse
# torch
evaluate==0.4.6
# via lm-eval
fastapi==0.135.2
# via
# -c requirements/common.txt
# gpt-oss
filelock==3.25.2
# via
# -c requirements/common.txt
# blobfile
# datasets
# huggingface-hub
# modelscope
# torch
frozenlist==1.8.0
# via
# aiohttp
# aiosignal
fsspec==2026.2.0
# via
# datasets
# evaluate
# huggingface-hub
# torch
gpt-oss==0.0.8
# via -r requirements/test/xpu.in
graphql-core==3.2.8
# via hypothesis-graphql
h11==0.16.0
# via
# httpcore
# uvicorn
harfile==0.4.0
# via schemathesis
hf-xet==1.4.3
# via huggingface-hub
html2text==2025.4.15
# via gpt-oss
httpcore==1.0.9
# via httpx
httpx==0.28.1
# via
# datasets
# huggingface-hub
# schemathesis
huggingface-hub==1.10.2
# via
# accelerate
# datasets
# evaluate
# sentence-transformers
# timm
# tokenizers
# transformers
hypothesis==6.151.10
# via
# hypothesis-graphql
# hypothesis-jsonschema
# schemathesis
hypothesis-graphql==0.12.0
# via schemathesis
hypothesis-jsonschema==0.23.1
# via schemathesis
idna==3.11
# via
# anyio
# httpx
# requests
# yarl
imageio==2.37.3
# via scikit-image
impi-rt==2021.17.0
# via
# oneccl
# torch
iniconfig==2.3.0
# via pytest
intel-cmplr-lib-rt==2025.3.1
# via
# intel-sycl-rt
# torch
intel-cmplr-lib-ur==2025.3.1
# via
# intel-openmp
# intel-sycl-rt
# torch
intel-cmplr-lic-rt==2025.3.1
# via
# intel-opencl-rt
# intel-sycl-rt
# torch
intel-opencl-rt==2025.3.1
# via
# dpcpp-cpp-rt
# onemkl-sycl-blas
# onemkl-sycl-dft
# onemkl-sycl-lapack
# onemkl-sycl-rng
# onemkl-sycl-sparse
# torch
intel-openmp==2025.3.1
# via
# dpcpp-cpp-rt
# mkl
# torch
intel-pti==0.15.0
# via torch
intel-sycl-rt==2025.3.1
# via
# dpcpp-cpp-rt
# oneccl
# torch
jinja2==3.1.6
# via
# -c requirements/xpu.txt
# lm-eval
# torch
jiwer==4.0.0
# via -r requirements/test/xpu.in
joblib==1.5.3
# via
# librosa
# nltk
# scikit-learn
jsonlines==4.0.0
# via lm-eval
jsonschema==4.26.0
# via
# hypothesis-jsonschema
# mistral-common
# schemathesis
jsonschema-rs==0.45.0
# via schemathesis
jsonschema-specifications==2025.9.1
# via jsonschema
junit-xml==1.9
# via schemathesis
lazy-loader==0.5
# via
# librosa
# scikit-image
librosa==0.10.2.post1
# via -r requirements/test/xpu.in
llvmlite==0.44.0
# via numba
lm-eval==0.4.11
# via -r requirements/test/xpu.in
lxml==6.0.2
# via
# blobfile
# gpt-oss
# sacrebleu
markdown-it-py==4.0.0
# via rich
markupsafe==3.0.3
# via
# jinja2
# werkzeug
mbstrdecoder==1.1.4
# via
# dataproperty
# pytablewriter
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.11.0
# via
# -c requirements/common.txt
# -r requirements/test/xpu.in
mkl==2025.3.0
# via
# onemkl-sycl-blas
# onemkl-sycl-dft
# onemkl-sycl-lapack
# onemkl-sycl-rng
# onemkl-sycl-sparse
# torch
modelscope==1.35.3
# via -r requirements/test/xpu.in
more-itertools==10.8.0
# via lm-eval
mpmath==1.3.0
# via sympy
msgpack==1.1.2
# via librosa
mteb==2.12.7
# via -r requirements/test/xpu.in
multidict==6.7.1
# via
# aiohttp
# yarl
multiprocess==0.70.19
# via
# datasets
# evaluate
networkx==3.6.1
# via
# scikit-image
# torch
nltk==3.9.4
# via rouge-score
num2words==0.5.14
# via -r requirements/test/xpu.in
numba==0.61.2
# via
# -c requirements/xpu.txt
# librosa
numpy==2.2.6
# via
# accelerate
# albumentations
# bm25s
# datasets
# evaluate
# imageio
# librosa
# lm-eval
# mistral-common
# mteb
# numba
# opencv-python-headless
# pandas
# pytrec-eval-terrier
# rouge-score
# sacrebleu
# scikit-image
# scikit-learn
# scipy
# sentence-transformers
# soundfile
# soxr
# tifffile
# torchvision
# transformers
oneccl==2021.17.1
# via
# oneccl-devel
# torch
oneccl-devel==2021.17.1
# via torch
onemkl-license==2025.3.0
# via
# mkl
# torch
onemkl-sycl-blas==2025.3.0
# via
# onemkl-sycl-lapack
# onemkl-sycl-sparse
# torch
onemkl-sycl-dft==2025.3.0
# via torch
onemkl-sycl-lapack==2025.3.0
# via torch
onemkl-sycl-rng==2025.3.0
# via torch
onemkl-sycl-sparse==2025.3.0
# via torch
openai-harmony==0.0.8
# via
# -c requirements/common.txt
# gpt-oss
opencv-python-headless==4.13.0.92
# via
# -c requirements/common.txt
# albumentations
# mistral-common
packaging==26.0
# via
# -c requirements/xpu.txt
# accelerate
# datasets
# evaluate
# huggingface-hub
# lazy-loader
# modelscope
# pooch
# pytest
# pytest-rerunfailures
# scikit-image
# transformers
# typepy
pandas==3.0.1
# via
# datasets
# evaluate
pathvalidate==3.3.1
# via pytablewriter
pillow==12.1.1
# via
# imageio
# mistral-common
# scikit-image
# torchvision
platformdirs==4.9.4
# via pooch
pluggy==1.6.0
# via
# pytest
# pytest-cov
polars==1.39.3
# via mteb
polars-runtime-32==1.39.3
# via polars
pooch==1.8.2
# via
# -r requirements/test/xpu.in
# librosa
portalocker==3.2.0
# via sacrebleu
pqdm==0.2.0
# via -r requirements/test/xpu.in
propcache==0.4.1
# via
# aiohttp
# yarl
psutil==7.2.2
# via accelerate
py==1.11.0
# via pytest-forked
pyarrow==23.0.1
# via datasets
pycountry==26.2.16
# via pydantic-extra-types
pycparser==3.0
# via cffi
pycryptodomex==3.23.0
# via blobfile
pydantic==2.12.5
# via
# -c requirements/common.txt
# albumentations
# fastapi
# gpt-oss
# mistral-common
# mteb
# openai-harmony
# pydantic-extra-types
pydantic-core==2.41.5
# via pydantic
pydantic-extra-types==2.11.1
# via mistral-common
pyelftools==0.32
# via triton-xpu
pygments==2.20.0
# via
# pytest
# rich
pyrate-limiter==4.1.0
# via schemathesis
pystemmer==3.0.0
# via
# -r requirements/test/xpu.in
# mteb
pytablewriter==1.2.1
# via lm-eval
pytest==9.0.2
# via
# -r requirements/test/xpu.in
# pytest-asyncio
# pytest-cov
# pytest-forked
# pytest-rerunfailures
# pytest-shard
# pytest-timeout
# schemathesis
pytest-asyncio==1.3.0
# via -r requirements/test/xpu.in
pytest-cov==6.3.0
# via -r requirements/test/xpu.in
pytest-forked==1.6.0
# via -r requirements/test/xpu.in
pytest-rerunfailures==14.0
# via -r requirements/test/xpu.in
pytest-shard==0.1.2
# via -r requirements/test/xpu.in
pytest-timeout==2.3.1
# via -r requirements/test/xpu.in
python-dateutil==2.9.0.post0
# via
# pandas
# typepy
pytrec-eval-terrier==0.5.10
# via mteb
pytz==2026.1.post1
# via typepy
pyyaml==6.0.3
# via
# accelerate
# albumentations
# datasets
# huggingface-hub
# schemathesis
# timm
# transformers
rapidfuzz==3.12.1
# via
# -r requirements/test/xpu.in
# jiwer
referencing==0.37.0
# via
# jsonschema
# jsonschema-specifications
regex==2026.3.32
# via
# nltk
# sacrebleu
# tiktoken
# transformers
requests==2.33.1
# via
# -c requirements/common.txt
# datasets
# docker
# evaluate
# gpt-oss
# lm-eval
# mistral-common
# modelscope
# mteb
# pooch
# schemathesis
# starlette-testclient
# tiktoken
rich==14.3.3
# via
# mteb
# schemathesis
# typer
rouge-score==0.1.2
# via lm-eval
rpds-py==0.30.0
# via
# jsonschema
# referencing
sacrebleu==2.6.0
# via lm-eval
safetensors==0.7.0
# via
# accelerate
# timm
# transformers
schemathesis==4.14.2
# via -r requirements/test/xpu.in
scikit-image==0.26.0
# via albumentations
scikit-learn==1.8.0
# via
# albumentations
# librosa
# lm-eval
# mteb
# sentence-transformers
scipy==1.17.1
# via
# albumentations
# bm25s
# librosa
# mteb
# pytrec-eval-terrier
# scikit-image
# scikit-learn
# sentence-transformers
sentence-transformers==5.3.0
# via mteb
setuptools==80.10.2
# via
# -c requirements/common.txt
# -c requirements/xpu.txt
# modelscope
# pytablewriter
# torch
shellingham==1.5.4
# via typer
six==1.17.0
# via
# -c requirements/common.txt
# junit-xml
# python-dateutil
# rouge-score
sortedcontainers==2.4.0
# via hypothesis
soundfile==0.13.1
# via
# -r requirements/test/xpu.in
# librosa
# mistral-common
soxr==0.5.0.post1
# via
# -r requirements/test/xpu.in
# librosa
# mistral-common
sqlitedict==2.1.0
# via lm-eval
starlette==1.0.0
# via
# fastapi
# starlette-testclient
starlette-testclient==0.4.1
# via schemathesis
structlog==25.5.0
# via gpt-oss
sympy==1.14.0
# via torch
tabledata==1.3.4
# via pytablewriter
tabulate==0.10.0
# via sacrebleu
tbb==2022.3.0
# via
# intel-opencl-rt
# mkl
# torch
tblib==3.1.0
# via -r requirements/test/xpu.in
tcmlib==1.4.1
# via
# tbb
# torch
# umf
tcolorpy==0.1.7
# via pytablewriter
tenacity==9.1.4
# via
# gpt-oss
# lm-eval
# schemathesis
termcolor==3.3.0
# via gpt-oss
threadpoolctl==3.6.0
# via scikit-learn
tifffile==2026.3.3
# via scikit-image
tiktoken==0.12.0
# via
# -c requirements/common.txt
# gpt-oss
# lm-eval
# mistral-common
timm==1.0.17
# via -r requirements/test/xpu.in
tokenizers==0.22.2
# via
# -c requirements/common.txt
# transformers
torch==2.10.0+xpu
# via
# -c requirements/xpu.txt
# accelerate
# mteb
# sentence-transformers
# timm
# torchvision
torchvision==0.25.0+xpu
# via timm
tqdm==4.67.3
# via
# datasets
# evaluate
# huggingface-hub
# lm-eval
# modelscope
# mteb
# nltk
# pqdm
# sentence-transformers
# transformers
transformers==5.5.3
# via
# -c requirements/common.txt
# sentence-transformers
triton-xpu==3.6.0
# via torch
typepy==1.3.4
# via
# dataproperty
# pytablewriter
# tabledata
typer==0.24.1
# via
# huggingface-hub
# transformers
typing-extensions==4.15.0
# via
# -c requirements/common.txt
# aiosignal
# albumentations
# anyio
# chz
# fastapi
# huggingface-hub
# librosa
# lm-eval
# mistral-common
# mteb
# pqdm
# pydantic
# pydantic-core
# pydantic-extra-types
# pytest-asyncio
# referencing
# schemathesis
# sentence-transformers
# starlette
# torch
# typing-inspection
typing-inspection==0.4.2
# via
# fastapi
# pydantic
umf==1.0.2
# via
# intel-cmplr-lib-ur
# torch
urllib3==2.6.3
# via
# blobfile
# docker
# modelscope
# requests
uvicorn==0.42.0
# via gpt-oss
werkzeug==3.1.7
# via schemathesis
word2number==1.1
# via lm-eval
xxhash==3.6.0
# via
# datasets
# evaluate
yarl==1.23.0
# via aiohttp
zstandard==0.25.0
# via lm-eval
......@@ -9,6 +9,8 @@ pytest-shard
# --- Core Tools & Bindings ---
absl-py
arctic-inference
lm_eval[api]
modelscope
# --- Audio Processing ---
librosa
......
......@@ -409,6 +409,15 @@ class HfRunner:
model_name,
trust_remote_code=trust_remote_code,
)
# HF runner should use the HF config so that it's consistent with the HF model
if self.config.__module__.startswith("vllm.transformers_utils.configs"):
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
del CONFIG_MAPPING._extra_content[self.config.model_type]
self.config = AutoConfig.from_pretrained(
model_name,
trust_remote_code=trust_remote_code,
)
self.device = self.get_default_device()
self.dtype = dtype = _get_and_verify_dtype(
self.model_name,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for MiniMax QK RMS-norm: NCCL reference vs Lamport fused kernel."""
import pytest
import torch
import torch.nn as nn
from torch.multiprocessing import spawn
from tests.kernels.utils import opcheck
from tests.utils import ensure_current_vllm_config, init_test_distributed_environment
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_port
from vllm.utils.torch_utils import set_random_seed
@ensure_current_vllm_config()
def _worker_forward_qk(
    local_rank,
    world_size,
    port,
    num_tokens,
    hidden_q_full,
    hidden_k_full,
    dtype,
    seed,
    eps,
):
    """Per-rank worker: compare NCCL allreduce path vs Lamport fused kernel.

    Builds a reference result with ``MiniMaxText01RMSNormTP.forward_qk`` and
    checks that ``torch.ops._C.minimax_allreduce_rms_qk`` yields matching Q
    and K outputs for the same per-rank input.

    Args:
        local_rank: Rank of this process; also selects the CUDA device.
        world_size: Number of participating ranks.
        port: Port handed to ``init_test_distributed_environment``.
        num_tokens: Number of rows in the random QKV input.
        hidden_q_full: Full Q hidden size; each rank keeps ``// world_size``.
        hidden_k_full: Full K hidden size; each rank keeps ``// world_size``.
        dtype: Torch dtype used for weights and inputs.
        seed: Base random seed.
        eps: Epsilon forwarded to both implementations.

    Returns:
        None. Returns early, without comparing anything, when the fused op is
        not registered on ``torch.ops._C``.

    Raises:
        AssertionError: If fused and reference outputs differ by more than
            ``atol=3e-2`` / ``rtol=3e-2``.
    """
    if not hasattr(torch.ops._C, "minimax_allreduce_rms_qk"):
        # The fused op is not available in this build: nothing to compare.
        cleanup_dist_env_and_memory()
        return

    device = torch.device(f"cuda:{local_rank}")
    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(
        world_size, 1, local_rank, port, local_rank=local_rank
    )

    # Per-rank shard widths.
    hq = hidden_q_full // world_size
    hk = hidden_k_full // world_size

    q_norm = MiniMaxText01RMSNormTP(hidden_q_full, eps=eps).cuda()
    k_norm = MiniMaxText01RMSNormTP(hidden_k_full, eps=eps).cuda()

    # The weight seed does not depend on the rank: every rank draws the same
    # full-size weights and keeps only its own contiguous slice.
    set_random_seed(seed)
    qw = torch.randn(hidden_q_full, dtype=dtype, device="cuda")
    kw = torch.randn(hidden_k_full, dtype=dtype, device="cuda")
    q_norm.weight = nn.Parameter(qw[local_rank * hq : (local_rank + 1) * hq])
    k_norm.weight = nn.Parameter(kw[local_rank * hk : (local_rank + 1) * hk])

    # The input seed is offset by the rank, so each rank gets different data.
    torch.manual_seed(seed + 1000 + local_rank)
    qkv = torch.randn(num_tokens, hq + hk + hk, dtype=dtype, device="cuda")

    # Reference path. The V slice is not normalised, so it is discarded.
    q_ref, k_ref, _ = qkv.clone().split([hq, hk, hk], dim=-1)
    ref_q, ref_k = MiniMaxText01RMSNormTP.forward_qk(q_norm, k_norm, q_ref, k_ref)

    # Set up Lamport workspace.
    from vllm.distributed.parallel_state import get_tp_group
    from vllm.model_executor.layers.mamba.lamport_workspace import (
        get_allreduce_workspace,
    )

    workspace = get_allreduce_workspace(
        rank=local_rank,
        world_size=world_size,
        max_tokens=num_tokens,
        process_group=get_tp_group().cpu_group,
    )

    # Each call receives its own copy of qkv.
    # NOTE(review): presumably because the op may write in place - confirm.
    opcheck(
        torch.ops._C.minimax_allreduce_rms_qk,
        (
            qkv.clone(),
            q_norm.weight,
            k_norm.weight,
            workspace,
            hq,
            hk,
            local_rank,
            world_size,
            eps,
        ),
    )
    fused_q, fused_k = torch.ops._C.minimax_allreduce_rms_qk(
        qkv.clone(),
        q_norm.weight,
        k_norm.weight,
        workspace,
        hq,
        hk,
        local_rank,
        world_size,
        eps,
    )

    torch.accelerator.synchronize()
    torch.testing.assert_close(
        fused_q,
        ref_q,
        atol=3e-2,
        rtol=3e-2,
    )
    torch.testing.assert_close(fused_k, ref_k, atol=3e-2, rtol=3e-2)
    cleanup_dist_env_and_memory()
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="CUDA required",
)
@pytest.mark.parametrize("world_size", [2, 4, 8])
@pytest.mark.parametrize("num_tokens", [1, 128, 333])
@pytest.mark.parametrize(
    "hidden_dims",
    [(6144, 1024)],
)
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("eps", [1e-6])
@pytest.mark.parametrize("seed", [42])
def test_minimax_reduce_rms_qk(
    world_size,
    num_tokens,
    hidden_dims,
    dtype,
    eps,
    seed,
):
    """Run ``_worker_forward_qk`` on ``world_size`` spawned processes.

    Skipped when fewer devices than ``world_size`` are reported by the
    current platform. ``hidden_dims`` is a ``(q, k)`` pair of full hidden
    sizes.
    """
    num_gpus = current_platform.device_count()
    if world_size > num_gpus:
        pytest.skip(f"Need >= {world_size} GPUs, have {num_gpus}")

    q_width, k_width = hidden_dims

    # spawn() passes the process index as the first positional argument,
    # which the worker receives as ``local_rank``; the rest follow in order.
    worker_args = (
        world_size,
        str(get_open_port()),
        num_tokens,
        q_width,
        k_width,
        dtype,
        seed,
        eps,
    )
    spawn(_worker_forward_qk, args=worker_args, nprocs=world_size, join=True)
......@@ -3,6 +3,7 @@
import tempfile
from collections import OrderedDict
from importlib import reload
from unittest.mock import MagicMock
import pytest
......@@ -43,6 +44,18 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup_dist_env_and_memory(shutdown_ray=True)
# Fixture that turns on LoRA dual-stream mode for a test when the current
# platform is CUDA: it sets VLLM_LORA_ENABLE_DUAL_STREAM=1 through monkeypatch
# and reloads vllm.lora.layers.base_linear if that module does not yet expose
# `lora_linear_async`. It yields no value, and nothing runs after the yield,
# so the module reload itself is not reverted by this fixture.
# NOTE(review): indentation was lost in this view; the import and the reload
# presumably sit inside the `is_cuda()` branch - confirm against the real file.
@pytest.fixture
def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
if current_platform.is_cuda():
monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
import vllm.lora.layers.base_linear
if not hasattr(vllm.lora.layers.base_linear, "lora_linear_async"):
# Reload the module to ensure the environment variable takes effect.
reload(vllm.lora.layers.base_linear)
yield
@pytest.fixture
def dist_init():
from tests.utils import ensure_current_vllm_config
......
......@@ -5,7 +5,9 @@ import pytest
from vllm.lora.lora_model import LoRAModel
from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.utils import parse_fine_tuned_lora_name
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM
from vllm.model_executor.models.utils import WeightsMapper
lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
......@@ -128,3 +130,24 @@ def test_lora_weights_mapping(baichuan_lora_files):
for name in lora_model.loras:
assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."])
assert ".baichuan_layers." in name
def test_gemma4_lora_weights_mapping():
    """A Gemma4 dense-MLP ``lora_A`` key maps to its vLLM module name.

    The ``base_model.model.`` wrapper and the ``language_model`` segment are
    expected to be dropped, and the returned flag is expected to be ``True``.
    """
    lora_key = (
        "base_model.model.model.language_model.layers.9.mlp.down_proj.lora_A.weight"
    )
    expected = ("model.layers.9.mlp.down_proj", True)

    parsed = parse_fine_tuned_lora_name(
        lora_key, Gemma4ForCausalLM.hf_to_vllm_mapper
    )

    assert parsed == expected
def test_gemma4_moe_lora_weights_mapping():
    """A Gemma4 MoE ``lora_B`` key maps to its vLLM module name.

    The ``experts`` segment is expected to be dropped from the module path,
    and the returned flag is expected to be ``False``.
    """
    lora_key = (
        "base_model.model.model.language_model.layers.9.moe.experts."
        "gate_up_proj.lora_B.weight"
    )
    expected = ("model.layers.9.moe.gate_up_proj", False)

    parsed = parse_fine_tuned_lora_name(
        lora_key, Gemma4ForCausalLM.hf_to_vllm_mapper
    )

    assert parsed == expected
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from importlib.metadata import version
import pytest
from packaging.version import Version
import vllm
from vllm.assets.image import ImageAsset
......@@ -10,6 +13,14 @@ from vllm.platforms import current_platform
from ..utils import multi_gpu_test
# Module-wide skip: every test in this file is skipped when the installed
# transformers distribution is version 5.0 or newer.
pytestmark = pytest.mark.skipif(
    Version(version("transformers")) >= Version("5.0"),
    reason=(
        "MiniCPMV custom processor uses tokenizer.im_start_id which is not "
        "available on TokenizersBackend in transformers v5.0+"
    ),
)
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
PROMPT_TEMPLATE = (
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import tempfile
import huggingface_hub.constants
......@@ -10,26 +9,10 @@ from huggingface_hub.utils import LocalEntryNotFoundError
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf,
enable_hf_transfer,
maybe_remap_kv_scale_name,
)
def test_hf_transfer_auto_activation():
    """``enable_hf_transfer`` turns the hub flag on only if hf_transfer imports.

    Skipped when ``HF_HUB_ENABLE_HF_TRANSFER`` is already present in the
    environment, because auto-activation cannot be observed in that case.
    """
    already_configured = "HF_HUB_ENABLE_HF_TRANSFER" in os.environ
    if already_configured:
        pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")

    enable_hf_transfer()

    # The expected flag value mirrors whether the optional package imports.
    try:
        import hf_transfer  # type: ignore # noqa
    except ImportError:
        expected_active = False
    else:
        expected_active = True

    assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == expected_active
def test_download_weights_from_hf():
with tempfile.TemporaryDirectory() as tmpdir:
# assert LocalEntryNotFoundError error is thrown
......@@ -178,5 +161,4 @@ class TestMaybeRemapKvScaleName:
if __name__ == "__main__":
test_hf_transfer_auto_activation()
test_download_weights_from_hf()
......@@ -143,6 +143,11 @@ def test_models(
# in parts of the operators
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
if current_platform.is_cpu() and model == "TitanML/tiny-mixtral":
# This untrained model is sensitive to the rounding error
# Fuse ops to reduce bfloat16 rounding
monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
......
......@@ -109,6 +109,14 @@ def _load_hf_model(model_name: str, hf_spec: dict, device: torch.device):
**extra,
).to(device)
model.eval()
# Transformers 5.0 weight materialization can clear non-persistent
# buffers (e.g. rotary inv_freq) that were registered with
# persistent=False. Re-compute them so the model produces valid output.
for mod in model.modules():
if hasattr(mod, "_compute_inv_freq") and hasattr(mod, "inv_freq"):
mod.inv_freq = mod._compute_inv_freq(device=device)
return model
......
......@@ -8,7 +8,13 @@ import pytest
from ...utils import EmbedModelInfo
MODELS = [
EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1",
# Fixme:
# Update nomic-embed code to support the latest
# HF version and remove revision set.
revision="720244025c1a7e15661a174c63cce63c8218e52b",
),
# EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
# EmbedModelInfo("nomic-ai/CodeRankEmbed"),
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
......@@ -24,7 +30,10 @@ max_model_len = int(original_max_position_embeddings * factor)
@pytest.mark.parametrize("model_info", MODELS)
def test_default(model_info, vllm_runner):
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=None,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
......@@ -39,7 +48,10 @@ def test_default(model_info, vllm_runner):
def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512
with vllm_runner(
model_info.name, runner="pooling", max_model_len=256
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=256,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256
......@@ -49,11 +61,19 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
with pytest.raises(ValueError):
with vllm_runner(model_info.name, runner="pooling", max_model_len=1024):
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=1024,
):
pass
else:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=1024
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=1024,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024
......@@ -63,7 +83,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
def test_set_max_model_len_illegal(model_info, vllm_runner):
# set max_model_len > 2048
with pytest.raises(ValueError):
with vllm_runner(model_info.name, runner="pooling", max_model_len=4096):
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=4096,
):
pass
# set max_model_len > 2048 by hf_overrides
......@@ -71,6 +96,7 @@ def test_set_max_model_len_illegal(model_info, vllm_runner):
with pytest.raises(ValueError):
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
......@@ -91,7 +117,11 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
}
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None, hf_overrides=hf_overrides
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
):
pass
......@@ -110,6 +140,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
with pytest.raises(ValueError):
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=max_model_len + 1,
hf_overrides=hf_overrides,
......@@ -129,6 +160,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
with pytest.raises(ValueError):
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
......
......@@ -151,6 +151,7 @@ def mteb_test_embed_models(
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=model_info.max_model_len,
**vllm_extra_kwargs,
......@@ -201,6 +202,7 @@ def mteb_test_embed_models(
if model_info.mteb_score is None:
with hf_runner(
model_info.name,
revision=model_info.revision,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
......
......@@ -241,6 +241,7 @@ def mteb_test_rerank_models(
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=None,
max_num_seqs=8,
......@@ -286,7 +287,9 @@ def mteb_test_rerank_models(
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
with hf_runner(model_info.name, dtype=model_info.hf_dtype) as hf_model:
with hf_runner(
model_info.name, revision=model_info.revision, dtype=model_info.hf_dtype
) as hf_model:
hf_model.chat_template = chat_template
st_main_score = run_mteb_rerank(
hf_model,
......
......@@ -69,7 +69,10 @@ MODELS = [
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
# Skip: model's custom tokenizer on HF hub is incompatible with
# transformers v5 (sets attrs before super().__init__, triggering
# AttributeError on 'verbose' in __getattr__).
enable_test=False,
),
]
......
......@@ -72,7 +72,8 @@ MODELS = [
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
# Skip: numerical regression with transformers v5.
enable_test=False,
),
########## ModernBertModel
EmbedModelInfo(
......
......@@ -75,6 +75,10 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(vllm_runner, model_info)
@pytest.mark.skip(
reason="jinaai/jina-embeddings-v3 custom XLMRobertaLoRA model on HF hub "
"is incompatible with transformers v5 (missing all_tied_weights_keys)"
)
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("dimensions", [16, 32])
......
......@@ -12,6 +12,10 @@ MODELS = [
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
# Fixme:
# Update nomic-embed code to support the latest
# HF version and remove revision set.
revision="720244025c1a7e15661a174c63cce63c8218e52b",
mteb_score=0.737568559,
enable_test=True,
seq_pooling_type="MEAN",
......
......@@ -186,7 +186,14 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModel,
hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[
pytest.mark.core_model,
pytest.mark.cpu_model,
# TODO: Remove skip once model has been upstreamed to Transformers
pytest.mark.skip(
reason="Custom model code is not compatible with Transformers v5"
),
],
),
#### Transformers fallback to test
## To reduce test burden, we only test batching arbitrary image size
......@@ -397,14 +404,14 @@ VLM_TEST_SETTINGS = {
"gemma4": VLMTestInfo(
models=["google/gemma-4-E2B-it"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<bos><|turn>user\n{img_prompt}<turn|>\n<|turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "What's the content in the center of the image?",
"cherry_blossom": "What is the season?",
"stop_sign": "<|image|>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<|image|>What is the season?",
}
),
multi_image_prompt="Describe the two images in detail.",
multi_image_prompt="<|image|><|image|>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
......@@ -533,6 +540,12 @@ VLM_TEST_SETTINGS = {
max_model_len=4096,
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[
pytest.mark.skip(
reason="Custom model code tries to access data from meta-tensor"
)
],
),
"intern_vl-video": VLMTestInfo(
models=[
......@@ -545,6 +558,12 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
num_logprobs=10 if current_platform.is_rocm() else 5,
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[
pytest.mark.skip(
reason="Custom model code tries to access data from meta-tensor"
)
],
),
"intern_vl-hf": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"],
......@@ -591,6 +610,8 @@ VLM_TEST_SETTINGS = {
hf_model_kwargs={"device_map": "auto"},
patch_hf_runner=model_utils.isaac_patch_hf_runner,
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[pytest.mark.skip(reason="Custom model imports deleted object")], # noqa: E501
),
"kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"],
......@@ -806,7 +827,12 @@ VLM_TEST_SETTINGS = {
pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
reason="This model is broken in Transformers v4.57.3",
)
),
pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) >= Version("5.0.0"),
reason="Model's custom code uses ROPE_INIT_FUNCTIONS"
"['default'] which was removed in transformers v5",
),
],
),
"phi3v": VLMTestInfo(
......@@ -960,6 +986,12 @@ VLM_TEST_SETTINGS = {
)
for inp in custom_inputs.different_patch_input_cases_internvl()
],
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[
pytest.mark.skip(
reason="Custom model code tries to access data from meta-tensor"
)
],
),
"llava_onevision-multiple-images": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
......
......@@ -103,6 +103,10 @@ def run_test(
)
@pytest.mark.skip(
reason="Model's custom MBart decoder has head count mismatch with "
"transformers v5's GQA-aware cross-attention (8 vs 16 heads)"
)
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment