Commit 459d9b38 authored by Harry Mellor, committed by khluu
Browse files

Update to transformers v5 (#30566)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: khluu <khluu000@gmail.com>
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: khluu <khluu000@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: jiang1.li <jiang1.li@intel.com>
(cherry picked from commit 03f8d3a5)
parent b1568cf4
......@@ -69,7 +69,10 @@ MODELS = [
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
# Skip: model's custom tokenizer on HF hub is incompatible with
# transformers v5 (sets attrs before super().__init__, triggering
# AttributeError on 'verbose' in __getattr__).
enable_test=False,
),
]
......
......@@ -72,7 +72,8 @@ MODELS = [
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
# Skip: numerical regression with transformers v5.
enable_test=False,
),
########## ModernBertModel
EmbedModelInfo(
......
......@@ -75,6 +75,10 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(vllm_runner, model_info)
@pytest.mark.skip(
reason="jinaai/jina-embeddings-v3 custom XLMRobertaLoRA model on HF hub "
"is incompatible with transformers v5 (missing all_tied_weights_keys)"
)
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("dimensions", [16, 32])
......
......@@ -186,7 +186,14 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModel,
hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[
pytest.mark.core_model,
pytest.mark.cpu_model,
# TODO: Remove skip once model has been upstreamed to Transformers
pytest.mark.skip(
reason="Custom model code is not compatible with Transformers v5"
),
],
),
#### Transformers fallback to test
## To reduce test burden, we only test batching arbitrary image size
......@@ -397,14 +404,14 @@ VLM_TEST_SETTINGS = {
"gemma4": VLMTestInfo(
models=["google/gemma-4-E2B-it"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<bos><|turn>user\n{img_prompt}<turn|>\n<|turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "What's the content in the center of the image?",
"cherry_blossom": "What is the season?",
"stop_sign": "<|image|>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<|image|>What is the season?",
}
),
multi_image_prompt="Describe the two images in detail.",
multi_image_prompt="<|image|><|image|>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
......@@ -533,6 +540,12 @@ VLM_TEST_SETTINGS = {
max_model_len=4096,
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[
pytest.mark.skip(
reason="Custom model code tries to access data from meta-tensor"
)
],
),
"intern_vl-video": VLMTestInfo(
models=[
......@@ -545,6 +558,12 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
num_logprobs=10 if current_platform.is_rocm() else 5,
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[
pytest.mark.skip(
reason="Custom model code tries to access data from meta-tensor"
)
],
),
"intern_vl-hf": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"],
......@@ -591,6 +610,8 @@ VLM_TEST_SETTINGS = {
hf_model_kwargs={"device_map": "auto"},
patch_hf_runner=model_utils.isaac_patch_hf_runner,
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[pytest.mark.skip(reason="Custom model imports deleted object")], # noqa: E501
),
"kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"],
......@@ -806,7 +827,12 @@ VLM_TEST_SETTINGS = {
pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
reason="This model is broken in Transformers v4.57.3",
)
),
pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) >= Version("5.0.0"),
reason="Model's custom code uses ROPE_INIT_FUNCTIONS"
"['default'] which was removed in transformers v5",
),
],
),
"phi3v": VLMTestInfo(
......@@ -960,6 +986,12 @@ VLM_TEST_SETTINGS = {
)
for inp in custom_inputs.different_patch_input_cases_internvl()
],
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[
pytest.mark.skip(
reason="Custom model code tries to access data from meta-tensor"
)
],
),
"llava_onevision-multiple-images": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
......
......@@ -103,6 +103,10 @@ def run_test(
)
@pytest.mark.skip(
reason="Model's custom MBart decoder has head count mismatch with "
"transformers v5's GQA-aware cross-attention (8 vs 16 heads)"
)
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from importlib.metadata import version
import pytest
import regex as re
from packaging.version import Version
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm.logprobs import SampleLogprobs
from vllm.multimodal.image import rescale_image_size
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptImageInput,
VllmRunner,
)
from ....utils import multi_gpu_test
from ...utils import check_logprobs_close
# Module-wide marker: every test in this file is skipped when the installed
# transformers version is 5.0 or newer.
# NOTE(review): the condition triggers from v5.0 onwards, while the reason
# text below talks about versions "above v5.4" -- confirm which lower bound
# is actually intended before relaxing or tightening this skip.
pytestmark = pytest.mark.skipif(
Version("5.0") <= Version(version("transformers")),
reason=(
"vllm upgraded transformers above v5.4 where HF model custom code uses siglip2 "
"internals (filter_out_non_signature_kwargs) removed by "
"huggingface/transformers#43514"
),
)
# Model id that both test functions in this module are parametrized with.
MODEL_ID = "microsoft/Phi-4-reasoning-vision-15B"
# Single-image chat prompts, keyed by image asset name and passed through
# IMAGE_ASSETS.prompts(); each prompt contains one <image> placeholder.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<image>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<image>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
# Chat prompt with two <image> placeholders, used by the multi-image inputs.
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<image>\n<image>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
# dtype string handed to both the vLLM runner and the HF runner.
DTYPE = "half"
# Token budget passed to both runners' greedy-logprob generation calls.
MAX_TOKENS = 128
# Value passed as `num_logprobs` to both runners.
NUM_LOGPROBS = 10
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
):
    """Rewrite a vLLM output triple so it can be compared with HF output.

    Args:
        vllm_output: ``(token_ids, text, logprobs)`` as produced by vLLM. The
            token ids are ignored and recomputed from the cleaned text.
        model: Model name or path used to load the tokenizer (with
            ``trust_remote_code=True``).

    Returns:
        ``(hf_token_ids, hf_text, logprobs)`` where ``hf_text`` is the cleaned
        text followed by ``<|end|><|endoftext|>`` and ``logprobs`` is passed
        through unchanged.
    """
    _, text, logprobs = vllm_output

    # Remove every run of image placeholders, then at most one leading space.
    text = re.sub(r"(<image>)+", "", text)
    if text[:1] == " ":
        text = text[1:]

    # Re-tokenize the cleaned text (without the end markers) and drop a
    # leading BOS token if the tokenizer inserted one.
    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    token_ids = tokenizer.encode(text)
    if token_ids and token_ids[0] == tokenizer.bos_token_id:
        token_ids = token_ids[1:]

    return token_ids, text + "<|end|><|endoftext|>", logprobs
def _build_single_image_inputs(
    image_assets,
) -> list[tuple[list[str], PromptImageInput]]:
    """Create every single-image test case in one pass.

    For each group of size factors, and for each (image, prompt) pair, one
    case is produced: the prompt repeated once per factor, alongside the
    image rescaled by each factor.

    Args:
        image_assets: Iterable of assets exposing a ``pil_image`` attribute;
            paired positionally with ``HF_IMAGE_PROMPTS``.

    Returns:
        A list of ``(prompts, images)`` tuples, ordered by factor group first
        and by asset second.
    """
    pil_images = [asset.pil_image for asset in image_assets]
    factor_groups = ([1.0], [0.25, 0.5, 1.0])
    return [
        (
            [prompt] * len(factors),
            [rescale_image_size(pil_image, factor) for factor in factors],
        )
        for factors in factor_groups
        for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS)
    ]
def _build_multi_image_inputs(
    image_assets,
) -> list[tuple[list[str], PromptImageInput]]:
    """Create every multi-image test case in one pass.

    Each group of size factors yields one case: the multi-image prompt
    repeated once per factor, alongside one list per factor holding all
    images rescaled by that factor.

    Args:
        image_assets: Iterable of assets exposing a ``pil_image`` attribute.

    Returns:
        A list of ``(prompts, images_per_prompt)`` tuples, one per factor
        group.
    """
    pil_images = [asset.pil_image for asset in image_assets]
    factor_groups = ([0.5], [0.15, 0.30])
    return [
        (
            [HF_MULTIIMAGE_IMAGE_PROMPT] * len(factors),
            [
                [rescale_image_size(pil_image, factor) for pil_image in pil_images]
                for factor in factors
            ],
        )
        for factors in factor_groups
    ]
def _run_and_compare(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    all_inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    max_model_len: int,
    max_num_seqs: int,
    mm_limit: int,
    gpu_memory_utilization: float,
):
    """Run all cases through vLLM and then HF, and check logprobs agree.

    Each runner is instantiated exactly once and reused for every case.

    Args:
        hf_runner: Context-manager factory for the HF reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        all_inputs: ``(prompts, images)`` cases; iterated once per runner.
        model: Model name or path given to both runners.
        max_model_len: Forwarded to the vLLM runner.
        max_num_seqs: Forwarded to the vLLM runner.
        mm_limit: Per-prompt image limit, passed to vLLM as
            ``limit_mm_per_prompt={"image": mm_limit}``.
        gpu_memory_utilization: Forwarded to the vLLM runner.
    """
    # NOTE: run vLLM first, then HF. vLLM needs a fresh process without
    # cuda initialization; running HF first would break the multiprocessing
    # backend with fork method.
    vllm_results = []
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        gpu_memory_utilization=gpu_memory_utilization,
        dtype=DTYPE,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=2,
        trust_remote_code=True,
        enforce_eager=True,
    ) as vllm_model:
        for prompts, images in all_inputs:
            vllm_results.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    MAX_TOKENS,
                    num_logprobs=NUM_LOGPROBS,
                    images=images,
                )
            )

    hf_results = []
    with hf_runner(
        model,
        dtype=DTYPE,
        model_kwargs={"_attn_implementation": "sdpa", "device_map": "auto"},
        auto_cls=AutoModelForCausalLM,
        trust_remote_code=True,
    ) as hf_model:
        for prompts, images in all_inputs:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    MAX_TOKENS,
                    num_logprobs=NUM_LOGPROBS,
                    images=images,
                )
            )

    # Compare case by case, in the order the cases were supplied.
    for hf_case, vllm_case in zip(hf_results, vllm_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [MODEL_ID])
def test_models(hf_runner, vllm_runner, image_assets, model) -> None:
    """Compare HF and vLLM logprobs on the single-image prompt cases."""
    _run_and_compare(
        hf_runner,
        vllm_runner,
        _build_single_image_inputs(image_assets),
        model,
        max_model_len=8192,
        max_num_seqs=2,
        # One image per prompt in the single-image cases.
        mm_limit=1,
        gpu_memory_utilization=0.80,
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [MODEL_ID])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model) -> None:
    """Compare HF and vLLM logprobs on the multi-image prompt cases."""
    _run_and_compare(
        hf_runner,
        vllm_runner,
        _build_multi_image_inputs(image_assets),
        model,
        max_model_len=8192,
        max_num_seqs=2,
        # The multi-image prompt carries two <image> placeholders.
        mm_limit=2,
        gpu_memory_utilization=0.80,
    )
......@@ -149,6 +149,10 @@ def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
)
@pytest.mark.skip(
reason="VoxtralProcessor.apply_chat_template() in transformers v5 "
"doesn't resolve chat_template=None to the default template"
)
def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets):
"""Compare vLLM Mistral-format output against HF Transformers reference.
......
......@@ -80,6 +80,11 @@ def run_test(
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
# Avoid passing limit_mm_per_prompt twice when vllm_runner_kwargs
# already contains it (e.g. gemma4 sets it via vllm_runner_kwargs).
if "limit_mm_per_prompt" in vllm_runner_kwargs_:
limit_mm_per_prompt = vllm_runner_kwargs_.pop("limit_mm_per_prompt")
with vllm_runner(
model,
max_model_len=max_model_len,
......
......@@ -22,6 +22,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from ....conftest import VllmRunner
pytestmark = pytest.mark.skip(
reason="ColQwen3 model's weight tying is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
MODELS = [
"TomoroAI/tomoro-colqwen3-embed-4b",
"OpenSearch-AI/Ops-Colqwen3-4B",
......
......@@ -11,6 +11,11 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import ImageTestAssets
pytestmark = pytest.mark.skip(
reason="InternVisionModel's custom code is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
......
......@@ -15,6 +15,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from ....conftest import HfRunner, VllmRunner
pytestmark = pytest.mark.skip(
reason="jinaai/jina-reranker-m0 custom code is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
MODELS = ["jinaai/jina-reranker-m0"]
MM_PROCESSOR_KWARGS = {
......
......@@ -17,11 +17,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from importlib.metadata import version
from unittest.mock import MagicMock
import numpy as np
import pytest
import torch
from packaging.version import Version
from transformers import PretrainedConfig
from tests.models.registry import HF_EXAMPLE_MODELS
......@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>"
@pytest.mark.skipif(
Version(version("transformers")) >= Version("5.5"),
reason="transformers v5.5 added native MusicFlamingoForConditionalGeneration "
"with a different get_audio_features signature (requires input_ids)",
)
def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config():
from transformers.models.musicflamingo import (
modeling_musicflamingo as hf_musicflamingo_modeling,
......
......@@ -334,7 +334,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"internlm/internlm2-chat-7b", trust_remote_code=True
),
"InternLM2VEForCausalLM": _HfExamplesInfo(
"OpenGVLab/Mono-InternVL-2B", trust_remote_code=True
"OpenGVLab/Mono-InternVL-2B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"Custom config cannot be loaded with Transformers "
"v5 because `vision_config` is not always set"
)
},
),
"InternLM3ForCausalLM": _HfExamplesInfo(
"internlm/internlm3-8b-instruct", trust_remote_code=True
......@@ -469,6 +477,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Plamo2ForCausalLM": _HfExamplesInfo(
"pfnet/plamo-2-1b",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"hf": (
"Custom model code uses `_tied_weight_keys: list[str]` but "
"Transformers v5 now expects `_tied_weight_keys: dict[str, str]`"
)
},
),
"Plamo3ForCausalLM": _HfExamplesInfo(
"pfnet/plamo-3-nict-2b-base",
......@@ -509,6 +524,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True,
max_model_len=4096,
is_available_online=True,
max_transformers_version="5.3",
transformers_version_reason={
"vllm": (
"vllm upgraded transformers above v5.4 where "
"validate_rope() no longer accepts ignore_keys param"
)
},
),
"SeedOssForCausalLM": _HfExamplesInfo(
"ByteDance-Seed/Seed-OSS-36B-Instruct",
......@@ -544,6 +566,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"xverse/XVERSE-7B-Chat",
tokenizer="meta-llama/Llama-2-7b",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "XVERSE tokenizer is incompatible with transformers v5 "
"(add_prefix_space / prepend_scheme mismatch).",
},
),
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
......@@ -754,10 +781,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only]
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0"
"nvidia/audio-flamingo-3-hf",
min_transformers_version="5.3.0",
transformers_version_reason={
"vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
},
),
"MusicFlamingoForConditionalGeneration": _HfExamplesInfo(
"nvidia/music-flamingo-2601-hf", min_transformers_version="5.3.0"
"nvidia/music-flamingo-2601-hf",
min_transformers_version="5.3.0",
transformers_version_reason={
"vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
},
),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
......@@ -800,9 +835,30 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
"allendou/FireRedASR2-LLM-vllm",
trust_remote_code=True,
max_transformers_version="5.1",
transformers_version_reason={
"vllm": "Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__').",
},
),
"FireRedLIDForConditionalGeneration": _HfExamplesInfo(
"PatchyTisa/FireRedLID-vllm",
trust_remote_code=True,
max_transformers_version="5.1",
transformers_version_reason={
"vllm": "Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__').",
},
),
"FunASRForConditionalGeneration": _HfExamplesInfo(
"allendou/Fun-ASR-Nano-2512-vllm",
trust_remote_code=True,
max_transformers_version="5.1",
transformers_version_reason={
"vllm": "Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__').",
},
),
"FunAudioChatForConditionalGeneration": _HfExamplesInfo(
"funaudiochat", is_available_online=False
......@@ -844,6 +900,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"HCXVisionForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"Custom config cannot be loaded with Transformers "
"v5 because `text_config` is not always set"
)
},
),
"HCXVisionV2ForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
......@@ -863,7 +926,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
),
"InternS1ForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1", trust_remote_code=True
"internlm/Intern-S1",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Custom tokenizer code is not compatible with Transformers v5."
},
),
"InternS1ProForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1-Pro",
......@@ -952,7 +1020,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MiDashengLMModel": _HfExamplesInfo(
"mispeech/midashenglm-7b", trust_remote_code=True
),
"MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True),
"MiniCPMO": _HfExamplesInfo(
"openbmb/MiniCPM-o-2_6",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"hf": "Custom processor code is not compatible with Transformers v5."
},
),
"MiniCPMV": _HfExamplesInfo(
"openbmb/MiniCPM-Llama3-V-2_5",
extras={
......@@ -960,6 +1035,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"4.0": "openbmb/MiniCPM-V-4",
"4.5": "openbmb/MiniCPM-V-4_5",
},
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"MiniCPMVBatchFeature is incompatible with its base class in "
"Transformers v5. See https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/discussions/78"
)
},
trust_remote_code=True,
),
"MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo(
......@@ -996,13 +1078,25 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"nano_vl_dummy", is_available_online=False, trust_remote_code=True
),
"OpenCUAForConditionalGeneration": _HfExamplesInfo(
"xlangai/OpenCUA-7B", trust_remote_code=True
"xlangai/OpenCUA-7B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Tokenizer cannot be initialised in Transformers v5."
},
),
"OpenPanguVLForConditionalGeneration": _HfExamplesInfo(
"FreedomIntelligence/openPangu-VL-7B",
trust_remote_code=True,
max_model_len=4096,
enforce_eager=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"OpenPanguVLVideoProcessorInitKwargs does not specify total=False, "
"making all kwargs required. See https://huggingface.co/FreedomIntelligence/openPangu-VL-7B/discussions/2"
)
},
),
"Ovis": _HfExamplesInfo(
"AIDC-AI/Ovis2-1B",
......@@ -1014,12 +1108,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B",
},
),
"Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
"Ovis2_5": _HfExamplesInfo(
"AIDC-AI/Ovis2.5-2B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Custom processor code is not compatible with Transformers v5."
},
),
"Ovis2_6ForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True
),
"Ovis2_6_MoeForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True
"AIDC-AI/Ovis2.6-30B-A3B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Custom processor code is not compatible with Transformers v5."
},
),
"PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
"PaddlePaddle/PaddleOCR-VL",
......@@ -1038,6 +1144,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
}, # noqa: E501
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"},
),
"Phi4ForCausalLMV": _HfExamplesInfo(
"microsoft/Phi-4-reasoning-vision-15B",
trust_remote_code=True,
max_transformers_version="5.3",
transformers_version_reason={
"vllm": (
"vllm upgraded transformers above v5.4 where HF model "
"custom code uses siglip2 internals "
"(filter_out_non_signature_kwargs) removed "
"by huggingface/transformers#43514"
)
},
),
"Phi4MMForCausalLM": _HfExamplesInfo(
"microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
),
......@@ -1133,6 +1252,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
max_transformers_version="5.3",
transformers_version_reason={
"vllm": (
"Qwen2VLConfig was split into Qwen2VLConfig + "
"Qwen2VLTextConfig in transformers v5, breaking "
"attribute access (num_attention_heads, hidden_size, etc.)"
)
},
),
"VoxtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Voxtral-Mini-3B-2507",
......
......@@ -468,7 +468,16 @@ def dummy_hf_overrides(
else:
# Use minimal layers for testing
num_layers = 1
num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1
num_hidden_layers = (
3
if model_arch
in (
"Gemma3nForConditionalGeneration",
"Gemma4ForCausalLM",
"Gemma4ForConditionalGeneration",
)
else 1
)
update_dict = {
"num_layers": num_layers,
......
......@@ -2,10 +2,10 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.tokenizers import get_tokenizer
parser_name = "step3p5"
start_token = "<think>"
......@@ -16,7 +16,7 @@ REASONING_MODEL_NAME = "stepfun-ai/Step-3.5-Flash"
@pytest.fixture(scope="module")
def step3p5_tokenizer():
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME)
SIMPLE_REASONING = {
......
......@@ -542,12 +542,16 @@ def test_eagle_correctness_light(
"auto",
0.8,
),
(
pytest.param(
("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1),
False,
False,
"transformers",
0.8,
# TODO(hmellor): figure out why memory usage is so high
marks=pytest.mark.skip(
reason="Feature is experimental and uses too much memory in CI",
),
),
pytest.param(
(
......
......@@ -209,12 +209,24 @@ class GGUFModelLoader(BaseModelLoader):
GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
or None if no mapping found
"""
# In transformers v5, multimodal models (e.g. Gemma3) wrap
# all sub-models under an outer 'model.' attribute, producing
# state_dict keys like 'model.language_model.layers.0...' and
# 'model.vision_tower.vision_model...'. Strip this outer
# prefix so the keys match what gguf-py expects.
if is_multimodal and hf_name.startswith("model."):
hf_name = hf_name[6:] # Remove outer 'model.'
# Strip 'language_model.' prefix for multimodal models - gguf-py
# tensor mappings expect parameter names without this prefix.
# Note: 'model.' prefix should be KEPT for text-only models as
# gguf-py expects it.
if hf_name.startswith("language_model."):
hf_name = hf_name[15:] # Remove 'language_model.'
# Re-add 'model.' prefix because gguf-py text tensor maps
# expect 'model.layers...' format.
if is_multimodal:
hf_name = "model." + hf_name
# Parse parameter name and suffix
if hf_name.endswith((".weight", ".bias")):
......
......@@ -126,8 +126,12 @@ class Gemma4AudioInputs(TensorSchema):
"""
type: Literal["audio"] = "audio"
input_features_padded: Annotated[torch.Tensor, TensorShape("bn", "s", "f")]
input_features_mask: Annotated[torch.Tensor, TensorShape("bn", "s")]
input_features_padded: Annotated[
torch.Tensor, TensorShape("bn", "s", "f", dynamic_dims={"s"})
]
input_features_mask: Annotated[
torch.Tensor, TensorShape("bn", "s", dynamic_dims={"s"})
]
Gemma4ImageInputs = Gemma4ImagePixelInputs
......@@ -513,6 +517,8 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
video_timestamps_per_video: list[list[float]] = []
video_frame_counts: list[int] = []
video_replacements: list[str] = []
for item in videos:
video_array, metadata = item
......@@ -565,10 +571,7 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
video_timestamps_per_video.append(timestamps)
video_frame_counts.append(len(frames))
# Build expanded replacement text and replace the
# <|video|> placeholder in the prompt.
# Use split(token, 1) to avoid collision — the
# replacement text itself contains <|video|> tokens.
# Build expanded replacement text for this video.
ts_strs = [f"{int(s // 60):02d}:{int(s % 60):02d}" for s in timestamps]
replacement = " ".join(
f"{t} {processor.boi_token}"
......@@ -576,9 +579,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
f"{processor.eoi_token}"
for t, n in zip(ts_strs, num_soft_per_frame)
)
parts = prompt.split(processor.video_token, 1)
if len(parts) == 2:
prompt = parts[0] + replacement + parts[1]
video_replacements.append(replacement)
# Replace all <|video|> placeholders at once. We split on
# video_token to get N+1 parts, then interleave with the
# N replacement strings. This avoids the iterative
# split-replace bug where replacement text (which itself
# contains <|video|> tokens) collides with later splits.
vt = processor.video_token
parts = prompt.split(vt, len(video_replacements))
# NOTE: len(parts) <= len(video_replacements) + 1
parts_with_repl: list[str] = []
for part, repl in zip(parts, video_replacements):
parts_with_repl.extend([part, repl])
parts_with_repl.extend(parts[len(video_replacements) :])
prompt = "".join(parts_with_repl)
video_outputs = {
"pixel_values_videos": torch.cat(all_video_pixel_values, dim=0),
......@@ -641,19 +658,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
)
if "input_features" in processed_outputs:
# Keep padded features for batched audio tower execution.
processed_outputs["input_features_padded"] = processed_outputs[
"input_features"
]
# Unpad per-item so each item's cache entry is self-contained.
# Unpad per-item so each item's cache entry is
# self-contained. The batched() field config in
# _get_mm_fields_config will re-pad all fields to the
# batch's max length at batch time, ensuring consistent
# padding regardless of cache history.
masks = processed_outputs["input_features_mask"]
unpadded_features = [
f[mask]
for f, mask in zip(
processed_outputs["input_features"],
processed_outputs["input_features_mask"],
masks,
)
]
unpadded_masks = [mask[mask] for mask in masks]
processed_outputs["input_features"] = unpadded_features
processed_outputs["input_features_padded"] = unpadded_features
processed_outputs["input_features_mask"] = unpadded_masks
# Merge video outputs into the final result
combined_outputs = dict(processed_outputs, **video_outputs)
......
......@@ -32,9 +32,9 @@ from transformers.models.musicflamingo import (
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -16,6 +16,7 @@
# limitations under the License.
"""Transformers modeling backend base class."""
import sys
from collections.abc import Callable, Iterable
from itertools import chain
from operator import attrgetter
......@@ -29,6 +30,7 @@ from torch import nn
from transformers import AutoModel
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from vllm.compilation.decorators import support_torch_compile
from vllm.config.utils import getattr_iter
from vllm.distributed import get_pp_group, get_tp_group
from vllm.distributed.utils import get_pp_indices
......@@ -47,6 +49,7 @@ from vllm.model_executor.models.interfaces import (
)
from vllm.model_executor.models.interfaces_base import VllmModel
from vllm.model_executor.models.transformers.utils import (
can_enable_torch_compile,
get_feature_request_tip,
init_on_device_without_buffers,
log_replacement,
......@@ -117,6 +120,7 @@ class Base(
self.config = vllm_config.model_config.hf_config
self.text_config = self.config.get_text_config()
self.cache_config = vllm_config.cache_config
self.compilation_config = vllm_config.compilation_config
self.device_config = vllm_config.device_config
self.model_config = vllm_config.model_config
self.parallel_config = vllm_config.parallel_config
......@@ -146,7 +150,7 @@ class Base(
if self.quant_config:
quant_method_name = self.quant_config.get_name()
# Check for unsupported quantization methods.
if quant_method_name == "mxfp4":
if quant_method_name in ("mxfp4", "gpt_oss_mxfp4"):
raise NotImplementedError(
"Transformers modeling backend does "
"not support MXFP4 quantization yet."
......@@ -155,14 +159,16 @@ class Base(
if "gptq" in quant_method_name:
self.ignore_unexpected_suffixes.append(".bias")
# Patch config and init on "meta" to delay allocating GPU tensors
self._patch_config()
from_config_kwargs = dict(
config=self.config,
dtype=self.model_config.dtype,
trust_remote_code=self.model_config.trust_remote_code,
)
self._decorate_for_torch_compile(**from_config_kwargs)
# Init on "meta" to delay allocating GPU tensors
with init_on_device_without_buffers("meta"):
self.model: PreTrainedModel = AutoModel.from_config(
self.config,
dtype=self.model_config.dtype,
trust_remote_code=self.model_config.trust_remote_code,
)
self.model: PreTrainedModel = AutoModel.from_config(**from_config_kwargs)
# Create weight name to module qualname mapper
self._create_hf_to_vllm_mapper()
......@@ -218,6 +224,87 @@ class Base(
if sub_config.dtype != (dtype := self.config.dtype):
sub_config.dtype = dtype
def _get_decoder_cls(self, **kwargs: dict) -> type[PreTrainedModel]:
    """
    Look up the class of the model's decoder.

    A throwaway model is built under the "meta" device purely to inspect
    the type returned by `get_decoder()`; it is discarded afterwards.

    Args:
        kwargs: The kwargs to create the model (forwarded to
            `AutoModel.from_config`).

    Returns:
        The decoder class.
    """
    with torch.device("meta"):
        meta_model: PreTrainedModel = AutoModel.from_config(**kwargs)
    decoder_cls = type(meta_model.get_decoder())
    # Only the class is needed; drop the temporary model right away.
    del meta_model
    logger.debug("Identified decoder class as: %s", decoder_cls)
    return decoder_cls
def _decorate_cls_for_torch_compile(
    self,
    cls: type[PreTrainedModel],
    dynamic_arg_dims: dict[str, int] | None,
    enable_if: Callable[["VllmConfig"], bool],
    is_encoder: bool,
):
    """
    Mark `cls` as torch-compile capable for vLLM by swapping it, inside its
    own module, for a decorated subclass.

    Args:
        cls: The PreTrainedModel class to decorate.
        dynamic_arg_dims: A mapping from argument name to the dynamic dimensions
            of the argument. If None, default dynamic arg dims will be used. See
            [`support_torch_compile`][vllm.compilation.decorators.support_torch_compile]
            for more details.
        enable_if: A function which takes in the vLLM config and returns whether
            torch compile should be enabled for this class.
        is_encoder: Whether the class being decorated is an encoder.
    """
    role = "encoder" if is_encoder else "decoder"
    logger.debug(
        "Decorating `%s` as %s for torch compile with dynamic_arg_dims of %s",
        cls.__name__,
        role,
        dynamic_arg_dims,
    )

    compile_decorator = support_torch_compile(
        dynamic_arg_dims=dynamic_arg_dims,
        enable_if=enable_if,
        is_encoder=is_encoder,
    )

    # Subclass instead of decorating `cls` directly; the subclass adds no
    # members of its own.
    @compile_decorator
    class SupportTorchCompileWrapper(cls): ...

    # Preserve __module__ so transformers v5's source-file checks
    # (e.g. _can_set_experts_implementation) read the original
    # model's module instead of this file.
    SupportTorchCompileWrapper.__module__ = cls.__module__

    # Rebind the original class name in its defining module to the wrapper.
    owning_module = sys.modules[cls.__module__]
    setattr(owning_module, cls.__name__, SupportTorchCompileWrapper)
def _decorate_for_torch_compile(self, **kwargs: dict):
    """
    Register the model's decoder class as torch-compile capable, gated on
    `can_enable_torch_compile`.

    Args:
        kwargs: The kwargs to create the model, which are needed to get the decoder
            class.
    """
    decoder_cls = self._get_decoder_cls(**kwargs)

    # Applied to a PreTrainedModel so the batch dimension will exist
    batched_dynamic_dims: dict[str, int] = {
        "input_ids": 1,  # shape: [1, seq_len]
        "inputs_embeds": 1,  # shape: [1, seq_len, hidden_size]
        "position_ids": -1,  # shape: [1, seq_len] or [3, 1, seq_len] for mrope
    }

    self._decorate_cls_for_torch_compile(
        cls=decoder_cls,
        dynamic_arg_dims=batched_dynamic_dims,
        enable_if=can_enable_torch_compile,
        is_encoder=False,
    )
def _create_hf_to_vllm_mapper(self):
"""
Create a WeightsMapper to map checkpoint weight names to module qualnames.
......@@ -553,11 +640,6 @@ class Base(
input_ids = None
inputs_embeds = intermediate_tensors["hidden_states"]
if input_ids is not None:
input_ids = input_ids[None, ...]
if inputs_embeds is not None:
inputs_embeds = inputs_embeds[None, ...]
# If the model scales embeddings inside the input embedding layer we must
# ensure they are scaled here since VocabParallelEmbedding will not do it
if (
......@@ -568,22 +650,29 @@ class Base(
inputs_embeds = self.embed_input_ids(input_ids)
input_ids = None
if self.model_config.uses_mrope:
position_ids = positions[:, None]
else:
position_ids = positions[None, ...]
# Add batch dimension before entering Transformers model
if input_ids is not None and input_ids.ndim == 1:
# [seq_len] -> [1, seq_len]
input_ids = input_ids[None, ...]
if inputs_embeds is not None and inputs_embeds.ndim == 2:
# [seq_len, hidden_size] -> [1, seq_len, hidden_size]
inputs_embeds = inputs_embeds[None, ...]
if positions.ndim == 1:
# [seq_len] -> [1, seq_len]
positions = positions[None, ...]
outputs = self.model(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
use_cache=False,
position_ids=position_ids,
position_ids=positions,
attention_instances=self.attention_instances,
return_dict=False,
**self._output_aux_hidden_states_kwargs,
**kwargs,
)
# We must remove the batch dimension from these outputs
# Remove batch dimension after exiting Transformers model
hidden_states = outputs[0][0, ...]
if self._output_aux_hidden_states_kwargs:
aux_hidden_states = [x[0][0, ...] for x in outputs[1:]]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment