Commit a40a133c authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.2' into v0.9.2-dev

parents 1a9a61d7 a5dd03c1
......@@ -99,7 +99,7 @@ def _run_test(
max_model_len=8192) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
texts = [
# this is necessary because vllm_model.encode will not apply any
# this is necessary because vllm_model.embed will not apply any
# templating to the prompt, and therefore lacks an image_pad
# token unless one is inserted beforehand (the (28,28) image
# above is converted to an image pad token by the chat template).
......@@ -110,7 +110,7 @@ def _run_test(
# vllm will replace the pad token with the actual image,
# which may be a placeholder image, later.
]
vllm_outputs = vllm_model.encode(texts, images=input_images)
vllm_outputs = vllm_model.embed(texts, images=input_images)
hf_outputs = []
with hf_runner(model,
......
......@@ -69,7 +69,7 @@ def _run_test(
dtype=dtype,
max_model_len=4096,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForImageTextToText) as hf_model:
......
......@@ -47,7 +47,7 @@ def _run_test(
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model, task="embed", dtype=dtype,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs = {"_attn_implementation": "eager"}
......
......@@ -26,6 +26,22 @@ from ...registry import HF_EXAMPLE_MODELS
from ....utils import models_path_prefix
def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
    """
    Attach synthetic video metadata to the multimodal data for GLM4.1V.

    When `mm_data` has a "video" entry, that entry is replaced in place by a
    `(video, metadata)` tuple; otherwise `mm_data` is returned untouched.
    The same dict object that was passed in is always returned.
    """
    if "video" not in mm_data:
        return mm_data

    frames = mm_data["video"]
    frame_count = len(frames)
    # The frame count doubles as the fps value, paired with a duration of 1.
    metadata = {
        "total_num_frames": frame_count,
        "fps": frame_count,
        "duration": 1,
        "video_backend": "opencv",
    }
    mm_data["video"] = (frames, metadata)
    return mm_data
def _test_processing_correctness(
model_id: str,
hit_rate: float,
......@@ -156,6 +172,11 @@ _IGNORE_MM_KEYS = {
"ultravox": {"audio_features"},
}
# Per-model hooks that rewrite the multimodal inputs before the processing
# correctness check runs. Keys are compared against the HF config's
# `model_type`; each value is called with the multimodal data dict and its
# return value replaces that dict.
MM_DATA_PATCHES = {
    # GLM4.1V requires video metadata to be included in the input
    "glm4v": glm4_1v_patch_mm_data,
}
def _test_processing_correctness_one(
model_config: ModelConfig,
......@@ -168,6 +189,8 @@ def _test_processing_correctness_one(
):
model_type = model_config.hf_config.model_type
ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
if model_type in MM_DATA_PATCHES:
mm_data = MM_DATA_PATCHES[model_type](mm_data)
if isinstance(prompt, str):
text_prompt = prompt
......@@ -247,6 +270,7 @@ def _test_processing_correctness_one(
os.path.join(models_path_prefix, "adept/fuyu-8b"),
os.path.join(models_path_prefix, "google/gemma-3-4b-it"),
os.path.join(models_path_prefix, "THUDM/glm-4v-9b"),
os.path.join(models_path_prefix, "THUDM/GLM-4.1V-9B-Thinking"),
os.path.join(models_path_prefix, "ibm-granite/granite-speech-3.3-2b"),
os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
......@@ -286,6 +310,7 @@ def _test_processing_correctness_one(
os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b"),
os.path.join(models_path_prefix, "openai/whisper-large-v3"),
os.path.join(models_path_prefix, "omni-research/Tarsier-7b"),
os.path.join(models_path_prefix, "omni-research/Tarsier2-Recap-7b")
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
import pytest
import torch
import transformers
from transformers import AutoConfig, PreTrainedModel
from vllm.config import ModelConfig
from vllm.model_executor.models.utils import WeightsMapper
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.transformers_utils.config import try_get_safetensors_metadata
from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
    """Create placeholder weights from safetensors checkpoint metadata.

    Args:
        repo: Model repository passed to `try_get_safetensors_metadata`.

    Returns:
        One `(name, tensor)` pair per key of the checkpoint's weight map.
        Every tensor is an empty placeholder created under the `meta`
        device context, so no real storage is allocated for it.
    """
    metadata = try_get_safetensors_metadata(repo)
    # NOTE(review): the `try_` prefix suggests this helper may return None
    # when no safetensors metadata is available -- confirm; the line below
    # assumes a metadata object with a `weight_map` mapping.
    weight_names = list(metadata.weight_map.keys())
    # Build the pairs eagerly while the device context is still active.
    # A generator expression returned from inside the `with` block would only
    # execute `torch.empty` once the caller iterates it, i.e. after the
    # context has already been exited.
    with torch.device("meta"):
        return [(name, torch.empty(0)) for name in weight_names]
def create_model_dummy_weights(
    repo: str,
    model_arch: str,
) -> Iterable[tuple[str, torch.Tensor]]:
    """
    Build a dummy HF model on the `meta` device and expose its parameters.

    The class called `model_arch` is looked up on the `transformers` module,
    instantiated from the `AutoConfig` of `repo`, and its
    `named_parameters()` iterator is returned, so the names are the ones the
    instantiated HF model itself uses.
    """
    hf_model_cls = getattr(transformers, model_arch)
    hf_config = AutoConfig.from_pretrained(repo)
    # Instantiating under the meta device avoids allocating real weights.
    with torch.device("meta"):
        dummy_model = hf_model_cls._from_config(hf_config)
    return dummy_model.named_parameters()
def model_architectures_for_test() -> list[str]:
    """
    Collect the multimodal example architectures worth testing.

    An architecture is selected when its example entry does not require
    `trust_remote_code`, `transformers` exposes a class of the same name, and
    that class has a truthy `_checkpoint_conversion_mapping` attribute.
    """
    selected: list[str] = []
    for arch_name, example_info in _MULTIMODAL_EXAMPLE_MODELS.items():
        if example_info.trust_remote_code:
            continue
        if not hasattr(transformers, arch_name):
            continue
        hf_cls = getattr(transformers, arch_name)
        if getattr(hf_cls, "_checkpoint_conversion_mapping", None):
            selected.append(arch_name)
    return selected
@pytest.mark.core_model
@pytest.mark.parametrize("model_arch", model_architectures_for_test())
def test_hf_model_weights_mapper(model_arch: str):
    """
    Check that the vLLM weights mapper yields the same set of weight names
    for the checkpoint's original names and for the names of the
    instantiated HF model.
    """
    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
    # Skip (rather than fail) when the model or transformers version is
    # unavailable in this environment.
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    model_id = model_info.default
    tokenizer_id = model_info.tokenizer or model_id
    model_config = ModelConfig(
        model_id,
        task="auto",
        tokenizer=tokenizer_id,
        tokenizer_mode=model_info.tokenizer_mode,
        trust_remote_code=model_info.trust_remote_code,
        seed=0,
        dtype="auto",
        revision=None,
        hf_overrides=model_info.hf_overrides,
    )
    vllm_model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)

    checkpoint_weights = create_repo_dummy_weights(model_id)
    hf_model_weights = create_model_dummy_weights(model_id, model_arch)
    mapper: WeightsMapper = vllm_model_cls.hf_to_vllm_mapper

    # Only the mapped names matter; the tensors themselves are placeholders.
    mapped_checkpoint = mapper.apply(checkpoint_weights)
    mapped_hf_model = mapper.apply(hf_model_weights)
    ref_weight_names = {entry[0] for entry in mapped_checkpoint}
    weight_names = {entry[0] for entry in mapped_hf_model}

    weights_missing = ref_weight_names - weight_names
    weights_unmapped = weight_names - ref_weight_names
    assert not (weights_missing or weights_unmapped), (
        f"Following weights are not mapped correctly: {weights_unmapped}, "
        f"Missing expected weights: {weights_missing}.")
......@@ -80,11 +80,11 @@ DOLPHIN_CONFIG = GGUFTestConfig(
)
MODELS = [
LLAMA_CONFIG,
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
QWEN2_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
# STABLELM_CONFIG, # enable this when v1 support head_size=80
STABLELM_CONFIG,
DOLPHIN_CONFIG,
# STARCODER_CONFIG, # broken
]
......
......@@ -74,6 +74,12 @@ class _HfExamplesInfo:
length that is too large to fit into memory in CI.
"""
revision: Optional[str] = None
"""
The specific revision (commit hash, tag, or branch) to use for the model.
If not specified, the default revision will be used.
"""
def check_transformers_version(
self,
*,
......@@ -160,14 +166,20 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"DeepseekV3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-V3"), # noqa: E501
trust_remote_code=True),
"Ernie4_5_ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"baidu/ERNIE-4.5-0.3B-PT"),
trust_remote_code=True),
"Ernie4_5_MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"baidu/ERNIE-4.5-21B-A3B-PT"),
trust_remote_code=True),
"ExaoneForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct")), # noqa: E501
"Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"mgleize/fairseq2-dummy-Llama-3.2-1B")), # noqa: E501
"FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/falcon-7b")),
"FalconH1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/Falcon-H1-1.5B-Instruct"),
"FalconH1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"),
min_transformers_version="4.53"),
"GemmaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-1.1-2b-it")),
"Gemma2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-2-9b")),
"Gemma3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-3-1b-it")),
"Gemma3nForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-3n-E2B-it"), # noqa: E501
min_transformers_version="4.53"),
"GlmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/glm-4-9b-chat-hf")),
"Glm4ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4-9B-0414")),
"GPT2LMHeadModel": _HfExamplesInfo(os.path.join(models_path_prefix,"openai-community/gpt2"),
......@@ -184,7 +196,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"GraniteMoeSharedForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"ibm-research/moe-7b-1b-active-shared-experts")), # noqa: E501
"Grok1ModelForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"hpcai-tech/grok-1"),
trust_remote_code=True),
"InternLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm-chat-7b"),
"HunYuanMoEV1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tencent/Hunyuan-A13B-Instruct"),
trust_remote_code=True),
"InternLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"internlm/internlm-chat-7b"),
trust_remote_code=True),
"InternLM2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm2-chat-7b"),
trust_remote_code=True),
......@@ -196,8 +210,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"JambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"ai21labs/AI21-Jamba-1.5-Mini"),
extras={"tiny": os.path.join(models_path_prefix,"ai21labs/Jamba-tiny-dev")}), # noqa: E501
"LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"meta-llama/Llama-3.2-1B-Instruct"),
extras={"guard": os.path.join(models_path_prefix,"meta-llama/Llama-Guard-3-1B"), # noqa: E501
"hermes": os.path.join(models_path_prefix,"NousResearch/Hermes-3-Llama-3.1-8B")}), # noqa: E501
extras={"guard": os.path.join(models_path_prefix,"meta-llama/Llama-Guard-3-1B", # noqa: E501
"hermes": os.path.join(models_path_prefix,"NousResearch/Hermes-3-Llama-3.1-8B"), # noqa: E501
"fp8": os.path.join(models_path_prefix,"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8")}), # noqa: E501
"LLaMAForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"decapoda-research/llama-7b-hf"),
is_available_online=False),
"MambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"state-spaces/mamba-130m-hf")),
......@@ -208,9 +223,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"MiniCPM3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"),
trust_remote_code=True),
"MiniMaxText01ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01"),
trust_remote_code=True),
trust_remote_code=True,
revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"), # noqa: E501
"MiniMaxM1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-M1-40k"),
trust_remote_code=True),
trust_remote_code=True),
"MistralForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1")),
"MixtralForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1"), # noqa: E501
{"tiny": os.path.join(models_path_prefix, "TitanML/tiny-mixtral")}), # noqa: E501
......@@ -227,31 +243,31 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
{"1b": os.path.join(models_path_prefix, "facebook/opt-iml-max-1.3b")}),
"OrionForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "OrionStarAI/Orion-14B-Chat"),
trust_remote_code=True),
"PersimmonForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "adept/persimmon-8b-chat")),
"PhiForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/phi-2"), v0_only=True),
"Phi3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-mini-4k-instruct")),
"Phi3SmallForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-small-8k-instruct"),
"PersimmonForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"adept/persimmon-8b-chat")),
"PhiForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/phi-2")),
"Phi3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3-mini-4k-instruct")),
# Blocksparse attention not supported in V1 yet
"Phi3SmallForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3-small-8k-instruct"),
trust_remote_code=True,
v0_only=True),
"PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
"PhiMoEForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3.5-MoE-instruct"),
trust_remote_code=True),
"Plamo2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "pfnet/plamo-2-1b"),
trust_remote_code=True),
"QWenLMHeadModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"),
trust_remote_code=True),
"Qwen2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2-0.5B-Instruct"),
extras={"2.5": os.path.join(models_path_prefix, "Qwen/Qwen2.5-0.5B-Instruct")}), # noqa: E501
"Qwen2MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen1.5-MoE-A2.7B-Chat")),
"Qwen3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen3-8B")),
"Qwen3MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen3-30B-A3B")),
"RWForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-40b")),
"StableLMEpochForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "stabilityai/stablelm-zephyr-3b"), # noqa: E501
v0_only=True),
"StableLmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"),
v0_only=True),
"Starcoder2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")),
"SolarForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "upstage/solar-pro-preview-instruct")),
"TeleChat2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Tele-AI/TeleChat2-3B"),
"Qwen2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2-0.5B-Instruct"),
extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}), # noqa: E501
"Qwen2MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen1.5-MoE-A2.7B-Chat")),
"Qwen3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen3-8B")),
"Qwen3MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen3-30B-A3B")),
"Qwen3ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"tomaarsen/Qwen3-Reranker-0.6B-seq-cls")), # noqa: E501
"RWForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/falcon-40b")),
"StableLMEpochForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"stabilityai/stablelm-zephyr-3b")), # noqa: E501
"StableLmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"stabilityai/stablelm-3b-4e1t")),
"Starcoder2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"bigcode/starcoder2-3b")),
"SolarForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"upstage/solar-pro-preview-instruct")),
"TeleChat2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Tele-AI/TeleChat2-3B"),
trust_remote_code=True),
"TeleFLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "CofeAI/FLM-2-52B-Instruct-2407"),
trust_remote_code=True),
......@@ -265,6 +281,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"MiMoForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"),
trust_remote_code=True),
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst",
min_transformers_version="4.53"),
# [Encoder-decoder]
"BartModel": _HfExamplesInfo(os.path.join(models_path_prefix, "facebook/bart-base")),
"BartForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "facebook/bart-large-cnn")),
......@@ -272,30 +290,31 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
_EMBEDDING_EXAMPLE_MODELS = {
# [Text-only]
"BertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5")),
"Gemma2Model": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2")),
"GritLM": _HfExamplesInfo(os.path.join(models_path_prefix, "parasail-ai/GritLM-7B-vllm")),
"GteModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-embed-m-v2.0"),
"BertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"BAAI/bge-base-en-v1.5"), v0_only=True),
"Gemma2Model": _HfExamplesInfo(os.path.join(models_path_prefix,"BAAI/bge-multilingual-gemma2"), v0_only=True), # noqa: E501
"GPT2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"nie3e/sentiment-polish-gpt2-small")), # noqa: E501
"GritLM": _HfExamplesInfo(os.path.join(models_path_prefix,"parasail-ai/GritLM-7B-vllm")),
"GteModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Snowflake/snowflake-arctic-embed-m-v2.0"),
trust_remote_code=True),
"GteNewModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-base-en-v1.5"),
trust_remote_code=True,
hf_overrides={"architectures": ["GteNewModel"]}), # noqa: E501
"InternLM2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm2-1_8b-reward"),
trust_remote_code=True),
"JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-reward-dev")), # noqa: E501
"LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "llama", is_available_online=False),
"MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")),
"ModernBertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-modernbert-base"),
trust_remote_code=True),
"NomicBertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "nomic-ai/nomic-embed-text-v2-moe"),
trust_remote_code=True),
"Qwen2Model": _HfExamplesInfo(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base")),
"Qwen2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B")),
"Qwen2ForProcessRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-PRM-7B")),
"Qwen2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach")), # noqa: E501
"RobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/stsb-roberta-base-v2")), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/all-roberta-large-v1")), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")),
"JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"ai21labs/Jamba-tiny-reward-dev")), # noqa: E501
"LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"llama"), is_available_online=False),
"MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix,"intfloat/e5-mistral-7b-instruct")),
"ModernBertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Alibaba-NLP/gte-modernbert-base"),
trust_remote_code=True, v0_only=True),
"NomicBertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"nomic-ai/nomic-embed-text-v2-moe"),
trust_remote_code=True, v0_only=True), # noqa: E501
"Qwen2Model": _HfExamplesInfo(os.path.join(models_path_prefix,"ssmits/Qwen2-7B-Instruct-embed-base")),
"Qwen2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2.5-Math-RM-72B")),
"Qwen2ForProcessRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2.5-Math-PRM-7B")),
"Qwen2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"jason9693/Qwen2.5-1.5B-apeach")), # noqa: E501
"RobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"sentence-transformers/stsb-roberta-base-v2"), v0_only=True), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo(os.path.join(models_path_prefix,"sentence-transformers/all-roberta-large-v1"), v0_only=True), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"intfloat/multilingual-e5-small"), v0_only=True), # noqa: E501
# [Multimodal]
"LlavaNextForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "royokong/e5-v")),
"Phi3VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "TIGER-Lab/VLM2Vec-Full"),
......@@ -307,10 +326,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
_CROSS_ENCODER_EXAMPLE_MODELS = {
# [Text-only]
"BertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2")), # noqa: E501
"RobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/quora-roberta-base")), # noqa: E501
"XLMRobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3")), # noqa: E501
"ModernBertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base")), # noqa: E501
"BertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), v0_only=True), # noqa: E501
"RobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/quora-roberta-base"), v0_only=True), # noqa: E501
"XLMRobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"), v0_only=True), # noqa: E501
"ModernBertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base"), v0_only=True), # noqa: E501
}
_MULTIMODAL_EXAMPLE_MODELS = {
......@@ -318,8 +337,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"rhymes-ai/Aria")),
"AyaVisionForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"CohereForAI/aya-vision-8b")), # noqa: E501
"Blip2ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"Salesforce/blip2-opt-2.7b"), # noqa: E501
extras={"6b": os.path.join(models_path_prefix,"Salesforce/blip2-opt-6.7b")}, # noqa: E501
v0_only=True),
extras={"6b": os.path.join(models_path_prefix,"Salesforce/blip2-opt-6.7b")}), # noqa: E501
"ChameleonForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"facebook/chameleon-7b")), # noqa: E501
"DeepseekVLV2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"deepseek-ai/deepseek-vl2-tiny"), # noqa: E501
extras={"fork": os.path.join(models_path_prefix,"Isotr0py/deepseek-vl2-tiny")}, # noqa: E501
......@@ -332,8 +350,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"GLM4VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/glm-4v-9b"),
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
"H2OVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
extras={"2b": os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b")}, # noqa: E501
"Glm4vForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4.1V-9B-Thinking"), min_transformers_version="4.53"), # noqa: E501
"H2OVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),
extras={"2b": os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-2b")}, # noqa: E501
max_transformers_version="4.48", # noqa: E501
transformers_version_reason="HF model is not compatible."), # noqa: E501
"InternVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
......@@ -342,11 +361,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501
{"tiny": os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")}), # noqa: E501
"KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct"), # noqa: E501
extras={"thinking": os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Thinking")}, # noqa: E501
trust_remote_code=True,
v0_only=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"), # noqa: E501
trust_remote_code=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
max_model_len=10240),
"LlavaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
extras={"mistral": os.path.join(models_path_prefix, "mistral-community/pixtral-12b"), # noqa: E501
......@@ -404,6 +424,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501
hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501
"Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
......
......@@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info.check_transformers_version(on_fail="skip")
# FIXME: Possible memory leak in the previous tests?
if model_arch == "GraniteSpeechForConditionalGeneration":
if model_arch in ("GraniteSpeechForConditionalGeneration",
"KimiVLForConditionalGeneration"):
pytest.skip("Avoid OOM")
# Avoid OOM and reduce initialization time by only using 1 layer
......@@ -31,12 +32,21 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
text_config = hf_config.get_text_config()
# Ensure at least 2 experts per group
# Since `grouped_topk` assumes top-2
n_group = getattr(text_config, 'n_group', None)
num_experts = n_group * 2 if n_group is not None else 2
text_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
"num_experts": 2,
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": 2,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
})
if hasattr(hf_config, "vision_config"):
......@@ -80,6 +90,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info.default,
tokenizer=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
speculative_config={
"model": model_info.speculative_model,
"num_speculative_tokens": 1,
......
......@@ -53,7 +53,9 @@ def test_oot_registration_embedding(
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = ["Hello, my name is", "The text does not matter"]
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
llm = LLM(model=dummy_gemma2_embedding_path,
load_format="dummy",
max_model_len=2048)
outputs = llm.embed(prompts)
for output in outputs:
......
......@@ -10,9 +10,9 @@ import torch.cuda
from vllm.model_executor.models import (is_pooling_model,
is_text_generation_model,
supports_multimodal)
from vllm.model_executor.models.adapters import (as_classification_model,
as_embedding_model,
as_reward_model)
from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model,
as_seq_cls_model)
from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
_SPECULATIVE_DECODING_MODELS,
_TEXT_GENERATION_MODELS,
......@@ -46,7 +46,7 @@ def test_registry_imports(model_arch):
assert is_text_generation_model(model_cls)
# All vLLM models should be convertible to a pooling model
assert is_pooling_model(as_classification_model(model_cls))
assert is_pooling_model(as_seq_cls_model(model_cls))
assert is_pooling_model(as_embedding_model(model_cls))
assert is_pooling_model(as_reward_model(model_cls))
......
......@@ -336,3 +336,10 @@ class EmbedModelInfo(NamedTuple):
architecture: str = ""
dtype: str = "auto"
enable_test: bool = True
class RerankModelInfo(NamedTuple):
    """Immutable record describing a rerank model used by the tests.

    Only `name` is required; `architecture`, `dtype` and `enable_test` carry
    the same defaults as the corresponding `EmbedModelInfo` fields.
    """
    # Model identifier -- presumably the HF repo name or path; confirm
    # against the callers.
    name: str
    # Architecture name; empty string when not specified.
    architecture: str = ""
    # dtype string; defaults to "auto".
    dtype: str = "auto"
    # NOTE(review): looks like a switch for including this entry in test
    # runs -- confirm where it is consumed.
    enable_test: bool = True
......@@ -68,7 +68,7 @@ async def test_evil_forward(tmp_socket):
with pytest.raises(MQEngineDeadError):
async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(),
request_id=uuid.uuid4()):
request_id=str(uuid.uuid4())):
pass
assert client.errored
......@@ -117,7 +117,7 @@ async def test_failed_health_check(tmp_socket):
with pytest.raises(MQEngineDeadError):
async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(),
request_id=uuid.uuid4()):
request_id=str(uuid.uuid4())):
pass
client.close()
......@@ -159,7 +159,7 @@ async def test_failed_abort(tmp_socket):
async for _ in client.generate(
prompt="Hello my name is",
sampling_params=SamplingParams(max_tokens=10),
request_id=uuid.uuid4()):
request_id=str(uuid.uuid4())):
pass
assert "KeyError" in repr(execinfo.value)
assert client.errored
......@@ -191,7 +191,7 @@ async def test_batch_error(tmp_socket):
params = SamplingParams(min_tokens=2048, max_tokens=2048)
async for _ in client.generate(prompt="Hello my name is",
sampling_params=params,
request_id=uuid.uuid4()):
request_id=str(uuid.uuid4())):
pass
tasks = [asyncio.create_task(do_generate(client)) for _ in range(10)]
......@@ -291,7 +291,7 @@ async def test_engine_process_death(tmp_socket):
with pytest.raises(MQEngineDeadError):
async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(),
request_id=uuid.uuid4()):
request_id=str(uuid.uuid4())):
pass
# And the health check should show the engine is dead
......
......@@ -9,6 +9,7 @@ from typing import Optional
import pytest
import os
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal
......@@ -73,6 +74,12 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned.
"""
if current_platform.is_rocm() and \
(attention_backend == "FLASHINFER" or enable_chunked_prefill):
pytest.skip(
"Multi-Step with FLASHINFER or Chunked-Prefill is not supported"
"on ROCm")
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
......@@ -223,6 +230,9 @@ def test_multi_step_llm_w_prompt_logprobs(
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [None, 5])
@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
@pytest.mark.skipif(
current_platform.is_rocm(),
reason="Multi-Step + Chunked-Prefill not supported on ROCm")
def test_multi_step_llm_chunked_prefill_prefix_cache(
vllm_runner,
example_prompts,
......
......@@ -60,3 +60,15 @@ def test_hash_collision_array_shape():
hasher = MultiModalHasher
assert hasher.hash_kwargs(data=arr1) != hasher.hash_kwargs(data=arr2)
def test_hash_non_contiguous_array():
    """A non-C-contiguous array must hash like its contiguous copy."""
    # Transposing a 2-D array yields a view that is not C-contiguous.
    transposed = np.arange(24).reshape(4, 6).T
    assert not transposed.flags.c_contiguous

    contiguous_copy = np.ascontiguousarray(transposed)
    assert contiguous_copy.flags.c_contiguous

    # Both layouts should be hashable and produce the same hash.
    hash_transposed = MultiModalHasher.hash_kwargs(data=transposed)
    hash_contiguous = MultiModalHasher.hash_kwargs(data=contiguous_copy)
    assert hash_transposed == hash_contiguous
......@@ -1086,6 +1086,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
prompt="",
mm_data={},
mm_kwargs=call_kwargs,
tok_kwargs={},
)
assert out_kwargs == expected_kwargs
......@@ -169,12 +169,15 @@ async def test_fetch_image_error_conversion():
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
connector = MediaConnector()
connector = MediaConnector(
media_io_kwargs={"video": {
"num_frames": num_frames,
}})
video_sync = connector.fetch_video(video_url, num_frames=num_frames)
video_async = await connector.fetch_video_async(video_url,
num_frames=num_frames)
video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(video_url)
assert np.array_equal(video_sync, video_async)
assert metadata_sync == metadata_async
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
......
......@@ -4,7 +4,10 @@ import numpy as np
import numpy.typing as npt
import pytest
from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
from vllm import envs
from vllm.multimodal.image import ImageMediaIO
from vllm.multimodal.video import (VIDEO_LOADER_REGISTRY, VideoLoader,
VideoMediaIO)
NUM_FRAMES = 10
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
......@@ -40,3 +43,46 @@ def test_video_loader_registry():
def test_video_loader_type_doesnt_exist():
with pytest.raises(AssertionError):
VIDEO_LOADER_REGISTRY.load("non_existing_video_loader")
@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
class Assert10Frames1FPSVideoLoader(VideoLoader):
    """Fake loader that asserts it received ``num_frames=10``, ``fps=1.0``.

    Registered under the name ``"assert_10_frames_1_fps"`` so a test can
    select it and observe which keyword arguments reach ``load_bytes``.
    """

    @classmethod
    def load_bytes(cls,
                   data: bytes,
                   num_frames: int = -1,
                   fps: float = -1.0,
                   **kwargs) -> npt.NDArray:
        """Validate the sampling arguments and return a canned array.

        ``data`` and any extra keyword arguments are ignored. The frame
        count is checked before the frame rate, so a call that gets both
        wrong reports "bad num_frames".

        Raises:
            AssertionError: if ``num_frames`` is not 10 or ``fps`` is
                not 1.0.
        """
        frame_count_ok = num_frames == 10
        assert frame_count_ok, "bad num_frames"

        frame_rate_ok = fps == 1.0
        assert frame_rate_ok, "bad fps"

        return FAKE_OUTPUT_2
def test_video_media_io_kwargs():
    """Check which keyword arguments ``VideoMediaIO`` hands to its loader.

    The test selects the ``"assert_10_frames_1_fps"`` backend, whose
    ``load_bytes`` asserts ``num_frames == 10`` and ``fps == 1.0``.
    Matching kwargs (with or without an extra unused key) must load
    cleanly; missing or wrong values must raise the loader's assertion.

    The backend override on ``envs`` is undone afterwards so that later
    tests in the same process do not inherit the fake loader.
    """
    backend_attr = "VLLM_VIDEO_LOADER_BACKEND"
    # Record whether a real module attribute already existed so the
    # module can be put back exactly as it was found.
    had_override = backend_attr in vars(envs)
    previous_backend = vars(envs).get(backend_attr)

    # Presumably VideoMediaIO resolves its loader from this setting --
    # TODO confirm against vllm.multimodal.video.
    envs.VLLM_VIDEO_LOADER_BACKEND = "assert_10_frames_1_fps"
    try:
        imageio = ImageMediaIO()

        # Verify that different args pass/fail assertions as expected.
        videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 1.0})
        _ = videoio.load_bytes(b"test")

        videoio = VideoMediaIO(
            imageio, **{
                "num_frames": 10,
                "fps": 1.0,
                "not_used": "not_used"
            })
        _ = videoio.load_bytes(b"test")

        with pytest.raises(AssertionError, match="bad num_frames"):
            videoio = VideoMediaIO(imageio, **{})
            _ = videoio.load_bytes(b"test")

        with pytest.raises(AssertionError, match="bad num_frames"):
            videoio = VideoMediaIO(imageio, **{"num_frames": 9, "fps": 1.0})
            _ = videoio.load_bytes(b"test")

        with pytest.raises(AssertionError, match="bad fps"):
            videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0})
            _ = videoio.load_bytes(b"test")
    finally:
        # Undo the override even when an assertion above fails.
        if had_override:
            setattr(envs, backend_attr, previous_backend)
        else:
            delattr(envs, backend_attr)
......@@ -7,6 +7,8 @@ import pytest
import torch
import torch.nn.functional as F
from vllm.utils import cdiv
class BlockDiagonalCausalFromBottomRightMask:
......@@ -398,11 +400,8 @@ def test_contexted_kv_attention(
assert (large_tile_size >= B_P_SIZE
), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
def ceil_div(a, b):
return (a + b - 1) // b
def pad_to_multiple(a, b):
return ceil_div(a, b) * b
return cdiv(a, b) * b
def pad_to_next_power_of_2(a):
assert a > 0
......@@ -411,7 +410,7 @@ def test_contexted_kv_attention(
# calculate input shapes
max_num_queries = pad_to_next_power_of_2(sum(query_lens))
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
num_active_blocks = ceil_div(context_lens, block_size).sum().item()
num_active_blocks = cdiv(context_lens, block_size).sum().item()
num_active_blocks = pad_to_multiple(num_active_blocks,
large_tile_size // block_size)
context_kv_len = num_active_blocks * block_size
......
......@@ -10,5 +10,7 @@ setup(
entry_points={
'vllm.platform_plugins': [
"dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa
]
],
"vllm.general_plugins":
["dummy_custom_ops = vllm_add_dummy_platform:register_ops"],
})
......@@ -6,3 +6,7 @@ from typing import Optional
def dummy_platform_plugin() -> Optional[str]:
    """Return the dotted import path of the dummy platform class."""
    platform_cls_path = "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
    return platform_cls_path
def register_ops():
    """Import the dummy custom-ops module for its import-time effects.

    Nothing from the module is used here; the import itself is the point.
    """
    from vllm_add_dummy_platform import dummy_custom_ops  # noqa
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment