Commit 4eabe123 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
# SPDX-License-Identifier: Apache-2.0
import os
import re
from collections.abc import Sequence
from typing import Optional
import librosa
import pytest
import regex as re
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.image import convert_image_mode, rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
......@@ -267,7 +267,7 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
# use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=None)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
inputs_vision_speech = [
(
......
......@@ -3,11 +3,13 @@
for manipulating the input / output of HF & vLLM test runners, which are
typically specific to a small subset of models.
"""
import re
import types
from pathlib import PosixPath
from typing import Optional, Union
import numpy as np
import numpy.typing as npt
import regex as re
import torch
from PIL.Image import Image
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
......@@ -495,13 +497,20 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
def __call__(
self,
text: str,
images: Union[Image, list[Image]] = None,
videos: Union[npt.NDArray, list[npt.NDArray]] = None,
**kwargs,
):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_internvl)
image_to_pixel_values_internvl, video_to_pixel_values_internvl)
images = [images] if isinstance(images, Image) else images
pixel_values = [
videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None:
pixel_values_images = [
image_to_pixel_values_internvl(
image,
input_size=self.image_size,
......@@ -510,15 +519,52 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
use_thumbnail=self.use_thumbnail,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
num_patches_images = [
pixel_value.shape[0] for pixel_value in pixel_values_images
]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
else:
pixel_values_images, num_patches_images = [], []
if videos is not None:
pixel_values_videos = [
video_to_pixel_values_internvl(
video,
input_size=self.image_size,
min_num=1,
max_num=1,
use_thumbnail=False,
) for video in videos
]
num_patches_videos = [
pixel_value.shape[0] for pixel_value in pixel_values_videos
]
else:
pixel_values_videos, num_patches_videos = [], []
pixel_values = []
while ("<image>" in text) or ("<video>" in text):
image_index = text.find("<image>")
video_index = text.find("<video>")
if image_index == -1 or (video_index > -1
and video_index < image_index):
num_patches = num_patches_videos.pop(0)
pixel_values.append(pixel_values_videos.pop(0))
context_tokens = IMG_START + \
IMG_CONTEXT * self.num_image_token + IMG_END
video_tokens = ''.join([
f'Frame{i+1}: {context_tokens}'
for i in range(num_patches)
])
text = text.replace('<video>', video_tokens, 1)
else:
num_patches = num_patches_images.pop(0)
pixel_values.append(pixel_values_images.pop(0))
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
pixel_values = torch.cat(pixel_values, dim=0)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
......
......@@ -9,15 +9,15 @@ from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache
from vllm.transformers_utils.tokenizer import (MistralTokenizer,
cached_tokenizer_from_config)
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
cached_tokenizer_from_config,
encode_tokens)
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS
......@@ -28,7 +28,6 @@ def _test_processing_correctness(
hit_rate: float,
num_batches: int,
simplify_rate: float,
ignore_mm_keys: Optional[set[str]] = None,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
......@@ -99,10 +98,23 @@ def _test_processing_correctness(
}
mm_counts = {k: len(vs) for k, vs in mm_data.items()}
# Mistral chat outputs tokens directly, rather than text prompts
if isinstance(tokenizer, MistralTokenizer):
images = mm_data.get("image", [])
request = ChatCompletionRequest(messages=[
UserMessage(content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]),
])
res = tokenizer.mistral.encode_chat_completion(request)
prompt = res.tokens
else:
prompt = dummy_inputs.get_dummy_processor_inputs(
model_config.max_model_len,
mm_counts,
).prompt_text
).prompt
# Drop unnecessary keys and test single -> multi conversion
if rng.rand() < simplify_rate:
......@@ -112,8 +124,7 @@ def _test_processing_correctness(
elif len(mm_data[k]) == 1:
mm_data[k] = mm_data[k][0]
if isinstance(tokenizer, MistralTokenizer):
_test_processing_correctness_mistral(
_test_processing_correctness_one(
model_config,
tokenizer,
prompt,
......@@ -121,58 +132,51 @@ def _test_processing_correctness(
baseline_processor,
cached_processor,
batch_idx,
ignore_mm_keys=ignore_mm_keys,
)
else:
_test_processing_correctness_hf(
model_config,
tokenizer,
prompt,
mm_data,
baseline_processor,
cached_processor,
batch_idx,
ignore_mm_keys=ignore_mm_keys,
)
def _test_processing_correctness_hf(
# For some multimodal models, tokenizer will always add bos_token
# at the beginning of prompt by default, causing hf_processor outputs
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES = {
"mllama": False,
"ovis": False,
"ultravox": False,
"whisper": False,
}
_IGNORE_MM_KEYS = {
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
"ultravox": {"audio_features"},
}
def _test_processing_correctness_one(
model_config: ModelConfig,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
prompt: str,
tokenizer: AnyTokenizer,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
ignore_mm_keys: Optional[set[str]] = None,
):
if model_config.hf_config.model_type in ("mllama", "ovis", "ultravox",
"whisper"):
# For some multimodal models, tokenizer will always add bos_token
# at the beginning of prompt by default, causing hf_processor outputs
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
token_prompt = tokenizer.encode(prompt, add_special_tokens=False)
else:
token_prompt = tokenizer.encode(prompt)
model_type = model_config.hf_config.model_type
ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
baseline_result = baseline_processor.apply(
prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_result = cached_processor.apply(
if isinstance(prompt, str):
text_prompt = prompt
token_prompt = encode_tokens(
tokenizer,
prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
_assert_inputs_equal(
baseline_result,
cached_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
)
else:
# Mistral does not support decode_tokens with skip_special_tokens=False
text_prompt = None
token_prompt = prompt
baseline_tokenized_result = baseline_processor.apply(
token_prompt,
......@@ -180,13 +184,6 @@ def _test_processing_correctness_hf(
hf_processor_mm_kwargs={},
)
_assert_inputs_equal(
baseline_result,
baseline_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
)
cached_tokenized_result = cached_processor.apply(
token_prompt,
mm_data=mm_data,
......@@ -194,53 +191,45 @@ def _test_processing_correctness_hf(
)
_assert_inputs_equal(
cached_result,
baseline_tokenized_result,
cached_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
msg=f"Failed ({batch_idx=}, {token_prompt=}, {mm_data=})",
)
def _test_processing_correctness_mistral(
model_config: ModelConfig,
tokenizer: MistralTokenizer,
prompt: str,
mm_data: MultiModalDataDict,
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
ignore_mm_keys: Optional[set[str]] = None,
):
images = mm_data.get("image", [])
if not isinstance(images, list):
images = [images]
request = ChatCompletionRequest(messages=[
UserMessage(content=[
TextChunk(text=prompt),
*(ImageChunk(image=image) for image in images),
]),
])
res = tokenizer.mistral.encode_chat_completion(request)
token_prompt = res.tokens
# Mistral chat outputs tokens directly, rather than text prompts
baseline_tokenized_result = baseline_processor.apply(
token_prompt,
if text_prompt is not None:
baseline_text_result = baseline_processor.apply(
text_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_tokenized_result = cached_processor.apply(
token_prompt,
cached_text_result = cached_processor.apply(
text_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
_assert_inputs_equal(
baseline_text_result,
cached_text_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, {mm_data=})",
)
_assert_inputs_equal(
baseline_text_result,
baseline_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
f"{token_prompt=}, {mm_data=})",
)
_assert_inputs_equal(
cached_text_result,
cached_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
f"{token_prompt=}, {mm_data=})",
)
......@@ -258,6 +247,7 @@ def _test_processing_correctness_mistral(
"ibm-granite/granite-speech-3.3-8b",
"h2oai/h2ovl-mississippi-800m",
"OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL3-1B",
"HuggingFaceM4/Idefics3-8B-Llama3",
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"moonshotai/Kimi-VL-A3B-Instruct",
......@@ -280,6 +270,7 @@ def _test_processing_correctness_mistral(
"AIDC-AI/Ovis2-1B",
"google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448",
"microsoft/Phi-3.5-vision-instruct",
"microsoft/Phi-4-multimodal-instruct",
"mistralai/Pixtral-12B-2409",
"mistral-community/pixtral-12b",
......@@ -303,41 +294,6 @@ def test_processing_correctness(
num_batches: int,
simplify_rate: float,
):
ignore_mm_keys = None
if 'ultravox' in model_id:
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
ignore_mm_keys = {"audio_features"}
_test_processing_correctness(
model_id,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
ignore_mm_keys=ignore_mm_keys,
)
# yapf: disable
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness_phi3v(
model_id: str,
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
# HACK - this is an attempted workaround for the following bug
# https://github.com/huggingface/transformers/issues/34307
from transformers import AutoImageProcessor # noqa: F401
from transformers import AutoProcessor # noqa: F401
AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
_test_processing_correctness(
model_id,
hit_rate=hit_rate,
......@@ -356,16 +312,10 @@ def _assert_inputs_equal(
if ignore_mm_keys is None:
ignore_mm_keys = set()
if msg is None:
assert "mm_kwargs" in a and "mm_kwargs" in b
else:
assert "mm_kwargs" in a and "mm_kwargs" in b, msg
for key in ignore_mm_keys:
a["mm_kwargs"].pop(key, None)
b["mm_kwargs"].pop(key, None)
if msg is None:
assert a == b
else:
assert a == b, msg
......@@ -49,7 +49,7 @@ def test_profiling(
] * max_num_seqs
mm_kwargs = processor.apply(
prompt=dummy_mm_data.prompt_text,
prompt=dummy_mm_data.prompt,
mm_data=dummy_mm_data.mm_data,
hf_processor_mm_kwargs=dict(),
)["mm_kwargs"]
......
......@@ -78,8 +78,12 @@ DOLPHIN_CONFIG = GGUFTestConfig(
)
MODELS = [
LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG,
DOLPHIN_CONFIG
LLAMA_CONFIG,
QWEN2_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
# STABLELM_CONFIG, # enable this when v1 support head_size=80
DOLPHIN_CONFIG,
# STARCODER_CONFIG, # broken
]
......
......@@ -41,8 +41,8 @@ EXPECTED_STRS_MAP = {
reason=
"Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system.")
@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
reason="nvfp4 is not supported on this GPU type.")
@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
reason="modelopt_fp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
model = LLM(
......@@ -50,7 +50,7 @@ def test_models(example_prompts, model_name) -> None:
max_model_len=MAX_MODEL_LEN,
trust_remote_code=True,
enforce_eager=True,
quantization="nvfp4",
quantization="modelopt_fp4",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
......
......@@ -8,6 +8,8 @@ import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.config import TokenizerMode
@dataclass(frozen=True)
class _HfExamplesInfo:
......@@ -20,7 +22,7 @@ class _HfExamplesInfo:
tokenizer: Optional[str] = None
"""Set the tokenizer to load for this architecture."""
tokenizer_mode: str = "auto"
tokenizer_mode: TokenizerMode = "auto"
"""Set the tokenizer type for this architecture."""
speculative_model: Optional[str] = None
......@@ -55,9 +57,18 @@ class _HfExamplesInfo:
trust_remote_code: bool = False
"""The ``trust_remote_code`` level required to load the model."""
v0_only: bool = False
"""The model is only available with the vLLM V0 engine."""
hf_overrides: dict[str, Any] = field(default_factory=dict)
"""The ``hf_overrides`` required to load the model."""
max_model_len: Optional[int] = None
"""
The maximum model length to use for this model. Some models default to a
length that is too large to fit into memory in CI.
"""
def check_transformers_version(
self,
*,
......@@ -124,7 +135,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat",
trust_remote_code=True),
"BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B",
extras={"tiny": "hmellor/bamba-tiny-random"}), # noqa: E501
extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
{"1b": "bigscience/bloomz-1b1"}),
"ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
......@@ -147,6 +158,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct",
is_available_online=False,
min_transformers_version="4.52.2"),
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
"Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
......@@ -212,10 +226,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
trust_remote_code=True),
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
trust_remote_code=True),
trust_remote_code=True,
v0_only=True),
"PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
trust_remote_code=True),
"Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
......@@ -231,7 +246,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
is_available_online=False),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
is_available_online=False),
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
v0_only=True),
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
......@@ -300,7 +316,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501
extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501
v0_only=True),
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501
......@@ -319,15 +336,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
max_transformers_version="4.48", # noqa: E501
transformers_version_reason="HF model is not compatible."), # noqa: E501
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501
extras={"2B": "OpenGVLab/InternVL2-2B",
"3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
trust_remote_code=True),
trust_remote_code=True,
v0_only=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
min_transformers_version="4.51"),
min_transformers_version="4.51",
max_model_len=10240),
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
"mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}), # noqa: E501
......@@ -346,7 +366,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501
trust_remote_code=True),
"MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501
trust_remote_code=True),
trust_remote_code=True,
v0_only=True),
"Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503", # noqa: E501
extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}), # noqa: E501
"MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
......@@ -379,6 +400,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501
"Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B",
min_transformers_version="4.52"),
"Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ", # noqa: E501
min_transformers_version="4.52"),
"SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
"SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
......
......@@ -15,12 +15,12 @@ from .registry import HF_EXAMPLE_MODELS
@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
def test_can_initialize(model_arch):
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
# Avoid OOM
# Avoid OOM and reduce initialization time by only using 1 layer
def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
hf_config.update(model_info.hf_overrides)
......@@ -34,6 +34,12 @@ def test_can_initialize(model_arch):
"num_local_experts": 2,
})
if hasattr(hf_config, "vision_config"):
hf_config.vision_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
})
return hf_config
# Avoid calling model.forward()
......@@ -46,7 +52,7 @@ def test_can_initialize(model_arch):
scheduler_kv_cache_config = get_kv_cache_config(
vllm_config,
kv_cache_specs[0],
20 * GiB_bytes,
10 * GiB_bytes,
)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
......@@ -55,7 +61,9 @@ def test_can_initialize(model_arch):
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
_initialize_kv_caches_v0),
patch.object(V1EngineCore, "_initialize_kv_caches",
_initialize_kv_caches_v1)):
_initialize_kv_caches_v1), monkeypatch.context() as m):
if model_info.v0_only:
m.setenv("VLLM_USE_V1", "0")
LLM(
model_info.default,
tokenizer=model_info.tokenizer,
......@@ -65,6 +73,7 @@ def test_can_initialize(model_arch):
"num_speculative_tokens": 1,
} if model_info.speculative_model else None,
trust_remote_code=model_info.trust_remote_code,
max_model_len=model_info.max_model_len,
load_format="dummy",
hf_overrides=hf_overrides,
)
......@@ -4,6 +4,7 @@ import pytest
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import convert_image_mode
from ..utils import create_new_process_for_each_test
......@@ -58,7 +59,7 @@ def test_oot_registration_embedding(
assert all(v == 0 for v in output.outputs.embedding)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
@create_new_process_for_each_test()
......
# SPDX-License-Identifier: Apache-2.0
"""Test the functionality of the Transformers backend."""
from typing import Any, Optional, Union
import pytest
from vllm.platforms import current_platform
from ..conftest import HfRunner, VllmRunner
from ..core.block.e2e.test_correctness_sliding_window import prep_prompts
from ..utils import multi_gpu_test
from .utils import check_logprobs_close
def check_implementation(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
runner_ref: type[Union[HfRunner, VllmRunner]],
runner_test: type[VllmRunner],
example_prompts: list[str],
model: str,
kwargs_ref: Optional[dict[str, Any]] = None,
kwargs_test: Optional[dict[str, Any]] = None,
**kwargs,
):
if kwargs_ref is None:
kwargs_ref = {}
if kwargs_test is None:
kwargs_test = {}
max_tokens = 32
num_logprobs = 5
with vllm_runner(model, **kwargs) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
args = (example_prompts, max_tokens, num_logprobs)
with runner_test(model, **kwargs_test, **kwargs) as model_test:
outputs_test = model_test.generate_greedy_logprobs(*args)
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
with runner_ref(model, **kwargs_ref) as model_ref:
if isinstance(model_ref, VllmRunner):
outputs_ref = model_ref.generate_greedy_logprobs(*args)
else:
outputs_ref = model_ref.generate_greedy_logprobs_limit(*args)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
outputs_0_lst=outputs_ref,
outputs_1_lst=outputs_test,
name_0="ref",
name_1="test",
)
......@@ -58,6 +71,18 @@ def test_models(
model_impl=model_impl)
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
prompts, _, _ = prep_prompts(4, (800, 801))
kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
kwargs_test = {"model_impl": "transformers", **kwargs_ref}
check_implementation(vllm_runner,
vllm_runner,
prompts,
model="hmellor/tiny-random-Gemma2ForCausalLM",
kwargs_ref=kwargs_ref,
kwargs_test=kwargs_test)
@multi_gpu_test(num_gpus=2)
def test_distributed(
hf_runner: type[HfRunner],
......@@ -65,8 +90,11 @@ def test_distributed(
example_prompts,
):
kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
check_implementation(hf_runner, vllm_runner, example_prompts,
"meta-llama/Llama-3.2-1B-Instruct", **kwargs)
check_implementation(hf_runner,
vllm_runner,
example_prompts,
"meta-llama/Llama-3.2-1B-Instruct",
kwargs_test=kwargs)
@pytest.mark.skipif(
......
......@@ -77,3 +77,73 @@ def test_module_with_child_containing_batchnorm_can_autoload():
assert torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
def test_module_skip_prefix():
"""Ensure the auto weight loader can skip prefix."""
mod = ModuleWithNestedBatchNorm()
# Run some data through the module with batchnorm
mod(torch.Tensor([[1, 2], [3, 4]]))
# Try to load the weights to a new instance
def weight_generator():
# weights needed to be filtered out
redundant_weights = {
"prefix.bn.weight": torch.Tensor([1, 2]),
"prefix.bn.bias": torch.Tensor([3, 4]),
}
yield from (mod.state_dict() | redundant_weights).items()
new_mod = ModuleWithNestedBatchNorm()
assert not torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert not torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
loader = AutoWeightsLoader(new_mod, skip_prefixes=["prefix."])
loader.load_weights(weight_generator())
# Ensure the stats are updated
assert torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
def test_module_skip_substr():
"""Ensure the auto weight loader can skip prefix."""
mod = ModuleWithNestedBatchNorm()
# Run some data through the module with batchnorm
mod(torch.Tensor([[1, 2], [3, 4]]))
# Try to load the weights to a new instance
def weight_generator():
# weights needed to be filtered out
redundant_weights = {
"nested_mod.0.substr.weight": torch.Tensor([1, 2]),
"nested_mod.0.substr.bias": torch.Tensor([3, 4]),
"nested_mod.substr.weight": torch.Tensor([1, 2]),
"nested_mod.substr.bias": torch.Tensor([3, 4]),
}
yield from (mod.state_dict() | redundant_weights).items()
new_mod = ModuleWithNestedBatchNorm()
assert not torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert not torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
loader = AutoWeightsLoader(new_mod, skip_substrs=["substr."])
loader.load_weights(weight_generator())
# Ensure the stats are updated
assert torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import numpy as np
from PIL import Image, ImageChops
from vllm.multimodal.image import convert_image_mode
ASSETS_DIR = Path(__file__).parent / "assets"
assert ASSETS_DIR.exists()
def test_rgb_to_rgb():
# Start with an RGB image.
original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
converted_image = convert_image_mode(original_image, "RGB")
# RGB to RGB should be a no-op.
diff = ImageChops.difference(original_image, converted_image)
assert diff.getbbox() is None
def test_rgba_to_rgb():
original_image = Image.open(ASSETS_DIR / "rgba.png")
original_image_numpy = np.array(original_image)
converted_image = convert_image_mode(original_image, "RGB")
converted_image_numpy = np.array(converted_image)
for i in range(original_image_numpy.shape[0]):
for j in range(original_image_numpy.shape[1]):
# Verify that all transparent pixels are converted to white.
if original_image_numpy[i][j][3] == 0:
assert converted_image_numpy[i][j][0] == 255
assert converted_image_numpy[i][j][1] == 255
assert converted_image_numpy[i][j][2] == 255
......@@ -10,6 +10,7 @@ import numpy as np
import pytest
from PIL import Image, ImageChops
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.utils import (MediaConnector,
merge_and_sort_multimodal_metadata)
......@@ -53,7 +54,7 @@ def get_supported_suffixes() -> tuple[str, ...]:
def _image_equals(a: Image.Image, b: Image.Image) -> bool:
return (np.asarray(a) == np.asarray(b.convert(a.mode))).all()
return (np.asarray(a) == np.asarray(convert_image_mode(b, a.mode))).all()
@pytest.mark.asyncio
......
# SPDX-License-Identifier: Apache-2.0
import json
import os
import shutil
import tempfile
import torch
from huggingface_hub import snapshot_download
from safetensors import safe_open
from vllm import LLM, SamplingParams
def patch_eagle_draft_with_lm_head(target_model_id: str,
draft_model_id: str) -> str:
# In NxDI, draft model checkpoint must include lm_head weights from target
# model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
# /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
# #eagle-checkpoint-compatibility
final_draft_dir = "/tmp/patched_eagle_draft"
with tempfile.TemporaryDirectory() as tmp_dir:
target_dir = snapshot_download(repo_id=target_model_id,
local_dir=os.path.join(
tmp_dir, "target"))
draft_dir = snapshot_download(repo_id=draft_model_id,
local_dir=os.path.join(tmp_dir, "draft"))
lm_head_key = "lm_head.weight"
index_path = os.path.join(target_dir, "model.safetensors.index.json")
with open(index_path) as f:
index = json.load(f)
shard_name = index["weight_map"][lm_head_key]
target_safetensor_path = os.path.join(target_dir, shard_name)
with safe_open(target_safetensor_path, framework="pt") as f:
target_lm_head = f.get_tensor(lm_head_key)
draft_path = os.path.join(draft_dir, "pytorch_model.bin")
draft_state_dict = torch.load(draft_path, map_location="cpu")
draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
torch.save(draft_state_dict, draft_path)
shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)
return final_draft_dir
def test_eagle():
patched_draft_path = patch_eagle_draft_with_lm_head(
target_model_id="meta-llama/Llama-2-7b-hf",
draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
llm = LLM(
model="meta-llama/Llama-2-7b-hf",
speculative_config={
"model": patched_draft_path,
"num_speculative_tokens": 5,
"max_model_len": 128
},
max_num_seqs=1,
max_model_len=128,
tensor_parallel_size=2,
override_neuron_config={
"enable_eagle_speculation": True,
"enable_fused_speculation": True,
"fused_qkv": True
},
)
prompts = [
"The president of the United States is",
]
outputs = llm.generate(prompts, SamplingParams(top_k=1))
expected_output = " the head of state and head of government of " \
"the United States. The president direct"
for output in outputs:
generated_text = output.outputs[0].text
print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
assert (expected_output == generated_text)
print("Neuron Eagle speculation test passed.")
......@@ -7,26 +7,58 @@ def test_mistral():
llm = LLM(model="mistralai/Mistral-7B-v0.1",
tensor_parallel_size=2,
max_num_seqs=4,
max_model_len=512,
max_model_len=128,
use_v2_block_manager=True,
override_neuron_config={
"sequence_parallel_enabled": False,
"skip_warmup": True
},
device="neuron")
})
# Send more prompts than the compiled batch size (4) and request
# varying generation lengths to test accuracy related to Neuron
# specific sequence id sorting.
prompts = [
"The president of the United States is",
"The capital of France is",
"What is Annapurna labs?",
"I believe the meaning of life is",
"Tell me a story about a brave knight",
"Hello, my name is Llama",
]
outputs = llm.generate(prompts, SamplingParams(top_k=1))
sampling_params = [
SamplingParams(top_k=1, max_tokens=10),
SamplingParams(top_k=1, max_tokens=20),
SamplingParams(top_k=1, max_tokens=30),
SamplingParams(top_k=1, max_tokens=40),
SamplingParams(top_k=1, max_tokens=50),
SamplingParams(top_k=1, max_tokens=60)
]
outputs = llm.generate(prompts, sampling_params)
expected_outputs = [
" the most powerful person in the world. He is the head of state "
"and head",
" a city of many faces. It is a city of history, culture, art"
" the most powerful person in the world. He is",
" a city of many faces. It is a city of history, culture, art, "
"fashion, and",
"\n\nAnnapurna Labs is a semiconductor company that was founded "
"in 2013 by Amazon. The company is",
" to be happy.\n\nI believe that happiness is a choice.\n\nI "
"believe that happiness is a state of mind.\n\nI believe that "
"happiness is a journey.\n\nI believe",
" who rescued a princess from a dragon.\n\nTell me a story about"
" a princess who rescued herself from a dragon.\n\nTell me a "
"story about a princess who rescued herself from a dragon and "
"then rescued a knight from",
" and I am a 10 year old male. I am a very friendly and "
"affectionate boy who loves to be around people. I am a very "
"active boy who loves to play and run around. I am a very smart "
"boy who loves to learn new things. I am a very loyal boy"
]
for expected_output, output in zip(expected_outputs, outputs):
generated_text = output.outputs[0].text
print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
assert (expected_output == generated_text)
print("Neuron Mistral test passed.")
......@@ -29,5 +29,5 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
# ignore the backend env variable if it is set
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
backend = get_attn_backend(16, torch.float16, "auto", 16, False)
assert backend.get_name() == "Dummy_Backend"
......@@ -37,12 +37,6 @@ models_pre_quant_8bit_to_test = [
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
]
models_pre_quant_8bit_to_test = [
('meta-llama/Llama-Guard-3-8B-INT8',
'read pre-quantized llama 8-bit model'),
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
]
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
......
......@@ -23,10 +23,11 @@ def test_runai_model_loader():
runai_model_streamer_tensors = {}
hf_safetensors_tensors = {}
for name, tensor in runai_safetensors_weights_iterator(safetensors):
for name, tensor in runai_safetensors_weights_iterator(
safetensors, True):
runai_model_streamer_tensors[name] = tensor
for name, tensor in safetensors_weights_iterator(safetensors):
for name, tensor in safetensors_weights_iterator(safetensors, True):
hf_safetensors_tensors[name] = tensor
assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors)
......
......@@ -5,14 +5,6 @@ from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Tensorizer only tested on V0 so far.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.fixture(autouse=True)
def cleanup():
cleanup_dist_env_and_memory(shutdown_ray=True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment