Commit 38d80967 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

parents 33650733 880c741b
......@@ -250,7 +250,7 @@ def build_video_inputs_from_test_info(
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
size_type: SizeType):
"""Applies a size scaler to one image; this can be a an image size factor,
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
# are considering size factors at constant scale, i.e., we just clone
......
......@@ -42,7 +42,7 @@ def get_filtered_test_settings(
else:
assert test_info.prompt_formatter is not None
# Everything looks okay; keep if this is has correct proc handling
# Everything looks okay; keep if this is correct proc handling
if (test_info.distributed_executor_backend
is not None) == new_proc_per_test:
matching_tests[test_name] = test_info
......
......@@ -42,7 +42,7 @@ def run_test(
tensor_parallel_size: int = 1,
vllm_embeddings: Optional[torch.Tensor] = None,
):
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
......@@ -69,6 +69,9 @@ def run_test(
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if model_info.skip_tokenizer_init:
vllm_runner_kwargs_[
"skip_tokenizer_init"] = model_info.skip_tokenizer_init
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
......
......@@ -46,7 +46,7 @@ def _run_test(
vllm_model.encode(prompt)
MODELS = ["christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"]
MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
@pytest.mark.core_model
......
......@@ -66,7 +66,9 @@ def _test_processing_correctness(
hf_overrides=model_info.hf_overrides,
# Ensure that the cache can fit all of the data
mm_processor_cache_gb=2048,
)
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
......@@ -293,6 +295,7 @@ def _test_processing_correctness_one(
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
"OpenGVLab/InternVL3_5-30B-A3B",
"Kwai-Keye/Keye-VL-8B-Preview",
"Kwai-Keye/Keye-VL-1_5-8B",
"moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"llava-hf/llava-1.5-7b-hf",
......@@ -301,6 +304,7 @@ def _test_processing_correctness_one(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"mispeech/midashenglm-7b",
"openbmb/MiniCPM-Llama3-V-2_5",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
......
......@@ -5,6 +5,7 @@ import pytest
from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
from ...utils import build_model_context
......@@ -50,3 +51,49 @@ def test_processor_override(
assert grid_t == expected_grid_t
assert video_tok_count == expected_toks_per_frame * grid_t
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
model_id: str,
fps: int,
):
"""
Ensure dynamic video loader (pre-sampled by loader) and normal video
loader (post-sampled by processor) produce same video processing outputs.
"""
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {"fps": fps}
# Build the image str / prompt based on the number of images we pass
prompt = "<|begin_of_video|><|video|><|end_of_video|>"
video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
with open(video_path, "rb") as f:
video_bytes = f.read()
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
video_bytes, requested_fps=fps)
# pre-sampled loader shouldn't read all frames
assert len(dynamic_video) < len(static_video)
static_mm_data = {"video": [(static_video, static_metadata)]}
dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
static_outputs = processor.apply(prompt, static_mm_data,
hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
hf_processor_mm_kwargs)
assert static_outputs["prompt_token_ids"] == dynamic_outputs[
"prompt_token_ids"]
assert static_outputs["mm_kwargs"].get_data(
) == dynamic_outputs["mm_kwargs"].get_data()
......@@ -52,7 +52,7 @@ def test_profiling(model_id: str, max_model_len: int):
chunks_per_image = prod(mm_data["patches_per_image"])
total_num_patches = chunks_per_image * tokens_per_patch
num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
1] # x-y seperator tokens
1] # x-y separator tokens
total_tokens = total_num_patches.item() + num_tiles.item(
) + 3 # image start, image, image end
......
......@@ -31,6 +31,7 @@ from ...utils import dummy_hf_overrides
ARCH_TO_SKIP = {
"MolmoForCausalLM": "incompatible requirements",
"Florence2ForConditionalGeneration": "not supported in V1",
}
ARCH_NEEDS_EXTRAS = [
"InternVLChatModel",
......@@ -41,9 +42,6 @@ ARCH_NEEDS_EXTRAS = [
]
REPO_ID_TO_SKIP = {
"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",
# FIXME(Isotr0py): enable GPT-OSS based InternVL3.5 model
# after support PP for GPT-OSS
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview": "Broken model",
}
ImageInput = list[Image.Image]
......@@ -199,7 +197,9 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=hf_overrides_fn,
)
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
......
......@@ -59,7 +59,9 @@ def test_hf_model_weights_mapper(model_arch: str):
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
)
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
original_weights = create_repo_dummy_weights(model_id)
......
......@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
from typing import Any, Literal, Optional
import pytest
import torch
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.config import TokenizerMode
from vllm.config import ModelDType, TokenizerMode
@dataclass(frozen=True)
......@@ -47,6 +48,23 @@ class _HfExamplesInfo:
The reason for the minimum/maximum version requirement.
"""
skip_tokenizer_init: bool = False
"""
If true, skip initialization of tokenizer and detokenizer.
"""
dtype: ModelDType = "auto"
"""
The data type for the model weights and activations.
"""
enforce_eager: bool = False
"""
Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
"""
is_available_online: bool = True
"""
Set this to ``False`` if the name of this architecture no longer exists on
......@@ -76,6 +94,15 @@ class _HfExamplesInfo:
If not specified, the default revision will be used.
"""
max_num_seqs: Optional[int] = None
"""Maximum number of sequences to be processed in a single iteration."""
use_original_num_layers: bool = False
"""
If True, use the original number of layers from the model config
instead of minimal layers for testing.
"""
def check_transformers_version(
self,
*,
......@@ -137,7 +164,7 @@ class _HfExamplesInfo:
# yapf: disable
_TEXT_GENERATION_EXAMPLE_MODELS = {
# [Decoder-only]
"ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B",
"ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-2509",
min_transformers_version="4.56.0",
trust_remote_code=True),
"AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B",
......@@ -154,7 +181,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5",
trust_remote_code=True),
"BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1",
min_transformers_version="4.56.0",
min_transformers_version="4.55.3",
extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
{"1b": "bigscience/bloomz-1b1"}),
......@@ -208,7 +235,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"),
"GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
"GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
"GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"), # noqa: E501
"GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501
min_transformers_version="4.55.3"),
"GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"), # noqa: E501
"Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
trust_remote_code=True),
......@@ -228,7 +256,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
min_transformers_version="4.56.0",
min_transformers_version="4.55.3",
extras={
"tiny": "ai21labs/Jamba-tiny-dev",
"random": "ai21labs/Jamba-tiny-random", # noqa: E501
......@@ -244,7 +272,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
is_available_online=False),
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
min_transformers_version="4.55.3",
extras={
"random": "yujiepan/mamba2-codestral-v0.1-tiny-random", # noqa: E501
}),
"FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501
"MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
trust_remote_code=True),
......@@ -259,7 +291,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
"MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1", # noqa: E501
{"tiny": "TitanML/tiny-mixtral"}), # noqa: E501
"QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501
"MotifForCausalLM": _HfExamplesInfo("Motif-Technologies/Motif-2.6B",
trust_remote_code=True,
v0_only=True),
"MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
"MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
"NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
......@@ -282,8 +316,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
trust_remote_code=True),
"Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
max_transformers_version="4.53",
transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings", # noqa: E501
trust_remote_code=True),
"QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
max_transformers_version="4.53",
......@@ -294,6 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
"Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
"Qwen3NextForCausalLM": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
min_transformers_version="4.56.2"),
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
"SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct", # noqa: E501
trust_remote_code=True,
......@@ -328,6 +362,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
# [Text-only]
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), # noqa: E501
"Gemma3TextModel": _HfExamplesInfo("google/embeddinggemma-300m"),
"GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
"GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
trust_remote_code=True),
......@@ -359,7 +394,20 @@ _EMBEDDING_EXAMPLE_MODELS = {
trust_remote_code=True),
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
"PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
is_available_online=False), # noqa: E501
dtype=torch.float16,
enforce_eager=True,
skip_tokenizer_init=True,
# This is to avoid the model
# going OOM in CI
max_num_seqs=32,
),
"Terratorch": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
dtype=torch.float16,
enforce_eager=True,
skip_tokenizer_init=True,
# This is to avoid the model going OOM in CI
max_num_seqs=32,
),
}
_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
......@@ -438,6 +486,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), # noqa: E501
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
trust_remote_code=True),
"KeyeVL1_5ForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-1_5-8B", # noqa: E501
trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
trust_remote_code=True),
......@@ -455,6 +505,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
max_transformers_version="4.48", # noqa: E501
transformers_version_reason="HF model is not compatible.", # noqa: E501
hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501
"MiDashengLMModel": _HfExamplesInfo("mispeech/midashenglm-7b",
trust_remote_code=True),
"MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
trust_remote_code=True),
"MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
......@@ -474,6 +526,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501
trust_remote_code=True),
"NemotronH_Nano_VL": _HfExamplesInfo("nano_vl_dummy",
is_available_online=False,
trust_remote_code=True),
"Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True,
max_transformers_version="4.53",
transformers_version_reason="HF model is not compatible", # noqa: E501
......@@ -554,19 +609,21 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random",
speculative_model="eagle618/eagle-deepseek-v3-random", # noqa: E501
trust_remote_code=True),
"EagleLlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B",
"EagleLlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B-Instruct", # noqa: E501
trust_remote_code=True,
speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501
"Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501
"Eagle3LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.1-8B-Instruct", # noqa: E501
trust_remote_code=True,
speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501
tokenizer="meta-llama/Llama-3.1-8B-Instruct",
use_original_num_layers=True,
max_model_len=10240),
"LlamaForCausalLMEagle3": _HfExamplesInfo("Qwen/Qwen3-8B", # noqa: E501
trust_remote_code=True,
speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
# TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501
# "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501
# trust_remote_code=True,
# speculative_model="AngelSlim/Qwen3-8B_eagle3", # noqa: E501
# tokenizer="Qwen/Qwen3-8B"),
speculative_model="AngelSlim/Qwen3-8B_eagle3", # noqa: E501
tokenizer="Qwen/Qwen3-8B",
use_original_num_layers=True),
"EagleLlama4ForCausalLM": _HfExamplesInfo(
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
trust_remote_code=True,
......@@ -586,7 +643,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
is_available_online=False),
"MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
trust_remote_code=True,
speculative_model="XiaomiMiMo/MiMo-7B-RL")
speculative_model="XiaomiMiMo/MiMo-7B-RL"),
"Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
min_transformers_version="4.56.2"),
}
_TRANSFORMERS_BACKEND_MODELS = {
......
......@@ -36,7 +36,10 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
hf_overrides_fn = partial(dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides)
exist_overrides=model_info.hf_overrides,
use_original_num_layers=getattr(
model_info, 'use_original_num_layers',
False))
# Avoid calling model.forward()
def _initialize_kv_caches_v0(self) -> None:
......@@ -60,19 +63,29 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
_initialize_kv_caches_v1), monkeypatch.context() as m):
if model_info.v0_only:
m.setenv("VLLM_USE_V1", "0")
if model_arch == "Phi4FlashForCausalLM":
# Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
# Phi4FlashForCausalLM and MotifForCausalLM
# only supports DIFFERENTIAL_FLASH_ATTN backend
m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
if model_arch == "GptOssForCausalLM":
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
if model_arch == "Florence2ForConditionalGeneration":
# An encoder-decoder model that's V0-only. Just skip it
# since V0 is about to be removed.
pytest.skip("Skipping Florence2ForConditionalGeneration")
if model_arch == "WhisperForConditionalGeneration":
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
LLM(
model_info.default,
tokenizer=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
enforce_eager=model_info.enforce_eager,
skip_tokenizer_init=model_info.skip_tokenizer_init,
dtype=model_info.dtype,
speculative_config={
"model": model_info.speculative_model,
"num_speculative_tokens": 1,
......@@ -85,7 +98,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
model_impl=ModelImpl.TRANSFORMERS
if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
hf_overrides=hf_overrides_fn,
)
max_num_seqs=model_info.max_num_seqs)
@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from tests.conftest import VllmRunner
from vllm.utils import set_default_torch_num_threads
@pytest.mark.parametrize(
"model",
[
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
"mgazz/Prithvi_v2_eo_300_tl_unet_agb"
],
)
def test_inference(
vllm_runner: type[VllmRunner],
model: str,
) -> None:
pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
prompt = dict(prompt_token_ids=[1],
multi_modal_data=dict(pixel_values=pixel_values,
location_coords=location_coords))
with (
set_default_torch_num_threads(1),
vllm_runner(
model,
runner="pooling",
dtype=torch.float16,
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
) as vllm_model,
):
vllm_output = vllm_model.llm.encode(prompt)
assert torch.equal(
torch.isnan(vllm_output[0].outputs.data).any(),
torch.tensor(False))
......@@ -294,6 +294,8 @@ def build_model_context(
limit_mm_per_prompt=limit_mm_per_prompt,
mm_processor_cache_gb=mm_processor_cache_gb,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
**model_config_kwargs,
)
return InputContext(model_config)
......@@ -345,6 +347,7 @@ class ModelInfo:
name: str
architecture: str = ""
dtype: str = "auto"
hf_dtype: str = "float32"
hf_overrides: Optional[dict[str, Any]] = None
default_pooling_type: str = ""
enable_test: bool = True
......@@ -352,6 +355,7 @@ class ModelInfo:
@dataclass
class EmbedModelInfo(ModelInfo):
mteb_score: Optional[float] = None
is_matryoshka: bool = False
matryoshka_dimensions: Optional[list[int]] = None
......@@ -368,7 +372,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):
@dataclass
class RerankModelInfo(ModelInfo):
pass
mteb_score: Optional[float] = None
@dataclass
......@@ -381,11 +385,18 @@ class LASTPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "LAST"
@dataclass
class GenerateModelInfo(ModelInfo):
hf_dtype: str = "auto"
hf_ppl: Optional[float] = None
def dummy_hf_overrides(
hf_config: PretrainedConfig,
*,
model_arch: str = "",
exist_overrides: Optional[dict[str, Any]] = None,
use_original_num_layers: bool = False,
) -> PretrainedConfig:
"""
Dummy HF overrides function used to create dummy model
......@@ -402,10 +413,18 @@ def dummy_hf_overrides(
# we use three layers for Gemma-3n to check
# both normal layer and kv_shared_layer
num_hidden_layers = (3 if model_arch == "Gemma3nForConditionalGeneration"
else 1)
if use_original_num_layers:
# Use the original number of layers from the config
num_layers = getattr(text_config, 'num_layers', 1)
num_hidden_layers = getattr(text_config, 'num_hidden_layers', 1)
else:
# Use minimal layers for testing
num_layers = 1
num_hidden_layers = (3 if model_arch
== "Gemma3nForConditionalGeneration" else 1)
text_config.update({
"num_layers": 1,
"num_layers": num_layers,
"num_hidden_layers": num_hidden_layers,
"num_experts": num_experts,
"num_experts_per_tok": 2,
......
......@@ -31,11 +31,11 @@ if TYPE_CHECKING:
from vllm.multimodal.inputs import MultiModalPlaceholderDict
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
TEST_IMAGE_ASSETS = [
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
TEST_VIDEO_URLS = [
......@@ -45,12 +45,11 @@ TEST_VIDEO_URLS = [
@pytest.fixture(scope="module")
def url_images() -> dict[str, Image.Image]:
connector = MediaConnector()
def url_images(local_asset_server) -> dict[str, Image.Image]:
return {
image_url: connector.fetch_image(image_url)
for image_url in TEST_IMAGE_URLS
image_url: local_asset_server.get_image_asset(image_url)
for image_url in TEST_IMAGE_ASSETS
}
......@@ -69,7 +68,7 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_http(image_url: str):
connector = MediaConnector()
......@@ -79,12 +78,12 @@ async def test_fetch_image_http(image_url: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: dict[str, Image.Image],
image_url: str, suffix: str):
raw_image_url: str, suffix: str):
connector = MediaConnector()
url_image = url_images[image_url]
url_image = url_images[raw_image_url]
try:
mime_type = Image.MIME[Image.registered_extensions()[suffix]]
......@@ -117,7 +116,7 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image],
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_local_files(image_url: str):
connector = MediaConnector()
......@@ -152,8 +151,8 @@ async def test_fetch_image_local_files(image_url: str):
@pytest.mark.asyncio
async def test_fetch_image_local_files_with_space_in_name():
image_url = TEST_IMAGE_URLS[0]
@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
async def test_fetch_image_local_files_with_space_in_name(image_url: str):
connector = MediaConnector()
with TemporaryDirectory() as temp_dir:
......@@ -205,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
assert metadata_sync == metadata_async
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
video_url: str, max_duration: int, requested_fps: int,
monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
connector = MediaConnector(
media_io_kwargs={
"video": {
"max_duration": max_duration,
"requested_fps": requested_fps,
}
})
video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(
video_url)
assert np.array_equal(video_sync, video_async)
assert metadata_sync == metadata_async
assert metadata_sync["video_backend"] == "opencv_dynamic"
# Used for `test_argsort_mm_positions`.
class TestCase(NamedTuple):
mm_positions: "MultiModalPlaceholderDict"
......@@ -458,7 +483,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
with torch.inference_mode():
sharded_output = run_dp_sharded_vision_model(image_input, vision_model)
# Check that the world size is setup correctly
# Check that the world size is set up correctly
assert get_tensor_model_parallel_world_size() == world_size
# Check that the outputs have the same shape
......@@ -636,11 +661,13 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
# Run the model through the sharded function
with torch.inference_mode():
sharded_output = run_dp_sharded_mrope_vision_model(
vision_model, pixel_values, grid_thw_list)
sharded_output = run_dp_sharded_mrope_vision_model(vision_model,
pixel_values,
grid_thw_list,
rope_type="rope_3d")
sharded_output = torch.cat(sharded_output, dim=0)
# Check that the world size is setup correctly
# Check that the world size is set up correctly
assert get_tensor_model_parallel_world_size() == world_size
# Compare outputs (only on rank 0)
......@@ -691,8 +718,10 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(
# Should handle empty input gracefully
with torch.inference_mode():
output = run_dp_sharded_mrope_vision_model(vision_model, pixel_values,
grid_thw_list)
output = run_dp_sharded_mrope_vision_model(vision_model,
pixel_values,
grid_thw_list,
rope_type="rope_3d")
assert len(output) == 0
......@@ -745,8 +774,10 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
# Should handle uneven distribution without errors
with torch.inference_mode():
output_tuple = run_dp_sharded_mrope_vision_model(
vision_model, pixel_values, grid_thw_list)
output_tuple = run_dp_sharded_mrope_vision_model(vision_model,
pixel_values,
grid_thw_list,
rope_type="rope_3d")
# Verify output shape is reasonable
merge_factor = vision_model.spatial_merge_size**2
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import torch.nn.functional as F
from vllm.model_executor.layers.activation import FastGELU, SiluAndMul
from vllm.platforms import current_platform
@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"])
@pytest.mark.parametrize("num_tokens,d,dtype", [
(7, 512, torch.half),
(7, 512, torch.float),
(83, 512, torch.half),
])
@torch.inference_mode()
def test_act_and_mul(
activation: str,
num_tokens: int,
d: int,
dtype: torch.dtype,
) -> None:
import torch_xla.core.xla_model as xm
device = xm.xla_device()
current_platform.seed_everything(0)
torch.set_default_device("cpu")
x = torch.randn(num_tokens, 2 * d, dtype=dtype).to(device=device)
if activation == "silu_and_mul":
layer = SiluAndMul()
fn = layer.forward_native
elif activation == "gelu_fast":
layer = FastGELU()
fn = F.gelu
else:
raise NotImplementedError(
f"activation {activation} is not implemented.")
assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
out = layer.to(device=device).forward_neuron(x)
ref_out = fn(x.cpu())
torch.testing.assert_close(out.cpu(), ref_out, atol=0.01, rtol=0.0)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import neuronxcc.nki.language as nl
import pytest
import torch
import torch.nn.functional as F
from neuronxcc import nki
from vllm.attention.ops.nki_flash_attn import (
load_block_tables, transform_block_tables_for_indirect_load)
def is_power_of_2(n):
return n > 0 and (n & (n - 1) == 0)
def nki_load_and_transform_block_tables(
block_tables,
num_tiles,
num_blocks_per_tile,
num_head,
head_id,
block_size_tiling_factor,
):
assert is_power_of_2(
num_blocks_per_tile), f"{num_blocks_per_tile=} must be power of 2"
block_tables_sbuf = load_block_tables(block_tables, num_tiles,
num_blocks_per_tile)
# we need to pass an Index as head_id
head_id = nl.arange(1)[None, :] + head_id
block_tables_transposed = transform_block_tables_for_indirect_load(
block_tables_sbuf, block_size_tiling_factor, num_head, head_id)
B_P_SIZE = 128
assert block_tables_transposed.shape[1] == B_P_SIZE
out = nl.ndarray(
block_tables_transposed.shape,
dtype=nl.int32,
buffer=nl.shared_hbm,
)
for i in nl.affine_range(block_tables_transposed.shape[0]):
nl.store(dst=out[i], value=block_tables_transposed[i])
return out
def ref_block_tables_transform(
block_tables,
num_tiles,
num_blocks_per_tile,
num_head,
head_id,
block_size_tiling_factor,
):
assert block_tables.numel() == num_tiles * num_blocks_per_tile
block_tables = block_tables.view(num_tiles, num_blocks_per_tile)
B_F_SIZE = 128
num_tiles_padded = (num_tiles + B_F_SIZE - 1) // B_F_SIZE * B_F_SIZE
block_tables = F.pad(
block_tables,
(0, 0, 0, num_tiles_padded - num_tiles),
"constant",
0,
)
block_tables = block_tables * num_head + head_id
block_tables = block_tables.view(num_tiles_padded, num_blocks_per_tile, 1)
offset = torch.arange(0, block_size_tiling_factor).view(1, 1, -1)
block_tables = block_tables * block_size_tiling_factor + offset
block_tables_transposed = block_tables.view(num_tiles_padded, -1).t()
num_blocks_per_tile = block_tables_transposed.shape[0]
assert num_blocks_per_tile % B_F_SIZE == 0
return block_tables_transposed.view(num_blocks_per_tile // B_F_SIZE,
B_F_SIZE, num_tiles_padded)
@pytest.mark.parametrize(
"q_head_per_kv_head,head_id",
[
(1, 0),
(3, 1),
],
)
@pytest.mark.parametrize(
"num_tiles,num_blocks_per_tile",
[
(1, 1),
(13, 16),
(17, 128),
(35, 512),
(128, 128),
(130, 64),
(280, 256),
(315, 1),
],
)
@torch.inference_mode()
def test_load_and_transform_block_tables(
monkeypatch: pytest.MonkeyPatch,
num_tiles,
num_blocks_per_tile,
q_head_per_kv_head,
head_id,
) -> None:
import torch_xla.core.xla_model as xm
device = xm.xla_device()
compiler_flags_str = " ".join([
"-O1",
"--retry_failed_compilation",
])
with monkeypatch.context() as m:
m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(10000)
torch.set_printoptions(sci_mode=False)
# On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
B_P_SIZE = 128
if num_blocks_per_tile < B_P_SIZE:
assert B_P_SIZE % num_blocks_per_tile == 0
block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
else:
block_size_tiling_factor = 1
max_num_blocks = 100000
block_tables = torch.randint(
0,
max_num_blocks,
(num_tiles * num_blocks_per_tile, ),
dtype=torch.int32,
)
nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
block_tables.to(device=device),
num_tiles,
num_blocks_per_tile,
q_head_per_kv_head,
head_id,
block_size_tiling_factor,
).cpu()
ref_out = ref_block_tables_transform(
block_tables,
num_tiles,
num_blocks_per_tile,
q_head_per_kv_head,
head_id,
block_size_tiling_factor,
)
assert (nki_out.shape == ref_out.shape
), f"{nki_out.shape=} != {ref_out.shape=}"
assert torch.all(nki_out == ref_out)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.attention.ops.nki_flash_attn import reshape_and_cache
@pytest.mark.parametrize(
"num_tokens, n_kv_head, d_head, num_blocks, block_size",
[
# Small model configuration (e.g., GPT-2 small)
(32, 12, 64, 4, 128), # Typical sequence processing
(1, 12, 64, 4, 128), # Single token update
(128, 12, 64, 4, 128), # Longer sequence
# Medium model configuration (e.g., GPT-2 medium)
(64, 16, 96, 8, 256), # Standard batch
(256, 16, 96, 8, 256), # Large batch
# Large model configuration (e.g., GPT-3 style)
(48, 32, 128, 16, 512), # Typical processing window
(512, 32, 128, 16, 512), # Full context window
# Edge cases and stress tests
(1024, 8, 32, 32, 32), # Many tokens, small heads
(16, 64, 256, 4, 64), # Few tokens, many heads
(2048, 24, 128, 64, 128), # Large scale test
# Minimal configurations for debugging
(4, 2, 16, 2, 16), # Tiny test case
(1, 1, 8, 1, 8), # Minimal possible
])
def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
block_size):
# Set random seed for reproducibility
torch.manual_seed(42)
# Create CPU tensors for reference implementation
key_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
torch.tensor(d_head))
value_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
torch.tensor(d_head))
key_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
value_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
slot_mapping_cpu = torch.randperm(num_blocks * block_size)[:num_tokens]
# Run reference implementation on CPU
block_indices = torch.div(slot_mapping_cpu,
block_size,
rounding_mode="floor")
block_offsets = slot_mapping_cpu % block_size
for i in range(num_tokens):
block_idx = block_indices[i]
block_offset = block_offsets[i]
key_cache_cpu[block_idx, :, block_offset, :] = key_cpu[i]
value_cache_cpu[block_idx, :, block_offset, :] = value_cpu[i]
# Create XLA device tensors
device = torch.device('xla')
key = key_cpu.to(device)
value = value_cpu.to(device)
key_cache = torch.zeros_like(key_cache_cpu, device=device)
value_cache = torch.zeros_like(value_cache_cpu, device=device)
slot_mapping = slot_mapping_cpu.to(device)
kv_cache = torch.stack([key_cache, value_cache])
# Run vectorized implementation on XLA device
reshape_and_cache(key, value, kv_cache, slot_mapping)
key_cache, value_cache = torch.unbind(kv_cache, dim=0)
# Move results back to CPU for comparison
key_cache_result = key_cache.cpu()
value_cache_result = value_cache.cpu()
# Assert results match
torch.testing.assert_close(key_cache_result,
key_cache_cpu,
rtol=1e-5,
atol=1e-5)
torch.testing.assert_close(value_cache_result,
value_cache_cpu,
rtol=1e-5,
atol=1e-5)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
@pytest.mark.parametrize("num_tokens,hidden_size,add_residual,dtype", [
(7, 8, False, torch.half),
(83, 768, False, torch.half),
(83, 768, True, torch.half),
(83, 768, True, torch.bfloat16),
(83, 768, True, torch.float32),
])
@torch.inference_mode()
def test_rms_norm(
num_tokens: int,
hidden_size: int,
add_residual: bool,
dtype: torch.dtype,
) -> None:
import torch_xla.core.xla_model as xm
device = xm.xla_device()
current_platform.seed_everything(0)
torch.set_default_device("cpu")
layer = RMSNorm(hidden_size).to(dtype=dtype)
layer.weight.data.normal_(mean=1.0, std=0.1)
scale = 1 / (2 * hidden_size)
x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device=device)
x *= scale
residual = torch.randn_like(x) * scale if add_residual else None
residual_cpu = residual.cpu() if add_residual else None
ref_out = layer.to(device="cpu").forward_native(x.cpu(), residual_cpu)
assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
out = layer.to(device=device)(x, residual)
# NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
# numerical errors than other operators because they involve reductions.
# Therefore, we use a larger tolerance.
if add_residual:
assert out[0].is_xla, "output tensor is expected to be XLA tensor"
torch.testing.assert_close(out[0].cpu(),
ref_out[0],
atol=1e-2,
rtol=1e-2)
torch.testing.assert_close(out[1].cpu(),
ref_out[1],
atol=1e-2,
rtol=1e-2)
else:
assert out.is_xla, "output tensor is expected to be XLA tensor"
torch.testing.assert_close(out.cpu(), ref_out, atol=1e-2, rtol=1e-2)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from unittest.mock import patch
import pytest
import torch
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_pin_memory_available
class MockLogitsProcessor(LogitsProcessor):
def __init__(self, vocab_size: int, scale: float,
fake_logits: torch.Tensor):
super().__init__(vocab_size=vocab_size, scale=scale)
self.fake_logits = fake_logits.clone()
def forward(self, *args, **kwargs):
with patch(
"vllm.model_executor.layers.logits_processor._prune_hidden_states",
lambda x, y: x
), patch(
"vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
lambda *args, **kwargs: self.fake_logits):
return super().forward(*args, **kwargs)
def _prepare_test(
batch_size: int
) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
vocab_size = 32000
input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
fake_logits = torch.full((batch_size, vocab_size),
1e-2,
dtype=input_tensor.dtype)
logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits)
return input_tensor, fake_logits, logits_processor
RANDOM_SEEDS = list(range(8))
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_logits_processors(seed: int):
import torch_xla.core.xla_model as xm
device = xm.xla_device()
set_random_seed(seed)
torch.set_default_device("cpu")
batch_size = random.randint(1, 256)
input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)
# This sample logits processor gives infinite score to the i-th token,
# where i is the length of the input sequence.
# We therefore expect the output token sequence to be [0, 1, 2, ...]
def pick_ith(token_ids, logits):
logits[len(token_ids)] = float("inf")
return logits
seq_group_metadata_list = []
seq_lens = []
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
request_id=f"test_{i}",
is_prompt=True,
seq_data={0: SequenceData.from_seqs([1, 2, 3])},
sampling_params=SamplingParams(temperature=0,
logits_processors=[pick_ith]),
block_tables={0: [1]},
))
seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
sampling_metadata = SamplingMetadata.prepare(
seq_group_metadata_list,
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available())
logits_processor_output = logits_processor(
lm_head=None,
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
fake_logits *= logits_processor.scale
torch.testing.assert_close(logits_processor_output[:, 1],
fake_logits[:, 1],
rtol=1e-4,
atol=0.0)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from unittest.mock import MagicMock
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.platforms.neuron import NeuronFramework
from vllm.sampling_params import SamplingParams
from vllm.sequence import SequenceData, SequenceGroupMetadata
from vllm.worker.neuron_model_runner import NeuronModelRunner
os.environ[
'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value
def _create_neuron_model_runner(model: str, *args,
**kwargs) -> NeuronModelRunner:
engine_args = EngineArgs(model, *args, **kwargs)
engine_config = engine_args.create_engine_config()
vllm_config = VllmConfig(
model_config=engine_config.model_config,
parallel_config=engine_config.parallel_config,
scheduler_config=engine_config.scheduler_config,
device_config=engine_config.device_config,
)
neuron_model_runner = NeuronModelRunner(vllm_config=vllm_config)
return neuron_model_runner
def test_update_neuron_sampling_params_not_full_batch():
os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
model_runner = _create_neuron_model_runner(
"facebook/opt-125m",
seed=0,
dtype="float16",
max_num_seqs=2,
)
assert not model_runner._on_device_sampling_disabled
# Test sampling param updating only when TNx is framework
# NxDI handles sampling parameter updating inside model
if current_platform.use_transformers_neuronx():
model_mock = MagicMock()
model_runner.model = model_mock
seq_group_metadata_list = [
SequenceGroupMetadata(
request_id="test_0",
is_prompt=True,
seq_data={0: SequenceData.from_seqs([1, 2, 3])},
sampling_params=SamplingParams(temperature=0.5,
top_k=1,
top_p=0.5),
block_tables={0: [1]},
)
]
model_runner.prepare_model_input(seq_group_metadata_list)
# Index neuron sampling parameters based on block_tables indices.
# The first block_id of the sequence 0 is 1, so its parameters are
# placed at index 1. So the sampling parameters will be:
# Index 0: default sampling parameters
# Index 1: sequecne 0's sampling parameters.
neuron_sampling_params = (
model_runner.model_config.neuron_sampling_params)
assert neuron_sampling_params.temperature == [1.0, 0.5]
assert neuron_sampling_params.top_k == [
model_runner._MAX_NEURON_SAMPLING_TOP_K, 1
]
assert neuron_sampling_params.top_p == [1.0, 0.5]
model_mock.model.update_generation_config.assert_called_once_with(
neuron_sampling_params)
def test_update_neuron_sampling_params_full_batch():
os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
model_runner = _create_neuron_model_runner(
"facebook/opt-125m",
seed=0,
dtype="float16",
max_num_seqs=2,
)
assert not model_runner._on_device_sampling_disabled
# Test sampling param updating only when TNx is framework
# NxDI handles sampling parameter updating inside model
if current_platform.use_transformers_neuronx():
model_mock = MagicMock()
model_runner.model = model_mock
seq_group_metadata_list = [
SequenceGroupMetadata(
request_id="test_0",
is_prompt=True,
seq_data={0: SequenceData.from_seqs([1, 2, 3])},
sampling_params=SamplingParams(temperature=0.5,
top_k=1,
top_p=0.5),
block_tables={0: [1]},
),
SequenceGroupMetadata(
request_id="test_0",
is_prompt=True,
seq_data={1: SequenceData.from_seqs([4, 5, 6])},
sampling_params=SamplingParams(temperature=0.2,
top_k=2,
top_p=0.2),
block_tables={1: [0]},
)
]
model_runner.prepare_model_input(seq_group_metadata_list)
# Index neuron sampling parameters based on block_tables indices.
# The first block_id of the sequence 0 is 1, so its parameters are
# placed at index 1. So the sampling parameters will be:
# Index 0: sequence 1's sampling parameters
# Index 1: sequecne 0's sampling parameters.
neuron_sampling_params = (
model_runner.model_config.neuron_sampling_params)
assert neuron_sampling_params.temperature == [0.2, 0.5]
assert neuron_sampling_params.top_k == [2, 1]
assert neuron_sampling_params.top_p == [0.2, 0.5]
model_mock.model.update_generation_config.assert_called_once_with(
neuron_sampling_params)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment