Commit a3f8d5dd authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

parents 8d75f22e f34eca5f
...@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict ...@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.tokenizers import ( from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
MistralTokenizer, from vllm.tokenizers.mistral import MistralTokenizer
TokenizerLike,
cached_tokenizer_from_config,
)
from ....multimodal.utils import random_audio, random_image, random_video from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import ( from ...registry import (
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"])
def test_get_image_size_with_most_features(
    image_assets: ImageTestAssets, model_id: str
):
    """Check that no test image needs more tokens than the reported maximum.

    The processor is built with ``do_pan_and_scan`` enabled.  The upper bound
    is the token count of the size returned by
    ``get_image_size_with_most_features()``; each asset's token count is its
    ``num_patches`` value multiplied by the HF processor's
    ``image_seq_length``.
    """
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    hf_processor_mm_kwargs: dict[str, object] = {}
    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)

    # Upper bound: tokens required by the most feature-rich image size.
    largest_size = processor.info.get_image_size_with_most_features()
    max_tokens = processor.info.get_num_image_tokens(
        image_width=largest_size.width,
        image_height=largest_size.height,
        processor=hf_processor,
    )

    prompt = "<start_of_image>"
    image_seq_length = hf_processor.image_seq_length

    for asset in image_assets:
        processed_inputs = processor.apply(
            prompt,
            {"image": [asset.pil_image]},
            hf_processor_mm_kwargs,
        )
        num_patches_tensor = processed_inputs["mm_kwargs"].get_data()["num_patches"]
        tokens = int(num_patches_tensor.item()) * image_seq_length
        assert tokens <= max_tokens
...@@ -60,12 +60,12 @@ def test_profiling(model_id: str, max_model_len: int): ...@@ -60,12 +60,12 @@ def test_profiling(model_id: str, max_model_len: int):
total_num_patches.item() + num_tiles.item() + 3 total_num_patches.item() + num_tiles.item() + 3
) # image start, image, image end ) # image start, image, image end
profiled_tokens = profiler.get_mm_max_contiguous_tokens( profiled_tokens = profiler.get_mm_max_tokens(
max_model_len, max_model_len,
mm_counts=mm_counts, mm_counts=mm_counts,
) )
assert total_tokens == profiled_tokens["image"] assert total_num_patches == profiled_tokens["image"]
assert total_tokens == sum( assert total_tokens == sum(
placeholder.length placeholder.length
for placeholder in decoder_dummy_data.multi_modal_placeholders["image"] for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
......
...@@ -53,3 +53,38 @@ def test_processor_override( ...@@ -53,3 +53,38 @@ def test_processor_override(
assert img_tok_count == expected_toks_per_img * num_imgs assert img_tok_count == expected_toks_per_img * num_imgs
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
assert pixel_shape[1] == expected_pixels_shape[1] assert pixel_shape[1] == expected_pixels_shape[1]
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
@pytest.mark.parametrize("max_pixels", [1280 * 28 * 28, 1283 * 28 * 28])
def test_get_image_size_with_most_features(
    image_assets: ImageTestAssets,
    model_id: str,
    max_pixels: int,
):
    """Check that no test image needs more tokens than the reported maximum.

    The processor is built with the parametrized ``max_pixels`` budget.  The
    upper bound is the token count of the size returned by
    ``get_image_size_with_most_features()``; each asset's token count is
    derived from its ``image_grid_thw`` entry divided by the square of the
    vision config's ``spatial_merge_size``.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs={"max_pixels": max_pixels},
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    hf_processor_mm_kwargs: dict[str, object] = {}
    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
    merge_size = processor.info.get_hf_config().vision_config.spatial_merge_size

    # Upper bound: tokens required by the most feature-rich image size.
    max_image_size = processor.info.get_image_size_with_most_features()
    max_tokens = processor.info.get_num_image_tokens(
        image_width=max_image_size.width,
        image_height=max_image_size.height,
        image_processor=hf_processor.image_processor,
    )

    prompt = "<|vision_start|><|image_pad|><|vision_end|>"
    for asset in image_assets:
        mm_data = {"image": [asset.pil_image]}
        processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
        grid_thw = processed_inputs["mm_kwargs"].get_data()["image_grid_thw"].tolist()
        t, h, w = grid_thw[0]
        tokens = (t * h * w) // (merge_size**2)
        # The maximum is an inclusive bound: an image that needs exactly
        # ``max_tokens`` tokens is still valid, so compare with ``<=``.
        assert tokens <= max_tokens
...@@ -8,6 +8,7 @@ from typing import Any, TypeAlias ...@@ -8,6 +8,7 @@ from typing import Any, TypeAlias
import numpy as np import numpy as np
import pytest import pytest
import torch
import torch.nn as nn import torch.nn as nn
from PIL import Image from PIL import Image
...@@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config ...@@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.collection_utils import is_list_of from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype from vllm.utils.torch_utils import set_default_torch_dtype
from ....utils import create_new_process_for_each_test
from ...registry import HF_EXAMPLE_MODELS from ...registry import HF_EXAMPLE_MODELS
from ...utils import dummy_hf_overrides from ...utils import dummy_hf_overrides
from .test_common import get_model_ids_to_test, get_text_token_prompts from .test_common import get_model_ids_to_test, get_text_token_prompts
...@@ -136,6 +138,7 @@ def create_batched_mm_kwargs( ...@@ -136,6 +138,7 @@ def create_batched_mm_kwargs(
) )
# TODO(Isotr0py): Don't initialize model during test
@contextmanager @contextmanager
def initialize_dummy_model( def initialize_dummy_model(
model_cls: type[nn.Module], model_cls: type[nn.Module],
...@@ -150,16 +153,21 @@ def initialize_dummy_model( ...@@ -150,16 +153,21 @@ def initialize_dummy_model(
backend="nccl", backend="nccl",
) )
initialize_model_parallel(tensor_model_parallel_size=1) initialize_model_parallel(tensor_model_parallel_size=1)
current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config) vllm_config = VllmConfig(model_config=model_config)
with set_current_vllm_config(vllm_config=vllm_config): with set_current_vllm_config(vllm_config=vllm_config):
with set_default_torch_dtype(model_config.dtype): with set_default_torch_dtype(model_config.dtype):
torch.set_default_device(current_platform.device_type)
model = model_cls(vllm_config=vllm_config) model = model_cls(vllm_config=vllm_config)
torch.set_default_device(current_device)
yield model yield model
del model del model
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", get_model_ids_to_test()) @pytest.mark.parametrize("model_id", get_model_ids_to_test())
def test_model_tensor_schema(model_id: str): def test_model_tensor_schema(model_id: str):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
......
...@@ -173,10 +173,7 @@ class _HfExamplesInfo: ...@@ -173,10 +173,7 @@ class _HfExamplesInfo:
_TEXT_GENERATION_EXAMPLE_MODELS = { _TEXT_GENERATION_EXAMPLE_MODELS = {
# [Decoder-only] # [Decoder-only]
"AfmoeForCausalLM": _HfExamplesInfo( "AfmoeForCausalLM": _HfExamplesInfo("arcee-ai/Trinity-Nano-Preview"),
"arcee-ai/Trinity-Nano",
is_available_online=False,
),
"ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"), "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"),
"AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True), "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True),
"AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True),
...@@ -359,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -359,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
), ),
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"), "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
"MistralLarge3ForCausalLM": _HfExamplesInfo( "MistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4"
), ),
"MixtralForCausalLM": _HfExamplesInfo( "MixtralForCausalLM": _HfExamplesInfo(
"mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mixtral-8x7B-Instruct-v0.1",
...@@ -576,12 +573,17 @@ _AUTOMATIC_CONVERTED_MODELS = { ...@@ -576,12 +573,17 @@ _AUTOMATIC_CONVERTED_MODELS = {
"Qwen3ForSequenceClassification": _HfExamplesInfo( "Qwen3ForSequenceClassification": _HfExamplesInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls" "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
), ),
"Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
} }
_MULTIMODAL_EXAMPLE_MODELS = { _MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only] # [Decoder-only]
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"), "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
"BeeForConditionalGeneration": _HfExamplesInfo( "BeeForConditionalGeneration": _HfExamplesInfo(
"Open-Bee/Bee-8B-RL", "Open-Bee/Bee-8B-RL",
trust_remote_code=True, trust_remote_code=True,
...@@ -638,7 +640,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -638,7 +640,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
), ),
"HunYuanVLForConditionalGeneration": _HfExamplesInfo( "HunYuanVLForConditionalGeneration": _HfExamplesInfo(
"tencent/HunyuanOCR", "tencent/HunyuanOCR",
is_available_online=False, hf_overrides={"num_experts": 0},
), ),
"Idefics3ForConditionalGeneration": _HfExamplesInfo( "Idefics3ForConditionalGeneration": _HfExamplesInfo(
"HuggingFaceM4/Idefics3-8B-Llama3", "HuggingFaceM4/Idefics3-8B-Llama3",
...@@ -677,8 +679,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -677,8 +679,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31", "https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
), ),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo( "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
"lightonai/LightOnOCR-1B", "lightonai/LightOnOCR-1B-1025"
is_available_online=False,
), ),
"Llama4ForConditionalGeneration": _HfExamplesInfo( "Llama4ForConditionalGeneration": _HfExamplesInfo(
"meta-llama/Llama-4-Scout-17B-16E-Instruct", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
...@@ -782,8 +783,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -782,8 +783,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"ministral-3": "mistralai/Ministral-3-3B-Instruct-2512", "ministral-3": "mistralai/Ministral-3-3B-Instruct-2512",
}, },
tokenizer_mode="mistral", tokenizer_mode="mistral",
# TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
is_available_online=False,
), ),
"QwenVLForConditionalGeneration": _HfExamplesInfo( "QwenVLForConditionalGeneration": _HfExamplesInfo(
"Qwen/Qwen-VL", "Qwen/Qwen-VL",
...@@ -846,7 +845,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -846,7 +845,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online=False, is_available_online=False,
), ),
# [Encoder-decoder] # [Encoder-decoder]
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), "WhisperForConditionalGeneration": _HfExamplesInfo(
"openai/whisper-large-v3-turbo",
extras={"v3": "openai/whisper-large-v3"},
),
# [Cross-encoder] # [Cross-encoder]
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),
} }
...@@ -889,6 +891,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { ...@@ -889,6 +891,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EagleMistralLarge3ForCausalLM": _HfExamplesInfo( "EagleMistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512", "mistralai/Mistral-Large-3-675B-Instruct-2512",
speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle", speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle",
# TODO: revert once figuring out OOM in CI
is_available_online=False, is_available_online=False,
), ),
"LlamaForCausalLMEagle3": _HfExamplesInfo( "LlamaForCausalLMEagle3": _HfExamplesInfo(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for sparse tensor validation.
Simple, fast unit tests that can run without server fixtures.
Run with: pytest tests/multimodal/test_sparse_tensor_validation_unit.py -v
"""
import io
import pytest
import torch
class TestSparseTensorValidationContextManager:
    """Exercise torch.sparse.check_sparse_tensor_invariants() on COO tensors."""

    def test_valid_sparse_tensor_passes(self):
        """A well-formed sparse tensor is accepted and can be densified."""
        size = (2, 2)
        coords = torch.tensor([[0, 1], [0, 1]])
        entries = torch.tensor([1.0, 2.0])

        with torch.sparse.check_sparse_tensor_invariants():
            sparse = torch.sparse_coo_tensor(coords, entries, size)
            densified = sparse.to_dense()

        assert densified.shape == size

    def test_out_of_bounds_indices_rejected(self):
        """Indices beyond the declared shape must raise a RuntimeError."""
        size = (2, 2)
        coords = torch.tensor([[5], [5]])  # Out of bounds for 2x2
        entries = torch.tensor([1.0])

        with pytest.raises(RuntimeError) as exc_info:  # noqa: SIM117
            with torch.sparse.check_sparse_tensor_invariants():
                sparse = torch.sparse_coo_tensor(coords, entries, size)
                sparse.to_dense()

        # The error text should point at the offending index/bound.
        message = str(exc_info.value).lower()
        assert "index" in message or "bound" in message

    def test_negative_indices_rejected(self):
        """Negative indices must raise a RuntimeError."""
        size = (2, 2)
        coords = torch.tensor([[-1], [0]])
        entries = torch.tensor([1.0])

        with pytest.raises(RuntimeError):  # noqa: SIM117
            with torch.sparse.check_sparse_tensor_invariants():
                sparse = torch.sparse_coo_tensor(coords, entries, size)
                sparse.to_dense()

    def test_without_context_manager_allows_invalid(self):
        """
        Outside the validation context an invalid tensor may be constructed.

        This demonstrates the vulnerability: PyTorch 2.8.0+ doesn't validate
        by default, which can lead to memory corruption.
        """
        size = (2, 2)
        coords = torch.tensor([[100], [100]])  # Way out of bounds
        entries = torch.tensor([1.0])

        # No validation context here, so construction might yield an invalid
        # tensor (actual behavior depends on PyTorch version).
        sparse = torch.sparse_coo_tensor(coords, entries, size)

        # The object exists even though its contents are invalid.
        assert sparse.is_sparse
class TestTorchLoadWithValidation:
    """Exercise torch.load() inside the sparse tensor validation context."""

    @staticmethod
    def _saved_buffer(tensor):
        """Serialize ``tensor`` with torch.save into a rewound BytesIO."""
        stream = io.BytesIO()
        torch.save(tensor, stream)
        stream.seek(0)
        return stream

    def test_load_valid_sparse_tensor_with_validation(self):
        """A valid sparse tensor round-trips through save/load under validation."""
        coords = torch.tensor([[0, 1], [0, 1]])
        entries = torch.tensor([1.0, 2.0])
        stream = self._saved_buffer(torch.sparse_coo_tensor(coords, entries, (2, 2)))

        # Load with validation
        with torch.sparse.check_sparse_tensor_invariants():
            restored = torch.load(stream, weights_only=True)
            densified = restored.to_dense()

        assert densified.shape == (2, 2)

    def test_load_invalid_sparse_tensor_rejected(self):
        """An out-of-bounds sparse tensor is rejected when loaded under validation."""
        # Built outside the validation context, so the invalid indices are
        # only examined once the tensor is loaded/used below.
        coords = torch.tensor([[10], [10]])
        entries = torch.tensor([1.0])
        stream = self._saved_buffer(torch.sparse_coo_tensor(coords, entries, (2, 2)))

        # Load with validation - should fail on to_dense()
        with pytest.raises(RuntimeError):  # noqa: SIM117
            with torch.sparse.check_sparse_tensor_invariants():
                restored = torch.load(stream, weights_only=True)
                restored.to_dense()

    def test_load_dense_tensor_unaffected(self):
        """Dense tensors load normally inside the validation context."""
        stream = self._saved_buffer(torch.randn(10, 20))

        # Load with validation (should have no effect on dense tensors)
        with torch.sparse.check_sparse_tensor_invariants():
            restored = torch.load(stream, weights_only=True)

        assert restored.shape == (10, 20)
        assert not restored.is_sparse
if __name__ == "__main__":
    # Allow running this file directly for quick testing: invoke pytest on
    # this module only, with verbose output and short tracebacks.
    pytest.main([__file__, "-v", "--tb=short"])
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import base64 import base64
import mimetypes import mimetypes
import os import os
...@@ -8,6 +9,7 @@ from tempfile import NamedTemporaryFile, TemporaryDirectory ...@@ -8,6 +9,7 @@ from tempfile import NamedTemporaryFile, TemporaryDirectory
import numpy as np import numpy as np
import pytest import pytest
import torch
from PIL import Image, ImageChops from PIL import Image, ImageChops
from vllm.multimodal.image import convert_image_mode from vllm.multimodal.image import convert_image_mode
...@@ -186,6 +188,7 @@ async def test_fetch_image_error_conversion(): ...@@ -186,6 +188,7 @@ async def test_fetch_image_error_conversion():
connector.fetch_image(broken_img) connector.fetch_image(broken_img)
@pytest.mark.flaky(reruns=3, reruns_delay=5)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800]) @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
...@@ -198,8 +201,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int): ...@@ -198,8 +201,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
} }
) )
video_sync, metadata_sync = connector.fetch_video(video_url) try:
video_async, metadata_async = await connector.fetch_video_async(video_url) video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(video_url)
except (TimeoutError, asyncio.TimeoutError) as e:
pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")
assert np.array_equal(video_sync, video_async) assert np.array_equal(video_sync, video_async)
assert metadata_sync == metadata_async assert metadata_sync == metadata_async
...@@ -404,6 +411,97 @@ def test_argsort_mm_positions(case): ...@@ -404,6 +411,97 @@ def test_argsort_mm_positions(case):
assert modality_idxs == expected_modality_idxs assert modality_idxs == expected_modality_idxs
@pytest.mark.parametrize(
    "is_embed,expected",
    [
        (None, 5),
        (torch.tensor([True, True, True, True, True]), 5),
        (torch.tensor([False, False, False, False, False]), 0),
        (torch.tensor([True, False, True, False, True]), 3),
        (torch.tensor([True]), 1),
    ],
)
def test_placeholder_range_get_num_embeds(is_embed, expected):
    """``get_num_embeds`` should match the expected count for each mask.

    When no mask is given the range is created with length 5; otherwise the
    range length equals the mask length.
    """
    length = 5 if is_embed is None else len(is_embed)
    placeholder = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
    assert placeholder.get_num_embeds == expected
@pytest.mark.parametrize(
    "is_embed,expected",
    [
        (None, None),
        (
            torch.tensor([False, True, False, True, True]),
            torch.tensor([0, 1, 1, 2, 3]),
        ),
        (torch.tensor([True, True, True]), torch.tensor([1, 2, 3])),
    ],
)
def test_placeholder_range_embeds_cumsum(is_embed, expected):
    """``embeds_cumsum`` is None without a mask, else the expected tensor."""
    length = 5 if is_embed is None else len(is_embed)
    placeholder = PlaceholderRange(offset=0, length=length, is_embed=is_embed)

    if expected is not None:
        assert torch.equal(placeholder.embeds_cumsum, expected)
        # cached_property should return the same object on repeated access
        assert placeholder.embeds_cumsum is placeholder.embeds_cumsum
    else:
        assert placeholder.embeds_cumsum is None
@pytest.mark.parametrize(
    "is_embed,start_idx,end_idx,expected",
    [
        (None, 2, 4, (2, 4)),
        (
            torch.tensor([False, True, False, True, True]),
            3,
            5,
            (1, 3),
        ),
        (
            torch.tensor([False, True, False, True, True]),
            0,
            2,
            (0, 1),
        ),
        (
            torch.tensor([True, False, True, False]),
            2,
            2,
            (1, 1),
        ),
    ],
)
def test_placeholder_range_get_embeds_indices_in_range(
    is_embed, start_idx, end_idx, expected
):
    """``get_embeds_indices_in_range`` returns the expected index pair.

    Covers the no-mask case, two windows over a mixed mask, and an empty
    window (``start_idx == end_idx``).
    """
    length = 5 if is_embed is None else len(is_embed)
    placeholder = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
    actual = placeholder.get_embeds_indices_in_range(start_idx, end_idx)
    assert actual == expected
@pytest.mark.parametrize(
    "offset,is_embed,expected",
    [
        (0, None, [(0, 4)]),
        (
            2,
            torch.tensor([False, True, False, True, True]),
            [(3, 3), (5, 6)],
        ),
        (0, torch.tensor([True, True, True, True]), [(0, 3)]),
        (0, torch.tensor([False, False, False, False]), []),
    ],
)
def test_placeholder_range_extract_embeds_range(offset, is_embed, expected):
    """``extract_embeds_range`` returns the expected list of (start, end) pairs.

    Covers no mask, a mixed mask with a non-zero offset, an all-True mask
    and an all-False mask.
    """
    length = 5 if is_embed is None else len(is_embed)
    placeholder = PlaceholderRange(offset=offset, length=length, is_embed=is_embed)
    actual = placeholder.extract_embeds_range()
    assert actual == expected
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800]) @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
......
...@@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch): ...@@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
""" """
Regression test for handling videos with broken frames. Regression test for handling videos with broken frames.
This test uses a pre-corrupted video file (assets/corrupted.mp4) that This test uses a pre-corrupted video file (assets/corrupted.mp4) that
contains broken/unreadable frames to verify the video loader handles contains broken frames to verify the video loader handles
them gracefully without crashing and returns accurate metadata. them gracefully without crashing and returns accurate metadata.
""" """
with monkeypatch.context() as m: with monkeypatch.context() as m:
...@@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch): ...@@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
f"Expected fewer than {metadata['total_num_frames']} frames, " f"Expected fewer than {metadata['total_num_frames']} frames, "
f"but loaded {frames.shape[0]} frames" f"but loaded {frames.shape[0]} frames"
) )
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
    """Stub loader registered as ``test_video_backend_override_1``.

    It ignores its input and always yields FAKE_OUTPUT_1, so a test can tell
    which backend was selected.
    """

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        """Return FAKE_OUTPUT_1 and metadata naming this backend."""
        metadata = {"video_backend": "test_video_backend_override_1"}
        return FAKE_OUTPUT_1, metadata
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
    """Stub loader registered as ``test_video_backend_override_2``.

    It ignores its input and always yields FAKE_OUTPUT_2, so a test can tell
    which backend was selected.
    """

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        """Return FAKE_OUTPUT_2 and metadata naming this backend."""
        metadata = {"video_backend": "test_video_backend_override_2"}
        return FAKE_OUTPUT_2, metadata
def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
    """
    The ``video_backend`` kwarg takes precedence over the
    VLLM_VIDEO_LOADER_BACKEND environment variable.

    This lets a caller pick a different video backend (e.g. via
    --media-io-kwargs) without touching the global env var, which is useful
    when plugins set a default backend but a specific request needs another.
    """
    env_backend = "test_video_backend_override_1"
    kwarg_backend = "test_video_backend_override_2"

    with monkeypatch.context() as m:
        # The environment selects the first stub backend.
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", env_backend)
        imageio = ImageMediaIO()

        # No kwarg: the env var backend is used.
        default_io = VideoMediaIO(imageio, num_frames=10)
        default_frames, default_meta = default_io.load_bytes(b"test")
        np.testing.assert_array_equal(default_frames, FAKE_OUTPUT_1)
        assert default_meta["video_backend"] == env_backend

        # Explicit kwarg: the second stub backend wins over the env var.
        override_io = VideoMediaIO(
            imageio, num_frames=10, video_backend=kwarg_backend
        )
        override_frames, override_meta = override_io.load_bytes(b"test")
        np.testing.assert_array_equal(override_frames, FAKE_OUTPUT_2)
        assert override_meta["video_backend"] == kwarg_backend
def test_video_media_io_backend_kwarg_not_passed_to_loader(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    ``video_backend`` must be consumed by VideoMediaIO and never forwarded
    to the underlying loader's ``load_bytes``.

    Other keyword arguments are still expected to reach the loader.
    """
    backend_name = "test_reject_video_backend_kwarg"

    @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
    class RejectVideoBackendKwargLoader(VideoLoader):
        """Stub loader that raises if it ever sees ``video_backend``."""

        @classmethod
        def load_bytes(
            cls, data: bytes, num_frames: int = -1, **kwargs
        ) -> tuple[npt.NDArray, dict]:
            # video_backend must have been stripped before reaching here.
            if "video_backend" in kwargs:
                raise AssertionError(
                    "video_backend should be consumed by VideoMediaIO, "
                    "not passed to loader"
                )
            # Report which kwargs did arrive so the test can inspect them.
            return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs)}

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", backend_name)
        imageio = ImageMediaIO()

        # video_backend is supplied here, yet the loader must not receive it.
        videoio = VideoMediaIO(
            imageio,
            num_frames=10,
            video_backend=backend_name,
            other_kwarg="should_pass_through",
        )

        # Must complete without the loader's AssertionError.
        frames, metadata = videoio.load_bytes(b"test")
        np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)

        # Unrelated kwargs are still forwarded.
        assert "other_kwarg" in metadata["received_kwargs"]
def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
    """
    With ``video_backend`` set to None or omitted, VideoMediaIO uses the
    backend named by the VLLM_VIDEO_LOADER_BACKEND env var.
    """
    env_backend = "test_video_backend_override_2"

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", env_backend)
        imageio = ImageMediaIO()

        # First an explicit None, then no video_backend kwarg at all; both
        # must resolve to the env var backend.
        for extra_kwargs in ({"video_backend": None}, {}):
            videoio = VideoMediaIO(imageio, num_frames=10, **extra_kwargs)
            frames, metadata = videoio.load_bytes(b"test")
            np.testing.assert_array_equal(frames, FAKE_OUTPUT_2)
            assert metadata["video_backend"] == env_backend
...@@ -10,9 +10,9 @@ import pytest ...@@ -10,9 +10,9 @@ import pytest
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform from vllm.platforms import current_platform
if not current_platform.is_device_capability(100): if not current_platform.is_device_capability_family(100):
pytest.skip( pytest.skip(
"This test only runs on Blackwell GPUs (SM100).", allow_module_level=True "This test only runs on Blackwell GPUs (SM10x).", allow_module_level=True
) )
......
...@@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int): ...@@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
task = "wikitext" task = "wikitext"
rtol = 0.1 rtol = 0.1
# Smaller cuda_graph_sizes to speed up the test. # Smaller cudagraph_capture_sizes to speed up the test.
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
model="vllm", model="vllm",
model_args=config.get_model_args( model_args=config.get_model_args(
tp_size=tp_size, kwargs={"cuda_graph_sizes": [16]} tp_size=tp_size, kwargs={"cudagraph_capture_sizes": [16]}
), ),
tasks=task, tasks=task,
batch_size=64, batch_size=64,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "minimax_m2_append_think"
end_token = "</think>"
# MiniMax M2 model path
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
    """Module-scoped tokenizer loaded from REASONING_MODEL_NAME."""
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
# =============================================================================
# MiniMaxM2AppendThinkReasoningParser behavior:
# - Prepends <think> to the beginning of the output
# - Does NOT separate reasoning and content
# - Returns everything as content (with <think> prepended)
# - reasoning is always None
#
# This parser is used when you want to keep the raw output with <think> added
# =============================================================================
# Each case below maps the raw model "output" to the expected parser results:
# "reasoning" and "content" as returned by the extraction helper, and
# "is_reasoning_end" as returned by parser.is_reasoning_end().

# Case: simple output with end token
# (content is the output with <think> prepended; the end token is present)
SIMPLE_OUTPUT = {
    "output": "This is reasoning</think>This is response",
    "reasoning": None,
    "content": "<think>This is reasoning</think>This is response",
    "is_reasoning_end": True,
}
# Case: output without end token (reasoning in progress)
NO_END_TOKEN = {
    "output": "This is reasoning in progress",
    "reasoning": None,
    "content": "<think>This is reasoning in progress",
    "is_reasoning_end": False,
}
# Case: only end token (nothing precedes </think> in the output)
ONLY_END_TOKEN = {
    "output": "</think>This is response",
    "reasoning": None,
    "content": "<think></think>This is response",
    "is_reasoning_end": True,
}
# Case: multiple lines (newlines on both sides of the end token)
MULTIPLE_LINES = {
    "output": "Line 1\nLine 2</think>Response 1\nResponse 2",
    "reasoning": None,
    "content": "<think>Line 1\nLine 2</think>Response 1\nResponse 2",
    "is_reasoning_end": True,
}
# Case: empty output (non-streaming prepends <think>)
EMPTY = {
    "output": "",
    "reasoning": None,
    "content": "<think>",
    "is_reasoning_end": False,
}
# Case: empty output streaming (no tokens = no output)
EMPTY_STREAMING = {
    "output": "",
    "reasoning": None,
    "content": None,
    "is_reasoning_end": False,
}
# Case: special characters (punctuation and arithmetic symbols)
SPECIAL_CHARS = {
    "output": "Let me think... 1+1=2</think>Yes!",
    "reasoning": None,
    "content": "<think>Let me think... 1+1=2</think>Yes!",
    "is_reasoning_end": True,
}
# Case: code in output (a fenced code block before the end token)
CODE_OUTPUT = {
    "output": "```python\nprint('hi')\n```</think>Here's the code.",
    "reasoning": None,
    "content": "<think>```python\nprint('hi')\n```</think>Here's the code.",
    "is_reasoning_end": True,
}
# (test id, non-streaming expectation, streaming expectation).  Only the
# empty-output case expects something different when streaming.
_CASE_TABLE = [
    ("simple_output", SIMPLE_OUTPUT, SIMPLE_OUTPUT),
    ("no_end_token", NO_END_TOKEN, NO_END_TOKEN),
    ("only_end_token", ONLY_END_TOKEN, ONLY_END_TOKEN),
    ("multiple_lines", MULTIPLE_LINES, MULTIPLE_LINES),
    ("empty", EMPTY, EMPTY_STREAMING),
    ("special_chars", SPECIAL_CHARS, SPECIAL_CHARS),
    ("code_output", CODE_OUTPUT, CODE_OUTPUT),
]

# Every case is emitted twice, non-streaming first and then streaming; the
# streaming variant carries a "_streaming" id suffix.
TEST_CASES = [
    pytest.param(
        streaming,
        case,
        id=f"{case_id}_streaming" if streaming else case_id,
    )
    for case_id, plain_case, streaming_case in _CASE_TABLE
    for streaming, case in ((False, plain_case), (True, streaming_case))
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    minimax_m2_tokenizer,
):
    """Check extracted reasoning/content and end-of-reasoning detection.

    Args:
        streaming: Forwarded to ``run_reasoning_extraction`` as its
            ``streaming`` flag.
        param_dict: Scenario holding the raw ``"output"`` text and the expected
            ``"reasoning"``, ``"content"`` and ``"is_reasoning_end"`` values.
        minimax_m2_tokenizer: Tokenizer fixture used to split the output and to
            construct the parser.
    """
    tokenizer = minimax_m2_tokenizer
    raw_tokens = tokenizer.tokenize(param_dict["output"])

    # Render every token separately so extraction receives one text piece per
    # token.
    token_texts: list[str] = [
        tokenizer.convert_tokens_to_string([raw_token]) for raw_token in raw_tokens
    ]

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)

    extracted = run_reasoning_extraction(parser, token_texts, streaming=streaming)
    reasoning, content = extracted
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

    # is_reasoning_end is queried with token ids rather than token text.
    token_ids = tokenizer.convert_tokens_to_ids(raw_tokens)
    is_reasoning_end = parser.is_reasoning_end(token_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Name used to look the parser class up via
# ReasoningParserManager.get_reasoning_parser in the test below.
parser_name = "minimax_m2"
# Closing tag that appears in the scenario outputs below.
# NOTE(review): not referenced in the visible part of this module - confirm it
# is still needed.
end_token = "</think>"
# MiniMax M2 model path; the tokenizer fixture loads its tokenizer from here.
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
    """Provide the tokenizer for REASONING_MODEL_NAME, built once per module."""
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
# =============================================================================
# MiniMax M2 specific behavior:
# - Model does NOT generate <think> start token
# - Model only generates </think> end token
# - All content before </think> is reasoning
# - All content after </think> is the actual response (content)
# =============================================================================
# Each scenario maps a raw model "output" to the values the test expects:
# "reasoning", "content" and the "is_reasoning_end" flag.

# Typical case: reasoning, then the end token, then the response.
SIMPLE_REASONING = dict(
    output="This is a reasoning section</think>This is the rest",
    reasoning="This is a reasoning section",
    content="This is the rest",
    is_reasoning_end=True,
)

# Reasoning closed by the end token with nothing after it.
COMPLETE_REASONING = dict(
    output="This is a reasoning section</think>",
    reasoning="This is a reasoning section",
    content=None,
    is_reasoning_end=True,
)

# No end token yet: the whole output is expected to count as reasoning.
NO_END_TOKEN = dict(
    output="This is reasoning in progress",
    reasoning="This is reasoning in progress",
    content=None,
    is_reasoning_end=False,
)

# Reasoning and response both span several lines.
MULTIPLE_LINES = dict(
    output="First line\nSecond line</think>Response first line\nResponse second",
    reasoning="First line\nSecond line",
    content="Response first line\nResponse second",
    is_reasoning_end=True,
)

# End token first, non-streaming: reasoning is expected to be an empty string.
SHORTEST_REASONING_NO_STREAMING = dict(
    output="</think>This is the response",
    reasoning="",
    content="This is the response",
    is_reasoning_end=True,
)

# End token first, streaming: reasoning is expected to be None instead.
SHORTEST_REASONING_STREAMING = dict(
    output="</think>This is the response",
    reasoning=None,
    content="This is the response",
    is_reasoning_end=True,
)

# Empty output, non-streaming.
EMPTY = dict(
    output="",
    reasoning="",
    content=None,
    is_reasoning_end=False,
)

# Empty output, streaming.
EMPTY_STREAMING = dict(
    output="",
    reasoning=None,
    content=None,
    is_reasoning_end=False,
)

# Reasoning containing punctuation and arithmetic symbols.
SPECIAL_CHARS = dict(
    output="Let me think... 1+1=2, right?</think>Yes, 1+1=2.",
    reasoning="Let me think... 1+1=2, right?",
    content="Yes, 1+1=2.",
    is_reasoning_end=True,
)

# Reasoning that is a fenced code block.
CODE_IN_REASONING = dict(
    output="```python\nprint('hello')\n```</think>Here is the code.",
    reasoning="```python\nprint('hello')\n```",
    content="Here is the code.",
    is_reasoning_end=True,
)
# Parametrization table: one row per test, as (streaming, expectations, id).
# None of the outputs contains a start token. Every scenario appears twice,
# non-streaming first and then streaming; "shortest_reasoning" and "empty" use
# a different expectation dict for their streaming row.
TEST_CASES = [
    pytest.param(use_streaming, expectations, id=case_id)
    for use_streaming, expectations, case_id in (
        (False, SIMPLE_REASONING, "simple_reasoning"),
        (True, SIMPLE_REASONING, "simple_reasoning_streaming"),
        (False, COMPLETE_REASONING, "complete_reasoning"),
        (True, COMPLETE_REASONING, "complete_reasoning_streaming"),
        (False, NO_END_TOKEN, "no_end_token"),
        (True, NO_END_TOKEN, "no_end_token_streaming"),
        (False, MULTIPLE_LINES, "multiple_lines"),
        (True, MULTIPLE_LINES, "multiple_lines_streaming"),
        (False, SHORTEST_REASONING_NO_STREAMING, "shortest_reasoning"),
        (True, SHORTEST_REASONING_STREAMING, "shortest_reasoning_streaming"),
        (False, EMPTY, "empty"),
        (True, EMPTY_STREAMING, "empty_streaming"),
        (False, SPECIAL_CHARS, "special_chars"),
        (True, SPECIAL_CHARS, "special_chars_streaming"),
        (False, CODE_IN_REASONING, "code_in_reasoning"),
        (True, CODE_IN_REASONING, "code_in_reasoning_streaming"),
    )
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    minimax_m2_tokenizer,
):
    """Check reasoning/content extraction, end detection and content ids.

    Args:
        streaming: Forwarded to ``run_reasoning_extraction`` as its
            ``streaming`` flag.
        param_dict: Scenario holding the raw ``"output"`` text and the expected
            ``"reasoning"``, ``"content"`` and ``"is_reasoning_end"`` values.
        minimax_m2_tokenizer: Tokenizer fixture used to split the output and to
            construct the parser.
    """
    output = minimax_m2_tokenizer.tokenize(param_dict["output"])
    # Render every token separately so extraction receives one text piece per
    # token.
    output_tokens: list[str] = [
        minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        minimax_m2_tokenizer
    )

    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

    # Test is_reasoning_end
    output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
    is_reasoning_end = parser.is_reasoning_end(output_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]

    # Test extract_content_ids. Always pass token ids: the no-content branch
    # used to pass the token strings, so its empty-result check did not
    # exercise the id-based lookup at all.
    content_ids = parser.extract_content_ids(output_ids)
    if param_dict["content"] is not None:
        expected_content_ids = minimax_m2_tokenizer.convert_tokens_to_ids(
            minimax_m2_tokenizer.tokenize(param_dict["content"])
        )
        assert content_ids == expected_content_ids
    else:
        assert content_ids == []
...@@ -5,7 +5,7 @@ import pytest ...@@ -5,7 +5,7 @@ import pytest
from tests.reasoning.utils import run_reasoning_extraction_mistral from tests.reasoning.utils import run_reasoning_extraction_mistral
from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.tokenizers import MistralTokenizer from vllm.tokenizers.mistral import MistralTokenizer
parser_name = "mistral" parser_name = "mistral"
...@@ -18,47 +18,53 @@ def mistral_tokenizer(): ...@@ -18,47 +18,53 @@ def mistral_tokenizer():
return mistral_tokenizer return mistral_tokenizer
SIMPLE_REASONING = { INVALID_SIMPLE_REASONING = {
"output": "This is a reasoning section[/THINK]This is the rest", "output": "This is a reasoning section[/THINK]This is the rest",
"reasoning": "This is a reasoning section", "reasoning": None,
"content": "This is the rest", "content": "This is a reasoning sectionThis is the rest",
"is_reasoning_end": True, "is_reasoning_end": False,
} }
COMPLETE_REASONING = { INVALID_COMPLETE_REASONING = {
"output": "This is a reasoning section[/THINK]", "output": "This is a reasoning section[/THINK]",
"reasoning": "This is a reasoning section", "reasoning": None,
"content": None, "content": "This is a reasoning section",
"is_reasoning_end": True, "is_reasoning_end": False,
} }
NO_CONTENT = { NO_CONTENT = {
"output": "This is content", "output": "[THINK]This is reasoning",
"reasoning": "This is content", "reasoning": "This is reasoning",
"content": None, "content": None,
"is_reasoning_end": False, "is_reasoning_end": False,
} }
NO_REASONING = {
"output": "This is content",
"reasoning": None,
"content": "This is content",
"is_reasoning_end": False,
}
NO_REASONING_STREAMING = { NO_REASONING_STREAMING = {
"output": "This is a reasoning section", "output": "This is a reasoning section",
"reasoning": "This is a reasoning section", "reasoning": None,
"content": None, "content": "This is a reasoning section",
"is_reasoning_end": False, "is_reasoning_end": False,
} }
MULTIPLE_LINES = { INVALID_MULTIPLE_LINES = {
"output": "This\nThat[/THINK]This is the rest\nThat", "output": "This\nThat[/THINK]This is the rest\nThat",
"reasoning": "This\nThat", "reasoning": None,
"content": "This is the rest\nThat", "content": "This\nThatThis is the rest\nThat",
"is_reasoning_end": True, "is_reasoning_end": False,
} }
SHORTEST_REASONING_NO_STREAMING = { INVALID_SHORTEST_REASONING_NO_STREAMING = {
"output": "[/THINK]This is the rest", "output": "[/THINK]This is the rest",
"reasoning": "", "reasoning": None,
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True, "is_reasoning_end": False,
} }
SHORTEST_REASONING = { INVALID_SHORTEST_REASONING = {
"output": "[/THINK]This is the rest", "output": "[/THINK]This is the rest",
"reasoning": None, "reasoning": None,
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True, "is_reasoning_end": False,
} }
REASONING_WITH_THINK = { REASONING_WITH_THINK = {
"output": "[THINK]This is a reasoning section[/THINK]This is the rest", "output": "[THINK]This is a reasoning section[/THINK]This is the rest",
...@@ -78,17 +84,17 @@ MULTIPLE_LINES_WITH_THINK = { ...@@ -78,17 +84,17 @@ MULTIPLE_LINES_WITH_THINK = {
"content": "This is the rest\nThat", "content": "This is the rest\nThat",
"is_reasoning_end": True, "is_reasoning_end": True,
} }
SHORTEST_REASONING_NO_STREAMING_WITH_THINK = { INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
"output": "[/THINK]This is the rest", "output": "[/THINK]This is the rest",
"reasoning": "", "reasoning": None,
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True, "is_reasoning_end": False,
} }
SHORTEST_REASONING_WITH_THINK = { INVALID_SHORTEST_REASONING_WITH_THINK = {
"output": "[/THINK]This is the rest", "output": "[/THINK]This is the rest",
"reasoning": None, "reasoning": None,
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True, "is_reasoning_end": False,
} }
THINK_NO_END = { THINK_NO_END = {
"output": "[THINK]This is a reasoning section", "output": "[THINK]This is a reasoning section",
...@@ -98,8 +104,8 @@ THINK_NO_END = { ...@@ -98,8 +104,8 @@ THINK_NO_END = {
} }
EMPTY = { EMPTY = {
"output": "", "output": "",
"reasoning": "", "reasoning": None,
"content": None, "content": "",
"is_reasoning_end": False, "is_reasoning_end": False,
} }
EMPTY_STREAMING = { EMPTY_STREAMING = {
...@@ -109,47 +115,48 @@ EMPTY_STREAMING = { ...@@ -109,47 +115,48 @@ EMPTY_STREAMING = {
"is_reasoning_end": False, "is_reasoning_end": False,
} }
NEW_LINE = { NEW_LINE = {
"output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
"reasoning": "This is a reasoning section", "reasoning": "This is a reasoning section",
"content": "\nThis is the rest", "content": "Before\n\nThis is the rest",
"is_reasoning_end": True, "is_reasoning_end": True,
} }
# Streaming cannot handle new lines at the beginning of the output
# because we need to support [THINK]...[/THINK] and [/THINK]...
# We cannot know if the text before [THINK] is reasoning content
# or not.
NEW_LINE_STREAMING = { NEW_LINE_STREAMING = {
"output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
"reasoning": "\nThis is a reasoning section", "reasoning": "This is a reasoning section",
"content": "\nThis is the rest", "content": "Before\n\nThis is the rest",
"is_reasoning_end": True, "is_reasoning_end": True,
} }
TEST_CASES = [ TEST_CASES = [
pytest.param( pytest.param(
False, False,
SIMPLE_REASONING, INVALID_SIMPLE_REASONING,
id="simple_reasoning", id="invalid_simple_reasoning",
), ),
pytest.param( pytest.param(
True, True,
SIMPLE_REASONING, INVALID_SIMPLE_REASONING,
id="simple_reasoning_streaming", id="invalid_simple_reasoning_streaming",
), ),
pytest.param( pytest.param(
False, False,
COMPLETE_REASONING, INVALID_COMPLETE_REASONING,
id="complete_reasoning", id="invalid_complete_reasoning",
), ),
pytest.param( pytest.param(
True, True,
COMPLETE_REASONING, INVALID_COMPLETE_REASONING,
id="complete_reasoning_streaming", id="invalid_complete_reasoning_streaming",
), ),
pytest.param( pytest.param(
False, False,
NO_CONTENT, NO_CONTENT,
id="no_content_token", id="no_content",
),
pytest.param(
False,
NO_REASONING,
id="no_reasoning",
), ),
pytest.param( pytest.param(
True, True,
...@@ -158,23 +165,23 @@ TEST_CASES = [ ...@@ -158,23 +165,23 @@ TEST_CASES = [
), ),
pytest.param( pytest.param(
False, False,
MULTIPLE_LINES, INVALID_MULTIPLE_LINES,
id="multiple_lines", id="invalid_multiple_lines",
), ),
pytest.param( pytest.param(
True, True,
MULTIPLE_LINES, INVALID_MULTIPLE_LINES,
id="multiple_lines_streaming", id="invalid_multiple_lines_streaming",
), ),
pytest.param( pytest.param(
True, True,
SHORTEST_REASONING, INVALID_SHORTEST_REASONING,
id="shortest", id="invalid_shortest",
), ),
pytest.param( pytest.param(
False, False,
SHORTEST_REASONING_NO_STREAMING, INVALID_SHORTEST_REASONING_NO_STREAMING,
id="shortest_streaming", id="invalid_shortest_streaming",
), ),
pytest.param( pytest.param(
False, False,
...@@ -208,13 +215,13 @@ TEST_CASES = [ ...@@ -208,13 +215,13 @@ TEST_CASES = [
), ),
pytest.param( pytest.param(
False, False,
SHORTEST_REASONING_NO_STREAMING_WITH_THINK, INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
id="shortest_with_think", id="invalid_shortest_with_think",
), ),
pytest.param( pytest.param(
True, True,
SHORTEST_REASONING_WITH_THINK, INVALID_SHORTEST_REASONING_WITH_THINK,
id="shortest_with_think_streaming", id="invalid_shortest_with_think_streaming",
), ),
pytest.param( pytest.param(
False, False,
...@@ -316,10 +323,26 @@ def test_mistral_reasoning( ...@@ -316,10 +323,26 @@ def test_mistral_reasoning(
# Test extract_content # Test extract_content
if param_dict["content"] is not None: if param_dict["content"] is not None:
content = parser.extract_content_ids(output_tokens) # Handle the case where there are tokens outputted before Thinking.
assert content == mistral_tokenizer.tokenizer.encode( # This should not occur if the model is well trained and prompted.
param_dict["content"], bos=False, eos=False if "[THINK]" in param_dict["output"] and not param_dict["output"].startswith(
"[THINK]"
):
before_content = param_dict["output"].split("[THINK]")[0]
before_token_ids = mistral_tokenizer.tokenizer.encode(
before_content, bos=False, eos=False
)
left_to_encode = param_dict["content"][len(before_content) :]
# Normal situation.
else:
before_token_ids = []
left_to_encode = param_dict["content"]
content_tokens = parser.extract_content_ids(output_tokens)
expected_token_ids = before_token_ids + mistral_tokenizer.tokenizer.encode(
left_to_encode, bos=False, eos=False
) )
assert content_tokens == expected_token_ids
else: else:
content = parser.extract_content_ids(output_tokens) content = parser.extract_content_ids(output_tokens)
assert content == [] assert content == []
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.reasoning import ReasoningParser from vllm.reasoning import ReasoningParser
from vllm.tokenizers import MistralTokenizer from vllm.tokenizers.mistral import MistralTokenizer
class StreamingReasoningReconstructor: class StreamingReasoningReconstructor:
......
...@@ -3,12 +3,45 @@ ...@@ -3,12 +3,45 @@
# for users who do not have any compilers installed on their system # for users who do not have any compilers installed on their system
set -e set -e
set -x
merge_base_commit=$(git merge-base HEAD origin/main) merge_base_commit=$(git merge-base HEAD origin/main)
echo "Current merge base commit with main: $merge_base_commit" echo "INFO: current merge base commit with main: $merge_base_commit"
git show --oneline -s $merge_base_commit git show --oneline -s $merge_base_commit
# test whether the metadata.json url is valid, retry each 3 minutes up to 5 times
# this avoids cumbersome error messages & manual retries in case the precompiled wheel
# for the given commit is still being built in the release pipeline
meta_json_url="https://wheels.vllm.ai/$merge_base_commit/vllm/metadata.json"
echo "INFO: will use metadata.json from $meta_json_url"
for i in {1..5}; do
echo "Checking metadata.json URL (attempt $i)..."
if curl --fail "$meta_json_url" > metadata.json; then
echo "INFO: metadata.json URL is valid."
# check whether it is valid json by python
if python3 -m json.tool metadata.json; then
echo "INFO: metadata.json is valid JSON. Proceeding with the test."
else
echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!"
exit 1
fi
break
fi
# failure handling
if [ $i -eq 5 ]; then
echo "ERROR: metadata.json URL is still not valid after 5 attempts."
echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists."
echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes."
echo " NOTE: If it fails, please report in #sig-ci channel."
exit 1
else
echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..."
sleep 180
fi
done
set -x
cd /vllm-workspace/ cd /vllm-workspace/
# uninstall vllm # uninstall vllm
...@@ -29,6 +62,6 @@ python3 -c 'import vllm' ...@@ -29,6 +62,6 @@ python3 -c 'import vllm'
# Check if the clangd log file was created # Check if the clangd log file was created
if [ ! -f /tmp/changed.file ]; then if [ ! -f /tmp/changed.file ]; then
echo "changed.file was not created, python only compilation failed" echo "ERROR: changed.file was not created, python only compilation failed"
exit 1 exit 1
fi fi
...@@ -89,64 +89,6 @@ def test_update_config(): ...@@ -89,64 +89,6 @@ def test_update_config():
new_config3 = update_config(config3, {"a": "new_value"}) new_config3 = update_config(config3, {"a": "new_value"})
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
[
("distilbert/distilgpt2", "generate", "none", "generate"),
("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"),
("openai/whisper-small", "generate", "none", "transcription"),
],
)
def test_auto_task(
model_id, expected_runner_type, expected_convert_type, expected_task
):
config = ModelConfig(model_id, task="auto")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
[
("distilbert/distilgpt2", "pooling", "embed", "embed"),
("intfloat/multilingual-e5-small", "pooling", "embed", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"),
("openai/whisper-small", "pooling", "embed", "embed"),
],
)
def test_score_task(
model_id, expected_runner_type, expected_convert_type, expected_task
):
config = ModelConfig(model_id, task="score")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
[
("openai/whisper-small", "generate", "none", "transcription"),
],
)
def test_transcription_task(
model_id, expected_runner_type, expected_convert_type, expected_task
):
config = ModelConfig(model_id, task="transcription")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type"), ("model_id", "expected_runner_type", "expected_convert_type"),
[ [
...@@ -1085,7 +1027,7 @@ def test_vllm_config_explicit_overrides(): ...@@ -1085,7 +1027,7 @@ def test_vllm_config_explicit_overrides():
) )
# Override one field but not others # Override one field but not others
pass_config = PassConfig(enable_noop=False) pass_config = PassConfig(eliminate_noops=False)
compilation_config = CompilationConfig(pass_config=pass_config) compilation_config = CompilationConfig(pass_config=pass_config)
config = VllmConfig( config = VllmConfig(
model_config=regular_model, model_config=regular_model,
......
...@@ -8,6 +8,7 @@ import pytest ...@@ -8,6 +8,7 @@ import pytest
import vllm.envs as envs import vllm.envs as envs
from vllm.envs import ( from vllm.envs import (
disable_envs_cache,
enable_envs_cache, enable_envs_cache,
env_list_with_choices, env_list_with_choices,
env_set_with_choices, env_set_with_choices,
...@@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch): ...@@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
envs.__getattr__ = envs.__getattr__.__wrapped__ envs.__getattr__ = envs.__getattr__.__wrapped__
def test_getattr_with_reset(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1")
# __getattr__ is not decorated with functools.cache
assert not hasattr(envs.__getattr__, "cache_info")
# Enable envs cache and ignore ongoing environment changes
enable_envs_cache()
assert envs.VLLM_HOST_IP == "1.1.1.1"
# With cache enabled, the environment variable value is cached and unchanged
monkeypatch.setenv("VLLM_HOST_IP", "2.2.2.2")
assert envs.VLLM_HOST_IP == "1.1.1.1"
disable_envs_cache()
assert envs.VLLM_HOST_IP == "2.2.2.2"
# After cache disabled, the environment variable value would be synced
# with os.environ
monkeypatch.setenv("VLLM_HOST_IP", "3.3.3.3")
assert envs.VLLM_HOST_IP == "3.3.3.3"
def test_is_envs_cache_enabled() -> None:
assert not envs._is_envs_cache_enabled()
enable_envs_cache()
assert envs._is_envs_cache_enabled()
# Only wrap one-layer of cache, so we only need to
# call disable once to reset.
enable_envs_cache()
enable_envs_cache()
enable_envs_cache()
disable_envs_cache()
assert not envs._is_envs_cache_enabled()
disable_envs_cache()
assert not envs._is_envs_cache_enabled()
class TestEnvWithChoices: class TestEnvWithChoices:
"""Test cases for env_with_choices function.""" """Test cases for env_with_choices function."""
......
...@@ -7,7 +7,7 @@ from vllm.config import ModelConfig ...@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor from vllm.inputs.preprocess import InputPreprocessor
from vllm.tokenizers import init_tokenizer_from_config from vllm.tokenizers import cached_tokenizer_from_config
pytestmark = pytest.mark.cpu_test pytestmark = pytest.mark.cpu_test
...@@ -34,6 +34,13 @@ INPUTS_SLICES = [ ...@@ -34,6 +34,13 @@ INPUTS_SLICES = [
] ]
# Test that a nested mixed-type list of lists raises a TypeError.
@pytest.mark.parametrize("invalid_input", [[[1, 2], ["foo", "bar"]]])
def test_invalid_input_raise_type_error(invalid_input):
with pytest.raises(TypeError):
parse_raw_prompts(invalid_input)
def test_parse_raw_single_batch_empty(): def test_parse_raw_single_batch_empty():
with pytest.raises(ValueError, match="at least one prompt"): with pytest.raises(ValueError, match="at least one prompt"):
parse_raw_prompts([]) parse_raw_prompts([])
...@@ -108,7 +115,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): ...@@ -108,7 +115,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
) )
def test_preprocessor_always_mm_code_path(model_id, prompt): def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id) model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_config(model_config) tokenizer = cached_tokenizer_from_config(model_config)
input_preprocessor = InputPreprocessor(model_config, tokenizer) input_preprocessor = InputPreprocessor(model_config, tokenizer)
# HF processor adds sep token # HF processor adds sep token
......
...@@ -3,38 +3,39 @@ ...@@ -3,38 +3,39 @@
from typing import _get_protocol_attrs # type: ignore from typing import _get_protocol_attrs # type: ignore
import pytest import pytest
from transformers import PreTrainedTokenizerBase from transformers import (
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)
from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
def _get_missing_attrs(obj: object, target: type): def _get_missing_attrs(obj: object, target: type):
return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)] return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)]
def _assert_tokenizer_like(tokenizer: object):
missing_attrs = _get_missing_attrs(tokenizer, TokenizerLike)
assert not missing_attrs, f"Missing attrs: {missing_attrs}"
def test_tokenizer_like_protocol(): def test_tokenizer_like_protocol():
assert not ( tokenizer = get_tokenizer("gpt2", use_fast=False)
missing_attrs := _get_missing_attrs( assert isinstance(tokenizer, PreTrainedTokenizer)
get_tokenizer("gpt2", use_fast=False), _assert_tokenizer_like(tokenizer)
TokenizerLike,
) tokenizer = get_tokenizer("gpt2", use_fast=True)
), f"Missing attrs: {missing_attrs}" assert isinstance(tokenizer, PreTrainedTokenizerFast)
_assert_tokenizer_like(tokenizer)
assert not (
missing_attrs := _get_missing_attrs( tokenizer = get_tokenizer(
get_tokenizer("gpt2", use_fast=True), "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
TokenizerLike, )
) assert isinstance(tokenizer, MistralTokenizer)
), f"Missing attrs: {missing_attrs}" _assert_tokenizer_like(tokenizer)
assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"
@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"]) @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment