Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
......@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num: int,
max_num: int,
):
from vllm.model_executor.models.internvl import (
from vllm.transformers_utils.processors.internvl import (
calculate_internvl_targets,
get_internvl_target_ratios,
)
......
......@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num: int,
max_num: int,
):
from vllm.model_executor.models.nemotron_vl import (
from vllm.transformers_utils.processors.nemotron_vl import (
calculate_nemotron_vl_targets,
get_nemotron_vl_target_ratios,
)
......
......@@ -185,14 +185,16 @@ def make_mock_model(hidden: int = 8):
# super().embed_input_ids → use SupportsMultiModal.embed_input_ids
def fake_super_embed(
ids, mm_embs=None, *, is_multimodal=None, handle_oov_mm_token=False
ids,
mm_embs=None,
*,
is_multimodal=None,
):
return SupportsMultiModal.embed_input_ids(
model,
ids,
mm_embs,
is_multimodal=is_multimodal,
handle_oov_mm_token=handle_oov_mm_token,
)
# Bind embed_input_ids as the real method
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""E2E tests for online MXFP8 quantization.
Loads a BF16 model with ``--quantization mxfp8`` (online quantization) and
compares log-probabilities against the same model served in BF16 without
quantization. This exercises the full pipeline: config parsing,
``Mxfp8OnlineLinearMethod``, ``Mxfp8OnlineMoEMethod``, weight loading,
online quantization / shuffling, and inference through ``apply_monolithic``.
Layer skipping (``modules_to_not_convert``) is configured in the model's
``config.json`` under ``quantization_config`` and is not tested here.
``example_prompts`` is a pytest fixture (from conftest.py) that loads 8
diverse prompts from ``tests/prompts/example.txt``.
"""
import pytest
from tests.quantization.utils import is_quant_method_supported
from ..utils import check_logprobs_close
# A small MoE model that fits on a single GPU and has both linear + MoE layers.
MOE_MODEL = "Qwen/Qwen3-30B-A3B"
# A small dense model (no MoE) to validate the linear-only path.
DENSE_MODEL = "Qwen/Qwen3-0.6B"
# Passed as ``max_model_len`` to every runner created in this file.
MAX_MODEL_LEN = 1024
# Number of tokens generated per prompt in the log-prob comparison test.
MAX_TOKENS = 4
# Third positional argument to ``generate_greedy_logprobs``
# (presumably the number of top log-probs kept per token -- confirm).
NUM_LOG_PROBS = 8
@pytest.mark.skipif(
    not is_quant_method_supported("mxfp8"),
    reason="mxfp8 is not supported on this GPU type (requires sm_100+).",
)
@pytest.mark.quant_model
@pytest.mark.parametrize("model", [DENSE_MODEL, MOE_MODEL], ids=["dense", "moe"])
def test_mxfp8_logprobs(
    vllm_runner,
    example_prompts,
    model: str,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that online MXFP8 log-probs stay close to the BF16 baseline.

    The same model is run twice on ``example_prompts``: first without
    quantization, then with ``quantization="mxfp8"``.  Both runs generate
    ``MAX_TOKENS`` tokens greedily and the resulting log-probabilities are
    handed to ``check_logprobs_close``.
    """
    # Everything except ``quantization`` is shared between the two runs.
    shared_kwargs = {"max_model_len": MAX_MODEL_LEN, "enforce_eager": True}

    with monkeypatch.context() as env:
        env.setenv("TOKENIZERS_PARALLELISM", "true")

        # Baseline: unquantized run.
        with vllm_runner(model, **shared_kwargs) as bf16_model:
            baseline_outputs = bf16_model.generate_greedy_logprobs(
                example_prompts, MAX_TOKENS, NUM_LOG_PROBS
            )

        # Candidate: same model, quantized online to MXFP8.
        with vllm_runner(model, quantization="mxfp8", **shared_kwargs) as quant_model:
            test_outputs = quant_model.generate_greedy_logprobs(
                example_prompts, MAX_TOKENS, NUM_LOG_PROBS
            )

        check_logprobs_close(
            outputs_0_lst=baseline_outputs,
            outputs_1_lst=test_outputs,
            name_0="bf16",
            name_1="mxfp8",
        )
@pytest.mark.skipif(
    not is_quant_method_supported("mxfp8"),
    reason="mxfp8 is not supported on this GPU type (requires sm_100+).",
)
@pytest.mark.quant_model
@pytest.mark.parametrize("model", [DENSE_MODEL, MOE_MODEL], ids=["dense", "moe"])
def test_mxfp8_generation(vllm_runner, model: str) -> None:
    """Smoke test: the online-MXFP8 model returns more text than the prompt.

    Only the length of the returned string is checked; the quality of the
    generated text is not inspected.
    """
    prompt = "1 2 3 4 5"
    runner_kwargs = {
        "enforce_eager": True,
        "quantization": "mxfp8",
        "max_model_len": MAX_MODEL_LEN,
    }

    with vllm_runner(model, **runner_kwargs) as quant_model:
        results = quant_model.generate_greedy([prompt], max_tokens=5)

    # Second field of the first result; the length check below treats it as
    # text that still contains the prompt (presumably prompt + completion).
    generated = results[0][1]
    assert len(generated) > len(prompt), (
        f"MXFP8 model produced no new tokens. Output: {generated!r}"
    )
......@@ -628,6 +628,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
trust_remote_code=True,
hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]},
),
"ColBERTLfm2Model": _HfExamplesInfo(
"LiquidAI/LFM2-ColBERT-350M",
trust_remote_code=True,
hf_overrides={"architectures": ["ColBERTLfm2Model"]},
),
# [Multimodal]
"ColModernVBertForRetrieval": _HfExamplesInfo(
"ModernVBERT/colmodernvbert-merged",
......@@ -639,6 +644,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
"OpsColQwen3Model": _HfExamplesInfo(
"OpenSearch-AI/Ops-Colqwen3-4B", trust_remote_code=True
),
"ColQwen3_5": _HfExamplesInfo(
"athrael-soju/colqwen3.5-4.5B-v3",
trust_remote_code=True,
max_model_len=4096,
),
"Qwen3VLNemotronEmbedModel": _HfExamplesInfo(
"nvidia/nemotron-colembed-vl-4b-v2",
),
......@@ -774,7 +784,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"rednote-hilab/dots.ocr", trust_remote_code=True
),
"Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo(
"nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False
"nvidia/Eagle2.5-8B",
trust_remote_code=True,
),
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
......@@ -1116,6 +1127,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
tokenizer_mode="mistral",
),
# [Encoder-decoder]
"CohereASRForConditionalGeneration": _HfExamplesInfo(
"/host/engines/vllm/audio/2b-release",
trust_remote_code=True,
is_available_online=False, # TODO (ekagra): revert after asr release
),
"NemotronParseForConditionalGeneration": _HfExamplesInfo(
"nvidia/NVIDIA-Nemotron-Parse-v1.1", trust_remote_code=True
),
......
......@@ -8,7 +8,7 @@ from tests.conftest import VllmRunner
from tests.utils import create_new_process_for_each_test
@create_new_process_for_each_test() # Memory is not cleaned up properly otherwise
@create_new_process_for_each_test() # Hangs otherwise
@pytest.mark.parametrize(
"model",
[
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
from pathlib import Path
from unittest.mock import patch
import librosa
import numpy as np
import pybase64 as base64
import pytest
from vllm.multimodal.media import AudioMediaIO
from ...conftest import AudioTestAssets
pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent.parent / "assets"
......@@ -22,40 +24,32 @@ def dummy_audio():
@pytest.fixture
def dummy_audio_bytes():
return b"FAKEAUDIOBYTES"
def dummy_audio_bytes(audio_assets: AudioTestAssets):
with open(audio_assets[0].get_local_path(), "rb") as f:
return f.read()
def test_audio_media_io_load_bytes(dummy_audio_bytes):
audio_io = AudioMediaIO()
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_bytes(dummy_audio_bytes)
mock_load.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_bytes(dummy_audio_bytes)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_base64(dummy_audio_bytes):
audio_io = AudioMediaIO()
encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_base64("audio/wav", encoded)
mock_load_bytes.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_base64("audio/wav", encoded)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_file():
def test_audio_media_io_load_file(audio_assets: AudioTestAssets):
audio_io = AudioMediaIO()
path = Path("/fake/path.wav")
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_file(path)
mock_load.assert_called_once_with(path, sr=None)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
path = audio_assets[0].get_local_path()
out = audio_io.load_file(path)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_encode_base64(dummy_audio):
......
......@@ -2,13 +2,13 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import base64
import mimetypes
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
import aiohttp
import numpy as np
import pybase64 as base64
import pytest
import requests
import torch
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
from pathlib import Path
import numpy as np
import numpy.typing as npt
import pybase64
import pytest
from PIL import Image
......@@ -235,3 +237,53 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
assert metadata_missing["video_backend"] == "test_video_backend_override_2"
def test_load_base64_jpeg_returns_metadata():
    """Regression test: ``load_base64`` with ``video/jpeg`` must return metadata.

    Previously, base64 JPEG frame sequences returned an empty dict for
    metadata, which broke downstream consumers that rely on fields like
    ``total_num_frames`` and ``fps``. See PR #37301.
    """
    num_test_frames = 3
    frame_width, frame_height = 8, 8

    def _jpeg_as_base64(red: int) -> str:
        """Encode one tiny solid-colour frame as a base64 JPEG string."""
        image = Image.new("RGB", (frame_width, frame_height), color=(red, 0, 0))
        stream = io.BytesIO()
        image.save(stream, format="JPEG")
        return pybase64.b64encode(stream.getvalue()).decode("ascii")

    # The payload is the comma-joined list of base64-encoded frames.
    data = ",".join(_jpeg_as_base64(i * 80) for i in range(num_test_frames))

    videoio = VideoMediaIO(ImageMediaIO(), num_frames=num_test_frames)
    frames, metadata = videoio.load_base64("video/jpeg", data)

    # Leading axis of the frames array is the frame count.
    assert frames.shape[0] == num_test_frames

    # Every one of these keys has to be reported.
    required_keys = {
        "total_num_frames",
        "fps",
        "duration",
        "video_backend",
        "frames_indices",
        "do_sample_frames",
    }
    missing_keys = required_keys - metadata.keys()
    assert not missing_keys, f"Missing metadata keys: {missing_keys}"

    assert metadata["total_num_frames"] == num_test_frames
    assert metadata["video_backend"] == "jpeg_sequence"
    assert metadata["frames_indices"] == list(range(num_test_frames))
    assert metadata["do_sample_frames"] is False

    # With the default fps of 1 the duration equals the number of frames.
    assert metadata["fps"] == 1.0
    assert metadata["duration"] == float(num_test_frames)
......@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
AudioSpec,
ChannelReduction,
normalize_audio,
resample_audio_librosa,
resample_audio_pyav,
resample_audio_scipy,
split_audio,
)
......@@ -25,14 +25,14 @@ def dummy_audio():
return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
def test_resample_audio_librosa(dummy_audio):
with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
mock_resample.return_value = dummy_audio * 2
out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050)
mock_resample.assert_called_once_with(
dummy_audio, orig_sr=44100, target_sr=22050
)
assert np.all(out == dummy_audio * 2)
def test_resample_audio_pyav(dummy_audio):
out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2)
out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4)
out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4)
assert len(out_down) == 3
assert len(out_up) == 10
assert np.all(out_same == dummy_audio)
def test_resample_audio_scipy(dummy_audio):
......@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
assert np.isfinite(out).all()
def test_audio_resampler_librosa_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="librosa")
with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
def test_audio_resampler_pyav_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="pyav")
with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample:
mock_resample.return_value = dummy_audio
out = resampler.resample(dummy_audio, orig_sr=44100)
mock_resample.assert_called_once_with(
......@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
# Verify channel averaging: mean of [0.5, -0.5] = 0.0
np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
def test_librosa_mono_passthrough_e2e(self):
"""Full pipeline: librosa mono format → preserved as mono."""
def test_pyav_mono_passthrough_e2e(self):
"""Full pipeline: pyav mono format → preserved as mono."""
from vllm.multimodal.parse import MultiModalDataParser
# Simulate librosa output: already mono (time,) format
mono_librosa = np.random.randn(16000).astype(np.float32)
assert mono_librosa.shape == (16000,)
# Simulate pyav output: already mono (time,) format
mono_pyav = np.random.randn(16000).astype(np.float32)
assert mono_pyav.shape == (16000,)
# Create parser with mono normalization
parser = MultiModalDataParser(
......@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
)
# Process audio through the parser
result = parser._parse_audio_data((mono_librosa, 16000))
result = parser._parse_audio_data((mono_pyav, 16000))
audio_output = result.get(0)
# Verify output is still mono 1D
......@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
assert audio_output.shape == (16000,)
# Verify audio content is preserved
np.testing.assert_array_almost_equal(audio_output, mono_librosa)
np.testing.assert_array_almost_equal(audio_output, mono_pyav)
def test_multichannel_5_1_surround_to_mono_e2e(self):
"""Full pipeline: 5.1 surround (6 channels) → mono output."""
......
......@@ -3,10 +3,10 @@
from collections.abc import Sequence
from vllm.config import VllmConfig
from vllm.config import ModelConfig, PoolerConfig, VllmConfig
from vllm.entrypoints.openai.engine.protocol import UsageInfo
from vllm.entrypoints.pooling.base.protocol import EmbedRequestMixin
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.outputs import PoolingRequestOutput
from vllm.plugins.io_processors.interface import (
IOProcessor,
......@@ -16,14 +16,13 @@ from vllm.renderers import BaseRenderer
from vllm.tokenizers.detokenizer_utils import convert_ids_list_to_tokens
from .types import (
EMBED_TASKS,
SparseEmbeddingCompletionRequestMixin,
SparseEmbeddingResponse,
SparseEmbeddingResponseData,
SparseEmbeddingTokenWeight,
)
logger = init_logger(__name__)
class BgeM3SparseEmbeddingsProcessor(
IOProcessor[SparseEmbeddingCompletionRequestMixin, SparseEmbeddingResponse]
......@@ -33,6 +32,22 @@ class BgeM3SparseEmbeddingsProcessor(
self.offline_requests: list[SparseEmbeddingCompletionRequestMixin] = []
self.online_requests: dict[str, SparseEmbeddingCompletionRequestMixin] = {}
self.renderer: BaseRenderer = renderer
self.default_pooling_params = {}
pooler_config: PoolerConfig = vllm_config.model_config.pooler_config
if pooler_config is not None:
for param in ["use_activation", "dimensions"]:
if getattr(pooler_config, param, None) is None:
continue
self.default_pooling_params[param] = getattr(pooler_config, param)
self.embed_dimensions = vllm_config.model_config.embedding_size
self.embed_request_queue: list[EmbedRequestMixin] = []
def __repr__(self) -> str:
return (
f"BgeM3SparseEmbeddingsProcessor("
f"embed_dimensions={self.embed_dimensions}, "
f"default_pooling_params={self.default_pooling_params})"
)
def merge_pooling_params(
self,
......@@ -41,7 +56,57 @@ class BgeM3SparseEmbeddingsProcessor(
if params is None:
params = PoolingParams()
# refer to PoolingCompletionRequest.to_pooling_params
params.task = "token_classify"
# set and verify pooling params
params.skip_reading_prefix_cache = True
raw_embed_request = self.embed_request_queue.pop(0)
if raw_embed_request.embed_task not in EMBED_TASKS:
raise ValueError(
f"Unsupported task {raw_embed_request}, "
f"Supported tasks are {EMBED_TASKS}"
)
has_dense_embed = True
if raw_embed_request.embed_task == "dense":
params.task = "embed"
params.skip_reading_prefix_cache = False
elif raw_embed_request.embed_task == "sparse":
params.task = "token_classify"
has_dense_embed = False
else:
params.task = "embed&token_classify"
params.use_activation = raw_embed_request.use_activation
if params.use_activation is None:
params.use_activation = True
if not has_dense_embed:
params.dimensions = None
return params
params.dimensions = raw_embed_request.dimensions
model_config: ModelConfig = self.vllm_config.model_config
for param in self.default_pooling_params:
if getattr(params, param, None) is None:
setattr(params, param, self.default_pooling_params[param])
if params.dimensions is not None:
if not model_config.is_matryoshka:
raise ValueError(
f'Model "{model_config.served_model_name}" does not '
f"support matryoshka representation, "
f"changing output dimensions will lead to poor results."
)
mds = model_config.matryoshka_dimensions
if mds is not None:
if params.dimensions not in mds:
raise ValueError(
f"Model {model_config.served_model_name!r} "
f"only supports {str(mds)} matryoshka dimensions, "
f"use other output dimensions will "
f"lead to poor results."
)
elif params.dimensions < 1:
raise ValueError("Dimensions must be greater than 0")
return params
def parse_request(
......@@ -61,14 +126,16 @@ class BgeM3SparseEmbeddingsProcessor(
if request_id is not None:
assert request_id not in self.online_requests, "request_id duplicated"
self.online_requests[request_id] = prompt
self.embed_request_queue.extend(prompt.to_embed_requests_online())
else:
self.offline_requests.append(prompt)
self.embed_request_queue.extend(prompt.to_embed_requests_offline())
return prompt.input
def _get_sparse_embedding_request(self, request_id: str | None = None):
if request_id:
return self.online_requests.pop(request_id, None)
return self.offline_requests.pop()
return self.offline_requests.pop(0)
def _build_sparse_embedding_token_weights(
self,
......@@ -100,26 +167,45 @@ class BgeM3SparseEmbeddingsProcessor(
) -> SparseEmbeddingResponse:
num_prompt_tokens = 0
response_data = []
return_tokens = self._get_sparse_embedding_request(request_id).return_tokens
raw_request = self._get_sparse_embedding_request(request_id)
has_dense_embed = raw_request.embed_task in ["dense", "dense&sparse"]
has_sparse_embed = raw_request.embed_task in ["sparse", "dense&sparse"]
embed_dimensions = 0
if has_dense_embed:
embed_dimensions = (
self.embed_dimensions
if raw_request.dimensions is None
else raw_request.dimensions
)
for idx in range(len(model_output)):
mo = model_output[idx]
sparse_embedding: dict[int, float] = {}
sparse_embedding_dict: dict[int, float] = {}
num_prompt_tokens += len(mo.prompt_token_ids)
if len(mo.prompt_token_ids) != len(mo.outputs.data):
# this is the case that add_special_tokens is True,
# which means first token and last token are special tokens
mo.prompt_token_ids = mo.prompt_token_ids[1:]
for token_id, weight in zip(mo.prompt_token_ids, mo.outputs.data.tolist()):
sparse_embedding[token_id] = max(
weight, sparse_embedding.get(token_id, 0.0)
dense_embedding: list[float] | None = None
sparse_embedding: list[SparseEmbeddingTokenWeight] | None = None
if has_dense_embed:
dense_embedding = mo.outputs.data[:embed_dimensions].tolist()
if has_sparse_embed:
sparse_weights = mo.outputs.data[embed_dimensions:].tolist()
if len(mo.prompt_token_ids) != len(sparse_weights):
# this is the case that add_special_tokens is True,
# which means first token and last token are special tokens
mo.prompt_token_ids = mo.prompt_token_ids[1:]
for token_id, weight in zip(mo.prompt_token_ids, sparse_weights):
sparse_embedding_dict[token_id] = max(
weight, sparse_embedding_dict.get(token_id, 0.0)
)
sparse_embedding = self._build_sparse_embedding_token_weights(
sparse_embedding_dict,
raw_request.return_tokens,
)
response_data.append(
SparseEmbeddingResponseData(
index=idx,
sparse_embedding=self._build_sparse_embedding_token_weights(
sparse_embedding,
return_tokens,
),
object=raw_request.embed_task,
sparse_embedding=sparse_embedding,
dense_embedding=dense_embedding,
)
)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Literal, get_args
from pydantic import BaseModel, Field
from vllm.entrypoints.openai.engine.protocol import UsageInfo
from vllm.entrypoints.pooling.base.protocol import CompletionRequestMixin
from vllm.entrypoints.pooling.base.protocol import (
CompletionRequestMixin,
EmbedRequestMixin,
)
EmbedTask = Literal[
"sparse",
"dense",
"dense&sparse",
]
EMBED_TASKS: tuple[EmbedTask, ...] = get_args(EmbedTask)
class SparseEmbeddingCompletionRequestMixin(CompletionRequestMixin):
class SparseEmbeddingCompletionRequestMixin(CompletionRequestMixin, EmbedRequestMixin):
return_tokens: bool | None = Field(
default=None,
description="Whether to return dict shows the mapping of token_id to text."
"`None` or False means not return.",
)
embed_task: EmbedTask = Field(
default="dense&sparse",
description="embed task, can be one of 'sparse', 'dense' , 'dense&sparse', "
"default to 'dense&sparse'",
)
def to_embed_requests_offline(self) -> list[EmbedRequestMixin]:
if isinstance(self.input, list):
return [self] * len(self.input)
return [self]
def to_embed_requests_online(self) -> list[EmbedRequestMixin]:
return [self]
class SparseEmbeddingTokenWeight(BaseModel):
......@@ -23,8 +49,9 @@ class SparseEmbeddingTokenWeight(BaseModel):
class SparseEmbeddingResponseData(BaseModel):
index: int
object: str = "sparse-embedding"
sparse_embedding: list[SparseEmbeddingTokenWeight]
object: str = "dense&sparse"
sparse_embedding: list[SparseEmbeddingTokenWeight] | None
dense_embedding: list[float] | None
class SparseEmbeddingResponse(BaseModel):
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import datetime
import os
import tempfile
......@@ -11,6 +10,7 @@ from typing import Any
import albumentations
import numpy as np
import pybase64 as base64
import rasterio
import regex as re
import torch
......
......@@ -19,6 +19,12 @@ model_config = {
),
}
dense_embedding_sum = [
-0.7214539647102356, # "What is the capital of France?"
-0.6926871538162231, # "What is the capital of Germany?"
-0.7129564881324768, # "What is the capital of Spain?"
]
def _float_close(expected: object, result: object):
assert isinstance(expected, float) and isinstance(result, float), (
......@@ -33,6 +39,12 @@ def _get_attr_or_val(obj: object | dict, key: str):
return getattr(obj, key, None)
def _check_dense_embedding(data, index=0):
assert _float_close(sum(data), dense_embedding_sum[index]), (
"dense-embedding result not match"
)
def _check_sparse_embedding(data, check_tokens=False):
expected_weights = [
{"token_id": 32, "weight": 0.0552978515625, "token": "?"},
......@@ -109,7 +121,7 @@ async def test_bge_m3_sparse_plugin_online(
assert len(_get_attr_or_val(parsed_response, "data")) > 0
data_entry = _get_attr_or_val(parsed_response, "data")[0]
assert _get_attr_or_val(data_entry, "object") == "sparse-embedding"
assert _get_attr_or_val(data_entry, "object") == "dense&sparse"
assert _get_attr_or_val(data_entry, "sparse_embedding")
# Verify sparse embedding format
......@@ -117,6 +129,11 @@ async def test_bge_m3_sparse_plugin_online(
assert isinstance(sparse_embedding, list)
_check_sparse_embedding(sparse_embedding, return_tokens)
# Verify dense embedding format
dense_embedding = _get_attr_or_val(data_entry, "dense_embedding")
assert isinstance(dense_embedding, list)
_check_dense_embedding(dense_embedding)
# Verify usage information
usage = _get_attr_or_val(parsed_response, "usage")
assert usage, f"usage not found for {parsed_response}"
......@@ -164,6 +181,9 @@ def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool):
sparse_embedding = output.sparse_embedding
assert isinstance(sparse_embedding, list)
_check_sparse_embedding(sparse_embedding, return_tokens)
dense_embedding = output.dense_embedding
assert isinstance(dense_embedding, list)
_check_dense_embedding(dense_embedding)
# Verify usage
assert response.usage.prompt_tokens > 0
......@@ -206,6 +226,9 @@ def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner):
# Each output should have sparse embeddings
sparse_embedding = output.sparse_embedding
assert isinstance(sparse_embedding, list)
dense_embedding = output.dense_embedding
assert isinstance(dense_embedding, list)
_check_dense_embedding(dense_embedding, i)
# Verify usage
assert response.usage.prompt_tokens > 0
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import imagehash
import pybase64 as base64
import pytest
import requests
from PIL import Image
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
def test_mi3xx_moe():
    """Placeholder test: prints a reminder and checks nothing yet."""
    reminder = "TODO: add tests for Mi3xx MoE quantization"
    print(reminder)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
from vllm.reasoning.kimi_k2_reasoning_parser import KimiK2ReasoningParser
from vllm.tokenizers import get_tokenizer
# Model identifier handed to ``get_tokenizer`` by the ``kimi_k2_tokenizer`` fixture.
REASONING_MODEL_NAME = "moonshotai/Kimi-K2.5"
@pytest.fixture(scope="module")
def kimi_k2_tokenizer():
    """Build the tokenizer for ``REASONING_MODEL_NAME`` once per test module."""
    tokenizer = get_tokenizer(
        tokenizer_name=REASONING_MODEL_NAME,
        trust_remote_code=True,
    )
    return tokenizer
def test_parser_selection_thinking_enabled(kimi_k2_tokenizer):
    """With ``thinking=True`` the parser holds no identity sub-parser."""
    template_kwargs = {"thinking": True}
    parser = KimiK2ReasoningParser(
        kimi_k2_tokenizer,
        chat_template_kwargs=template_kwargs,
    )

    assert parser._identity_parser is None
def test_parser_selection_thinking_disabled(kimi_k2_tokenizer):
    """With ``thinking=False`` the parser holds an ``IdentityReasoningParser``."""
    template_kwargs = {"thinking": False}
    parser = KimiK2ReasoningParser(
        kimi_k2_tokenizer,
        chat_template_kwargs=template_kwargs,
    )

    assert isinstance(parser._identity_parser, IdentityReasoningParser)
def test_extract_reasoning_with_think_tags(kimi_k2_tokenizer):
    """Text between ``<think>`` and ``</think>`` is reasoning; the rest is content."""
    parser = KimiK2ReasoningParser(kimi_k2_tokenizer)
    request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
    model_output = "<think>step by step reasoning</think>final answer"

    reasoning, content = parser.extract_reasoning(model_output, request)

    assert reasoning == "step by step reasoning"
    assert content == "final answer"
def test_extract_reasoning_empty_thinking(kimi_k2_tokenizer):
    """An empty ``<think></think>`` pair yields ``""`` as the reasoning."""
    parser = KimiK2ReasoningParser(kimi_k2_tokenizer)
    request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
    model_output = "<think></think>final answer"

    reasoning, content = parser.extract_reasoning(model_output, request)

    assert reasoning == ""
    assert content == "final answer"
def test_extract_reasoning_implicit_start(kimi_k2_tokenizer):
    """Without any think tags the whole text is reasoning and content is None."""
    parser = KimiK2ReasoningParser(kimi_k2_tokenizer)
    request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
    model_output = "implicit reasoning with no tags"

    reasoning, content = parser.extract_reasoning(model_output, request)

    assert reasoning == "implicit reasoning with no tags"
    assert content is None
def test_extract_reasoning_tool_section_ends_reasoning(kimi_k2_tokenizer):
    """``<|tool_calls_section_begin|>`` implicitly closes the reasoning part.

    The marker itself is expected to be kept at the start of the content.
    """
    parser = KimiK2ReasoningParser(kimi_k2_tokenizer)
    request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
    model_output = "some reasoning<|tool_calls_section_begin|>tool call data"

    reasoning, content = parser.extract_reasoning(model_output, request)

    assert reasoning == "some reasoning"
    assert content == "<|tool_calls_section_begin|>tool call data"
def test_streaming_reasoning_then_content(kimi_k2_tokenizer):
    """Stream four deltas: ``<think>``, reasoning, ``</think>``, then content.

    The lone start and end tokens must produce no delta (``None``); the text
    between them is reported as reasoning and the text after as content.
    """
    parser = KimiK2ReasoningParser(kimi_k2_tokenizer)
    think_id = parser._start_token_id
    end_think_id = parser._end_token_id
    # Any real (non-special) token id from the tokenizer will do here.
    regular_id = kimi_k2_tokenizer.encode("hello", add_special_tokens=False)[0]

    def feed(prev_text, delta_text, prev_ids, delta_ids):
        """Send one delta; the ``current_*`` values are previous + delta."""
        return parser.extract_reasoning_streaming(
            previous_text=prev_text,
            current_text=prev_text + delta_text,
            delta_text=delta_text,
            previous_token_ids=prev_ids,
            current_token_ids=prev_ids + delta_ids,
            delta_token_ids=delta_ids,
        )

    # 1) A lone <think> token is swallowed.
    result = feed("", "<think>", [], [think_id])
    assert result is None

    # 2) Ordinary text while thinking is reported as reasoning.
    result = feed("<think>", "step one", [think_id], [regular_id])
    assert isinstance(result, DeltaMessage)
    assert result.reasoning == "step one"
    assert result.content is None

    # 3) A lone </think> token is swallowed as well.
    result = feed(
        "<think>step one",
        "</think>",
        [think_id, regular_id],
        [end_think_id],
    )
    assert result is None

    # 4) Whatever follows </think> is regular content.
    content_id = kimi_k2_tokenizer.encode("world", add_special_tokens=False)[0]
    result = feed(
        "<think>step one</think>",
        "answer",
        [think_id, regular_id, end_think_id],
        [content_id],
    )
    assert isinstance(result, DeltaMessage)
    assert result.content == "answer"
def test_streaming_tool_section_ends_reasoning(kimi_k2_tokenizer):
    """A streamed ``<|tool_calls_section_begin|>`` delta is emitted as content."""
    parser = KimiK2ReasoningParser(kimi_k2_tokenizer)
    think_id = parser._start_token_id
    tool_begin_id = parser._tool_section_start_token_id
    regular_id = kimi_k2_tokenizer.encode("hello", add_special_tokens=False)[0]

    tool_marker = "<|tool_calls_section_begin|>"
    text_so_far = "<think>thinking"
    ids_so_far = [think_id, regular_id]

    # The tool-section token arrives while the parser is still in reasoning.
    result = parser.extract_reasoning_streaming(
        previous_text=text_so_far,
        current_text=text_so_far + tool_marker,
        delta_text=tool_marker,
        previous_token_ids=ids_so_far,
        current_token_ids=ids_so_far + [tool_begin_id],
        delta_token_ids=[tool_begin_id],
    )

    assert isinstance(result, DeltaMessage)
    assert result.content == "<|tool_calls_section_begin|>"
......@@ -21,119 +21,119 @@ def step3p5_tokenizer():
SIMPLE_REASONING = {
"output": "This is a reasoning section</think>This is the rest",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
# need to get into parser again to remove newline after </think>
COMPLETE_REASONING = {
"output": "This is a reasoning section</think>",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": False,
}
NO_CONTENT = {
"output": "This is content",
"reasoning_content": "This is content",
"reasoning": "This is content",
"content": None,
"is_reasoning_end": False,
}
NO_REASONING_STREAMING = {
"output": "This is a reasoning section",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": False,
}
MULTIPLE_LINES = {
"output": "This\nThat</think>This is the rest\nThat",
"reasoning_content": "This\nThat",
"reasoning": "This\nThat",
"content": "This is the rest\nThat",
"is_reasoning_end": True,
}
SHORTEST_REASONING_NO_STREAMING = {
"output": "</think>This is the rest",
"reasoning_content": None,
"reasoning": None,
"content": "This is the rest",
"is_reasoning_end": True,
}
SHORTEST_REASONING = {
"output": "</think>This is the rest",
"reasoning_content": None,
"reasoning": None,
"content": "This is the rest",
"is_reasoning_end": True,
}
REASONING_WITH_THINK = {
"output": "<think>This is a reasoning section</think>This is the rest",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
COMPLETE_REASONING_WITH_THINK = {
"output": "<think>This is a reasoning section</think>",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": False,
}
MULTIPLE_LINES_WITH_THINK = {
"output": "<think>This\nThat</think>This is the rest\nThat",
"reasoning_content": "This\nThat",
"reasoning": "This\nThat",
"content": "This is the rest\nThat",
"is_reasoning_end": True,
}
SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
"output": "</think>This is the rest",
"reasoning_content": None,
"reasoning": None,
"content": "This is the rest",
"is_reasoning_end": True,
}
SHORTEST_REASONING_WITH_THINK = {
"output": "</think>This is the rest",
"reasoning_content": None,
"reasoning": None,
"content": "This is the rest",
"is_reasoning_end": True,
}
THINK_NO_END = {
"output": "<think>This is a reasoning section",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": False,
}
EMPTY = {
"output": "",
"reasoning_content": None,
"reasoning": None,
"content": None,
"is_reasoning_end": False,
}
EMPTY_STREAMING = {
"output": "",
"reasoning_content": None,
"reasoning": None,
"content": None,
"is_reasoning_end": False,
}
NEW_LINE = {
"output": "\n<think>This is a reasoning section</think>\nThis is the rest",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
NEW_LINE_STREAMING = {
"output": "\n<think>This is a reasoning section\n</think>\nThis is the rest",
"reasoning_content": "\nThis is a reasoning section",
"reasoning": "\nThis is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
NEW_LINE_STREAMING_COMPLEX_CONTENT = {
"output": "\n This is a \n reasoning section\n\n\n</think>\n\nThis is the rest",
"reasoning_content": "\n This is a \n reasoning section\n\n",
"reasoning": "\n This is a \n reasoning section\n\n",
"content": "\nThis is the rest",
"is_reasoning_end": True,
}
MULTI_TURN_PROMPT_CONTENT = {
"output": "<think> This is last turn's reasoning section </think> hello <think>",
"reasoning_content": "",
"reasoning": "",
"content": "",
"is_reasoning_end": False,
}
......@@ -296,7 +296,7 @@ def test_reasoning(
print(f"content: {content}")
test_id = request.node.callspec.id if hasattr(request.node, "callspec") else None
if request.node.callspec.id != "multi_turn_prompt_content":
assert reasoning == param_dict["reasoning_content"]
assert reasoning == param_dict["reasoning"]
assert content == param_dict["content"]
# Test is_reasoning_end
......
......@@ -5,9 +5,9 @@ Tests verify that malicious sparse tensors are rejected before they can trigger
out-of-bounds memory writes during to_dense() operations.
"""
import base64
import io
import pybase64 as base64
import pytest
import torch
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment