Commit 9c4ecf15 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.4' into v0.8.4-ori

parents bfc2d6f7 dc1b4a6f
......@@ -5,6 +5,7 @@
ray
triton==3.1.0
pandas
numpy==1.26.4
tabulate
setuptools>=61
setuptools-scm>=8
......
......@@ -2,7 +2,7 @@
-r common.txt
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61; python_version > '3.9'
numba == 0.61.2; python_version > '3.9'
# Dependencies for AMD GPUs
awscli
......
......@@ -5,6 +5,7 @@ pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
pytest-timeout
# testing utils
awscli
......@@ -27,10 +28,11 @@ torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
transformers==4.51.0
transformers==4.51.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
# quantization
bitsandbytes>=0.45.3
......@@ -40,7 +42,7 @@ genai_perf==0.0.8
tritonclient==2.51.0
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61; python_version > '3.9'
numba == 0.61.2; python_version > '3.9'
numpy
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
......
......@@ -101,6 +101,8 @@ dill==0.3.8
# multiprocess
dnspython==2.7.0
# via email-validator
docopt==0.6.2
# via num2words
docutils==0.16
# via awscli
einops==0.8.0
......@@ -263,7 +265,9 @@ networkx==3.2.1
# via torch
nltk==3.9.1
# via rouge-score
numba==0.61.0
num2words==0.5.14
# via -r requirements/test.in
numba==0.61.2
# via
# -r requirements/test.in
# librosa
......@@ -444,6 +448,7 @@ pytest==8.3.3
# pytest-mock
# pytest-rerunfailures
# pytest-shard
# pytest-timeout
pytest-asyncio==0.24.0
# via -r requirements/test.in
pytest-forked==1.6.0
......@@ -454,6 +459,8 @@ pytest-rerunfailures==14.0
# via -r requirements/test.in
pytest-shard==0.1.2
# via -r requirements/test.in
pytest-timeout==2.3.1
# via -r requirements/test.in
python-dateutil==2.9.0.post0
# via
# botocore
......@@ -645,7 +652,7 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.51.0
transformers==4.51.1
# via
# -r requirements/test.in
# genai-perf
......
......@@ -17,10 +17,10 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250403-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250403-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250403-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250403-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250403-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250403-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
......@@ -2,7 +2,7 @@
from __future__ import annotations
from typing import Any, Union
from typing import Any, Optional, Union
import pytest
import torch
......@@ -15,7 +15,7 @@ from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test
def models_list(all: bool):
def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
......@@ -32,47 +32,50 @@ def models_list(all: bool):
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if not all:
return TEST_MODELS
if all:
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
return TEST_MODELS
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
if keywords is None:
return TEST_MODELS
# filter by keywords
pred = lambda model: any(keyword in model[0] for keyword in keywords)
return list(filter(pred, TEST_MODELS))
@pytest.mark.parametrize(
......@@ -96,20 +99,30 @@ def test_full_graph(
run_model(optimization_level, model, model_kwargs)
PassConfig = CompilationConfig.PassConfig
# TODO(luka) add other supported compilation config scenarios here
@pytest.mark.parametrize(
"compilation_config",
# additional compile sizes
"compilation_config, model_info",
[
CompilationConfig(level=CompilationLevel.PIECEWISE,
compile_sizes=[1, 2])
# additional compile sizes, only some of the models
(CompilationConfig(level=CompilationLevel.PIECEWISE,
compile_sizes=[1, 2]), model)
for model in models_list(all=False)
] + [
# RMSNorm + quant fusion, only 8-bit quant models
(CompilationConfig(level=CompilationLevel.PIECEWISE,
custom_ops=["+rms_norm"],
pass_config=PassConfig(enable_fusion=True,
enable_noop=True)), model)
for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
])
# only test some of the models
@pytest.mark.parametrize("model_info", models_list(all=False))
@create_new_process_for_each_test()
def test_custom_compile_config(
model_info: tuple[str, dict[str, Any]],
compilation_config: CompilationConfig,
model_info: tuple[str, dict[str, Any]],
):
model, model_kwargs = model_info
print(f"MODEL={model}")
......
......@@ -44,12 +44,17 @@ class TestModel(torch.nn.Module):
resid = torch.sqrt(x)
y = self.norm[0](x)
x2 = self.fp8_linear.apply(y, self.w[0], self.wscale[0], self.scale[0])
x2 = self.fp8_linear.apply(y,
self.w[0],
self.wscale[0],
input_scale=self.scale[0])
# make sure resid is used for replacement to work
y2, resid = self.norm[1](x2, resid)
x3 = self.fp8_linear.apply(y2, self.w[1], self.wscale[1],
self.scale[1])
x3 = self.fp8_linear.apply(y2,
self.w[1],
self.wscale[1],
input_scale=self.scale[1])
y3, resid = self.norm[2](x3, resid) # use resid here
return y3
......
......@@ -671,8 +671,9 @@ class HfRunner:
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]:
return self.model.encode(prompts)
def encode(self, prompts: list[str], *args,
**kwargs) -> list[list[torch.Tensor]]:
return self.model.encode(prompts, *args, **kwargs)
def predict(self, prompts: list[list[str]]) -> torch.Tensor:
return self.model.predict(prompts, convert_to_tensor=True)
......@@ -959,19 +960,19 @@ class VllmRunner:
req_outputs = self.model.classify(prompts)
return [req_output.outputs.probs for req_output in req_outputs]
def encode(
self,
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> list[list[float]]:
def encode(self,
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
*args,
**kwargs) -> list[list[float]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
req_outputs = self.model.embed(inputs)
req_outputs = self.model.embed(inputs, *args, **kwargs)
return [req_output.outputs.embedding for req_output in req_outputs]
def score(
......
# SPDX-License-Identifier: Apache-2.0
from argparse import ArgumentTypeError
from argparse import ArgumentError, ArgumentTypeError
import pytest
......@@ -142,3 +142,39 @@ def test_composite_arg_parser(arg, expected, option):
else:
args = parser.parse_args([f"--{option}", arg])
assert getattr(args, option.replace("-", "_")) == expected
def test_human_readable_model_len():
# `exit_on_error` disabled to test invalid values below
parser = EngineArgs.add_cli_args(
FlexibleArgumentParser(exit_on_error=False))
args = parser.parse_args([])
assert args.max_model_len is None
args = parser.parse_args(["--max-model-len", "1024"])
assert args.max_model_len == 1024
# Lower
args = parser.parse_args(["--max-model-len", "1m"])
assert args.max_model_len == 1_000_000
args = parser.parse_args(["--max-model-len", "10k"])
assert args.max_model_len == 10_000
# Capital
args = parser.parse_args(["--max-model-len", "3K"])
assert args.max_model_len == 1024 * 3
args = parser.parse_args(["--max-model-len", "10M"])
assert args.max_model_len == 2**20 * 10
# Decimal values
args = parser.parse_args(["--max-model-len", "10.2k"])
assert args.max_model_len == 10200
# ..truncated to the nearest int
args = parser.parse_args(["--max-model-len", "10.212345k"])
assert args.max_model_len == 10212
# Invalid (do not allow decimals with binary multipliers)
for invalid in ["1a", "pwd", "10.24", "1.23M"]:
with pytest.raises(ArgumentError):
args = parser.parse_args(["--max-model-len", invalid])
......@@ -18,7 +18,8 @@ models = ["llava-hf/llava-1.5-7b-hf"]
def test_context_length_too_short(vllm_runner, image_assets, model):
images = [asset.pil_image for asset in image_assets]
with pytest.raises(ValueError, match="too long to fit into the model"):
with pytest.raises(ValueError,
match="longer than the maximum model length"):
vllm_model = vllm_runner(
model,
max_model_len=128, # LLaVA has a feature size of 576
......
......@@ -3,9 +3,11 @@
import json
import re
import weakref
from enum import Enum
import jsonschema
import pytest
from pydantic import BaseModel
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.entrypoints.llm import LLM
......@@ -284,15 +286,26 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
@pytest.mark.skip_global_cleanup
def test_disable_guided_decoding_fallback(sample_regex, llm):
# see has_xgrammar_unsupported_json_features()
unsupported_json = {
"type": "object",
"properties": {
"example": {
"type": "string",
"minLength": 5 # unsupported by xgrammar
}
}
}
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(
regex=sample_regex,
json=unsupported_json,
backend="xgrammar:no-fallback"))
with pytest.raises(
ValueError,
match="xgrammar does not support regex guided decoding"):
match="xgrammar does not support advanced JSON schema features "
"like enums, patterns or numeric ranges."):
llm.generate(prompts="This should fail",
sampling_params=sampling_params,
use_tqdm=True)
......@@ -330,3 +343,44 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
# Parse to verify it is valid JSON
parsed_json = json.loads(generated_text)
assert isinstance(parsed_json, dict)
class CarType(str, Enum):
sedan = "sedan"
suv = "SUV"
truck = "Truck"
coupe = "Coupe"
class CarDescription(BaseModel):
brand: str
model: str
car_type: CarType
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str):
json_schema = CarDescription.model_json_schema()
sampling_params = SamplingParams(temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(
json=json_schema,
backend=guided_decoding_backend))
outputs = llm.generate(
prompts="Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's",
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, schema=json_schema)
\ No newline at end of file
......@@ -15,7 +15,7 @@ def v1(run_with_both_engines):
def test_empty_prompt():
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='Prompt cannot be empty'):
with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
llm.generate([""])
......
......@@ -12,7 +12,9 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
TEST_AUDIO_URLS = [
AudioAsset("winning_call").url,
AudioAsset("mary_had_lamb").url,
]
MAXIMUM_AUDIOS = 2
@pytest.fixture(scope="module")
......@@ -24,6 +26,8 @@ def server():
"5",
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"audio={MAXIMUM_AUDIOS}",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......@@ -46,7 +50,7 @@ def base64_encoded_audio() -> dict[str, str]:
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
model_name: str, audio_url: str):
messages = [{
......@@ -100,7 +104,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio_base64encoded(
client: openai.AsyncOpenAI, model_name: str, audio_url: str,
base64_encoded_audio: dict[str, str]):
......@@ -158,7 +162,7 @@ async def test_single_chat_session_audio_base64encoded(
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_input_audio(
client: openai.AsyncOpenAI, model_name: str, audio_url: str,
base64_encoded_audio: dict[str, str]):
......@@ -330,28 +334,21 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
@pytest.mark.parametrize(
"audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]])
async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
audio_url: str,
base64_encoded_audio: dict[str, str]):
audio_urls: list[str]):
messages = [{
"role":
"user",
"content": [
{
*({
"type": "audio_url",
"audio_url": {
"url": audio_url
}
},
{
"type": "input_audio",
"input_audio": {
"data": base64_encoded_audio[audio_url],
"format": "wav"
}
},
} for audio_url in audio_urls),
{
"type": "text",
"text": "What's happening in this audio?"
......@@ -359,20 +356,30 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
],
}]
with pytest.raises(openai.BadRequestError): # test multi-audio input
await client.chat.completions.create(
if len(audio_urls) > MAXIMUM_AUDIOS:
with pytest.raises(openai.BadRequestError): # test multi-audio input
await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
# the server should still work afterwards
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
else:
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
# the server should still work afterwards
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
......@@ -20,8 +20,6 @@ from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@pytest.fixture(scope="module")
def monkeypatch_module():
......@@ -487,20 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
assert last_completion_tokens == 10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# (i.e. using the same ordering as in the Completions API tests), the test
# will fail on the second `guided_decoding_backend` even when I swap their order
# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
is_v1_server: bool,
guided_decoding_backend: str,
sample_guided_choice):
if is_v1_server and guided_decoding_backend != 'xgrammar':
pytest.skip("Only xgrammar backend is supported with V1")
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......@@ -515,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
messages=messages,
max_completion_tokens=10,
temperature=0.7,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
extra_body=dict(guided_choice=sample_guided_choice))
choice1 = chat_completion.choices[0].message.content
assert choice1 in sample_guided_choice
......@@ -530,22 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
messages=messages,
max_completion_tokens=10,
temperature=0.7,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
extra_body=dict(guided_choice=sample_guided_choice))
choice2 = chat_completion.choices[0].message.content
assert choice2 in sample_guided_choice
assert choice1 != choice2
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
guided_decoding_backend: str,
async def test_guided_json_chat(client: openai.AsyncOpenAI,
sample_json_schema):
if is_v1_server:
pytest.skip("sample_json_schema has features unsupported in V1")
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......@@ -560,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
extra_body=dict(guided_json=sample_json_schema))
message = chat_completion.choices[0].message
assert message.content is not None
json1 = json.loads(message.content)
......@@ -578,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
extra_body=dict(guided_json=sample_json_schema))
message = chat_completion.choices[0].message
assert message.content is not None
json2 = json.loads(message.content)
......@@ -589,13 +567,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_regex_chat(client: openai.AsyncOpenAI,
is_v1_server: bool,
guided_decoding_backend: str, sample_regex):
if is_v1_server and guided_decoding_backend != 'xgrammar':
pytest.skip("Only xgrammar backend is supported with V1")
async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex):
messages = [{
"role": "system",
......@@ -610,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
model=MODEL_NAME,
messages=messages,
max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
extra_body=dict(guided_regex=sample_regex))
ip1 = chat_completion.choices[0].message.content
assert ip1 is not None
assert re.fullmatch(sample_regex, ip1) is not None
......@@ -622,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
model=MODEL_NAME,
messages=messages,
max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
extra_body=dict(guided_regex=sample_regex))
ip2 = chat_completion.choices[0].message.content
assert ip2 is not None
assert re.fullmatch(sample_regex, ip2) is not None
......@@ -652,15 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
is_v1_server: bool,
guided_decoding_backend: str,
sample_guided_choice):
if is_v1_server and guided_decoding_backend != 'xgrammar':
pytest.skip("Only xgrammar backend is supported with V1")
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......@@ -676,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
extra_body=dict(guided_choice=sample_guided_choice))
assert chat_completion.choices[0].logprobs is not None
assert chat_completion.choices[0].logprobs.content is not None
......@@ -689,14 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
guided_decoding_backend: str,
sample_json_schema):
if is_v1_server:
pytest.skip("sample_json_schema has features unsupported on V1")
async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......@@ -728,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
"name": "dummy_function_name"
}
},
extra_body=dict(guided_decoding_backend=guided_decoding_backend))
)
message = chat_completion.choices[0].message
assert len(message.content) == 0
json_string = message.tool_calls[0].function.arguments
......@@ -763,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
"name": "dummy_function_name"
}
},
extra_body=dict(guided_decoding_backend=guided_decoding_backend),
stream=True)
output = []
......@@ -888,7 +843,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
model=model_name,
tools=tools,
tool_choice="required",
extra_body=dict(guided_decoding_backend="outlines"),
)
assert chat_completion.choices[0].message.tool_calls is not None
......@@ -900,7 +854,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
model=model_name,
tools=tools,
tool_choice="required",
extra_body=dict(guided_decoding_backend="outlines"),
stream=True,
)
......@@ -914,12 +867,7 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
is_v1_server: bool,
sample_json_schema):
if is_v1_server:
pytest.skip("sample_json_schema has features unsupported on V1")
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......
# SPDX-License-Identifier: Apache-2.0
import openai
import pytest
import pytest_asyncio
from vllm.config import ModelConfig
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
def get_vocab_size(model_name):
config = ModelConfig(
model=model_name,
task="auto",
tokenizer=model_name,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="bfloat16",
)
return config.get_vocab_size()
@pytest.fixture(scope="module")
def server():
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"1024",
"--enforce-eager",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_chat_logit_bias_valid(client):
"""Test that valid logit_bias values are accepted in chat completions."""
vocab_size = get_vocab_size(MODEL_NAME)
valid_token_id = vocab_size - 1
completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role": "user",
"content": "Testing valid logit bias"
}],
max_tokens=5,
logit_bias={str(valid_token_id): 1.0},
)
assert completion.choices[0].message.content is not None
@pytest.mark.asyncio
async def test_chat_logit_bias_invalid(client):
"""Test that invalid logit_bias values are rejected in chat completions."""
vocab_size = get_vocab_size(MODEL_NAME)
invalid_token_id = vocab_size + 1
with pytest.raises(openai.BadRequestError) as excinfo:
await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role": "user",
"content": "Testing invalid logit bias"
}],
max_tokens=5,
logit_bias={str(invalid_token_id): 1.0},
)
error = excinfo.value
error_message = str(error)
assert error.status_code == 400
assert str(invalid_token_id) in error_message
assert str(vocab_size) in error_message
......@@ -11,6 +11,7 @@ import requests
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...models.embedding.utils import check_embeddings_close
from ...utils import RemoteOpenAIServer
MODEL_NAME = "intfloat/multilingual-e5-small"
......@@ -190,30 +191,35 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
responses_float = await client.embeddings.create(input=input_texts,
model=model_name,
encoding_format="float")
float_data = [d.embedding for d in responses_float.data]
responses_base64 = await client.embeddings.create(input=input_texts,
model=model_name,
encoding_format="base64")
decoded_responses_base64_data = []
base64_data = []
for data in responses_base64.data:
decoded_responses_base64_data.append(
base64_data.append(
np.frombuffer(base64.b64decode(data.embedding),
dtype="float32").tolist())
assert responses_float.data[0].embedding == decoded_responses_base64_data[
0]
assert responses_float.data[1].embedding == decoded_responses_base64_data[
1]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=base64_data,
name_0="float",
name_1="base64",
)
# Default response is float32 decoded from base64 by OpenAI Client
responses_default = await client.embeddings.create(input=input_texts,
model=model_name)
default_data = [d.embedding for d in responses_default.data]
assert responses_float.data[0].embedding == responses_default.data[
0].embedding
assert responses_float.data[1].embedding == responses_default.data[
1].embedding
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=default_data,
name_0="float",
name_1="default",
)
@pytest.mark.asyncio
......
# SPDX-License-Identifier: Apache-2.0
"""
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
"""
from typing import NamedTuple
import openai
import pytest
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from ...utils import RemoteOpenAIServer
class ModelInfo(NamedTuple):
name: str
is_matryoshka: bool
MODELS = [
ModelInfo(name="BAAI/bge-m3", is_matryoshka=False),
ModelInfo(name="jinaai/jina-embeddings-v3", is_matryoshka=True),
]
input_texts = [
"The chef prepared a delicious meal.",
] * 3
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_validating_dimensions(model: ModelInfo):
args = [
"--task",
"embed",
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--enforce-eager",
"--max-model-len",
"512",
"--trust_remote_code"
]
with RemoteOpenAIServer(model.name, args) as remote_server:
client = remote_server.get_async_client()
async def make_request(dimensions):
embedding_response = await client.embeddings.create(
model=model.name,
input=input_texts,
dimensions=dimensions,
encoding_format="float",
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json"))
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) > 0
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens > 0
assert embeddings.usage.total_tokens > 0
if dimensions is not None:
assert len(embeddings.data[0].embedding) == dimensions
if model.is_matryoshka:
for dimensions in [None, 16]:
await make_request(dimensions)
with pytest.raises(openai.BadRequestError):
for dimensions in [-1]:
await make_request(dimensions)
else:
for dimensions in [None]:
await make_request(dimensions)
with pytest.raises(openai.BadRequestError):
for dimensions in [-1, 16]:
await make_request(dimensions)
......@@ -17,7 +17,7 @@ async def test_empty_prompt():
client = remote_server.get_async_client()
with pytest.raises(openai.BadRequestError,
match=re.compile('.+Prompt cannot be empty.+')):
match="decoder prompt cannot be empty"):
await client.completions.create(model=model_name,
prompt="",
max_tokens=5,
......
......@@ -25,11 +25,13 @@ EXAMPLES_DIR = VLLM_PATH / "examples"
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@pytest.fixture(scope="function")
......@@ -80,6 +82,30 @@ def mllama_tokenizer():
)
@pytest.fixture(scope="function")
def mistral_model_config():
return ModelConfig(MISTRAL_MODEL_ID,
task="generate",
tokenizer=MISTRAL_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="auto",
seed=0,
limit_mm_per_prompt={
"image": 2,
})
@pytest.fixture(scope="module")
def mistral_tokenizer():
return TokenizerGroup(
tokenizer_id=MISTRAL_MODEL_ID,
enable_lora=False,
max_num_seqs=5,
max_input_length=None,
)
@pytest.fixture(scope="module")
def image_url():
image = ImageAsset('cherry_blossom')
......@@ -131,6 +157,66 @@ def test_parse_chat_messages_single_image(
_assert_mm_data_is_image_input(mm_data, 1)
def test_parse_chat_messages_empty_system(
mistral_model_config,
mistral_tokenizer,
):
# Test string format
conversation, _ = parse_chat_messages(
[{
"role": "system",
"content": ""
}, {
"role": "user",
"content": [{
"type": "text",
"text": "Who are you?"
}]
}],
mistral_model_config,
mistral_tokenizer,
content_format="string",
)
assert conversation == [{
"role": "system",
"content": ""
}, {
"role": "user",
"content": "Who are you?"
}]
# Test openai format
conversation, _ = parse_chat_messages(
[{
"role": "system",
"content": ""
}, {
"role": "user",
"content": [{
"type": "text",
"text": "Who are you?"
}]
}],
mistral_model_config,
mistral_tokenizer,
content_format="openai",
)
assert conversation == [{
"role": "system",
"content": [{
"type": "text",
"text": ""
}]
}, {
"role":
"user",
"content": [{
"type": "text",
"text": "Who are you?"
}]
}]
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
phi3v_model_config,
......@@ -671,7 +757,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
# Build a config for the model
model_config = ModelConfig(model,
task="generate",
tokenizer=MLLAMA_MODEL_ID,
tokenizer=model,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="auto",
......@@ -682,7 +768,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
# Build the tokenizer group and grab the underlying tokenizer
tokenizer_group = TokenizerGroup(
MLLAMA_MODEL_ID,
model,
enable_lora=False,
max_num_seqs=5,
max_input_length=None,
......@@ -756,6 +842,8 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
assert isinstance(chat_template, str)
# NOTE: Qwen2-Audio default chat template is specially defined inside
# processor class instead of using `tokenizer_config.json`
# yapf: disable
@pytest.mark.parametrize(
("model", "expected_format"),
......@@ -763,6 +851,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
(QWEN2VL_MODEL_ID, "openai"),
(QWEN25VL_MODEL_ID, "openai"),
(ULTRAVOX_MODEL_ID, "string"),
(QWEN2AUDIO_MODEL_ID, "openai"),
(MLLAMA_MODEL_ID, "openai"),
(LLAMA_GUARD_MODEL_ID, "openai")],
)
......@@ -815,10 +904,13 @@ def test_resolve_content_format_hf_defined(model, expected_format):
("template_chatglm2.jinja", "string"),
("template_chatml.jinja", "string"),
("template_deepseek_vl2.jinja", "string"),
("template_dse_qwen2_vl.jinja", "openai"),
("template_falcon_180b.jinja", "string"),
("template_falcon.jinja", "string"),
("template_florence2.jinja", "string"),
("template_inkbot.jinja", "string"),
("template_llava.jinja", "string"),
("template_teleflm.jinja", "string"),
("template_vlm2vec.jinja", "openai"),
("tool_chat_template_granite_20b_fc.jinja", "string"),
("tool_chat_template_hermes.jinja", "string"),
......
......@@ -18,6 +18,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8, w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
from .utils_block import native_w8a8_block_matmul
dg_available = False
try:
import deep_gemm
......@@ -75,61 +77,6 @@ def native_per_token_group_quant_fp8(x,
return x_q, x_s
def native_w8a8_block_fp8_matmul(A,
B,
As,
Bs,
block_size,
output_dtype=torch.float16):
"""Matrix multiplication with block-wise quantization using native torch."""
A = A.to(torch.float32)
B = B.to(torch.float32)
assert A.shape[-1] == B.shape[-1]
assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
assert len(block_size) == 2
block_n, block_k = block_size[0], block_size[1]
assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
assert A.shape[:-1] == As.shape[:-1]
M = A.numel() // A.shape[-1]
N, K = B.shape
origin_C_shape = A.shape[:-1] + (N, )
A = A.reshape(M, A.shape[-1])
As = As.reshape(M, As.shape[-1])
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
assert n_tiles == Bs.shape[0]
assert k_tiles == Bs.shape[1]
C_shape = (M, N)
C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
A_tiles = [
A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
]
B_tiles = [[
B[
j * block_n:min((j + 1) * block_n, N),
i * block_k:min((i + 1) * block_k, K),
] for i in range(k_tiles)
] for j in range(n_tiles)]
C_tiles = [
C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
]
As_tiles = [As[:, i:i + 1] for i in range(k_tiles)]
for i in range(k_tiles):
for j in range(n_tiles):
a = A_tiles[i]
b = B_tiles[j][i]
c = C_tiles[j]
s = As_tiles[i] * Bs[j][i]
c[:, :] += torch.matmul(a, b.t()) * s
C = C.reshape(origin_C_shape).to(output_dtype)
return C
def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
"""Fused moe with block-wise quantization using native torch."""
B, D = a.shape
......@@ -146,22 +93,22 @@ def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
for i in range(w1.shape[0]):
mask = topk_ids == i
if mask.sum():
inter_out = native_w8a8_block_fp8_matmul(a_q[mask],
w1[i],
a_s[mask],
w1_s[i],
block_shape,
output_dtype=a.dtype)
inter_out = native_w8a8_block_matmul(a_q[mask],
w1[i],
a_s[mask],
w1_s[i],
block_shape,
output_dtype=a.dtype)
act_out = SiluAndMul().forward_native(inter_out)
act_out_q, act_out_s = native_per_token_group_quant_fp8(
act_out, block_k)
act_out = act_out.to(torch.float32)
out[mask] = native_w8a8_block_fp8_matmul(act_out_q,
w2[i],
act_out_s,
w2_s[i],
block_shape,
output_dtype=a.dtype)
out[mask] = native_w8a8_block_matmul(act_out_q,
w2[i],
act_out_s,
w2_s[i],
block_shape,
output_dtype=a.dtype)
return (out.view(B, -1, w2.shape[1]) *
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
......@@ -215,8 +162,8 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
ref_out = native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size,
out_dtype)
ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
out_dtype)
out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
rel_diff = (torch.mean(
......@@ -239,8 +186,6 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
vllm_config = VllmConfig()
a = torch.randn((M, K), dtype=dtype) / 10
w1_bf16 = (torch.rand(
......@@ -266,6 +211,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
score = torch.randn((M, E), dtype=dtype)
# Set the context to avoid lots of warning spam.
vllm_config = VllmConfig()
with set_current_vllm_config(vllm_config):
out = fused_moe(
a,
......@@ -334,8 +280,8 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
As = As_fp8.to(torch.float32)
Bs = Bs_fp8.to(torch.float32)
ref_out = native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size,
out_dtype)
ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
out_dtype)
# Transpose earlier so that the testing will not trigger transposing kernels
As_fp8 = deep_gemm.get_col_major_tma_aligned_tensor(As_fp8)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment