Commit 4eabe123 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
......@@ -33,6 +33,7 @@ num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.51.3
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
......
......@@ -99,6 +99,7 @@ datasets==3.0.2
# via
# evaluate
# lm-eval
# mteb
decorator==5.1.1
# via librosa
dill==0.3.8
......@@ -124,6 +125,8 @@ email-validator==2.2.0
# via pydantic
encodec==0.1.1
# via vocos
eval-type-backport==0.2.2
# via mteb
evaluate==0.4.3
# via lm-eval
fastparquet==2024.11.0
......@@ -291,6 +294,8 @@ msgpack==1.1.0
# via
# librosa
# ray
mteb==1.38.11
# via -r requirements/test.in
multidict==6.1.0
# via
# aiohttp
......@@ -331,6 +336,7 @@ numpy==1.26.4
# librosa
# matplotlib
# mistral-common
# mteb
# numba
# numexpr
# opencv-python-headless
......@@ -443,6 +449,8 @@ plotly==5.24.1
# via genai-perf
pluggy==1.5.0
# via pytest
polars==1.29.0
# via mteb
pooch==1.8.2
# via librosa
portalocker==2.10.1
......@@ -476,6 +484,7 @@ pydantic==2.9.2
# via
# datamodel-code-generator
# mistral-common
# mteb
pydantic-core==2.23.4
# via pydantic
pygments==2.18.0
......@@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0
# typepy
python-rapidjson==1.20
# via tritonclient
pytrec-eval-terrier==0.5.7
# via mteb
pytz==2024.2
# via
# pandas
......@@ -564,6 +575,7 @@ requests==2.32.3
# huggingface-hub
# lm-eval
# mistral-common
# mteb
# pooch
# ray
# responses
......@@ -580,6 +592,7 @@ rfc3987==1.3.8
rich==13.9.4
# via
# genai-perf
# mteb
# typer
rouge-score==0.1.2
# via lm-eval
......@@ -607,16 +620,20 @@ scikit-learn==1.5.2
# via
# librosa
# lm-eval
# mteb
# sentence-transformers
scipy==1.13.1
# via
# librosa
# mteb
# scikit-learn
# sentence-transformers
# statsmodels
# vocos
sentence-transformers==3.2.1
# via -r requirements/test.in
# via
# -r requirements/test.in
# mteb
sentencepiece==0.2.0
# via mistral-common
setuptools==77.0.3
......@@ -696,6 +713,7 @@ torch==2.7.0+cu128
# fastsafetensors
# lm-eval
# mamba-ssm
# mteb
# peft
# runai-model-streamer
# sentence-transformers
......@@ -720,6 +738,7 @@ tqdm==4.66.6
# evaluate
# huggingface-hub
# lm-eval
# mteb
# nltk
# peft
# pqdm
......@@ -759,6 +778,7 @@ typing-extensions==4.12.2
# huggingface-hub
# librosa
# mistral-common
# mteb
# pqdm
# pydantic
# pydantic-core
......
......@@ -18,9 +18,9 @@ setuptools==78.1.0
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.8.0.dev20250430
torchvision==0.22.0.dev20250430
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch==2.8.0.dev20250518
torchvision==0.22.0.dev20250518
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
......@@ -5,12 +5,12 @@ import importlib.util
import json
import logging
import os
import re
import subprocess
import sys
from pathlib import Path
from shutil import which
import regex as re
import torch
from packaging.version import Version, parse
from setuptools import Extension, setup
......@@ -389,7 +389,6 @@ class repackage_wheel(build_ext):
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
import re
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members += list(
......
......@@ -8,12 +8,13 @@ import weakref
from unittest.mock import Mock
import pytest
import torch
from vllm import LLM
from vllm import LLM, envs
from vllm.platforms import current_platform
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
from ..conftest import VllmRunner
from ..conftest import HfRunner, VllmRunner
from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test
......@@ -43,11 +44,26 @@ def test_vllm_gc_ed():
assert weak_llm() is None
def _fix_prompt_embed_outputs(
vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner,
example_prompts: list[str]) -> list[tuple[list[int], str]]:
fixed_vllm_outputs = []
for vllm_output, hf_input, prompt in zip(
vllm_outputs, hf_model.get_inputs(example_prompts),
example_prompts):
hf_input_ids = hf_input["input_ids"].tolist()[0]
fixed_vllm_outputs.append(
(hf_input_ids + vllm_output[0][len(hf_input_ids):],
prompt + vllm_output[1]))
return fixed_vllm_outputs
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner,
......@@ -56,8 +72,13 @@ def test_models(
dtype: str,
max_tokens: int,
enforce_eager: bool,
enable_prompt_embeds: bool,
) -> None:
if enable_prompt_embeds and envs.is_set(
"VLLM_USE_V1") and envs.VLLM_USE_V1:
pytest.skip("enable_prompt_embeds is not supported in v1.")
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
......@@ -78,14 +99,25 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if enable_prompt_embeds:
with torch.no_grad():
prompt_embeds = hf_model.get_prompt_embeddings(
example_prompts)
with VllmRunner(model,
max_model_len=8192,
dtype=dtype,
enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(
prompt_embeds, max_tokens)
vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts)
else:
vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
......@@ -108,6 +140,7 @@ def test_models(
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
def test_models_distributed(
monkeypatch: pytest.MonkeyPatch,
hf_runner,
......@@ -117,14 +150,22 @@ def test_models_distributed(
distributed_executor_backend: str,
attention_backend: str,
test_suite: str,
enable_prompt_embeds: bool,
) -> None:
if enable_prompt_embeds and envs.is_set(
"VLLM_USE_V1") and envs.VLLM_USE_V1:
pytest.skip("enable_prompt_embeds is not supported in v1.")
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")
with monkeypatch.context() as monkeypatch_context:
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test Ray Compiled Graph
if enable_prompt_embeds:
pytest.skip(
"enable_prompt_embeds does not work with ray compiled dag."
)
monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
......@@ -147,12 +188,26 @@ def test_models_distributed(
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
if enable_prompt_embeds:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with torch.no_grad():
prompt_embeds = hf_model.get_prompt_embeddings(
example_prompts)
vllm_outputs = vllm_model.generate_greedy(
prompt_embeds, max_tokens)
vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts)
hf_outputs = hf_model.generate_greedy(
example_prompts, max_tokens)
else:
vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(
example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
......
......@@ -5,6 +5,8 @@ from typing import Callable, Union
from torch import fx
from vllm.compilation.fx_utils import (find_specified_fn,
find_specified_fn_maybe)
from vllm.compilation.inductor_pass import InductorPass
from vllm.config import get_current_vllm_config
......@@ -44,3 +46,19 @@ class TestBackend:
self.graph_post_pass = deepcopy(graph)
# assign by reference, will reflect the final state of the graph
self.final_graph = graph
def check_before_ops(self,
                     ops,
                     find_fn=find_specified_fn,
                     find_fn_maybe=find_specified_fn_maybe,
                     ops_fully_replaced=True):
    """Check ops that the pass is expected to consume.

    Every op in ``ops`` is looked up in the pre-pass graph with ``find_fn``.
    When ``ops_fully_replaced`` is true, each op must additionally be absent
    from the post-pass graph (``find_fn_maybe`` must return ``None``).
    """
    for expected_op in ops:
        # NOTE(review): find_fn is assumed to fail on its own when the op is
        # missing; its return value is deliberately ignored here.
        find_fn(self.graph_pre_pass.nodes, expected_op)
        if not ops_fully_replaced:
            continue
        leftover = find_fn_maybe(self.graph_post_pass.nodes, expected_op)
        assert leftover is None
def check_after_ops(self,
                    ops,
                    find_fn=find_specified_fn,
                    find_fn_maybe=find_specified_fn_maybe):
    """Check ops that the pass is expected to introduce.

    Every op in ``ops`` is looked up in the post-pass graph with ``find_fn``
    and must be absent from the pre-pass graph (``find_fn_maybe`` must
    return ``None``).
    """
    for produced_op in ops:
        # NOTE(review): find_fn is assumed to fail on its own when the op is
        # missing; its return value is deliberately ignored here.
        find_fn(self.graph_post_pass.nodes, produced_op)
        already_present = find_fn_maybe(self.graph_pre_pass.nodes,
                                        produced_op)
        assert already_present is None
# SPDX-License-Identifier: Apache-2.0
import json
import pytest
import torch
import vllm.envs as envs
from vllm.compilation.collective_fusion import AsyncTPPass
from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
PassConfig, VllmConfig)
from vllm.distributed import (tensor_model_parallel_all_gather,
tensor_model_parallel_reduce_scatter)
from vllm.distributed.parallel_state import (init_distributed_environment,
initialize_model_parallel)
from vllm.platforms import current_platform
from vllm.utils import update_environment_variables
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import (compare_two_settings, create_new_process_for_each_test,
multi_gpu_test)
from .backend import TestBackend
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
class TestMMRSModel(torch.nn.Module):
    """Toy module whose forward is a matmul followed by a reduce-scatter."""

    def __init__(self, hidden_size=16):
        super().__init__()
        self.hidden_size = hidden_size
        # Frozen projection weight of shape (2 * hidden_size, hidden_size).
        projection = torch.empty((self.hidden_size * 2, hidden_size))
        self.gate_proj = torch.nn.Parameter(projection, requires_grad=False)
        # Initialize weights
        torch.nn.init.normal_(self.gate_proj, std=0.02)

    def forward(self, hidden_states):
        """
        Run reshape -> permute -> mm -> reduce scatter so that the traced
        FX graph contains the mm + reduce scatter sequence.
        """
        # Flatten the input to rows of width hidden_size.
        flat_input = hidden_states.reshape(-1, self.hidden_size)

        # Matrix multiplication against the transposed weight.
        weight_t = self.gate_proj.permute(1, 0)
        product = torch.mm(flat_input, weight_t)
        return tensor_model_parallel_reduce_scatter(product, dim=0)

    def ops_in_model_before(self):
        """Ops passed to the backend's pre-pass graph check."""
        return [torch.ops.vllm.reduce_scatter.default]

    def ops_in_model_after(self):
        """Ops passed to the backend's post-pass graph check."""
        return [torch.ops.symm_mem.fused_matmul_reduce_scatter.default]
class TestAGMMModel(torch.nn.Module):
    """Toy module whose forward is an all-gather followed by a matmul."""

    def __init__(self, hidden_size=16):
        super().__init__()
        self.hidden_size = hidden_size
        # Frozen square weight of shape (hidden_size, hidden_size).
        square = torch.empty((hidden_size, hidden_size))
        self.weight = torch.nn.Parameter(square, requires_grad=False)
        # Initialize weights
        torch.nn.init.normal_(self.weight, std=0.02)

    def forward(self, hidden_states):
        """
        Run reshape -> all gather -> permute -> mm so that the traced FX
        graph contains the all gather + mm sequence.
        """
        # Flatten the input to rows of width hidden_size.
        flat_input = hidden_states.reshape(-1, self.hidden_size)
        gathered = tensor_model_parallel_all_gather(flat_input, dim=0)
        weight_t = self.weight.permute(1, 0)
        return torch.mm(gathered, weight_t)

    def ops_in_model_before(self):
        """Ops passed to the backend's pre-pass graph check."""
        return [torch.ops.vllm.all_gather.default]

    def ops_in_model_after(self):
        """Ops passed to the backend's post-pass graph check."""
        return [torch.ops.symm_mem.fused_all_gather_matmul.default]
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("test_model", [TestMMRSModel, TestAGMMModel])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [16])
@pytest.mark.parametrize("hidden_size", [16])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
                    reason="Only test on CUDA")
def test_async_tp_pass_replace(test_model: type[torch.nn.Module],
                               batch_size: int, seq_len: int,
                               hidden_size: int, dtype: torch.dtype):
    """Run the async-TP pass check for one toy model in two worker processes.

    Args:
        test_model: The model *class* (not an instance or a name) to build
            in each worker.
        batch_size, seq_len, hidden_size: Sizes forwarded to the worker.
        dtype: Default dtype used by the worker.
    """
    num_processes = 2

    # need to use torch.mp.spawn otherwise will have problems with
    # torch.distributed and cuda
    # spawn() passes the process index as the first positional argument,
    # followed by `args`; the worker receives it as `local_rank`.
    torch.multiprocessing.spawn(async_tp_pass_on_test_model,
                                args=(num_processes, test_model, batch_size,
                                      seq_len, hidden_size, dtype),
                                nprocs=num_processes)
def async_tp_pass_on_test_model(local_rank: int, world_size: int,
                                test_model_cls: type[torch.nn.Module],
                                batch_size: int, seq_len: int,
                                hidden_size: int, dtype: torch.dtype):
    """Per-process body: compile a toy model with AsyncTPPass and check ops.

    Args:
        local_rank: Index of this process; also selects ``cuda:{local_rank}``.
        world_size: Number of processes; used as the tensor-parallel size.
        test_model_cls: Model class, instantiated as
            ``test_model_cls(hidden_size)``. The instance must provide
            ``ops_in_model_before()`` and ``ops_in_model_after()``.
        batch_size, seq_len: The input has ``batch_size * seq_len`` rows.
        hidden_size: Width of the input and of the model.
        dtype: Set as the process-wide default dtype and used for the input.
    """
    current_platform.seed_everything(0)

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

    # NOTE(review): the rendezvous port is fixed, so two concurrent runs on
    # the same host would presumably collide -- confirm whether that matters.
    update_environment_variables({
        'RANK': str(local_rank),
        'LOCAL_RANK': str(local_rank),
        'WORLD_SIZE': str(world_size),
        'MASTER_ADDR': 'localhost',
        'MASTER_PORT': '12345',
    })

    # initialize distributed
    init_distributed_environment()
    initialize_model_parallel(tensor_model_parallel_size=world_size)

    # configure vllm config for AsyncTPPass
    vllm_config = VllmConfig()
    vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig(
        enable_async_tp=True, ), )
    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))

    # this is a fake model name to construct the model config
    # in the vllm_config, it's not really used.
    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
    vllm_config.model_config = ModelConfig(model=model_name,
                                           task="auto",
                                           tokenizer=model_name,
                                           tokenizer_mode="auto",
                                           trust_remote_code=True,
                                           dtype=dtype,
                                           seed=42)

    async_tp_pass = AsyncTPPass(vllm_config)
    backend = TestBackend(async_tp_pass)

    model = test_model_cls(hidden_size)

    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
                                dtype=dtype,
                                requires_grad=False)

    compiled_model = torch.compile(model, backend=backend)
    compiled_model(hidden_states)

    # In pre-nodes, all gather or reduce scatter should exist.
    # ops_fully_replaced=False: only presence in the pre-pass graph is
    # checked here; the unfused op is not required to vanish afterwards.
    backend.check_before_ops(model.ops_in_model_before(),
                             ops_fully_replaced=False)

    # In post-nodes, fused_matmul_reduce_scatter or
    # fused_all_gather_matmul should exist (and must not appear in the
    # pre-pass graph).
    backend.check_after_ops(model.ops_in_model_after())
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"])
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("async_tp_enabled", [True])
@pytest.mark.parametrize("distributed_backend", ["mp"])
@pytest.mark.parametrize("eager_mode", [False, True])
def test_async_tp_pass_correctness(
    model_id: str,
    tp_size: int,
    async_tp_enabled: bool,
    distributed_backend: str,
    eager_mode: bool,
    num_gpus_available: int,
):
    """Compare generation with and without the async-TP compilation pass.

    Both servers use the same tensor-parallel size; only the first one gets
    the ``--compilation_config`` that carries ``enable_async_tp``.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")
    model_info.check_available_online(on_fail="skip")

    pp_size = 1
    if num_gpus_available < tp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")

    common_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if eager_mode:
        common_args.append("--enforce-eager")

    compilation_config = {
        'level': 3,
        'compile_sizes': [2, 4, 8],
        'splitting_ops': [],
        'pass_config': {
            'enable_async_tp': async_tp_enabled
        },
    }

    # One dict object is intentionally shared by both settings.
    shared_env = {
        "VLLM_USE_V1": "1",
    }

    # Prefix common to both command lines.
    tp_prefix = [*common_args, "--tensor-parallel-size", str(tp_size)]

    async_tp_args = [
        *tp_prefix,
        "--distributed-executor-backend",
        distributed_backend,
        "--compilation_config",
        json.dumps(compilation_config),
    ]

    # The baseline always runs with the "mp" executor backend.
    tp_args = [
        *tp_prefix,
        "--distributed-executor-backend",
        "mp",
    ]

    compare_two_settings(model_id,
                         async_tp_args,
                         tp_args,
                         shared_env,
                         shared_env,
                         method="generate")
......@@ -29,6 +29,10 @@ class TestModel(torch.nn.Module):
self.cutlass_fp8_enabled = cutlass_fp8_enabled
self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
self.key = QuantKey(dtype=FP8_DTYPE,
static=static,
per_tensor=static,
symmetric=True)
if static:
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
else:
......@@ -59,6 +63,15 @@ class TestModel(torch.nn.Module):
y3, resid = self.norm[2](x3, resid) # use resid here
return y3
def ops_in_model_before(self):
    """Return the quant op registered in ``QUANT_OPS`` for ``self.key``."""
    quant_op = QUANT_OPS[self.key]
    return [quant_op]
def ops_in_model_after(self):
    """Return the two fused ops registered for ``self.key``.

    The list holds the ``FUSED_OPS`` entries for
    ``FusedRMSQuantKey(self.key, False)`` and
    ``FusedRMSQuantKey(self.key, True)``, in that order.
    """
    fused_keys = (FusedRMSQuantKey(self.key, False),
                  FusedRMSQuantKey(self.key, True))
    return [FUSED_OPS[fused_key] for fused_key in fused_keys]
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("hidden_size", [64, 3392, 4096])
......@@ -107,25 +120,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
# Check substitution worked
pre_nodes = backend.graph_pre_pass.nodes
post_nodes = backend.graph_post_pass.nodes
# static is per-tensor, dynamic is per-token
key = QuantKey(dtype=FP8_DTYPE,
static=static,
per_tensor=static,
symmetric=True)
rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]
add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]
fp8_quant = QUANT_OPS[key]
# In pre-nodes, fp8 quant should be there and fused kernels should not
assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
find_auto_fn(pre_nodes, fp8_quant)
backend.check_before_ops(model.ops_in_model_before(), find_auto_fn,
find_auto_fn_maybe)
# In post-nodes, fused kernels should be there and fp8 quant should not
find_auto_fn(post_nodes, rms_quant)
find_auto_fn(post_nodes, add_rms_quant)
assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
backend.check_after_ops(model.ops_in_model_after(), find_auto_fn,
find_auto_fn_maybe)
......@@ -5,9 +5,7 @@ import torch
import vllm.envs as envs
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fx_utils import (find_auto_fn, find_auto_fn_maybe,
find_specified_fn,
find_specified_fn_maybe, is_func)
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.sequence_parallelism import SequenceParallelismPass
from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
PassConfig, VllmConfig)
......@@ -21,17 +19,6 @@ from vllm.utils import update_environment_variables
from ..utils import multi_gpu_test
from .backend import TestBackend
OPS_IN_MODEL_BEFORE = [
torch.ops.vllm.all_reduce.default,
]
OPS_IN_MODEL_AFTER = [
torch.ops.vllm.reduce_scatter.default,
torch.ops.vllm.all_gather.default,
]
OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default]
prompts = [
"Hello, my name is",
"The president of the United States is",
......@@ -78,6 +65,18 @@ class TestModel(torch.nn.Module):
return norm_output, residual_output
def ops_in_model_before(self):
    """Ops handed to the backend's pre-pass check (the all-reduce op)."""
    all_reduce = torch.ops.vllm.all_reduce.default
    return [all_reduce]
def ops_in_model_after(self):
    """Ops handed to the backend's post-pass check.

    Returns the reduce-scatter op followed by the all-gather op.
    """
    reduce_scatter = torch.ops.vllm.reduce_scatter.default
    all_gather = torch.ops.vllm.all_gather.default
    return [reduce_scatter, all_gather]
def ops_in_model(self):
    """Ops used by the functionalization checks (fused add + RMS norm)."""
    fused_add_rms_norm = torch.ops._C.fused_add_rms_norm.default
    return [fused_add_rms_norm]
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("batch_size", [8])
......@@ -156,26 +155,16 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
compiled_model_func = torch.compile(model, backend=backend_func)
compiled_model_func(hidden_states, residual)
# Check substitution worked
pre_nodes = backend_no_func.graph_pre_pass.nodes
post_nodes = backend_no_func.graph_post_pass.nodes
# In pre-nodes, all reduce should be there,
# reduce scatter and all gather should not
for op in OPS_IN_MODEL_BEFORE:
find_specified_fn(pre_nodes, op)
for op in OPS_IN_MODEL_AFTER:
assert find_specified_fn_maybe(pre_nodes, op) is None
backend_no_func.check_before_ops(model.ops_in_model_before())
# In post-nodes, reduce scatter and all gather should be there,
# all reduce should not
for op in OPS_IN_MODEL_AFTER:
find_specified_fn(post_nodes, op)
for op in OPS_IN_MODEL_BEFORE:
assert find_specified_fn_maybe(post_nodes, op) is None
backend_no_func.check_after_ops(model.ops_in_model_after())
# check if the functionalization pass is applied
for op in OPS_IN_MODEL:
for op in model.ops_in_model():
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
op) is None # noqa: E501
......@@ -183,7 +172,7 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
# make sure the ops were all de-functionalized
found = dict()
for node in backend_func.graph_post_pass.nodes:
for op in OPS_IN_MODEL:
for op in model.ops_in_model():
if is_func(node, op):
found[op] = True
assert all(found[op] for op in OPS_IN_MODEL)
assert all(found[op] for op in model.ops_in_model())
......@@ -430,6 +430,15 @@ class HfRunner:
return all_inputs
def get_prompt_embeddings(self, prompts: list[str]) -> list[torch.Tensor]:
    """Return one input-embedding tensor per prompt.

    Each prompt is turned into model inputs via ``self.get_inputs``; its
    ``"input_ids"`` are passed through the model's input-embedding layer and
    dimension 0 is squeezed from the result.
    """

    def embed_one(inputs):
        token_ids = self.wrap_device(inputs)["input_ids"]
        embedding_layer = self.model.get_input_embeddings()
        return embedding_layer(token_ids).squeeze(0)

    return [embed_one(inputs) for inputs in self.get_inputs(prompts)]
def classify(self, prompts: list[str]) -> list[str]:
# output is final logits
all_inputs = self.get_inputs(prompts)
......
......@@ -119,13 +119,12 @@ def test_topic_filtering(publisher_config):
"""
publisher_config.replay_endpoint = None
cfg = publisher_config.model_copy()
cfg.topic = "foo"
pub = EventPublisherFactory.create(cfg)
publisher_config.topic = "foo"
pub = EventPublisherFactory.create(publisher_config)
from .conftest import MockSubscriber
sub_foo = MockSubscriber(cfg.endpoint, None, "foo")
sub_bar = MockSubscriber(cfg.endpoint, None, "bar")
sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar")
try:
time.sleep(0.1)
......
......@@ -9,7 +9,7 @@ import torch.distributed as dist
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils import get_ip, get_open_port, update_environment_variables
from vllm.utils import get_open_port, update_environment_variables
def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
......@@ -60,12 +60,12 @@ def worker_fn():
rank = dist.get_rank()
if rank == 0:
port = get_open_port()
ip = get_ip()
ip = '127.0.0.1'
dist.broadcast_object_list([ip, port], src=0)
else:
recv = [None, None]
dist.broadcast_object_list(recv, src=0)
ip, port = recv
ip, port = recv # type: ignore
stateless_pg = StatelessProcessGroup.create(ip, port, rank,
dist.get_world_size())
......@@ -107,10 +107,10 @@ def worker_fn():
if pg == dist.group.WORLD:
dist.barrier()
print("torch distributed passed the test!")
print(f"torch distributed passed the test! Rank {rank}")
else:
pg.barrier()
print("StatelessProcessGroup passed the test!")
print(f"StatelessProcessGroup passed the test! Rank {rank}")
def test_shm_broadcast():
......
# SPDX-License-Identifier: Apache-2.0
import json
import re
import weakref
from enum import Enum
import jsonschema
import pytest
import regex as re
from pydantic import BaseModel
from vllm.distributed import cleanup_dist_env_and_memory
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
OpenAIClientMtebEncoder,
run_mteb_embed_task,
run_mteb_embed_task_st)
from tests.utils import RemoteOpenAIServer
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "BAAI/bge-m3"
DTYPE = "float16"
MAIN_SCORE = 0.7873427091972599
@pytest.fixture(scope="module")
def server():
    """Start one embedding server per module and yield its handle."""
    cli_args = ["--task", "embed"]
    cli_args += ["--dtype", DTYPE]
    cli_args += ["--enforce-eager"]
    cli_args += ["--max-model-len", "512"]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
def test_mteb(server):
    """Check the served model's MTEB main score against the reference.

    The reference is ``MAIN_SCORE`` when it is truthy; otherwise it is
    computed with ``run_mteb_embed_task_st``.
    """
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, server.get_client())
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)

    if MAIN_SCORE:
        st_main_score = MAIN_SCORE
    else:
        st_main_score = run_mteb_embed_task_st(MODEL_NAME, MTEB_EMBED_TASKS)

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

    assert st_main_score == pytest.approx(vllm_main_score, rel=1e-4)
......@@ -2,13 +2,13 @@
# imports for guided decoding tests
import json
import re
from typing import Optional
import jsonschema
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import regex as re
import requests
import torch
from openai import BadRequestError, OpenAI
......
# SPDX-License-Identifier: Apache-2.0
# imports for guided decoding tests
import json
import re
import shutil
from tempfile import TemporaryDirectory
from typing import Optional
......@@ -11,6 +9,7 @@ import jsonschema
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import regex as re
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError
......
# SPDX-License-Identifier: Apache-2.0
from typing import Final
import pytest
import schemathesis
from hypothesis import settings
from schemathesis import GenerationConfig
from ...utils import RemoteOpenAIServer
......@@ -9,6 +12,8 @@ schemathesis.experimental.OPEN_API_3_1.enable()
MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct"
MAXIMUM_IMAGES = 2
DEFAULT_TIMEOUT_SECONDS: Final[int] = 10
LONG_TIMEOUT_SECONDS: Final[int] = 60
@pytest.fixture(scope="module")
......@@ -42,8 +47,58 @@ def get_schema(server):
schema = schemathesis.from_pytest_fixture("get_schema")
@schemathesis.hook
def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
    """Filter generated cases so unsupported ``"type": "file"`` bodies are dropped.

    Args:
        context: Hook context; its ``operation`` must be set.
        strategy: The case-generation strategy to filter.

    Returns:
        ``strategy`` filtered with ``no_file_type``.
    """
    op = context.operation
    assert op is not None

    def no_file_type(case: schemathesis.models.Case):
        """
        This filter skips test cases for the `POST /tokenize` endpoint where the
        HTTP request body uses `"type": "file"` in any message's content.
        We expect these cases to fail because that type isn't implemented here
        https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095

        Example test cases that are skipped:
        curl -X POST -H 'Content-Type: application/json' \
            -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
            http://localhost:8000/tokenize

        curl -X POST -H 'Content-Type: application/json' \
            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
            http://localhost:8000/tokenize
        """  # noqa: E501
        if (op.method.lower() == "post" and op.path == "/tokenize"
                and hasattr(case, "body") and isinstance(case.body, dict)
                and "messages" in case.body
                and isinstance(case.body["messages"], list)
                and len(case.body["messages"]) > 0):
            for message in case.body["messages"]:
                if not isinstance(message, dict):
                    continue
                content = message.get("content", [])
                if not isinstance(content, list) or len(content) == 0:
                    continue
                # Generated content parts are not guaranteed to be dicts
                # (e.g. a bare string); only dict parts can carry a "type",
                # so skip the rest instead of raising AttributeError.
                if any(
                        isinstance(item, dict) and item.get("type") == "file"
                        for item in content):
                    return False
        return True

    return strategy.filter(no_file_type)
@schema.parametrize()
@schema.override(headers={"Content-Type": "application/json"})
@settings(deadline=LONG_TIMEOUT_SECONDS * 1000)
def test_openapi_stateless(case: schemathesis.Case):
key = (
case.operation.method.upper(),
case.operation.path,
)
timeout = {
# requires a longer timeout
("POST", "/v1/chat/completions"):
LONG_TIMEOUT_SECONDS,
}.get(key, DEFAULT_TIMEOUT_SECONDS)
#No need to verify SSL certificate for localhost
case.call_and_validate(verify=False)
case.call_and_validate(verify=False, timeout=timeout)
# SPDX-License-Identifier: Apache-2.0
# imports for guided decoding tests
import re
import openai
import pytest
import regex as re
from ...utils import RemoteOpenAIServer
......@@ -32,7 +31,7 @@ async def test_out_of_vocab_token_ids():
client = remote_server.get_async_client()
with pytest.raises(openai.BadRequestError,
match=re.compile('.*out of vocabulary.*')):
match=re.compile('.*out of vocabulary.*').pattern):
await client.completions.create(model=model_name,
prompt=[999999],
max_tokens=5,
......@@ -46,9 +45,10 @@ async def test_reject_multistep_with_guided_decoding():
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
with pytest.raises(openai.BadRequestError,
with pytest.raises(
openai.BadRequestError,
match=re.compile(
'.*Guided decoding .* multi-step decoding.*')):
'.*Guided decoding .* multi-step decoding.*').pattern):
await client.completions.create(
model=model_name,
prompt="Hello",
......
# SPDX-License-Identifier: Apache-2.0
import math
from typing import Any
import pytest
......@@ -92,7 +90,7 @@ class TestModel:
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
model: dict[str, Any], runner):
......@@ -124,7 +122,7 @@ class TestModel:
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
model: dict[str, Any], runner):
......@@ -150,7 +148,7 @@ class TestModel:
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_score_max_model_len(self, server: RemoteOpenAIServer,
model: dict[str, Any]):
......
# SPDX-License-Identifier: Apache-2.0
import gc
import json
import tempfile
import openai
import pytest
import pytest_asyncio
import torch.cuda
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model)
from ...utils import RemoteOpenAIServer
MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
LORA_PATH = "davzoku/finqa_adapter_1b"
def _cleanup():
gc.collect()
torch.cuda.empty_cache()
@pytest.fixture(autouse=True)
def cleanup():
    """Autouse fixture: call ``_cleanup()`` during setup of every test."""
    _cleanup()
@pytest.fixture(scope='module')
def tmp_dir():
    """Yield the path of a temporary directory shared by the module.

    The directory is removed when the ``with`` block exits at teardown.
    """
    with tempfile.TemporaryDirectory() as scratch_path:
        yield scratch_path
@pytest.fixture(scope='module')
def model_uri(tmp_dir):
    """Yield the location of the tensorized model file inside ``tmp_dir``."""
    uri = f"{tmp_dir}/model.tensors"
    yield uri
@pytest.fixture(scope="module")
def tensorize_model_and_lora(tmp_dir, model_uri):
    """Serialize the LoRA adapter and the base model once per module.

    The adapter is tensorized first, then the model; both use the same
    ``TensorizerConfig`` (model at ``model_uri``, LoRA files in ``tmp_dir``).
    """
    config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir)
    engine_args = EngineArgs(model=MODEL_NAME, device="cuda")

    tensorize_lora_adapter(LORA_PATH, config)
    tensorize_vllm_model(engine_args, config)

    # Manually invoke a _cleanup() here, as the cleanup()
    # fixture won't be guaranteed to be called after this
    # when this fixture is used for a test
    _cleanup()

    yield
@pytest.fixture(scope="module")
def server(model_uri, tensorize_model_and_lora):
    """Start an OpenAI-compatible server that loads the tensorized model.

    ``tensorize_model_and_lora`` is requested only so the serialized
    artifacts exist before the server starts.
    """
    extra_config_json = json.dumps({
        "tensorizer_uri": model_uri,
    })

    ## Start OpenAI API server
    cli_args = ["--load-format", "tensorizer"]
    cli_args += ["--device", "cuda"]
    cli_args += ["--model-loader-extra-config", extra_config_json]
    cli_args += ["--enable-lora"]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client for ``server``; it is closed on teardown."""
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Request one greedy 5-token completion and check the response fields."""
    _cleanup()
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
                                                  prompt_tokens=6,
                                                  total_tokens=11)

    assert completion.id is not None
    # Exactly one choice must be returned.
    assert completion.choices is not None and len(completion.choices) == 1
    assert completion.model == MODEL_NAME

    only_choice = completion.choices[0]
    assert len(only_choice.text) >= 5
    # max_tokens was reached, so generation stopped on length.
    assert only_choice.finish_reason == "length"
    assert completion.usage == expected_usage
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment