Commit a810671a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori

parents 86b5aefe 6a09612b
...@@ -47,6 +47,8 @@ We currently support the following OpenAI APIs: ...@@ -47,6 +47,8 @@ We currently support the following OpenAI APIs:
- [Completions API](#completions-api) (`/v1/completions`) - [Completions API](#completions-api) (`/v1/completions`)
- Only applicable to [text generation models](../models/generative_models.md). - Only applicable to [text generation models](../models/generative_models.md).
- *Note: `suffix` parameter is not supported.* - *Note: `suffix` parameter is not supported.*
- [Responses API](#responses-api) (`/v1/responses`)
- Only applicable to [text generation models](../models/generative_models.md).
- [Chat Completions API](#chat-api) (`/v1/chat/completions`) - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template). - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
- *Note: `user` parameter is ignored.* - *Note: `user` parameter is ignored.*
...@@ -229,6 +231,31 @@ The following extra parameters are supported: ...@@ -229,6 +231,31 @@ The following extra parameters are supported:
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
``` ```
### Responses API
Our Responses API is compatible with [OpenAI's Responses API](https://platform.openai.com/docs/api-reference/responses);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: [examples/online_serving/openai_responses_client_with_tools.py](../../examples/online_serving/openai_responses_client_with_tools.py)
#### Extra parameters
The following extra parameters in the request object are supported:
??? code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:responses-extra-params"
```
The following extra parameters in the response object are supported:
??? code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:responses-response-extra-params"
```
### Embeddings API ### Embeddings API
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
......
...@@ -55,7 +55,6 @@ done ...@@ -55,7 +55,6 @@ done
echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS" echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
export RAY_DEDUP_LOGS=0 export RAY_DEDUP_LOGS=0
export VLLM_ALL2ALL_BACKEND="pplx"
export VLLM_USE_DEEP_GEMM=1 export VLLM_USE_DEEP_GEMM=1
vllm serve $MODEL_NAME \ vllm serve $MODEL_NAME \
...@@ -65,6 +64,7 @@ vllm serve $MODEL_NAME \ ...@@ -65,6 +64,7 @@ vllm serve $MODEL_NAME \
--enforce-eager \ --enforce-eager \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \ --enable-eplb \
--all2all-backend pplx \
--num-redundant-experts $REDUNDANT_EXPERTS \ --num-redundant-experts $REDUNDANT_EXPERTS \
--trust-remote-code \ --trust-remote-code \
--host $HOST \ --host $HOST \
......
...@@ -6,7 +6,7 @@ requires = [ ...@@ -6,7 +6,7 @@ requires = [
"packaging>=24.2", "packaging>=24.2",
"setuptools>=77.0.3,<81.0.0", "setuptools>=77.0.3,<81.0.0",
"setuptools-scm>=8.0", "setuptools-scm>=8.0",
"torch == 2.9.0", "torch == 2.9.1",
"wheel", "wheel",
"jinja2", "jinja2",
] ]
......
...@@ -4,7 +4,7 @@ ninja ...@@ -4,7 +4,7 @@ ninja
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<81.0.0 setuptools>=77.0.3,<81.0.0
setuptools-scm>=8 setuptools-scm>=8
torch==2.9.0 torch==2.9.1
wheel wheel
jinja2>=3.1.6 jinja2>=3.1.6
regex regex
......
...@@ -37,7 +37,7 @@ pyyaml ...@@ -37,7 +37,7 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.12.2 # required for compressed-tensors compressed-tensors == 0.13.0 # required for compressed-tensors
depyf==0.20.0 # required for profiling and debugging with compilation config depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
...@@ -50,5 +50,5 @@ ijson # Required for mistral streaming tool parser ...@@ -50,5 +50,5 @@ ijson # Required for mistral streaming tool parser
setproctitle # Used to set process names for better debugging and monitoring setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0 anthropic == 0.71.0
model-hosting-container-standards >= 0.1.9, < 1.0.0 model-hosting-container-standards >= 0.1.10, < 1.0.0
mcp mcp
\ No newline at end of file
cmake>=3.26.1 cmake>=3.26.1
ninja ninja
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<81.0.0 setuptools==77.0.3 # this version can reuse CMake build dir
setuptools-scm>=8 setuptools-scm>=8
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64" torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
......
# Common dependencies # Common dependencies
-r common.txt -r common.txt
setuptools==77.0.3 # this version can reuse CMake build dir
numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
# Dependencies for CPUs # Dependencies for CPUs
......
...@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding ...@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.9.0 torch==2.9.1
torchaudio==2.9.0 torchaudio==2.9.1
# These must be updated alongside torch # These must be updated alongside torch
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile # FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.3 flashinfer-python==0.5.3
...@@ -2,11 +2,11 @@ ...@@ -2,11 +2,11 @@
-r common.txt -r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm6.4 --extra-index-url https://download.pytorch.org/whl/rocm6.4
torch==2.9.0 torch==2.9.1
torchvision==0.24.0 torchvision==0.24.1
torchaudio==2.9.0 torchaudio==2.9.1
triton==3.5.0 triton==3.5.1
cmake>=3.26.1,<4 cmake>=3.26.1,<4
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<80.0.0 setuptools>=77.0.3,<80.0.0
......
...@@ -24,9 +24,9 @@ soundfile # required for audio tests ...@@ -24,9 +24,9 @@ soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
tblib # for pickling test exceptions tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.9.0 torch==2.9.1
torchaudio==2.9.0 torchaudio==2.9.1
torchvision==0.24.0 torchvision==0.24.1
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.5 # required for voxtral test mistral_common[image,audio] >= 1.8.5 # required for voxtral test
......
...@@ -1123,7 +1123,7 @@ tomli==2.2.1 ...@@ -1123,7 +1123,7 @@ tomli==2.2.1
# via schemathesis # via schemathesis
tomli-w==1.2.0 tomli-w==1.2.0
# via schemathesis # via schemathesis
torch==2.9.0+cu129 torch==2.9.1+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# accelerate # accelerate
...@@ -1152,7 +1152,7 @@ torch==2.9.0+cu129 ...@@ -1152,7 +1152,7 @@ torch==2.9.0+cu129
# torchvision # torchvision
# vector-quantize-pytorch # vector-quantize-pytorch
# vocos # vocos
torchaudio==2.9.0+cu129 torchaudio==2.9.1+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# encodec # encodec
...@@ -1165,7 +1165,7 @@ torchmetrics==1.7.4 ...@@ -1165,7 +1165,7 @@ torchmetrics==1.7.4
# pytorch-lightning # pytorch-lightning
# terratorch # terratorch
# torchgeo # torchgeo
torchvision==0.24.0+cu129 torchvision==0.24.1+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# lightly # lightly
...@@ -1206,7 +1206,7 @@ transformers==4.57.3 ...@@ -1206,7 +1206,7 @@ transformers==4.57.3
# transformers-stream-generator # transformers-stream-generator
transformers-stream-generator==0.0.5 transformers-stream-generator==0.0.5
# via -r requirements/test.in # via -r requirements/test.in
triton==3.5.0 triton==3.5.1
# via torch # via torch
tritonclient==2.51.0 tritonclient==2.51.0
# via # via
......
...@@ -67,7 +67,6 @@ def _fix_prompt_embed_outputs( ...@@ -67,7 +67,6 @@ def _fix_prompt_embed_outputs(
@pytest.mark.parametrize("model_executor", ["uni", "mp"]) @pytest.mark.parametrize("model_executor", ["uni", "mp"])
@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
def test_models( def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
model: str, model: str,
backend: str, backend: str,
...@@ -77,48 +76,46 @@ def test_models( ...@@ -77,48 +76,46 @@ def test_models(
model_executor: str, model_executor: str,
enable_prompt_embeds: bool, enable_prompt_embeds: bool,
) -> None: ) -> None:
with monkeypatch.context() as m: # 5042 tokens for gemma2
m.setenv("VLLM_ATTENTION_BACKEND", backend) # gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
# 5042 tokens for gemma2 prompt = (
# gemma2 has alternating sliding window size of 4096 "The following numbers of the sequence "
# we need a prompt with more than 4096 tokens to test the sliding window + ", ".join(str(i) for i in range(1024))
prompt = ( + " are:"
"The following numbers of the sequence " )
+ ", ".join(str(i) for i in range(1024)) example_prompts = [prompt]
+ " are:"
) with hf_runner(model) as hf_model:
example_prompts = [prompt] hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if enable_prompt_embeds:
with hf_runner(model) as hf_model: with torch.no_grad():
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
if enable_prompt_embeds:
with torch.no_grad(): with VllmRunner(
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts) model,
max_model_len=8192,
with VllmRunner( enforce_eager=enforce_eager,
model, enable_prompt_embeds=enable_prompt_embeds,
max_model_len=8192, gpu_memory_utilization=0.7,
enforce_eager=enforce_eager, async_scheduling=async_scheduling,
enable_prompt_embeds=enable_prompt_embeds, distributed_executor_backend=model_executor,
gpu_memory_utilization=0.7, attention_config={"backend": backend},
async_scheduling=async_scheduling, ) as vllm_model:
distributed_executor_backend=model_executor, if enable_prompt_embeds:
) as vllm_model: vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
if enable_prompt_embeds: vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens) vllm_outputs, hf_model, example_prompts
vllm_outputs = _fix_prompt_embed_outputs( )
vllm_outputs, hf_model, example_prompts else:
) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
else:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
...@@ -161,12 +158,6 @@ def test_models_distributed( ...@@ -161,12 +158,6 @@ def test_models_distributed(
): # noqa ): # noqa
pytest.skip("enable_prompt_embeds does not work with ray compiled dag.") pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
if attention_backend:
monkeypatch_context.setenv(
"VLLM_ATTENTION_BACKEND",
attention_backend,
)
for k, v in extra_env.items(): for k, v in extra_env.items():
monkeypatch_context.setenv(k, v) monkeypatch_context.setenv(k, v)
...@@ -178,6 +169,7 @@ def test_models_distributed( ...@@ -178,6 +169,7 @@ def test_models_distributed(
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method # will hurt multiprocessing backend with fork method
# (the default method). # (the default method).
attention_config = {"backend": attention_backend} if attention_backend else None
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
...@@ -185,6 +177,7 @@ def test_models_distributed( ...@@ -185,6 +177,7 @@ def test_models_distributed(
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
enable_prompt_embeds=enable_prompt_embeds, enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
attention_config=attention_config,
) as vllm_model: ) as vllm_model:
if enable_prompt_embeds: if enable_prompt_embeds:
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
......
...@@ -19,21 +19,18 @@ def server(): ...@@ -19,21 +19,18 @@ def server():
@pytest.mark.benchmark @pytest.mark.benchmark
def test_bench_serve(server): def test_bench_serve(server):
# Test default model detection and input/output len
command = [ command = [
"vllm", "vllm",
"bench", "bench",
"serve", "serve",
"--model",
MODEL_NAME,
"--host", "--host",
server.host, server.host,
"--port", "--port",
str(server.port), str(server.port),
"--dataset-name", "--input-len",
"random",
"--random-input-len",
"32", "32",
"--random-output-len", "--output-len",
"4", "4",
"--num-prompts", "--num-prompts",
"5", "5",
......
...@@ -208,7 +208,8 @@ def test_attn_quant( ...@@ -208,7 +208,8 @@ def test_attn_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used. # To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general. # Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
# Testing properties # Testing properties
...@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm( ...@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
# To capture subprocess logs, we need to know whether spawn or fork is used. # To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general. # Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
# Testing properties # Testing properties
...@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp( ...@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp(
# To capture subprocess logs, we need to know whether spawn or fork is used. # To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general. # Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
# Testing properties # Testing properties
...@@ -564,7 +567,9 @@ def test_rms_group_quant( ...@@ -564,7 +567,9 @@ def test_rms_group_quant(
splitting_ops=splitting_ops, splitting_ops=splitting_ops,
# Common # Common
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True), pass_config=PassConfig(
fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
),
# Inductor caches custom passes by default as well via uuid # Inductor caches custom passes by default as well via uuid
inductor_compile_config={"force_disable_caches": True}, inductor_compile_config={"force_disable_caches": True},
) )
......
...@@ -89,7 +89,6 @@ class TestSetting: ...@@ -89,7 +89,6 @@ class TestSetting:
], ],
) )
def test_compile_correctness( def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting, test_setting: TestSetting,
): ):
# this test is run under multiple suits, with different GPUs. # this test is run under multiple suits, with different GPUs.
...@@ -107,49 +106,48 @@ def test_compile_correctness( ...@@ -107,49 +106,48 @@ def test_compile_correctness(
f"{cuda_device_count_stateless()}" f"{cuda_device_count_stateless()}"
) )
with monkeypatch.context() as m: final_args = [
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) *model_args,
final_args = [ "-pp",
*model_args, str(pp_size),
"-pp", "-tp",
str(pp_size), str(tp_size),
"-tp", "-cc.cudagraph_mode=none",
str(tp_size), f"--attention-backend={attn_backend}",
"-cc.cudagraph_mode=none", ]
]
all_args: list[list[str]] = [] all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = [] all_envs: list[dict[str, str] | None] = []
for comp_mode in [ for comp_mode in [
CompilationMode.STOCK_TORCH_COMPILE, CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE, CompilationMode.VLLM_COMPILE,
]: ]:
for mode in [CompilationMode.NONE, comp_mode]: for mode in [CompilationMode.NONE, comp_mode]:
all_args.append( all_args.append(
final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"] final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
)
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close",
) )
all_envs.clear()
all_args.clear()
for mode in [ # inductor will change the output, so we only compare if the output
CompilationMode.NONE, # is close, not exactly the same.
CompilationMode.STOCK_TORCH_COMPILE, compare_all_settings(
CompilationMode.DYNAMO_TRACE_ONCE, model,
CompilationMode.VLLM_COMPILE, all_args,
]: all_envs,
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"]) method=method if method != "generate" else "generate_close",
all_envs.append({}) )
all_envs.append({}) all_envs.clear()
all_args.clear()
for mode in [
CompilationMode.NONE,
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
all_envs.append({})
all_envs.append({})
compare_all_settings(model, all_args * 3, all_envs, method=method) compare_all_settings(model, all_args * 3, all_envs, method=method)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment