Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
88 additions
and
43 deletions
+88
-43
requirements/xpu.txt
requirements/xpu.txt
+1
-1
setup.py
setup.py
+4
-5
tests/async_engine/api_server_async_engine.py
tests/async_engine/api_server_async_engine.py
+7
-6
tests/async_engine/conftest.py
tests/async_engine/conftest.py
+1
-0
tests/async_engine/test_api_server.py
tests/async_engine/test_api_server.py
+1
-0
tests/async_engine/test_async_llm_engine.py
tests/async_engine/test_async_llm_engine.py
+23
-0
tests/async_engine/test_request_tracker.py
tests/async_engine/test_request_tracker.py
+1
-0
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+21
-13
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+1
-0
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cpu_offload.py
+1
-0
tests/basic_correctness/test_cumem.py
tests/basic_correctness/test_cumem.py
+1
-0
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+1
-0
tests/benchmarks/test_latency_cli.py
tests/benchmarks/test_latency_cli.py
+1
-0
tests/benchmarks/test_serve_cli.py
tests/benchmarks/test_serve_cli.py
+1
-0
tests/benchmarks/test_throughput_cli.py
tests/benchmarks/test_throughput_cli.py
+1
-0
tests/build_cython.py
tests/build_cython.py
+1
-0
tests/compile/backend.py
tests/compile/backend.py
+1
-0
tests/compile/conftest.py
tests/compile/conftest.py
+0
-14
tests/compile/piecewise/test_full_cudagraph.py
tests/compile/piecewise/test_full_cudagraph.py
+6
-2
tests/compile/piecewise/test_simple.py
tests/compile/piecewise/test_simple.py
+14
-2
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
requirements/xpu.txt
View file @
cc7f22a8
...
...
@@ -2,7 +2,7 @@
-r common.txt
ray>=2.9
cmake>=3.26
cmake>=3.26
.1
packaging>=24.2
setuptools-scm>=8
setuptools>=77.0.3,<80.0.0
...
...
setup.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
ctypes
import
importlib.util
import
json
import
logging
import
os
import
re
import
subprocess
import
sys
from
pathlib
import
Path
from
shutil
import
which
import
regex
as
re
import
torch
from
packaging.version
import
Version
,
parse
from
setuptools
import
Extension
,
setup
...
...
@@ -251,11 +252,8 @@ class cmake_build_ext(build_ext):
# CMake appends the extension prefix to the install path,
# and outdir already contains that prefix, so we need to remove it.
# We assume only the final component of extension prefix is added by
# CMake, this is currently true for current extensions but may not
# always be the case.
prefix
=
outdir
if
'.'
in
ext
.
name
:
for
_
in
range
(
ext
.
name
.
count
(
'.'
))
:
prefix
=
prefix
.
parent
# prefix here should actually be the same for all components
...
...
@@ -690,6 +688,7 @@ setup(
ext_modules
=
ext_modules
,
install_requires
=
get_requirements
(),
extras_require
=
{
"bench"
:
[
"pandas"
,
"datasets"
],
"tensorizer"
:
[
"tensorizer>=2.9.0"
],
"fastsafetensors"
:
[
"fastsafetensors >= 0.1.10"
],
"runai"
:
[
"runai-model-streamer"
,
"runai-model-streamer-s3"
,
"boto3"
],
...
...
tests/async_engine/api_server_async_engine.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vllm.entrypoints.api_server with some extra logging for testing."""
from
collections.abc
import
Iterable
from
typing
import
Any
...
...
@@ -7,6 +8,7 @@ import uvicorn
from
fastapi.responses
import
JSONResponse
,
Response
import
vllm.entrypoints.api_server
import
vllm.envs
as
envs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -45,9 +47,8 @@ if __name__ == "__main__":
engine_args
=
AsyncEngineArgs
.
from_cli_args
(
args
)
engine
=
AsyncLLMEngineWithStats
.
from_engine_args
(
engine_args
)
vllm
.
entrypoints
.
api_server
.
engine
=
engine
uvicorn
.
run
(
app
,
host
=
args
.
host
,
port
=
args
.
port
,
log_level
=
"debug"
,
timeout_keep_alive
=
vllm
.
entrypoints
.
api_server
.
TIMEOUT_KEEP_ALIVE
)
uvicorn
.
run
(
app
,
host
=
args
.
host
,
port
=
args
.
port
,
log_level
=
"debug"
,
timeout_keep_alive
=
envs
.
VLLM_HTTP_TIMEOUT_KEEP_ALIVE
)
tests/async_engine/conftest.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
...
...
tests/async_engine/test_api_server.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
subprocess
...
...
tests/async_engine/test_async_llm_engine.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
os
...
...
@@ -383,3 +384,25 @@ async def test_delayed_generator(async_engine, stop):
assert
final_output
is
not
None
assert
len
(
final_output
.
outputs
[
0
].
token_ids
)
==
10
assert
final_output
.
finished
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
async
def
test_invalid_argument
(
async_engine
):
scheduler_config
=
await
async_engine
.
get_scheduler_config
()
if
scheduler_config
.
num_scheduler_steps
!=
1
:
pytest
.
skip
(
"no need to test this one with multistep"
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
min_tokens
=
10
,
max_tokens
=
10
,
)
# Targeting specific DP rank only supported in v1 multi-instance DP
with
pytest
.
raises
(
ValueError
):
async
for
_
in
async_engine
.
generate
(
"test"
,
sampling_params
,
request_id
=
uid
(),
data_parallel_rank
=
0
):
pass
tests/async_engine/test_request_tracker.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
...
...
@@ -60,7 +61,6 @@ def _fix_prompt_embed_outputs(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"enable_prompt_embeds"
,
[
True
,
False
])
...
...
@@ -69,7 +69,6 @@ def test_models(
hf_runner
,
model
:
str
,
backend
:
str
,
dtype
:
str
,
max_tokens
:
int
,
enforce_eager
:
bool
,
enable_prompt_embeds
:
bool
,
...
...
@@ -97,7 +96,7 @@ def test_models(
str
(
i
)
for
i
in
range
(
1024
))
+
" are:"
example_prompts
=
[
prompt
]
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
if
enable_prompt_embeds
:
with
torch
.
no_grad
():
...
...
@@ -106,7 +105,6 @@ def test_models(
with
VllmRunner
(
model
,
max_model_len
=
8192
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
...
...
@@ -130,15 +128,21 @@ def test_models(
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"model, distributed_executor_backend, attention_backend, "
"test_suite"
,
[
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"L4"
),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"L4"
),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"ray"
,
""
,
"L4"
),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"mp"
,
""
,
"L4"
),
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"A100"
),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"A100"
),
(
"distilbert/distilgpt2"
,
"mp"
,
"FLASHINFER"
,
"A100"
),
(
"meta-llama/Meta-Llama-3-8B"
,
"ray"
,
"FLASHINFER"
,
"A100"
),
"test_suite, extra_env"
,
[
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"L4"
,
{}),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"L4"
,
{}),
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"L4"
,
{
"VLLM_SLEEP_WHEN_IDLE"
:
"1"
}),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"L4"
,
{
"VLLM_SLEEP_WHEN_IDLE"
:
"1"
}),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"ray"
,
""
,
"L4"
,
{}),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"mp"
,
""
,
"L4"
,
{}),
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"A100"
,
{}),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"A100"
,
{}),
(
"distilbert/distilgpt2"
,
"mp"
,
"FLASHINFER"
,
"A100"
,
{}),
(
"meta-llama/Meta-Llama-3-8B"
,
"ray"
,
"FLASHINFER"
,
"A100"
,
{}),
])
@
pytest
.
mark
.
parametrize
(
"enable_prompt_embeds"
,
[
True
,
False
])
def
test_models_distributed
(
...
...
@@ -150,6 +154,7 @@ def test_models_distributed(
distributed_executor_backend
:
str
,
attention_backend
:
str
,
test_suite
:
str
,
extra_env
:
dict
[
str
,
str
],
enable_prompt_embeds
:
bool
,
)
->
None
:
...
...
@@ -175,6 +180,9 @@ def test_models_distributed(
attention_backend
,
)
for
k
,
v
in
extra_env
.
items
():
monkeypatch_context
.
setenv
(
k
,
v
)
dtype
=
"half"
max_tokens
=
5
...
...
tests/basic_correctness/test_chunked_prefill.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the outputs of HF and vLLM when using greedy sampling.
It tests chunked prefill. Chunked prefill can be enabled by
...
...
tests/basic_correctness/test_cpu_offload.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
..utils
import
compare_two_settings
...
...
tests/basic_correctness/test_cumem.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
...
...
tests/basic_correctness/test_preemption.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the short outputs of HF and vLLM when using greedy sampling.
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
...
...
tests/benchmarks/test_latency_cli.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
subprocess
import
pytest
...
...
tests/benchmarks/test_serve_cli.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
subprocess
import
pytest
...
...
tests/benchmarks/test_throughput_cli.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
subprocess
import
pytest
...
...
tests/build_cython.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
Cython.Compiler.Options
from
Cython.Build
import
cythonize
from
setuptools
import
setup
...
...
tests/compile/backend.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
copy
import
deepcopy
from
typing
import
Callable
,
Union
...
...
tests/compile/conftest.py
deleted
100644 → 0
View file @
b9ea0c09
# SPDX-License-Identifier: Apache-2.0
import
pytest
# TEST V1: this should be removed. Right now V1 overrides
# all the torch compile logic. We should re-enable this
# as we add torch compile support back to V1.
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
tests/compile/piecewise/test_full_cudagraph.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
contextlib
import
os
...
...
@@ -6,6 +7,7 @@ import pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
CompilationConfig
from
vllm.platforms
import
current_platform
MODEL
=
"Qwen/Qwen2-1.5B-Instruct"
...
...
@@ -36,7 +38,7 @@ def full_cudagraph_llm():
"VLLM_FLASH_ATTN_VERSION"
:
"3"
}):
return
LLM
(
model
=
MODEL
,
gpu_memory_utilization
=
0.
2
,
gpu_memory_utilization
=
0.
3
,
compilation_config
=
CompilationConfig
(
full_cuda_graph
=
True
))
...
...
@@ -47,7 +49,7 @@ def piecewise_llm():
"VLLM_FLASH_ATTN_VERSION"
:
"3"
}):
return
LLM
(
model
=
MODEL
,
gpu_memory_utilization
=
0.
5
,
gpu_memory_utilization
=
0.
6
,
compilation_config
=
CompilationConfig
())
...
...
@@ -60,6 +62,8 @@ def generate_text(llm: LLM, batch_size: int, max_tokens: int):
return
llm
.
generate
(
prompts
,
sampling_params
)
@
pytest
.
mark
.
skipif
(
current_platform
.
get_device_capability
()
!=
(
9
,
0
),
reason
=
"Only Hopper GPUs support FlashAttention 3"
)
@
pytest
.
mark
.
parametrize
((
"batch_size"
,
"max_tokens"
),
[(
1
,
10
),
(
7
,
10
),
(
16
,
10
),
(
25
,
10
),
(
32
,
10
),
(
45
,
10
),
...
...
tests/compile/piecewise/test_simple.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
...
...
@@ -12,6 +13,7 @@ from vllm.compilation.counter import compilation_counter
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.envs
import
VLLM_USE_V1
from
vllm.utils
import
direct_register_custom_op
global_counter
=
0
...
...
@@ -74,11 +76,13 @@ class SillyModel(nn.Module):
return
x
def
test_simple_piecewise_compile
():
def
_test_simple_piecewise_compile
(
*
,
use_inductor
):
assert
VLLM_USE_V1
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
level
=
CompilationLevel
.
PIECEWISE
,
use_cudagraph
=
True
,
use_inductor
=
use_inductor
,
splitting_ops
=
[
"silly.attention"
],
cudagraph_copy_inputs
=
True
,
cudagraph_capture_sizes
=
[
1
,
2
],
...
...
@@ -93,7 +97,7 @@ def test_simple_piecewise_compile():
num_piecewise_graphs_seen
=
5
,
# 2 * num_layers + 1
num_piecewise_capturable_graphs_seen
=
3
,
# 1 + num_layers
num_backend_compilations
=
3
,
# num_piecewise_capturable_graphs_seen
num_cudagraph_cap
u
tured
=
num_cudagraph_captured
=
6
,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):
...
...
@@ -108,3 +112,11 @@ def test_simple_piecewise_compile():
output
=
model
(
input
)
assert
global_counter
==
2
assert
torch
.
allclose
(
output
.
cpu
(),
torch
.
tensor
([
3.
,
1.
]))
def
test_simple_piecewise_compile_inductor
():
_test_simple_piecewise_compile
(
use_inductor
=
True
)
def
test_simple_piecewise_compile_no_inductor
():
_test_simple_piecewise_compile
(
use_inductor
=
False
)
Prev
1
…
9
10
11
12
13
14
15
16
17
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment