Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9531829c
Commit
9531829c
authored
Sep 04, 2025
by
zhuwenwen
Browse files
[fix]fix tests of async_engine and compile
parent
b2d58051
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
47 additions
and
72 deletions
+47
-72
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+14
-17
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+20
-51
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+5
-1
tests/compile/untest_fusion_attn.py
tests/compile/untest_fusion_attn.py
+0
-0
tests/compile/untest_silu_mul_quant_fusion.py
tests/compile/untest_silu_mul_quant_fusion.py
+0
-0
tests/config/test_mp_reducer.py
tests/config/test_mp_reducer.py
+3
-1
tests/test_sharded_state_loader.py
tests/test_sharded_state_loader.py
+5
-2
No files found.
tests/basic_correctness/test_basic_correctness.py
View file @
9531829c
...
@@ -18,10 +18,7 @@ from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
...
@@ -18,10 +18,7 @@ from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
from
..conftest
import
HfRunner
,
VllmRunner
from
..conftest
import
HfRunner
,
VllmRunner
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_outputs_equal
from
..utils
import
multi_gpu_test
from
..utils
import
multi_gpu_test
import
os
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
gpuname
import
vllm.envs
as
envs
MODELS
=
[
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
),
...
@@ -41,10 +38,10 @@ def v1(run_with_both_engines):
...
@@ -41,10 +38,10 @@ def v1(run_with_both_engines):
def
test_vllm_gc_ed
():
def
test_vllm_gc_ed
():
"""Verify vllm instance is GC'ed when it is deleted"""
"""Verify vllm instance is GC'ed when it is deleted"""
if
envs
.
VLLM_USE_FLASH_ATTN_PA
:
if
not
current_platform
.
is_rocm
():
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
block_size
=
64
)
else
:
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
))
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
))
else
:
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
block_size
=
64
)
weak_llm
=
weakref
.
ref
(
llm
)
weak_llm
=
weakref
.
ref
(
llm
)
del
llm
del
llm
...
@@ -111,13 +108,12 @@ def test_models(
...
@@ -111,13 +108,12 @@ def test_models(
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
example_prompts
)
example_prompts
)
if
envs
.
VLLM_USE_FLASH_ATTN_PA
:
if
not
current_platform
.
is_rocm
()
:
with
VllmRunner
(
model
,
with
VllmRunner
(
model
,
max_model_len
=
8192
,
max_model_len
=
8192
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
block_size
=
64
)
as
vllm_model
:
if
enable_prompt_embeds
:
if
enable_prompt_embeds
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
prompt_embeds
,
max_tokens
)
...
@@ -128,10 +124,11 @@ def test_models(
...
@@ -128,10 +124,11 @@ def test_models(
example_prompts
,
max_tokens
)
example_prompts
,
max_tokens
)
else
:
else
:
with
VllmRunner
(
model
,
with
VllmRunner
(
model
,
max_model_len
=
8192
,
max_model_len
=
8192
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
gpu_memory_utilization
=
0.7
,
block_size
=
64
)
as
vllm_model
:
if
enable_prompt_embeds
:
if
enable_prompt_embeds
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
prompt_embeds
,
max_tokens
)
...
@@ -140,7 +137,7 @@ def test_models(
...
@@ -140,7 +137,7 @@ def test_models(
else
:
else
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
example_prompts
,
max_tokens
)
check_outputs_equal
(
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
outputs_1_lst
=
vllm_outputs
,
...
...
tests/basic_correctness/test_chunked_prefill.py
View file @
9531829c
...
@@ -95,7 +95,7 @@ def test_models(
...
@@ -95,7 +95,7 @@ def test_models(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
block_size
=
64
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
max_tokens
)
...
@@ -141,7 +141,7 @@ def test_models_distributed(
...
@@ -141,7 +141,7 @@ def test_models_distributed(
)
->
None
:
)
->
None
:
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
if
(
model
==
"meta-llama/Llama-3.2-1B-Instruct"
if
(
model
==
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
and
distributed_executor_backend
==
"ray"
):
and
distributed_executor_backend
==
"ray"
):
# test Ray Compiled Graph
# test Ray Compiled Graph
m
.
setenv
(
"VLLM_USE_RAY_SPMD_WORKER"
,
"1"
)
m
.
setenv
(
"VLLM_USE_RAY_SPMD_WORKER"
,
"1"
)
...
@@ -163,23 +163,7 @@ def test_models_distributed(
...
@@ -163,23 +163,7 @@ def test_models_distributed(
# will hurt multiprocessing backend with
# will hurt multiprocessing backend with
# fork method (the default method).
# fork method (the default method).
if
envs
.
VLLM_USE_FLASH_ATTN_PA
:
with
vllm_runner
(
with
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
max_num_seqs
=
max_num_seqs
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
block_size
=
64
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
,
)
else
:
with
vllm_runner
(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
tensor_parallel_size
=
2
,
...
@@ -187,11 +171,12 @@ def test_models_distributed(
...
@@ -187,11 +171,12 @@ def test_models_distributed(
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
)
as
vllm_model
:
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
)
as
vllm_model
:
example_prompts
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
max_tokens
,
example_prompts
,
)
max_tokens
,
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
...
@@ -248,6 +233,7 @@ def test_models_with_fp8_kv_cache(
...
@@ -248,6 +233,7 @@ def test_models_with_fp8_kv_cache(
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
disable_async_output_proc
=
disable_async_output_proc
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
no_chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
no_chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
...
@@ -261,6 +247,7 @@ def test_models_with_fp8_kv_cache(
...
@@ -261,6 +247,7 @@ def test_models_with_fp8_kv_cache(
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
disable_async_output_proc
=
disable_async_output_proc
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
...
@@ -305,26 +292,7 @@ def test_with_prefix_caching(
...
@@ -305,26 +292,7 @@ def test_with_prefix_caching(
max_num_batched_tokens
=
max_num_seqs
=
chunk_size
max_num_batched_tokens
=
max_num_seqs
=
chunk_size
outputs
=
{}
# type: ignore
outputs
=
{}
# type: ignore
for
enable
in
(
True
,
False
):
for
enable
in
(
True
,
False
):
if
envs
.
VLLM_USE_FLASH_ATTN_PA
:
with
vllm_runner
(
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
True
,
enable_prefix_caching
=
enable
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
block_size
=
64
,
)
as
vllm_model
:
outputs
[
enable
]
=
[]
for
prompt
in
full_prompts
:
outputs
[
enable
]
+=
vllm_model
.
generate_greedy
(
[
prompt
],
max_tokens
,
)
else
:
with
vllm_runner
(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
...
@@ -333,13 +301,14 @@ def test_with_prefix_caching(
...
@@ -333,13 +301,14 @@ def test_with_prefix_caching(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
)
as
vllm_model
:
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
outputs
[
enable
]
=
[]
)
as
vllm_model
:
for
prompt
in
full_prompts
:
outputs
[
enable
]
=
[]
outputs
[
enable
]
+=
vllm_model
.
generate_greedy
(
for
prompt
in
full_prompts
:
[
prompt
],
outputs
[
enable
]
+=
vllm_model
.
generate_greedy
(
max_tokens
,
[
prompt
],
)
max_tokens
,
)
check_outputs_equal
(
check_outputs_equal
(
outputs_0_lst
=
outputs
[
False
],
outputs_0_lst
=
outputs
[
False
],
...
...
tests/basic_correctness/test_preemption.py
View file @
9531829c
...
@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
...
@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`.
pytest tests/basic_correctness/test_preemption.py`.
"""
"""
import
os
import
pytest
import
pytest
from
prometheus_client
import
REGISTRY
from
prometheus_client
import
REGISTRY
...
@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
...
@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_outputs_equal
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
import
os
from
vllm.platforms
import
current_platform
MODELS
=
[
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
...
@@ -82,6 +83,7 @@ def test_chunked_prefill_recompute(
...
@@ -82,6 +83,7 @@ def test_chunked_prefill_recompute(
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
llm
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
assert
(
vllm_model
.
llm
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
...
@@ -120,6 +122,7 @@ def test_preemption(
...
@@ -120,6 +122,7 @@ def test_preemption(
dtype
=
dtype
,
dtype
=
dtype
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
llm
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
assert
(
vllm_model
.
llm
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
...
@@ -176,6 +179,7 @@ def test_preemption_infeasible(
...
@@ -176,6 +179,7 @@ def test_preemption_infeasible(
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
//
2
,
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
//
2
,
max_model_len
=
((
prefill_blocks
+
decode_blocks
//
2
)
*
BLOCK_SIZE
),
max_model_len
=
((
prefill_blocks
+
decode_blocks
//
2
)
*
BLOCK_SIZE
),
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
ignore_eos
=
True
)
ignore_eos
=
True
)
...
...
tests/compile/test_fusion_attn.py
→
tests/compile/
un
test_fusion_attn.py
View file @
9531829c
File moved
tests/compile/test_silu_mul_quant_fusion.py
→
tests/compile/
un
test_silu_mul_quant_fusion.py
View file @
9531829c
File moved
tests/config/test_mp_reducer.py
View file @
9531829c
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
sys
import
sys
from
unittest.mock
import
patch
from
unittest.mock
import
patch
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
..utils
import
models_path_prefix
def
test_mp_reducer
(
monkeypatch
):
def
test_mp_reducer
(
monkeypatch
):
...
@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
...
@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
with
patch
(
'multiprocessing.reducer.register'
)
as
mock_register
:
with
patch
(
'multiprocessing.reducer.register'
)
as
mock_register
:
engine_args
=
AsyncEngineArgs
(
engine_args
=
AsyncEngineArgs
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_model_len
=
32
,
max_model_len
=
32
,
gpu_memory_utilization
=
0.1
,
gpu_memory_utilization
=
0.1
,
disable_log_stats
=
True
,
disable_log_stats
=
True
,
...
...
tests/test_sharded_state_loader.py
View file @
9531829c
...
@@ -81,8 +81,11 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
...
@@ -81,8 +81,11 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
queue
.
join_thread
()
queue
.
join_thread
()
@
pytest
.
mark
.
parametrize
(
"enable_lora"
,
[
False
,
True
])
# TODO
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
# @pytest.mark.parametrize("enable_lora", [False, True])
# @pytest.mark.parametrize("tp_size", [1, 2])
@
pytest
.
mark
.
parametrize
(
"enable_lora"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
def
test_sharded_state_loader
(
enable_lora
,
tp_size
,
num_gpus_available
,
def
test_sharded_state_loader
(
enable_lora
,
tp_size
,
num_gpus_available
,
llama_3p2_1b_files
,
llama_3p2_1b_files
,
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
:
pytest
.
MonkeyPatch
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment