Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
22d7e7c4
Commit
22d7e7c4
authored
Sep 04, 2025
by
zhuwenwen
Browse files
[fix]fix tests of async_engine and compile
parent
99963991
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
107 additions
and
65 deletions
+107
-65
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+7
-9
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+17
-14
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+55
-25
tests/compile/test_async_tp.py
tests/compile/test_async_tp.py
+3
-1
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+10
-9
tests/compile/test_config.py
tests/compile/test_config.py
+6
-3
tests/compile/untest_fusion_attn.py
tests/compile/untest_fusion_attn.py
+0
-0
tests/compile/untest_silu_mul_quant_fusion.py
tests/compile/untest_silu_mul_quant_fusion.py
+0
-0
tests/config/test_mp_reducer.py
tests/config/test_mp_reducer.py
+3
-1
tests/conftest.py
tests/conftest.py
+2
-1
tests/test_sharded_state_loader.py
tests/test_sharded_state_loader.py
+4
-2
No files found.
tests/basic_correctness/test_basic_correctness.py
View file @
22d7e7c4
...
@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal
...
@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal
from
..utils
import
multi_gpu_test
from
..utils
import
multi_gpu_test
import
os
import
os
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
gpuname
import
vllm.envs
as
envs
MODELS
=
[
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
),
...
@@ -41,10 +39,10 @@ def v1(run_with_both_engines):
...
@@ -41,10 +39,10 @@ def v1(run_with_both_engines):
def
test_vllm_gc_ed
():
def
test_vllm_gc_ed
():
"""Verify vllm instance is GC'ed when it is deleted"""
"""Verify vllm instance is GC'ed when it is deleted"""
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
if
not
current_platform
.
is_rocm
():
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
block_size
=
64
)
else
:
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
))
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
))
else
:
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
block_size
=
64
)
weak_llm
=
weakref
.
ref
(
llm
)
weak_llm
=
weakref
.
ref
(
llm
)
del
llm
del
llm
...
@@ -111,13 +109,12 @@ def test_models(
...
@@ -111,13 +109,12 @@ def test_models(
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
example_prompts
)
example_prompts
)
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
if
not
current_platform
.
is_rocm
()
:
with
VllmRunner
(
model
,
with
VllmRunner
(
model
,
max_model_len
=
8192
,
max_model_len
=
8192
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
block_size
=
64
)
as
vllm_model
:
if
enable_prompt_embeds
:
if
enable_prompt_embeds
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
prompt_embeds
,
max_tokens
)
...
@@ -131,7 +128,8 @@ def test_models(
...
@@ -131,7 +128,8 @@ def test_models(
max_model_len
=
8192
,
max_model_len
=
8192
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
gpu_memory_utilization
=
0.7
,
block_size
=
64
)
as
vllm_model
:
if
enable_prompt_embeds
:
if
enable_prompt_embeds
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
prompt_embeds
,
max_tokens
)
...
...
tests/basic_correctness/test_chunked_prefill.py
View file @
22d7e7c4
...
@@ -94,7 +94,7 @@ def test_models(
...
@@ -94,7 +94,7 @@ def test_models(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
max_tokens
)
...
@@ -128,7 +128,7 @@ def test_models_distributed(
...
@@ -128,7 +128,7 @@ def test_models_distributed(
)
->
None
:
)
->
None
:
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
if
(
model
==
"meta-llama/Llama-3.2-1B-Instruct"
if
(
model
==
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
and
distributed_executor_backend
==
"ray"
):
and
distributed_executor_backend
==
"ray"
):
# test Ray Compiled Graph
# test Ray Compiled Graph
m
.
setenv
(
"VLLM_USE_RAY_SPMD_WORKER"
,
"1"
)
m
.
setenv
(
"VLLM_USE_RAY_SPMD_WORKER"
,
"1"
)
...
@@ -158,7 +158,7 @@ def test_models_distributed(
...
@@ -158,7 +158,7 @@ def test_models_distributed(
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
example_prompts
,
...
@@ -220,22 +220,25 @@ def test_models_with_fp8_kv_cache(
...
@@ -220,22 +220,25 @@ def test_models_with_fp8_kv_cache(
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
disable_async_output_proc
=
disable_async_output_proc
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
no_chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
no_chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
disable_async_output_proc
=
disable_async_output_proc
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
no_chunked_prefill_outputs
,
outputs_0_lst
=
no_chunked_prefill_outputs
,
...
@@ -286,7 +289,7 @@ def test_with_prefix_caching(
...
@@ -286,7 +289,7 @@ def test_with_prefix_caching(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
outputs
[
enable
]
=
[]
outputs
[
enable
]
=
[]
for
prompt
in
full_prompts
:
for
prompt
in
full_prompts
:
...
@@ -303,7 +306,7 @@ def test_with_prefix_caching(
...
@@ -303,7 +306,7 @@ def test_with_prefix_caching(
)
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
,
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
,
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
1
,
4
,
16
])
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
1
,
4
,
16
])
...
...
tests/basic_correctness/test_preemption.py
View file @
22d7e7c4
...
@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
...
@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`.
pytest tests/basic_correctness/test_preemption.py`.
"""
"""
import
os
import
pytest
import
pytest
from
prometheus_client
import
REGISTRY
from
prometheus_client
import
REGISTRY
...
@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
...
@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_outputs_equal
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
import
os
from
vllm.platforms
import
current_platform
MODELS
=
[
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
...
@@ -74,18 +75,33 @@ def test_chunked_prefill_recompute(
...
@@ -74,18 +75,33 @@ def test_chunked_prefill_recompute(
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
vllm_runner
(
if
not
current_platform
.
is_rocm
():
model
,
with
vllm_runner
(
dtype
=
dtype
,
model
,
max_num_batched_tokens
=
max_num_batched_tokens
,
dtype
=
dtype
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_seqs
=
max_num_seqs
,
enable_chunked_prefill
=
enable_chunked_prefill
,
distributed_executor_backend
=
distributed_executor_backend
,
max_num_seqs
=
max_num_seqs
,
disable_log_stats
=
False
,
distributed_executor_backend
=
distributed_executor_backend
,
)
as
vllm_model
:
disable_log_stats
=
False
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
)
as
vllm_model
:
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
else
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_seqs
=
max_num_seqs
,
distributed_executor_backend
=
distributed_executor_backend
,
disable_log_stats
=
False
,
block_size
=
64
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
@@ -115,17 +131,31 @@ def test_preemption(
...
@@ -115,17 +131,31 @@ def test_preemption(
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
vllm_runner
(
if
not
current_platform
.
is_rocm
():
model
,
with
vllm_runner
(
dtype
=
dtype
,
model
,
disable_log_stats
=
False
,
dtype
=
dtype
,
distributed_executor_backend
=
distributed_executor_backend
,
disable_log_stats
=
False
,
)
as
vllm_model
:
distributed_executor_backend
=
distributed_executor_backend
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
)
as
vllm_model
:
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
total_preemption
=
(
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
num_cumulative_preemption
)
total_preemption
=
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
num_cumulative_preemption
)
else
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
distributed_executor_backend
=
distributed_executor_backend
,
block_size
=
64
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
total_preemption
=
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
num_cumulative_preemption
)
check_outputs_equal
(
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_0_lst
=
hf_outputs
,
...
@@ -163,7 +193,7 @@ def test_preemption_infeasible(
...
@@ -163,7 +193,7 @@ def test_preemption_infeasible(
distributed_executor_backend
:
str
,
distributed_executor_backend
:
str
,
)
->
None
:
)
->
None
:
"""Verify infeasible preemption request will be ignored."""
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE
=
16
BLOCK_SIZE
=
16
if
not
current_platform
.
is_rocm
()
else
64
prefill_blocks
=
2
prefill_blocks
=
2
decode_blocks
=
max_tokens
//
BLOCK_SIZE
decode_blocks
=
max_tokens
//
BLOCK_SIZE
with
vllm_runner
(
with
vllm_runner
(
...
...
tests/compile/test_async_tp.py
View file @
22d7e7c4
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
json
import
json
import
pytest
import
pytest
...
@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS
...
@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS
from
..utils
import
(
compare_two_settings
,
create_new_process_for_each_test
,
from
..utils
import
(
compare_two_settings
,
create_new_process_for_each_test
,
multi_gpu_test
)
multi_gpu_test
)
from
.backend
import
TestBackend
from
.backend
import
TestBackend
from
..utils
import
models_path_prefix
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
...
@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"meta-llama/Llama-3.2-1B-Instruct"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"async_tp_enabled"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"async_tp_enabled"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"distributed_backend"
,
[
"mp"
])
@
pytest
.
mark
.
parametrize
(
"distributed_backend"
,
[
"mp"
])
...
...
tests/compile/test_basic_correctness.py
View file @
22d7e7c4
...
@@ -84,16 +84,17 @@ class TestSetting:
...
@@ -84,16 +84,17 @@ class TestSetting:
# method="encode",
# method="encode",
# fullgraph=True,
# fullgraph=True,
# ),
# ),
# TODO
# vision language model
# vision language model
TestSetting
(
#
TestSetting(
model
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
),
#
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args
=
[
"--trust-remote-code"
,
"--max-model-len"
,
"2048"
],
#
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size
=
2
,
#
pp_size=2,
tp_size
=
1
,
#
tp_size=1,
attn_backend
=
"FLASH_ATTN"
,
#
attn_backend="FLASH_ATTN",
method
=
"generate_with_image"
,
#
method="generate_with_image",
fullgraph
=
False
,
#
fullgraph=False,
),
#
),
])
])
def
test_compile_correctness
(
def
test_compile_correctness
(
monkeypatch
:
pytest
.
MonkeyPatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
...
...
tests/compile/test_config.py
View file @
22d7e7c4
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
pytest
import
vllm
import
vllm
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.counter
import
compilation_counter
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.utils
import
_is_torch_equal_or_newer
from
vllm.utils
import
_is_torch_equal_or_newer
from
..utils
import
models_path_prefix
def
test_version
():
def
test_version
():
assert
_is_torch_equal_or_newer
(
'2.8.0.dev20250624+cu128'
,
'2.8.0.dev'
)
assert
_is_torch_equal_or_newer
(
'2.8.0.dev20250624+cu128'
,
'2.8.0.dev'
)
...
@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch):
...
@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert
not
vllm_config
.
compilation_config
.
use_cudagraph
assert
not
vllm_config
.
compilation_config
.
use_cudagraph
@
pytest
.
mark
.
parametrize
(
"enabled"
,
[
True
,
False
])
# TODO: when True num_cudagraph_captured=13
# @pytest.mark.parametrize("enabled", [True, False])
@
pytest
.
mark
.
parametrize
(
"enabled"
,
[
False
])
def
test_use_cudagraphs
(
vllm_runner
,
monkeypatch
,
enabled
):
def
test_use_cudagraphs
(
vllm_runner
,
monkeypatch
,
enabled
):
assert
vllm
.
envs
.
VLLM_USE_V1
assert
vllm
.
envs
.
VLLM_USE_V1
...
@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
...
@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
num_cudagraph_captured
=
13
if
enabled
else
0
,
num_cudagraph_captured
=
13
if
enabled
else
0
,
),
),
# loading the model causes compilation (if enabled) to happen
# loading the model causes compilation (if enabled) to happen
vllm_runner
(
'facebook/opt-125m'
,
vllm_runner
(
os
.
path
.
join
(
models_path_prefix
,
'facebook/opt-125m'
)
,
compilation_config
=
compilation_config
,
compilation_config
=
compilation_config
,
gpu_memory_utilization
=
0.4
)
as
_
):
gpu_memory_utilization
=
0.4
)
as
_
):
pass
pass
tests/compile/test_fusion_attn.py
→
tests/compile/
un
test_fusion_attn.py
View file @
22d7e7c4
File moved
tests/compile/test_silu_mul_quant_fusion.py
→
tests/compile/
un
test_silu_mul_quant_fusion.py
View file @
22d7e7c4
File moved
tests/config/test_mp_reducer.py
View file @
22d7e7c4
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
sys
import
sys
from
unittest.mock
import
patch
from
unittest.mock
import
patch
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
..utils
import
models_path_prefix
def
test_mp_reducer
(
monkeypatch
):
def
test_mp_reducer
(
monkeypatch
):
...
@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
...
@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
with
patch
(
'multiprocessing.reducer.register'
)
as
mock_register
:
with
patch
(
'multiprocessing.reducer.register'
)
as
mock_register
:
engine_args
=
AsyncEngineArgs
(
engine_args
=
AsyncEngineArgs
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_model_len
=
32
,
max_model_len
=
32
,
gpu_memory_utilization
=
0.1
,
gpu_memory_utilization
=
0.1
,
disable_log_stats
=
True
,
disable_log_stats
=
True
,
...
...
tests/conftest.py
View file @
22d7e7c4
...
@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams
...
@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams
from
vllm.transformers_utils.utils
import
maybe_model_redirect
from
vllm.transformers_utils.utils
import
maybe_model_redirect
from
.utils
import
models_path_prefix
from
.utils
import
models_path_prefix
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -783,7 +784,7 @@ class VllmRunner:
...
@@ -783,7 +784,7 @@ class VllmRunner:
dtype
:
str
=
"auto"
,
dtype
:
str
=
"auto"
,
disable_log_stats
:
bool
=
True
,
disable_log_stats
:
bool
=
True
,
tensor_parallel_size
:
int
=
1
,
tensor_parallel_size
:
int
=
1
,
block_size
:
int
=
16
,
block_size
:
int
=
16
if
not
current_platform
.
is_rocm
()
else
64
,
enable_chunked_prefill
:
Optional
[
bool
]
=
False
,
enable_chunked_prefill
:
Optional
[
bool
]
=
False
,
swap_space
:
int
=
4
,
swap_space
:
int
=
4
,
enforce_eager
:
Optional
[
bool
]
=
False
,
enforce_eager
:
Optional
[
bool
]
=
False
,
...
...
tests/test_sharded_state_loader.py
View file @
22d7e7c4
...
@@ -79,8 +79,10 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
...
@@ -79,8 +79,10 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
queue
.
join_thread
()
queue
.
join_thread
()
@
pytest
.
mark
.
parametrize
(
"enable_lora"
,
[
False
,
True
])
# @pytest.mark.parametrize("enable_lora", [False, True])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
# @pytest.mark.parametrize("tp_size", [1, 2])
@
pytest
.
mark
.
parametrize
(
"enable_lora"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
def
test_sharded_state_loader
(
enable_lora
,
tp_size
,
num_gpus_available
,
def
test_sharded_state_loader
(
enable_lora
,
tp_size
,
num_gpus_available
,
llama_3p2_1b_files
,
llama_3p2_1b_files
,
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
:
pytest
.
MonkeyPatch
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment