Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1e4ecca1
Unverified
Commit
1e4ecca1
authored
Oct 07, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 07, 2025
Browse files
[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c0a7b89d
Changes
51
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
489 additions
and
606 deletions
+489
-606
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+6
-8
tests/models/multimodal/generation/test_maverick.py
tests/models/multimodal/generation/test_maverick.py
+0
-1
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
..._dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+1
-5
tests/plugins_tests/test_scheduler_plugins.py
tests/plugins_tests/test_scheduler_plugins.py
+0
-1
tests/samplers/test_no_bad_words.py
tests/samplers/test_no_bad_words.py
+0
-7
tests/tpu/lora/test_lora.py
tests/tpu/lora/test_lora.py
+0
-11
tests/v1/attention/utils.py
tests/v1/attention/utils.py
+0
-1
tests/v1/core/test_kv_sharing.py
tests/v1/core/test_kv_sharing.py
+3
-0
tests/v1/core/test_scheduler_e2e.py
tests/v1/core/test_scheduler_e2e.py
+0
-4
tests/v1/cudagraph/test_cudagraph_mode.py
tests/v1/cudagraph/test_cudagraph_mode.py
+2
-2
tests/v1/e2e/test_cascade_attention.py
tests/v1/e2e/test_cascade_attention.py
+0
-1
tests/v1/e2e/test_correctness_sliding_window.py
tests/v1/e2e/test_correctness_sliding_window.py
+29
-34
tests/v1/e2e/test_kv_sharing_fast_prefill.py
tests/v1/e2e/test_kv_sharing_fast_prefill.py
+0
-2
tests/v1/e2e/test_min_tokens.py
tests/v1/e2e/test_min_tokens.py
+1
-5
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+0
-1
tests/v1/engine/test_async_llm.py
tests/v1/engine/test_async_llm.py
+14
-44
tests/v1/engine/test_engine_args.py
tests/v1/engine/test_engine_args.py
+0
-7
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core.py
+300
-314
tests/v1/engine/test_engine_core_client.py
tests/v1/engine/test_engine_core_client.py
+125
-146
tests/v1/engine/test_llm_engine.py
tests/v1/engine/test_llm_engine.py
+8
-12
No files found.
tests/kernels/test_flex_attention.py
View file @
1e4ecca1
...
@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
set_seed
(
seed
)
set_seed
(
seed
)
...
@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with default backend
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
set_seed
(
seed
)
set_seed
(
seed
)
with
vllm_runner
(
with
vllm_runner
(
model_name
,
model_name
,
...
@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
with
vllm_runner
(
with
vllm_runner
(
model_name
,
model_name
,
...
@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
flex_outputs
=
llm_flex
.
embed
(
prompts
)
flex_outputs
=
llm_flex
.
embed
(
prompts
)
# Run with default backend
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
with
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
onkeypatch
.
context
()
as
m
,
with
vllm_runner
(
vllm_runner
(
model_name
,
model_name
,
runner
=
"pooling"
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
max_model_len
=
100
,
enforce_eager
=
True
,
enforce_eager
=
True
,
)
as
llm_default
:
)
as
llm_default
,
default_outputs
=
llm_default
.
embed
(
prompts
)
):
default_outputs
=
llm_default
.
embed
(
prompts
)
check_embeddings_close
(
check_embeddings_close
(
embeddings_0_lst
=
flex_outputs
,
embeddings_0_lst
=
flex_outputs
,
...
...
tests/models/multimodal/generation/test_maverick.py
View file @
1e4ecca1
...
@@ -613,7 +613,6 @@ def test_dummy_maverick(
...
@@ -613,7 +613,6 @@ def test_dummy_maverick(
profile
:
bool
=
False
,
profile
:
bool
=
False
,
)
->
None
:
)
->
None
:
# Disable multiprocessing allows us to access model executor from LLM engine
# Disable multiprocessing allows us to access model executor from LLM engine
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
model_path
=
create_reduced_maverick_model
(
model_path
=
create_reduced_maverick_model
(
...
...
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
View file @
1e4ecca1
...
@@ -8,7 +8,6 @@ if TYPE_CHECKING:
...
@@ -8,7 +8,6 @@ if TYPE_CHECKING:
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
else
:
else
:
VllmConfig
=
None
VllmConfig
=
None
from
vllm
import
envs
class
DummyPlatform
(
Platform
):
class
DummyPlatform
(
Platform
):
...
@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
...
@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
@
classmethod
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
if
envs
.
VLLM_USE_V1
:
vllm_config
.
compilation_config
.
custom_ops
=
[
"all"
]
compilation_config
=
vllm_config
.
compilation_config
# Activate custom ops for v1.
compilation_config
.
custom_ops
=
[
"all"
]
def
get_attn_backend_cls
(
def
get_attn_backend_cls
(
self
,
self
,
...
...
tests/plugins_tests/test_scheduler_plugins.py
View file @
1e4ecca1
...
@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
...
@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
def
test_scheduler_plugins_v1
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_scheduler_plugins_v1
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Explicitly turn off engine multiprocessing so
# Explicitly turn off engine multiprocessing so
# that the scheduler runs in this process
# that the scheduler runs in this process
m
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
m
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
...
...
tests/samplers/test_no_bad_words.py
View file @
1e4ecca1
...
@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
...
@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
from
typing
import
Optional
from
typing
import
Optional
import
pytest
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
monkeypatch
):
"""Only run on vLLM v1."""
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
def
_generate
(
def
_generate
(
llm
:
LLM
,
llm
:
LLM
,
prompt
:
str
,
prompt
:
str
,
...
...
tests/tpu/lora/test_lora.py
View file @
1e4ecca1
...
@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
...
@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
# 100 training iterations with a training batch size of 100.
# 100 training iterations with a training batch size of 100.
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v1_only
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
for all tests in this file
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
yield
def
setup_vllm
(
num_loras
:
int
,
tp
:
int
)
->
vllm
.
LLM
:
def
setup_vllm
(
num_loras
:
int
,
tp
:
int
)
->
vllm
.
LLM
:
return
vllm
.
LLM
(
return
vllm
.
LLM
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
...
...
tests/v1/attention/utils.py
View file @
1e4ecca1
...
@@ -305,7 +305,6 @@ full_cg_backend_configs = {
...
@@ -305,7 +305,6 @@ full_cg_backend_configs = {
"CutlassMLA"
:
BackendConfig
(
"CutlassMLA"
:
BackendConfig
(
name
=
"CutlassMLA"
,
name
=
"CutlassMLA"
,
env_vars
=
{
env_vars
=
{
"VLLM_USE_V1"
:
"1"
,
"VLLM_ATTENTION_BACKEND"
:
"CUTLASS_MLA"
,
"VLLM_ATTENTION_BACKEND"
:
"CUTLASS_MLA"
,
"FORCE_NUM_KV_SPLITS"
:
"1"
,
# TODO: remove this when hang issue is fixed
"FORCE_NUM_KV_SPLITS"
:
"1"
,
# TODO: remove this when hang issue is fixed
},
},
...
...
tests/v1/core/test_kv_sharing.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
import
torch
from
vllm.v1.kv_cache_interface
import
FullAttentionSpec
,
KVCacheGroupSpec
from
vllm.v1.kv_cache_interface
import
FullAttentionSpec
,
KVCacheGroupSpec
from
vllm.v1.worker.utils
import
add_kv_sharing_layers_to_kv_cache_groups
from
vllm.v1.worker.utils
import
add_kv_sharing_layers_to_kv_cache_groups
pytestmark
=
pytest
.
mark
.
cpu_test
def
new_kv_cache_spec
():
def
new_kv_cache_spec
():
return
FullAttentionSpec
(
16
,
1
,
1
,
torch
.
float32
,
False
)
return
FullAttentionSpec
(
16
,
1
,
1
,
torch
.
float32
,
False
)
...
...
tests/v1/core/test_scheduler_e2e.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
pytest
from
vllm
import
LLM
from
vllm
import
LLM
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
MODEL
=
"meta-llama/Llama-3.2-1B"
PROMPT
=
"Hello my name is Robert and I"
PROMPT
=
"Hello my name is Robert and I"
...
...
tests/v1/cudagraph/test_cudagraph_mode.py
View file @
1e4ecca1
...
@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
...
@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
):
):
pytest
.
skip
(
"Only Hopper GPUs support FA3 and FlashMLA"
)
pytest
.
skip
(
"Only Hopper GPUs support FA3 and FlashMLA"
)
env_vars
=
{
"VLLM_USE_V1"
:
"1"
,
**
backend_configs
[
backend_name
].
env_vars
}
env_vars
=
backend_configs
[
backend_name
].
env_vars
with
temporary_environ
(
env_vars
),
ExitStack
()
as
stack
:
with
temporary_environ
(
env_vars
),
ExitStack
()
as
stack
:
if
not
supported
:
if
not
supported
:
...
@@ -117,7 +117,7 @@ combo_cases_2 = [
...
@@ -117,7 +117,7 @@ combo_cases_2 = [
def
test_cudagraph_compilation_combo
(
combo_case
):
def
test_cudagraph_compilation_combo
(
combo_case
):
backend_name
,
cudagraph_mode
,
compilation_level
,
supported
=
combo_case
backend_name
,
cudagraph_mode
,
compilation_level
,
supported
=
combo_case
env_vars
=
{
"VLLM_USE_V1"
:
"1"
,
**
backend_configs
[
backend_name
].
env_vars
}
env_vars
=
backend_configs
[
backend_name
].
env_vars
with
temporary_environ
(
env_vars
),
ExitStack
()
as
stack
:
with
temporary_environ
(
env_vars
),
ExitStack
()
as
stack
:
if
not
supported
:
if
not
supported
:
...
...
tests/v1/e2e/test_cascade_attention.py
View file @
1e4ecca1
...
@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
...
@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
)
)
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
)
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
)
...
...
tests/v1/e2e/test_correctness_sliding_window.py
View file @
1e4ecca1
...
@@ -32,7 +32,7 @@ model_config = {
...
@@ -32,7 +32,7 @@ model_config = {
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"disable_hybrid_kv_cache_manager"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"disable_hybrid_kv_cache_manager"
,
[
True
,
False
])
def
test_sliding_window_retrieval
(
def
test_sliding_window_retrieval
(
monkeypatch
,
model
,
batch_size
,
seed
,
disable_hybrid_kv_cache_manager
model
,
batch_size
,
seed
,
disable_hybrid_kv_cache_manager
):
):
"""
"""
The test does a bunch of assignments "x1 = 10
\n
x2 = 33
\n
..." and then
The test does a bunch of assignments "x1 = 10
\n
x2 = 33
\n
..." and then
...
@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
...
@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
If we tell it upfront which we are going to be looking for, then
If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly).
it answers correctly (mostly).
"""
"""
with
monkeypatch
.
context
()
as
m
:
test_config
=
model_config
[
model
]
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
test_config
=
model_config
[
model
]
model
=
model
,
disable_hybrid_kv_cache_manager
=
disable_hybrid_kv_cache_manager
)
llm
=
LLM
(
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
100
)
model
=
model
,
disable_hybrid_kv_cache_manager
=
disable_hybrid_kv_cache_manager
)
prompts
,
answer
,
indices
=
prep_prompts
(
batch_size
,
ln_range
=
test_config
.
ln_range
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
100
)
check_length
(
prompts
,
llm
,
test_config
.
sliding_window
)
prompts
,
answer
,
indices
=
prep_prompts
(
batch_size
,
ln_range
=
test_config
.
ln_range
# Fresh generation
)
responses
=
llm
.
generate
(
prompts
,
sampling_params
)
check_answers
(
check_length
(
prompts
,
llm
,
test_config
.
sliding_window
)
indices
,
answer
,
# Fresh generation
[
response
.
outputs
[
0
].
text
for
response
in
responses
],
responses
=
llm
.
generate
(
prompts
,
sampling_params
)
accept_rate
=
1.0
,
check_answers
(
)
indices
,
answer
,
# Re-generate with the same prompts to test prefix caching
[
response
.
outputs
[
0
].
text
for
response
in
responses
],
responses
=
llm
.
generate
(
prompts
,
sampling_params
)
accept_rate
=
1.0
,
check_answers
(
)
indices
,
answer
,
# Re-generate with the same prompts to test prefix caching
[
response
.
outputs
[
0
].
text
for
response
in
responses
],
responses
=
llm
.
generate
(
prompts
,
sampling_params
)
accept_rate
=
1.0
,
check_answers
(
)
indices
,
answer
,
[
response
.
outputs
[
0
].
text
for
response
in
responses
],
accept_rate
=
1.0
,
)
def
check_length
(
prompts
:
list
[
str
],
llm
:
LLM
,
sliding_window
:
int
):
def
check_length
(
prompts
:
list
[
str
],
llm
:
LLM
,
sliding_window
:
int
):
...
...
tests/v1/e2e/test_kv_sharing_fast_prefill.py
View file @
1e4ecca1
...
@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
...
@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
)
)
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Make scheduling deterministic for reproducibility
# Make scheduling deterministic for reproducibility
m
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
m
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
...
...
tests/v1/e2e/test_min_tokens.py
View file @
1e4ecca1
...
@@ -13,7 +13,6 @@ Covers:
...
@@ -13,7 +13,6 @@ Covers:
5) Multiple stop conditions
5) Multiple stop conditions
"""
"""
import
os
from
typing
import
Optional
,
Union
from
typing
import
Optional
,
Union
import
pytest
import
pytest
...
@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
...
@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm_v1
():
def
llm_v1
():
"""Create V1 LLM instance for testing"""
"""Create V1 LLM instance for testing"""
# Ensure V1 engine is used
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
llm
=
LLM
(
llm
=
LLM
(
model
=
TEST_MODEL
,
model
=
TEST_MODEL
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
...
@@ -503,6 +499,6 @@ if __name__ == "__main__":
...
@@ -503,6 +499,6 @@ if __name__ == "__main__":
Usage:
Usage:
cd vllm/
cd vllm/
VLLM_USE_V1=1
python -m pytest tests/v1/e2e/test_min_tokens.py -v
python -m pytest tests/v1/e2e/test_min_tokens.py -v
"""
"""
pytest
.
main
([
__file__
,
"-v"
])
pytest
.
main
([
__file__
,
"-v"
])
tests/v1/e2e/test_spec_decode.py
View file @
1e4ecca1
...
@@ -301,7 +301,6 @@ def test_mtp_correctness(
...
@@ -301,7 +301,6 @@ def test_mtp_correctness(
model_setup: (method, model_name, tp_size)
model_setup: (method, model_name, tp_size)
"""
"""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
)
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
)
method
,
model_name
,
tp_size
=
model_setup
method
,
model_name
,
tp_size
=
model_setup
...
...
tests/v1/engine/test_async_llm.py
View file @
1e4ecca1
...
@@ -95,17 +95,11 @@ async def generate(
...
@@ -95,17 +95,11 @@ async def generate(
)
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_load
(
async
def
test_load
(
monkeypatch
:
pytest
.
MonkeyPatch
,
output_kind
:
RequestOutputKind
,
output_kind
:
RequestOutputKind
,
engine_args
:
AsyncEngineArgs
,
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
,
prompt
:
PromptType
,
):
):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
with
ExitStack
()
as
after
:
# so that in the future when we switch, we don't have to change all the
# tests.
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
after
.
callback
(
engine
.
shutdown
)
after
.
callback
(
engine
.
shutdown
)
...
@@ -149,14 +143,11 @@ async def test_load(
...
@@ -149,14 +143,11 @@ async def test_load(
)
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_abort
(
async
def
test_abort
(
monkeypatch
:
pytest
.
MonkeyPatch
,
output_kind
:
RequestOutputKind
,
output_kind
:
RequestOutputKind
,
engine_args
:
AsyncEngineArgs
,
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
,
prompt
:
PromptType
,
):
):
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
with
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
after
.
callback
(
engine
.
shutdown
)
after
.
callback
(
engine
.
shutdown
)
...
@@ -222,13 +213,8 @@ async def test_abort(
...
@@ -222,13 +213,8 @@ async def test_abort(
"output_kind"
,
[
RequestOutputKind
.
DELTA
,
RequestOutputKind
.
FINAL_ONLY
]
"output_kind"
,
[
RequestOutputKind
.
DELTA
,
RequestOutputKind
.
FINAL_ONLY
]
)
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_multi_abort
(
async
def
test_multi_abort
(
output_kind
:
RequestOutputKind
):
monkeypatch
:
pytest
.
MonkeyPatch
,
with
ExitStack
()
as
after
:
output_kind
:
RequestOutputKind
,
):
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
after
.
callback
(
engine
.
shutdown
)
after
.
callback
(
engine
.
shutdown
)
...
@@ -304,14 +290,11 @@ async def test_multi_abort(
...
@@ -304,14 +290,11 @@ async def test_multi_abort(
)
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_finished_flag
(
async
def
test_finished_flag
(
monkeypatch
:
pytest
.
MonkeyPatch
,
n
:
int
,
n
:
int
,
engine_args
:
AsyncEngineArgs
,
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
,
prompt
:
PromptType
,
):
):
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
with
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
after
.
callback
(
engine
.
shutdown
)
after
.
callback
(
engine
.
shutdown
)
...
@@ -341,12 +324,10 @@ async def test_finished_flag(
...
@@ -341,12 +324,10 @@ async def test_finished_flag(
)
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_mid_stream_cancellation
(
async
def
test_mid_stream_cancellation
(
monkeypatch
:
pytest
.
MonkeyPatch
,
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
):
):
"""Test that requests can be cancelled mid-stream."""
"""Test that requests can be cancelled mid-stream."""
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
with
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
after
.
callback
(
engine
.
shutdown
)
after
.
callback
(
engine
.
shutdown
)
...
@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
...
@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
be added to the default loggers.
be added to the default loggers.
"""
"""
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
with
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
,
TEXT_ENGINE_ARGS
,
...
@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
...
@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
async
def
test_dp_rank_argument
(
monkeypatch
:
pytest
.
MonkeyPatch
):
async
def
test_dp_rank_argument
():
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
with
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
after
.
callback
(
engine
.
shutdown
)
after
.
callback
(
engine
.
shutdown
)
...
@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
...
@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_check_health
(
monkeypatch
:
pytest
.
MonkeyPatch
):
async
def
test_check_health
():
"""Test that check_health returns normally for healthy engine
"""Test that check_health returns normally for healthy engine
and raises EngineDeadError when the engine is dead.
and raises EngineDeadError when the engine is dead.
"""
"""
...
@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
...
@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
from
vllm.v1.engine.exceptions
import
EngineDeadError
from
vllm.v1.engine.exceptions
import
EngineDeadError
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
with
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
after
.
callback
(
engine
.
shutdown
)
after
.
callback
(
engine
.
shutdown
)
...
@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
...
@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
"output_kind"
,
[
RequestOutputKind
.
DELTA
,
RequestOutputKind
.
FINAL_ONLY
]
"output_kind"
,
[
RequestOutputKind
.
DELTA
,
RequestOutputKind
.
FINAL_ONLY
]
)
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_abort_final_output
(
async
def
test_abort_final_output
(
output_kind
:
RequestOutputKind
):
monkeypatch
:
pytest
.
MonkeyPatch
,
output_kind
:
RequestOutputKind
,
):
"""Test that abort() returns a final output with correct information."""
"""Test that abort() returns a final output with correct information."""
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
with
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
after
.
callback
(
engine
.
shutdown
)
after
.
callback
(
engine
.
shutdown
)
...
...
tests/v1/engine/test_engine_args.py
View file @
1e4ecca1
...
@@ -5,18 +5,11 @@ from argparse import ArgumentError
...
@@ -5,18 +5,11 @@ from argparse import ArgumentError
import
pytest
import
pytest
from
vllm
import
envs
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
if
not
envs
.
VLLM_USE_V1
:
pytest
.
skip
(
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test."
,
allow_module_level
=
True
,
)
def
test_prefix_caching_from_cli
():
def
test_prefix_caching_from_cli
():
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
...
...
tests/v1/engine/test_engine_core.py
View file @
1e4ecca1
...
@@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest:
...
@@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest:
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
def
test_engine_core
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_engine_core
():
with
monkeypatch
.
context
()
as
m
:
"""Setup the EngineCore."""
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
"""Setup the EngineCore."""
vllm_config
=
engine_args
.
create_engine_config
()
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
executor_class
=
Executor
.
get_class
(
vllm_config
)
vllm_config
=
engine_args
.
create_engine_config
()
executor_class
=
Executor
.
get_class
(
vllm_config
)
with
set_default_torch_num_threads
(
1
):
engine_core
=
EngineCore
(
with
set_default_torch_num_threads
(
1
):
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
engine_core
=
EngineCore
(
)
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
"""Test basic request lifecycle."""
)
"""Test basic request lifecycle."""
# First request.
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
make_request
()))
# First request.
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
make_request
()))
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
running
)
==
1
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
1
# Second request.
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
make_request
()))
# Second request.
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
make_request
()))
assert
len
(
engine_core
.
scheduler
.
running
)
==
1
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
running
)
==
1
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
# Add two requests in a row.
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
make_request
()))
# Add two requests in a row.
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
make_request
()))
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
make_request
()))
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
2
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
make_request
()))
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
2
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
running
)
==
4
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
4
# Loop through until they are all done.
while
(
outs
:
=
engine_core
.
step
()[
0
].
get
(
0
))
and
outs
.
outputs
:
# Loop through until they are all done.
pass
while
(
outs
:
=
engine_core
.
step
()[
0
].
get
(
0
))
and
outs
.
outputs
:
pass
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
"""Test abort cycle."""
# Basic abort.
req
=
make_request
()
request_id
=
req
.
request_id
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req
))
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
engine_core
.
scheduler
.
has_unfinished_requests
()
assert
not
engine_core
.
scheduler
.
has_finished_requests
()
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
1
assert
engine_core
.
scheduler
.
has_unfinished_requests
()
assert
not
engine_core
.
scheduler
.
has_finished_requests
()
engine_core
.
abort_requests
([
request_id
])
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
not
engine_core
.
scheduler
.
has_unfinished_requests
()
assert
engine_core
.
scheduler
.
has_finished_requests
()
_
=
engine_core
.
step
()
assert
not
engine_core
.
scheduler
.
has_unfinished_requests
()
assert
not
engine_core
.
scheduler
.
has_finished_requests
()
# Add, step, abort 1 of the 3.
req0
=
make_request
()
req1
=
make_request
()
req2
=
make_request
()
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req0
))
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req1
))
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
2
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req2
))
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
3
# Abort just one.
engine_core
.
abort_requests
([
req1
.
request_id
])
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
# Abort the other requests at the same time.
engine_core
.
abort_requests
([
req2
.
request_id
,
req0
.
request_id
])
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
# Sending duplicate requests with same request_id
req0
=
make_request
()
req1
=
make_request
()
req0
.
request_id
=
req1
.
request_id
=
"test"
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req0
))
while
(
outs
:
=
engine_core
.
step
()[
0
].
get
(
0
))
and
outs
.
outputs
:
pass
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req1
))
while
(
outs
:
=
engine_core
.
step
()[
0
].
get
(
0
))
and
outs
.
outputs
:
pass
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
"""Test abort cycle."""
# Basic abort.
@
create_new_process_for_each_test
()
req
=
make_request
()
def
test_engine_core_advanced_sampling
():
request_id
=
req
.
request_id
"""
A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and
presence_penalty, are set.
"""
"""Setup the EngineCore."""
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
vllm_config
=
engine_args
.
create_engine_config
()
executor_class
=
Executor
.
get_class
(
vllm_config
)
with
set_default_torch_num_threads
(
1
):
engine_core
=
EngineCore
(
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
)
"""Test basic request lifecycle."""
# First request.
request
:
EngineCoreRequest
=
make_request
()
request
.
sampling_params
=
SamplingParams
(
min_tokens
=
4
,
presence_penalty
=
1.0
,
frequency_penalty
=
1.0
,
repetition_penalty
=
0.1
,
stop_token_ids
=
[
1001
,
1002
],
)
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
request
))
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req
))
def
_check_engine_state
():
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
engine_core
.
scheduler
.
has_unfinished_requests
()
# Loop through until they are all done.
assert
not
engine_core
.
scheduler
.
has_finished_requests
()
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
1
assert
engine_core
.
scheduler
.
has_unfinished_requests
()
assert
not
engine_core
.
scheduler
.
has_finished_requests
()
engine_core
.
abort_requests
([
request_id
])
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
not
engine_core
.
scheduler
.
has_unfinished_requests
()
assert
engine_core
.
scheduler
.
has_finished_requests
()
_
=
engine_core
.
step
()
assert
not
engine_core
.
scheduler
.
has_unfinished_requests
()
assert
not
engine_core
.
scheduler
.
has_finished_requests
()
# Add, step, abort 1 of the 3.
req0
=
make_request
()
req1
=
make_request
()
req2
=
make_request
()
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req0
))
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req1
))
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
2
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req2
))
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
3
# Abort just one.
engine_core
.
abort_requests
([
req1
.
request_id
])
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
_
=
engine_core
.
step
()
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
2
# Abort the other requests at the same time.
engine_core
.
abort_requests
([
req2
.
request_id
,
req0
.
request_id
])
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
# Sending duplicate requests with same request_id
req0
=
make_request
()
req1
=
make_request
()
req0
.
request_id
=
req1
.
request_id
=
"test"
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req0
))
while
(
outs
:
=
engine_core
.
step
()[
0
].
get
(
0
))
and
outs
.
outputs
:
pass
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req1
))
while
(
outs
:
=
engine_core
.
step
()[
0
].
get
(
0
))
and
outs
.
outputs
:
while
(
outs
:
=
engine_core
.
step
()[
0
].
get
(
0
))
and
outs
.
outputs
:
pass
pass
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
_check_engine_state
()
@
create_new_process_for_each_test
()
# Second request.
def
test_engine_core_advanced_sampling
(
monkeypatch
:
pytest
.
MonkeyPatch
):
request2
=
make_request
()
"""
request2
.
sampling_params
=
SamplingParams
(
A basic end-to-end test to verify that the engine functions correctly
top_p
=
0.99
,
when additional sampling parameters, such as top_p, min_tokens, and
top_k
=
50
,
presence_penalty, are set.
)
"""
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
request2
))
with
monkeypatch
.
context
()
as
m
:
_check_engine_state
()
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
"""Setup the EngineCore."""
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
vllm_config
=
engine_args
.
create_engine_config
()
executor_class
=
Executor
.
get_class
(
vllm_config
)
with
set_default_torch_num_threads
(
1
):
engine_core
=
EngineCore
(
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
)
"""Test basic request lifecycle."""
# First request.
request
:
EngineCoreRequest
=
make_request
()
request
.
sampling_params
=
SamplingParams
(
min_tokens
=
4
,
presence_penalty
=
1.0
,
frequency_penalty
=
1.0
,
repetition_penalty
=
0.1
,
stop_token_ids
=
[
1001
,
1002
],
)
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
request
))
def
_check_engine_state
():
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
# Loop through until they are all done.
while
(
outs
:
=
engine_core
.
step
()[
0
].
get
(
0
))
and
outs
.
outputs
:
pass
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
_check_engine_state
()
# Second request.
request2
=
make_request
()
request2
.
sampling_params
=
SamplingParams
(
top_p
=
0.99
,
top_k
=
50
,
)
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
request2
))
_check_engine_state
()
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
def
test_engine_core_concurrent_batches
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_engine_core_concurrent_batches
():
"""
"""
Test that the engine can handle multiple concurrent batches.
Test that the engine can handle multiple concurrent batches.
"""
"""
...
@@ -272,173 +268,163 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
...
@@ -272,173 +268,163 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
if
hasattr
(
self
,
"thread_pool"
):
if
hasattr
(
self
,
"thread_pool"
):
self
.
thread_pool
.
shutdown
(
wait
=
False
)
self
.
thread_pool
.
shutdown
(
wait
=
False
)
with
monkeypatch
.
context
()
as
m
:
engine_args
=
EngineArgs
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
model
=
MODEL_NAME
,
# To test concurrent batches.
engine_args
=
EngineArgs
(
max_num_seqs
=
2
,
model
=
MODEL_NAME
,
# Avoid all requests being scheduled once.
# To test concurrent batches.
enable_prefix_caching
=
False
,
max_num_seqs
=
2
,
max_num_batched_tokens
=
10
,
# Avoid all requests being scheduled once.
# Reduce startup time.
enable_prefix_caching
=
False
,
enforce_eager
=
True
,
max_num_batched_tokens
=
10
,
)
# Reduce startup time.
vllm_config
=
engine_args
.
create_engine_config
()
enforce_eager
=
True
,
with
set_default_torch_num_threads
(
1
):
engine_core
=
EngineCore
(
vllm_config
=
vllm_config
,
log_stats
=
False
,
executor_class
=
DummyExecutor
)
)
vllm_config
=
engine_args
.
create_engine_config
()
assert
engine_core
.
batch_queue
is
not
None
with
set_default_torch_num_threads
(
1
):
engine_core
=
EngineCore
(
# Add two requests in a row. Each request have 12 prompt tokens.
vllm_config
=
vllm_config
,
log_stats
=
False
,
executor_class
=
DummyExecutor
req0
=
make_request_with_max_tokens
(
"0"
,
5
)
)
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req0
))
assert
engine_core
.
batch_queue
is
not
None
req1
=
make_request_with_max_tokens
(
"1"
,
5
)
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req1
))
# Add two requests in a row. Each request have 12 prompt tokens.
req0
=
make_request_with_max_tokens
(
"0"
,
5
)
# Schedule Batch 1: (10, req0)
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req0
))
assert
engine_core
.
step_with_batch_queue
()[
0
]
is
None
req1
=
make_request_with_max_tokens
(
"1"
,
5
)
assert
len
(
engine_core
.
batch_queue
)
==
1
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
req1
))
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
assert
scheduler_output
.
num_scheduled_tokens
[
"0"
]
==
10
# Schedule Batch 1: (10, req0)
# num_computed_tokens should have been updated immediately.
assert
engine_core
.
step_with_batch_queue
()[
0
]
is
None
assert
engine_core
.
scheduler
.
requests
[
req0
.
request_id
].
num_computed_tokens
==
10
assert
len
(
engine_core
.
batch_queue
)
==
1
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
# Schedule Batch 2: (2, req0), (8, req1)
assert
scheduler_output
.
num_scheduled_tokens
[
"0"
]
==
10
assert
engine_core
.
step_with_batch_queue
()[
0
]
==
{}
# num_computed_tokens should have been updated immediately.
assert
len
(
engine_core
.
batch_queue
)
==
1
assert
engine_core
.
scheduler
.
requests
[
req0
.
request_id
].
num_computed_tokens
==
10
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
assert
scheduler_output
.
num_scheduled_tokens
[
"0"
]
==
2
# Schedule Batch 2: (2, req0), (8, req1)
assert
scheduler_output
.
num_scheduled_tokens
[
"1"
]
==
8
assert
engine_core
.
step_with_batch_queue
()[
0
]
==
{}
# num_computed_tokens should have been updated immediately.
assert
len
(
engine_core
.
batch_queue
)
==
1
assert
engine_core
.
scheduler
.
requests
[
"0"
].
num_computed_tokens
==
12
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
assert
engine_core
.
scheduler
.
requests
[
"1"
].
num_computed_tokens
==
8
assert
scheduler_output
.
num_scheduled_tokens
[
"0"
]
==
2
assert
scheduler_output
.
num_scheduled_tokens
[
"1"
]
==
8
assert
engine_core
.
scheduler
.
get_num_unfinished_requests
()
==
2
# num_computed_tokens should have been updated immediately.
assert
engine_core
.
scheduler
.
requests
[
"0"
].
num_computed_tokens
==
12
# Finish Batch 1 and schedule Batch 3: (4, req1).
assert
engine_core
.
scheduler
.
requests
[
"1"
].
num_computed_tokens
==
8
# Note that req0 cannot be scheduled
# because it is in the decoding stage now.
assert
engine_core
.
scheduler
.
get_num_unfinished_requests
()
==
2
engine_core
.
step_with_batch_queue
()
assert
len
(
engine_core
.
batch_queue
)
==
1
# Finish Batch 1 and schedule Batch 3: (4, req1).
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
# Note that req0 cannot be scheduled
assert
scheduler_output
.
num_scheduled_tokens
[
"1"
]
==
4
# because it is in the decoding stage now.
engine_core
.
step_with_batch_queue
()
# Finish Batch 2. Get first token of req0.
assert
len
(
engine_core
.
batch_queue
)
==
1
# Schedule Batch 4: (1, req0).
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
output
=
engine_core
.
step_with_batch_queue
()[
0
].
get
(
0
)
assert
scheduler_output
.
num_scheduled_tokens
[
"1"
]
==
4
assert
output
is
not
None
assert
len
(
output
.
outputs
)
==
1
# Finish Batch 2. Get first token of req0.
assert
engine_core
.
scheduler
.
requests
[
req0
.
request_id
].
num_tokens
==
13
# Schedule Batch 4: (1, req0).
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
output
=
engine_core
.
step_with_batch_queue
()[
0
].
get
(
0
)
assert
scheduler_output
.
num_scheduled_tokens
[
"0"
]
==
1
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
output
=
engine_core
.
step_with_batch_queue
()[
0
].
get
(
0
)
assert
output
is
not
None
assert
len
(
output
.
outputs
)
==
1
assert
engine_core
.
scheduler
.
requests
[
req1
.
request_id
].
num_tokens
==
13
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
assert
scheduler_output
.
num_scheduled_tokens
[
"1"
]
==
1
# Loop until req0 is finished.
req_id
=
0
expected_num_tokens
=
[
engine_core
.
scheduler
.
requests
[
"0"
].
num_tokens
+
1
,
engine_core
.
scheduler
.
requests
[
"1"
].
num_tokens
+
1
,
]
while
engine_core
.
scheduler
.
get_num_unfinished_requests
()
==
2
:
output
=
engine_core
.
step_with_batch_queue
()[
0
]
# Every step consumes an output.
assert
output
is
not
None
assert
output
is
not
None
assert
len
(
output
.
outputs
)
==
1
assert
len
(
output
[
0
].
outputs
)
==
1
assert
engine_core
.
scheduler
.
requests
[
req0
.
request_id
].
num_tokens
==
13
if
req_id
in
engine_core
.
scheduler
.
requests
:
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
assert
(
assert
scheduler_output
.
num_scheduled_tokens
[
"0"
]
==
1
engine_core
.
scheduler
.
requests
[
req_id
].
num_tokens
==
expected_num_tokens
[
req_id
]
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
)
output
=
engine_core
.
step_with_batch_queue
()[
0
].
get
(
0
)
expected_num_tokens
[
req_id
]
+=
1
assert
output
is
not
None
req_id
=
(
req_id
+
1
)
%
2
assert
len
(
output
.
outputs
)
==
1
assert
engine_core
.
scheduler
.
requests
[
req1
.
request_id
].
num_tokens
==
13
scheduler_output
=
engine_core
.
batch_queue
[
-
1
][
1
]
assert
scheduler_output
.
num_scheduled_tokens
[
"1"
]
==
1
# Loop until req0 is finished.
req_id
=
0
expected_num_tokens
=
[
engine_core
.
scheduler
.
requests
[
"0"
].
num_tokens
+
1
,
engine_core
.
scheduler
.
requests
[
"1"
].
num_tokens
+
1
,
]
while
engine_core
.
scheduler
.
get_num_unfinished_requests
()
==
2
:
output
=
engine_core
.
step_with_batch_queue
()[
0
]
# Every step consumes an output.
assert
output
is
not
None
assert
len
(
output
[
0
].
outputs
)
==
1
if
req_id
in
engine_core
.
scheduler
.
requests
:
assert
(
engine_core
.
scheduler
.
requests
[
req_id
].
num_tokens
==
expected_num_tokens
[
req_id
]
)
expected_num_tokens
[
req_id
]
+=
1
req_id
=
(
req_id
+
1
)
%
2
@
multi_gpu_test
(
num_gpus
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_engine_core_tp
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_engine_core_tp
():
"""
"""
Test engine can initialize worker in tp properly
Test engine can initialize worker in tp properly
"""
"""
with
monkeypatch
.
context
()
as
m
:
"""Setup the EngineCore."""
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
engine_args
=
EngineArgs
(
"""Setup the EngineCore."""
model
=
MODEL_NAME
,
engine_args
=
EngineArgs
(
tensor_parallel_size
=
2
,
model
=
MODEL_NAME
,
# Reduce startup time.
tensor_parallel_size
=
2
,
enforce_eager
=
True
,
# Reduce startup time.
)
enforce_eager
=
True
,
vllm_config
=
engine_args
.
create_engine_config
()
)
executor_class
=
Executor
.
get_class
(
vllm_config
)
vllm_config
=
engine_args
.
create_engine_config
()
executor_class
=
Executor
.
get_class
(
vllm_config
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
engine_core
=
EngineCore
(
engine_core
=
EngineCore
(
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
)
)
def
get_worker_cache_config_field
(
worker
,
key
:
str
):
def
get_worker_cache_config_field
(
worker
,
key
:
str
):
return
getattr
(
worker
.
cache_config
,
key
)
return
getattr
(
worker
.
cache_config
,
key
)
num_gpu_blocks
=
engine_core
.
collective_rpc
(
num_gpu_blocks
=
engine_core
.
collective_rpc
(
get_worker_cache_config_field
,
args
=
(
"num_gpu_blocks"
,)
get_worker_cache_config_field
,
args
=
(
"num_gpu_blocks"
,)
)
)
num_cpu_blocks
=
engine_core
.
collective_rpc
(
num_cpu_blocks
=
engine_core
.
collective_rpc
(
get_worker_cache_config_field
,
args
=
(
"num_cpu_blocks"
,)
get_worker_cache_config_field
,
args
=
(
"num_cpu_blocks"
,)
)
)
assert
all
(
x
is
not
None
for
x
in
num_gpu_blocks
)
assert
all
(
x
is
not
None
for
x
in
num_gpu_blocks
)
assert
all
(
x
is
not
None
for
x
in
num_cpu_blocks
)
assert
all
(
x
is
not
None
for
x
in
num_cpu_blocks
)
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
def
test_engine_core_invalid_request_id_type
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_engine_core_invalid_request_id_type
():
"""Test that engine raises TypeError for non-string request_id."""
"""Test that engine raises TypeError for non-string request_id."""
with
monkeypatch
.
context
()
as
m
:
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
vllm_config
=
engine_args
.
create_engine_config
()
executor_class
=
Executor
.
get_class
(
vllm_config
)
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
with
set_default_torch_num_threads
(
1
):
vllm_config
=
engine_args
.
create_engine_config
()
engine_core
=
EngineCore
(
executor_class
=
Executor
.
get_class
(
vllm_config
)
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
)
with
set_default_torch_num_threads
(
1
):
engine_core
=
EngineCore
(
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
)
# Test with UUID object (common mistake)
# Test with UUID object (common mistake)
uuid_request
=
make_request
()
uuid_request
=
make_request
()
uuid_request
.
request_id
=
uuid
.
uuid4
()
# UUID object instead of string
uuid_request
.
request_id
=
uuid
.
uuid4
()
# UUID object instead of string
with
pytest
.
raises
(
TypeError
,
match
=
"request_id must be a string, got.*UUID"
):
with
pytest
.
raises
(
TypeError
,
match
=
"request_id must be a string, got.*UUID"
):
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
uuid_request
))
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
uuid_request
))
# Test with integer
# Test with integer
int_request
=
make_request
()
int_request
=
make_request
()
int_request
.
request_id
=
12345
int_request
.
request_id
=
12345
with
pytest
.
raises
(
TypeError
,
match
=
"request_id must be a string, got.*int"
):
with
pytest
.
raises
(
TypeError
,
match
=
"request_id must be a string, got.*int"
):
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
int_request
))
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
int_request
))
# Test with None
# Test with None
none_request
=
make_request
()
none_request
=
make_request
()
none_request
.
request_id
=
None
none_request
.
request_id
=
None
with
pytest
.
raises
(
with
pytest
.
raises
(
TypeError
,
match
=
"request_id must be a string, got.*NoneType"
):
TypeError
,
match
=
"request_id must be a string, got.*NoneType"
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
none_request
))
):
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
none_request
))
# Verify engine is still functional after errors
# Verify engine is still functional after errors
valid_request
=
make_request
()
valid_request
=
make_request
()
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
valid_request
))
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
valid_request
))
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
1
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
tests/v1/engine/test_engine_core_client.py
View file @
1e4ecca1
...
@@ -130,8 +130,6 @@ def test_engine_core_client(
...
@@ -130,8 +130,6 @@ def test_engine_core_client(
monkeypatch
:
pytest
.
MonkeyPatch
,
multiprocessing_mode
:
bool
monkeypatch
:
pytest
.
MonkeyPatch
,
multiprocessing_mode
:
bool
):
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Monkey-patch core engine utility function to test.
# Monkey-patch core engine utility function to test.
m
.
setattr
(
EngineCore
,
"echo"
,
echo
,
raising
=
False
)
m
.
setattr
(
EngineCore
,
"echo"
,
echo
,
raising
=
False
)
...
@@ -218,8 +216,6 @@ def test_engine_core_client(
...
@@ -218,8 +216,6 @@ def test_engine_core_client(
@
pytest
.
mark
.
asyncio
(
loop_scope
=
"function"
)
@
pytest
.
mark
.
asyncio
(
loop_scope
=
"function"
)
async
def
test_engine_core_client_asyncio
(
monkeypatch
:
pytest
.
MonkeyPatch
):
async
def
test_engine_core_client_asyncio
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Monkey-patch core engine utility function to test.
# Monkey-patch core engine utility function to test.
m
.
setattr
(
EngineCore
,
"echo"
,
echo
,
raising
=
False
)
m
.
setattr
(
EngineCore
,
"echo"
,
echo
,
raising
=
False
)
...
@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
...
@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
monkeypatch
:
pytest
.
MonkeyPatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Must set insecure serialization to allow returning custom types.
# Must set insecure serialization to allow returning custom types.
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
...
@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
...
@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
monkeypatch
:
pytest
.
MonkeyPatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Must set insecure serialization to allow returning custom types.
# Must set insecure serialization to allow returning custom types.
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
...
@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
...
@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
monkeypatch
:
pytest
.
MonkeyPatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Must set insecure serialization to allow returning custom types.
# Must set insecure serialization to allow returning custom types.
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
...
@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
...
@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
indirect
=
[
"publisher_config"
],
indirect
=
[
"publisher_config"
],
)
)
def
test_kv_cache_events
(
def
test_kv_cache_events
(
monkeypatch
:
pytest
.
MonkeyPatch
,
multiprocessing_mode
:
bool
,
multiprocessing_mode
:
bool
,
publisher_config
,
publisher_config
,
):
):
with
monkeypatch
.
context
()
as
m
:
block_size
=
16
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
num_blocks
=
2
block_size
=
16
num_blocks
=
2
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
,
engine_args
=
EngineArgs
(
enforce_eager
=
True
,
model
=
MODEL_NAME
,
enable_prefix_caching
=
True
,
enforce_eager
=
True
,
block_size
=
block_size
,
enable_prefix_caching
=
True
,
)
block_size
=
block_size
,
engine_args
.
kv_events_config
=
publisher_config
)
engine_args
.
kv_events_config
=
publisher_config
vllm_config
=
engine_args
.
create_engine_config
(
UsageContext
.
UNKNOWN_CONTEXT
)
vllm_config
=
engine_args
.
create_engine_config
(
UsageContext
.
UNKNOWN_CONTEXT
)
executor_class
=
Executor
.
get_class
(
vllm_config
)
executor_class
=
Executor
.
get_class
(
vllm_config
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
client
=
EngineCoreClient
.
make_client
(
client
=
EngineCoreClient
.
make_client
(
multiprocess_mode
=
multiprocessing_mode
,
multiprocess_mode
=
multiprocessing_mode
,
asyncio_mode
=
False
,
asyncio_mode
=
False
,
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
executor_class
=
executor_class
,
log_stats
=
False
,
log_stats
=
False
,
)
endpoint
=
publisher_config
.
endpoint
.
replace
(
"*"
,
"127.0.0.1"
)
subscriber
=
MockSubscriber
(
endpoint
,
topic
=
publisher_config
.
topic
,
decode_type
=
KVEventBatch
)
)
endpoint
=
publisher_config
.
endpoint
.
replace
(
"*"
,
"127.0.0.1"
)
subscriber
=
MockSubscriber
(
endpoint
,
topic
=
publisher_config
.
topic
,
decode_type
=
KVEventBatch
)
try
:
try
:
custom_tokens
=
list
(
range
(
num_blocks
*
block_size
))
custom_tokens
=
list
(
range
(
num_blocks
*
block_size
))
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
request
=
make_request
(
sampling_params
,
custom_tokens
)
request
=
make_request
(
sampling_params
,
custom_tokens
)
client
.
add_request
(
request
)
client
.
add_request
(
request
)
outputs
:
dict
[
str
,
list
]
=
{
request
.
request_id
:
[]}
outputs
:
dict
[
str
,
list
]
=
{
request
.
request_id
:
[]}
loop_until_done
(
client
,
outputs
)
loop_until_done
(
client
,
outputs
)
result
=
subscriber
.
receive_one
(
timeout
=
1000
)
result
=
subscriber
.
receive_one
(
timeout
=
1000
)
assert
result
is
not
None
,
"No message received"
assert
result
is
not
None
,
"No message received"
seq
,
received
=
result
seq
,
received
=
result
assert
seq
==
0
,
"Sequence number mismatch"
assert
seq
==
0
,
"Sequence number mismatch"
assert
len
(
received
.
events
)
==
1
,
(
assert
len
(
received
.
events
)
==
1
,
"We should have exactly one BlockStored event"
"We should have exactly one BlockStored event"
event
=
received
.
events
[
0
]
)
assert
isinstance
(
event
,
BlockStored
),
"We should have a BlockStored event"
event
=
received
.
events
[
0
]
assert
len
(
event
.
block_hashes
)
==
num_blocks
,
(
assert
isinstance
(
event
,
BlockStored
),
"We should have a BlockStored event"
"We should have a BlockStored event with 2 block_hashes"
assert
len
(
event
.
block_hashes
)
==
num_blocks
,
(
)
"We should have a BlockStored event with 2 block_hashes"
assert
event
.
block_size
==
block_size
,
(
)
"Block size should be the same as the block size"
assert
event
.
block_size
==
block_size
,
(
)
"Block size should be the same as the block size"
assert
event
.
parent_block_hash
is
None
,
"Parent block hash should be None"
)
assert
event
.
lora_id
is
None
,
"Lora id should be None"
assert
event
.
parent_block_hash
is
None
,
"Parent block hash should be None"
assert
len
(
event
.
token_ids
)
==
num_blocks
*
block_size
,
(
assert
event
.
lora_id
is
None
,
"Lora id should be None"
"Token ids should be the same as the custom tokens"
assert
len
(
event
.
token_ids
)
==
num_blocks
*
block_size
,
(
)
"Token ids should be the same as the custom tokens"
assert
event
.
token_ids
==
custom_tokens
,
(
)
"Token ids should be the same as the custom tokens"
assert
event
.
token_ids
==
custom_tokens
,
(
)
"Token ids should be the same as the custom tokens"
finally
:
)
client
.
shutdown
()
finally
:
subscriber
.
close
()
client
.
shutdown
()
subscriber
.
close
()
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -672,101 +657,96 @@ def test_kv_cache_events(
...
@@ -672,101 +657,96 @@ def test_kv_cache_events(
)
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
multi_gpu_test
(
num_gpus
=
4
)
async
def
test_kv_cache_events_dp
(
async
def
test_kv_cache_events_dp
(
monkeypatch
:
pytest
.
MonkeyPatch
,
multiprocessing_mode
:
bool
,
multiprocessing_mode
:
bool
,
publisher_config
,
publisher_config
,
):
):
with
monkeypatch
.
context
()
as
m
:
block_size
=
16
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
num_blocks
=
2
block_size
=
16
dp_size
=
2
num_blocks
=
2
tp_size
=
2
dp_size
=
2
tp_size
=
2
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
,
engine_args
=
EngineArgs
(
enforce_eager
=
True
,
model
=
MODEL_NAME
,
enable_prefix_caching
=
True
,
enforce_eager
=
True
,
data_parallel_size
=
dp_size
,
enable_prefix_caching
=
True
,
tensor_parallel_size
=
tp_size
,
data_parallel_size
=
dp_size
,
block_size
=
block_size
,
tensor_parallel_size
=
tp_size
,
)
block_size
=
block_size
,
engine_args
.
kv_events_config
=
publisher_config
)
engine_args
.
kv_events_config
=
publisher_config
vllm_config
=
engine_args
.
create_engine_config
(
UsageContext
.
UNKNOWN_CONTEXT
)
vllm_config
=
engine_args
.
create_engine_config
(
UsageContext
.
UNKNOWN_CONTEXT
)
executor_class
=
Executor
.
get_class
(
vllm_config
)
executor_class
=
Executor
.
get_class
(
vllm_config
)
with
set_default_torch_num_threads
(
1
):
with
set_default_torch_num_threads
(
1
):
client
=
EngineCoreClient
.
make_client
(
client
=
EngineCoreClient
.
make_client
(
multiprocess_mode
=
multiprocessing_mode
,
multiprocess_mode
=
multiprocessing_mode
,
asyncio_mode
=
True
,
asyncio_mode
=
True
,
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
executor_class
=
executor_class
,
log_stats
=
False
,
log_stats
=
False
,
)
)
await
asyncio
.
sleep
(
1
)
await
asyncio
.
sleep
(
1
)
# Build endpoints for all DP ranks
# Build endpoints for all DP ranks
base_endpoint
=
publisher_config
.
endpoint
.
replace
(
"*"
,
"127.0.0.1"
)
base_endpoint
=
publisher_config
.
endpoint
.
replace
(
"*"
,
"127.0.0.1"
)
endpoints
=
[]
endpoints
=
[]
for
i
in
range
(
dp_size
):
for
i
in
range
(
dp_size
):
offset_endpoint
=
ZmqEventPublisher
.
offset_endpoint_port
(
base_endpoint
,
i
)
offset_endpoint
=
ZmqEventPublisher
.
offset_endpoint_port
(
base_endpoint
,
i
)
endpoints
.
append
(
offset_endpoint
)
endpoints
.
append
(
offset_endpoint
)
subscriber
=
MockSubscriber
(
subscriber
=
MockSubscriber
(
endpoints
,
topic
=
publisher_config
.
topic
,
decode_type
=
KVEventBatch
endpoints
,
topic
=
publisher_config
.
topic
,
decode_type
=
KVEventBatch
)
)
try
:
try
:
custom_tokens
=
list
(
range
(
num_blocks
*
block_size
))
custom_tokens
=
list
(
range
(
num_blocks
*
block_size
))
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
all_request_ids
=
[]
all_request_ids
=
[]
# Create and add 25 requests
# Create and add 25 requests
# NOTE: attempts to force routing to both dp groups but can be flaky
# NOTE: attempts to force routing to both dp groups but can be flaky
for
i
in
range
(
25
):
for
i
in
range
(
25
):
await
asyncio
.
sleep
(
0.01
)
await
asyncio
.
sleep
(
0.01
)
request
=
make_request
(
sampling_params
,
custom_tokens
)
request
=
make_request
(
sampling_params
,
custom_tokens
)
await
client
.
add_request_async
(
request
)
await
client
.
add_request_async
(
request
)
all_request_ids
.
append
(
request
.
request_id
)
all_request_ids
.
append
(
request
.
request_id
)
await
asyncio
.
sleep
(
0.1
)
await
asyncio
.
sleep
(
0.1
)
# Initialize outputs dict for all requests
# Initialize outputs dict for all requests
outputs
:
dict
[
str
,
list
]
=
{
req_id
:
[]
for
req_id
in
all_request_ids
}
outputs
:
dict
[
str
,
list
]
=
{
req_id
:
[]
for
req_id
in
all_request_ids
}
print
(
"processing requests..."
)
print
(
"processing requests..."
)
await
asyncio
.
wait_for
(
await
asyncio
.
wait_for
(
loop_until_fully_done_async
(
client
,
outputs
),
timeout
=
20.0
loop_until_fully_done_async
(
client
,
outputs
),
timeout
=
20.0
)
)
# Receive from subscriber until no more messages
# Receive from subscriber until no more messages
print
(
"collecting results..."
)
print
(
"collecting results..."
)
results
=
[]
results
=
[]
while
True
:
while
True
:
result
=
subscriber
.
receive_one
(
timeout
=
1
)
result
=
subscriber
.
receive_one
(
timeout
=
1
)
print
(
result
)
print
(
result
)
if
result
is
None
:
if
result
is
None
:
break
break
results
.
append
(
result
)
results
.
append
(
result
)
# Collect all events and data_parallel_ranks from all results
# Collect all events and data_parallel_ranks from all results
all_dp_ranks
=
[
received
.
data_parallel_rank
for
(
_
,
received
)
in
results
]
all_dp_ranks
=
[
received
.
data_parallel_rank
for
(
_
,
received
)
in
results
]
unique_dps
=
set
(
all_dp_ranks
)
unique_dps
=
set
(
all_dp_ranks
)
assert
len
(
unique_dps
)
==
2
,
(
assert
len
(
unique_dps
)
==
2
,
(
f
"Expected 2 unique data_parallel_ranks, got
{
len
(
unique_dps
)
}
"
f
"Expected 2 unique data_parallel_ranks, got
{
len
(
unique_dps
)
}
"
)
)
finally
:
finally
:
client
.
shutdown
()
client
.
shutdown
()
subscriber
.
close
()
subscriber
.
close
()
@
pytest
.
mark
.
timeout
(
20
)
@
pytest
.
mark
.
timeout
(
20
)
def
test_startup_failure
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_startup_failure
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
,
pytest
.
raises
(
Exception
)
as
e_info
:
with
monkeypatch
.
context
()
as
m
,
pytest
.
raises
(
Exception
)
as
e_info
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Monkey-patch to extract core process pid while it's starting.
# Monkey-patch to extract core process pid while it's starting.
core_proc_pid
=
[
None
]
core_proc_pid
=
[
None
]
cepm_ctor
=
CoreEngineProcManager
.
__init__
cepm_ctor
=
CoreEngineProcManager
.
__init__
...
@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
...
@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
mock_executor_class
.
side_effect
=
create_mock_executor
mock_executor_class
.
side_effect
=
create_mock_executor
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"CUDA_VISIBLE_DEVICES"
,
""
)
# No CUDA devices
m
.
setenv
(
"CUDA_VISIBLE_DEVICES"
,
""
)
# No CUDA devices
from
vllm.v1.engine.utils
import
EngineZmqAddresses
from
vllm.v1.engine.utils
import
EngineZmqAddresses
...
...
tests/v1/engine/test_llm_engine.py
View file @
1e4ecca1
...
@@ -21,12 +21,10 @@ DTYPE = "half"
...
@@ -21,12 +21,10 @@ DTYPE = "half"
def
_vllm_model
(
def
_vllm_model
(
apc
:
bool
,
apc
:
bool
,
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
*
,
*
,
skip_tokenizer_init
:
bool
=
False
,
skip_tokenizer_init
:
bool
=
False
,
):
):
"""Set up VllmRunner instance."""
"""Set up VllmRunner instance."""
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
return
vllm_runner
(
return
vllm_runner
(
MODEL
,
MODEL
,
dtype
=
DTYPE
,
dtype
=
DTYPE
,
...
@@ -45,16 +43,16 @@ def _vllm_model(
...
@@ -45,16 +43,16 @@ def _vllm_model(
# Prefix caching
# Prefix caching
params
=
[
False
,
True
],
params
=
[
False
,
True
],
)
)
def
vllm_model
(
vllm_runner
,
request
,
monkeypatch
):
def
vllm_model
(
vllm_runner
,
request
):
"""VllmRunner test fixture parameterized by APC True/False."""
"""VllmRunner test fixture parameterized by APC True/False."""
with
_vllm_model
(
request
.
param
,
vllm_runner
,
monkeypatch
)
as
vllm_model
:
with
_vllm_model
(
request
.
param
,
vllm_runner
)
as
vllm_model
:
yield
vllm_model
yield
vllm_model
@
pytest
.
fixture
(
scope
=
"function"
)
@
pytest
.
fixture
(
scope
=
"function"
)
def
vllm_model_apc
(
vllm_runner
,
monkeypatch
):
def
vllm_model_apc
(
vllm_runner
):
"""VllmRunner test fixture with APC."""
"""VllmRunner test fixture with APC."""
with
_vllm_model
(
True
,
vllm_runner
,
monkeypatch
)
as
vllm_model
:
with
_vllm_model
(
True
,
vllm_runner
)
as
vllm_model
:
yield
vllm_model
yield
vllm_model
...
@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
...
@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
# Prefix caching
# Prefix caching
params
=
[
False
,
True
],
params
=
[
False
,
True
],
)
)
def
vllm_model_skip_tokenizer_init
(
vllm_runner
,
request
,
monkeypatch
):
def
vllm_model_skip_tokenizer_init
(
vllm_runner
,
request
):
"""VllmRunner test fixture with APC."""
"""VllmRunner test fixture with APC."""
with
_vllm_model
(
with
_vllm_model
(
request
.
param
,
request
.
param
,
vllm_runner
,
vllm_runner
,
monkeypatch
,
skip_tokenizer_init
=
True
,
skip_tokenizer_init
=
True
,
)
as
vllm_model
:
)
as
vllm_model
:
yield
vllm_model
yield
vllm_model
...
@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
...
@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
)
)
def
test_engine_metrics
(
vllm_runner
,
monkeypatch
,
example_prompts
):
def
test_engine_metrics
(
vllm_runner
,
example_prompts
):
max_tokens
=
100
max_tokens
=
100
# Use spec decoding to test num_accepted_tokens_per_pos
# Use spec decoding to test num_accepted_tokens_per_pos
speculative_config
=
{
speculative_config
=
{
...
@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
...
@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
"prompt_lookup_min"
:
3
,
"prompt_lookup_min"
:
3
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
}
}
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
with
vllm_runner
(
MODEL
,
MODEL
,
speculative_config
=
speculative_config
,
speculative_config
=
speculative_config
,
...
@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
...
@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"meta-llama/Llama-3.2-1B-Instruct"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"meta-llama/Llama-3.2-1B-Instruct"
])
def
test_skip_tokenizer_initialization
(
model
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_skip_tokenizer_initialization
(
model
:
str
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# This test checks if the flag skip_tokenizer_init skips the initialization
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
# token ids.
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment