Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1e4ecca1
Unverified
Commit
1e4ecca1
authored
Oct 07, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 07, 2025
Browse files
[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c0a7b89d
Changes
51
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
489 additions
and
606 deletions
+489
-606
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+6
-8
tests/models/multimodal/generation/test_maverick.py
tests/models/multimodal/generation/test_maverick.py
+0
-1
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
..._dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+1
-5
tests/plugins_tests/test_scheduler_plugins.py
tests/plugins_tests/test_scheduler_plugins.py
+0
-1
tests/samplers/test_no_bad_words.py
tests/samplers/test_no_bad_words.py
+0
-7
tests/tpu/lora/test_lora.py
tests/tpu/lora/test_lora.py
+0
-11
tests/v1/attention/utils.py
tests/v1/attention/utils.py
+0
-1
tests/v1/core/test_kv_sharing.py
tests/v1/core/test_kv_sharing.py
+3
-0
tests/v1/core/test_scheduler_e2e.py
tests/v1/core/test_scheduler_e2e.py
+0
-4
tests/v1/cudagraph/test_cudagraph_mode.py
tests/v1/cudagraph/test_cudagraph_mode.py
+2
-2
tests/v1/e2e/test_cascade_attention.py
tests/v1/e2e/test_cascade_attention.py
+0
-1
tests/v1/e2e/test_correctness_sliding_window.py
tests/v1/e2e/test_correctness_sliding_window.py
+29
-34
tests/v1/e2e/test_kv_sharing_fast_prefill.py
tests/v1/e2e/test_kv_sharing_fast_prefill.py
+0
-2
tests/v1/e2e/test_min_tokens.py
tests/v1/e2e/test_min_tokens.py
+1
-5
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+0
-1
tests/v1/engine/test_async_llm.py
tests/v1/engine/test_async_llm.py
+14
-44
tests/v1/engine/test_engine_args.py
tests/v1/engine/test_engine_args.py
+0
-7
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core.py
+300
-314
tests/v1/engine/test_engine_core_client.py
tests/v1/engine/test_engine_core_client.py
+125
-146
tests/v1/engine/test_llm_engine.py
tests/v1/engine/test_llm_engine.py
+8
-12
No files found.
tests/kernels/test_flex_attention.py
View file @
1e4ecca1
...
...
@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
set_seed
(
seed
)
...
...
@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
...
...
@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
with
vllm_runner
(
model_name
,
...
...
@@ -126,16 +123,17 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
flex_outputs
=
llm_flex
.
embed
(
prompts
)
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
with
(
m
onkeypatch
.
context
()
as
m
,
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_default
:
)
as
llm_default
,
):
default_outputs
=
llm_default
.
embed
(
prompts
)
check_embeddings_close
(
...
...
tests/models/multimodal/generation/test_maverick.py
View file @
1e4ecca1
...
...
@@ -613,7 +613,6 @@ def test_dummy_maverick(
profile
:
bool
=
False
,
)
->
None
:
# Disable multiprocessing allows us to access model executor from LLM engine
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
model_path
=
create_reduced_maverick_model
(
...
...
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
View file @
1e4ecca1
...
...
@@ -8,7 +8,6 @@ if TYPE_CHECKING:
from
vllm.config
import
VllmConfig
else
:
VllmConfig
=
None
from
vllm
import
envs
class
DummyPlatform
(
Platform
):
...
...
@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
if
envs
.
VLLM_USE_V1
:
compilation_config
=
vllm_config
.
compilation_config
# Activate custom ops for v1.
compilation_config
.
custom_ops
=
[
"all"
]
vllm_config
.
compilation_config
.
custom_ops
=
[
"all"
]
def
get_attn_backend_cls
(
self
,
...
...
tests/plugins_tests/test_scheduler_plugins.py
View file @
1e4ecca1
...
...
@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
def
test_scheduler_plugins_v1
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Explicitly turn off engine multiprocessing so
# that the scheduler runs in this process
m
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
...
...
tests/samplers/test_no_bad_words.py
View file @
1e4ecca1
...
...
@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
from
typing
import
Optional
import
pytest
from
transformers
import
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
monkeypatch
):
"""Only run on vLLM v1."""
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
def
_generate
(
llm
:
LLM
,
prompt
:
str
,
...
...
tests/tpu/lora/test_lora.py
View file @
1e4ecca1
...
...
@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
# 100 training iterations with a training batch size of 100.
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v1_only
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
for all tests in this file
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
yield
def
setup_vllm
(
num_loras
:
int
,
tp
:
int
)
->
vllm
.
LLM
:
return
vllm
.
LLM
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
...
...
tests/v1/attention/utils.py
View file @
1e4ecca1
...
...
@@ -305,7 +305,6 @@ full_cg_backend_configs = {
"CutlassMLA"
:
BackendConfig
(
name
=
"CutlassMLA"
,
env_vars
=
{
"VLLM_USE_V1"
:
"1"
,
"VLLM_ATTENTION_BACKEND"
:
"CUTLASS_MLA"
,
"FORCE_NUM_KV_SPLITS"
:
"1"
,
# TODO: remove this when hang issue is fixed
},
...
...
tests/v1/core/test_kv_sharing.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
vllm.v1.kv_cache_interface
import
FullAttentionSpec
,
KVCacheGroupSpec
from
vllm.v1.worker.utils
import
add_kv_sharing_layers_to_kv_cache_groups
pytestmark
=
pytest
.
mark
.
cpu_test
def
new_kv_cache_spec
():
return
FullAttentionSpec
(
16
,
1
,
1
,
torch
.
float32
,
False
)
...
...
tests/v1/core/test_scheduler_e2e.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
from
vllm
import
LLM
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
PROMPT
=
"Hello my name is Robert and I"
...
...
tests/v1/cudagraph/test_cudagraph_mode.py
View file @
1e4ecca1
...
...
@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
):
pytest
.
skip
(
"Only Hopper GPUs support FA3 and FlashMLA"
)
env_vars
=
{
"VLLM_USE_V1"
:
"1"
,
**
backend_configs
[
backend_name
].
env_vars
}
env_vars
=
backend_configs
[
backend_name
].
env_vars
with
temporary_environ
(
env_vars
),
ExitStack
()
as
stack
:
if
not
supported
:
...
...
@@ -117,7 +117,7 @@ combo_cases_2 = [
def
test_cudagraph_compilation_combo
(
combo_case
):
backend_name
,
cudagraph_mode
,
compilation_level
,
supported
=
combo_case
env_vars
=
{
"VLLM_USE_V1"
:
"1"
,
**
backend_configs
[
backend_name
].
env_vars
}
env_vars
=
backend_configs
[
backend_name
].
env_vars
with
temporary_environ
(
env_vars
),
ExitStack
()
as
stack
:
if
not
supported
:
...
...
tests/v1/e2e/test_cascade_attention.py
View file @
1e4ecca1
...
...
@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
)
...
...
tests/v1/e2e/test_correctness_sliding_window.py
View file @
1e4ecca1
...
...
@@ -32,7 +32,7 @@ model_config = {
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"disable_hybrid_kv_cache_manager"
,
[
True
,
False
])
def
test_sliding_window_retrieval
(
monkeypatch
,
model
,
batch_size
,
seed
,
disable_hybrid_kv_cache_manager
model
,
batch_size
,
seed
,
disable_hybrid_kv_cache_manager
):
"""
The test does a bunch of assignments "x1 = 10
\n
x2 = 33
\n
..." and then
...
...
@@ -40,9 +40,6 @@ def test_sliding_window_retrieval(
If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly).
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
test_config
=
model_config
[
model
]
llm
=
LLM
(
...
...
@@ -50,9 +47,7 @@ def test_sliding_window_retrieval(
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
100
)
prompts
,
answer
,
indices
=
prep_prompts
(
batch_size
,
ln_range
=
test_config
.
ln_range
)
prompts
,
answer
,
indices
=
prep_prompts
(
batch_size
,
ln_range
=
test_config
.
ln_range
)
check_length
(
prompts
,
llm
,
test_config
.
sliding_window
)
...
...
tests/v1/e2e/test_kv_sharing_fast_prefill.py
View file @
1e4ecca1
...
...
@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Make scheduling deterministic for reproducibility
m
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
...
...
tests/v1/e2e/test_min_tokens.py
View file @
1e4ecca1
...
...
@@ -13,7 +13,6 @@ Covers:
5) Multiple stop conditions
"""
import
os
from
typing
import
Optional
,
Union
import
pytest
...
...
@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm_v1
():
"""Create V1 LLM instance for testing"""
# Ensure V1 engine is used
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
llm
=
LLM
(
model
=
TEST_MODEL
,
tensor_parallel_size
=
1
,
...
...
@@ -503,6 +499,6 @@ if __name__ == "__main__":
Usage:
cd vllm/
VLLM_USE_V1=1
python -m pytest tests/v1/e2e/test_min_tokens.py -v
python -m pytest tests/v1/e2e/test_min_tokens.py -v
"""
pytest
.
main
([
__file__
,
"-v"
])
tests/v1/e2e/test_spec_decode.py
View file @
1e4ecca1
...
...
@@ -301,7 +301,6 @@ def test_mtp_correctness(
model_setup: (method, model_name, tp_size)
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
)
method
,
model_name
,
tp_size
=
model_setup
...
...
tests/v1/engine/test_async_llm.py
View file @
1e4ecca1
...
...
@@ -95,17 +95,11 @@ async def generate(
)
@
pytest
.
mark
.
asyncio
async
def
test_load
(
monkeypatch
:
pytest
.
MonkeyPatch
,
output_kind
:
RequestOutputKind
,
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
,
):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the
# tests.
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
after
.
callback
(
engine
.
shutdown
)
...
...
@@ -149,14 +143,11 @@ async def test_load(
)
@
pytest
.
mark
.
asyncio
async
def
test_abort
(
monkeypatch
:
pytest
.
MonkeyPatch
,
output_kind
:
RequestOutputKind
,
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
,
):
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
after
.
callback
(
engine
.
shutdown
)
...
...
@@ -222,13 +213,8 @@ async def test_abort(
"output_kind"
,
[
RequestOutputKind
.
DELTA
,
RequestOutputKind
.
FINAL_ONLY
]
)
@
pytest
.
mark
.
asyncio
async
def
test_multi_abort
(
monkeypatch
:
pytest
.
MonkeyPatch
,
output_kind
:
RequestOutputKind
,
):
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
async
def
test_multi_abort
(
output_kind
:
RequestOutputKind
):
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
after
.
callback
(
engine
.
shutdown
)
...
...
@@ -304,14 +290,11 @@ async def test_multi_abort(
)
@
pytest
.
mark
.
asyncio
async
def
test_finished_flag
(
monkeypatch
:
pytest
.
MonkeyPatch
,
n
:
int
,
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
,
):
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
after
.
callback
(
engine
.
shutdown
)
...
...
@@ -341,12 +324,10 @@ async def test_finished_flag(
)
@
pytest
.
mark
.
asyncio
async
def
test_mid_stream_cancellation
(
monkeypatch
:
pytest
.
MonkeyPatch
,
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
engine_args
:
AsyncEngineArgs
,
prompt
:
PromptType
):
"""Test that requests can be cancelled mid-stream."""
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
)
after
.
callback
(
engine
.
shutdown
)
...
...
@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
be added to the default loggers.
"""
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
,
...
...
@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
async
def
test_dp_rank_argument
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
async
def
test_dp_rank_argument
():
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
after
.
callback
(
engine
.
shutdown
)
...
...
@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
@
pytest
.
mark
.
asyncio
async
def
test_check_health
(
monkeypatch
:
pytest
.
MonkeyPatch
):
async
def
test_check_health
():
"""Test that check_health returns normally for healthy engine
and raises EngineDeadError when the engine is dead.
"""
...
...
@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
from
vllm.v1.engine.exceptions
import
EngineDeadError
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
after
.
callback
(
engine
.
shutdown
)
...
...
@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
"output_kind"
,
[
RequestOutputKind
.
DELTA
,
RequestOutputKind
.
FINAL_ONLY
]
)
@
pytest
.
mark
.
asyncio
async
def
test_abort_final_output
(
monkeypatch
:
pytest
.
MonkeyPatch
,
output_kind
:
RequestOutputKind
,
):
async
def
test_abort_final_output
(
output_kind
:
RequestOutputKind
):
"""Test that abort() returns a final output with correct information."""
with
monkeypatch
.
context
()
as
m
,
ExitStack
()
as
after
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
after
.
callback
(
engine
.
shutdown
)
...
...
tests/v1/engine/test_engine_args.py
View file @
1e4ecca1
...
...
@@ -5,18 +5,11 @@ from argparse import ArgumentError
import
pytest
from
vllm
import
envs
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
FlexibleArgumentParser
if
not
envs
.
VLLM_USE_V1
:
pytest
.
skip
(
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test."
,
allow_module_level
=
True
,
)
def
test_prefix_caching_from_cli
():
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
...
...
tests/v1/engine/test_engine_core.py
View file @
1e4ecca1
...
...
@@ -46,9 +46,7 @@ def make_request() -> EngineCoreRequest:
@
create_new_process_for_each_test
()
def
test_engine_core
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
def
test_engine_core
():
"""Setup the EngineCore."""
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
vllm_config
=
engine_args
.
create_engine_config
()
...
...
@@ -176,14 +174,12 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
@
create_new_process_for_each_test
()
def
test_engine_core_advanced_sampling
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_engine_core_advanced_sampling
():
"""
A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and
presence_penalty, are set.
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
"""Setup the EngineCore."""
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
vllm_config
=
engine_args
.
create_engine_config
()
...
...
@@ -227,7 +223,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
@
create_new_process_for_each_test
()
def
test_engine_core_concurrent_batches
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_engine_core_concurrent_batches
():
"""
Test that the engine can handle multiple concurrent batches.
"""
...
...
@@ -272,9 +268,6 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
if
hasattr
(
self
,
"thread_pool"
):
self
.
thread_pool
.
shutdown
(
wait
=
False
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
,
# To test concurrent batches.
...
...
@@ -364,13 +357,11 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_engine_core_tp
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_engine_core_tp
():
"""
Test engine can initialize worker in tp properly
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
"""Setup the EngineCore."""
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
,
...
...
@@ -400,11 +391,8 @@ def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch):
@
create_new_process_for_each_test
()
def
test_engine_core_invalid_request_id_type
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_engine_core_invalid_request_id_type
():
"""Test that engine raises TypeError for non-string request_id."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
vllm_config
=
engine_args
.
create_engine_config
()
executor_class
=
Executor
.
get_class
(
vllm_config
)
...
...
@@ -432,9 +420,7 @@ def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch):
none_request
=
make_request
()
none_request
.
request_id
=
None
with
pytest
.
raises
(
TypeError
,
match
=
"request_id must be a string, got.*NoneType"
):
with
pytest
.
raises
(
TypeError
,
match
=
"request_id must be a string, got.*NoneType"
):
engine_core
.
add_request
(
*
engine_core
.
preprocess_add_request
(
none_request
))
# Verify engine is still functional after errors
...
...
tests/v1/engine/test_engine_core_client.py
View file @
1e4ecca1
...
...
@@ -130,8 +130,6 @@ def test_engine_core_client(
monkeypatch
:
pytest
.
MonkeyPatch
,
multiprocessing_mode
:
bool
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Monkey-patch core engine utility function to test.
m
.
setattr
(
EngineCore
,
"echo"
,
echo
,
raising
=
False
)
...
...
@@ -218,8 +216,6 @@ def test_engine_core_client(
@
pytest
.
mark
.
asyncio
(
loop_scope
=
"function"
)
async
def
test_engine_core_client_asyncio
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Monkey-patch core engine utility function to test.
m
.
setattr
(
EngineCore
,
"echo"
,
echo
,
raising
=
False
)
...
...
@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
monkeypatch
:
pytest
.
MonkeyPatch
,
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Must set insecure serialization to allow returning custom types.
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
...
...
@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
monkeypatch
:
pytest
.
MonkeyPatch
,
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Must set insecure serialization to allow returning custom types.
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
...
...
@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
monkeypatch
:
pytest
.
MonkeyPatch
,
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Must set insecure serialization to allow returning custom types.
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
...
...
@@ -592,12 +582,9 @@ async def test_engine_core_client_util_method_nested_structures(
indirect
=
[
"publisher_config"
],
)
def
test_kv_cache_events
(
monkeypatch
:
pytest
.
MonkeyPatch
,
multiprocessing_mode
:
bool
,
publisher_config
,
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
block_size
=
16
num_blocks
=
2
...
...
@@ -640,9 +627,7 @@ def test_kv_cache_events(
seq
,
received
=
result
assert
seq
==
0
,
"Sequence number mismatch"
assert
len
(
received
.
events
)
==
1
,
(
"We should have exactly one BlockStored event"
)
assert
len
(
received
.
events
)
==
1
,
"We should have exactly one BlockStored event"
event
=
received
.
events
[
0
]
assert
isinstance
(
event
,
BlockStored
),
"We should have a BlockStored event"
assert
len
(
event
.
block_hashes
)
==
num_blocks
,
(
...
...
@@ -672,12 +657,9 @@ def test_kv_cache_events(
)
@
multi_gpu_test
(
num_gpus
=
4
)
async
def
test_kv_cache_events_dp
(
monkeypatch
:
pytest
.
MonkeyPatch
,
multiprocessing_mode
:
bool
,
publisher_config
,
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
block_size
=
16
num_blocks
=
2
dp_size
=
2
...
...
@@ -765,8 +747,6 @@ async def test_kv_cache_events_dp(
@
pytest
.
mark
.
timeout
(
20
)
def
test_startup_failure
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
,
pytest
.
raises
(
Exception
)
as
e_info
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Monkey-patch to extract core process pid while it's starting.
core_proc_pid
=
[
None
]
cepm_ctor
=
CoreEngineProcManager
.
__init__
...
...
@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
mock_executor_class
.
side_effect
=
create_mock_executor
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"CUDA_VISIBLE_DEVICES"
,
""
)
# No CUDA devices
from
vllm.v1.engine.utils
import
EngineZmqAddresses
...
...
tests/v1/engine/test_llm_engine.py
View file @
1e4ecca1
...
...
@@ -21,12 +21,10 @@ DTYPE = "half"
def
_vllm_model
(
apc
:
bool
,
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
*
,
skip_tokenizer_init
:
bool
=
False
,
):
"""Set up VllmRunner instance."""
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
return
vllm_runner
(
MODEL
,
dtype
=
DTYPE
,
...
...
@@ -45,16 +43,16 @@ def _vllm_model(
# Prefix caching
params
=
[
False
,
True
],
)
def
vllm_model
(
vllm_runner
,
request
,
monkeypatch
):
def
vllm_model
(
vllm_runner
,
request
):
"""VllmRunner test fixture parameterized by APC True/False."""
with
_vllm_model
(
request
.
param
,
vllm_runner
,
monkeypatch
)
as
vllm_model
:
with
_vllm_model
(
request
.
param
,
vllm_runner
)
as
vllm_model
:
yield
vllm_model
@
pytest
.
fixture
(
scope
=
"function"
)
def
vllm_model_apc
(
vllm_runner
,
monkeypatch
):
def
vllm_model_apc
(
vllm_runner
):
"""VllmRunner test fixture with APC."""
with
_vllm_model
(
True
,
vllm_runner
,
monkeypatch
)
as
vllm_model
:
with
_vllm_model
(
True
,
vllm_runner
)
as
vllm_model
:
yield
vllm_model
...
...
@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
# Prefix caching
params
=
[
False
,
True
],
)
def
vllm_model_skip_tokenizer_init
(
vllm_runner
,
request
,
monkeypatch
):
def
vllm_model_skip_tokenizer_init
(
vllm_runner
,
request
):
"""VllmRunner test fixture with APC."""
with
_vllm_model
(
request
.
param
,
vllm_runner
,
monkeypatch
,
skip_tokenizer_init
=
True
,
)
as
vllm_model
:
yield
vllm_model
...
...
@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
)
def
test_engine_metrics
(
vllm_runner
,
monkeypatch
,
example_prompts
):
def
test_engine_metrics
(
vllm_runner
,
example_prompts
):
max_tokens
=
100
# Use spec decoding to test num_accepted_tokens_per_pos
speculative_config
=
{
...
...
@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
"prompt_lookup_min"
:
3
,
"num_speculative_tokens"
:
5
,
}
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
MODEL
,
speculative_config
=
speculative_config
,
...
...
@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"meta-llama/Llama-3.2-1B-Instruct"
])
def
test_skip_tokenizer_initialization
(
model
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
def
test_skip_tokenizer_initialization
(
model
:
str
):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment