Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1e4ecca1
Unverified
Commit
1e4ecca1
authored
Oct 07, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 07, 2025
Browse files
[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c0a7b89d
Changes
51
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
247 additions
and
324 deletions
+247
-324
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+1
-15
tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
...onnector/nixl_integration/run_tpu_disagg_accuracy_test.sh
+0
-3
tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
...1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
+0
-2
tests/v1/metrics/test_ray_metrics.py
tests/v1/metrics/test_ray_metrics.py
+0
-14
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+154
-171
tests/v1/sample/test_sampling_params_e2e.py
tests/v1/sample/test_sampling_params_e2e.py
+0
-12
tests/v1/spec_decode/test_max_len.py
tests/v1/spec_decode/test_max_len.py
+0
-1
tests/v1/tpu/test_basic.py
tests/v1/tpu/test_basic.py
+46
-62
tests/v1/tpu/test_perf.py
tests/v1/tpu/test_perf.py
+38
-42
tests/v1/tracing/test_tracing.py
tests/v1/tracing/test_tracing.py
+1
-1
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+7
-1
No files found.
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
1e4ecca1
...
...
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
,
)
def
test_structured_output
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
unsupported_json_schema
:
dict
[
str
,
Any
],
sample_sql_ebnf
:
str
,
...
...
@@ -115,8 +114,6 @@ def test_structured_output(
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
],
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
current_platform
.
is_tpu
()
and
speculative_config
:
pytest
.
skip
(
"TPU does not support speculative decoding"
)
...
...
@@ -620,15 +617,12 @@ Make the response as short as possible.
],
)
def
test_structured_output_with_reasoning_matrices
(
monkeypatch
:
pytest
.
MonkeyPatch
,
backend
:
str
,
tokenizer_mode
:
TokenizerMode
,
reasoning_parser
:
str
,
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
]
|
None
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
current_platform
.
is_tpu
()
and
speculative_config
:
pytest
.
skip
(
"TPU does not support speculative decoding"
)
...
...
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"model_name, tokenizer_mode"
,
PARAMS_MODELS_TOKENIZER_MODE
)
def
test_structured_output_auto_mode
(
monkeypatch
:
pytest
.
MonkeyPatch
,
unsupported_json_schema
:
dict
[
str
,
Any
],
model_name
:
str
,
tokenizer_mode
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
...
...
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
@
pytest
.
mark
.
skip_global_cleanup
def
test_guidance_no_additional_properties
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
def
test_guidance_no_additional_properties
():
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
max_model_len
=
1024
,
...
...
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"guidance"
,
"xgrammar"
,
"outlines"
])
def
test_structured_output_batched_with_non_structured_outputs_requests
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
backend
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime
enforce_eager
=
bool
(
not
current_platform
.
is_tpu
())
...
...
tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
View file @
1e4ecca1
...
...
@@ -53,7 +53,6 @@ cleanup() {
launch_baseline
()
{
BASELINE_BASE_CMD
=
"source
${
CONDA_PATH
}
/bin/activate
${
CONDA_ENV_NAME
}
;
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
@@ -73,7 +72,6 @@ launch_pd() {
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
PJRT_DEVICE=TPU
\
...
...
@@ -93,7 +91,6 @@ launch_pd() {
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
View file @
1e4ecca1
...
...
@@ -55,7 +55,6 @@ launch_pd() {
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
PJRT_DEVICE=TPU
\
...
...
@@ -75,7 +74,6 @@ launch_pd() {
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
tests/v1/metrics/test_ray_metrics.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
ray
...
...
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusMetric
,
RayPrometheusStatLogger
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v1_only
(
monkeypatch
):
"""
The change relies on V1 APIs, so set VLLM_USE_V1=1.
"""
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
MODELS
=
[
"distilbert/distilgpt2"
,
]
...
...
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
@
ray
.
remote
(
num_gpus
=
1
)
class
EngineTestActor
:
async
def
run
(
self
):
# Set environment variable inside the Ray actor since environment
# variables from pytest fixtures don't propagate to Ray actors
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
engine_args
=
AsyncEngineArgs
(
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
enforce_eager
=
True
)
...
...
tests/v1/sample/test_logprobs.py
View file @
1e4ecca1
...
...
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
batch_logprobs_composition
:
BatchLogprobsComposition
,
temperature
:
float
,
example_prompts
:
list
[
str
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""Test V1 Engine logprobs & prompt logprobs
...
...
@@ -308,12 +307,8 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter
example_prompts: example prompt fixture
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
do_apc
=
vllm_model
.
llm
.
llm_engine
.
cache_config
.
enable_prefix_caching
if
do_apc
and
(
temperature
<
2.0
or
batch_logprobs_composition
!=
SAMPLE_PROMPT
):
if
do_apc
and
(
temperature
<
2.0
or
batch_logprobs_composition
!=
SAMPLE_PROMPT
):
# Skip some test-cases to save time.
pytest
.
skip
()
test_prompts
=
example_prompts
...
...
@@ -361,14 +356,11 @@ def test_get_logprobs_and_prompt_logprobs(
)
def
test_max_logprobs
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_max_logprobs
():
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation.
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
runner
=
VllmRunner
(
"facebook/opt-125m"
,
max_logprobs
=
1
,
...
...
@@ -386,15 +378,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
runner
.
generate
([
"Hello world"
],
sampling_params
=
bad_sampling_params
)
def
test_none_logprobs
(
vllm_model
,
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_none_logprobs
(
vllm_model
,
example_prompts
):
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
Args:
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
max_tokens
=
5
sampling_params_logprobs_none
=
SamplingParams
(
...
...
@@ -416,15 +406,13 @@ def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
assert
results_logprobs_none
[
i
].
prompt_logprobs
is
None
def
test_zero_logprobs
(
vllm_model
,
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_zero_logprobs
(
vllm_model
,
example_prompts
):
"""Engine should return sampled token and prompt token logprobs
Args:
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
max_tokens
=
5
sampling_params_logprobs_zero
=
SamplingParams
(
...
...
@@ -450,14 +438,12 @@ def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
assert
len
(
prompt_token_ids
)
==
len
(
prompt_logprobs
)
def
test_all_logprobs
(
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_all_logprobs
(
example_prompts
):
"""Engine should return all vocabulary logprobs and prompt logprobs
Args:
example_prompts: list of example prompts (test fixture)
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
runner
=
VllmRunner
(
"facebook/opt-125m"
,
max_logprobs
=-
1
,
...
...
@@ -488,16 +474,13 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
@
pytest
.
mark
.
parametrize
(
"logprobs_mode"
,
get_args
(
LogprobsMode
))
def
test_logprobs_mode
(
logprobs_mode
:
LogprobsMode
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_logprobs_mode
(
logprobs_mode
:
LogprobsMode
):
"""Test with LLM engine with different logprobs_mode.
For logprobs, we should have non-positive values.
For logits, we should expect at least one positive values.
"""
from
vllm
import
LLM
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
"facebook/opt-125m"
,
max_logprobs
=
5
,
...
...
tests/v1/sample/test_sampling_params_e2e.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
PROMPT
=
"Hello my name is Robert and I"
...
...
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
10000000
]))
def
test_priority
(
llm
):
"""Check that we reject requests with priority."""
# Reject all allowed token ids
with
pytest
.
raises
(
ValueError
):
_
=
llm
.
generate
(
PROMPT
,
priority
=
[
1
])
def
test_seed
(
llm
):
"""Check that seed impacts randomness."""
...
...
tests/v1/spec_decode/test_max_len.py
View file @
1e4ecca1
...
...
@@ -38,7 +38,6 @@ def test_eagle_max_len(
monkeypatch
:
pytest
.
MonkeyPatch
,
num_speculative_tokens
:
int
,
attn_backend
:
str
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
if
attn_backend
==
"TRITON_ATTN"
and
not
current_platform
.
is_rocm
():
...
...
tests/v1/tpu/test_basic.py
View file @
1e4ecca1
...
...
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
MAX_NUM_REQS
)
def
test_basic
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
model
:
str
,
max_tokens
:
int
,
tensor_parallel_size
:
int
,
...
...
@@ -55,9 +54,6 @@ def test_basic(
)
example_prompts
=
[
prompt
]
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
model
,
# Note: max_num_batched_tokens == 1024 is needed here to
...
...
@@ -82,7 +78,6 @@ def test_basic(
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
[
16
])
def
test_phi3
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
max_tokens
:
int
,
max_num_seqs
:
int
,
)
->
None
:
...
...
@@ -99,9 +94,6 @@ def test_phi3(
# test head dim = 96
model
=
"microsoft/Phi-3-mini-128k-instruct"
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
model
,
max_num_batched_tokens
=
256
,
max_num_seqs
=
max_num_seqs
)
as
vllm_model
:
...
...
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
)
def
test_gemma3_27b_with_text_input_and_tp
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
model
=
"google/gemma-3-27b-it"
max_tokens
=
16
...
...
@@ -140,9 +131,6 @@ def test_gemma3_27b_with_text_input_and_tp(
" but in rising every time we fall."
,
]
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
model
,
max_num_batched_tokens
=
256
,
...
...
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
)
def
test_w8a8_quantization
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
model
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
max_tokens
=
5
...
...
@@ -176,9 +163,6 @@ def test_w8a8_quantization(
)
example_prompts
=
[
prompt
]
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
model
,
max_num_batched_tokens
=
64
,
...
...
tests/v1/tpu/test_perf.py
View file @
1e4ecca1
...
...
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@
pytest
.
mark
.
parametrize
(
"params"
,
TEST_PARAMS
)
def
test_perf
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
params
:
TestParams
,
)
->
None
:
tokenizer
=
get_tokenizer
(
...
...
@@ -107,9 +106,6 @@ def test_perf(
)
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
sampling_params
=
SamplingParams
(
max_tokens
=
params
.
decode_len
,
temperature
=
1.0
,
min_p
=
0.0
)
...
...
tests/v1/tracing/test_tracing.py
View file @
1e4ecca1
...
...
@@ -82,7 +82,7 @@ def test_traces(
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
OTEL_EXPORTER_OTLP_TRACES_INSECURE
,
"true"
)
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
...
...
vllm/v1/worker/cpu_model_runner.py
View file @
1e4ecca1
...
...
@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
logger
.
info
(
"Warming up model for the compilation..."
)
# Only generate graph for the generic shape
with
_set_global_compilation_settings
(
self
.
vllm_config
):
self
.
_dummy_run
(
max
(
16
,
self
.
max_num_reqs
))
self
.
_dummy_run
(
min
(
max
(
16
,
self
.
max_num_reqs
),
self
.
scheduler_config
.
max_num_batched_tokens
,
)
)
logger
.
info
(
"Warming up done."
)
def
_init_device_properties
(
self
)
->
None
:
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment