Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1e4ecca1
Unverified
Commit
1e4ecca1
authored
Oct 07, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 07, 2025
Browse files
[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c0a7b89d
Changes
51
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
247 additions
and
324 deletions
+247
-324
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+1
-15
tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
...onnector/nixl_integration/run_tpu_disagg_accuracy_test.sh
+0
-3
tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
...1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
+0
-2
tests/v1/metrics/test_ray_metrics.py
tests/v1/metrics/test_ray_metrics.py
+0
-14
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+154
-171
tests/v1/sample/test_sampling_params_e2e.py
tests/v1/sample/test_sampling_params_e2e.py
+0
-12
tests/v1/spec_decode/test_max_len.py
tests/v1/spec_decode/test_max_len.py
+0
-1
tests/v1/tpu/test_basic.py
tests/v1/tpu/test_basic.py
+46
-62
tests/v1/tpu/test_perf.py
tests/v1/tpu/test_perf.py
+38
-42
tests/v1/tracing/test_tracing.py
tests/v1/tracing/test_tracing.py
+1
-1
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+7
-1
No files found.
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
1e4ecca1
...
...
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
,
)
def
test_structured_output
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
unsupported_json_schema
:
dict
[
str
,
Any
],
sample_sql_ebnf
:
str
,
...
...
@@ -115,8 +114,6 @@ def test_structured_output(
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
],
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
current_platform
.
is_tpu
()
and
speculative_config
:
pytest
.
skip
(
"TPU does not support speculative decoding"
)
...
...
@@ -620,15 +617,12 @@ Make the response as short as possible.
],
)
def
test_structured_output_with_reasoning_matrices
(
monkeypatch
:
pytest
.
MonkeyPatch
,
backend
:
str
,
tokenizer_mode
:
TokenizerMode
,
reasoning_parser
:
str
,
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
]
|
None
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
current_platform
.
is_tpu
()
and
speculative_config
:
pytest
.
skip
(
"TPU does not support speculative decoding"
)
...
...
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"model_name, tokenizer_mode"
,
PARAMS_MODELS_TOKENIZER_MODE
)
def
test_structured_output_auto_mode
(
monkeypatch
:
pytest
.
MonkeyPatch
,
unsupported_json_schema
:
dict
[
str
,
Any
],
model_name
:
str
,
tokenizer_mode
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
...
...
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
@
pytest
.
mark
.
skip_global_cleanup
def
test_guidance_no_additional_properties
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
def
test_guidance_no_additional_properties
():
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
max_model_len
=
1024
,
...
...
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"guidance"
,
"xgrammar"
,
"outlines"
])
def
test_structured_output_batched_with_non_structured_outputs_requests
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
backend
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime
enforce_eager
=
bool
(
not
current_platform
.
is_tpu
())
...
...
tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
View file @
1e4ecca1
...
...
@@ -53,7 +53,6 @@ cleanup() {
launch_baseline
()
{
BASELINE_BASE_CMD
=
"source
${
CONDA_PATH
}
/bin/activate
${
CONDA_ENV_NAME
}
;
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
@@ -73,7 +72,6 @@ launch_pd() {
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
PJRT_DEVICE=TPU
\
...
...
@@ -93,7 +91,6 @@ launch_pd() {
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
View file @
1e4ecca1
...
...
@@ -55,7 +55,6 @@ launch_pd() {
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
PJRT_DEVICE=TPU
\
...
...
@@ -75,7 +74,6 @@ launch_pd() {
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
tests/v1/metrics/test_ray_metrics.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
ray
...
...
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusMetric
,
RayPrometheusStatLogger
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v1_only
(
monkeypatch
):
"""
The change relies on V1 APIs, so set VLLM_USE_V1=1.
"""
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
MODELS
=
[
"distilbert/distilgpt2"
,
]
...
...
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
@
ray
.
remote
(
num_gpus
=
1
)
class
EngineTestActor
:
async
def
run
(
self
):
# Set environment variable inside the Ray actor since environment
# variables from pytest fixtures don't propagate to Ray actors
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
engine_args
=
AsyncEngineArgs
(
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
enforce_eager
=
True
)
...
...
tests/v1/sample/test_logprobs.py
View file @
1e4ecca1
...
...
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
batch_logprobs_composition
:
BatchLogprobsComposition
,
temperature
:
float
,
example_prompts
:
list
[
str
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""Test V1 Engine logprobs & prompt logprobs
...
...
@@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter
example_prompts: example prompt fixture
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
do_apc
=
vllm_model
.
llm
.
llm_engine
.
cache_config
.
enable_prefix_caching
if
do_apc
and
(
temperature
<
2.0
or
batch_logprobs_composition
!=
SAMPLE_PROMPT
):
# Skip some test-cases to save time.
pytest
.
skip
()
test_prompts
=
example_prompts
max_tokens
=
5
hf_outputs
=
hf_model
.
generate_greedy
(
test_prompts
,
do_apc
=
vllm_model
.
llm
.
llm_engine
.
cache_config
.
enable_prefix_caching
if
do_apc
and
(
temperature
<
2.0
or
batch_logprobs_composition
!=
SAMPLE_PROMPT
):
# Skip some test-cases to save time.
pytest
.
skip
()
test_prompts
=
example_prompts
max_tokens
=
5
hf_outputs
=
hf_model
.
generate_greedy
(
test_prompts
,
max_tokens
=
max_tokens
,
)
hf_logprobs
=
hf_model
.
generate_greedy_logprobs
(
test_prompts
,
max_tokens
=
max_tokens
,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list
=
get_test_batch
(
batch_logprobs_composition
)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list
=
_repeat_logprob_config
(
test_prompts
,
logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params
=
[
SamplingParams
(
max_tokens
=
max_tokens
,
logprobs
=
num_lp
,
prompt_logprobs
=
num_plp
,
temperature
=
temperature
,
seed
=
1984
,
)
hf_logprobs
=
hf_model
.
generate_greedy_logprobs
(
test_prompts
,
for
num_lp
,
num_plp
in
logprob_prompt_logprob_list
]
for
_
in
range
(
2
if
do_apc
else
1
):
_run_and_validate
(
vllm_model
=
vllm_model
,
test_prompts
=
test_prompts
,
vllm_sampling_params
=
vllm_sampling_params
,
hf_logprobs
=
hf_logprobs
,
hf_outputs
=
hf_outputs
,
logprob_prompt_logprob_list
=
logprob_prompt_logprob_list
,
temperature
=
temperature
,
max_tokens
=
max_tokens
,
do_apc
=
do_apc
,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list
=
get_test_batch
(
batch_logprobs_composition
)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list
=
_repeat_logprob_config
(
test_prompts
,
logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params
=
[
SamplingParams
(
max_tokens
=
max_tokens
,
logprobs
=
num_lp
,
prompt_logprobs
=
num_plp
,
temperature
=
temperature
,
seed
=
1984
,
)
for
num_lp
,
num_plp
in
logprob_prompt_logprob_list
]
for
_
in
range
(
2
if
do_apc
else
1
):
_run_and_validate
(
vllm_model
=
vllm_model
,
test_prompts
=
test_prompts
,
vllm_sampling_params
=
vllm_sampling_params
,
hf_logprobs
=
hf_logprobs
,
hf_outputs
=
hf_outputs
,
logprob_prompt_logprob_list
=
logprob_prompt_logprob_list
,
temperature
=
temperature
,
max_tokens
=
max_tokens
,
do_apc
=
do_apc
,
)
def
test_max_logprobs
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_max_logprobs
():
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation.
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
runner
=
VllmRunner
(
"facebook/opt-125m"
,
max_logprobs
=
1
,
enable_prefix_caching
=
False
,
# 2 other llms alive during whole session
gpu_memory_utilization
=
0.15
,
max_model_len
=
256
,
)
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
# should pass
runner
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
runner
=
VllmRunner
(
"facebook/opt-125m"
,
max_logprobs
=
1
,
enable_prefix_caching
=
False
,
# 2 other llms alive during whole session
gpu_memory_utilization
=
0.15
,
max_model_len
=
256
,
)
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
# should pass
runner
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
bad_sampling_params
=
SamplingParams
(
logprobs
=
2
)
with
pytest
.
raises
(
ValueError
):
runner
.
generate
([
"Hello world"
],
sampling_params
=
bad_sampling_params
)
bad_sampling_params
=
SamplingParams
(
logprobs
=
2
)
with
pytest
.
raises
(
ValueError
):
runner
.
generate
([
"Hello world"
],
sampling_params
=
bad_sampling_params
)
def
test_none_logprobs
(
vllm_model
,
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_none_logprobs
(
vllm_model
,
example_prompts
):
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
Args:
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
max_tokens
=
5
max_tokens
=
5
sampling_params_logprobs_none
=
SamplingParams
(
max_tokens
=
max_tokens
,
logprobs
=
None
,
prompt_logprobs
=
None
,
temperature
=
0.0
,
)
results_logprobs_none
=
vllm_model
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_none
,
)
sampling_params_logprobs_none
=
SamplingParams
(
max_tokens
=
max_tokens
,
logprobs
=
None
,
prompt_logprobs
=
None
,
temperature
=
0.0
,
)
results_logprobs_none
=
vllm_model
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_none
,
)
for
i
in
range
(
len
(
results_logprobs_none
)):
# Check sample logprobs are None
assert
results_logprobs_none
[
i
].
outputs
[
0
].
logprobs
is
None
assert
results_logprobs_none
[
i
].
outputs
[
0
].
cumulative_logprob
is
None
# Check prompt logprobs are None
assert
results_logprobs_none
[
i
].
prompt_logprobs
is
None
for
i
in
range
(
len
(
results_logprobs_none
)):
# Check sample logprobs are None
assert
results_logprobs_none
[
i
].
outputs
[
0
].
logprobs
is
None
assert
results_logprobs_none
[
i
].
outputs
[
0
].
cumulative_logprob
is
None
# Check prompt logprobs are None
assert
results_logprobs_none
[
i
].
prompt_logprobs
is
None
def
test_zero_logprobs
(
vllm_model
,
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_zero_logprobs
(
vllm_model
,
example_prompts
):
"""Engine should return sampled token and prompt token logprobs
Args:
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
max_tokens
=
5
max_tokens
=
5
sampling_params_logprobs_zero
=
SamplingParams
(
max_tokens
=
max_tokens
,
logprobs
=
0
,
prompt_logprobs
=
0
,
temperature
=
0.0
)
results_logprobs_zero
=
vllm_model
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_zero
)
sampling_params_logprobs_zero
=
SamplingParams
(
max_tokens
=
max_tokens
,
logprobs
=
0
,
prompt_logprobs
=
0
,
temperature
=
0.0
)
results_logprobs_zero
=
vllm_model
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_zero
)
for
i
in
range
(
len
(
results_logprobs_zero
)):
# Check that there is one sample logprob dict for each
# sample token
logprobs
=
results_logprobs_zero
[
i
].
outputs
[
0
].
logprobs
prompt_logprobs
=
results_logprobs_zero
[
i
].
prompt_logprobs
sampled_token_ids
=
results_logprobs_zero
[
i
].
outputs
[
0
].
token_ids
prompt_token_ids
=
results_logprobs_zero
[
i
].
prompt_token_ids
assert
logprobs
is
not
None
assert
len
(
sampled_token_ids
)
==
len
(
logprobs
)
assert
results_logprobs_zero
[
i
].
outputs
[
0
].
cumulative_logprob
is
not
None
# Check that there is one prompt logprob dict for each
# prompt token
assert
prompt_logprobs
is
not
None
assert
len
(
prompt_token_ids
)
==
len
(
prompt_logprobs
)
def
test_all_logprobs
(
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
for
i
in
range
(
len
(
results_logprobs_zero
)):
# Check that there is one sample logprob dict for each
# sample token
logprobs
=
results_logprobs_zero
[
i
].
outputs
[
0
].
logprobs
prompt_logprobs
=
results_logprobs_zero
[
i
].
prompt_logprobs
sampled_token_ids
=
results_logprobs_zero
[
i
].
outputs
[
0
].
token_ids
prompt_token_ids
=
results_logprobs_zero
[
i
].
prompt_token_ids
assert
logprobs
is
not
None
assert
len
(
sampled_token_ids
)
==
len
(
logprobs
)
assert
results_logprobs_zero
[
i
].
outputs
[
0
].
cumulative_logprob
is
not
None
# Check that there is one prompt logprob dict for each
# prompt token
assert
prompt_logprobs
is
not
None
assert
len
(
prompt_token_ids
)
==
len
(
prompt_logprobs
)
def
test_all_logprobs
(
example_prompts
):
"""Engine should return all vocabulary logprobs and prompt logprobs
Args:
example_prompts: list of example prompts (test fixture)
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
runner
=
VllmRunner
(
"facebook/opt-125m"
,
max_logprobs
=-
1
,
enable_prefix_caching
=
False
,
# 2 other llms alive during whole session
gpu_memory_utilization
=
0.15
,
max_model_len
=
256
,
)
runner
=
VllmRunner
(
"facebook/opt-125m"
,
max_logprobs
=-
1
,
enable_prefix_caching
=
False
,
# 2 other llms alive during whole session
gpu_memory_utilization
=
0.15
,
max_model_len
=
256
,
)
sampling_params_logprobs_all
=
SamplingParams
(
max_tokens
=
5
,
logprobs
=-
1
,
prompt_logprobs
=-
1
)
results_logprobs_all
=
runner
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_all
)
vocab_size
=
runner
.
llm
.
llm_engine
.
get_model_config
().
get_vocab_size
()
sampling_params_logprobs_all
=
SamplingParams
(
max_tokens
=
5
,
logprobs
=-
1
,
prompt_logprobs
=-
1
)
results_logprobs_all
=
runner
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_all
)
vocab_size
=
runner
.
llm
.
llm_engine
.
get_model_config
().
get_vocab_size
()
for
i
in
range
(
len
(
results_logprobs_all
)):
logprobs
=
results_logprobs_all
[
i
].
outputs
[
0
].
logprobs
prompt_logprobs
=
results_logprobs_all
[
i
].
prompt_logprobs
assert
logprobs
is
not
None
for
logprob
in
logprobs
:
assert
len
(
logprob
)
==
vocab_size
assert
prompt_logprobs
is
not
None
assert
prompt_logprobs
[
0
]
is
None
for
prompt_logprob
in
prompt_logprobs
[
1
:]:
assert
len
(
prompt_logprob
)
==
vocab_size
for
i
in
range
(
len
(
results_logprobs_all
)):
logprobs
=
results_logprobs_all
[
i
].
outputs
[
0
].
logprobs
prompt_logprobs
=
results_logprobs_all
[
i
].
prompt_logprobs
assert
logprobs
is
not
None
for
logprob
in
logprobs
:
assert
len
(
logprob
)
==
vocab_size
assert
prompt_logprobs
is
not
None
assert
prompt_logprobs
[
0
]
is
None
for
prompt_logprob
in
prompt_logprobs
[
1
:]:
assert
len
(
prompt_logprob
)
==
vocab_size
@
pytest
.
mark
.
parametrize
(
"logprobs_mode"
,
get_args
(
LogprobsMode
))
def
test_logprobs_mode
(
logprobs_mode
:
LogprobsMode
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_logprobs_mode
(
logprobs_mode
:
LogprobsMode
):
"""Test with LLM engine with different logprobs_mode.
For logprobs, we should have non-positive values.
For logits, we should expect at least one positive values.
"""
from
vllm
import
LLM
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
"facebook/opt-125m"
,
max_logprobs
=
5
,
enable_prefix_caching
=
False
,
# 2 other llms alive during whole session
gpu_memory_utilization
=
0.05
,
max_model_len
=
16
,
logprobs_mode
=
logprobs_mode
,
)
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
results
=
llm
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
total_token_with_logprobs
=
0
positive_values
=
0
for
output
in
results
[
0
].
outputs
:
for
logprobs
in
output
.
logprobs
:
for
token_id
in
logprobs
:
logprob
=
logprobs
[
token_id
]
if
logprobs_mode
in
(
"raw_logprobs"
,
"processed_logprobs"
):
assert
logprob
.
logprob
<=
0
if
logprob
.
logprob
>
0
:
positive_values
=
positive_values
+
1
total_token_with_logprobs
=
total_token_with_logprobs
+
1
assert
total_token_with_logprobs
>=
len
(
results
[
0
].
outputs
)
if
logprobs_mode
in
(
"raw_logits"
,
"processed_logits"
):
assert
positive_values
>
0
del
llm
llm
=
LLM
(
"facebook/opt-125m"
,
max_logprobs
=
5
,
enable_prefix_caching
=
False
,
# 2 other llms alive during whole session
gpu_memory_utilization
=
0.05
,
max_model_len
=
16
,
logprobs_mode
=
logprobs_mode
,
)
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
results
=
llm
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
total_token_with_logprobs
=
0
positive_values
=
0
for
output
in
results
[
0
].
outputs
:
for
logprobs
in
output
.
logprobs
:
for
token_id
in
logprobs
:
logprob
=
logprobs
[
token_id
]
if
logprobs_mode
in
(
"raw_logprobs"
,
"processed_logprobs"
):
assert
logprob
.
logprob
<=
0
if
logprob
.
logprob
>
0
:
positive_values
=
positive_values
+
1
total_token_with_logprobs
=
total_token_with_logprobs
+
1
assert
total_token_with_logprobs
>=
len
(
results
[
0
].
outputs
)
if
logprobs_mode
in
(
"raw_logits"
,
"processed_logits"
):
assert
positive_values
>
0
del
llm
tests/v1/sample/test_sampling_params_e2e.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
PROMPT
=
"Hello my name is Robert and I"
...
...
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
10000000
]))
def
test_priority
(
llm
):
"""Check that we reject requests with priority."""
# Reject all allowed token ids
with
pytest
.
raises
(
ValueError
):
_
=
llm
.
generate
(
PROMPT
,
priority
=
[
1
])
def
test_seed
(
llm
):
"""Check that seed impacts randomness."""
...
...
tests/v1/spec_decode/test_max_len.py
View file @
1e4ecca1
...
...
@@ -38,7 +38,6 @@ def test_eagle_max_len(
monkeypatch
:
pytest
.
MonkeyPatch
,
num_speculative_tokens
:
int
,
attn_backend
:
str
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
if
attn_backend
==
"TRITON_ATTN"
and
not
current_platform
.
is_rocm
():
...
...
tests/v1/tpu/test_basic.py
View file @
1e4ecca1
...
...
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
MAX_NUM_REQS
)
def
test_basic
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
model
:
str
,
max_tokens
:
int
,
tensor_parallel_size
:
int
,
...
...
@@ -55,23 +54,20 @@ def test_basic(
)
example_prompts
=
[
prompt
]
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
model
,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens
=
1024
,
max_model_len
=
8192
,
gpu_memory_utilization
=
0.7
,
max_num_seqs
=
max_num_seqs
,
tensor_parallel_size
=
tensor_parallel_size
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
output
=
vllm_outputs
[
0
][
1
]
with
vllm_runner
(
model
,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens
=
1024
,
max_model_len
=
8192
,
gpu_memory_utilization
=
0.7
,
max_num_seqs
=
max_num_seqs
,
tensor_parallel_size
=
tensor_parallel_size
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
output
=
vllm_outputs
[
0
][
1
]
assert
"1024"
in
output
or
"0, 1"
in
output
assert
"1024"
in
output
or
"0, 1"
in
output
@
pytest
.
mark
.
skip
(
reason
=
"Temporarily disabled due to timeout"
)
...
...
@@ -82,7 +78,6 @@ def test_basic(
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
[
16
])
def
test_phi3
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
max_tokens
:
int
,
max_num_seqs
:
int
,
)
->
None
:
...
...
@@ -99,18 +94,15 @@ def test_phi3(
# test head dim = 96
model
=
"microsoft/Phi-3-mini-128k-instruct"
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
model
,
max_num_batched_tokens
=
256
,
max_num_seqs
=
max_num_seqs
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
# vllm_outputs is a list of tuples whose first element is the token id
# and the second element is the output (including the prompt).
for
output
,
answer
in
zip
(
vllm_outputs
,
answers
):
generated_text
=
output
[
1
]
assert
answer
in
generated_text
with
vllm_runner
(
model
,
max_num_batched_tokens
=
256
,
max_num_seqs
=
max_num_seqs
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
# vllm_outputs is a list of tuples whose first element is the token id
# and the second element is the output (including the prompt).
for
output
,
answer
in
zip
(
vllm_outputs
,
answers
):
generated_text
=
output
[
1
]
assert
answer
in
generated_text
TP_SIZE_8
=
8
...
...
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
)
def
test_gemma3_27b_with_text_input_and_tp
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
model
=
"google/gemma-3-27b-it"
max_tokens
=
16
...
...
@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
" but in rising every time we fall."
,
]
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
model
,
max_num_batched_tokens
=
256
,
max_num_seqs
=
max_num_seqs
,
tensor_parallel_size
=
tensor_parallel_size
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
# vllm_outputs is a list of tuples whose first element is the token id
# and the second element is the output (including the prompt).
for
output
,
answer
in
zip
(
vllm_outputs
,
answers
):
generated_text
=
output
[
1
]
assert
answer
in
generated_text
with
vllm_runner
(
model
,
max_num_batched_tokens
=
256
,
max_num_seqs
=
max_num_seqs
,
tensor_parallel_size
=
tensor_parallel_size
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
# vllm_outputs is a list of tuples whose first element is the token id
# and the second element is the output (including the prompt).
for
output
,
answer
in
zip
(
vllm_outputs
,
answers
):
generated_text
=
output
[
1
]
assert
answer
in
generated_text
@
pytest
.
mark
.
skipif
(
...
...
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
)
def
test_w8a8_quantization
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
model
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
max_tokens
=
5
...
...
@@ -176,18 +163,15 @@ def test_w8a8_quantization(
)
example_prompts
=
[
prompt
]
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
model
,
max_num_batched_tokens
=
64
,
max_model_len
=
4096
,
gpu_memory_utilization
=
0.7
,
max_num_seqs
=
max_num_seqs
,
tensor_parallel_size
=
tensor_parallel_size
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
output
=
vllm_outputs
[
0
][
1
]
assert
"1024"
in
output
or
"0, 1"
in
output
with
vllm_runner
(
model
,
max_num_batched_tokens
=
64
,
max_model_len
=
4096
,
gpu_memory_utilization
=
0.7
,
max_num_seqs
=
max_num_seqs
,
tensor_parallel_size
=
tensor_parallel_size
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
output
=
vllm_outputs
[
0
][
1
]
assert
"1024"
in
output
or
"0, 1"
in
output
tests/v1/tpu/test_perf.py
View file @
1e4ecca1
...
...
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@
pytest
.
mark
.
parametrize
(
"params"
,
TEST_PARAMS
)
def
test_perf
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
params
:
TestParams
,
)
->
None
:
tokenizer
=
get_tokenizer
(
...
...
@@ -107,48 +106,45 @@ def test_perf(
)
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
sampling_params
=
SamplingParams
(
max_tokens
=
params
.
decode_len
,
temperature
=
1.0
,
min_p
=
0.0
)
sampling_params
=
SamplingParams
(
max_tokens
=
params
.
decode_len
,
temperature
=
1.0
,
min_p
=
0.0
with
vllm_runner
(
params
.
model
,
max_num_batched_tokens
=
MAX_MODEL_LEN
,
max_model_len
=
MAX_MODEL_LEN
,
max_num_seqs
=
MAX_NUM_SEQS
,
gpu_memory_utilization
=
GPU_UTIL
,
enforce_eager
=
False
,
tensor_parallel_size
=
1
,
)
as
vllm_model
:
print
(
" -- Warmup / Compile"
)
for
i
in
range
(
NUM_WARMUPS
):
_
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
print
(
" -- Benchmarking... "
)
times
=
[]
for
i
in
range
(
NUM_RUNS
):
start_time
=
time
.
time
()
_
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
times
.
append
(
time
.
time
()
-
start_time
)
avg_time
=
sum
(
times
)
/
len
(
times
)
print
(
" -- avg_time = {}"
.
format
(
avg_time
))
print
(
" -- expected_avg_time = {} with err_tol = {}"
.
format
(
params
.
expected_avg_time
,
params
.
err_tol
)
)
with
vllm_runner
(
params
.
model
,
max_num_batched_tokens
=
MAX_MODEL_LEN
,
max_model_len
=
MAX_MODEL_LEN
,
max_num_seqs
=
MAX_NUM_SEQS
,
gpu_memory_utilization
=
GPU_UTIL
,
enforce_eager
=
False
,
tensor_parallel_size
=
1
,
)
as
vllm_model
:
print
(
" -- Warmup / Compile"
)
for
i
in
range
(
NUM_WARMUPS
):
_
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
print
(
" -- Benchmarking... "
)
times
=
[]
for
i
in
range
(
NUM_RUNS
):
start_time
=
time
.
time
()
_
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
times
.
append
(
time
.
time
()
-
start_time
)
avg_time
=
sum
(
times
)
/
len
(
times
)
print
(
" -- avg_time = {}"
.
format
(
avg_time
))
diff
=
avg_time
-
params
.
expected_avg_time
ok
=
diff
<
params
.
err_tol
if
diff
<
-
params
.
err_tol
:
print
(
"
-- expected_avg_time = {} with err_tol = {}"
.
format
(
params
.
expected_avg_time
,
params
.
err_tol
)
"
!! WARNING !! Performance has improved by {}, "
"it may be necessary to fine-tune the "
"expected_avg_time = {}"
.
format
(
-
diff
,
params
.
expected_avg_time
)
)
diff
=
avg_time
-
params
.
expected_avg_time
ok
=
diff
<
params
.
err_tol
if
diff
<
-
params
.
err_tol
:
print
(
" !! WARNING !! Performance has improved by {}, "
"it may be necessary to fine-tune the "
"expected_avg_time = {}"
.
format
(
-
diff
,
params
.
expected_avg_time
)
)
assert
ok
,
" !! ERROR !! Regression detected"
assert
ok
,
" !! ERROR !! Regression detected"
tests/v1/tracing/test_tracing.py
View file @
1e4ecca1
...
...
@@ -82,7 +82,7 @@ def test_traces(
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
OTEL_EXPORTER_OTLP_TRACES_INSECURE
,
"true"
)
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
...
...
vllm/v1/worker/cpu_model_runner.py
View file @
1e4ecca1
...
...
@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
logger
.
info
(
"Warming up model for the compilation..."
)
# Only generate graph for the generic shape
with
_set_global_compilation_settings
(
self
.
vllm_config
):
self
.
_dummy_run
(
max
(
16
,
self
.
max_num_reqs
))
self
.
_dummy_run
(
min
(
max
(
16
,
self
.
max_num_reqs
),
self
.
scheduler_config
.
max_num_batched_tokens
,
)
)
logger
.
info
(
"Warming up done."
)
def
_init_device_properties
(
self
)
->
None
:
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment