Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1e4ecca1
Unverified
Commit
1e4ecca1
authored
Oct 07, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 07, 2025
Browse files
[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c0a7b89d
Changes
51
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
247 additions
and
324 deletions
+247
-324
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+1
-15
tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
...onnector/nixl_integration/run_tpu_disagg_accuracy_test.sh
+0
-3
tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
...1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
+0
-2
tests/v1/metrics/test_ray_metrics.py
tests/v1/metrics/test_ray_metrics.py
+0
-14
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+154
-171
tests/v1/sample/test_sampling_params_e2e.py
tests/v1/sample/test_sampling_params_e2e.py
+0
-12
tests/v1/spec_decode/test_max_len.py
tests/v1/spec_decode/test_max_len.py
+0
-1
tests/v1/tpu/test_basic.py
tests/v1/tpu/test_basic.py
+46
-62
tests/v1/tpu/test_perf.py
tests/v1/tpu/test_perf.py
+38
-42
tests/v1/tracing/test_tracing.py
tests/v1/tracing/test_tracing.py
+1
-1
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+7
-1
No files found.
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
1e4ecca1
...
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
...
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
,
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
,
)
)
def
test_structured_output
(
def
test_structured_output
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
sample_json_schema
:
dict
[
str
,
Any
],
unsupported_json_schema
:
dict
[
str
,
Any
],
unsupported_json_schema
:
dict
[
str
,
Any
],
sample_sql_ebnf
:
str
,
sample_sql_ebnf
:
str
,
...
@@ -115,8 +114,6 @@ def test_structured_output(
...
@@ -115,8 +114,6 @@ def test_structured_output(
model_name
:
str
,
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
],
speculative_config
:
dict
[
str
,
Any
],
):
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
current_platform
.
is_tpu
()
and
speculative_config
:
if
current_platform
.
is_tpu
()
and
speculative_config
:
pytest
.
skip
(
"TPU does not support speculative decoding"
)
pytest
.
skip
(
"TPU does not support speculative decoding"
)
...
@@ -620,15 +617,12 @@ Make the response as short as possible.
...
@@ -620,15 +617,12 @@ Make the response as short as possible.
],
],
)
)
def
test_structured_output_with_reasoning_matrices
(
def
test_structured_output_with_reasoning_matrices
(
monkeypatch
:
pytest
.
MonkeyPatch
,
backend
:
str
,
backend
:
str
,
tokenizer_mode
:
TokenizerMode
,
tokenizer_mode
:
TokenizerMode
,
reasoning_parser
:
str
,
reasoning_parser
:
str
,
model_name
:
str
,
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
]
|
None
,
speculative_config
:
dict
[
str
,
Any
]
|
None
,
):
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
current_platform
.
is_tpu
()
and
speculative_config
:
if
current_platform
.
is_tpu
()
and
speculative_config
:
pytest
.
skip
(
"TPU does not support speculative decoding"
)
pytest
.
skip
(
"TPU does not support speculative decoding"
)
...
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
...
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"model_name, tokenizer_mode"
,
PARAMS_MODELS_TOKENIZER_MODE
)
@
pytest
.
mark
.
parametrize
(
"model_name, tokenizer_mode"
,
PARAMS_MODELS_TOKENIZER_MODE
)
def
test_structured_output_auto_mode
(
def
test_structured_output_auto_mode
(
monkeypatch
:
pytest
.
MonkeyPatch
,
unsupported_json_schema
:
dict
[
str
,
Any
],
unsupported_json_schema
:
dict
[
str
,
Any
],
model_name
:
str
,
model_name
:
str
,
tokenizer_mode
:
str
,
tokenizer_mode
:
str
,
):
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
llm
=
LLM
(
model
=
model_name
,
model
=
model_name
,
max_model_len
=
1024
,
max_model_len
=
1024
,
...
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
...
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_guidance_no_additional_properties
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_guidance_no_additional_properties
():
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
max_model_len
=
1024
,
max_model_len
=
1024
,
...
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
...
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"guidance"
,
"xgrammar"
,
"outlines"
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"guidance"
,
"xgrammar"
,
"outlines"
])
def
test_structured_output_batched_with_non_structured_outputs_requests
(
def
test_structured_output_batched_with_non_structured_outputs_requests
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
sample_json_schema
:
dict
[
str
,
Any
],
backend
:
str
,
backend
:
str
,
):
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Don't use eager execution on TPUs because we want to test for no
# Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime
# recompilation at runtime
enforce_eager
=
bool
(
not
current_platform
.
is_tpu
())
enforce_eager
=
bool
(
not
current_platform
.
is_tpu
())
...
...
tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
View file @
1e4ecca1
...
@@ -53,7 +53,6 @@ cleanup() {
...
@@ -53,7 +53,6 @@ cleanup() {
launch_baseline
()
{
launch_baseline
()
{
BASELINE_BASE_CMD
=
"source
${
CONDA_PATH
}
/bin/activate
${
CONDA_ENV_NAME
}
;
BASELINE_BASE_CMD
=
"source
${
CONDA_PATH
}
/bin/activate
${
CONDA_ENV_NAME
}
;
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
@@ -73,7 +72,6 @@ launch_pd() {
...
@@ -73,7 +72,6 @@ launch_pd() {
UCX_TLS=tcp
\
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
...
@@ -93,7 +91,6 @@ launch_pd() {
...
@@ -93,7 +91,6 @@ launch_pd() {
UCX_TLS=tcp
\
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
View file @
1e4ecca1
...
@@ -55,7 +55,6 @@ launch_pd() {
...
@@ -55,7 +55,6 @@ launch_pd() {
UCX_TLS=tcp
\
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
...
@@ -75,7 +74,6 @@ launch_pd() {
...
@@ -75,7 +74,6 @@ launch_pd() {
UCX_TLS=tcp
\
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
tests/v1/metrics/test_ray_metrics.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
pytest
import
ray
import
ray
...
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
...
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusMetric
,
RayPrometheusStatLogger
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusMetric
,
RayPrometheusStatLogger
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v1_only
(
monkeypatch
):
"""
The change relies on V1 APIs, so set VLLM_USE_V1=1.
"""
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
MODELS
=
[
MODELS
=
[
"distilbert/distilgpt2"
,
"distilbert/distilgpt2"
,
]
]
...
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
...
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
@
ray
.
remote
(
num_gpus
=
1
)
@
ray
.
remote
(
num_gpus
=
1
)
class
EngineTestActor
:
class
EngineTestActor
:
async
def
run
(
self
):
async
def
run
(
self
):
# Set environment variable inside the Ray actor since environment
# variables from pytest fixtures don't propagate to Ray actors
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
engine_args
=
AsyncEngineArgs
(
engine_args
=
AsyncEngineArgs
(
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
enforce_eager
=
True
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
enforce_eager
=
True
)
)
...
...
tests/v1/sample/test_logprobs.py
View file @
1e4ecca1
...
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
...
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
batch_logprobs_composition
:
BatchLogprobsComposition
,
batch_logprobs_composition
:
BatchLogprobsComposition
,
temperature
:
float
,
temperature
:
float
,
example_prompts
:
list
[
str
],
example_prompts
:
list
[
str
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
)
->
None
:
"""Test V1 Engine logprobs & prompt logprobs
"""Test V1 Engine logprobs & prompt logprobs
...
@@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs(
...
@@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter
temperature: "temperature" sampling parameter
example_prompts: example prompt fixture
example_prompts: example prompt fixture
"""
"""
with
monkeypatch
.
context
()
as
m
:
do_apc
=
vllm_model
.
llm
.
llm_engine
.
cache_config
.
enable_prefix_caching
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
do_apc
and
(
temperature
<
2.0
or
batch_logprobs_composition
!=
SAMPLE_PROMPT
):
do_apc
=
vllm_model
.
llm
.
llm_engine
.
cache_config
.
enable_prefix_caching
# Skip some test-cases to save time.
if
do_apc
and
(
pytest
.
skip
()
temperature
<
2.0
or
batch_logprobs_composition
!=
SAMPLE_PROMPT
test_prompts
=
example_prompts
):
# Skip some test-cases to save time.
max_tokens
=
5
pytest
.
skip
()
hf_outputs
=
hf_model
.
generate_greedy
(
test_prompts
=
example_prompts
test_prompts
,
max_tokens
=
max_tokens
,
max_tokens
=
5
)
hf_outputs
=
hf_model
.
generate_greedy
(
hf_logprobs
=
hf_model
.
generate_greedy_logprobs
(
test_prompts
,
test_prompts
,
max_tokens
=
max_tokens
,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list
=
get_test_batch
(
batch_logprobs_composition
)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list
=
_repeat_logprob_config
(
test_prompts
,
logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params
=
[
SamplingParams
(
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
logprobs
=
num_lp
,
prompt_logprobs
=
num_plp
,
temperature
=
temperature
,
seed
=
1984
,
)
)
hf_logprobs
=
hf_model
.
generate_greedy_logprobs
(
for
num_lp
,
num_plp
in
logprob_prompt_logprob_list
test_prompts
,
]
for
_
in
range
(
2
if
do_apc
else
1
):
_run_and_validate
(
vllm_model
=
vllm_model
,
test_prompts
=
test_prompts
,
vllm_sampling_params
=
vllm_sampling_params
,
hf_logprobs
=
hf_logprobs
,
hf_outputs
=
hf_outputs
,
logprob_prompt_logprob_list
=
logprob_prompt_logprob_list
,
temperature
=
temperature
,
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
do_apc
=
do_apc
,
)
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list
=
get_test_batch
(
batch_logprobs_composition
)
# Ensure that each test prompt has a logprob config for testing
def
test_max_logprobs
():
logprob_prompt_logprob_list
=
_repeat_logprob_config
(
test_prompts
,
logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params
=
[
SamplingParams
(
max_tokens
=
max_tokens
,
logprobs
=
num_lp
,
prompt_logprobs
=
num_plp
,
temperature
=
temperature
,
seed
=
1984
,
)
for
num_lp
,
num_plp
in
logprob_prompt_logprob_list
]
for
_
in
range
(
2
if
do_apc
else
1
):
_run_and_validate
(
vllm_model
=
vllm_model
,
test_prompts
=
test_prompts
,
vllm_sampling_params
=
vllm_sampling_params
,
hf_logprobs
=
hf_logprobs
,
hf_outputs
=
hf_outputs
,
logprob_prompt_logprob_list
=
logprob_prompt_logprob_list
,
temperature
=
temperature
,
max_tokens
=
max_tokens
,
do_apc
=
do_apc
,
)
def
test_max_logprobs
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation.
APC should not matter as this test checks basic request validation.
"""
"""
with
monkeypatch
.
context
()
as
m
:
runner
=
VllmRunner
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
"facebook/opt-125m"
,
max_logprobs
=
1
,
runner
=
VllmRunner
(
enable_prefix_caching
=
False
,
"facebook/opt-125m"
,
# 2 other llms alive during whole session
max_logprobs
=
1
,
gpu_memory_utilization
=
0.15
,
enable_prefix_caching
=
False
,
max_model_len
=
256
,
# 2 other llms alive during whole session
)
gpu_memory_utilization
=
0.15
,
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
max_model_len
=
256
,
# should pass
)
runner
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
# should pass
runner
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
bad_sampling_params
=
SamplingParams
(
logprobs
=
2
)
bad_sampling_params
=
SamplingParams
(
logprobs
=
2
)
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
runner
.
generate
([
"Hello world"
],
sampling_params
=
bad_sampling_params
)
runner
.
generate
([
"Hello world"
],
sampling_params
=
bad_sampling_params
)
def
test_none_logprobs
(
vllm_model
,
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_none_logprobs
(
vllm_model
,
example_prompts
):
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
Args:
Args:
vllm_model: vLLM model fixture
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
example_prompts: list of example prompts (test fixture)
"""
"""
with
monkeypatch
.
context
()
as
m
:
max_tokens
=
5
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
max_tokens
=
5
sampling_params_logprobs_none
=
SamplingParams
(
sampling_params_logprobs_none
=
SamplingParams
(
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs
=
None
,
prompt_logprobs
=
None
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
results_logprobs_none
=
vllm_model
.
llm
.
generate
(
results_logprobs_none
=
vllm_model
.
llm
.
generate
(
example_prompts
,
example_prompts
,
sampling_params
=
sampling_params_logprobs_none
,
sampling_params
=
sampling_params_logprobs_none
,
)
)
for
i
in
range
(
len
(
results_logprobs_none
)):
for
i
in
range
(
len
(
results_logprobs_none
)):
# Check sample logprobs are None
# Check sample logprobs are None
assert
results_logprobs_none
[
i
].
outputs
[
0
].
logprobs
is
None
assert
results_logprobs_none
[
i
].
outputs
[
0
].
logprobs
is
None
assert
results_logprobs_none
[
i
].
outputs
[
0
].
cumulative_logprob
is
None
assert
results_logprobs_none
[
i
].
outputs
[
0
].
cumulative_logprob
is
None
# Check prompt logprobs are None
# Check prompt logprobs are None
assert
results_logprobs_none
[
i
].
prompt_logprobs
is
None
assert
results_logprobs_none
[
i
].
prompt_logprobs
is
None
def
test_zero_logprobs
(
vllm_model
,
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_zero_logprobs
(
vllm_model
,
example_prompts
):
"""Engine should return sampled token and prompt token logprobs
"""Engine should return sampled token and prompt token logprobs
Args:
Args:
vllm_model: vLLM model fixture
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
example_prompts: list of example prompts (test fixture)
"""
"""
with
monkeypatch
.
context
()
as
m
:
max_tokens
=
5
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
max_tokens
=
5
sampling_params_logprobs_zero
=
SamplingParams
(
sampling_params_logprobs_zero
=
SamplingParams
(
max_tokens
=
max_tokens
,
logprobs
=
0
,
prompt_logprobs
=
0
,
temperature
=
0.0
max_tokens
=
max_tokens
,
logprobs
=
0
,
prompt_logprobs
=
0
,
temperature
=
0.0
)
)
results_logprobs_zero
=
vllm_model
.
llm
.
generate
(
results_logprobs_zero
=
vllm_model
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_zero
example_prompts
,
sampling_params
=
sampling_params_logprobs_zero
)
)
for
i
in
range
(
len
(
results_logprobs_zero
)):
for
i
in
range
(
len
(
results_logprobs_zero
)):
# Check that there is one sample logprob dict for each
# Check that there is one sample logprob dict for each
# sample token
# sample token
logprobs
=
results_logprobs_zero
[
i
].
outputs
[
0
].
logprobs
logprobs
=
results_logprobs_zero
[
i
].
outputs
[
0
].
logprobs
prompt_logprobs
=
results_logprobs_zero
[
i
].
prompt_logprobs
prompt_logprobs
=
results_logprobs_zero
[
i
].
prompt_logprobs
sampled_token_ids
=
results_logprobs_zero
[
i
].
outputs
[
0
].
token_ids
sampled_token_ids
=
results_logprobs_zero
[
i
].
outputs
[
0
].
token_ids
prompt_token_ids
=
results_logprobs_zero
[
i
].
prompt_token_ids
prompt_token_ids
=
results_logprobs_zero
[
i
].
prompt_token_ids
assert
logprobs
is
not
None
assert
logprobs
is
not
None
assert
len
(
sampled_token_ids
)
==
len
(
logprobs
)
assert
len
(
sampled_token_ids
)
==
len
(
logprobs
)
assert
results_logprobs_zero
[
i
].
outputs
[
0
].
cumulative_logprob
is
not
None
assert
results_logprobs_zero
[
i
].
outputs
[
0
].
cumulative_logprob
is
not
None
# Check that there is one prompt logprob dict for each
# Check that there is one prompt logprob dict for each
# prompt token
# prompt token
assert
prompt_logprobs
is
not
None
assert
prompt_logprobs
is
not
None
assert
len
(
prompt_token_ids
)
==
len
(
prompt_logprobs
)
assert
len
(
prompt_token_ids
)
==
len
(
prompt_logprobs
)
def
test_all_logprobs
(
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_all_logprobs
(
example_prompts
):
"""Engine should return all vocabulary logprobs and prompt logprobs
"""Engine should return all vocabulary logprobs and prompt logprobs
Args:
Args:
example_prompts: list of example prompts (test fixture)
example_prompts: list of example prompts (test fixture)
"""
"""
with
monkeypatch
.
context
()
as
m
:
runner
=
VllmRunner
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
"facebook/opt-125m"
,
runner
=
VllmRunner
(
max_logprobs
=-
1
,
"facebook/opt-125m"
,
enable_prefix_caching
=
False
,
max_logprobs
=-
1
,
# 2 other llms alive during whole session
enable_prefix_caching
=
False
,
gpu_memory_utilization
=
0.15
,
# 2 other llms alive during whole session
max_model_len
=
256
,
gpu_memory_utilization
=
0.15
,
)
max_model_len
=
256
,
)
sampling_params_logprobs_all
=
SamplingParams
(
sampling_params_logprobs_all
=
SamplingParams
(
max_tokens
=
5
,
logprobs
=-
1
,
prompt_logprobs
=-
1
max_tokens
=
5
,
logprobs
=-
1
,
prompt_logprobs
=-
1
)
)
results_logprobs_all
=
runner
.
llm
.
generate
(
results_logprobs_all
=
runner
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_all
example_prompts
,
sampling_params
=
sampling_params_logprobs_all
)
)
vocab_size
=
runner
.
llm
.
llm_engine
.
get_model_config
().
get_vocab_size
()
vocab_size
=
runner
.
llm
.
llm_engine
.
get_model_config
().
get_vocab_size
()
for
i
in
range
(
len
(
results_logprobs_all
)):
for
i
in
range
(
len
(
results_logprobs_all
)):
logprobs
=
results_logprobs_all
[
i
].
outputs
[
0
].
logprobs
logprobs
=
results_logprobs_all
[
i
].
outputs
[
0
].
logprobs
prompt_logprobs
=
results_logprobs_all
[
i
].
prompt_logprobs
prompt_logprobs
=
results_logprobs_all
[
i
].
prompt_logprobs
assert
logprobs
is
not
None
assert
logprobs
is
not
None
for
logprob
in
logprobs
:
for
logprob
in
logprobs
:
assert
len
(
logprob
)
==
vocab_size
assert
len
(
logprob
)
==
vocab_size
assert
prompt_logprobs
is
not
None
assert
prompt_logprobs
is
not
None
assert
prompt_logprobs
[
0
]
is
None
assert
prompt_logprobs
[
0
]
is
None
for
prompt_logprob
in
prompt_logprobs
[
1
:]:
for
prompt_logprob
in
prompt_logprobs
[
1
:]:
assert
len
(
prompt_logprob
)
==
vocab_size
assert
len
(
prompt_logprob
)
==
vocab_size
@
pytest
.
mark
.
parametrize
(
"logprobs_mode"
,
get_args
(
LogprobsMode
))
@
pytest
.
mark
.
parametrize
(
"logprobs_mode"
,
get_args
(
LogprobsMode
))
def
test_logprobs_mode
(
logprobs_mode
:
LogprobsMode
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_logprobs_mode
(
logprobs_mode
:
LogprobsMode
):
"""Test with LLM engine with different logprobs_mode.
"""Test with LLM engine with different logprobs_mode.
For logprobs, we should have non-positive values.
For logprobs, we should have non-positive values.
For logits, we should expect at least one positive values.
For logits, we should expect at least one positive values.
"""
"""
from
vllm
import
LLM
from
vllm
import
LLM
with
monkeypatch
.
context
()
as
m
:
llm
=
LLM
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
"facebook/opt-125m"
,
max_logprobs
=
5
,
llm
=
LLM
(
enable_prefix_caching
=
False
,
"facebook/opt-125m"
,
# 2 other llms alive during whole session
max_logprobs
=
5
,
gpu_memory_utilization
=
0.05
,
enable_prefix_caching
=
False
,
max_model_len
=
16
,
# 2 other llms alive during whole session
logprobs_mode
=
logprobs_mode
,
gpu_memory_utilization
=
0.05
,
)
max_model_len
=
16
,
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
logprobs_mode
=
logprobs_mode
,
results
=
llm
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
)
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
total_token_with_logprobs
=
0
results
=
llm
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
positive_values
=
0
for
output
in
results
[
0
].
outputs
:
total_token_with_logprobs
=
0
for
logprobs
in
output
.
logprobs
:
positive_values
=
0
for
token_id
in
logprobs
:
for
output
in
results
[
0
].
outputs
:
logprob
=
logprobs
[
token_id
]
for
logprobs
in
output
.
logprobs
:
if
logprobs_mode
in
(
"raw_logprobs"
,
"processed_logprobs"
):
for
token_id
in
logprobs
:
assert
logprob
.
logprob
<=
0
logprob
=
logprobs
[
token_id
]
if
logprob
.
logprob
>
0
:
if
logprobs_mode
in
(
"raw_logprobs"
,
"processed_logprobs"
):
positive_values
=
positive_values
+
1
assert
logprob
.
logprob
<=
0
total_token_with_logprobs
=
total_token_with_logprobs
+
1
if
logprob
.
logprob
>
0
:
assert
total_token_with_logprobs
>=
len
(
results
[
0
].
outputs
)
positive_values
=
positive_values
+
1
if
logprobs_mode
in
(
"raw_logits"
,
"processed_logits"
):
total_token_with_logprobs
=
total_token_with_logprobs
+
1
assert
positive_values
>
0
assert
total_token_with_logprobs
>=
len
(
results
[
0
].
outputs
)
del
llm
if
logprobs_mode
in
(
"raw_logits"
,
"processed_logits"
):
assert
positive_values
>
0
del
llm
tests/v1/sample/test_sampling_params_e2e.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
MODEL
=
"meta-llama/Llama-3.2-1B"
PROMPT
=
"Hello my name is Robert and I"
PROMPT
=
"Hello my name is Robert and I"
...
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
...
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
10000000
]))
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
10000000
]))
def
test_priority
(
llm
):
"""Check that we reject requests with priority."""
# Reject all allowed token ids
with
pytest
.
raises
(
ValueError
):
_
=
llm
.
generate
(
PROMPT
,
priority
=
[
1
])
def
test_seed
(
llm
):
def
test_seed
(
llm
):
"""Check that seed impacts randomness."""
"""Check that seed impacts randomness."""
...
...
tests/v1/spec_decode/test_max_len.py
View file @
1e4ecca1
...
@@ -38,7 +38,6 @@ def test_eagle_max_len(
...
@@ -38,7 +38,6 @@ def test_eagle_max_len(
monkeypatch
:
pytest
.
MonkeyPatch
,
num_speculative_tokens
:
int
,
attn_backend
:
str
monkeypatch
:
pytest
.
MonkeyPatch
,
num_speculative_tokens
:
int
,
attn_backend
:
str
):
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
if
attn_backend
==
"TRITON_ATTN"
and
not
current_platform
.
is_rocm
():
if
attn_backend
==
"TRITON_ATTN"
and
not
current_platform
.
is_rocm
():
...
...
tests/v1/tpu/test_basic.py
View file @
1e4ecca1
...
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
...
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
MAX_NUM_REQS
)
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
MAX_NUM_REQS
)
def
test_basic
(
def
test_basic
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
model
:
str
,
model
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
tensor_parallel_size
:
int
,
tensor_parallel_size
:
int
,
...
@@ -55,23 +54,20 @@ def test_basic(
...
@@ -55,23 +54,20 @@ def test_basic(
)
)
example_prompts
=
[
prompt
]
example_prompts
=
[
prompt
]
with
monkeypatch
.
context
()
as
m
:
with
vllm_runner
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
model
,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens
=
1024
,
max_model_len
=
8192
,
gpu_memory_utilization
=
0.7
,
max_num_seqs
=
max_num_seqs
,
tensor_parallel_size
=
tensor_parallel_size
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
output
=
vllm_outputs
[
0
][
1
]
with
vllm_runner
(
assert
"1024"
in
output
or
"0, 1"
in
output
model
,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens
=
1024
,
max_model_len
=
8192
,
gpu_memory_utilization
=
0.7
,
max_num_seqs
=
max_num_seqs
,
tensor_parallel_size
=
tensor_parallel_size
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
output
=
vllm_outputs
[
0
][
1
]
assert
"1024"
in
output
or
"0, 1"
in
output
@
pytest
.
mark
.
skip
(
reason
=
"Temporarily disabled due to timeout"
)
@
pytest
.
mark
.
skip
(
reason
=
"Temporarily disabled due to timeout"
)
...
@@ -82,7 +78,6 @@ def test_basic(
...
@@ -82,7 +78,6 @@ def test_basic(
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
[
16
])
def
test_phi3
(
def
test_phi3
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
max_tokens
:
int
,
max_tokens
:
int
,
max_num_seqs
:
int
,
max_num_seqs
:
int
,
)
->
None
:
)
->
None
:
...
@@ -99,18 +94,15 @@ def test_phi3(
...
@@ -99,18 +94,15 @@ def test_phi3(
# test head dim = 96
# test head dim = 96
model
=
"microsoft/Phi-3-mini-128k-instruct"
model
=
"microsoft/Phi-3-mini-128k-instruct"
with
monkeypatch
.
context
()
as
m
:
with
vllm_runner
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
model
,
max_num_batched_tokens
=
256
,
max_num_seqs
=
max_num_seqs
)
as
vllm_model
:
with
vllm_runner
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
model
,
max_num_batched_tokens
=
256
,
max_num_seqs
=
max_num_seqs
# vllm_outputs is a list of tuples whose first element is the token id
)
as
vllm_model
:
# and the second element is the output (including the prompt).
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
for
output
,
answer
in
zip
(
vllm_outputs
,
answers
):
# vllm_outputs is a list of tuples whose first element is the token id
generated_text
=
output
[
1
]
# and the second element is the output (including the prompt).
assert
answer
in
generated_text
for
output
,
answer
in
zip
(
vllm_outputs
,
answers
):
generated_text
=
output
[
1
]
assert
answer
in
generated_text
TP_SIZE_8
=
8
TP_SIZE_8
=
8
...
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
...
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
)
)
def
test_gemma3_27b_with_text_input_and_tp
(
def
test_gemma3_27b_with_text_input_and_tp
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
)
->
None
:
model
=
"google/gemma-3-27b-it"
model
=
"google/gemma-3-27b-it"
max_tokens
=
16
max_tokens
=
16
...
@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
...
@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
" but in rising every time we fall."
,
" but in rising every time we fall."
,
]
]
with
monkeypatch
.
context
()
as
m
:
with
vllm_runner
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
model
,
max_num_batched_tokens
=
256
,
with
vllm_runner
(
max_num_seqs
=
max_num_seqs
,
model
,
tensor_parallel_size
=
tensor_parallel_size
,
max_num_batched_tokens
=
256
,
)
as
vllm_model
:
max_num_seqs
=
max_num_seqs
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
tensor_parallel_size
=
tensor_parallel_size
,
# vllm_outputs is a list of tuples whose first element is the token id
)
as
vllm_model
:
# and the second element is the output (including the prompt).
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
for
output
,
answer
in
zip
(
vllm_outputs
,
answers
):
# vllm_outputs is a list of tuples whose first element is the token id
generated_text
=
output
[
1
]
# and the second element is the output (including the prompt).
assert
answer
in
generated_text
for
output
,
answer
in
zip
(
vllm_outputs
,
answers
):
generated_text
=
output
[
1
]
assert
answer
in
generated_text
@
pytest
.
mark
.
skipif
(
@
pytest
.
mark
.
skipif
(
...
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
...
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
)
)
def
test_w8a8_quantization
(
def
test_w8a8_quantization
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
)
->
None
:
model
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
model
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
max_tokens
=
5
max_tokens
=
5
...
@@ -176,18 +163,15 @@ def test_w8a8_quantization(
...
@@ -176,18 +163,15 @@ def test_w8a8_quantization(
)
)
example_prompts
=
[
prompt
]
example_prompts
=
[
prompt
]
with
monkeypatch
.
context
()
as
m
:
with
vllm_runner
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
model
,
max_num_batched_tokens
=
64
,
with
vllm_runner
(
max_model_len
=
4096
,
model
,
gpu_memory_utilization
=
0.7
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
max_num_seqs
,
max_model_len
=
4096
,
tensor_parallel_size
=
tensor_parallel_size
,
gpu_memory_utilization
=
0.7
,
)
as
vllm_model
:
max_num_seqs
=
max_num_seqs
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tensor_parallel_size
=
tensor_parallel_size
,
output
=
vllm_outputs
[
0
][
1
]
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
"1024"
in
output
or
"0, 1"
in
output
output
=
vllm_outputs
[
0
][
1
]
assert
"1024"
in
output
or
"0, 1"
in
output
tests/v1/tpu/test_perf.py
View file @
1e4ecca1
...
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
...
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@
pytest
.
mark
.
parametrize
(
"params"
,
TEST_PARAMS
)
@
pytest
.
mark
.
parametrize
(
"params"
,
TEST_PARAMS
)
def
test_perf
(
def
test_perf
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
params
:
TestParams
,
params
:
TestParams
,
)
->
None
:
)
->
None
:
tokenizer
=
get_tokenizer
(
tokenizer
=
get_tokenizer
(
...
@@ -107,48 +106,45 @@ def test_perf(
...
@@ -107,48 +106,45 @@ def test_perf(
)
)
)
)
with
monkeypatch
.
context
()
as
m
:
sampling_params
=
SamplingParams
(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
max_tokens
=
params
.
decode_len
,
temperature
=
1.0
,
min_p
=
0.0
)
sampling_params
=
SamplingParams
(
with
vllm_runner
(
max_tokens
=
params
.
decode_len
,
temperature
=
1.0
,
min_p
=
0.0
params
.
model
,
max_num_batched_tokens
=
MAX_MODEL_LEN
,
max_model_len
=
MAX_MODEL_LEN
,
max_num_seqs
=
MAX_NUM_SEQS
,
gpu_memory_utilization
=
GPU_UTIL
,
enforce_eager
=
False
,
tensor_parallel_size
=
1
,
)
as
vllm_model
:
print
(
" -- Warmup / Compile"
)
for
i
in
range
(
NUM_WARMUPS
):
_
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
print
(
" -- Benchmarking... "
)
times
=
[]
for
i
in
range
(
NUM_RUNS
):
start_time
=
time
.
time
()
_
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
times
.
append
(
time
.
time
()
-
start_time
)
avg_time
=
sum
(
times
)
/
len
(
times
)
print
(
" -- avg_time = {}"
.
format
(
avg_time
))
print
(
" -- expected_avg_time = {} with err_tol = {}"
.
format
(
params
.
expected_avg_time
,
params
.
err_tol
)
)
)
diff
=
avg_time
-
params
.
expected_avg_time
with
vllm_runner
(
ok
=
diff
<
params
.
err_tol
params
.
model
,
if
diff
<
-
params
.
err_tol
:
max_num_batched_tokens
=
MAX_MODEL_LEN
,
max_model_len
=
MAX_MODEL_LEN
,
max_num_seqs
=
MAX_NUM_SEQS
,
gpu_memory_utilization
=
GPU_UTIL
,
enforce_eager
=
False
,
tensor_parallel_size
=
1
,
)
as
vllm_model
:
print
(
" -- Warmup / Compile"
)
for
i
in
range
(
NUM_WARMUPS
):
_
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
print
(
" -- Benchmarking... "
)
times
=
[]
for
i
in
range
(
NUM_RUNS
):
start_time
=
time
.
time
()
_
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
times
.
append
(
time
.
time
()
-
start_time
)
avg_time
=
sum
(
times
)
/
len
(
times
)
print
(
" -- avg_time = {}"
.
format
(
avg_time
))
print
(
print
(
"
-- expected_avg_time = {} with err_tol = {}"
.
format
(
"
!! WARNING !! Performance has improved by {}, "
params
.
expected_avg_time
,
params
.
err_tol
"it may be necessary to fine-tune the "
)
"expected_avg_time = {}"
.
format
(
-
diff
,
params
.
expected_avg_time
)
)
)
diff
=
avg_time
-
params
.
expected_avg_time
ok
=
diff
<
params
.
err_tol
assert
ok
,
" !! ERROR !! Regression detected"
if
diff
<
-
params
.
err_tol
:
print
(
" !! WARNING !! Performance has improved by {}, "
"it may be necessary to fine-tune the "
"expected_avg_time = {}"
.
format
(
-
diff
,
params
.
expected_avg_time
)
)
assert
ok
,
" !! ERROR !! Regression detected"
tests/v1/tracing/test_tracing.py
View file @
1e4ecca1
...
@@ -82,7 +82,7 @@ def test_traces(
...
@@ -82,7 +82,7 @@ def test_traces(
):
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
OTEL_EXPORTER_OTLP_TRACES_INSECURE
,
"true"
)
m
.
setenv
(
OTEL_EXPORTER_OTLP_TRACES_INSECURE
,
"true"
)
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
temperature
=
0.01
,
top_p
=
0.1
,
top_p
=
0.1
,
...
...
vllm/v1/worker/cpu_model_runner.py
View file @
1e4ecca1
...
@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
...
@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
logger
.
info
(
"Warming up model for the compilation..."
)
logger
.
info
(
"Warming up model for the compilation..."
)
# Only generate graph for the generic shape
# Only generate graph for the generic shape
with
_set_global_compilation_settings
(
self
.
vllm_config
):
with
_set_global_compilation_settings
(
self
.
vllm_config
):
self
.
_dummy_run
(
max
(
16
,
self
.
max_num_reqs
))
self
.
_dummy_run
(
min
(
max
(
16
,
self
.
max_num_reqs
),
self
.
scheduler_config
.
max_num_batched_tokens
,
)
)
logger
.
info
(
"Warming up done."
)
logger
.
info
(
"Warming up done."
)
def
_init_device_properties
(
self
)
->
None
:
def
_init_device_properties
(
self
)
->
None
:
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment