Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1e4ecca1
Unverified
Commit
1e4ecca1
authored
Oct 07, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 07, 2025
Browse files
[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c0a7b89d
Changes
51
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
247 additions
and
324 deletions
+247
-324
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+1
-15
tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
...onnector/nixl_integration/run_tpu_disagg_accuracy_test.sh
+0
-3
tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
...1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
+0
-2
tests/v1/metrics/test_ray_metrics.py
tests/v1/metrics/test_ray_metrics.py
+0
-14
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+154
-171
tests/v1/sample/test_sampling_params_e2e.py
tests/v1/sample/test_sampling_params_e2e.py
+0
-12
tests/v1/spec_decode/test_max_len.py
tests/v1/spec_decode/test_max_len.py
+0
-1
tests/v1/tpu/test_basic.py
tests/v1/tpu/test_basic.py
+46
-62
tests/v1/tpu/test_perf.py
tests/v1/tpu/test_perf.py
+38
-42
tests/v1/tracing/test_tracing.py
tests/v1/tracing/test_tracing.py
+1
-1
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+7
-1
No files found.
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
1e4ecca1
...
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
...
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
,
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
,
)
)
def
test_structured_output
(
def
test_structured_output
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
sample_json_schema
:
dict
[
str
,
Any
],
unsupported_json_schema
:
dict
[
str
,
Any
],
unsupported_json_schema
:
dict
[
str
,
Any
],
sample_sql_ebnf
:
str
,
sample_sql_ebnf
:
str
,
...
@@ -115,8 +114,6 @@ def test_structured_output(
...
@@ -115,8 +114,6 @@ def test_structured_output(
model_name
:
str
,
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
],
speculative_config
:
dict
[
str
,
Any
],
):
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
current_platform
.
is_tpu
()
and
speculative_config
:
if
current_platform
.
is_tpu
()
and
speculative_config
:
pytest
.
skip
(
"TPU does not support speculative decoding"
)
pytest
.
skip
(
"TPU does not support speculative decoding"
)
...
@@ -620,15 +617,12 @@ Make the response as short as possible.
...
@@ -620,15 +617,12 @@ Make the response as short as possible.
],
],
)
)
def
test_structured_output_with_reasoning_matrices
(
def
test_structured_output_with_reasoning_matrices
(
monkeypatch
:
pytest
.
MonkeyPatch
,
backend
:
str
,
backend
:
str
,
tokenizer_mode
:
TokenizerMode
,
tokenizer_mode
:
TokenizerMode
,
reasoning_parser
:
str
,
reasoning_parser
:
str
,
model_name
:
str
,
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
]
|
None
,
speculative_config
:
dict
[
str
,
Any
]
|
None
,
):
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
current_platform
.
is_tpu
()
and
speculative_config
:
if
current_platform
.
is_tpu
()
and
speculative_config
:
pytest
.
skip
(
"TPU does not support speculative decoding"
)
pytest
.
skip
(
"TPU does not support speculative decoding"
)
...
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
...
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"model_name, tokenizer_mode"
,
PARAMS_MODELS_TOKENIZER_MODE
)
@
pytest
.
mark
.
parametrize
(
"model_name, tokenizer_mode"
,
PARAMS_MODELS_TOKENIZER_MODE
)
def
test_structured_output_auto_mode
(
def
test_structured_output_auto_mode
(
monkeypatch
:
pytest
.
MonkeyPatch
,
unsupported_json_schema
:
dict
[
str
,
Any
],
unsupported_json_schema
:
dict
[
str
,
Any
],
model_name
:
str
,
model_name
:
str
,
tokenizer_mode
:
str
,
tokenizer_mode
:
str
,
):
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
llm
=
LLM
(
model
=
model_name
,
model
=
model_name
,
max_model_len
=
1024
,
max_model_len
=
1024
,
...
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
...
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_guidance_no_additional_properties
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_guidance_no_additional_properties
():
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
max_model_len
=
1024
,
max_model_len
=
1024
,
...
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
...
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"guidance"
,
"xgrammar"
,
"outlines"
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"guidance"
,
"xgrammar"
,
"outlines"
])
def
test_structured_output_batched_with_non_structured_outputs_requests
(
def
test_structured_output_batched_with_non_structured_outputs_requests
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
sample_json_schema
:
dict
[
str
,
Any
],
backend
:
str
,
backend
:
str
,
):
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Don't use eager execution on TPUs because we want to test for no
# Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime
# recompilation at runtime
enforce_eager
=
bool
(
not
current_platform
.
is_tpu
())
enforce_eager
=
bool
(
not
current_platform
.
is_tpu
())
...
...
tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
View file @
1e4ecca1
...
@@ -53,7 +53,6 @@ cleanup() {
...
@@ -53,7 +53,6 @@ cleanup() {
launch_baseline
()
{
launch_baseline
()
{
BASELINE_BASE_CMD
=
"source
${
CONDA_PATH
}
/bin/activate
${
CONDA_ENV_NAME
}
;
BASELINE_BASE_CMD
=
"source
${
CONDA_PATH
}
/bin/activate
${
CONDA_ENV_NAME
}
;
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
@@ -73,7 +72,6 @@ launch_pd() {
...
@@ -73,7 +72,6 @@ launch_pd() {
UCX_TLS=tcp
\
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
...
@@ -93,7 +91,6 @@ launch_pd() {
...
@@ -93,7 +91,6 @@ launch_pd() {
UCX_TLS=tcp
\
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
View file @
1e4ecca1
...
@@ -55,7 +55,6 @@ launch_pd() {
...
@@ -55,7 +55,6 @@ launch_pd() {
UCX_TLS=tcp
\
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_HOST=
${
PREFILL_HOST
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
VLLM_NIXL_SIDE_CHANNEL_PORT=
${
PREFILL_NIXL_SIDE_PORT
}
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
...
@@ -75,7 +74,6 @@ launch_pd() {
...
@@ -75,7 +74,6 @@ launch_pd() {
UCX_TLS=tcp
\
UCX_TLS=tcp
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_LOGGING_LEVEL=DEBUG
\
VLLM_USE_V1=1
\
PJRT_DEVICE=TPU
\
PJRT_DEVICE=TPU
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_WORKER_MULTIPROC_METHOD=spawn
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve
$MODEL_NAME
\
...
...
tests/v1/metrics/test_ray_metrics.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
pytest
import
ray
import
ray
...
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
...
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusMetric
,
RayPrometheusStatLogger
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusMetric
,
RayPrometheusStatLogger
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v1_only
(
monkeypatch
):
"""
The change relies on V1 APIs, so set VLLM_USE_V1=1.
"""
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
MODELS
=
[
MODELS
=
[
"distilbert/distilgpt2"
,
"distilbert/distilgpt2"
,
]
]
...
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
...
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
@
ray
.
remote
(
num_gpus
=
1
)
@
ray
.
remote
(
num_gpus
=
1
)
class
EngineTestActor
:
class
EngineTestActor
:
async
def
run
(
self
):
async
def
run
(
self
):
# Set environment variable inside the Ray actor since environment
# variables from pytest fixtures don't propagate to Ray actors
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
engine_args
=
AsyncEngineArgs
(
engine_args
=
AsyncEngineArgs
(
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
enforce_eager
=
True
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
enforce_eager
=
True
)
)
...
...
tests/v1/sample/test_logprobs.py
View file @
1e4ecca1
...
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
...
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
batch_logprobs_composition
:
BatchLogprobsComposition
,
batch_logprobs_composition
:
BatchLogprobsComposition
,
temperature
:
float
,
temperature
:
float
,
example_prompts
:
list
[
str
],
example_prompts
:
list
[
str
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
)
->
None
:
"""Test V1 Engine logprobs & prompt logprobs
"""Test V1 Engine logprobs & prompt logprobs
...
@@ -308,12 +307,8 @@ def test_get_logprobs_and_prompt_logprobs(
...
@@ -308,12 +307,8 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter
temperature: "temperature" sampling parameter
example_prompts: example prompt fixture
example_prompts: example prompt fixture
"""
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
do_apc
=
vllm_model
.
llm
.
llm_engine
.
cache_config
.
enable_prefix_caching
do_apc
=
vllm_model
.
llm
.
llm_engine
.
cache_config
.
enable_prefix_caching
if
do_apc
and
(
if
do_apc
and
(
temperature
<
2.0
or
batch_logprobs_composition
!=
SAMPLE_PROMPT
):
temperature
<
2.0
or
batch_logprobs_composition
!=
SAMPLE_PROMPT
):
# Skip some test-cases to save time.
# Skip some test-cases to save time.
pytest
.
skip
()
pytest
.
skip
()
test_prompts
=
example_prompts
test_prompts
=
example_prompts
...
@@ -361,14 +356,11 @@ def test_get_logprobs_and_prompt_logprobs(
...
@@ -361,14 +356,11 @@ def test_get_logprobs_and_prompt_logprobs(
)
)
def
test_max_logprobs
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_max_logprobs
():
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation.
APC should not matter as this test checks basic request validation.
"""
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
runner
=
VllmRunner
(
runner
=
VllmRunner
(
"facebook/opt-125m"
,
"facebook/opt-125m"
,
max_logprobs
=
1
,
max_logprobs
=
1
,
...
@@ -386,15 +378,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
...
@@ -386,15 +378,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
runner
.
generate
([
"Hello world"
],
sampling_params
=
bad_sampling_params
)
runner
.
generate
([
"Hello world"
],
sampling_params
=
bad_sampling_params
)
def
test_none_logprobs
(
vllm_model
,
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_none_logprobs
(
vllm_model
,
example_prompts
):
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
Args:
Args:
vllm_model: vLLM model fixture
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
example_prompts: list of example prompts (test fixture)
"""
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
max_tokens
=
5
max_tokens
=
5
sampling_params_logprobs_none
=
SamplingParams
(
sampling_params_logprobs_none
=
SamplingParams
(
...
@@ -416,15 +406,13 @@ def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
...
@@ -416,15 +406,13 @@ def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
assert
results_logprobs_none
[
i
].
prompt_logprobs
is
None
assert
results_logprobs_none
[
i
].
prompt_logprobs
is
None
def
test_zero_logprobs
(
vllm_model
,
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_zero_logprobs
(
vllm_model
,
example_prompts
):
"""Engine should return sampled token and prompt token logprobs
"""Engine should return sampled token and prompt token logprobs
Args:
Args:
vllm_model: vLLM model fixture
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
example_prompts: list of example prompts (test fixture)
"""
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
max_tokens
=
5
max_tokens
=
5
sampling_params_logprobs_zero
=
SamplingParams
(
sampling_params_logprobs_zero
=
SamplingParams
(
...
@@ -450,14 +438,12 @@ def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
...
@@ -450,14 +438,12 @@ def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
assert
len
(
prompt_token_ids
)
==
len
(
prompt_logprobs
)
assert
len
(
prompt_token_ids
)
==
len
(
prompt_logprobs
)
def
test_all_logprobs
(
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_all_logprobs
(
example_prompts
):
"""Engine should return all vocabulary logprobs and prompt logprobs
"""Engine should return all vocabulary logprobs and prompt logprobs
Args:
Args:
example_prompts: list of example prompts (test fixture)
example_prompts: list of example prompts (test fixture)
"""
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
runner
=
VllmRunner
(
runner
=
VllmRunner
(
"facebook/opt-125m"
,
"facebook/opt-125m"
,
max_logprobs
=-
1
,
max_logprobs
=-
1
,
...
@@ -488,16 +474,13 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
...
@@ -488,16 +474,13 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
@
pytest
.
mark
.
parametrize
(
"logprobs_mode"
,
get_args
(
LogprobsMode
))
@
pytest
.
mark
.
parametrize
(
"logprobs_mode"
,
get_args
(
LogprobsMode
))
def
test_logprobs_mode
(
logprobs_mode
:
LogprobsMode
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_logprobs_mode
(
logprobs_mode
:
LogprobsMode
):
"""Test with LLM engine with different logprobs_mode.
"""Test with LLM engine with different logprobs_mode.
For logprobs, we should have non-positive values.
For logprobs, we should have non-positive values.
For logits, we should expect at least one positive values.
For logits, we should expect at least one positive values.
"""
"""
from
vllm
import
LLM
from
vllm
import
LLM
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
llm
=
LLM
(
"facebook/opt-125m"
,
"facebook/opt-125m"
,
max_logprobs
=
5
,
max_logprobs
=
5
,
...
...
tests/v1/sample/test_sampling_params_e2e.py
View file @
1e4ecca1
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
MODEL
=
"meta-llama/Llama-3.2-1B"
PROMPT
=
"Hello my name is Robert and I"
PROMPT
=
"Hello my name is Robert and I"
...
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
...
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
10000000
]))
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
10000000
]))
def
test_priority
(
llm
):
"""Check that we reject requests with priority."""
# Reject all allowed token ids
with
pytest
.
raises
(
ValueError
):
_
=
llm
.
generate
(
PROMPT
,
priority
=
[
1
])
def
test_seed
(
llm
):
def
test_seed
(
llm
):
"""Check that seed impacts randomness."""
"""Check that seed impacts randomness."""
...
...
tests/v1/spec_decode/test_max_len.py
View file @
1e4ecca1
...
@@ -38,7 +38,6 @@ def test_eagle_max_len(
...
@@ -38,7 +38,6 @@ def test_eagle_max_len(
monkeypatch
:
pytest
.
MonkeyPatch
,
num_speculative_tokens
:
int
,
attn_backend
:
str
monkeypatch
:
pytest
.
MonkeyPatch
,
num_speculative_tokens
:
int
,
attn_backend
:
str
):
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
if
attn_backend
==
"TRITON_ATTN"
and
not
current_platform
.
is_rocm
():
if
attn_backend
==
"TRITON_ATTN"
and
not
current_platform
.
is_rocm
():
...
...
tests/v1/tpu/test_basic.py
View file @
1e4ecca1
...
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
...
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
MAX_NUM_REQS
)
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
MAX_NUM_REQS
)
def
test_basic
(
def
test_basic
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
model
:
str
,
model
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
tensor_parallel_size
:
int
,
tensor_parallel_size
:
int
,
...
@@ -55,9 +54,6 @@ def test_basic(
...
@@ -55,9 +54,6 @@ def test_basic(
)
)
example_prompts
=
[
prompt
]
example_prompts
=
[
prompt
]
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
# Note: max_num_batched_tokens == 1024 is needed here to
# Note: max_num_batched_tokens == 1024 is needed here to
...
@@ -82,7 +78,6 @@ def test_basic(
...
@@ -82,7 +78,6 @@ def test_basic(
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
[
16
])
def
test_phi3
(
def
test_phi3
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
max_tokens
:
int
,
max_tokens
:
int
,
max_num_seqs
:
int
,
max_num_seqs
:
int
,
)
->
None
:
)
->
None
:
...
@@ -99,9 +94,6 @@ def test_phi3(
...
@@ -99,9 +94,6 @@ def test_phi3(
# test head dim = 96
# test head dim = 96
model
=
"microsoft/Phi-3-mini-128k-instruct"
model
=
"microsoft/Phi-3-mini-128k-instruct"
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
with
vllm_runner
(
model
,
max_num_batched_tokens
=
256
,
max_num_seqs
=
max_num_seqs
model
,
max_num_batched_tokens
=
256
,
max_num_seqs
=
max_num_seqs
)
as
vllm_model
:
)
as
vllm_model
:
...
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
...
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
)
)
def
test_gemma3_27b_with_text_input_and_tp
(
def
test_gemma3_27b_with_text_input_and_tp
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
)
->
None
:
model
=
"google/gemma-3-27b-it"
model
=
"google/gemma-3-27b-it"
max_tokens
=
16
max_tokens
=
16
...
@@ -140,9 +131,6 @@ def test_gemma3_27b_with_text_input_and_tp(
...
@@ -140,9 +131,6 @@ def test_gemma3_27b_with_text_input_and_tp(
" but in rising every time we fall."
,
" but in rising every time we fall."
,
]
]
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
max_num_batched_tokens
=
256
,
max_num_batched_tokens
=
256
,
...
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
...
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
)
)
def
test_w8a8_quantization
(
def
test_w8a8_quantization
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
)
->
None
:
model
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
model
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
max_tokens
=
5
max_tokens
=
5
...
@@ -176,9 +163,6 @@ def test_w8a8_quantization(
...
@@ -176,9 +163,6 @@ def test_w8a8_quantization(
)
)
example_prompts
=
[
prompt
]
example_prompts
=
[
prompt
]
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
max_num_batched_tokens
=
64
,
max_num_batched_tokens
=
64
,
...
...
tests/v1/tpu/test_perf.py
View file @
1e4ecca1
...
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
...
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@
pytest
.
mark
.
parametrize
(
"params"
,
TEST_PARAMS
)
@
pytest
.
mark
.
parametrize
(
"params"
,
TEST_PARAMS
)
def
test_perf
(
def
test_perf
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
params
:
TestParams
,
params
:
TestParams
,
)
->
None
:
)
->
None
:
tokenizer
=
get_tokenizer
(
tokenizer
=
get_tokenizer
(
...
@@ -107,9 +106,6 @@ def test_perf(
...
@@ -107,9 +106,6 @@ def test_perf(
)
)
)
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
max_tokens
=
params
.
decode_len
,
temperature
=
1.0
,
min_p
=
0.0
max_tokens
=
params
.
decode_len
,
temperature
=
1.0
,
min_p
=
0.0
)
)
...
...
tests/v1/tracing/test_tracing.py
View file @
1e4ecca1
...
@@ -82,7 +82,7 @@ def test_traces(
...
@@ -82,7 +82,7 @@ def test_traces(
):
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
OTEL_EXPORTER_OTLP_TRACES_INSECURE
,
"true"
)
m
.
setenv
(
OTEL_EXPORTER_OTLP_TRACES_INSECURE
,
"true"
)
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
temperature
=
0.01
,
top_p
=
0.1
,
top_p
=
0.1
,
...
...
vllm/v1/worker/cpu_model_runner.py
View file @
1e4ecca1
...
@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
...
@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
logger
.
info
(
"Warming up model for the compilation..."
)
logger
.
info
(
"Warming up model for the compilation..."
)
# Only generate graph for the generic shape
# Only generate graph for the generic shape
with
_set_global_compilation_settings
(
self
.
vllm_config
):
with
_set_global_compilation_settings
(
self
.
vllm_config
):
self
.
_dummy_run
(
max
(
16
,
self
.
max_num_reqs
))
self
.
_dummy_run
(
min
(
max
(
16
,
self
.
max_num_reqs
),
self
.
scheduler_config
.
max_num_batched_tokens
,
)
)
logger
.
info
(
"Warming up done."
)
logger
.
info
(
"Warming up done."
)
def
_init_device_properties
(
self
)
->
None
:
def
_init_device_properties
(
self
)
->
None
:
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment