Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1e4ecca1
Unverified
Commit
1e4ecca1
authored
Oct 07, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 07, 2025
Browse files
[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c0a7b89d
Changes
51
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
125 additions
and
389 deletions
+125
-389
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+2
-1
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+2
-7
tests/basic_correctness/test_cumem.py
tests/basic_correctness/test_cumem.py
+48
-57
tests/compile/piecewise/test_full_cudagraph.py
tests/compile/piecewise/test_full_cudagraph.py
+0
-2
tests/compile/piecewise/test_simple.py
tests/compile/piecewise/test_simple.py
+0
-3
tests/compile/test_async_tp.py
tests/compile/test_async_tp.py
+1
-7
tests/compile/test_config.py
tests/compile/test_config.py
+1
-11
tests/compile/test_fusion_attn.py
tests/compile/test_fusion_attn.py
+0
-3
tests/config/test_mp_reducer.py
tests/config/test_mp_reducer.py
+1
-4
tests/detokenizer/test_stop_strings.py
tests/detokenizer/test_stop_strings.py
+2
-4
tests/distributed/test_context_parallel.py
tests/distributed/test_context_parallel.py
+8
-45
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+2
-7
tests/distributed/test_sequence_parallel.py
tests/distributed/test_sequence_parallel.py
+2
-44
tests/entrypoints/llm/test_accuracy.py
tests/entrypoints/llm/test_accuracy.py
+18
-34
tests/entrypoints/openai/correctness/test_lmeval.py
tests/entrypoints/openai/correctness/test_lmeval.py
+6
-15
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+1
-12
tests/entrypoints/openai/test_lora_adapters.py
tests/entrypoints/openai/test_lora_adapters.py
+1
-14
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+21
-94
tests/entrypoints/openai/test_prompt_validation.py
tests/entrypoints/openai/test_prompt_validation.py
+0
-5
tests/kernels/attention/test_attention_selector.py
tests/kernels/attention/test_attention_selector.py
+9
-20
No files found.
.buildkite/test-pipeline.yaml
View file @
1e4ecca1
...
@@ -296,6 +296,7 @@ steps:
...
@@ -296,6 +296,7 @@ steps:
-
tests/v1
-
tests/v1
commands
:
commands
:
# split the test to avoid interference
# split the test to avoid interference
-
pytest -v -s -m 'not cpu_test' v1/core
-
pytest -v -s v1/executor
-
pytest -v -s v1/executor
-
pytest -v -s v1/kv_offload
-
pytest -v -s v1/kv_offload
-
pytest -v -s v1/sample
-
pytest -v -s v1/sample
...
@@ -317,7 +318,7 @@ steps:
...
@@ -317,7 +318,7 @@ steps:
no_gpu
:
true
no_gpu
:
true
commands
:
commands
:
# split the test to avoid interference
# split the test to avoid interference
-
pytest -v -s v1/core
-
pytest -v -s
-m 'cpu_test'
v1/core
-
pytest -v -s v1/structured_output
-
pytest -v -s v1/structured_output
-
pytest -v -s v1/test_serial_utils.py
-
pytest -v -s v1/test_serial_utils.py
-
pytest -v -s -m 'cpu_test' v1/kv_connector/unit
-
pytest -v -s -m 'cpu_test' v1/kv_connector/unit
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
1e4ecca1
...
@@ -13,7 +13,7 @@ import pytest
...
@@ -13,7 +13,7 @@ import pytest
import
torch
import
torch
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.v1.engine.llm_engine
import
LLMEngine
as
LLMEngineV1
from
vllm.v1.engine.llm_engine
import
LLMEngine
from
..conftest
import
HfRunner
,
VllmRunner
from
..conftest
import
HfRunner
,
VllmRunner
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_outputs_equal
...
@@ -211,16 +211,11 @@ def test_models_distributed(
...
@@ -211,16 +211,11 @@ def test_models_distributed(
def
test_failed_model_execution
(
vllm_runner
,
monkeypatch
)
->
None
:
def
test_failed_model_execution
(
vllm_runner
,
monkeypatch
)
->
None
:
from
vllm.envs
import
VLLM_USE_V1
if
not
VLLM_USE_V1
:
pytest
.
skip
(
"Skipping V0 test, dump input not supported"
)
# Needed to mock an error in the same process
# Needed to mock an error in the same process
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
with
vllm_runner
(
"facebook/opt-125m"
,
enforce_eager
=
True
)
as
vllm_model
:
with
vllm_runner
(
"facebook/opt-125m"
,
enforce_eager
=
True
)
as
vllm_model
:
if
isinstance
(
vllm_model
.
llm
.
llm_engine
,
LLMEngine
V1
):
if
isinstance
(
vllm_model
.
llm
.
llm_engine
,
LLMEngine
):
v1_test_failed_model_execution
(
vllm_model
)
v1_test_failed_model_execution
(
vllm_model
)
...
...
tests/basic_correctness/test_cumem.py
View file @
1e4ecca1
...
@@ -117,18 +117,15 @@ def test_cumem_with_cudagraph():
...
@@ -117,18 +117,15 @@ def test_cumem_with_cudagraph():
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model
, use_v1
"
,
"model"
,
[
[
# sleep mode with safetensors
# sleep mode with safetensors
(
"meta-llama/Llama-3.2-1B"
,
True
),
"meta-llama/Llama-3.2-1B"
,
# sleep mode with pytorch checkpoint
# sleep mode with pytorch checkpoint
(
"facebook/opt-125m"
,
True
),
"facebook/opt-125m"
,
],
],
)
)
def
test_end_to_end
(
monkeypatch
:
pytest
.
MonkeyPatch
,
model
:
str
,
use_v1
:
bool
):
def
test_end_to_end
(
model
:
str
):
with
monkeypatch
.
context
()
as
m
:
assert
use_v1
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
free
,
total
=
torch
.
cuda
.
mem_get_info
()
free
,
total
=
torch
.
cuda
.
mem_get_info
()
used_bytes_baseline
=
total
-
free
# in case other process is running
used_bytes_baseline
=
total
-
free
# in case other process is running
llm
=
LLM
(
model
,
enable_sleep_mode
=
True
)
llm
=
LLM
(
model
,
enable_sleep_mode
=
True
)
...
@@ -151,10 +148,7 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
...
@@ -151,10 +148,7 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
# therefore high memory usage after `llm.sleep` is called is expected.
# therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1.
# in V1.
if
use_v1
:
assert
used_bytes
<
7
*
GiB_bytes
assert
used_bytes
<
7
*
GiB_bytes
else
:
assert
used_bytes
<
2
*
GiB_bytes
llm
.
wake_up
()
llm
.
wake_up
()
output2
=
llm
.
generate
(
prompt
,
sampling_params
)
output2
=
llm
.
generate
(
prompt
,
sampling_params
)
...
@@ -168,10 +162,7 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
...
@@ -168,10 +162,7 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
used_bytes
=
total
-
free_gpu_bytes_wake_up_w
-
used_bytes_baseline
used_bytes
=
total
-
free_gpu_bytes_wake_up_w
-
used_bytes_baseline
# should just reallocate memory for weights (1B model, ~2GiB weights)
# should just reallocate memory for weights (1B model, ~2GiB weights)
if
use_v1
:
assert
used_bytes
<
10
*
GiB_bytes
assert
used_bytes
<
10
*
GiB_bytes
else
:
assert
used_bytes
<
6
*
GiB_bytes
# now allocate kv cache memory
# now allocate kv cache memory
llm
.
wake_up
(
tags
=
[
"kv_cache"
])
llm
.
wake_up
(
tags
=
[
"kv_cache"
])
...
...
tests/compile/piecewise/test_full_cudagraph.py
View file @
1e4ecca1
...
@@ -66,7 +66,6 @@ def llm_pair(request):
...
@@ -66,7 +66,6 @@ def llm_pair(request):
pytest
.
skip
(
"Only Blackwell GPUs support Cutlass MLA"
)
pytest
.
skip
(
"Only Blackwell GPUs support Cutlass MLA"
)
env_vars
=
{
env_vars
=
{
"VLLM_USE_V1"
:
"1"
,
# Force native sampler to avoid potential nondeterminism in FlashInfer
# Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1.
# when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER"
:
"0"
,
"VLLM_USE_FLASHINFER_SAMPLER"
:
"0"
,
...
@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
...
@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
with
(
with
(
temporary_environ
(
temporary_environ
(
{
{
"VLLM_USE_V1"
:
"1"
,
"VLLM_ATTENTION_BACKEND"
:
"FLEX_ATTENTION"
,
"VLLM_ATTENTION_BACKEND"
:
"FLEX_ATTENTION"
,
# Flex_Attention is not supported with full cuda graph
# Flex_Attention is not supported with full cuda graph
}
}
...
...
tests/compile/piecewise/test_simple.py
View file @
1e4ecca1
...
@@ -18,7 +18,6 @@ from vllm.config import (
...
@@ -18,7 +18,6 @@ from vllm.config import (
VllmConfig
,
VllmConfig
,
set_current_vllm_config
,
set_current_vllm_config
,
)
)
from
vllm.envs
import
VLLM_USE_V1
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.utils
import
is_torch_equal_or_newer
from
vllm.utils
import
is_torch_equal_or_newer
...
@@ -127,7 +126,6 @@ def _run_simple_model(
...
@@ -127,7 +126,6 @@ def _run_simple_model(
@
pytest
.
mark
.
parametrize
(
"use_inductor"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_inductor"
,
[
True
,
False
])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_simple_piecewise_compile
(
use_inductor
):
def
test_simple_piecewise_compile
(
use_inductor
):
assert
VLLM_USE_V1
_run_simple_model
(
_run_simple_model
(
splitting_ops
=
[
"silly.attention"
],
splitting_ops
=
[
"silly.attention"
],
use_inductor_graph_partition
=
False
,
use_inductor_graph_partition
=
False
,
...
@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
...
@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"splitting_ops"
,
[[
"silly.attention"
],
[]])
@
pytest
.
mark
.
parametrize
(
"splitting_ops"
,
[[
"silly.attention"
],
[]])
def
test_simple_inductor_graph_partition
(
splitting_ops
):
def
test_simple_inductor_graph_partition
(
splitting_ops
):
assert
VLLM_USE_V1
if
not
is_torch_equal_or_newer
(
"2.9.0.dev"
):
if
not
is_torch_equal_or_newer
(
"2.9.0.dev"
):
pytest
.
skip
(
"inductor graph partition is only available in PyTorch 2.9+"
)
pytest
.
skip
(
"inductor graph partition is only available in PyTorch 2.9+"
)
...
...
tests/compile/test_async_tp.py
View file @
1e4ecca1
...
@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
...
@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
"pass_config"
:
{
"enable_async_tp"
:
async_tp_enabled
},
"pass_config"
:
{
"enable_async_tp"
:
async_tp_enabled
},
}
}
async_tp_env
=
tp_env
=
{
"VLLM_USE_V1"
:
"1"
,
}
async_tp_args
=
[
async_tp_args
=
[
*
common_args
,
*
common_args
,
"--tensor-parallel-size"
,
"--tensor-parallel-size"
,
...
@@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
...
@@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
"mp"
,
"mp"
,
]
]
compare_two_settings
(
compare_two_settings
(
model_id
,
async_tp_args
,
tp_args
,
method
=
"generate"
)
model_id
,
async_tp_args
,
tp_args
,
async_tp_env
,
tp_env
,
method
=
"generate"
)
tests/compile/test_config.py
View file @
1e4ecca1
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
vllm
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.counter
import
compilation_counter
from
vllm.config
import
CompilationConfig
,
CUDAGraphMode
,
VllmConfig
from
vllm.config
import
CompilationConfig
,
CUDAGraphMode
,
VllmConfig
from
vllm.utils
import
_is_torch_equal_or_newer
from
vllm.utils
import
_is_torch_equal_or_newer
...
@@ -16,15 +15,10 @@ def test_version():
...
@@ -16,15 +15,10 @@ def test_version():
assert
not
_is_torch_equal_or_newer
(
"2.7.1"
,
"2.8.0.dev"
)
assert
not
_is_torch_equal_or_newer
(
"2.7.1"
,
"2.8.0.dev"
)
def
test_use_cudagraphs_dynamic
(
monkeypatch
):
def
test_use_cudagraphs_dynamic
():
assert
vllm
.
envs
.
VLLM_USE_V1
vllm_config
=
VllmConfig
()
vllm_config
=
VllmConfig
()
assert
vllm_config
.
compilation_config
.
use_cudagraph
assert
vllm_config
.
compilation_config
.
use_cudagraph
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
vllm_config
=
VllmConfig
()
assert
not
vllm_config
.
compilation_config
.
use_cudagraph
def
test_custom_op
():
def
test_custom_op
():
# proper syntax
# proper syntax
...
@@ -41,8 +35,6 @@ def test_custom_op():
...
@@ -41,8 +35,6 @@ def test_custom_op():
# may be influenced by other tests.
# may be influenced by other tests.
@
pytest
.
mark
.
parametrize
(
"val"
,
[
"1"
])
@
pytest
.
mark
.
parametrize
(
"val"
,
[
"1"
])
def
test_VLLM_DISABLE_COMPILE_CACHE
(
vllm_runner
,
monkeypatch
,
val
):
def
test_VLLM_DISABLE_COMPILE_CACHE
(
vllm_runner
,
monkeypatch
,
val
):
assert
vllm
.
envs
.
VLLM_USE_V1
# Disable multiprocessing so that the counter is in the same process
# Disable multiprocessing so that the counter is in the same process
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
monkeypatch
.
setenv
(
"VLLM_DISABLE_COMPILE_CACHE"
,
val
)
monkeypatch
.
setenv
(
"VLLM_DISABLE_COMPILE_CACHE"
,
val
)
...
@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
...
@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
@
pytest
.
mark
.
forked
@
pytest
.
mark
.
forked
@
pytest
.
mark
.
parametrize
(
"enabled"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"enabled"
,
[
True
,
False
])
def
test_use_cudagraphs
(
vllm_runner
,
monkeypatch
,
enabled
):
def
test_use_cudagraphs
(
vllm_runner
,
monkeypatch
,
enabled
):
assert
vllm
.
envs
.
VLLM_USE_V1
# Disable multiprocessing so that the counter is in the same process
# Disable multiprocessing so that the counter is in the same process
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
...
...
tests/compile/test_fusion_attn.py
View file @
1e4ecca1
...
@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
...
@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
model_class
:
type
[
AttentionQuantPatternModel
],
model_class
:
type
[
AttentionQuantPatternModel
],
backend
:
_Backend
,
backend
:
_Backend
,
use_inductor_graph_partition
:
bool
,
use_inductor_graph_partition
:
bool
,
monkeypatch
,
dist_init
,
dist_init
,
caplog_vllm
,
caplog_vllm
,
):
):
...
@@ -312,8 +311,6 @@ def test_attention_quant_pattern(
...
@@ -312,8 +311,6 @@ def test_attention_quant_pattern(
if
use_inductor_graph_partition
and
not
is_torch_equal_or_newer
(
"2.9.0.dev"
):
if
use_inductor_graph_partition
and
not
is_torch_equal_or_newer
(
"2.9.0.dev"
):
pytest
.
skip
(
"inductor graph partition is only available in PyTorch 2.9+"
)
pytest
.
skip
(
"inductor graph partition is only available in PyTorch 2.9+"
)
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
device
=
torch
.
device
(
"cuda:0"
)
device
=
torch
.
device
(
"cuda:0"
)
torch
.
manual_seed
(
42
)
torch
.
manual_seed
(
42
)
...
...
tests/config/test_mp_reducer.py
View file @
1e4ecca1
...
@@ -8,16 +8,13 @@ from vllm.engine.arg_utils import AsyncEngineArgs
...
@@ -8,16 +8,13 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.async_llm
import
AsyncLLM
def
test_mp_reducer
(
monkeypatch
):
def
test_mp_reducer
():
"""
"""
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
without transformers_modules. This is a regression test for
without transformers_modules. This is a regression test for
https://github.com/vllm-project/vllm/pull/18640.
https://github.com/vllm-project/vllm/pull/18640.
"""
"""
# Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Ensure transformers_modules is not in sys.modules
# Ensure transformers_modules is not in sys.modules
if
"transformers_modules"
in
sys
.
modules
:
if
"transformers_modules"
in
sys
.
modules
:
del
sys
.
modules
[
"transformers_modules"
]
del
sys
.
modules
[
"transformers_modules"
]
...
...
tests/detokenizer/test_stop_strings.py
View file @
1e4ecca1
...
@@ -5,7 +5,7 @@ from typing import Any, Optional
...
@@ -5,7 +5,7 @@ from typing import Any, Optional
import
pytest
import
pytest
from
vllm
import
LLM
,
SamplingParams
,
envs
from
vllm
import
LLM
,
SamplingParams
MODEL
=
"meta-llama/llama-2-7b-hf"
MODEL
=
"meta-llama/llama-2-7b-hf"
MAX_TOKENS
=
200
MAX_TOKENS
=
200
...
@@ -111,9 +111,7 @@ def _stop_token_id(llm):
...
@@ -111,9 +111,7 @@ def _stop_token_id(llm):
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_strings
():
def
test_stop_strings
():
# If V0, must set enforce_eager=False since we use
llm
=
LLM
(
MODEL
,
enforce_eager
=
True
)
# async output processing below.
llm
=
LLM
(
MODEL
,
enforce_eager
=
envs
.
VLLM_USE_V1
)
_stop_basic
(
llm
)
_stop_basic
(
llm
)
_stop_multi_tokens
(
llm
)
_stop_multi_tokens
(
llm
)
...
...
tests/distributed/test_context_parallel.py
View file @
1e4ecca1
...
@@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple):
...
@@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple):
@
dataclass
@
dataclass
class
CPTestSettings
:
class
CPTestSettings
:
parallel_setups
:
list
[
ParallelSetup
]
parallel_setups
:
list
[
ParallelSetup
]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends
:
list
[
str
]
distributed_backends
:
list
[
str
]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions
:
list
[
str
]
runner
:
RunnerOption
runner
:
RunnerOption
test_options
:
CPTestOptions
test_options
:
CPTestOptions
def
__post_init__
(
self
):
if
len
(
self
.
distributed_backends
)
!=
len
(
self
.
vllm_major_versions
):
raise
ValueError
(
f
"Length mismatch: distributed_backends "
f
"(
{
len
(
self
.
distributed_backends
)
}
) != "
f
"vllm_major_versions (
{
len
(
self
.
vllm_major_versions
)
}
)"
)
@
staticmethod
@
staticmethod
def
detailed
(
def
detailed
(
*
,
*
,
...
@@ -87,7 +73,6 @@ class CPTestSettings:
...
@@ -87,7 +73,6 @@ class CPTestSettings:
return
CPTestSettings
(
return
CPTestSettings
(
parallel_setups
=
parallel_setups
,
parallel_setups
=
parallel_setups
,
distributed_backends
=
[
"mp"
],
distributed_backends
=
[
"mp"
],
vllm_major_versions
=
[
"1"
],
runner
=
runner
,
runner
=
runner
,
test_options
=
CPTestOptions
(
test_options
=
CPTestOptions
(
multi_node_only
=
multi_node_only
,
load_format
=
load_format
multi_node_only
=
multi_node_only
,
load_format
=
load_format
...
@@ -98,14 +83,11 @@ class CPTestSettings:
...
@@ -98,14 +83,11 @@ class CPTestSettings:
opts
=
self
.
test_options
opts
=
self
.
test_options
for
parallel_setup
in
self
.
parallel_setups
:
for
parallel_setup
in
self
.
parallel_setups
:
for
backend
,
vllm_major_version
in
zip
(
for
backend
in
self
.
distributed_backends
:
self
.
distributed_backends
,
self
.
vllm_major_versions
):
yield
(
yield
(
model_id
,
model_id
,
parallel_setup
,
parallel_setup
,
backend
,
backend
,
vllm_major_version
,
self
.
runner
,
self
.
runner
,
opts
,
opts
,
)
)
...
@@ -115,7 +97,6 @@ def _compare_cp_with_tp(
...
@@ -115,7 +97,6 @@ def _compare_cp_with_tp(
model_id
:
str
,
model_id
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
runner
:
RunnerOption
,
runner
:
RunnerOption
,
test_options
:
CPTestOptions
,
test_options
:
CPTestOptions
,
num_gpus_available
:
int
,
num_gpus_available
:
int
,
...
@@ -191,10 +172,6 @@ def _compare_cp_with_tp(
...
@@ -191,10 +172,6 @@ def _compare_cp_with_tp(
if
hf_overrides
:
if
hf_overrides
:
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)])
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)])
cp_env
=
tp_env
=
{
"VLLM_USE_V1"
:
vllm_major_version
,
# Note(hc): DCP only support V1 engine only
}
cp_args
=
[
cp_args
=
[
*
common_args
,
*
common_args
,
"--tensor-parallel-size"
,
"--tensor-parallel-size"
,
...
@@ -217,24 +194,13 @@ def _compare_cp_with_tp(
...
@@ -217,24 +194,13 @@ def _compare_cp_with_tp(
distributed_backend
,
distributed_backend
,
]
]
try
:
compare_two_settings
(
compare_two_settings
(
model_id
,
model_id
,
cp_args
,
cp_args
,
tp_args
,
tp_args
,
cp_env
,
tp_env
,
method
=
method
,
method
=
method
,
max_wait_seconds
=
720
,
max_wait_seconds
=
720
,
)
)
except
Exception
:
testing_ray_compiled_graph
=
cp_env
is
not
None
if
testing_ray_compiled_graph
and
vllm_major_version
==
"0"
:
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger
.
exception
(
"Ray Compiled Graph tests failed"
)
else
:
raise
CP_TEXT_GENERATION_MODELS
=
{
CP_TEXT_GENERATION_MODELS
=
{
...
@@ -257,7 +223,6 @@ CP_TEST_MODELS = [
...
@@ -257,7 +223,6 @@ CP_TEST_MODELS = [
"model_id"
,
"model_id"
,
"parallel_setup"
,
"parallel_setup"
,
"distributed_backend"
,
"distributed_backend"
,
"vllm_major_version"
,
"runner"
,
"runner"
,
"test_options"
,
"test_options"
,
),
),
...
@@ -274,7 +239,6 @@ def test_cp_generation(
...
@@ -274,7 +239,6 @@ def test_cp_generation(
model_id
:
str
,
model_id
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
runner
:
RunnerOption
,
runner
:
RunnerOption
,
test_options
:
CPTestOptions
,
test_options
:
CPTestOptions
,
num_gpus_available
,
num_gpus_available
,
...
@@ -283,7 +247,6 @@ def test_cp_generation(
...
@@ -283,7 +247,6 @@ def test_cp_generation(
model_id
,
model_id
,
parallel_setup
,
parallel_setup
,
distributed_backend
,
distributed_backend
,
vllm_major_version
,
runner
,
runner
,
test_options
,
test_options
,
num_gpus_available
,
num_gpus_available
,
...
...
tests/distributed/test_pipeline_parallel.py
View file @
1e4ecca1
...
@@ -307,7 +307,6 @@ def _compare_tp(
...
@@ -307,7 +307,6 @@ def _compare_tp(
if
distributed_backend
==
"ray"
:
if
distributed_backend
==
"ray"
:
# For V1, test Ray Compiled Graph for all the tests
# For V1, test Ray Compiled Graph for all the tests
pp_env
=
{
pp_env
=
{
"VLLM_USE_V1"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
...
@@ -316,15 +315,11 @@ def _compare_tp(
...
@@ -316,15 +315,11 @@ def _compare_tp(
# terminate because of a Ray Compiled Graph issue.
# terminate because of a Ray Compiled Graph issue.
common_args
.
append
(
"--disable-frontend-multiprocessing"
)
common_args
.
append
(
"--disable-frontend-multiprocessing"
)
elif
distributed_backend
==
"mp"
:
elif
distributed_backend
==
"mp"
:
pp_env
=
{
pp_env
=
None
"VLLM_USE_V1"
:
"1"
,
}
else
:
else
:
pp_env
=
None
pp_env
=
None
tp_env
=
{
tp_env
=
None
"VLLM_USE_V1"
:
"1"
,
}
pp_args
=
[
pp_args
=
[
*
common_args
,
*
common_args
,
...
...
tests/distributed/test_sequence_parallel.py
View file @
1e4ecca1
...
@@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple):
...
@@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple):
@
dataclass
@
dataclass
class
SPTestSettings
:
class
SPTestSettings
:
parallel_setups
:
list
[
ParallelSetup
]
parallel_setups
:
list
[
ParallelSetup
]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends
:
list
[
str
]
distributed_backends
:
list
[
str
]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions
:
list
[
str
]
runner
:
RunnerOption
runner
:
RunnerOption
test_options
:
SPTestOptions
test_options
:
SPTestOptions
def
__post_init__
(
self
):
if
len
(
self
.
distributed_backends
)
!=
len
(
self
.
vllm_major_versions
):
raise
ValueError
(
f
"Length mismatch: distributed_backends "
f
"(
{
len
(
self
.
distributed_backends
)
}
) != "
f
"vllm_major_versions (
{
len
(
self
.
vllm_major_versions
)
}
)"
)
@
staticmethod
@
staticmethod
def
detailed
(
def
detailed
(
*
,
*
,
...
@@ -85,7 +71,6 @@ class SPTestSettings:
...
@@ -85,7 +71,6 @@ class SPTestSettings:
return
SPTestSettings
(
return
SPTestSettings
(
parallel_setups
=
parallel_setups
,
parallel_setups
=
parallel_setups
,
distributed_backends
=
[
"mp"
,
"ray"
],
distributed_backends
=
[
"mp"
,
"ray"
],
vllm_major_versions
=
[
"1"
,
"1"
],
runner
=
runner
,
runner
=
runner
,
test_options
=
SPTestOptions
(
test_options
=
SPTestOptions
(
multi_node_only
=
multi_node_only
,
load_format
=
load_format
multi_node_only
=
multi_node_only
,
load_format
=
load_format
...
@@ -117,7 +102,6 @@ class SPTestSettings:
...
@@ -117,7 +102,6 @@ class SPTestSettings:
return
SPTestSettings
(
return
SPTestSettings
(
parallel_setups
=
parallel_setups
,
parallel_setups
=
parallel_setups
,
distributed_backends
=
[
"mp"
,
"ray"
],
distributed_backends
=
[
"mp"
,
"ray"
],
vllm_major_versions
=
[
"1"
,
"1"
],
runner
=
runner
,
runner
=
runner
,
test_options
=
SPTestOptions
(
test_options
=
SPTestOptions
(
multi_node_only
=
multi_node_only
,
load_format
=
load_format
multi_node_only
=
multi_node_only
,
load_format
=
load_format
...
@@ -147,7 +131,6 @@ class SPTestSettings:
...
@@ -147,7 +131,6 @@ class SPTestSettings:
return
SPTestSettings
(
return
SPTestSettings
(
parallel_setups
=
parallel_setups
,
parallel_setups
=
parallel_setups
,
distributed_backends
=
[
"mp"
,
"ray"
],
distributed_backends
=
[
"mp"
,
"ray"
],
vllm_major_versions
=
[
"1"
,
"1"
],
runner
=
runner
,
runner
=
runner
,
test_options
=
SPTestOptions
(
test_options
=
SPTestOptions
(
multi_node_only
=
multi_node_only
,
load_format
=
load_format
multi_node_only
=
multi_node_only
,
load_format
=
load_format
...
@@ -158,14 +141,11 @@ class SPTestSettings:
...
@@ -158,14 +141,11 @@ class SPTestSettings:
opts
=
self
.
test_options
opts
=
self
.
test_options
for
parallel_setup
in
self
.
parallel_setups
:
for
parallel_setup
in
self
.
parallel_setups
:
for
backend
,
vllm_major_version
in
zip
(
for
backend
in
self
.
distributed_backends
:
self
.
distributed_backends
,
self
.
vllm_major_versions
):
yield
(
yield
(
model_id
,
model_id
,
parallel_setup
,
parallel_setup
,
backend
,
backend
,
vllm_major_version
,
self
.
runner
,
self
.
runner
,
opts
,
opts
,
)
)
...
@@ -175,7 +155,6 @@ def _compare_sp(
...
@@ -175,7 +155,6 @@ def _compare_sp(
model_id
:
str
,
model_id
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
runner
:
RunnerOption
,
runner
:
RunnerOption
,
test_options
:
SPTestOptions
,
test_options
:
SPTestOptions
,
num_gpus_available
:
int
,
num_gpus_available
:
int
,
...
@@ -265,10 +244,6 @@ def _compare_sp(
...
@@ -265,10 +244,6 @@ def _compare_sp(
},
},
}
}
tp_sp_env
=
tp_env
=
{
"VLLM_USE_V1"
:
vllm_major_version
,
}
tp_sp_args
=
[
tp_sp_args
=
[
*
common_args
,
*
common_args
,
"--tensor-parallel-size"
,
"--tensor-parallel-size"
,
...
@@ -281,9 +256,6 @@ def _compare_sp(
...
@@ -281,9 +256,6 @@ def _compare_sp(
json
.
dumps
(
compilation_config
),
json
.
dumps
(
compilation_config
),
]
]
tp_env
=
{
"VLLM_USE_V1"
:
vllm_major_version
,
}
tp_args
=
[
tp_args
=
[
*
common_args
,
*
common_args
,
"--tensor-parallel-size"
,
"--tensor-parallel-size"
,
...
@@ -292,18 +264,7 @@ def _compare_sp(
...
@@ -292,18 +264,7 @@ def _compare_sp(
"mp"
,
"mp"
,
]
]
try
:
compare_two_settings
(
model_id
,
tp_sp_args
,
tp_args
,
method
=
method
)
compare_two_settings
(
model_id
,
tp_sp_args
,
tp_args
,
tp_sp_env
,
tp_env
,
method
=
method
)
except
Exception
:
testing_ray_compiled_graph
=
tp_sp_env
is
not
None
if
testing_ray_compiled_graph
and
vllm_major_version
==
"0"
:
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger
.
exception
(
"Ray Compiled Graph tests failed"
)
else
:
raise
SP_TEXT_GENERATION_MODELS
=
{
SP_TEXT_GENERATION_MODELS
=
{
...
@@ -325,7 +286,6 @@ SP_TEST_MODELS = [
...
@@ -325,7 +286,6 @@ SP_TEST_MODELS = [
"model_id"
,
"model_id"
,
"parallel_setup"
,
"parallel_setup"
,
"distributed_backend"
,
"distributed_backend"
,
"vllm_major_version"
,
"runner"
,
"runner"
,
"test_options"
,
"test_options"
,
),
),
...
@@ -341,7 +301,6 @@ def test_tp_sp_generation(
...
@@ -341,7 +301,6 @@ def test_tp_sp_generation(
model_id
:
str
,
model_id
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
runner
:
RunnerOption
,
runner
:
RunnerOption
,
test_options
:
SPTestOptions
,
test_options
:
SPTestOptions
,
num_gpus_available
,
num_gpus_available
,
...
@@ -350,7 +309,6 @@ def test_tp_sp_generation(
...
@@ -350,7 +309,6 @@ def test_tp_sp_generation(
model_id
,
model_id
,
parallel_setup
,
parallel_setup
,
distributed_backend
,
distributed_backend
,
vllm_major_version
,
runner
,
runner
,
test_options
,
test_options
,
num_gpus_available
,
num_gpus_available
,
...
...
tests/entrypoints/llm/test_accuracy.py
View file @
1e4ecca1
...
@@ -61,17 +61,10 @@ def run_test(model_name, more_args=None):
...
@@ -61,17 +61,10 @@ def run_test(model_name, more_args=None):
TPU_TP_TEST_STR
=
""
# "tensor_parallel_size=4"
TPU_TP_TEST_STR
=
""
# "tensor_parallel_size=4"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
and
not
current_platform
.
is_tpu
(),
reason
=
"V1 is currently only supported on CUDA and TPU"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODEL_NAMES
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODEL_NAMES
)
def
test_lm_eval_accuracy_v1_engine
(
model
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_lm_eval_accuracy_v1_engine
(
model
):
"""Run with the V1 Engine."""
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
more_args
=
None
more_args
=
None
if
current_platform
.
is_tpu
():
if
current_platform
.
is_tpu
():
# Limit compilation time for TPU V1
# Limit compilation time for TPU V1
...
@@ -85,19 +78,10 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
...
@@ -85,19 +78,10 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
run_test
(
model
,
more_args
)
run_test
(
model
,
more_args
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
and
not
current_platform
.
is_tpu
(),
reason
=
"V1 is currently only supported on CUDA and TPU"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
FP8_KV_MODEL_NAMES
)
@
pytest
.
mark
.
parametrize
(
"model"
,
FP8_KV_MODEL_NAMES
)
def
test_lm_eval_accuracy_v1_engine_fp8_kv_cache
(
def
test_lm_eval_accuracy_v1_engine_fp8_kv_cache
(
model
):
model
,
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Run with the V1 Engine."""
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
more_args
=
None
more_args
=
None
if
current_platform
.
is_tpu
():
if
current_platform
.
is_tpu
():
# Limit compilation time for TPU V1
# Limit compilation time for TPU V1
...
...
tests/entrypoints/openai/correctness/test_lmeval.py
View file @
1e4ecca1
...
@@ -10,7 +10,6 @@ AsyncLLMEngine are working correctly.
...
@@ -10,7 +10,6 @@ AsyncLLMEngine are working correctly.
"""
"""
import
lm_eval
import
lm_eval
import
pytest
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -67,17 +66,9 @@ def run_test(more_args):
...
@@ -67,17 +66,9 @@ def run_test(more_args):
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
@
pytest
.
mark
.
skipif
(
def
test_lm_eval_accuracy_v1_engine
():
not
current_platform
.
is_cuda
()
and
not
current_platform
.
is_tpu
()
and
not
current_platform
.
is_xpu
(),
reason
=
"V1 currently only supported on CUDA, XPU and TPU"
,
)
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Run with the V1 Engine."""
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
more_args
=
[]
more_args
=
[]
# Limit compilation time for V1
# Limit compilation time for V1
...
...
tests/entrypoints/openai/test_chat.py
View file @
1e4ecca1
...
@@ -21,18 +21,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
...
@@ -21,18 +21,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
monkeypatch_module
():
def
server
(
zephyr_lora_files
):
# noqa: F811
from
_pytest.monkeypatch
import
MonkeyPatch
mpatch
=
MonkeyPatch
()
yield
mpatch
mpatch
.
undo
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
monkeypatch_module
,
zephyr_lora_files
):
# noqa: F811
monkeypatch_module
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
args
=
[
args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
...
...
tests/entrypoints/openai/test_lora_adapters.py
View file @
1e4ecca1
...
@@ -37,21 +37,8 @@ BADREQUEST_CASES = [
...
@@ -37,21 +37,8 @@ BADREQUEST_CASES = [
]
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
monkeypatch_module
():
from
_pytest.monkeypatch
import
MonkeyPatch
mpatch
=
MonkeyPatch
()
yield
mpatch
mpatch
.
undo
()
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
True
])
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
True
])
def
server_with_lora_modules_json
(
request
,
monkeypatch_module
,
zephyr_lora_files
):
def
server_with_lora_modules_json
(
request
,
zephyr_lora_files
):
use_v1
=
request
.
param
assert
use_v1
monkeypatch_module
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Define the json format LoRA module configurations
# Define the json format LoRA module configurations
lora_module_1
=
{
lora_module_1
=
{
"name"
:
"zephyr-lora"
,
"name"
:
"zephyr-lora"
,
...
...
tests/entrypoints/openai/test_metrics.py
View file @
1e4ecca1
...
@@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
...
@@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
PREV_MINOR_VERSION
=
version
.
_prev_minor_version
()
PREV_MINOR_VERSION
=
version
.
_prev_minor_version
()
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
True
])
def
use_v1
(
request
):
# Module-scoped variant of run_with_both_engines
#
# Use this fixture to run a test with both v0 and v1, and
# also to conditionalize the test logic e.g.
#
# def test_metrics_exist(use_v1, server, client):
# ...
# expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
# for metric in expected:
# assert metric in response.text
#
# @skip_v1 wouldn't work here because this is a module-level
# fixture - per-function decorators would have no effect
yield
request
.
param
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
():
def
default_server_args
():
return
[
return
[
...
@@ -63,13 +45,11 @@ def default_server_args():
...
@@ -63,13 +45,11 @@ def default_server_args():
f
"--show-hidden-metrics-for-version=
{
PREV_MINOR_VERSION
}
"
,
f
"--show-hidden-metrics-for-version=
{
PREV_MINOR_VERSION
}
"
,
],
],
)
)
def
server
(
use_v1
,
default_server_args
,
request
):
def
server
(
default_server_args
,
request
):
if
request
.
param
:
if
request
.
param
:
default_server_args
.
append
(
request
.
param
)
default_server_args
.
append
(
request
.
param
)
env_dict
=
dict
(
VLLM_USE_V1
=
"1"
if
use_v1
else
"0"
)
with
RemoteOpenAIServer
(
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
MODEL_NAME
,
default_server_args
,
env_dict
=
env_dict
)
as
remote_server
:
yield
remote_server
yield
remote_server
...
@@ -129,7 +109,8 @@ EXPECTED_VALUES = {
...
@@ -129,7 +109,8 @@ EXPECTED_VALUES = {
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_metrics_counts
(
async
def
test_metrics_counts
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncClient
,
use_v1
:
bool
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncClient
,
):
):
for
_
in
range
(
_NUM_REQUESTS
):
for
_
in
range
(
_NUM_REQUESTS
):
# sending a request triggers the metrics to be logged.
# sending a request triggers the metrics to be logged.
...
@@ -145,7 +126,7 @@ async def test_metrics_counts(
...
@@ -145,7 +126,7 @@ async def test_metrics_counts(
# Loop over all expected metric_families
# Loop over all expected metric_families
for
metric_family
,
suffix_values_list
in
EXPECTED_VALUES
.
items
():
for
metric_family
,
suffix_values_list
in
EXPECTED_VALUES
.
items
():
if
(
use_v1
and
metric_family
not
in
EXPECTED_METRICS_V1
)
or
(
if
(
metric_family
not
in
EXPECTED_METRICS_V1
)
or
(
not
server
.
show_hidden_metrics
not
server
.
show_hidden_metrics
and
metric_family
in
HIDDEN_DEPRECATED_METRICS
and
metric_family
in
HIDDEN_DEPRECATED_METRICS
):
):
...
@@ -183,62 +164,6 @@ async def test_metrics_counts(
...
@@ -183,62 +164,6 @@ async def test_metrics_counts(
assert
found_metric
,
f
"Did not find
{
metric_family
}
in prom endpoint"
assert
found_metric
,
f
"Did not find
{
metric_family
}
in prom endpoint"
EXPECTED_METRICS
=
[
"vllm:num_requests_running"
,
"vllm:num_requests_waiting"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
"vllm:e2e_request_latency_seconds_sum"
,
"vllm:e2e_request_latency_seconds_bucket"
,
"vllm:e2e_request_latency_seconds_count"
,
"vllm:request_queue_time_seconds_sum"
,
"vllm:request_queue_time_seconds_bucket"
,
"vllm:request_queue_time_seconds_count"
,
"vllm:request_inference_time_seconds_sum"
,
"vllm:request_inference_time_seconds_bucket"
,
"vllm:request_inference_time_seconds_count"
,
"vllm:request_prefill_time_seconds_sum"
,
"vllm:request_prefill_time_seconds_bucket"
,
"vllm:request_prefill_time_seconds_count"
,
"vllm:request_decode_time_seconds_sum"
,
"vllm:request_decode_time_seconds_bucket"
,
"vllm:request_decode_time_seconds_count"
,
"vllm:request_prompt_tokens_sum"
,
"vllm:request_prompt_tokens_bucket"
,
"vllm:request_prompt_tokens_count"
,
"vllm:request_generation_tokens_sum"
,
"vllm:request_generation_tokens_bucket"
,
"vllm:request_generation_tokens_count"
,
"vllm:request_params_n_sum"
,
"vllm:request_params_n_bucket"
,
"vllm:request_params_n_count"
,
"vllm:request_params_max_tokens_sum"
,
"vllm:request_params_max_tokens_bucket"
,
"vllm:request_params_max_tokens_count"
,
"vllm:iteration_tokens_total"
,
"vllm:num_preemptions_total"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:request_success_total"
,
"vllm:cache_config_info"
,
# labels in cache_config_info
"block_size"
,
"cache_dtype"
,
"cpu_offload_gb"
,
"enable_prefix_caching"
,
"gpu_memory_utilization"
,
"num_cpu_blocks"
,
"num_gpu_blocks"
,
"num_gpu_blocks_override"
,
"sliding_window"
,
"swap_space_bytes"
,
]
EXPECTED_METRICS_V1
=
[
EXPECTED_METRICS_V1
=
[
"vllm:num_requests_running"
,
"vllm:num_requests_running"
,
"vllm:num_requests_waiting"
,
"vllm:num_requests_waiting"
,
...
@@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
...
@@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_metrics_exist
(
async
def
test_metrics_exist
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncClient
,
use_v1
:
bool
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncClient
,
):
):
# sending a request triggers the metrics to be logged.
# sending a request triggers the metrics to be logged.
await
client
.
completions
.
create
(
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
0.0
model
=
MODEL_NAME
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
0.0
,
)
)
response
=
requests
.
get
(
server
.
url_for
(
"metrics"
))
response
=
requests
.
get
(
server
.
url_for
(
"metrics"
))
assert
response
.
status_code
==
HTTPStatus
.
OK
assert
response
.
status_code
==
HTTPStatus
.
OK
for
metric
in
EXPECTED_METRICS_V1
if
use_v1
else
EXPECTED_METRICS
:
for
metric
in
EXPECTED_METRICS_V1
:
if
metric
in
HIDDEN_DEPRECATED_METRICS
and
not
server
.
show_hidden_metrics
:
if
metric
in
HIDDEN_DEPRECATED_METRICS
and
not
server
.
show_hidden_metrics
:
continue
continue
assert
metric
in
response
.
text
assert
metric
in
response
.
text
...
@@ -322,10 +251,11 @@ async def test_metrics_exist(
...
@@ -322,10 +251,11 @@ async def test_metrics_exist(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_abort_metrics_reset
(
async
def
test_abort_metrics_reset
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncClient
,
use_v1
:
bool
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncClient
,
):
):
running_requests
,
waiting_requests
,
kv_cache_usage
=
_get_running_metrics_from_api
(
running_requests
,
waiting_requests
,
kv_cache_usage
=
_get_running_metrics_from_api
(
server
,
use_v1
server
)
)
# Expect no running requests or kvcache usage
# Expect no running requests or kvcache usage
...
@@ -351,7 +281,7 @@ async def test_abort_metrics_reset(
...
@@ -351,7 +281,7 @@ async def test_abort_metrics_reset(
# Check that we have running requests
# Check that we have running requests
running_requests
,
waiting_requests
,
kv_cache_usage
=
_get_running_metrics_from_api
(
running_requests
,
waiting_requests
,
kv_cache_usage
=
_get_running_metrics_from_api
(
server
,
use_v1
server
)
)
# Expect running requests and kvcache usage
# Expect running requests and kvcache usage
...
@@ -371,7 +301,7 @@ async def test_abort_metrics_reset(
...
@@ -371,7 +301,7 @@ async def test_abort_metrics_reset(
# Verify running and waiting requests counts and KV cache usage are zero
# Verify running and waiting requests counts and KV cache usage are zero
running_requests_after
,
waiting_requests_after
,
kv_cache_usage_after
=
(
running_requests_after
,
waiting_requests_after
,
kv_cache_usage_after
=
(
_get_running_metrics_from_api
(
server
,
use_v1
)
_get_running_metrics_from_api
(
server
)
)
)
assert
running_requests_after
==
0
,
(
assert
running_requests_after
==
0
,
(
...
@@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
...
@@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
)
)
def
_get_running_metrics_from_api
(
server
:
RemoteOpenAIServer
,
use_v1
:
bool
):
def
_get_running_metrics_from_api
(
server
:
RemoteOpenAIServer
):
"""Return (running_count, waiting_count, kv_cache_usage)"""
"""Return (running_count, waiting_count, kv_cache_usage)"""
response
=
requests
.
get
(
server
.
url_for
(
"metrics"
))
response
=
requests
.
get
(
server
.
url_for
(
"metrics"
))
...
@@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
...
@@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
# Verify running and waiting requests counts and KV cache usage are zero
# Verify running and waiting requests counts and KV cache usage are zero
running_requests
,
waiting_requests
,
kv_cache_usage
=
None
,
None
,
None
running_requests
,
waiting_requests
,
kv_cache_usage
=
None
,
None
,
None
kv_cache_usage_metric
=
(
kv_cache_usage_metric
=
"vllm:kv_cache_usage_perc"
"vllm:kv_cache_usage_perc"
if
use_v1
else
"vllm:gpu_cache_usage_perc"
)
for
family
in
text_string_to_metric_families
(
response
.
text
):
for
family
in
text_string_to_metric_families
(
response
.
text
):
if
family
.
name
==
"vllm:num_requests_running"
:
if
family
.
name
==
"vllm:num_requests_running"
:
...
@@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
...
@@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
return
running_requests
,
waiting_requests
,
kv_cache_usage
return
running_requests
,
waiting_requests
,
kv_cache_usage
def
test_metrics_exist_run_batch
(
use_v1
:
bool
):
def
test_metrics_exist_run_batch
():
input_batch
=
"""{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""
# noqa: E501
input_batch
=
"""{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""
# noqa: E501
base_url
=
"0.0.0.0"
base_url
=
"0.0.0.0"
...
@@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
...
@@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
"--port"
,
"--port"
,
port
,
port
,
],
],
env
=
{
"VLLM_USE_V1"
:
"1"
},
)
)
def
is_server_up
(
url
):
def
is_server_up
(
url
):
...
...
tests/entrypoints/openai/test_prompt_validation.py
View file @
1e4ecca1
...
@@ -15,11 +15,6 @@ from vllm.entrypoints.renderer import BaseRenderer
...
@@ -15,11 +15,6 @@ from vllm.entrypoints.renderer import BaseRenderer
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v1_only
(
monkeypatch
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_empty_prompt
():
async
def
test_empty_prompt
():
model_name
=
"gpt2"
model_name
=
"gpt2"
...
...
tests/kernels/attention/test_attention_selector.py
View file @
1e4ecca1
...
@@ -80,7 +80,6 @@ def test_env(
...
@@ -80,7 +80,6 @@ def test_env(
):
):
"""Test attention backend selection with valid device-backend pairs."""
"""Test attention backend selection with valid device-backend pairs."""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
name
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
name
)
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
if
use_mla
else
"0"
)
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
if
use_mla
else
"0"
)
...
@@ -212,14 +211,8 @@ def test_env(
...
@@ -212,14 +211,8 @@ def test_env(
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"cuda"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"cuda"
])
def
test_fp32_fallback
(
def
test_fp32_fallback
(
device
:
str
):
device
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
"""Test attention backend selection with fp32."""
"""Test attention backend selection with fp32."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
device
==
"cpu"
:
if
device
==
"cpu"
:
with
patch
(
"vllm.attention.selector.current_platform"
,
CpuPlatform
()):
with
patch
(
"vllm.attention.selector.current_platform"
,
CpuPlatform
()):
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
backend
=
get_attn_backend
(
16
,
torch
.
float32
,
None
,
16
)
...
@@ -233,9 +226,6 @@ def test_fp32_fallback(
...
@@ -233,9 +226,6 @@ def test_fp32_fallback(
def
test_flash_attn
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_flash_attn
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Test FlashAttn validation."""
"""Test FlashAttn validation."""
# TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend
pytest
.
skip
(
pytest
.
skip
(
"Skipping as current backend selector does not "
"Skipping as current backend selector does not "
"handle fallbacks when a backend is set via env var."
"handle fallbacks when a backend is set via env var."
...
@@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
...
@@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
monkeypatch
.
context
()
as
m
,
monkeypatch
.
context
()
as
m
,
patch
(
"vllm.attention.selector.current_platform"
,
CudaPlatform
()),
patch
(
"vllm.attention.selector.current_platform"
,
CudaPlatform
()),
):
):
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
STR_INVALID_VAL
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
STR_INVALID_VAL
)
# Should raise ValueError for invalid backend
# Should raise ValueError for invalid backend
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment