Unverified Commit 1e4ecca1 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent c0a7b89d
......@@ -296,6 +296,7 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
......@@ -317,7 +318,7 @@ steps:
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
......
......@@ -13,7 +13,7 @@ import pytest
import torch
from vllm import LLM
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
from vllm.v1.engine.llm_engine import LLMEngine
from ..conftest import HfRunner, VllmRunner
from ..models.utils import check_outputs_equal
......@@ -211,16 +211,11 @@ def test_models_distributed(
def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
from vllm.envs import VLLM_USE_V1
if not VLLM_USE_V1:
pytest.skip("Skipping V0 test, dump input not supported")
# Needed to mock an error in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
if isinstance(vllm_model.llm.llm_engine, LLMEngine):
v1_test_failed_model_execution(vllm_model)
......
......@@ -117,68 +117,59 @@ def test_cumem_with_cudagraph():
@create_new_process_for_each_test()
@pytest.mark.parametrize(
"model, use_v1",
"model",
[
# sleep mode with safetensors
("meta-llama/Llama-3.2-1B", True),
"meta-llama/Llama-3.2-1B",
# sleep mode with pytorch checkpoint
("facebook/opt-125m", True),
"facebook/opt-125m",
],
)
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m:
assert use_v1
m.setenv("VLLM_USE_V1", "1")
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only
# test sleep level 1 here.
llm.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights)
# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be releasesd from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1.
if use_v1:
assert used_bytes < 7 * GiB_bytes
else:
assert used_bytes < 2 * GiB_bytes
llm.wake_up()
output2 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text
llm.sleep(level=1)
llm.wake_up(tags=["weights"])
free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
# should just reallocate memory for weights (1B model, ~2GiB weights)
if use_v1:
assert used_bytes < 10 * GiB_bytes
else:
assert used_bytes < 6 * GiB_bytes
# now allocate kv cache memory
llm.wake_up(tags=["kv_cache"])
output3 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text
def test_end_to_end(model: str):
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only
# test sleep level 1 here.
llm.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights)
# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be releasesd from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1.
assert used_bytes < 7 * GiB_bytes
llm.wake_up()
output2 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text
llm.sleep(level=1)
llm.wake_up(tags=["weights"])
free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
# should just reallocate memory for weights (1B model, ~2GiB weights)
assert used_bytes < 10 * GiB_bytes
# now allocate kv cache memory
llm.wake_up(tags=["kv_cache"])
output3 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text
@create_new_process_for_each_test()
......
......@@ -66,7 +66,6 @@ def llm_pair(request):
pytest.skip("Only Blackwell GPUs support Cutlass MLA")
env_vars = {
"VLLM_USE_V1": "1",
# Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER": "0",
......@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
with (
temporary_environ(
{
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
# Flex_Attention is not supported with full cuda graph
}
......
......@@ -18,7 +18,6 @@ from vllm.config import (
VllmConfig,
set_current_vllm_config,
)
from vllm.envs import VLLM_USE_V1
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
......@@ -127,7 +126,6 @@ def _run_simple_model(
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
def test_simple_piecewise_compile(use_inductor):
assert VLLM_USE_V1
_run_simple_model(
splitting_ops=["silly.attention"],
use_inductor_graph_partition=False,
......@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
@torch.inference_mode()
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
def test_simple_inductor_graph_partition(splitting_ops):
assert VLLM_USE_V1
if not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
......
......@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
"pass_config": {"enable_async_tp": async_tp_enabled},
}
async_tp_env = tp_env = {
"VLLM_USE_V1": "1",
}
async_tp_args = [
*common_args,
"--tensor-parallel-size",
......@@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
"mp",
]
compare_two_settings(
model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
)
compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
......@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import vllm
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
from vllm.utils import _is_torch_equal_or_newer
......@@ -16,15 +15,10 @@ def test_version():
assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
def test_use_cudagraphs_dynamic(monkeypatch):
assert vllm.envs.VLLM_USE_V1
def test_use_cudagraphs_dynamic():
vllm_config = VllmConfig()
assert vllm_config.compilation_config.use_cudagraph
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_config = VllmConfig()
assert not vllm_config.compilation_config.use_cudagraph
def test_custom_op():
# proper syntax
......@@ -41,8 +35,6 @@ def test_custom_op():
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
assert vllm.envs.VLLM_USE_V1
# Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
......@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
@pytest.mark.forked
@pytest.mark.parametrize("enabled", [True, False])
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
assert vllm.envs.VLLM_USE_V1
# Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
......
......@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
model_class: type[AttentionQuantPatternModel],
backend: _Backend,
use_inductor_graph_partition: bool,
monkeypatch,
dist_init,
caplog_vllm,
):
......@@ -312,8 +311,6 @@ def test_attention_quant_pattern(
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
monkeypatch.setenv("VLLM_USE_V1", "1")
device = torch.device("cuda:0")
torch.manual_seed(42)
......
......@@ -8,16 +8,13 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM
def test_mp_reducer(monkeypatch):
def test_mp_reducer():
"""
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
without transformers_modules. This is a regression test for
https://github.com/vllm-project/vllm/pull/18640.
"""
# Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
monkeypatch.setenv("VLLM_USE_V1", "1")
# Ensure transformers_modules is not in sys.modules
if "transformers_modules" in sys.modules:
del sys.modules["transformers_modules"]
......
......@@ -5,7 +5,7 @@ from typing import Any, Optional
import pytest
from vllm import LLM, SamplingParams, envs
from vllm import LLM, SamplingParams
MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200
......@@ -111,9 +111,7 @@ def _stop_token_id(llm):
@pytest.mark.skip_global_cleanup
def test_stop_strings():
# If V0, must set enforce_eager=False since we use
# async output processing below.
llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
llm = LLM(MODEL, enforce_eager=True)
_stop_basic(llm)
_stop_multi_tokens(llm)
......
......@@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple):
@dataclass
class CPTestSettings:
parallel_setups: list[ParallelSetup]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str]
runner: RunnerOption
test_options: CPTestOptions
def __post_init__(self):
if len(self.distributed_backends) != len(self.vllm_major_versions):
raise ValueError(
f"Length mismatch: distributed_backends "
f"({len(self.distributed_backends)}) != "
f"vllm_major_versions ({len(self.vllm_major_versions)})"
)
@staticmethod
def detailed(
*,
......@@ -87,7 +73,6 @@ class CPTestSettings:
return CPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp"],
vllm_major_versions=["1"],
runner=runner,
test_options=CPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
......@@ -98,14 +83,11 @@ class CPTestSettings:
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend, vllm_major_version in zip(
self.distributed_backends, self.vllm_major_versions
):
for backend in self.distributed_backends:
yield (
model_id,
parallel_setup,
backend,
vllm_major_version,
self.runner,
opts,
)
......@@ -115,7 +97,6 @@ def _compare_cp_with_tp(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available: int,
......@@ -191,10 +172,6 @@ def _compare_cp_with_tp(
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
cp_env = tp_env = {
"VLLM_USE_V1": vllm_major_version, # Note(hc): DCP only support V1 engine only
}
cp_args = [
*common_args,
"--tensor-parallel-size",
......@@ -217,24 +194,13 @@ def _compare_cp_with_tp(
distributed_backend,
]
try:
compare_two_settings(
model_id,
cp_args,
tp_args,
cp_env,
tp_env,
method=method,
max_wait_seconds=720,
)
except Exception:
testing_ray_compiled_graph = cp_env is not None
if testing_ray_compiled_graph and vllm_major_version == "0":
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger.exception("Ray Compiled Graph tests failed")
else:
raise
compare_two_settings(
model_id,
cp_args,
tp_args,
method=method,
max_wait_seconds=720,
)
CP_TEXT_GENERATION_MODELS = {
......@@ -257,7 +223,6 @@ CP_TEST_MODELS = [
"model_id",
"parallel_setup",
"distributed_backend",
"vllm_major_version",
"runner",
"test_options",
),
......@@ -274,7 +239,6 @@ def test_cp_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available,
......@@ -283,7 +247,6 @@ def test_cp_generation(
model_id,
parallel_setup,
distributed_backend,
vllm_major_version,
runner,
test_options,
num_gpus_available,
......
......@@ -307,7 +307,6 @@ def _compare_tp(
if distributed_backend == "ray":
# For V1, test Ray Compiled Graph for all the tests
pp_env = {
"VLLM_USE_V1": "1",
"VLLM_USE_RAY_COMPILED_DAG": "1",
"VLLM_USE_RAY_SPMD_WORKER": "1",
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
......@@ -316,15 +315,11 @@ def _compare_tp(
# terminate because of a Ray Compiled Graph issue.
common_args.append("--disable-frontend-multiprocessing")
elif distributed_backend == "mp":
pp_env = {
"VLLM_USE_V1": "1",
}
pp_env = None
else:
pp_env = None
tp_env = {
"VLLM_USE_V1": "1",
}
tp_env = None
pp_args = [
*common_args,
......
......@@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple):
@dataclass
class SPTestSettings:
parallel_setups: list[ParallelSetup]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str]
runner: RunnerOption
test_options: SPTestOptions
def __post_init__(self):
if len(self.distributed_backends) != len(self.vllm_major_versions):
raise ValueError(
f"Length mismatch: distributed_backends "
f"({len(self.distributed_backends)}) != "
f"vllm_major_versions ({len(self.vllm_major_versions)})"
)
@staticmethod
def detailed(
*,
......@@ -85,7 +71,6 @@ class SPTestSettings:
return SPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
runner=runner,
test_options=SPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
......@@ -117,7 +102,6 @@ class SPTestSettings:
return SPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
runner=runner,
test_options=SPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
......@@ -147,7 +131,6 @@ class SPTestSettings:
return SPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
runner=runner,
test_options=SPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
......@@ -158,14 +141,11 @@ class SPTestSettings:
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend, vllm_major_version in zip(
self.distributed_backends, self.vllm_major_versions
):
for backend in self.distributed_backends:
yield (
model_id,
parallel_setup,
backend,
vllm_major_version,
self.runner,
opts,
)
......@@ -175,7 +155,6 @@ def _compare_sp(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: SPTestOptions,
num_gpus_available: int,
......@@ -265,10 +244,6 @@ def _compare_sp(
},
}
tp_sp_env = tp_env = {
"VLLM_USE_V1": vllm_major_version,
}
tp_sp_args = [
*common_args,
"--tensor-parallel-size",
......@@ -281,9 +256,6 @@ def _compare_sp(
json.dumps(compilation_config),
]
tp_env = {
"VLLM_USE_V1": vllm_major_version,
}
tp_args = [
*common_args,
"--tensor-parallel-size",
......@@ -292,18 +264,7 @@ def _compare_sp(
"mp",
]
try:
compare_two_settings(
model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method
)
except Exception:
testing_ray_compiled_graph = tp_sp_env is not None
if testing_ray_compiled_graph and vllm_major_version == "0":
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger.exception("Ray Compiled Graph tests failed")
else:
raise
compare_two_settings(model_id, tp_sp_args, tp_args, method=method)
SP_TEXT_GENERATION_MODELS = {
......@@ -325,7 +286,6 @@ SP_TEST_MODELS = [
"model_id",
"parallel_setup",
"distributed_backend",
"vllm_major_version",
"runner",
"test_options",
),
......@@ -341,7 +301,6 @@ def test_tp_sp_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: SPTestOptions,
num_gpus_available,
......@@ -350,7 +309,6 @@ def test_tp_sp_generation(
model_id,
parallel_setup,
distributed_backend,
vllm_major_version,
runner,
test_options,
num_gpus_available,
......
......@@ -61,50 +61,34 @@ def run_test(model_name, more_args=None):
TPU_TP_TEST_STR = "" # "tensor_parallel_size=4"
@pytest.mark.skipif(
not current_platform.is_cuda() and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU",
)
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
def test_lm_eval_accuracy_v1_engine(model):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = "max_model_len=2048,max_num_seqs=64"
more_args = "max_model_len=2048,max_num_seqs=64"
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
run_test(model, more_args)
run_test(model, more_args)
@pytest.mark.skipif(
not current_platform.is_cuda() and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU",
)
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
model, monkeypatch: pytest.MonkeyPatch
):
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
run_test(model, more_args)
run_test(model, more_args)
......@@ -10,7 +10,6 @@ AsyncLLMEngine are working correctly.
"""
import lm_eval
import pytest
from vllm.platforms import current_platform
......@@ -67,21 +66,13 @@ def run_test(more_args):
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
@pytest.mark.skipif(
not current_platform.is_cuda()
and not current_platform.is_tpu()
and not current_platform.is_xpu(),
reason="V1 currently only supported on CUDA, XPU and TPU",
)
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
def test_lm_eval_accuracy_v1_engine():
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
more_args = []
more_args = []
# Limit compilation time for V1
if current_platform.is_tpu():
more_args = ["--max-num-seqs", "64"]
# Limit compilation time for V1
if current_platform.is_tpu():
more_args = ["--max-num-seqs", "64"]
run_test(more_args)
run_test(more_args)
......@@ -21,18 +21,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()
@pytest.fixture(scope="module")
def server(monkeypatch_module, zephyr_lora_files): # noqa: F811
monkeypatch_module.setenv("VLLM_USE_V1", "1")
def server(zephyr_lora_files): # noqa: F811
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
......
......@@ -37,21 +37,8 @@ BADREQUEST_CASES = [
]
@pytest.fixture(scope="module")
def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()
@pytest.fixture(scope="module", params=[True])
def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files):
use_v1 = request.param
assert use_v1
monkeypatch_module.setenv("VLLM_USE_V1", "1")
def server_with_lora_modules_json(request, zephyr_lora_files):
# Define the json format LoRA module configurations
lora_module_1 = {
"name": "zephyr-lora",
......
......@@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
PREV_MINOR_VERSION = version._prev_minor_version()
@pytest.fixture(scope="module", params=[True])
def use_v1(request):
# Module-scoped variant of run_with_both_engines
#
# Use this fixture to run a test with both v0 and v1, and
# also to conditionalize the test logic e.g.
#
# def test_metrics_exist(use_v1, server, client):
# ...
# expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
# for metric in expected:
# assert metric in response.text
#
# @skip_v1 wouldn't work here because this is a module-level
# fixture - per-function decorators would have no effect
yield request.param
@pytest.fixture(scope="module")
def default_server_args():
return [
......@@ -63,13 +45,11 @@ def default_server_args():
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
],
)
def server(use_v1, default_server_args, request):
def server(default_server_args, request):
if request.param:
default_server_args.append(request.param)
env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
with RemoteOpenAIServer(
MODEL_NAME, default_server_args, env_dict=env_dict
) as remote_server:
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
......@@ -129,7 +109,8 @@ EXPECTED_VALUES = {
@pytest.mark.asyncio
async def test_metrics_counts(
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
server: RemoteOpenAIServer,
client: openai.AsyncClient,
):
for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged.
......@@ -145,7 +126,7 @@ async def test_metrics_counts(
# Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
if (metric_family not in EXPECTED_METRICS_V1) or (
not server.show_hidden_metrics
and metric_family in HIDDEN_DEPRECATED_METRICS
):
......@@ -183,62 +164,6 @@ async def test_metrics_counts(
assert found_metric, f"Did not find {metric_family} in prom endpoint"
EXPECTED_METRICS = [
"vllm:num_requests_running",
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
"vllm:e2e_request_latency_seconds_sum",
"vllm:e2e_request_latency_seconds_bucket",
"vllm:e2e_request_latency_seconds_count",
"vllm:request_queue_time_seconds_sum",
"vllm:request_queue_time_seconds_bucket",
"vllm:request_queue_time_seconds_count",
"vllm:request_inference_time_seconds_sum",
"vllm:request_inference_time_seconds_bucket",
"vllm:request_inference_time_seconds_count",
"vllm:request_prefill_time_seconds_sum",
"vllm:request_prefill_time_seconds_bucket",
"vllm:request_prefill_time_seconds_count",
"vllm:request_decode_time_seconds_sum",
"vllm:request_decode_time_seconds_bucket",
"vllm:request_decode_time_seconds_count",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
"vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count",
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:iteration_tokens_total",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:cache_config_info",
# labels in cache_config_info
"block_size",
"cache_dtype",
"cpu_offload_gb",
"enable_prefix_caching",
"gpu_memory_utilization",
"num_cpu_blocks",
"num_gpu_blocks",
"num_gpu_blocks_override",
"sliding_window",
"swap_space_bytes",
]
EXPECTED_METRICS_V1 = [
"vllm:num_requests_running",
"vllm:num_requests_waiting",
......@@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
@pytest.mark.asyncio
async def test_metrics_exist(
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
server: RemoteOpenAIServer,
client: openai.AsyncClient,
):
# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0,
)
response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK
for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
for metric in EXPECTED_METRICS_V1:
if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
continue
assert metric in response.text
......@@ -322,10 +251,11 @@ async def test_metrics_exist(
@pytest.mark.asyncio
async def test_abort_metrics_reset(
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
server: RemoteOpenAIServer,
client: openai.AsyncClient,
):
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
server, use_v1
server
)
# Expect no running requests or kvcache usage
......@@ -351,7 +281,7 @@ async def test_abort_metrics_reset(
# Check that we have running requests
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
server, use_v1
server
)
# Expect running requests and kvcache usage
......@@ -371,7 +301,7 @@ async def test_abort_metrics_reset(
# Verify running and waiting requests counts and KV cache usage are zero
running_requests_after, waiting_requests_after, kv_cache_usage_after = (
_get_running_metrics_from_api(server, use_v1)
_get_running_metrics_from_api(server)
)
assert running_requests_after == 0, (
......@@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
)
def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
"""Return (running_count, waiting_count, kv_cache_usage)"""
response = requests.get(server.url_for("metrics"))
......@@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
# Verify running and waiting requests counts and KV cache usage are zero
running_requests, waiting_requests, kv_cache_usage = None, None, None
kv_cache_usage_metric = (
"vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
)
kv_cache_usage_metric = "vllm:kv_cache_usage_perc"
for family in text_string_to_metric_families(response.text):
if family.name == "vllm:num_requests_running":
......@@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
return running_requests, waiting_requests, kv_cache_usage
def test_metrics_exist_run_batch(use_v1: bool):
def test_metrics_exist_run_batch():
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
base_url = "0.0.0.0"
......@@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
"--port",
port,
],
env={"VLLM_USE_V1": "1"},
)
def is_server_up(url):
......
......@@ -15,11 +15,6 @@ from vllm.entrypoints.renderer import BaseRenderer
from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch):
monkeypatch.setenv("VLLM_USE_V1", "1")
@pytest.mark.asyncio
async def test_empty_prompt():
model_name = "gpt2"
......
......@@ -80,7 +80,6 @@ def test_env(
):
"""Test attention backend selection with valid device-backend pairs."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv(STR_BACKEND_ENV_VAR, name)
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
......@@ -212,30 +211,21 @@ def test_env(
@pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_fp32_fallback(
device: str,
monkeypatch: pytest.MonkeyPatch,
):
def test_fp32_fallback(device: str):
"""Test attention backend selection with fp32."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
if device == "cpu":
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "TORCH_SDPA"
if device == "cpu":
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "TORCH_SDPA"
elif device == "cuda":
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
elif device == "cuda":
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""
# TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend
pytest.skip(
"Skipping as current backend selector does not "
"handle fallbacks when a backend is set via env var."
......@@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
monkeypatch.context() as m,
patch("vllm.attention.selector.current_platform", CudaPlatform()),
):
m.setenv("VLLM_USE_V1", "1")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
# Should raise ValueError for invalid backend
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment