Unverified commit 1e4ecca1, authored by Cyrus Leung and committed by GitHub
Browse files

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent c0a7b89d
...@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
set_seed(seed) set_seed(seed)
...@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with default backend # Run with default backend
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
set_seed(seed) set_seed(seed)
with vllm_runner( with vllm_runner(
model_name, model_name,
...@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
with vllm_runner( with vllm_runner(
model_name, model_name,
...@@ -126,16 +123,17 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -126,16 +123,17 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
flex_outputs = llm_flex.embed(prompts) flex_outputs = llm_flex.embed(prompts)
# Run with default backend # Run with default backend
with monkeypatch.context() as m: with (
m.setenv("VLLM_USE_V1", "1") monkeypatch.context() as m,
with vllm_runner( vllm_runner(
model_name, model_name,
runner="pooling", runner="pooling",
dtype=torch.bfloat16, dtype=torch.bfloat16,
tensor_parallel_size=1, tensor_parallel_size=1,
max_model_len=100, max_model_len=100,
enforce_eager=True, enforce_eager=True,
) as llm_default: ) as llm_default,
):
default_outputs = llm_default.embed(prompts) default_outputs = llm_default.embed(prompts)
check_embeddings_close( check_embeddings_close(
......
...@@ -613,7 +613,6 @@ def test_dummy_maverick( ...@@ -613,7 +613,6 @@ def test_dummy_maverick(
profile: bool = False, profile: bool = False,
) -> None: ) -> None:
# Disable multiprocessing allows us to access model executor from LLM engine # Disable multiprocessing allows us to access model executor from LLM engine
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
model_path = create_reduced_maverick_model( model_path = create_reduced_maverick_model(
......
...@@ -8,7 +8,6 @@ if TYPE_CHECKING: ...@@ -8,7 +8,6 @@ if TYPE_CHECKING:
from vllm.config import VllmConfig from vllm.config import VllmConfig
else: else:
VllmConfig = None VllmConfig = None
from vllm import envs
class DummyPlatform(Platform): class DummyPlatform(Platform):
...@@ -19,10 +18,7 @@ class DummyPlatform(Platform): ...@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
@classmethod @classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None: def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1: vllm_config.compilation_config.custom_ops = ["all"]
compilation_config = vllm_config.compilation_config
# Activate custom ops for v1.
compilation_config.custom_ops = ["all"]
def get_attn_backend_cls( def get_attn_backend_cls(
self, self,
......
...@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler): ...@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Explicitly turn off engine multiprocessing so # Explicitly turn off engine multiprocessing so
# that the scheduler runs in this process # that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
......
...@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`. ...@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
from typing import Optional from typing import Optional
import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
@pytest.fixture(autouse=True)
def v1(monkeypatch):
"""Only run on vLLM v1."""
monkeypatch.setenv("VLLM_USE_V1", "1")
def _generate( def _generate(
llm: LLM, llm: LLM,
prompt: str, prompt: str,
......
...@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest ...@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
# 100 training iterations with a training batch size of 100. # 100 training iterations with a training batch size of 100.
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch: pytest.MonkeyPatch):
"""
Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
for all tests in this file
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
yield
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
return vllm.LLM( return vllm.LLM(
model="Qwen/Qwen2.5-3B-Instruct", model="Qwen/Qwen2.5-3B-Instruct",
......
...@@ -305,7 +305,6 @@ full_cg_backend_configs = { ...@@ -305,7 +305,6 @@ full_cg_backend_configs = {
"CutlassMLA": BackendConfig( "CutlassMLA": BackendConfig(
name="CutlassMLA", name="CutlassMLA",
env_vars={ env_vars={
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
"FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed "FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed
}, },
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch import torch
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
pytestmark = pytest.mark.cpu_test
def new_kv_cache_spec(): def new_kv_cache_spec():
return FullAttentionSpec(16, 1, 1, torch.float32, False) return FullAttentionSpec(16, 1, 1, torch.float32, False)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
from vllm import LLM from vllm import LLM
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B" MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I" PROMPT = "Hello my name is Robert and I"
......
...@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ...@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
): ):
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA") pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack: with temporary_environ(env_vars), ExitStack() as stack:
if not supported: if not supported:
...@@ -117,7 +117,7 @@ combo_cases_2 = [ ...@@ -117,7 +117,7 @@ combo_cases_2 = [
def test_cudagraph_compilation_combo(combo_case): def test_cudagraph_compilation_combo(combo_case):
backend_name, cudagraph_mode, compilation_level, supported = combo_case backend_name, cudagraph_mode, compilation_level, supported = combo_case
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack: with temporary_environ(env_vars), ExitStack() as stack:
if not supported: if not supported:
......
...@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend): ...@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
) )
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
......
...@@ -32,7 +32,7 @@ model_config = { ...@@ -32,7 +32,7 @@ model_config = {
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False]) @pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
def test_sliding_window_retrieval( def test_sliding_window_retrieval(
monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager model, batch_size, seed, disable_hybrid_kv_cache_manager
): ):
""" """
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
...@@ -40,9 +40,6 @@ def test_sliding_window_retrieval( ...@@ -40,9 +40,6 @@ def test_sliding_window_retrieval(
If we tell it upfront which we are going to be looking for, then If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly). it answers correctly (mostly).
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
test_config = model_config[model] test_config = model_config[model]
llm = LLM( llm = LLM(
...@@ -50,9 +47,7 @@ def test_sliding_window_retrieval( ...@@ -50,9 +47,7 @@ def test_sliding_window_retrieval(
) )
sampling_params = SamplingParams(temperature=0.0, max_tokens=100) sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
prompts, answer, indices = prep_prompts( prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
batch_size, ln_range=test_config.ln_range
)
check_length(prompts, llm, test_config.sliding_window) check_length(prompts, llm, test_config.sliding_window)
......
...@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill( ...@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
) )
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Make scheduling deterministic for reproducibility # Make scheduling deterministic for reproducibility
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
......
...@@ -13,7 +13,6 @@ Covers: ...@@ -13,7 +13,6 @@ Covers:
5) Multiple stop conditions 5) Multiple stop conditions
""" """
import os
from typing import Optional, Union from typing import Optional, Union
import pytest import pytest
...@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [ ...@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm_v1(): def llm_v1():
"""Create V1 LLM instance for testing""" """Create V1 LLM instance for testing"""
# Ensure V1 engine is used
os.environ["VLLM_USE_V1"] = "1"
llm = LLM( llm = LLM(
model=TEST_MODEL, model=TEST_MODEL,
tensor_parallel_size=1, tensor_parallel_size=1,
...@@ -503,6 +499,6 @@ if __name__ == "__main__": ...@@ -503,6 +499,6 @@ if __name__ == "__main__":
Usage: Usage:
cd vllm/ cd vllm/
VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v python -m pytest tests/v1/e2e/test_min_tokens.py -v
""" """
pytest.main([__file__, "-v"]) pytest.main([__file__, "-v"])
...@@ -301,7 +301,6 @@ def test_mtp_correctness( ...@@ -301,7 +301,6 @@ def test_mtp_correctness(
model_setup: (method, model_name, tp_size) model_setup: (method, model_name, tp_size)
""" """
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_MLA_DISABLE", "1") m.setenv("VLLM_MLA_DISABLE", "1")
method, model_name, tp_size = model_setup method, model_name, tp_size = model_setup
......
...@@ -95,17 +95,11 @@ async def generate( ...@@ -95,17 +95,11 @@ async def generate(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load( async def test_load(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind, output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
prompt: PromptType, prompt: PromptType,
): ):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1 with ExitStack() as after:
# so that in the future when we switch, we don't have to change all the
# tests.
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -149,14 +143,11 @@ async def test_load( ...@@ -149,14 +143,11 @@ async def test_load(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort( async def test_abort(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind, output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
prompt: PromptType, prompt: PromptType,
): ):
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -222,13 +213,8 @@ async def test_abort( ...@@ -222,13 +213,8 @@ async def test_abort(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_multi_abort( async def test_multi_abort(output_kind: RequestOutputKind):
monkeypatch: pytest.MonkeyPatch, with ExitStack() as after:
output_kind: RequestOutputKind,
):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -304,14 +290,11 @@ async def test_multi_abort( ...@@ -304,14 +290,11 @@ async def test_multi_abort(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_finished_flag( async def test_finished_flag(
monkeypatch: pytest.MonkeyPatch,
n: int, n: int,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
prompt: PromptType, prompt: PromptType,
): ):
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -341,12 +324,10 @@ async def test_finished_flag( ...@@ -341,12 +324,10 @@ async def test_finished_flag(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_mid_stream_cancellation( async def test_mid_stream_cancellation(
monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType engine_args: AsyncEngineArgs, prompt: PromptType
): ):
"""Test that requests can be cancelled mid-stream.""" """Test that requests can be cancelled mid-stream."""
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch): ...@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
be added to the default loggers. be added to the default loggers.
""" """
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args( engine = AsyncLLM.from_engine_args(
TEXT_ENGINE_ARGS, TEXT_ENGINE_ARGS,
...@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch): ...@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
@pytest.mark.asyncio(scope="module") @pytest.mark.asyncio(scope="module")
async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): async def test_dp_rank_argument():
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): ...@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_check_health(monkeypatch: pytest.MonkeyPatch): async def test_check_health():
"""Test that check_health returns normally for healthy engine """Test that check_health returns normally for healthy engine
and raises EngineDeadError when the engine is dead. and raises EngineDeadError when the engine is dead.
""" """
...@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): ...@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.engine.exceptions import EngineDeadError
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): ...@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort_final_output( async def test_abort_final_output(output_kind: RequestOutputKind):
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
):
"""Test that abort() returns a final output with correct information.""" """Test that abort() returns a final output with correct information."""
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
......
...@@ -5,18 +5,11 @@ from argparse import ArgumentError ...@@ -5,18 +5,11 @@ from argparse import ArgumentError
import pytest import pytest
from vllm import envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
if not envs.VLLM_USE_V1:
pytest.skip(
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
allow_module_level=True,
)
def test_prefix_caching_from_cli(): def test_prefix_caching_from_cli():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
......
...@@ -46,9 +46,7 @@ def make_request() -> EngineCoreRequest: ...@@ -46,9 +46,7 @@ def make_request() -> EngineCoreRequest:
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core(monkeypatch: pytest.MonkeyPatch): def test_engine_core():
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
"""Setup the EngineCore.""" """Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME) engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config() vllm_config = engine_args.create_engine_config()
...@@ -176,14 +174,12 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): ...@@ -176,14 +174,12 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): def test_engine_core_advanced_sampling():
""" """
A basic end-to-end test to verify that the engine functions correctly A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and when additional sampling parameters, such as top_p, min_tokens, and
presence_penalty, are set. presence_penalty, are set.
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
"""Setup the EngineCore.""" """Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME) engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config() vllm_config = engine_args.create_engine_config()
...@@ -227,7 +223,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): ...@@ -227,7 +223,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): def test_engine_core_concurrent_batches():
""" """
Test that the engine can handle multiple concurrent batches. Test that the engine can handle multiple concurrent batches.
""" """
...@@ -272,9 +268,6 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): ...@@ -272,9 +268,6 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
if hasattr(self, "thread_pool"): if hasattr(self, "thread_pool"):
self.thread_pool.shutdown(wait=False) self.thread_pool.shutdown(wait=False)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
engine_args = EngineArgs( engine_args = EngineArgs(
model=MODEL_NAME, model=MODEL_NAME,
# To test concurrent batches. # To test concurrent batches.
...@@ -364,13 +357,11 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): ...@@ -364,13 +357,11 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch): def test_engine_core_tp():
""" """
Test engine can initialize worker in tp properly Test engine can initialize worker in tp properly
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
"""Setup the EngineCore.""" """Setup the EngineCore."""
engine_args = EngineArgs( engine_args = EngineArgs(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -400,11 +391,8 @@ def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch): ...@@ -400,11 +391,8 @@ def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch):
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): def test_engine_core_invalid_request_id_type():
"""Test that engine raises TypeError for non-string request_id.""" """Test that engine raises TypeError for non-string request_id."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
engine_args = EngineArgs(model=MODEL_NAME) engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config() vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config) executor_class = Executor.get_class(vllm_config)
...@@ -432,9 +420,7 @@ def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): ...@@ -432,9 +420,7 @@ def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch):
none_request = make_request() none_request = make_request()
none_request.request_id = None none_request.request_id = None
with pytest.raises( with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"):
TypeError, match="request_id must be a string, got.*NoneType"
):
engine_core.add_request(*engine_core.preprocess_add_request(none_request)) engine_core.add_request(*engine_core.preprocess_add_request(none_request))
# Verify engine is still functional after errors # Verify engine is still functional after errors
......
...@@ -130,8 +130,6 @@ def test_engine_core_client( ...@@ -130,8 +130,6 @@ def test_engine_core_client(
monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test. # Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False) m.setattr(EngineCore, "echo", echo, raising=False)
...@@ -218,8 +216,6 @@ def test_engine_core_client( ...@@ -218,8 +216,6 @@ def test_engine_core_client(
@pytest.mark.asyncio(loop_scope="function") @pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test. # Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False) m.setattr(EngineCore, "echo", echo, raising=False)
...@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return( ...@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
...@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return( ...@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
...@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures( ...@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
...@@ -592,12 +582,9 @@ async def test_engine_core_client_util_method_nested_structures( ...@@ -592,12 +582,9 @@ async def test_engine_core_client_util_method_nested_structures(
indirect=["publisher_config"], indirect=["publisher_config"],
) )
def test_kv_cache_events( def test_kv_cache_events(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool, multiprocessing_mode: bool,
publisher_config, publisher_config,
): ):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
block_size = 16 block_size = 16
num_blocks = 2 num_blocks = 2
...@@ -640,9 +627,7 @@ def test_kv_cache_events( ...@@ -640,9 +627,7 @@ def test_kv_cache_events(
seq, received = result seq, received = result
assert seq == 0, "Sequence number mismatch" assert seq == 0, "Sequence number mismatch"
assert len(received.events) == 1, ( assert len(received.events) == 1, "We should have exactly one BlockStored event"
"We should have exactly one BlockStored event"
)
event = received.events[0] event = received.events[0]
assert isinstance(event, BlockStored), "We should have a BlockStored event" assert isinstance(event, BlockStored), "We should have a BlockStored event"
assert len(event.block_hashes) == num_blocks, ( assert len(event.block_hashes) == num_blocks, (
...@@ -672,12 +657,9 @@ def test_kv_cache_events( ...@@ -672,12 +657,9 @@ def test_kv_cache_events(
) )
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
async def test_kv_cache_events_dp( async def test_kv_cache_events_dp(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool, multiprocessing_mode: bool,
publisher_config, publisher_config,
): ):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
block_size = 16 block_size = 16
num_blocks = 2 num_blocks = 2
dp_size = 2 dp_size = 2
...@@ -765,8 +747,6 @@ async def test_kv_cache_events_dp( ...@@ -765,8 +747,6 @@ async def test_kv_cache_events_dp(
@pytest.mark.timeout(20) @pytest.mark.timeout(20)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch): def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, pytest.raises(Exception) as e_info: with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch to extract core process pid while it's starting. # Monkey-patch to extract core process pid while it's starting.
core_proc_pid = [None] core_proc_pid = [None]
cepm_ctor = CoreEngineProcManager.__init__ cepm_ctor = CoreEngineProcManager.__init__
...@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat ...@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
mock_executor_class.side_effect = create_mock_executor mock_executor_class.side_effect = create_mock_executor
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices
from vllm.v1.engine.utils import EngineZmqAddresses from vllm.v1.engine.utils import EngineZmqAddresses
......
...@@ -21,12 +21,10 @@ DTYPE = "half" ...@@ -21,12 +21,10 @@ DTYPE = "half"
def _vllm_model( def _vllm_model(
apc: bool, apc: bool,
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
*, *,
skip_tokenizer_init: bool = False, skip_tokenizer_init: bool = False,
): ):
"""Set up VllmRunner instance.""" """Set up VllmRunner instance."""
monkeypatch.setenv("VLLM_USE_V1", "1")
return vllm_runner( return vllm_runner(
MODEL, MODEL,
dtype=DTYPE, dtype=DTYPE,
...@@ -45,16 +43,16 @@ def _vllm_model( ...@@ -45,16 +43,16 @@ def _vllm_model(
# Prefix caching # Prefix caching
params=[False, True], params=[False, True],
) )
def vllm_model(vllm_runner, request, monkeypatch): def vllm_model(vllm_runner, request):
"""VllmRunner test fixture parameterized by APC True/False.""" """VllmRunner test fixture parameterized by APC True/False."""
with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model: with _vllm_model(request.param, vllm_runner) as vllm_model:
yield vllm_model yield vllm_model
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def vllm_model_apc(vllm_runner, monkeypatch): def vllm_model_apc(vllm_runner):
"""VllmRunner test fixture with APC.""" """VllmRunner test fixture with APC."""
with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model: with _vllm_model(True, vllm_runner) as vllm_model:
yield vllm_model yield vllm_model
...@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch): ...@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
# Prefix caching # Prefix caching
params=[False, True], params=[False, True],
) )
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch): def vllm_model_skip_tokenizer_init(vllm_runner, request):
"""VllmRunner test fixture with APC.""" """VllmRunner test fixture with APC."""
with _vllm_model( with _vllm_model(
request.param, request.param,
vllm_runner, vllm_runner,
monkeypatch,
skip_tokenizer_init=True, skip_tokenizer_init=True,
) as vllm_model: ) as vllm_model:
yield vllm_model yield vllm_model
...@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: ...@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
) )
def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): def test_engine_metrics(vllm_runner, example_prompts):
max_tokens = 100 max_tokens = 100
# Use spec decoding to test num_accepted_tokens_per_pos # Use spec decoding to test num_accepted_tokens_per_pos
speculative_config = { speculative_config = {
...@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): ...@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
"prompt_lookup_min": 3, "prompt_lookup_min": 3,
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
} }
monkeypatch.setenv("VLLM_USE_V1", "1")
with vllm_runner( with vllm_runner(
MODEL, MODEL,
speculative_config=speculative_config, speculative_config=speculative_config,
...@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): ...@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"]) @pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch): def test_skip_tokenizer_initialization(model: str):
monkeypatch.setenv("VLLM_USE_V1", "1")
# This test checks if the flag skip_tokenizer_init skips the initialization # This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain # of tokenizer and detokenizer. The generated output is expected to contain
# token ids. # token ids.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment