Commit 1e4ecca1 (unverified), authored by Cyrus Leung, committed by GitHub
Browse files

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent c0a7b89d
...@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
set_seed(seed) set_seed(seed)
...@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with default backend # Run with default backend
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
set_seed(seed) set_seed(seed)
with vllm_runner( with vllm_runner(
model_name, model_name,
...@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
with vllm_runner( with vllm_runner(
model_name, model_name,
...@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
flex_outputs = llm_flex.embed(prompts) flex_outputs = llm_flex.embed(prompts)
# Run with default backend # Run with default backend
with monkeypatch.context() as m: with (
m.setenv("VLLM_USE_V1", "1") monkeypatch.context() as m,
with vllm_runner( vllm_runner(
model_name, model_name,
runner="pooling", runner="pooling",
dtype=torch.bfloat16, dtype=torch.bfloat16,
tensor_parallel_size=1, tensor_parallel_size=1,
max_model_len=100, max_model_len=100,
enforce_eager=True, enforce_eager=True,
) as llm_default: ) as llm_default,
default_outputs = llm_default.embed(prompts) ):
default_outputs = llm_default.embed(prompts)
check_embeddings_close( check_embeddings_close(
embeddings_0_lst=flex_outputs, embeddings_0_lst=flex_outputs,
......
...@@ -613,7 +613,6 @@ def test_dummy_maverick( ...@@ -613,7 +613,6 @@ def test_dummy_maverick(
profile: bool = False, profile: bool = False,
) -> None: ) -> None:
# Disable multiprocessing allows us to access model executor from LLM engine # Disable multiprocessing allows us to access model executor from LLM engine
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
model_path = create_reduced_maverick_model( model_path = create_reduced_maverick_model(
......
...@@ -8,7 +8,6 @@ if TYPE_CHECKING: ...@@ -8,7 +8,6 @@ if TYPE_CHECKING:
from vllm.config import VllmConfig from vllm.config import VllmConfig
else: else:
VllmConfig = None VllmConfig = None
from vllm import envs
class DummyPlatform(Platform): class DummyPlatform(Platform):
...@@ -19,10 +18,7 @@ class DummyPlatform(Platform): ...@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
@classmethod @classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None: def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1: vllm_config.compilation_config.custom_ops = ["all"]
compilation_config = vllm_config.compilation_config
# Activate custom ops for v1.
compilation_config.custom_ops = ["all"]
def get_attn_backend_cls( def get_attn_backend_cls(
self, self,
......
...@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler): ...@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Explicitly turn off engine multiprocessing so # Explicitly turn off engine multiprocessing so
# that the scheduler runs in this process # that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
......
...@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`. ...@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
from typing import Optional from typing import Optional
import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
@pytest.fixture(autouse=True)
def v1(monkeypatch):
"""Only run on vLLM v1."""
monkeypatch.setenv("VLLM_USE_V1", "1")
def _generate( def _generate(
llm: LLM, llm: LLM,
prompt: str, prompt: str,
......
...@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest ...@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
# 100 training iterations with a training batch size of 100. # 100 training iterations with a training batch size of 100.
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch: pytest.MonkeyPatch):
"""
Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
for all tests in this file
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
yield
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
return vllm.LLM( return vllm.LLM(
model="Qwen/Qwen2.5-3B-Instruct", model="Qwen/Qwen2.5-3B-Instruct",
......
...@@ -305,7 +305,6 @@ full_cg_backend_configs = { ...@@ -305,7 +305,6 @@ full_cg_backend_configs = {
"CutlassMLA": BackendConfig( "CutlassMLA": BackendConfig(
name="CutlassMLA", name="CutlassMLA",
env_vars={ env_vars={
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
"FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed "FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed
}, },
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch import torch
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
pytestmark = pytest.mark.cpu_test
def new_kv_cache_spec(): def new_kv_cache_spec():
return FullAttentionSpec(16, 1, 1, torch.float32, False) return FullAttentionSpec(16, 1, 1, torch.float32, False)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
from vllm import LLM from vllm import LLM
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B" MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I" PROMPT = "Hello my name is Robert and I"
......
...@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ...@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
): ):
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA") pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack: with temporary_environ(env_vars), ExitStack() as stack:
if not supported: if not supported:
...@@ -117,7 +117,7 @@ combo_cases_2 = [ ...@@ -117,7 +117,7 @@ combo_cases_2 = [
def test_cudagraph_compilation_combo(combo_case): def test_cudagraph_compilation_combo(combo_case):
backend_name, cudagraph_mode, compilation_level, supported = combo_case backend_name, cudagraph_mode, compilation_level, supported = combo_case
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack: with temporary_environ(env_vars), ExitStack() as stack:
if not supported: if not supported:
......
...@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend): ...@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
) )
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
......
...@@ -32,7 +32,7 @@ model_config = { ...@@ -32,7 +32,7 @@ model_config = {
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False]) @pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
def test_sliding_window_retrieval( def test_sliding_window_retrieval(
monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager model, batch_size, seed, disable_hybrid_kv_cache_manager
): ):
""" """
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
...@@ -40,39 +40,34 @@ def test_sliding_window_retrieval( ...@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
If we tell it upfront which we are going to be looking for, then If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly). it answers correctly (mostly).
""" """
with monkeypatch.context() as m: test_config = model_config[model]
m.setenv("VLLM_USE_V1", "1")
llm = LLM(
test_config = model_config[model] model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
)
llm = LLM( sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
) prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
check_length(prompts, llm, test_config.sliding_window)
prompts, answer, indices = prep_prompts(
batch_size, ln_range=test_config.ln_range # Fresh generation
) responses = llm.generate(prompts, sampling_params)
check_answers(
check_length(prompts, llm, test_config.sliding_window) indices,
answer,
# Fresh generation [response.outputs[0].text for response in responses],
responses = llm.generate(prompts, sampling_params) accept_rate=1.0,
check_answers( )
indices,
answer, # Re-generate with the same prompts to test prefix caching
[response.outputs[0].text for response in responses], responses = llm.generate(prompts, sampling_params)
accept_rate=1.0, check_answers(
) indices,
answer,
# Re-generate with the same prompts to test prefix caching [response.outputs[0].text for response in responses],
responses = llm.generate(prompts, sampling_params) accept_rate=1.0,
check_answers( )
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
def check_length(prompts: list[str], llm: LLM, sliding_window: int): def check_length(prompts: list[str], llm: LLM, sliding_window: int):
......
...@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill( ...@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
) )
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Make scheduling deterministic for reproducibility # Make scheduling deterministic for reproducibility
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
......
...@@ -13,7 +13,6 @@ Covers: ...@@ -13,7 +13,6 @@ Covers:
5) Multiple stop conditions 5) Multiple stop conditions
""" """
import os
from typing import Optional, Union from typing import Optional, Union
import pytest import pytest
...@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [ ...@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm_v1(): def llm_v1():
"""Create V1 LLM instance for testing""" """Create V1 LLM instance for testing"""
# Ensure V1 engine is used
os.environ["VLLM_USE_V1"] = "1"
llm = LLM( llm = LLM(
model=TEST_MODEL, model=TEST_MODEL,
tensor_parallel_size=1, tensor_parallel_size=1,
...@@ -503,6 +499,6 @@ if __name__ == "__main__": ...@@ -503,6 +499,6 @@ if __name__ == "__main__":
Usage: Usage:
cd vllm/ cd vllm/
VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v python -m pytest tests/v1/e2e/test_min_tokens.py -v
""" """
pytest.main([__file__, "-v"]) pytest.main([__file__, "-v"])
...@@ -301,7 +301,6 @@ def test_mtp_correctness( ...@@ -301,7 +301,6 @@ def test_mtp_correctness(
model_setup: (method, model_name, tp_size) model_setup: (method, model_name, tp_size)
""" """
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_MLA_DISABLE", "1") m.setenv("VLLM_MLA_DISABLE", "1")
method, model_name, tp_size = model_setup method, model_name, tp_size = model_setup
......
...@@ -95,17 +95,11 @@ async def generate( ...@@ -95,17 +95,11 @@ async def generate(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load( async def test_load(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind, output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
prompt: PromptType, prompt: PromptType,
): ):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1 with ExitStack() as after:
# so that in the future when we switch, we don't have to change all the
# tests.
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -149,14 +143,11 @@ async def test_load( ...@@ -149,14 +143,11 @@ async def test_load(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort( async def test_abort(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind, output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
prompt: PromptType, prompt: PromptType,
): ):
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -222,13 +213,8 @@ async def test_abort( ...@@ -222,13 +213,8 @@ async def test_abort(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_multi_abort( async def test_multi_abort(output_kind: RequestOutputKind):
monkeypatch: pytest.MonkeyPatch, with ExitStack() as after:
output_kind: RequestOutputKind,
):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -304,14 +290,11 @@ async def test_multi_abort( ...@@ -304,14 +290,11 @@ async def test_multi_abort(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_finished_flag( async def test_finished_flag(
monkeypatch: pytest.MonkeyPatch,
n: int, n: int,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
prompt: PromptType, prompt: PromptType,
): ):
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -341,12 +324,10 @@ async def test_finished_flag( ...@@ -341,12 +324,10 @@ async def test_finished_flag(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_mid_stream_cancellation( async def test_mid_stream_cancellation(
monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType engine_args: AsyncEngineArgs, prompt: PromptType
): ):
"""Test that requests can be cancelled mid-stream.""" """Test that requests can be cancelled mid-stream."""
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch): ...@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
be added to the default loggers. be added to the default loggers.
""" """
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args( engine = AsyncLLM.from_engine_args(
TEXT_ENGINE_ARGS, TEXT_ENGINE_ARGS,
...@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch): ...@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
@pytest.mark.asyncio(scope="module") @pytest.mark.asyncio(scope="module")
async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): async def test_dp_rank_argument():
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): ...@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_check_health(monkeypatch: pytest.MonkeyPatch): async def test_check_health():
"""Test that check_health returns normally for healthy engine """Test that check_health returns normally for healthy engine
and raises EngineDeadError when the engine is dead. and raises EngineDeadError when the engine is dead.
""" """
...@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): ...@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.engine.exceptions import EngineDeadError
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): ...@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort_final_output( async def test_abort_final_output(output_kind: RequestOutputKind):
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
):
"""Test that abort() returns a final output with correct information.""" """Test that abort() returns a final output with correct information."""
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
......
...@@ -5,18 +5,11 @@ from argparse import ArgumentError ...@@ -5,18 +5,11 @@ from argparse import ArgumentError
import pytest import pytest
from vllm import envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
if not envs.VLLM_USE_V1:
pytest.skip(
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
allow_module_level=True,
)
def test_prefix_caching_from_cli(): def test_prefix_caching_from_cli():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
......
This diff is collapsed.
...@@ -130,8 +130,6 @@ def test_engine_core_client( ...@@ -130,8 +130,6 @@ def test_engine_core_client(
monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test. # Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False) m.setattr(EngineCore, "echo", echo, raising=False)
...@@ -218,8 +216,6 @@ def test_engine_core_client( ...@@ -218,8 +216,6 @@ def test_engine_core_client(
@pytest.mark.asyncio(loop_scope="function") @pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test. # Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False) m.setattr(EngineCore, "echo", echo, raising=False)
...@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return( ...@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
...@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return( ...@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
...@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures( ...@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
...@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures( ...@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
indirect=["publisher_config"], indirect=["publisher_config"],
) )
def test_kv_cache_events( def test_kv_cache_events(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool, multiprocessing_mode: bool,
publisher_config, publisher_config,
): ):
with monkeypatch.context() as m: block_size = 16
m.setenv("VLLM_USE_V1", "1") num_blocks = 2
block_size = 16
num_blocks = 2 engine_args = EngineArgs(
model=MODEL_NAME,
engine_args = EngineArgs( enforce_eager=True,
model=MODEL_NAME, enable_prefix_caching=True,
enforce_eager=True, block_size=block_size,
enable_prefix_caching=True, )
block_size=block_size, engine_args.kv_events_config = publisher_config
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config) executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client( client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode, multiprocess_mode=multiprocessing_mode,
asyncio_mode=False, asyncio_mode=False,
vllm_config=vllm_config, vllm_config=vllm_config,
executor_class=executor_class, executor_class=executor_class,
log_stats=False, log_stats=False,
)
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
) )
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
)
try: try:
custom_tokens = list(range(num_blocks * block_size)) custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
request = make_request(sampling_params, custom_tokens) request = make_request(sampling_params, custom_tokens)
client.add_request(request) client.add_request(request)
outputs: dict[str, list] = {request.request_id: []} outputs: dict[str, list] = {request.request_id: []}
loop_until_done(client, outputs) loop_until_done(client, outputs)
result = subscriber.receive_one(timeout=1000) result = subscriber.receive_one(timeout=1000)
assert result is not None, "No message received" assert result is not None, "No message received"
seq, received = result seq, received = result
assert seq == 0, "Sequence number mismatch" assert seq == 0, "Sequence number mismatch"
assert len(received.events) == 1, ( assert len(received.events) == 1, "We should have exactly one BlockStored event"
"We should have exactly one BlockStored event" event = received.events[0]
) assert isinstance(event, BlockStored), "We should have a BlockStored event"
event = received.events[0] assert len(event.block_hashes) == num_blocks, (
assert isinstance(event, BlockStored), "We should have a BlockStored event" "We should have a BlockStored event with 2 block_hashes"
assert len(event.block_hashes) == num_blocks, ( )
"We should have a BlockStored event with 2 block_hashes" assert event.block_size == block_size, (
) "Block size should be the same as the block size"
assert event.block_size == block_size, ( )
"Block size should be the same as the block size" assert event.parent_block_hash is None, "Parent block hash should be None"
) assert event.lora_id is None, "Lora id should be None"
assert event.parent_block_hash is None, "Parent block hash should be None" assert len(event.token_ids) == num_blocks * block_size, (
assert event.lora_id is None, "Lora id should be None" "Token ids should be the same as the custom tokens"
assert len(event.token_ids) == num_blocks * block_size, ( )
"Token ids should be the same as the custom tokens" assert event.token_ids == custom_tokens, (
) "Token ids should be the same as the custom tokens"
assert event.token_ids == custom_tokens, ( )
"Token ids should be the same as the custom tokens" finally:
) client.shutdown()
finally: subscriber.close()
client.shutdown()
subscriber.close()
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -672,101 +657,96 @@ def test_kv_cache_events( ...@@ -672,101 +657,96 @@ def test_kv_cache_events(
) )
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
async def test_kv_cache_events_dp( async def test_kv_cache_events_dp(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool, multiprocessing_mode: bool,
publisher_config, publisher_config,
): ):
with monkeypatch.context() as m: block_size = 16
m.setenv("VLLM_USE_V1", "1") num_blocks = 2
block_size = 16 dp_size = 2
num_blocks = 2 tp_size = 2
dp_size = 2
tp_size = 2 engine_args = EngineArgs(
model=MODEL_NAME,
engine_args = EngineArgs( enforce_eager=True,
model=MODEL_NAME, enable_prefix_caching=True,
enforce_eager=True, data_parallel_size=dp_size,
enable_prefix_caching=True, tensor_parallel_size=tp_size,
data_parallel_size=dp_size, block_size=block_size,
tensor_parallel_size=tp_size, )
block_size=block_size, engine_args.kv_events_config = publisher_config
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config) executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client( client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode, multiprocess_mode=multiprocessing_mode,
asyncio_mode=True, asyncio_mode=True,
vllm_config=vllm_config, vllm_config=vllm_config,
executor_class=executor_class, executor_class=executor_class,
log_stats=False, log_stats=False,
) )
await asyncio.sleep(1) await asyncio.sleep(1)
# Build endpoints for all DP ranks # Build endpoints for all DP ranks
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
endpoints = [] endpoints = []
for i in range(dp_size): for i in range(dp_size):
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i) offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
endpoints.append(offset_endpoint) endpoints.append(offset_endpoint)
subscriber = MockSubscriber( subscriber = MockSubscriber(
endpoints, topic=publisher_config.topic, decode_type=KVEventBatch endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
) )
try: try:
custom_tokens = list(range(num_blocks * block_size)) custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
all_request_ids = [] all_request_ids = []
# Create and add 25 requests # Create and add 25 requests
# NOTE: attempts to force routing to both dp groups but can be flaky # NOTE: attempts to force routing to both dp groups but can be flaky
for i in range(25): for i in range(25):
await asyncio.sleep(0.01) await asyncio.sleep(0.01)
request = make_request(sampling_params, custom_tokens) request = make_request(sampling_params, custom_tokens)
await client.add_request_async(request) await client.add_request_async(request)
all_request_ids.append(request.request_id) all_request_ids.append(request.request_id)
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
# Initialize outputs dict for all requests # Initialize outputs dict for all requests
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids} outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
print("processing requests...") print("processing requests...")
await asyncio.wait_for( await asyncio.wait_for(
loop_until_fully_done_async(client, outputs), timeout=20.0 loop_until_fully_done_async(client, outputs), timeout=20.0
) )
# Receive from subscriber until no more messages # Receive from subscriber until no more messages
print("collecting results...") print("collecting results...")
results = [] results = []
while True: while True:
result = subscriber.receive_one(timeout=1) result = subscriber.receive_one(timeout=1)
print(result) print(result)
if result is None: if result is None:
break break
results.append(result) results.append(result)
# Collect all events and data_parallel_ranks from all results # Collect all events and data_parallel_ranks from all results
all_dp_ranks = [received.data_parallel_rank for (_, received) in results] all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
unique_dps = set(all_dp_ranks) unique_dps = set(all_dp_ranks)
assert len(unique_dps) == 2, ( assert len(unique_dps) == 2, (
f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}" f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
) )
finally: finally:
client.shutdown() client.shutdown()
subscriber.close() subscriber.close()
@pytest.mark.timeout(20) @pytest.mark.timeout(20)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch): def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, pytest.raises(Exception) as e_info: with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch to extract core process pid while it's starting. # Monkey-patch to extract core process pid while it's starting.
core_proc_pid = [None] core_proc_pid = [None]
cepm_ctor = CoreEngineProcManager.__init__ cepm_ctor = CoreEngineProcManager.__init__
...@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat ...@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
mock_executor_class.side_effect = create_mock_executor mock_executor_class.side_effect = create_mock_executor
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices
from vllm.v1.engine.utils import EngineZmqAddresses from vllm.v1.engine.utils import EngineZmqAddresses
......
...@@ -21,12 +21,10 @@ DTYPE = "half" ...@@ -21,12 +21,10 @@ DTYPE = "half"
def _vllm_model( def _vllm_model(
apc: bool, apc: bool,
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
*, *,
skip_tokenizer_init: bool = False, skip_tokenizer_init: bool = False,
): ):
"""Set up VllmRunner instance.""" """Set up VllmRunner instance."""
monkeypatch.setenv("VLLM_USE_V1", "1")
return vllm_runner( return vllm_runner(
MODEL, MODEL,
dtype=DTYPE, dtype=DTYPE,
...@@ -45,16 +43,16 @@ def _vllm_model( ...@@ -45,16 +43,16 @@ def _vllm_model(
# Prefix caching # Prefix caching
params=[False, True], params=[False, True],
) )
def vllm_model(vllm_runner, request, monkeypatch): def vllm_model(vllm_runner, request):
"""VllmRunner test fixture parameterized by APC True/False.""" """VllmRunner test fixture parameterized by APC True/False."""
with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model: with _vllm_model(request.param, vllm_runner) as vllm_model:
yield vllm_model yield vllm_model
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def vllm_model_apc(vllm_runner, monkeypatch): def vllm_model_apc(vllm_runner):
"""VllmRunner test fixture with APC.""" """VllmRunner test fixture with APC."""
with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model: with _vllm_model(True, vllm_runner) as vllm_model:
yield vllm_model yield vllm_model
...@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch): ...@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
# Prefix caching # Prefix caching
params=[False, True], params=[False, True],
) )
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch): def vllm_model_skip_tokenizer_init(vllm_runner, request):
"""VllmRunner test fixture with APC.""" """VllmRunner test fixture with APC."""
with _vllm_model( with _vllm_model(
request.param, request.param,
vllm_runner, vllm_runner,
monkeypatch,
skip_tokenizer_init=True, skip_tokenizer_init=True,
) as vllm_model: ) as vllm_model:
yield vllm_model yield vllm_model
...@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: ...@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
) )
def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): def test_engine_metrics(vllm_runner, example_prompts):
max_tokens = 100 max_tokens = 100
# Use spec decoding to test num_accepted_tokens_per_pos # Use spec decoding to test num_accepted_tokens_per_pos
speculative_config = { speculative_config = {
...@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): ...@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
"prompt_lookup_min": 3, "prompt_lookup_min": 3,
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
} }
monkeypatch.setenv("VLLM_USE_V1", "1")
with vllm_runner( with vllm_runner(
MODEL, MODEL,
speculative_config=speculative_config, speculative_config=speculative_config,
...@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): ...@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"]) @pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch): def test_skip_tokenizer_initialization(model: str):
monkeypatch.setenv("VLLM_USE_V1", "1")
# This test checks if the flag skip_tokenizer_init skips the initialization # This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain # of tokenizer and detokenizer. The generated output is expected to contain
# token ids. # token ids.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment