"tests/vscode:/vscode.git/clone" did not exist on "245e4f2c01d19a567742fee4117badf1f6027da0"
Commit 1e4ecca1 (unverified) authored by Cyrus Leung, committed by GitHub
Browse files

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent c0a7b89d
...@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
set_seed(seed) set_seed(seed)
...@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with default backend # Run with default backend
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
set_seed(seed) set_seed(seed)
with vllm_runner( with vllm_runner(
model_name, model_name,
...@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
with vllm_runner( with vllm_runner(
model_name, model_name,
...@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
flex_outputs = llm_flex.embed(prompts) flex_outputs = llm_flex.embed(prompts)
# Run with default backend # Run with default backend
with monkeypatch.context() as m: with (
m.setenv("VLLM_USE_V1", "1") monkeypatch.context() as m,
with vllm_runner( vllm_runner(
model_name, model_name,
runner="pooling", runner="pooling",
dtype=torch.bfloat16, dtype=torch.bfloat16,
tensor_parallel_size=1, tensor_parallel_size=1,
max_model_len=100, max_model_len=100,
enforce_eager=True, enforce_eager=True,
) as llm_default: ) as llm_default,
default_outputs = llm_default.embed(prompts) ):
default_outputs = llm_default.embed(prompts)
check_embeddings_close( check_embeddings_close(
embeddings_0_lst=flex_outputs, embeddings_0_lst=flex_outputs,
......
...@@ -613,7 +613,6 @@ def test_dummy_maverick( ...@@ -613,7 +613,6 @@ def test_dummy_maverick(
profile: bool = False, profile: bool = False,
) -> None: ) -> None:
# Disable multiprocessing allows us to access model executor from LLM engine # Disable multiprocessing allows us to access model executor from LLM engine
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
model_path = create_reduced_maverick_model( model_path = create_reduced_maverick_model(
......
...@@ -8,7 +8,6 @@ if TYPE_CHECKING: ...@@ -8,7 +8,6 @@ if TYPE_CHECKING:
from vllm.config import VllmConfig from vllm.config import VllmConfig
else: else:
VllmConfig = None VllmConfig = None
from vllm import envs
class DummyPlatform(Platform): class DummyPlatform(Platform):
...@@ -19,10 +18,7 @@ class DummyPlatform(Platform): ...@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
@classmethod @classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None: def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1: vllm_config.compilation_config.custom_ops = ["all"]
compilation_config = vllm_config.compilation_config
# Activate custom ops for v1.
compilation_config.custom_ops = ["all"]
def get_attn_backend_cls( def get_attn_backend_cls(
self, self,
......
...@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler): ...@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Explicitly turn off engine multiprocessing so # Explicitly turn off engine multiprocessing so
# that the scheduler runs in this process # that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
......
...@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`. ...@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
from typing import Optional from typing import Optional
import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
@pytest.fixture(autouse=True)
def v1(monkeypatch):
"""Only run on vLLM v1."""
monkeypatch.setenv("VLLM_USE_V1", "1")
def _generate( def _generate(
llm: LLM, llm: LLM,
prompt: str, prompt: str,
......
...@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest ...@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
# 100 training iterations with a training batch size of 100. # 100 training iterations with a training batch size of 100.
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch: pytest.MonkeyPatch):
"""
Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
for all tests in this file
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
yield
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
return vllm.LLM( return vllm.LLM(
model="Qwen/Qwen2.5-3B-Instruct", model="Qwen/Qwen2.5-3B-Instruct",
......
...@@ -305,7 +305,6 @@ full_cg_backend_configs = { ...@@ -305,7 +305,6 @@ full_cg_backend_configs = {
"CutlassMLA": BackendConfig( "CutlassMLA": BackendConfig(
name="CutlassMLA", name="CutlassMLA",
env_vars={ env_vars={
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
"FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed "FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed
}, },
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch import torch
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
pytestmark = pytest.mark.cpu_test
def new_kv_cache_spec(): def new_kv_cache_spec():
return FullAttentionSpec(16, 1, 1, torch.float32, False) return FullAttentionSpec(16, 1, 1, torch.float32, False)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
from vllm import LLM from vllm import LLM
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B" MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I" PROMPT = "Hello my name is Robert and I"
......
...@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ...@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
): ):
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA") pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack: with temporary_environ(env_vars), ExitStack() as stack:
if not supported: if not supported:
...@@ -117,7 +117,7 @@ combo_cases_2 = [ ...@@ -117,7 +117,7 @@ combo_cases_2 = [
def test_cudagraph_compilation_combo(combo_case): def test_cudagraph_compilation_combo(combo_case):
backend_name, cudagraph_mode, compilation_level, supported = combo_case backend_name, cudagraph_mode, compilation_level, supported = combo_case
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack: with temporary_environ(env_vars), ExitStack() as stack:
if not supported: if not supported:
......
...@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend): ...@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
) )
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
......
...@@ -32,7 +32,7 @@ model_config = { ...@@ -32,7 +32,7 @@ model_config = {
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False]) @pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
def test_sliding_window_retrieval( def test_sliding_window_retrieval(
monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager model, batch_size, seed, disable_hybrid_kv_cache_manager
): ):
""" """
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
...@@ -40,39 +40,34 @@ def test_sliding_window_retrieval( ...@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
If we tell it upfront which we are going to be looking for, then If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly). it answers correctly (mostly).
""" """
with monkeypatch.context() as m: test_config = model_config[model]
m.setenv("VLLM_USE_V1", "1")
llm = LLM(
test_config = model_config[model] model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
)
llm = LLM( sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
) prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
check_length(prompts, llm, test_config.sliding_window)
prompts, answer, indices = prep_prompts(
batch_size, ln_range=test_config.ln_range # Fresh generation
) responses = llm.generate(prompts, sampling_params)
check_answers(
check_length(prompts, llm, test_config.sliding_window) indices,
answer,
# Fresh generation [response.outputs[0].text for response in responses],
responses = llm.generate(prompts, sampling_params) accept_rate=1.0,
check_answers( )
indices,
answer, # Re-generate with the same prompts to test prefix caching
[response.outputs[0].text for response in responses], responses = llm.generate(prompts, sampling_params)
accept_rate=1.0, check_answers(
) indices,
answer,
# Re-generate with the same prompts to test prefix caching [response.outputs[0].text for response in responses],
responses = llm.generate(prompts, sampling_params) accept_rate=1.0,
check_answers( )
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
def check_length(prompts: list[str], llm: LLM, sliding_window: int): def check_length(prompts: list[str], llm: LLM, sliding_window: int):
......
...@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill( ...@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
) )
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Make scheduling deterministic for reproducibility # Make scheduling deterministic for reproducibility
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
......
...@@ -13,7 +13,6 @@ Covers: ...@@ -13,7 +13,6 @@ Covers:
5) Multiple stop conditions 5) Multiple stop conditions
""" """
import os
from typing import Optional, Union from typing import Optional, Union
import pytest import pytest
...@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [ ...@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm_v1(): def llm_v1():
"""Create V1 LLM instance for testing""" """Create V1 LLM instance for testing"""
# Ensure V1 engine is used
os.environ["VLLM_USE_V1"] = "1"
llm = LLM( llm = LLM(
model=TEST_MODEL, model=TEST_MODEL,
tensor_parallel_size=1, tensor_parallel_size=1,
...@@ -503,6 +499,6 @@ if __name__ == "__main__": ...@@ -503,6 +499,6 @@ if __name__ == "__main__":
Usage: Usage:
cd vllm/ cd vllm/
VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v python -m pytest tests/v1/e2e/test_min_tokens.py -v
""" """
pytest.main([__file__, "-v"]) pytest.main([__file__, "-v"])
...@@ -301,7 +301,6 @@ def test_mtp_correctness( ...@@ -301,7 +301,6 @@ def test_mtp_correctness(
model_setup: (method, model_name, tp_size) model_setup: (method, model_name, tp_size)
""" """
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_MLA_DISABLE", "1") m.setenv("VLLM_MLA_DISABLE", "1")
method, model_name, tp_size = model_setup method, model_name, tp_size = model_setup
......
...@@ -95,17 +95,11 @@ async def generate( ...@@ -95,17 +95,11 @@ async def generate(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load( async def test_load(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind, output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
prompt: PromptType, prompt: PromptType,
): ):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1 with ExitStack() as after:
# so that in the future when we switch, we don't have to change all the
# tests.
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -149,14 +143,11 @@ async def test_load( ...@@ -149,14 +143,11 @@ async def test_load(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort( async def test_abort(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind, output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
prompt: PromptType, prompt: PromptType,
): ):
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -222,13 +213,8 @@ async def test_abort( ...@@ -222,13 +213,8 @@ async def test_abort(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_multi_abort( async def test_multi_abort(output_kind: RequestOutputKind):
monkeypatch: pytest.MonkeyPatch, with ExitStack() as after:
output_kind: RequestOutputKind,
):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -304,14 +290,11 @@ async def test_multi_abort( ...@@ -304,14 +290,11 @@ async def test_multi_abort(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_finished_flag( async def test_finished_flag(
monkeypatch: pytest.MonkeyPatch,
n: int, n: int,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
prompt: PromptType, prompt: PromptType,
): ):
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -341,12 +324,10 @@ async def test_finished_flag( ...@@ -341,12 +324,10 @@ async def test_finished_flag(
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_mid_stream_cancellation( async def test_mid_stream_cancellation(
monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType engine_args: AsyncEngineArgs, prompt: PromptType
): ):
"""Test that requests can be cancelled mid-stream.""" """Test that requests can be cancelled mid-stream."""
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args) engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch): ...@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
be added to the default loggers. be added to the default loggers.
""" """
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args( engine = AsyncLLM.from_engine_args(
TEXT_ENGINE_ARGS, TEXT_ENGINE_ARGS,
...@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch): ...@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
@pytest.mark.asyncio(scope="module") @pytest.mark.asyncio(scope="module")
async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): async def test_dp_rank_argument():
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): ...@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_check_health(monkeypatch: pytest.MonkeyPatch): async def test_check_health():
"""Test that check_health returns normally for healthy engine """Test that check_health returns normally for healthy engine
and raises EngineDeadError when the engine is dead. and raises EngineDeadError when the engine is dead.
""" """
...@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): ...@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.engine.exceptions import EngineDeadError
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
...@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): ...@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
) )
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort_final_output( async def test_abort_final_output(output_kind: RequestOutputKind):
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
):
"""Test that abort() returns a final output with correct information.""" """Test that abort() returns a final output with correct information."""
with monkeypatch.context() as m, ExitStack() as after: with ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
......
...@@ -5,18 +5,11 @@ from argparse import ArgumentError ...@@ -5,18 +5,11 @@ from argparse import ArgumentError
import pytest import pytest
from vllm import envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
if not envs.VLLM_USE_V1:
pytest.skip(
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
allow_module_level=True,
)
def test_prefix_caching_from_cli(): def test_prefix_caching_from_cli():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
......
...@@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest: ...@@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest:
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core(monkeypatch: pytest.MonkeyPatch): def test_engine_core():
with monkeypatch.context() as m: """Setup the EngineCore."""
m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(model=MODEL_NAME)
"""Setup the EngineCore.""" vllm_config = engine_args.create_engine_config()
engine_args = EngineArgs(model=MODEL_NAME) executor_class = Executor.get_class(vllm_config)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config) with set_default_torch_num_threads(1):
engine_core = EngineCore(
with set_default_torch_num_threads(1): vllm_config=vllm_config, executor_class=executor_class, log_stats=True
engine_core = EngineCore( )
vllm_config=vllm_config, executor_class=executor_class, log_stats=True """Test basic request lifecycle."""
)
"""Test basic request lifecycle.""" # First request.
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
# First request. assert len(engine_core.scheduler.waiting) == 1
engine_core.add_request(*engine_core.preprocess_add_request(make_request())) assert len(engine_core.scheduler.running) == 0
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0 _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
_ = engine_core.step() assert len(engine_core.scheduler.running) == 1
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 1 # Second request.
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
# Second request. assert len(engine_core.scheduler.waiting) == 1
engine_core.add_request(*engine_core.preprocess_add_request(make_request())) assert len(engine_core.scheduler.running) == 1
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 1 _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
_ = engine_core.step() assert len(engine_core.scheduler.running) == 2
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2 # Add two requests in a row.
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
# Add two requests in a row. engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
engine_core.add_request(*engine_core.preprocess_add_request(make_request())) assert len(engine_core.scheduler.waiting) == 2
engine_core.add_request(*engine_core.preprocess_add_request(make_request())) assert len(engine_core.scheduler.running) == 2
assert len(engine_core.scheduler.waiting) == 2
assert len(engine_core.scheduler.running) == 2 _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
_ = engine_core.step() assert len(engine_core.scheduler.running) == 4
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 4 # Loop through until they are all done.
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
# Loop through until they are all done. pass
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
"""Test abort cycle."""
# Basic abort.
req = make_request()
request_id = req.request_id
engine_core.add_request(*engine_core.preprocess_add_request(req))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
assert engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 1
assert engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
engine_core.abort_requests([request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
assert not engine_core.scheduler.has_unfinished_requests()
assert engine_core.scheduler.has_finished_requests()
_ = engine_core.step()
assert not engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
# Add, step, abort 1 of the 3.
req0 = make_request()
req1 = make_request()
req2 = make_request()
engine_core.add_request(*engine_core.preprocess_add_request(req0))
engine_core.add_request(*engine_core.preprocess_add_request(req1))
assert len(engine_core.scheduler.waiting) == 2
assert len(engine_core.scheduler.running) == 0
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
engine_core.add_request(*engine_core.preprocess_add_request(req2))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 3
# Abort just one.
engine_core.abort_requests([req1.request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
# Abort the other requests at the same time.
engine_core.abort_requests([req2.request_id, req0.request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
# Sending duplicate requests with same request_id
req0 = make_request()
req1 = make_request()
req0.request_id = req1.request_id = "test"
engine_core.add_request(*engine_core.preprocess_add_request(req0))
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
engine_core.add_request(*engine_core.preprocess_add_request(req1))
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
"""Test abort cycle."""
# Basic abort. @create_new_process_for_each_test()
req = make_request() def test_engine_core_advanced_sampling():
request_id = req.request_id """
A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and
presence_penalty, are set.
"""
"""Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
"""Test basic request lifecycle."""
# First request.
request: EngineCoreRequest = make_request()
request.sampling_params = SamplingParams(
min_tokens=4,
presence_penalty=1.0,
frequency_penalty=1.0,
repetition_penalty=0.1,
stop_token_ids=[1001, 1002],
)
engine_core.add_request(*engine_core.preprocess_add_request(request))
engine_core.add_request(*engine_core.preprocess_add_request(req)) def _check_engine_state():
assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
assert engine_core.scheduler.has_unfinished_requests() # Loop through until they are all done.
assert not engine_core.scheduler.has_finished_requests()
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 1
assert engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
engine_core.abort_requests([request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
assert not engine_core.scheduler.has_unfinished_requests()
assert engine_core.scheduler.has_finished_requests()
_ = engine_core.step()
assert not engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
# Add, step, abort 1 of the 3.
req0 = make_request()
req1 = make_request()
req2 = make_request()
engine_core.add_request(*engine_core.preprocess_add_request(req0))
engine_core.add_request(*engine_core.preprocess_add_request(req1))
assert len(engine_core.scheduler.waiting) == 2
assert len(engine_core.scheduler.running) == 0
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
engine_core.add_request(*engine_core.preprocess_add_request(req2))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 3
# Abort just one.
engine_core.abort_requests([req1.request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
# Abort the other requests at the same time.
engine_core.abort_requests([req2.request_id, req0.request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
# Sending duplicate requests with same request_id
req0 = make_request()
req1 = make_request()
req0.request_id = req1.request_id = "test"
engine_core.add_request(*engine_core.preprocess_add_request(req0))
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
engine_core.add_request(*engine_core.preprocess_add_request(req1))
while (outs := engine_core.step()[0].get(0)) and outs.outputs: while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass pass
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
_check_engine_state()
@create_new_process_for_each_test() # Second request.
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): request2 = make_request()
""" request2.sampling_params = SamplingParams(
A basic end-to-end test to verify that the engine functions correctly top_p=0.99,
when additional sampling parameters, such as top_p, min_tokens, and top_k=50,
presence_penalty, are set. )
""" engine_core.add_request(*engine_core.preprocess_add_request(request2))
with monkeypatch.context() as m: _check_engine_state()
m.setenv("VLLM_USE_V1", "1")
"""Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
"""Test basic request lifecycle."""
# First request.
request: EngineCoreRequest = make_request()
request.sampling_params = SamplingParams(
min_tokens=4,
presence_penalty=1.0,
frequency_penalty=1.0,
repetition_penalty=0.1,
stop_token_ids=[1001, 1002],
)
engine_core.add_request(*engine_core.preprocess_add_request(request))
def _check_engine_state():
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
# Loop through until they are all done.
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
_check_engine_state()
# Second request.
request2 = make_request()
request2.sampling_params = SamplingParams(
top_p=0.99,
top_k=50,
)
engine_core.add_request(*engine_core.preprocess_add_request(request2))
_check_engine_state()
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): def test_engine_core_concurrent_batches():
""" """
Test that the engine can handle multiple concurrent batches. Test that the engine can handle multiple concurrent batches.
""" """
...@@ -272,173 +268,163 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): ...@@ -272,173 +268,163 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
if hasattr(self, "thread_pool"): if hasattr(self, "thread_pool"):
self.thread_pool.shutdown(wait=False) self.thread_pool.shutdown(wait=False)
with monkeypatch.context() as m: engine_args = EngineArgs(
m.setenv("VLLM_USE_V1", "1") model=MODEL_NAME,
# To test concurrent batches.
engine_args = EngineArgs( max_num_seqs=2,
model=MODEL_NAME, # Avoid all requests being scheduled once.
# To test concurrent batches. enable_prefix_caching=False,
max_num_seqs=2, max_num_batched_tokens=10,
# Avoid all requests being scheduled once. # Reduce startup time.
enable_prefix_caching=False, enforce_eager=True,
max_num_batched_tokens=10, )
# Reduce startup time. vllm_config = engine_args.create_engine_config()
enforce_eager=True, with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
) )
vllm_config = engine_args.create_engine_config() assert engine_core.batch_queue is not None
with set_default_torch_num_threads(1):
engine_core = EngineCore( # Add two requests in a row. Each request have 12 prompt tokens.
vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor req0 = make_request_with_max_tokens("0", 5)
) engine_core.add_request(*engine_core.preprocess_add_request(req0))
assert engine_core.batch_queue is not None req1 = make_request_with_max_tokens("1", 5)
engine_core.add_request(*engine_core.preprocess_add_request(req1))
# Add two requests in a row. Each request have 12 prompt tokens.
req0 = make_request_with_max_tokens("0", 5) # Schedule Batch 1: (10, req0)
engine_core.add_request(*engine_core.preprocess_add_request(req0)) assert engine_core.step_with_batch_queue()[0] is None
req1 = make_request_with_max_tokens("1", 5) assert len(engine_core.batch_queue) == 1
engine_core.add_request(*engine_core.preprocess_add_request(req1)) scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 10
# Schedule Batch 1: (10, req0) # num_computed_tokens should have been updated immediately.
assert engine_core.step_with_batch_queue()[0] is None assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1] # Schedule Batch 2: (2, req0), (8, req1)
assert scheduler_output.num_scheduled_tokens["0"] == 10 assert engine_core.step_with_batch_queue()[0] == {}
# num_computed_tokens should have been updated immediately. assert len(engine_core.batch_queue) == 1
assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10 scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 2
# Schedule Batch 2: (2, req0), (8, req1) assert scheduler_output.num_scheduled_tokens["1"] == 8
assert engine_core.step_with_batch_queue()[0] == {} # num_computed_tokens should have been updated immediately.
assert len(engine_core.batch_queue) == 1 assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
scheduler_output = engine_core.batch_queue[-1][1] assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
assert scheduler_output.num_scheduled_tokens["0"] == 2
assert scheduler_output.num_scheduled_tokens["1"] == 8 assert engine_core.scheduler.get_num_unfinished_requests() == 2
# num_computed_tokens should have been updated immediately.
assert engine_core.scheduler.requests["0"].num_computed_tokens == 12 # Finish Batch 1 and schedule Batch 3: (4, req1).
assert engine_core.scheduler.requests["1"].num_computed_tokens == 8 # Note that req0 cannot be scheduled
# because it is in the decoding stage now.
assert engine_core.scheduler.get_num_unfinished_requests() == 2 engine_core.step_with_batch_queue()
assert len(engine_core.batch_queue) == 1
# Finish Batch 1 and schedule Batch 3: (4, req1). scheduler_output = engine_core.batch_queue[-1][1]
# Note that req0 cannot be scheduled assert scheduler_output.num_scheduled_tokens["1"] == 4
# because it is in the decoding stage now.
engine_core.step_with_batch_queue() # Finish Batch 2. Get first token of req0.
assert len(engine_core.batch_queue) == 1 # Schedule Batch 4: (1, req0).
scheduler_output = engine_core.batch_queue[-1][1] output = engine_core.step_with_batch_queue()[0].get(0)
assert scheduler_output.num_scheduled_tokens["1"] == 4 assert output is not None
assert len(output.outputs) == 1
# Finish Batch 2. Get first token of req0. assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
# Schedule Batch 4: (1, req0). scheduler_output = engine_core.batch_queue[-1][1]
output = engine_core.step_with_batch_queue()[0].get(0) assert scheduler_output.num_scheduled_tokens["0"] == 1
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 1
# Loop until req0 is finished.
req_id = 0
expected_num_tokens = [
engine_core.scheduler.requests["0"].num_tokens + 1,
engine_core.scheduler.requests["1"].num_tokens + 1,
]
while engine_core.scheduler.get_num_unfinished_requests() == 2:
output = engine_core.step_with_batch_queue()[0]
# Every step consumes an output.
assert output is not None assert output is not None
assert len(output.outputs) == 1 assert len(output[0].outputs) == 1
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 if req_id in engine_core.scheduler.requests:
scheduler_output = engine_core.batch_queue[-1][1] assert (
assert scheduler_output.num_scheduled_tokens["0"] == 1 engine_core.scheduler.requests[req_id].num_tokens
== expected_num_tokens[req_id]
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1). )
output = engine_core.step_with_batch_queue()[0].get(0) expected_num_tokens[req_id] += 1
assert output is not None req_id = (req_id + 1) % 2
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 1
# Loop until req0 is finished.
req_id = 0
expected_num_tokens = [
engine_core.scheduler.requests["0"].num_tokens + 1,
engine_core.scheduler.requests["1"].num_tokens + 1,
]
while engine_core.scheduler.get_num_unfinished_requests() == 2:
output = engine_core.step_with_batch_queue()[0]
# Every step consumes an output.
assert output is not None
assert len(output[0].outputs) == 1
if req_id in engine_core.scheduler.requests:
assert (
engine_core.scheduler.requests[req_id].num_tokens
== expected_num_tokens[req_id]
)
expected_num_tokens[req_id] += 1
req_id = (req_id + 1) % 2
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch): def test_engine_core_tp():
""" """
Test engine can initialize worker in tp properly Test engine can initialize worker in tp properly
""" """
with monkeypatch.context() as m: """Setup the EngineCore."""
m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(
"""Setup the EngineCore.""" model=MODEL_NAME,
engine_args = EngineArgs( tensor_parallel_size=2,
model=MODEL_NAME, # Reduce startup time.
tensor_parallel_size=2, enforce_eager=True,
# Reduce startup time. )
enforce_eager=True, vllm_config = engine_args.create_engine_config()
) executor_class = Executor.get_class(vllm_config)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine_core = EngineCore( engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True vllm_config=vllm_config, executor_class=executor_class, log_stats=True
) )
def get_worker_cache_config_field(worker, key: str): def get_worker_cache_config_field(worker, key: str):
return getattr(worker.cache_config, key) return getattr(worker.cache_config, key)
num_gpu_blocks = engine_core.collective_rpc( num_gpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_gpu_blocks",) get_worker_cache_config_field, args=("num_gpu_blocks",)
) )
num_cpu_blocks = engine_core.collective_rpc( num_cpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_cpu_blocks",) get_worker_cache_config_field, args=("num_cpu_blocks",)
) )
assert all(x is not None for x in num_gpu_blocks) assert all(x is not None for x in num_gpu_blocks)
assert all(x is not None for x in num_cpu_blocks) assert all(x is not None for x in num_cpu_blocks)
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): def test_engine_core_invalid_request_id_type():
"""Test that engine raises TypeError for non-string request_id.""" """Test that engine raises TypeError for non-string request_id."""
with monkeypatch.context() as m: engine_args = EngineArgs(model=MODEL_NAME)
m.setenv("VLLM_USE_V1", "1") vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
engine_args = EngineArgs(model=MODEL_NAME) with set_default_torch_num_threads(1):
vllm_config = engine_args.create_engine_config() engine_core = EngineCore(
executor_class = Executor.get_class(vllm_config) vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
# Test with UUID object (common mistake) # Test with UUID object (common mistake)
uuid_request = make_request() uuid_request = make_request()
uuid_request.request_id = uuid.uuid4() # UUID object instead of string uuid_request.request_id = uuid.uuid4() # UUID object instead of string
with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"): with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
engine_core.add_request(*engine_core.preprocess_add_request(uuid_request)) engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
# Test with integer # Test with integer
int_request = make_request() int_request = make_request()
int_request.request_id = 12345 int_request.request_id = 12345
with pytest.raises(TypeError, match="request_id must be a string, got.*int"): with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
engine_core.add_request(*engine_core.preprocess_add_request(int_request)) engine_core.add_request(*engine_core.preprocess_add_request(int_request))
# Test with None # Test with None
none_request = make_request() none_request = make_request()
none_request.request_id = None none_request.request_id = None
with pytest.raises( with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"):
TypeError, match="request_id must be a string, got.*NoneType" engine_core.add_request(*engine_core.preprocess_add_request(none_request))
):
engine_core.add_request(*engine_core.preprocess_add_request(none_request))
# Verify engine is still functional after errors # Verify engine is still functional after errors
valid_request = make_request() valid_request = make_request()
engine_core.add_request(*engine_core.preprocess_add_request(valid_request)) engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
...@@ -130,8 +130,6 @@ def test_engine_core_client( ...@@ -130,8 +130,6 @@ def test_engine_core_client(
monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test. # Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False) m.setattr(EngineCore, "echo", echo, raising=False)
...@@ -218,8 +216,6 @@ def test_engine_core_client( ...@@ -218,8 +216,6 @@ def test_engine_core_client(
@pytest.mark.asyncio(loop_scope="function") @pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test. # Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False) m.setattr(EngineCore, "echo", echo, raising=False)
...@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return( ...@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
...@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return( ...@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
...@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures( ...@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
...@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures( ...@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
indirect=["publisher_config"], indirect=["publisher_config"],
) )
def test_kv_cache_events( def test_kv_cache_events(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool, multiprocessing_mode: bool,
publisher_config, publisher_config,
): ):
with monkeypatch.context() as m: block_size = 16
m.setenv("VLLM_USE_V1", "1") num_blocks = 2
block_size = 16
num_blocks = 2 engine_args = EngineArgs(
model=MODEL_NAME,
engine_args = EngineArgs( enforce_eager=True,
model=MODEL_NAME, enable_prefix_caching=True,
enforce_eager=True, block_size=block_size,
enable_prefix_caching=True, )
block_size=block_size, engine_args.kv_events_config = publisher_config
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config) executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client( client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode, multiprocess_mode=multiprocessing_mode,
asyncio_mode=False, asyncio_mode=False,
vllm_config=vllm_config, vllm_config=vllm_config,
executor_class=executor_class, executor_class=executor_class,
log_stats=False, log_stats=False,
)
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
) )
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
)
try: try:
custom_tokens = list(range(num_blocks * block_size)) custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
request = make_request(sampling_params, custom_tokens) request = make_request(sampling_params, custom_tokens)
client.add_request(request) client.add_request(request)
outputs: dict[str, list] = {request.request_id: []} outputs: dict[str, list] = {request.request_id: []}
loop_until_done(client, outputs) loop_until_done(client, outputs)
result = subscriber.receive_one(timeout=1000) result = subscriber.receive_one(timeout=1000)
assert result is not None, "No message received" assert result is not None, "No message received"
seq, received = result seq, received = result
assert seq == 0, "Sequence number mismatch" assert seq == 0, "Sequence number mismatch"
assert len(received.events) == 1, ( assert len(received.events) == 1, "We should have exactly one BlockStored event"
"We should have exactly one BlockStored event" event = received.events[0]
) assert isinstance(event, BlockStored), "We should have a BlockStored event"
event = received.events[0] assert len(event.block_hashes) == num_blocks, (
assert isinstance(event, BlockStored), "We should have a BlockStored event" "We should have a BlockStored event with 2 block_hashes"
assert len(event.block_hashes) == num_blocks, ( )
"We should have a BlockStored event with 2 block_hashes" assert event.block_size == block_size, (
) "Block size should be the same as the block size"
assert event.block_size == block_size, ( )
"Block size should be the same as the block size" assert event.parent_block_hash is None, "Parent block hash should be None"
) assert event.lora_id is None, "Lora id should be None"
assert event.parent_block_hash is None, "Parent block hash should be None" assert len(event.token_ids) == num_blocks * block_size, (
assert event.lora_id is None, "Lora id should be None" "Token ids should be the same as the custom tokens"
assert len(event.token_ids) == num_blocks * block_size, ( )
"Token ids should be the same as the custom tokens" assert event.token_ids == custom_tokens, (
) "Token ids should be the same as the custom tokens"
assert event.token_ids == custom_tokens, ( )
"Token ids should be the same as the custom tokens" finally:
) client.shutdown()
finally: subscriber.close()
client.shutdown()
subscriber.close()
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -672,101 +657,96 @@ def test_kv_cache_events( ...@@ -672,101 +657,96 @@ def test_kv_cache_events(
) )
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
async def test_kv_cache_events_dp( async def test_kv_cache_events_dp(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool, multiprocessing_mode: bool,
publisher_config, publisher_config,
): ):
with monkeypatch.context() as m: block_size = 16
m.setenv("VLLM_USE_V1", "1") num_blocks = 2
block_size = 16 dp_size = 2
num_blocks = 2 tp_size = 2
dp_size = 2
tp_size = 2 engine_args = EngineArgs(
model=MODEL_NAME,
engine_args = EngineArgs( enforce_eager=True,
model=MODEL_NAME, enable_prefix_caching=True,
enforce_eager=True, data_parallel_size=dp_size,
enable_prefix_caching=True, tensor_parallel_size=tp_size,
data_parallel_size=dp_size, block_size=block_size,
tensor_parallel_size=tp_size, )
block_size=block_size, engine_args.kv_events_config = publisher_config
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config) executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client( client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode, multiprocess_mode=multiprocessing_mode,
asyncio_mode=True, asyncio_mode=True,
vllm_config=vllm_config, vllm_config=vllm_config,
executor_class=executor_class, executor_class=executor_class,
log_stats=False, log_stats=False,
) )
await asyncio.sleep(1) await asyncio.sleep(1)
# Build endpoints for all DP ranks # Build endpoints for all DP ranks
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
endpoints = [] endpoints = []
for i in range(dp_size): for i in range(dp_size):
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i) offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
endpoints.append(offset_endpoint) endpoints.append(offset_endpoint)
subscriber = MockSubscriber( subscriber = MockSubscriber(
endpoints, topic=publisher_config.topic, decode_type=KVEventBatch endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
) )
try: try:
custom_tokens = list(range(num_blocks * block_size)) custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
all_request_ids = [] all_request_ids = []
# Create and add 25 requests # Create and add 25 requests
# NOTE: attempts to force routing to both dp groups but can be flaky # NOTE: attempts to force routing to both dp groups but can be flaky
for i in range(25): for i in range(25):
await asyncio.sleep(0.01) await asyncio.sleep(0.01)
request = make_request(sampling_params, custom_tokens) request = make_request(sampling_params, custom_tokens)
await client.add_request_async(request) await client.add_request_async(request)
all_request_ids.append(request.request_id) all_request_ids.append(request.request_id)
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
# Initialize outputs dict for all requests # Initialize outputs dict for all requests
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids} outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
print("processing requests...") print("processing requests...")
await asyncio.wait_for( await asyncio.wait_for(
loop_until_fully_done_async(client, outputs), timeout=20.0 loop_until_fully_done_async(client, outputs), timeout=20.0
) )
# Receive from subscriber until no more messages # Receive from subscriber until no more messages
print("collecting results...") print("collecting results...")
results = [] results = []
while True: while True:
result = subscriber.receive_one(timeout=1) result = subscriber.receive_one(timeout=1)
print(result) print(result)
if result is None: if result is None:
break break
results.append(result) results.append(result)
# Collect all events and data_parallel_ranks from all results # Collect all events and data_parallel_ranks from all results
all_dp_ranks = [received.data_parallel_rank for (_, received) in results] all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
unique_dps = set(all_dp_ranks) unique_dps = set(all_dp_ranks)
assert len(unique_dps) == 2, ( assert len(unique_dps) == 2, (
f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}" f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
) )
finally: finally:
client.shutdown() client.shutdown()
subscriber.close() subscriber.close()
@pytest.mark.timeout(20) @pytest.mark.timeout(20)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch): def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, pytest.raises(Exception) as e_info: with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch to extract core process pid while it's starting. # Monkey-patch to extract core process pid while it's starting.
core_proc_pid = [None] core_proc_pid = [None]
cepm_ctor = CoreEngineProcManager.__init__ cepm_ctor = CoreEngineProcManager.__init__
...@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat ...@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
mock_executor_class.side_effect = create_mock_executor mock_executor_class.side_effect = create_mock_executor
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices
from vllm.v1.engine.utils import EngineZmqAddresses from vllm.v1.engine.utils import EngineZmqAddresses
......
...@@ -21,12 +21,10 @@ DTYPE = "half" ...@@ -21,12 +21,10 @@ DTYPE = "half"
def _vllm_model( def _vllm_model(
apc: bool, apc: bool,
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
*, *,
skip_tokenizer_init: bool = False, skip_tokenizer_init: bool = False,
): ):
"""Set up VllmRunner instance.""" """Set up VllmRunner instance."""
monkeypatch.setenv("VLLM_USE_V1", "1")
return vllm_runner( return vllm_runner(
MODEL, MODEL,
dtype=DTYPE, dtype=DTYPE,
...@@ -45,16 +43,16 @@ def _vllm_model( ...@@ -45,16 +43,16 @@ def _vllm_model(
# Prefix caching # Prefix caching
params=[False, True], params=[False, True],
) )
def vllm_model(vllm_runner, request, monkeypatch): def vllm_model(vllm_runner, request):
"""VllmRunner test fixture parameterized by APC True/False.""" """VllmRunner test fixture parameterized by APC True/False."""
with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model: with _vllm_model(request.param, vllm_runner) as vllm_model:
yield vllm_model yield vllm_model
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def vllm_model_apc(vllm_runner, monkeypatch): def vllm_model_apc(vllm_runner):
"""VllmRunner test fixture with APC.""" """VllmRunner test fixture with APC."""
with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model: with _vllm_model(True, vllm_runner) as vllm_model:
yield vllm_model yield vllm_model
...@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch): ...@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
# Prefix caching # Prefix caching
params=[False, True], params=[False, True],
) )
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch): def vllm_model_skip_tokenizer_init(vllm_runner, request):
"""VllmRunner test fixture with APC.""" """VllmRunner test fixture with APC."""
with _vllm_model( with _vllm_model(
request.param, request.param,
vllm_runner, vllm_runner,
monkeypatch,
skip_tokenizer_init=True, skip_tokenizer_init=True,
) as vllm_model: ) as vllm_model:
yield vllm_model yield vllm_model
...@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: ...@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
) )
def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): def test_engine_metrics(vllm_runner, example_prompts):
max_tokens = 100 max_tokens = 100
# Use spec decoding to test num_accepted_tokens_per_pos # Use spec decoding to test num_accepted_tokens_per_pos
speculative_config = { speculative_config = {
...@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): ...@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
"prompt_lookup_min": 3, "prompt_lookup_min": 3,
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
} }
monkeypatch.setenv("VLLM_USE_V1", "1")
with vllm_runner( with vllm_runner(
MODEL, MODEL,
speculative_config=speculative_config, speculative_config=speculative_config,
...@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): ...@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"]) @pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch): def test_skip_tokenizer_initialization(model: str):
monkeypatch.setenv("VLLM_USE_V1", "1")
# This test checks if the flag skip_tokenizer_init skips the initialization # This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain # of tokenizer and detokenizer. The generated output is expected to contain
# token ids. # token ids.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment