Commit 1e4ecca1 (unverified), authored by Cyrus Leung, committed by GitHub
Browse files

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent c0a7b89d
......@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
set_seed(seed)
......@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with default backend
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
set_seed(seed)
with vllm_runner(
model_name,
......@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
with vllm_runner(
model_name,
......@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
flex_outputs = llm_flex.embed(prompts)
# Run with default backend
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner(
with (
monkeypatch.context() as m,
vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_default:
default_outputs = llm_default.embed(prompts)
) as llm_default,
):
default_outputs = llm_default.embed(prompts)
check_embeddings_close(
embeddings_0_lst=flex_outputs,
......
......@@ -613,7 +613,6 @@ def test_dummy_maverick(
profile: bool = False,
) -> None:
# Disable multiprocessing allows us to access model executor from LLM engine
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
model_path = create_reduced_maverick_model(
......
......@@ -8,7 +8,6 @@ if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None
from vllm import envs
class DummyPlatform(Platform):
......@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1:
compilation_config = vllm_config.compilation_config
# Activate custom ops for v1.
compilation_config.custom_ops = ["all"]
vllm_config.compilation_config.custom_ops = ["all"]
def get_attn_backend_cls(
self,
......
......@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Explicitly turn off engine multiprocessing so
# that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
......
......@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
from typing import Optional
import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
@pytest.fixture(autouse=True)
def v1(monkeypatch):
    """Pin the tests in this module to vLLM v1.

    Autouse fixture: sets the ``VLLM_USE_V1`` environment variable to
    ``"1"`` through pytest's ``monkeypatch`` fixture.
    """
    env_name, env_value = "VLLM_USE_V1", "1"
    monkeypatch.setenv(env_name, env_value)
def _generate(
llm: LLM,
prompt: str,
......
......@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
# 100 training iterations with a training batch size of 100.
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch: pytest.MonkeyPatch):
    """Run each test in this file with ``VLLM_USE_V1=1``.

    Multi-LoRA is only supported on the v1 TPU backend, so the variable is
    set inside a ``monkeypatch`` context for the duration of the test.
    """
    # NOTE(review): the yield is kept inside the context so the variable
    # stays set while the test body runs — confirm against the original
    # indentation, which is not visible here.
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_USE_V1", "1")
        yield
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
return vllm.LLM(
model="Qwen/Qwen2.5-3B-Instruct",
......
......@@ -305,7 +305,6 @@ full_cg_backend_configs = {
"CutlassMLA": BackendConfig(
name="CutlassMLA",
env_vars={
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
"FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed
},
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
pytestmark = pytest.mark.cpu_test
def new_kv_cache_spec():
    """Return a fresh ``FullAttentionSpec`` built from fixed test values.

    The positional arguments are forwarded unchanged; their meaning is
    defined by ``FullAttentionSpec`` itself (not visible in this file).
    """
    spec_args = (16, 1, 1, torch.float32, False)
    return FullAttentionSpec(*spec_args)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from vllm import LLM
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I"
......
......@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
):
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack:
if not supported:
......@@ -117,7 +117,7 @@ combo_cases_2 = [
def test_cudagraph_compilation_combo(combo_case):
backend_name, cudagraph_mode, compilation_level, supported = combo_case
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack:
if not supported:
......
......@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
......
......@@ -32,7 +32,7 @@ model_config = {
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
def test_sliding_window_retrieval(
monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager
model, batch_size, seed, disable_hybrid_kv_cache_manager
):
"""
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
......@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly).
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
test_config = model_config[model]
llm = LLM(
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
prompts, answer, indices = prep_prompts(
batch_size, ln_range=test_config.ln_range
)
check_length(prompts, llm, test_config.sliding_window)
# Fresh generation
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
# Re-generate with the same prompts to test prefix caching
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
test_config = model_config[model]
llm = LLM(
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
check_length(prompts, llm, test_config.sliding_window)
# Fresh generation
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
# Re-generate with the same prompts to test prefix caching
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
def check_length(prompts: list[str], llm: LLM, sliding_window: int):
......
......@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Make scheduling deterministic for reproducibility
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
......
......@@ -13,7 +13,6 @@ Covers:
5) Multiple stop conditions
"""
import os
from typing import Optional, Union
import pytest
......@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
@pytest.fixture(scope="module")
def llm_v1():
"""Create V1 LLM instance for testing"""
# Ensure V1 engine is used
os.environ["VLLM_USE_V1"] = "1"
llm = LLM(
model=TEST_MODEL,
tensor_parallel_size=1,
......@@ -503,6 +499,6 @@ if __name__ == "__main__":
Usage:
cd vllm/
VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
python -m pytest tests/v1/e2e/test_min_tokens.py -v
"""
pytest.main([__file__, "-v"])
......@@ -301,7 +301,6 @@ def test_mtp_correctness(
model_setup: (method, model_name, tp_size)
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_MLA_DISABLE", "1")
method, model_name, tp_size = model_setup
......
......@@ -95,17 +95,11 @@ async def generate(
)
@pytest.mark.asyncio
async def test_load(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs,
prompt: PromptType,
):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the
# tests.
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
......@@ -149,14 +143,11 @@ async def test_load(
)
@pytest.mark.asyncio
async def test_abort(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs,
prompt: PromptType,
):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
......@@ -222,13 +213,8 @@ async def test_abort(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
)
@pytest.mark.asyncio
async def test_multi_abort(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
async def test_multi_abort(output_kind: RequestOutputKind):
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown)
......@@ -304,14 +290,11 @@ async def test_multi_abort(
)
@pytest.mark.asyncio
async def test_finished_flag(
monkeypatch: pytest.MonkeyPatch,
n: int,
engine_args: AsyncEngineArgs,
prompt: PromptType,
):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
......@@ -341,12 +324,10 @@ async def test_finished_flag(
)
@pytest.mark.asyncio
async def test_mid_stream_cancellation(
monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType
engine_args: AsyncEngineArgs, prompt: PromptType
):
"""Test that requests can be cancelled mid-stream."""
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
......@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
be added to the default loggers.
"""
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(
TEXT_ENGINE_ARGS,
......@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
@pytest.mark.asyncio(scope="module")
async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
async def test_dp_rank_argument():
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown)
......@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.asyncio
async def test_check_health(monkeypatch: pytest.MonkeyPatch):
async def test_check_health():
"""Test that check_health returns normally for healthy engine
and raises EngineDeadError when the engine is dead.
"""
......@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
from vllm.v1.engine.exceptions import EngineDeadError
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown)
......@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
)
@pytest.mark.asyncio
async def test_abort_final_output(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
):
async def test_abort_final_output(output_kind: RequestOutputKind):
"""Test that abort() returns a final output with correct information."""
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown)
......
......@@ -5,18 +5,11 @@ from argparse import ArgumentError
import pytest
from vllm import envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser
if not envs.VLLM_USE_V1:
pytest.skip(
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
allow_module_level=True,
)
def test_prefix_caching_from_cli():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
......
This diff is collapsed.
......@@ -130,8 +130,6 @@ def test_engine_core_client(
monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False)
......@@ -218,8 +216,6 @@ def test_engine_core_client(
@pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False)
......@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
monkeypatch: pytest.MonkeyPatch,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
......@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
monkeypatch: pytest.MonkeyPatch,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
......@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
monkeypatch: pytest.MonkeyPatch,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
......@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
indirect=["publisher_config"],
)
def test_kv_cache_events(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool,
publisher_config,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
block_size = 16
num_blocks = 2
engine_args = EngineArgs(
model=MODEL_NAME,
enforce_eager=True,
enable_prefix_caching=True,
block_size=block_size,
)
engine_args.kv_events_config = publisher_config
block_size = 16
num_blocks = 2
engine_args = EngineArgs(
model=MODEL_NAME,
enforce_eager=True,
enable_prefix_caching=True,
block_size=block_size,
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
)
try:
custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1)
request = make_request(sampling_params, custom_tokens)
client.add_request(request)
try:
custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1)
request = make_request(sampling_params, custom_tokens)
client.add_request(request)
outputs: dict[str, list] = {request.request_id: []}
loop_until_done(client, outputs)
outputs: dict[str, list] = {request.request_id: []}
loop_until_done(client, outputs)
result = subscriber.receive_one(timeout=1000)
assert result is not None, "No message received"
result = subscriber.receive_one(timeout=1000)
assert result is not None, "No message received"
seq, received = result
seq, received = result
assert seq == 0, "Sequence number mismatch"
assert len(received.events) == 1, (
"We should have exactly one BlockStored event"
)
event = received.events[0]
assert isinstance(event, BlockStored), "We should have a BlockStored event"
assert len(event.block_hashes) == num_blocks, (
"We should have a BlockStored event with 2 block_hashes"
)
assert event.block_size == block_size, (
"Block size should be the same as the block size"
)
assert event.parent_block_hash is None, "Parent block hash should be None"
assert event.lora_id is None, "Lora id should be None"
assert len(event.token_ids) == num_blocks * block_size, (
"Token ids should be the same as the custom tokens"
)
assert event.token_ids == custom_tokens, (
"Token ids should be the same as the custom tokens"
)
finally:
client.shutdown()
subscriber.close()
assert seq == 0, "Sequence number mismatch"
assert len(received.events) == 1, "We should have exactly one BlockStored event"
event = received.events[0]
assert isinstance(event, BlockStored), "We should have a BlockStored event"
assert len(event.block_hashes) == num_blocks, (
"We should have a BlockStored event with 2 block_hashes"
)
assert event.block_size == block_size, (
"Block size should be the same as the block size"
)
assert event.parent_block_hash is None, "Parent block hash should be None"
assert event.lora_id is None, "Lora id should be None"
assert len(event.token_ids) == num_blocks * block_size, (
"Token ids should be the same as the custom tokens"
)
assert event.token_ids == custom_tokens, (
"Token ids should be the same as the custom tokens"
)
finally:
client.shutdown()
subscriber.close()
@pytest.mark.asyncio
......@@ -672,101 +657,96 @@ def test_kv_cache_events(
)
@multi_gpu_test(num_gpus=4)
async def test_kv_cache_events_dp(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool,
publisher_config,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
block_size = 16
num_blocks = 2
dp_size = 2
tp_size = 2
engine_args = EngineArgs(
model=MODEL_NAME,
enforce_eager=True,
enable_prefix_caching=True,
data_parallel_size=dp_size,
tensor_parallel_size=tp_size,
block_size=block_size,
)
engine_args.kv_events_config = publisher_config
block_size = 16
num_blocks = 2
dp_size = 2
tp_size = 2
engine_args = EngineArgs(
model=MODEL_NAME,
enforce_eager=True,
enable_prefix_caching=True,
data_parallel_size=dp_size,
tensor_parallel_size=tp_size,
block_size=block_size,
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
await asyncio.sleep(1)
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
await asyncio.sleep(1)
# Build endpoints for all DP ranks
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
endpoints = []
for i in range(dp_size):
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
endpoints.append(offset_endpoint)
# Build endpoints for all DP ranks
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
endpoints = []
for i in range(dp_size):
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
endpoints.append(offset_endpoint)
subscriber = MockSubscriber(
endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
)
subscriber = MockSubscriber(
endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
)
try:
custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1)
all_request_ids = []
try:
custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1)
all_request_ids = []
# Create and add 25 requests
# NOTE: attempts to force routing to both dp groups but can be flaky
for i in range(25):
await asyncio.sleep(0.01)
request = make_request(sampling_params, custom_tokens)
await client.add_request_async(request)
all_request_ids.append(request.request_id)
# Create and add 25 requests
# NOTE: attempts to force routing to both dp groups but can be flaky
for i in range(25):
await asyncio.sleep(0.01)
request = make_request(sampling_params, custom_tokens)
await client.add_request_async(request)
all_request_ids.append(request.request_id)
await asyncio.sleep(0.1)
await asyncio.sleep(0.1)
# Initialize outputs dict for all requests
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
# Initialize outputs dict for all requests
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
print("processing requests...")
await asyncio.wait_for(
loop_until_fully_done_async(client, outputs), timeout=20.0
)
print("processing requests...")
await asyncio.wait_for(
loop_until_fully_done_async(client, outputs), timeout=20.0
)
# Receive from subscriber until no more messages
print("collecting results...")
results = []
while True:
result = subscriber.receive_one(timeout=1)
print(result)
if result is None:
break
results.append(result)
# Collect all events and data_parallel_ranks from all results
all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
unique_dps = set(all_dp_ranks)
assert len(unique_dps) == 2, (
f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
)
# Receive from subscriber until no more messages
print("collecting results...")
results = []
while True:
result = subscriber.receive_one(timeout=1)
print(result)
if result is None:
break
results.append(result)
# Collect all events and data_parallel_ranks from all results
all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
unique_dps = set(all_dp_ranks)
assert len(unique_dps) == 2, (
f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
)
finally:
client.shutdown()
subscriber.close()
finally:
client.shutdown()
subscriber.close()
@pytest.mark.timeout(20)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch to extract core process pid while it's starting.
core_proc_pid = [None]
cepm_ctor = CoreEngineProcManager.__init__
......@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
mock_executor_class.side_effect = create_mock_executor
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices
from vllm.v1.engine.utils import EngineZmqAddresses
......
......@@ -21,12 +21,10 @@ DTYPE = "half"
def _vllm_model(
apc: bool,
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
*,
skip_tokenizer_init: bool = False,
):
"""Set up VllmRunner instance."""
monkeypatch.setenv("VLLM_USE_V1", "1")
return vllm_runner(
MODEL,
dtype=DTYPE,
......@@ -45,16 +43,16 @@ def _vllm_model(
# Prefix caching
params=[False, True],
)
def vllm_model(vllm_runner, request, monkeypatch):
def vllm_model(vllm_runner, request):
"""VllmRunner test fixture parameterized by APC True/False."""
with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model:
with _vllm_model(request.param, vllm_runner) as vllm_model:
yield vllm_model
@pytest.fixture(scope="function")
def vllm_model_apc(vllm_runner, monkeypatch):
def vllm_model_apc(vllm_runner):
"""VllmRunner test fixture with APC."""
with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model:
with _vllm_model(True, vllm_runner) as vllm_model:
yield vllm_model
......@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
# Prefix caching
params=[False, True],
)
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
def vllm_model_skip_tokenizer_init(vllm_runner, request):
"""VllmRunner test fixture with APC."""
with _vllm_model(
request.param,
vllm_runner,
monkeypatch,
skip_tokenizer_init=True,
) as vllm_model:
yield vllm_model
......@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
)
def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
def test_engine_metrics(vllm_runner, example_prompts):
max_tokens = 100
# Use spec decoding to test num_accepted_tokens_per_pos
speculative_config = {
......@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
"prompt_lookup_min": 3,
"num_speculative_tokens": 5,
}
monkeypatch.setenv("VLLM_USE_V1", "1")
with vllm_runner(
MODEL,
speculative_config=speculative_config,
......@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "1")
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment