[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1e4ecca1 · Cyrus Leung · GitHub · c0a7b89d · 1e4ecca1 · 1e4ecca1
Unverified commit 1e4ecca1, authored Oct 07, 2025 by Cyrus Leung; committed by GitHub on Oct 07, 2025.
20 changed files
--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    # Run with flex attention
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
        set_seed(seed)
@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    # Run with default backend
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        set_seed(seed)
        with vllm_runner(
            model_name,
@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    # Run with flex attention
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
        with vllm_runner(
            model_name,
@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
            flex_outputs = llm_flex.embed(prompts)
    # Run with default backend
-    with monkeypatch.context() as m:
+    with (
-        m.setenv("VLLM_USE_V1", "1")
+        monkeypatch.context() as m,
-        with vllm_runner(
+        vllm_runner(
            model_name,
            runner="pooling",
            dtype=torch.bfloat16,
            tensor_parallel_size=1,
            max_model_len=100,
            enforce_eager=True,
-        ) as llm_default:
+        ) as llm_default,
-            default_outputs = llm_default.embed(prompts)
+    ):
+        default_outputs = llm_default.embed(prompts)
    check_embeddings_close(
        embeddings_0_lst=flex_outputs,

--- a/tests/models/multimodal/generation/test_maverick.py
+++ b/tests/models/multimodal/generation/test_maverick.py
@@ -613,7 +613,6 @@ def test_dummy_maverick(
    profile: bool = False,
 ) -> None:
    # Disable multiprocessing allows us to access model executor from LLM engine
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    model_path = create_reduced_maverick_model(

--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -8,7 +8,6 @@ if TYPE_CHECKING:
    from vllm.config import VllmConfig
 else:
    VllmConfig = None
-from vllm import envs
 class DummyPlatform(Platform):
@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        if envs.VLLM_USE_V1:
+        vllm_config.compilation_config.custom_ops = ["all"]
-            compilation_config = vllm_config.compilation_config
-            # Activate custom ops for v1.
-            compilation_config.custom_ops = ["all"]
    def get_attn_backend_cls(
        self,

--- a/tests/plugins_tests/test_scheduler_plugins.py
+++ b/tests/plugins_tests/test_scheduler_plugins.py
@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
 def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        # Explicitly turn off engine multiprocessing so
        # that the scheduler runs in this process
        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
 from typing import Optional
-import pytest
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
-@pytest.fixture(autouse=True)
-def v1(monkeypatch):
-    """Only run on vLLM v1."""
-    monkeypatch.setenv("VLLM_USE_V1", "1")
 def _generate(
    llm: LLM,
    prompt: str,

--- a/tests/tpu/lora/test_lora.py
+++ b/tests/tpu/lora/test_lora.py
@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
 # 100 training iterations with a training batch size of 100.
-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch: pytest.MonkeyPatch):
-    """
-    Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
-    for all tests in this file
-    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        yield
 def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
    return vllm.LLM(
        model="Qwen/Qwen2.5-3B-Instruct",

--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -305,7 +305,6 @@ full_cg_backend_configs = {
    "CutlassMLA": BackendConfig(
        name="CutlassMLA",
        env_vars={
-            "VLLM_USE_V1": "1",
            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
            "FORCE_NUM_KV_SPLITS": "1",  # TODO: remove this when hang issue is fixed
        },

--- a/tests/v1/core/test_kv_sharing.py
+++ b/tests/v1/core/test_kv_sharing.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
 import torch
 from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
 from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
+pytestmark = pytest.mark.cpu_test
 def new_kv_cache_spec():
    return FullAttentionSpec(16, 1, 1, torch.float32, False)

--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 import pytest
 from vllm import LLM
-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
 MODEL = "meta-llama/Llama-3.2-1B"
 PROMPT = "Hello my name is Robert and I"

--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
    ):
        pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
-    env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
+    env_vars = backend_configs[backend_name].env_vars
    with temporary_environ(env_vars), ExitStack() as stack:
        if not supported:
@@ -117,7 +117,7 @@ combo_cases_2 = [
 def test_cudagraph_compilation_combo(combo_case):
    backend_name, cudagraph_mode, compilation_level, supported = combo_case
-    env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
+    env_vars = backend_configs[backend_name].env_vars
    with temporary_environ(env_vars), ExitStack() as stack:
        if not supported:

--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
        )
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")

--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
@@ -32,7 +32,7 @@ model_config = {
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
 def test_sliding_window_retrieval(
-    monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager
+    model, batch_size, seed, disable_hybrid_kv_cache_manager
 ):
    """
    The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
    If we tell it upfront which we are going to be looking for, then
    it answers correctly (mostly).
    """
-    with monkeypatch.context() as m:
+    test_config = model_config[model]
-        m.setenv("VLLM_USE_V1", "1")
+    llm = LLM(
-        test_config = model_config[model]
+        model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
+    )
-        llm = LLM(
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
-            model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
-        )
+    prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
-        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+    check_length(prompts, llm, test_config.sliding_window)
-        prompts, answer, indices = prep_prompts(
-            batch_size, ln_range=test_config.ln_range
+    # Fresh generation
-        )
+    responses = llm.generate(prompts, sampling_params)
+    check_answers(
-        check_length(prompts, llm, test_config.sliding_window)
+        indices,
+        answer,
-        # Fresh generation
+        [response.outputs[0].text for response in responses],
-        responses = llm.generate(prompts, sampling_params)
+        accept_rate=1.0,
-        check_answers(
+    )
-            indices,
-            answer,
+    # Re-generate with the same prompts to test prefix caching
-            [response.outputs[0].text for response in responses],
+    responses = llm.generate(prompts, sampling_params)
-            accept_rate=1.0,
+    check_answers(
-        )
+        indices,
+        answer,
-        # Re-generate with the same prompts to test prefix caching
+        [response.outputs[0].text for response in responses],
-        responses = llm.generate(prompts, sampling_params)
+        accept_rate=1.0,
-        check_answers(
+    )
-            indices,
-            answer,
-            [response.outputs[0].text for response in responses],
-            accept_rate=1.0,
-        )
 def check_length(prompts: list[str], llm: LLM, sliding_window: int):

--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
    )
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        # Make scheduling deterministic for reproducibility
        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

--- a/tests/v1/e2e/test_min_tokens.py
+++ b/tests/v1/e2e/test_min_tokens.py
@@ -13,7 +13,6 @@ Covers:
 5) Multiple stop conditions
 """
-import os
 from typing import Optional, Union
 import pytest
@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
 @pytest.fixture(scope="module")
 def llm_v1():
    """Create V1 LLM instance for testing"""
-    # Ensure V1 engine is used
-    os.environ["VLLM_USE_V1"] = "1"
    llm = LLM(
        model=TEST_MODEL,
        tensor_parallel_size=1,
@@ -503,6 +499,6 @@ if __name__ == "__main__":
    Usage:
        cd vllm/
-        VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
+        python -m pytest tests/v1/e2e/test_min_tokens.py -v
    """
    pytest.main([__file__, "-v"])
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -301,7 +301,6 @@ def test_mtp_correctness(
    model_setup: (method, model_name, tp_size)
    """
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_MLA_DISABLE", "1")
        method, model_name, tp_size = model_setup

--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -95,17 +95,11 @@ async def generate(
 )
 @pytest.mark.asyncio
 async def test_load(
-    monkeypatch: pytest.MonkeyPatch,
    output_kind: RequestOutputKind,
    engine_args: AsyncEngineArgs,
    prompt: PromptType,
 ):
-    # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
+    with ExitStack() as after:
-    # so that in the future when we switch, we don't have to change all the
-    # tests.
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(engine_args)
        after.callback(engine.shutdown)
@@ -149,14 +143,11 @@ async def test_load(
 )
 @pytest.mark.asyncio
 async def test_abort(
-    monkeypatch: pytest.MonkeyPatch,
    output_kind: RequestOutputKind,
    engine_args: AsyncEngineArgs,
    prompt: PromptType,
 ):
-    with monkeypatch.context() as m, ExitStack() as after:
+    with ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(engine_args)
        after.callback(engine.shutdown)
@@ -222,13 +213,8 @@ async def test_abort(
    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
 )
 @pytest.mark.asyncio
-async def test_multi_abort(
+async def test_multi_abort(output_kind: RequestOutputKind):
-    monkeypatch: pytest.MonkeyPatch,
+    with ExitStack() as after:
-    output_kind: RequestOutputKind,
-):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        after.callback(engine.shutdown)
@@ -304,14 +290,11 @@ async def test_multi_abort(
 )
 @pytest.mark.asyncio
 async def test_finished_flag(
-    monkeypatch: pytest.MonkeyPatch,
    n: int,
    engine_args: AsyncEngineArgs,
    prompt: PromptType,
 ):
-    with monkeypatch.context() as m, ExitStack() as after:
+    with ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(engine_args)
        after.callback(engine.shutdown)
@@ -341,12 +324,10 @@ async def test_finished_flag(
 )
 @pytest.mark.asyncio
 async def test_mid_stream_cancellation(
-    monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType
+    engine_args: AsyncEngineArgs, prompt: PromptType
 ):
    """Test that requests can be cancelled mid-stream."""
-    with monkeypatch.context() as m, ExitStack() as after:
+    with ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(engine_args)
        after.callback(engine.shutdown)
@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
    be added to the default loggers.
    """
-    with monkeypatch.context() as m, ExitStack() as after:
+    with ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(
                TEXT_ENGINE_ARGS,
@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
 @pytest.mark.asyncio(scope="module")
-async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
+async def test_dp_rank_argument():
-    with monkeypatch.context() as m, ExitStack() as after:
+    with ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        after.callback(engine.shutdown)
@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
 @pytest.mark.asyncio
-async def test_check_health(monkeypatch: pytest.MonkeyPatch):
+async def test_check_health():
    """Test that check_health returns normally for healthy engine
    and raises EngineDeadError when the engine is dead.
    """
@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
    from vllm.v1.engine.exceptions import EngineDeadError
-    with monkeypatch.context() as m, ExitStack() as after:
+    with ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        after.callback(engine.shutdown)
@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
 )
 @pytest.mark.asyncio
-async def test_abort_final_output(
+async def test_abort_final_output(output_kind: RequestOutputKind):
-    monkeypatch: pytest.MonkeyPatch,
-    output_kind: RequestOutputKind,
-):
    """Test that abort() returns a final output with correct information."""
-    with monkeypatch.context() as m, ExitStack() as after:
+    with ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        after.callback(engine.shutdown)

--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -5,18 +5,11 @@ from argparse import ArgumentError
 import pytest
-from vllm import envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser
-if not envs.VLLM_USE_V1:
-    pytest.skip(
-        "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
-        allow_module_level=True,
-    )
 def test_prefix_caching_from_cli():
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest:
 @create_new_process_for_each_test()
-def test_engine_core(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core():
-    with monkeypatch.context() as m:
+    """Setup the EngineCore."""
-        m.setenv("VLLM_USE_V1", "1")
+    engine_args = EngineArgs(model=MODEL_NAME)
-        """Setup the EngineCore."""
+    vllm_config = engine_args.create_engine_config()
-        engine_args = EngineArgs(model=MODEL_NAME)
+    executor_class = Executor.get_class(vllm_config)
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
-        with set_default_torch_num_threads(1):
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
-            engine_core = EngineCore(
+        )
-                vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+    """Test basic request lifecycle."""
-            )
-        """Test basic request lifecycle."""
+    # First request.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
-        # First request.
+    assert len(engine_core.scheduler.waiting) == 1
-        engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.running) == 0
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
-        _ = engine_core.step()
+    assert len(engine_core.scheduler.running) == 1
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 1
+    # Second request.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
-        # Second request.
+    assert len(engine_core.scheduler.waiting) == 1
-        engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.running) == 1
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 1
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
-        _ = engine_core.step()
+    assert len(engine_core.scheduler.running) == 2
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
+    # Add two requests in a row.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
-        # Add two requests in a row.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
-        engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.waiting) == 2
-        engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.running) == 2
-        assert len(engine_core.scheduler.waiting) == 2
-        assert len(engine_core.scheduler.running) == 2
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
-        _ = engine_core.step()
+    assert len(engine_core.scheduler.running) == 4
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 4
+    # Loop through until they are all done.
+    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-        # Loop through until they are all done.
+        pass
-        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-            pass
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
+    """Test abort cycle."""
+    # Basic abort.
+    req = make_request()
+    request_id = req.request_id
+    engine_core.add_request(*engine_core.preprocess_add_request(req))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 0
+    assert engine_core.scheduler.has_unfinished_requests()
+    assert not engine_core.scheduler.has_finished_requests()
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 1
+    assert engine_core.scheduler.has_unfinished_requests()
+    assert not engine_core.scheduler.has_finished_requests()
+    engine_core.abort_requests([request_id])
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
+    assert not engine_core.scheduler.has_unfinished_requests()
+    assert engine_core.scheduler.has_finished_requests()
+    _ = engine_core.step()
+    assert not engine_core.scheduler.has_unfinished_requests()
+    assert not engine_core.scheduler.has_finished_requests()
+    # Add, step, abort 1 of the 3.
+    req0 = make_request()
+    req1 = make_request()
+    req2 = make_request()
+    engine_core.add_request(*engine_core.preprocess_add_request(req0))
+    engine_core.add_request(*engine_core.preprocess_add_request(req1))
+    assert len(engine_core.scheduler.waiting) == 2
+    assert len(engine_core.scheduler.running) == 0
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2
+    engine_core.add_request(*engine_core.preprocess_add_request(req2))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 2
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 3
+    # Abort just one.
+    engine_core.abort_requests([req1.request_id])
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2
+    # Abort the other requests at the same time.
+    engine_core.abort_requests([req2.request_id, req0.request_id])
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
+    # Sending duplicate requests with same request_id
+    req0 = make_request()
+    req1 = make_request()
+    req0.request_id = req1.request_id = "test"
+    engine_core.add_request(*engine_core.preprocess_add_request(req0))
+    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
+        pass
+    engine_core.add_request(*engine_core.preprocess_add_request(req1))
+    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
+        pass
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
-        """Test abort cycle."""
-        # Basic abort.
+@create_new_process_for_each_test()
-        req = make_request()
+def test_engine_core_advanced_sampling():
-        request_id = req.request_id
+    """
+    A basic end-to-end test to verify that the engine functions correctly
+    when additional sampling parameters, such as top_p, min_tokens, and
+    presence_penalty, are set.
+    """
+    """Setup the EngineCore."""
+    engine_args = EngineArgs(model=MODEL_NAME)
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+        )
+    """Test basic request lifecycle."""
+    # First request.
+    request: EngineCoreRequest = make_request()
+    request.sampling_params = SamplingParams(
+        min_tokens=4,
+        presence_penalty=1.0,
+        frequency_penalty=1.0,
+        repetition_penalty=0.1,
+        stop_token_ids=[1001, 1002],
+    )
+    engine_core.add_request(*engine_core.preprocess_add_request(request))
-        engine_core.add_request(*engine_core.preprocess_add_request(req))
+    def _check_engine_state():
        assert len(engine_core.scheduler.waiting) == 1
        assert len(engine_core.scheduler.running) == 0
-        assert engine_core.scheduler.has_unfinished_requests()
+        # Loop through until they are all done.
-        assert not engine_core.scheduler.has_finished_requests()
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 1
-        assert engine_core.scheduler.has_unfinished_requests()
-        assert not engine_core.scheduler.has_finished_requests()
-        engine_core.abort_requests([request_id])
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
-        assert not engine_core.scheduler.has_unfinished_requests()
-        assert engine_core.scheduler.has_finished_requests()
-        _ = engine_core.step()
-        assert not engine_core.scheduler.has_unfinished_requests()
-        assert not engine_core.scheduler.has_finished_requests()
-        # Add, step, abort 1 of the 3.
-        req0 = make_request()
-        req1 = make_request()
-        req2 = make_request()
-        engine_core.add_request(*engine_core.preprocess_add_request(req0))
-        engine_core.add_request(*engine_core.preprocess_add_request(req1))
-        assert len(engine_core.scheduler.waiting) == 2
-        assert len(engine_core.scheduler.running) == 0
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
-        engine_core.add_request(*engine_core.preprocess_add_request(req2))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 2
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 3
-        # Abort just one.
-        engine_core.abort_requests([req1.request_id])
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
-        # Abort the other requests at the same time.
-        engine_core.abort_requests([req2.request_id, req0.request_id])
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
-        # Sending duplicate requests with same request_id
-        req0 = make_request()
-        req1 = make_request()
-        req0.request_id = req1.request_id = "test"
-        engine_core.add_request(*engine_core.preprocess_add_request(req0))
-        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-            pass
-        engine_core.add_request(*engine_core.preprocess_add_request(req1))
        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
            pass
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 0
+    _check_engine_state()
-@create_new_process_for_each_test()
+    # Second request.
-def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
+    request2 = make_request()
-    """
+    request2.sampling_params = SamplingParams(
-    A basic end-to-end test to verify that the engine functions correctly
+        top_p=0.99,
-    when additional sampling parameters, such as top_p, min_tokens, and
+        top_k=50,
-    presence_penalty, are set.
+    )
-    """
+    engine_core.add_request(*engine_core.preprocess_add_request(request2))
-    with monkeypatch.context() as m:
+    _check_engine_state()
-        m.setenv("VLLM_USE_V1", "1")
-        """Setup the EngineCore."""
-        engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(
-                vllm_config=vllm_config, executor_class=executor_class, log_stats=True
-            )
-        """Test basic request lifecycle."""
-        # First request.
-        request: EngineCoreRequest = make_request()
-        request.sampling_params = SamplingParams(
-            min_tokens=4,
-            presence_penalty=1.0,
-            frequency_penalty=1.0,
-            repetition_penalty=0.1,
-            stop_token_ids=[1001, 1002],
-        )
-        engine_core.add_request(*engine_core.preprocess_add_request(request))
-        def _check_engine_state():
-            assert len(engine_core.scheduler.waiting) == 1
-            assert len(engine_core.scheduler.running) == 0
-            # Loop through until they are all done.
-            while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-                pass
-            assert len(engine_core.scheduler.waiting) == 0
-            assert len(engine_core.scheduler.running) == 0
-        _check_engine_state()
-        # Second request.
-        request2 = make_request()
-        request2.sampling_params = SamplingParams(
-            top_p=0.99,
-            top_k=50,
-        )
-        engine_core.add_request(*engine_core.preprocess_add_request(request2))
-        _check_engine_state()
 @create_new_process_for_each_test()
-def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_concurrent_batches():
    """
    Test that the engine can handle multiple concurrent batches.
    """
@@ -272,173 +268,163 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
            if hasattr(self, "thread_pool"):
                self.thread_pool.shutdown(wait=False)
-    with monkeypatch.context() as m:
+    engine_args = EngineArgs(
-        m.setenv("VLLM_USE_V1", "1")
+        model=MODEL_NAME,
+        # To test concurrent batches.
-        engine_args = EngineArgs(
+        max_num_seqs=2,
-            model=MODEL_NAME,
+        # Avoid all requests being scheduled once.
-            # To test concurrent batches.
+        enable_prefix_caching=False,
-            max_num_seqs=2,
+        max_num_batched_tokens=10,
-            # Avoid all requests being scheduled once.
+        # Reduce startup time.
-            enable_prefix_caching=False,
+        enforce_eager=True,
-            max_num_batched_tokens=10,
+    )
-            # Reduce startup time.
+    vllm_config = engine_args.create_engine_config()
-            enforce_eager=True,
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
        )
-        vllm_config = engine_args.create_engine_config()
+    assert engine_core.batch_queue is not None
-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(
+    # Add two requests in a row. Each request have 12 prompt tokens.
-                vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
+    req0 = make_request_with_max_tokens("0", 5)
-            )
+    engine_core.add_request(*engine_core.preprocess_add_request(req0))
-        assert engine_core.batch_queue is not None
+    req1 = make_request_with_max_tokens("1", 5)
+    engine_core.add_request(*engine_core.preprocess_add_request(req1))
-        # Add two requests in a row. Each request have 12 prompt tokens.
-        req0 = make_request_with_max_tokens("0", 5)
+    # Schedule Batch 1: (10, req0)
-        engine_core.add_request(*engine_core.preprocess_add_request(req0))
+    assert engine_core.step_with_batch_queue()[0] is None
-        req1 = make_request_with_max_tokens("1", 5)
+    assert len(engine_core.batch_queue) == 1
-        engine_core.add_request(*engine_core.preprocess_add_request(req1))
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["0"] == 10
-        # Schedule Batch 1: (10, req0)
+    # num_computed_tokens should have been updated immediately.
-        assert engine_core.step_with_batch_queue()[0] is None
+    assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
-        assert len(engine_core.batch_queue) == 1
-        scheduler_output = engine_core.batch_queue[-1][1]
+    # Schedule Batch 2: (2, req0), (8, req1)
-        assert scheduler_output.num_scheduled_tokens["0"] == 10
+    assert engine_core.step_with_batch_queue()[0] == {}
-        # num_computed_tokens should have been updated immediately.
+    assert len(engine_core.batch_queue) == 1
-        assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["0"] == 2
-        # Schedule Batch 2: (2, req0), (8, req1)
+    assert scheduler_output.num_scheduled_tokens["1"] == 8
-        assert engine_core.step_with_batch_queue()[0] == {}
+    # num_computed_tokens should have been updated immediately.
-        assert len(engine_core.batch_queue) == 1
+    assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
-        scheduler_output = engine_core.batch_queue[-1][1]
+    assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
-        assert scheduler_output.num_scheduled_tokens["0"] == 2
-        assert scheduler_output.num_scheduled_tokens["1"] == 8
+    assert engine_core.scheduler.get_num_unfinished_requests() == 2
-        # num_computed_tokens should have been updated immediately.
-        assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
+    # Finish Batch 1 and schedule Batch 3: (4, req1).
-        assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
+    # Note that req0 cannot be scheduled
+    # because it is in the decoding stage now.
-        assert engine_core.scheduler.get_num_unfinished_requests() == 2
+    engine_core.step_with_batch_queue()
+    assert len(engine_core.batch_queue) == 1
-        # Finish Batch 1 and schedule Batch 3: (4, req1).
+    scheduler_output = engine_core.batch_queue[-1][1]
-        # Note that req0 cannot be scheduled
+    assert scheduler_output.num_scheduled_tokens["1"] == 4
-        # because it is in the decoding stage now.
-        engine_core.step_with_batch_queue()
+    # Finish Batch 2. Get first token of req0.
-        assert len(engine_core.batch_queue) == 1
+    # Schedule Batch 4: (1, req0).
-        scheduler_output = engine_core.batch_queue[-1][1]
+    output = engine_core.step_with_batch_queue()[0].get(0)
-        assert scheduler_output.num_scheduled_tokens["1"] == 4
+    assert output is not None
+    assert len(output.outputs) == 1
-        # Finish Batch 2. Get first token of req0.
+    assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
-        # Schedule Batch 4: (1, req0).
+    scheduler_output = engine_core.batch_queue[-1][1]
-        output = engine_core.step_with_batch_queue()[0].get(0)
+    assert scheduler_output.num_scheduled_tokens["0"] == 1
+    # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
+    output = engine_core.step_with_batch_queue()[0].get(0)
+    assert output is not None
+    assert len(output.outputs) == 1
+    assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["1"] == 1
+    # Loop until req0 is finished.
+    req_id = 0
+    expected_num_tokens = [
+        engine_core.scheduler.requests["0"].num_tokens + 1,
+        engine_core.scheduler.requests["1"].num_tokens + 1,
+    ]
+    while engine_core.scheduler.get_num_unfinished_requests() == 2:
+        output = engine_core.step_with_batch_queue()[0]
+        # Every step consumes an output.
        assert output is not None
-        assert len(output.outputs) == 1
+        assert len(output[0].outputs) == 1
-        assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
+        if req_id in engine_core.scheduler.requests:
-        scheduler_output = engine_core.batch_queue[-1][1]
+            assert (
-        assert scheduler_output.num_scheduled_tokens["0"] == 1
+                engine_core.scheduler.requests[req_id].num_tokens
+                == expected_num_tokens[req_id]
-        # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
+            )
-        output = engine_core.step_with_batch_queue()[0].get(0)
+        expected_num_tokens[req_id] += 1
-        assert output is not None
+        req_id = (req_id + 1) % 2
-        assert len(output.outputs) == 1
-        assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["1"] == 1
-        # Loop until req0 is finished.
-        req_id = 0
-        expected_num_tokens = [
-            engine_core.scheduler.requests["0"].num_tokens + 1,
-            engine_core.scheduler.requests["1"].num_tokens + 1,
-        ]
-        while engine_core.scheduler.get_num_unfinished_requests() == 2:
-            output = engine_core.step_with_batch_queue()[0]
-            # Every step consumes an output.
-            assert output is not None
-            assert len(output[0].outputs) == 1
-            if req_id in engine_core.scheduler.requests:
-                assert (
-                    engine_core.scheduler.requests[req_id].num_tokens
-                    == expected_num_tokens[req_id]
-                )
-            expected_num_tokens[req_id] += 1
-            req_id = (req_id + 1) % 2
 @multi_gpu_test(num_gpus=2)
-def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_tp():
    """
    Test engine can initialize worker in tp properly
    """
-    with monkeypatch.context() as m:
+    """Setup the EngineCore."""
-        m.setenv("VLLM_USE_V1", "1")
+    engine_args = EngineArgs(
-        """Setup the EngineCore."""
+        model=MODEL_NAME,
-        engine_args = EngineArgs(
+        tensor_parallel_size=2,
-            model=MODEL_NAME,
+        # Reduce startup time.
-            tensor_parallel_size=2,
+        enforce_eager=True,
-            # Reduce startup time.
+    )
-            enforce_eager=True,
+    vllm_config = engine_args.create_engine_config()
-        )
+    executor_class = Executor.get_class(vllm_config)
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
-        with set_default_torch_num_threads(1):
+    with set_default_torch_num_threads(1):
-            engine_core = EngineCore(
+        engine_core = EngineCore(
-                vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
-            )
+        )
-        def get_worker_cache_config_field(worker, key: str):
+    def get_worker_cache_config_field(worker, key: str):
-            return getattr(worker.cache_config, key)
+        return getattr(worker.cache_config, key)
-        num_gpu_blocks = engine_core.collective_rpc(
+    num_gpu_blocks = engine_core.collective_rpc(
-            get_worker_cache_config_field, args=("num_gpu_blocks",)
+        get_worker_cache_config_field, args=("num_gpu_blocks",)
-        )
+    )
-        num_cpu_blocks = engine_core.collective_rpc(
+    num_cpu_blocks = engine_core.collective_rpc(
-            get_worker_cache_config_field, args=("num_cpu_blocks",)
+        get_worker_cache_config_field, args=("num_cpu_blocks",)
-        )
+    )
-        assert all(x is not None for x in num_gpu_blocks)
+    assert all(x is not None for x in num_gpu_blocks)
-        assert all(x is not None for x in num_cpu_blocks)
+    assert all(x is not None for x in num_cpu_blocks)
 @create_new_process_for_each_test()
-def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_invalid_request_id_type():
    """Test that engine raises TypeError for non-string request_id."""
-    with monkeypatch.context() as m:
+    engine_args = EngineArgs(model=MODEL_NAME)
-        m.setenv("VLLM_USE_V1", "1")
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)
-        engine_args = EngineArgs(model=MODEL_NAME)
+    with set_default_torch_num_threads(1):
-        vllm_config = engine_args.create_engine_config()
+        engine_core = EngineCore(
-        executor_class = Executor.get_class(vllm_config)
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+        )
-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(
-                vllm_config=vllm_config, executor_class=executor_class, log_stats=True
-            )
-        # Test with UUID object (common mistake)
+    # Test with UUID object (common mistake)
-        uuid_request = make_request()
+    uuid_request = make_request()
-        uuid_request.request_id = uuid.uuid4()  # UUID object instead of string
+    uuid_request.request_id = uuid.uuid4()  # UUID object instead of string
-        with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
+    with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
-            engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
+        engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
-        # Test with integer
+    # Test with integer
-        int_request = make_request()
+    int_request = make_request()
-        int_request.request_id = 12345
+    int_request.request_id = 12345
-        with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
+    with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
-            engine_core.add_request(*engine_core.preprocess_add_request(int_request))
+        engine_core.add_request(*engine_core.preprocess_add_request(int_request))
-        # Test with None
+    # Test with None
-        none_request = make_request()
+    none_request = make_request()
-        none_request.request_id = None
+    none_request.request_id = None
-        with pytest.raises(
+    with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"):
-            TypeError, match="request_id must be a string, got.*NoneType"
+        engine_core.add_request(*engine_core.preprocess_add_request(none_request))
-        ):
-            engine_core.add_request(*engine_core.preprocess_add_request(none_request))
-        # Verify engine is still functional after errors
+    # Verify engine is still functional after errors
-        valid_request = make_request()
+    valid_request = make_request()
-        engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
+    engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
-        assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
+    assert len(engine_core.scheduler.running) == 0
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -130,8 +130,6 @@ def test_engine_core_client(
    monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        # Monkey-patch core engine utility function to test.
        m.setattr(EngineCore, "echo", echo, raising=False)
@@ -218,8 +216,6 @@ def test_engine_core_client(
 @pytest.mark.asyncio(loop_scope="function")
 async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        # Monkey-patch core engine utility function to test.
        m.setattr(EngineCore, "echo", echo, raising=False)
@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
    monkeypatch: pytest.MonkeyPatch,
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        # Must set insecure serialization to allow returning custom types.
        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
    monkeypatch: pytest.MonkeyPatch,
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        # Must set insecure serialization to allow returning custom types.
        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
    monkeypatch: pytest.MonkeyPatch,
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        # Must set insecure serialization to allow returning custom types.
        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
    indirect=["publisher_config"],
 )
 def test_kv_cache_events(
-    monkeypatch: pytest.MonkeyPatch,
    multiprocessing_mode: bool,
    publisher_config,
 ):
-    with monkeypatch.context() as m:
+    block_size = 16
-        m.setenv("VLLM_USE_V1", "1")
+    num_blocks = 2
-        block_size = 16
-        num_blocks = 2
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
-        engine_args = EngineArgs(
+        enforce_eager=True,
-            model=MODEL_NAME,
+        enable_prefix_caching=True,
-            enforce_eager=True,
+        block_size=block_size,
-            enable_prefix_caching=True,
+    )
-            block_size=block_size,
+    engine_args.kv_events_config = publisher_config
-        )
-        engine_args.kv_events_config = publisher_config
-        vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
+    vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
-        executor_class = Executor.get_class(vllm_config)
+    executor_class = Executor.get_class(vllm_config)
-        with set_default_torch_num_threads(1):
+    with set_default_torch_num_threads(1):
-            client = EngineCoreClient.make_client(
+        client = EngineCoreClient.make_client(
-                multiprocess_mode=multiprocessing_mode,
+            multiprocess_mode=multiprocessing_mode,
-                asyncio_mode=False,
+            asyncio_mode=False,
-                vllm_config=vllm_config,
+            vllm_config=vllm_config,
-                executor_class=executor_class,
+            executor_class=executor_class,
-                log_stats=False,
+            log_stats=False,
-            )
-        endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
-        subscriber = MockSubscriber(
-            endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
        )
+    endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
+    subscriber = MockSubscriber(
+        endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
+    )
-        try:
+    try:
-            custom_tokens = list(range(num_blocks * block_size))
+        custom_tokens = list(range(num_blocks * block_size))
-            sampling_params = SamplingParams(max_tokens=1)
+        sampling_params = SamplingParams(max_tokens=1)
-            request = make_request(sampling_params, custom_tokens)
+        request = make_request(sampling_params, custom_tokens)
-            client.add_request(request)
+        client.add_request(request)
-            outputs: dict[str, list] = {request.request_id: []}
+        outputs: dict[str, list] = {request.request_id: []}
-            loop_until_done(client, outputs)
+        loop_until_done(client, outputs)
-            result = subscriber.receive_one(timeout=1000)
+        result = subscriber.receive_one(timeout=1000)
-            assert result is not None, "No message received"
+        assert result is not None, "No message received"
-            seq, received = result
+        seq, received = result
-            assert seq == 0, "Sequence number mismatch"
+        assert seq == 0, "Sequence number mismatch"
-            assert len(received.events) == 1, (
+        assert len(received.events) == 1, "We should have exactly one BlockStored event"
-                "We should have exactly one BlockStored event"
+        event = received.events[0]
-            )
+        assert isinstance(event, BlockStored), "We should have a BlockStored event"
-            event = received.events[0]
+        assert len(event.block_hashes) == num_blocks, (
-            assert isinstance(event, BlockStored), "We should have a BlockStored event"
+            "We should have a BlockStored event with 2 block_hashes"
-            assert len(event.block_hashes) == num_blocks, (
+        )
-                "We should have a BlockStored event with 2 block_hashes"
+        assert event.block_size == block_size, (
-            )
+            "Block size should be the same as the block size"
-            assert event.block_size == block_size, (
+        )
-                "Block size should be the same as the block size"
+        assert event.parent_block_hash is None, "Parent block hash should be None"
-            )
+        assert event.lora_id is None, "Lora id should be None"
-            assert event.parent_block_hash is None, "Parent block hash should be None"
+        assert len(event.token_ids) == num_blocks * block_size, (
-            assert event.lora_id is None, "Lora id should be None"
+            "Token ids should be the same as the custom tokens"
-            assert len(event.token_ids) == num_blocks * block_size, (
+        )
-                "Token ids should be the same as the custom tokens"
+        assert event.token_ids == custom_tokens, (
-            )
+            "Token ids should be the same as the custom tokens"
-            assert event.token_ids == custom_tokens, (
+        )
-                "Token ids should be the same as the custom tokens"
+    finally:
-            )
+        client.shutdown()
-        finally:
+        subscriber.close()
-            client.shutdown()
-            subscriber.close()
 @pytest.mark.asyncio
@@ -672,101 +657,96 @@ def test_kv_cache_events(
 )
 @multi_gpu_test(num_gpus=4)
 async def test_kv_cache_events_dp(
-    monkeypatch: pytest.MonkeyPatch,
    multiprocessing_mode: bool,
    publisher_config,
 ):
-    with monkeypatch.context() as m:
+    block_size = 16
-        m.setenv("VLLM_USE_V1", "1")
+    num_blocks = 2
-        block_size = 16
+    dp_size = 2
-        num_blocks = 2
+    tp_size = 2
-        dp_size = 2
-        tp_size = 2
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
-        engine_args = EngineArgs(
+        enforce_eager=True,
-            model=MODEL_NAME,
+        enable_prefix_caching=True,
-            enforce_eager=True,
+        data_parallel_size=dp_size,
-            enable_prefix_caching=True,
+        tensor_parallel_size=tp_size,
-            data_parallel_size=dp_size,
+        block_size=block_size,
-            tensor_parallel_size=tp_size,
+    )
-            block_size=block_size,
+    engine_args.kv_events_config = publisher_config
-        )
-        engine_args.kv_events_config = publisher_config
-        vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
+    vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
-        executor_class = Executor.get_class(vllm_config)
+    executor_class = Executor.get_class(vllm_config)
-        with set_default_torch_num_threads(1):
+    with set_default_torch_num_threads(1):
-            client = EngineCoreClient.make_client(
+        client = EngineCoreClient.make_client(
-                multiprocess_mode=multiprocessing_mode,
+            multiprocess_mode=multiprocessing_mode,
-                asyncio_mode=True,
+            asyncio_mode=True,
-                vllm_config=vllm_config,
+            vllm_config=vllm_config,
-                executor_class=executor_class,
+            executor_class=executor_class,
-                log_stats=False,
+            log_stats=False,
-            )
+        )
-        await asyncio.sleep(1)
+    await asyncio.sleep(1)
-        # Build endpoints for all DP ranks
+    # Build endpoints for all DP ranks
-        base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
+    base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
-        endpoints = []
+    endpoints = []
-        for i in range(dp_size):
+    for i in range(dp_size):
-            offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
+        offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
-            endpoints.append(offset_endpoint)
+        endpoints.append(offset_endpoint)
-        subscriber = MockSubscriber(
+    subscriber = MockSubscriber(
-            endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
+        endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
-        )
+    )
-        try:
+    try:
-            custom_tokens = list(range(num_blocks * block_size))
+        custom_tokens = list(range(num_blocks * block_size))
-            sampling_params = SamplingParams(max_tokens=1)
+        sampling_params = SamplingParams(max_tokens=1)
-            all_request_ids = []
+        all_request_ids = []
-            # Create and add 25 requests
+        # Create and add 25 requests
-            # NOTE: attempts to force routing to both dp groups but can be flaky
+        # NOTE: attempts to force routing to both dp groups but can be flaky
-            for i in range(25):
+        for i in range(25):
-                await asyncio.sleep(0.01)
+            await asyncio.sleep(0.01)
-                request = make_request(sampling_params, custom_tokens)
+            request = make_request(sampling_params, custom_tokens)
-                await client.add_request_async(request)
+            await client.add_request_async(request)
-                all_request_ids.append(request.request_id)
+            all_request_ids.append(request.request_id)
-            await asyncio.sleep(0.1)
+        await asyncio.sleep(0.1)
-            # Initialize outputs dict for all requests
+        # Initialize outputs dict for all requests
-            outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
+        outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
-            print("processing requests...")
+        print("processing requests...")
-            await asyncio.wait_for(
+        await asyncio.wait_for(
-                loop_until_fully_done_async(client, outputs), timeout=20.0
+            loop_until_fully_done_async(client, outputs), timeout=20.0
-            )
+        )
-            # Receive from subscriber until no more messages
+        # Receive from subscriber until no more messages
-            print("collecting results...")
+        print("collecting results...")
-            results = []
+        results = []
-            while True:
+        while True:
-                result = subscriber.receive_one(timeout=1)
+            result = subscriber.receive_one(timeout=1)
-                print(result)
+            print(result)
-                if result is None:
+            if result is None:
-                    break
+                break
-                results.append(result)
+            results.append(result)
-            # Collect all events and data_parallel_ranks from all results
+        # Collect all events and data_parallel_ranks from all results
-            all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
+        all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
-            unique_dps = set(all_dp_ranks)
+        unique_dps = set(all_dp_ranks)
-            assert len(unique_dps) == 2, (
+        assert len(unique_dps) == 2, (
-                f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
+            f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
-            )
+        )
-        finally:
+    finally:
-            client.shutdown()
+        client.shutdown()
-            subscriber.close()
+        subscriber.close()
 @pytest.mark.timeout(20)
 def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
-        m.setenv("VLLM_USE_V1", "1")
        # Monkey-patch to extract core process pid while it's starting.
        core_proc_pid = [None]
        cepm_ctor = CoreEngineProcManager.__init__
@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
    mock_executor_class.side_effect = create_mock_executor
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("CUDA_VISIBLE_DEVICES", "")  # No CUDA devices
        from vllm.v1.engine.utils import EngineZmqAddresses

--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -21,12 +21,10 @@ DTYPE = "half"
 def _vllm_model(
    apc: bool,
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    *,
    skip_tokenizer_init: bool = False,
 ):
    """Set up VllmRunner instance."""
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    return vllm_runner(
        MODEL,
        dtype=DTYPE,
@@ -45,16 +43,16 @@ def _vllm_model(
    # Prefix caching
    params=[False, True],
 )
-def vllm_model(vllm_runner, request, monkeypatch):
+def vllm_model(vllm_runner, request):
    """VllmRunner test fixture parameterized by APC True/False."""
-    with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model:
+    with _vllm_model(request.param, vllm_runner) as vllm_model:
        yield vllm_model
 @pytest.fixture(scope="function")
-def vllm_model_apc(vllm_runner, monkeypatch):
+def vllm_model_apc(vllm_runner):
    """VllmRunner test fixture with APC."""
-    with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model:
+    with _vllm_model(True, vllm_runner) as vllm_model:
        yield vllm_model
@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
    # Prefix caching
    params=[False, True],
 )
-def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
+def vllm_model_skip_tokenizer_init(vllm_runner, request):
    """VllmRunner test fixture with APC."""
    with _vllm_model(
        request.param,
        vllm_runner,
-        monkeypatch,
        skip_tokenizer_init=True,
    ) as vllm_model:
        yield vllm_model
@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
            )
-def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
+def test_engine_metrics(vllm_runner, example_prompts):
    max_tokens = 100
    # Use spec decoding to test num_accepted_tokens_per_pos
    speculative_config = {
@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
        "prompt_lookup_min": 3,
        "num_speculative_tokens": 5,
    }
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    with vllm_runner(
        MODEL,
        speculative_config=speculative_config,
@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
 @pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
-def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch):
+def test_skip_tokenizer_initialization(model: str):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    # This test checks if the flag skip_tokenizer_init skips the initialization
    # of tokenizer and detokenizer. The generated output is expected to contain
    # token ids.