[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1e4ecca1 · Cyrus Leung · GitHub · c0a7b89d · 1e4ecca1 · 1e4ecca1
Unverified commit 1e4ecca1, authored Oct 07, 2025 by Cyrus Leung; committed by GitHub on Oct 07, 2025.
20 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -296,6 +296,7 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
+    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
@@ -317,7 +318,7 @@ steps:
  no_gpu: true
  commands:
    # split the test to avoid interference
-    - pytest -v -s v1/core
+    - pytest -v -s -m 'cpu_test' v1/core
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -13,7 +13,7 @@ import pytest
 import torch

 from vllm import LLM
-from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
+from vllm.v1.engine.llm_engine import LLMEngine

 from ..conftest import HfRunner, VllmRunner
 from ..models.utils import check_outputs_equal
@@ -211,16 +211,11 @@ def test_models_distributed(


 def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
-    from vllm.envs import VLLM_USE_V1
-
-    if not VLLM_USE_V1:
-        pytest.skip("Skipping V0 test, dump input not supported")
-
    # Needed to mock an error in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
-        if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
+        if isinstance(vllm_model.llm.llm_engine, LLMEngine):
            v1_test_failed_model_execution(vllm_model)



--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -117,68 +117,59 @@ def test_cumem_with_cudagraph():

 @create_new_process_for_each_test()
 @pytest.mark.parametrize(
-    "model, use_v1",
+    "model",
    [
        # sleep mode with safetensors
-        ("meta-llama/Llama-3.2-1B", True),
+        "meta-llama/Llama-3.2-1B",
        # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", True),
+        "facebook/opt-125m",
    ],
 )
-def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
-    with monkeypatch.context() as m:
-        assert use_v1
-        m.setenv("VLLM_USE_V1", "1")
-        free, total = torch.cuda.mem_get_info()
-        used_bytes_baseline = total - free  # in case other process is running
-        llm = LLM(model, enable_sleep_mode=True)
-        prompt = "How are you?"
-        sampling_params = SamplingParams(temperature=0, max_tokens=10)
-        output = llm.generate(prompt, sampling_params)
-
-        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
-        # which is difficult to measure in the test. therefore, we only
-        # test sleep level 1 here.
-        llm.sleep(level=1)
-
-        free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
-        # now the memory usage is mostly cudagraph memory pool,
-        # and it should be less than the model weights (1B model, 2GiB weights)
-
-        # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
-        # is captured but cannot be releasesd from PyTorch due to a known bug,
-        # therefore high memory usage after `llm.sleep` is called is expected.
-        # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
-        # in V1.
-        if use_v1:
-            assert used_bytes < 7 * GiB_bytes
-        else:
-            assert used_bytes < 2 * GiB_bytes
-
-        llm.wake_up()
-        output2 = llm.generate(prompt, sampling_params)
-        # cmp output
-        assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-        llm.sleep(level=1)
-        llm.wake_up(tags=["weights"])
-
-        free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
-
-        # should just reallocate memory for weights (1B model, ~2GiB weights)
-        if use_v1:
-            assert used_bytes < 10 * GiB_bytes
-        else:
-            assert used_bytes < 6 * GiB_bytes
-
-        # now allocate kv cache memory
-        llm.wake_up(tags=["kv_cache"])
-        output3 = llm.generate(prompt, sampling_params)
-
-        # cmp output
-        assert output[0].outputs[0].text == output3[0].outputs[0].text
+def test_end_to_end(model: str):
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM(model, enable_sleep_mode=True)
+    prompt = "How are you?"
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)
+
+    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+    # which is difficult to measure in the test. therefore, we only
+    # test sleep level 1 here.
+    llm.sleep(level=1)
+
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    # now the memory usage is mostly cudagraph memory pool,
+    # and it should be less than the model weights (1B model, 2GiB weights)
+
+    # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+    # is captured but cannot be released from PyTorch due to a known bug,
+    # therefore high memory usage after `llm.sleep` is called is expected.
+    # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+    # in V1.
+    assert used_bytes < 7 * GiB_bytes
+
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+    llm.sleep(level=1)
+    llm.wake_up(tags=["weights"])
+
+    free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+
+    # should just reallocate memory for weights (1B model, ~2GiB weights)
+    assert used_bytes < 10 * GiB_bytes
+
+    # now allocate kv cache memory
+    llm.wake_up(tags=["kv_cache"])
+    output3 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output3[0].outputs[0].text


 @create_new_process_for_each_test()

--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@@ -66,7 +66,6 @@ def llm_pair(request):
            pytest.skip("Only Blackwell GPUs support Cutlass MLA")

    env_vars = {
-        "VLLM_USE_V1": "1",
        # Force native sampler to avoid potential nondeterminism in FlashInfer
        # when per-request generators are not used in V1.
        "VLLM_USE_FLASHINFER_SAMPLER": "0",
@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
    with (
        temporary_environ(
            {
-                "VLLM_USE_V1": "1",
                "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
                # Flex_Attention is not supported with full cuda graph
            }

--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -18,7 +18,6 @@ from vllm.config import (
    VllmConfig,
    set_current_vllm_config,
 )
-from vllm.envs import VLLM_USE_V1
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils import is_torch_equal_or_newer

@@ -127,7 +126,6 @@ def _run_simple_model(
 @pytest.mark.parametrize("use_inductor", [True, False])
 @torch.inference_mode()
 def test_simple_piecewise_compile(use_inductor):
-    assert VLLM_USE_V1
    _run_simple_model(
        splitting_ops=["silly.attention"],
        use_inductor_graph_partition=False,
@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
 @torch.inference_mode()
 @pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
 def test_simple_inductor_graph_partition(splitting_ops):
-    assert VLLM_USE_V1
    if not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")


--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
        "pass_config": {"enable_async_tp": async_tp_enabled},
    }

-    async_tp_env = tp_env = {
-        "VLLM_USE_V1": "1",
-    }
-
    async_tp_args = [
        *common_args,
        "--tensor-parallel-size",
@@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
        "mp",
    ]

-    compare_two_settings(
-        model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
-    )
+    compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest

-import vllm
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.utils import _is_torch_equal_or_newer
@@ -16,15 +15,10 @@ def test_version():
    assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")


-def test_use_cudagraphs_dynamic(monkeypatch):
-    assert vllm.envs.VLLM_USE_V1
+def test_use_cudagraphs_dynamic():
    vllm_config = VllmConfig()
    assert vllm_config.compilation_config.use_cudagraph

-    monkeypatch.setenv("VLLM_USE_V1", "0")
-    vllm_config = VllmConfig()
-    assert not vllm_config.compilation_config.use_cudagraph
-

 def test_custom_op():
    # proper syntax
@@ -41,8 +35,6 @@ def test_custom_op():
 # may be influenced by other tests.
 @pytest.mark.parametrize("val", ["1"])
 def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
-    assert vllm.envs.VLLM_USE_V1
-
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
 @pytest.mark.forked
 @pytest.mark.parametrize("enabled", [True, False])
 def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
-    assert vllm.envs.VLLM_USE_V1
-
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")


--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
    model_class: type[AttentionQuantPatternModel],
    backend: _Backend,
    use_inductor_graph_partition: bool,
-    monkeypatch,
    dist_init,
    caplog_vllm,
 ):
@@ -312,8 +311,6 @@ def test_attention_quant_pattern(
    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    device = torch.device("cuda:0")
    torch.manual_seed(42)


--- a/tests/config/test_mp_reducer.py
+++ b/tests/config/test_mp_reducer.py
@@ -8,16 +8,13 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.v1.engine.async_llm import AsyncLLM


-def test_mp_reducer(monkeypatch):
+def test_mp_reducer():
    """
    Test that _reduce_config reducer is registered when AsyncLLM is instantiated
    without transformers_modules. This is a regression test for
    https://github.com/vllm-project/vllm/pull/18640.
    """

-    # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    # Ensure transformers_modules is not in sys.modules
    if "transformers_modules" in sys.modules:
        del sys.modules["transformers_modules"]

--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@@ -5,7 +5,7 @@ from typing import Any, Optional

 import pytest

-from vllm import LLM, SamplingParams, envs
+from vllm import LLM, SamplingParams

 MODEL = "meta-llama/llama-2-7b-hf"
 MAX_TOKENS = 200
@@ -111,9 +111,7 @@ def _stop_token_id(llm):

 @pytest.mark.skip_global_cleanup
 def test_stop_strings():
-    # If V0, must set enforce_eager=False since we use
-    # async output processing below.
-    llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
+    llm = LLM(MODEL, enforce_eager=True)

    _stop_basic(llm)
    _stop_multi_tokens(llm)

--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple):
 @dataclass
 class CPTestSettings:
    parallel_setups: list[ParallelSetup]
-    # NOTE: the length of distributed_backends and
-    # vllm_major_versions should be the same, and they
-    # are first zipped together to iterate over all
-    # test settings.
    distributed_backends: list[str]
-    # vllm major version: "0" for V0, "1" for V1
-    vllm_major_versions: list[str]
    runner: RunnerOption
    test_options: CPTestOptions

-    def __post_init__(self):
-        if len(self.distributed_backends) != len(self.vllm_major_versions):
-            raise ValueError(
-                f"Length mismatch: distributed_backends "
-                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})"
-            )
-
    @staticmethod
    def detailed(
        *,
@@ -87,7 +73,6 @@ class CPTestSettings:
        return CPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp"],
-            vllm_major_versions=["1"],
            runner=runner,
            test_options=CPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
@@ -98,14 +83,11 @@ class CPTestSettings:
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(
-                self.distributed_backends, self.vllm_major_versions
-            ):
+            for backend in self.distributed_backends:
                yield (
                    model_id,
                    parallel_setup,
                    backend,
-                    vllm_major_version,
                    self.runner,
                    opts,
                )
@@ -115,7 +97,6 @@ def _compare_cp_with_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    vllm_major_version: str,
    runner: RunnerOption,
    test_options: CPTestOptions,
    num_gpus_available: int,
@@ -191,10 +172,6 @@ def _compare_cp_with_tp(
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

-    cp_env = tp_env = {
-        "VLLM_USE_V1": vllm_major_version,  # Note(hc): DCP only support V1 engine only
-    }
-
    cp_args = [
        *common_args,
        "--tensor-parallel-size",
@@ -217,24 +194,13 @@ def _compare_cp_with_tp(
        distributed_backend,
    ]

-    try:
-        compare_two_settings(
-            model_id,
-            cp_args,
-            tp_args,
-            cp_env,
-            tp_env,
-            method=method,
-            max_wait_seconds=720,
-        )
-    except Exception:
-        testing_ray_compiled_graph = cp_env is not None
-        if testing_ray_compiled_graph and vllm_major_version == "0":
-            # Ray Compiled Graph tests are flaky for V0,
-            # so we don't want to fail the test
-            logger.exception("Ray Compiled Graph tests failed")
-        else:
-            raise
+    compare_two_settings(
+        model_id,
+        cp_args,
+        tp_args,
+        method=method,
+        max_wait_seconds=720,
+    )


 CP_TEXT_GENERATION_MODELS = {
@@ -257,7 +223,6 @@ CP_TEST_MODELS = [
        "model_id",
        "parallel_setup",
        "distributed_backend",
-        "vllm_major_version",
        "runner",
        "test_options",
    ),
@@ -274,7 +239,6 @@ def test_cp_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    vllm_major_version: str,
    runner: RunnerOption,
    test_options: CPTestOptions,
    num_gpus_available,
@@ -283,7 +247,6 @@ def test_cp_generation(
        model_id,
        parallel_setup,
        distributed_backend,
-        vllm_major_version,
        runner,
        test_options,
        num_gpus_available,

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -307,7 +307,6 @@ def _compare_tp(
    if distributed_backend == "ray":
        # For V1, test Ray Compiled Graph for all the tests
        pp_env = {
-            "VLLM_USE_V1": "1",
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
@@ -316,15 +315,11 @@ def _compare_tp(
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")
    elif distributed_backend == "mp":
-        pp_env = {
-            "VLLM_USE_V1": "1",
-        }
+        pp_env = None
    else:
        pp_env = None

-    tp_env = {
-        "VLLM_USE_V1": "1",
-    }
+    tp_env = None

    pp_args = [
        *common_args,

--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple):
 @dataclass
 class SPTestSettings:
    parallel_setups: list[ParallelSetup]
-    # NOTE: the length of distributed_backends and
-    # vllm_major_versions should be the same, and they
-    # are first zipped together to iterate over all
-    # test settings.
    distributed_backends: list[str]
-    # vllm major version: "0" for V0, "1" for V1
-    vllm_major_versions: list[str]
    runner: RunnerOption
    test_options: SPTestOptions

-    def __post_init__(self):
-        if len(self.distributed_backends) != len(self.vllm_major_versions):
-            raise ValueError(
-                f"Length mismatch: distributed_backends "
-                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})"
-            )
-
    @staticmethod
    def detailed(
        *,
@@ -85,7 +71,6 @@ class SPTestSettings:
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
            runner=runner,
            test_options=SPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
@@ -117,7 +102,6 @@ class SPTestSettings:
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
            runner=runner,
            test_options=SPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
@@ -147,7 +131,6 @@ class SPTestSettings:
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
            runner=runner,
            test_options=SPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
@@ -158,14 +141,11 @@ class SPTestSettings:
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(
-                self.distributed_backends, self.vllm_major_versions
-            ):
+            for backend in self.distributed_backends:
                yield (
                    model_id,
                    parallel_setup,
                    backend,
-                    vllm_major_version,
                    self.runner,
                    opts,
                )
@@ -175,7 +155,6 @@ def _compare_sp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    vllm_major_version: str,
    runner: RunnerOption,
    test_options: SPTestOptions,
    num_gpus_available: int,
@@ -265,10 +244,6 @@ def _compare_sp(
        },
    }

-    tp_sp_env = tp_env = {
-        "VLLM_USE_V1": vllm_major_version,
-    }
-
    tp_sp_args = [
        *common_args,
        "--tensor-parallel-size",
@@ -281,9 +256,6 @@ def _compare_sp(
        json.dumps(compilation_config),
    ]

-    tp_env = {
-        "VLLM_USE_V1": vllm_major_version,
-    }
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
@@ -292,18 +264,7 @@ def _compare_sp(
        "mp",
    ]

-    try:
-        compare_two_settings(
-            model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method
-        )
-    except Exception:
-        testing_ray_compiled_graph = tp_sp_env is not None
-        if testing_ray_compiled_graph and vllm_major_version == "0":
-            # Ray Compiled Graph tests are flaky for V0,
-            # so we don't want to fail the test
-            logger.exception("Ray Compiled Graph tests failed")
-        else:
-            raise
+    compare_two_settings(model_id, tp_sp_args, tp_args, method=method)


 SP_TEXT_GENERATION_MODELS = {
@@ -325,7 +286,6 @@ SP_TEST_MODELS = [
        "model_id",
        "parallel_setup",
        "distributed_backend",
-        "vllm_major_version",
        "runner",
        "test_options",
    ),
@@ -341,7 +301,6 @@ def test_tp_sp_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    vllm_major_version: str,
    runner: RunnerOption,
    test_options: SPTestOptions,
    num_gpus_available,
@@ -350,7 +309,6 @@ def test_tp_sp_generation(
        model_id,
        parallel_setup,
        distributed_backend,
-        vllm_major_version,
        runner,
        test_options,
        num_gpus_available,

--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -61,50 +61,34 @@ def run_test(model_name, more_args=None):
 TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"


-@pytest.mark.skipif(
-    not current_platform.is_cuda() and not current_platform.is_tpu(),
-    reason="V1 is currently only supported on CUDA and TPU",
-)
 @pytest.mark.parametrize("model", MODEL_NAMES)
-def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
+def test_lm_eval_accuracy_v1_engine(model):
    """Run with the V1 Engine."""

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    more_args = None
+    if current_platform.is_tpu():
+        # Limit compilation time for TPU V1

-        more_args = None
-        if current_platform.is_tpu():
-            # Limit compilation time for TPU V1
+        more_args = "max_model_len=2048,max_num_seqs=64"

-            more_args = "max_model_len=2048,max_num_seqs=64"
+        # Add TP test (if provided)
+        if TPU_TP_TEST_STR:
+            more_args += ",{}".format(TPU_TP_TEST_STR)

-            # Add TP test (if provided)
-            if TPU_TP_TEST_STR:
-                more_args += ",{}".format(TPU_TP_TEST_STR)
+    run_test(model, more_args)

-        run_test(model, more_args)

-
-@pytest.mark.skipif(
-    not current_platform.is_cuda() and not current_platform.is_tpu(),
-    reason="V1 is currently only supported on CUDA and TPU",
-)
 @pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
-def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
-    model, monkeypatch: pytest.MonkeyPatch
-):
+def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
    """Run with the V1 Engine."""

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        more_args = None
-        if current_platform.is_tpu():
-            # Limit compilation time for TPU V1
-            more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
+    more_args = None
+    if current_platform.is_tpu():
+        # Limit compilation time for TPU V1
+        more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"

-            # Add TP test (if provided)
-            if TPU_TP_TEST_STR:
-                more_args += ",{}".format(TPU_TP_TEST_STR)
+        # Add TP test (if provided)
+        if TPU_TP_TEST_STR:
+            more_args += ",{}".format(TPU_TP_TEST_STR)

-        run_test(model, more_args)
+    run_test(model, more_args)
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@@ -10,7 +10,6 @@ AsyncLLMEngine are working correctly.
 """

 import lm_eval
-import pytest

 from vllm.platforms import current_platform

@@ -67,21 +66,13 @@ def run_test(more_args):
        ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"


-@pytest.mark.skipif(
-    not current_platform.is_cuda()
-    and not current_platform.is_tpu()
-    and not current_platform.is_xpu(),
-    reason="V1 currently only supported on CUDA, XPU and TPU",
-)
-def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
+def test_lm_eval_accuracy_v1_engine():
    """Run with the V1 Engine."""

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        more_args = []
+    more_args = []

-        # Limit compilation time for V1
-        if current_platform.is_tpu():
-            more_args = ["--max-num-seqs", "64"]
+    # Limit compilation time for V1
+    if current_platform.is_tpu():
+        more_args = ["--max-num-seqs", "64"]

-        run_test(more_args)
+    run_test(more_args)
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -21,18 +21,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


 @pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
-@pytest.fixture(scope="module")
-def server(monkeypatch_module, zephyr_lora_files):  # noqa: F811
-    monkeypatch_module.setenv("VLLM_USE_V1", "1")
-
+def server(zephyr_lora_files):  # noqa: F811
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",

--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -37,21 +37,8 @@ BADREQUEST_CASES = [
 ]


-@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
 @pytest.fixture(scope="module", params=[True])
-def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files):
-    use_v1 = request.param
-    assert use_v1
-    monkeypatch_module.setenv("VLLM_USE_V1", "1")
-
+def server_with_lora_modules_json(request, zephyr_lora_files):
    # Define the json format LoRA module configurations
    lora_module_1 = {
        "name": "zephyr-lora",

--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PREV_MINOR_VERSION = version._prev_minor_version()


-@pytest.fixture(scope="module", params=[True])
-def use_v1(request):
-    # Module-scoped variant of run_with_both_engines
-    #
-    # Use this fixture to run a test with both v0 and v1, and
-    # also to conditionalize the test logic e.g.
-    #
-    # def test_metrics_exist(use_v1, server, client):
-    #     ...
-    #     expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
-    #     for metric in expected:
-    #         assert metric in response.text
-    #
-    # @skip_v1 wouldn't work here because this is a module-level
-    # fixture - per-function decorators would have no effect
-    yield request.param
-
-
 @pytest.fixture(scope="module")
 def default_server_args():
    return [
@@ -63,13 +45,11 @@ def default_server_args():
        f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
    ],
 )
-def server(use_v1, default_server_args, request):
+def server(default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
-    with RemoteOpenAIServer(
-        MODEL_NAME, default_server_args, env_dict=env_dict
-    ) as remote_server:
+
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server


@@ -129,7 +109,8 @@ EXPECTED_VALUES = {

 @pytest.mark.asyncio
 async def test_metrics_counts(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
    for _ in range(_NUM_REQUESTS):
        # sending a request triggers the metrics to be logged.
@@ -145,7 +126,7 @@ async def test_metrics_counts(

    # Loop over all expected metric_families
    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
+        if (metric_family not in EXPECTED_METRICS_V1) or (
            not server.show_hidden_metrics
            and metric_family in HIDDEN_DEPRECATED_METRICS
        ):
@@ -183,62 +164,6 @@ async def test_metrics_counts(
        assert found_metric, f"Did not find {metric_family} in prom endpoint"


-EXPECTED_METRICS = [
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
-    "vllm:e2e_request_latency_seconds_sum",
-    "vllm:e2e_request_latency_seconds_bucket",
-    "vllm:e2e_request_latency_seconds_count",
-    "vllm:request_queue_time_seconds_sum",
-    "vllm:request_queue_time_seconds_bucket",
-    "vllm:request_queue_time_seconds_count",
-    "vllm:request_inference_time_seconds_sum",
-    "vllm:request_inference_time_seconds_bucket",
-    "vllm:request_inference_time_seconds_count",
-    "vllm:request_prefill_time_seconds_sum",
-    "vllm:request_prefill_time_seconds_bucket",
-    "vllm:request_prefill_time_seconds_count",
-    "vllm:request_decode_time_seconds_sum",
-    "vllm:request_decode_time_seconds_bucket",
-    "vllm:request_decode_time_seconds_count",
-    "vllm:request_prompt_tokens_sum",
-    "vllm:request_prompt_tokens_bucket",
-    "vllm:request_prompt_tokens_count",
-    "vllm:request_generation_tokens_sum",
-    "vllm:request_generation_tokens_bucket",
-    "vllm:request_generation_tokens_count",
-    "vllm:request_params_n_sum",
-    "vllm:request_params_n_bucket",
-    "vllm:request_params_n_count",
-    "vllm:request_params_max_tokens_sum",
-    "vllm:request_params_max_tokens_bucket",
-    "vllm:request_params_max_tokens_count",
-    "vllm:iteration_tokens_total",
-    "vllm:num_preemptions_total",
-    "vllm:prompt_tokens_total",
-    "vllm:generation_tokens_total",
-    "vllm:request_success_total",
-    "vllm:cache_config_info",
-    # labels in cache_config_info
-    "block_size",
-    "cache_dtype",
-    "cpu_offload_gb",
-    "enable_prefix_caching",
-    "gpu_memory_utilization",
-    "num_cpu_blocks",
-    "num_gpu_blocks",
-    "num_gpu_blocks_override",
-    "sliding_window",
-    "swap_space_bytes",
-]
-
 EXPECTED_METRICS_V1 = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
@@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [

 @pytest.mark.asyncio
 async def test_metrics_exist(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
    # sending a request triggers the metrics to be logged.
    await client.completions.create(
-        model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        temperature=0.0,
    )

    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

-    for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
+    for metric in EXPECTED_METRICS_V1:
        if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
            continue
        assert metric in response.text
@@ -322,10 +251,11 @@ async def test_metrics_exist(

 @pytest.mark.asyncio
 async def test_abort_metrics_reset(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
    )

    # Expect no running requests or kvcache usage
@@ -351,7 +281,7 @@ async def test_abort_metrics_reset(

    # Check that we have running requests
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
    )

    # Expect running requests and kvcache usage
@@ -371,7 +301,7 @@ async def test_abort_metrics_reset(

    # Verify running and waiting requests counts and KV cache usage are zero
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1)
+        _get_running_metrics_from_api(server)
    )

    assert running_requests_after == 0, (
@@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
    )


-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Return (running_count, waiting_count, kv_cache_usage)"""

    response = requests.get(server.url_for("metrics"))
@@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
    # Verify running and waiting requests counts and KV cache usage are zero
    running_requests, waiting_requests, kv_cache_usage = None, None, None

-    kv_cache_usage_metric = (
-        "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
-    )
+    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"

    for family in text_string_to_metric_families(response.text):
        if family.name == "vllm:num_requests_running":
@@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
    return running_requests, waiting_requests, kv_cache_usage


-def test_metrics_exist_run_batch(use_v1: bool):
+def test_metrics_exist_run_batch():
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501

    base_url = "0.0.0.0"
@@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
                "--port",
                port,
            ],
-            env={"VLLM_USE_V1": "1"},
        )

        def is_server_up(url):

--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -15,11 +15,6 @@ from vllm.entrypoints.renderer import BaseRenderer
 from ...utils import RemoteOpenAIServer


-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
-
 @pytest.mark.asyncio
 async def test_empty_prompt():
    model_name = "gpt2"

--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -80,7 +80,6 @@ def test_env(
 ):
    """Test attention backend selection with valid device-backend pairs."""
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv(STR_BACKEND_ENV_VAR, name)
        m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")

@@ -212,30 +211,21 @@ def test_env(


 @pytest.mark.parametrize("device", ["cpu", "cuda"])
-def test_fp32_fallback(
-    device: str,
-    monkeypatch: pytest.MonkeyPatch,
-):
+def test_fp32_fallback(device: str):
    """Test attention backend selection with fp32."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    if device == "cpu":
+        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
+            backend = get_attn_backend(16, torch.float32, None, 16)
+        assert backend.get_name() == "TORCH_SDPA"

-        if device == "cpu":
-            with patch("vllm.attention.selector.current_platform", CpuPlatform()):
-                backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "TORCH_SDPA"
-
-        elif device == "cuda":
-            with patch("vllm.attention.selector.current_platform", CudaPlatform()):
-                backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "FLEX_ATTENTION"
+    elif device == "cuda":
+        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+            backend = get_attn_backend(16, torch.float32, None, 16)
+        assert backend.get_name() == "FLEX_ATTENTION"


 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
    """Test FlashAttn validation."""
-    # TODO: When testing for v1, pipe in `use_v1` as an argument to
-    # get_attn_backend
-
    pytest.skip(
        "Skipping as current backend selector does not "
        "handle fallbacks when a backend is set via env var."
@@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
        monkeypatch.context() as m,
        patch("vllm.attention.selector.current_platform", CudaPlatform()),
    ):
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)

        # Should raise ValueError for invalid backend