[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1e4ecca1 · Cyrus Leung · GitHub · c0a7b89d · 1e4ecca1 · 1e4ecca1
Unverified commit 1e4ecca1, authored Oct 07, 2025 by Cyrus Leung; committed by GitHub on Oct 07, 2025.
11 changed files
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
 )
 def test_structured_output(
-    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
    unsupported_json_schema: dict[str, Any],
    sample_sql_ebnf: str,
@@ -115,8 +114,6 @@ def test_structured_output(
    model_name: str,
    speculative_config: dict[str, Any],
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")

@@ -620,15 +617,12 @@ Make the response as short as possible.
    ],
 )
 def test_structured_output_with_reasoning_matrices(
-    monkeypatch: pytest.MonkeyPatch,
    backend: str,
    tokenizer_mode: TokenizerMode,
    reasoning_parser: str,
    model_name: str,
    speculative_config: dict[str, Any] | None,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")

@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
 def test_structured_output_auto_mode(
-    monkeypatch: pytest.MonkeyPatch,
    unsupported_json_schema: dict[str, Any],
    model_name: str,
    tokenizer_mode: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    llm = LLM(
        model=model_name,
        max_model_len=1024,
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(


 @pytest.mark.skip_global_cleanup
-def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
+def test_guidance_no_additional_properties():
    llm = LLM(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        max_model_len=1024,
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):

 @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
 def test_structured_output_batched_with_non_structured_outputs_requests(
-    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
    backend: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    # Don't use eager execution on TPUs because we want to test for no
    # recompilation at runtime
    enforce_eager = bool(not current_platform.is_tpu())

--- a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
@@ -53,7 +53,6 @@ cleanup() {
 launch_baseline() {
  BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  PJRT_DEVICE=TPU \
  VLLM_WORKER_MULTIPROC_METHOD=spawn \
  VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
@@ -73,7 +72,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
  VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
  PJRT_DEVICE=TPU \
@@ -93,7 +91,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  PJRT_DEVICE=TPU \
  VLLM_WORKER_MULTIPROC_METHOD=spawn \
  VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \

--- a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
@@ -55,7 +55,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
  VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
  PJRT_DEVICE=TPU \
@@ -75,7 +74,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  PJRT_DEVICE=TPU \
  VLLM_WORKER_MULTIPROC_METHOD=spawn \
  VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \

--- a/tests/v1/metrics/test_ray_metrics.py
+++ b/tests/v1/metrics/test_ray_metrics.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os

 import pytest
 import ray
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
 from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger

-
-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch):
-    """
-    The change relies on V1 APIs, so set VLLM_USE_V1=1.
-    """
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
-
 MODELS = [
    "distilbert/distilgpt2",
 ]
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
    @ray.remote(num_gpus=1)
    class EngineTestActor:
        async def run(self):
-            # Set environment variable inside the Ray actor since environment
-            # variables from pytest fixtures don't propagate to Ray actors
-            os.environ["VLLM_USE_V1"] = "1"
-
            engine_args = AsyncEngineArgs(
                model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
            )

--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
    batch_logprobs_composition: BatchLogprobsComposition,
    temperature: float,
    example_prompts: list[str],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test V1 Engine logprobs & prompt logprobs

@@ -308,12 +307,8 @@ def test_get_logprobs_and_prompt_logprobs(
      temperature: "temperature" sampling parameter
      example_prompts: example prompt fixture
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
-        if do_apc and (
-            temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT
-        ):
+    if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
        # Skip some test-cases to save time.
        pytest.skip()
    test_prompts = example_prompts
@@ -361,14 +356,11 @@ def test_get_logprobs_and_prompt_logprobs(
        )


-def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
+def test_max_logprobs():
    """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
    Should also fail for `prompt_logprobs > max_logprobs`
    APC should not matter as this test checks basic request validation.
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
    runner = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=1,
@@ -386,15 +378,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
        runner.generate(["Hello world"], sampling_params=bad_sampling_params)


-def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_none_logprobs(vllm_model, example_prompts):
    """Engine should return `logprobs` and `prompt_logprobs` as `None`

    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    max_tokens = 5

    sampling_params_logprobs_none = SamplingParams(
@@ -416,15 +406,13 @@ def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
        assert results_logprobs_none[i].prompt_logprobs is None


-def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_zero_logprobs(vllm_model, example_prompts):
    """Engine should return sampled token and prompt token logprobs

    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    max_tokens = 5

    sampling_params_logprobs_zero = SamplingParams(
@@ -450,14 +438,12 @@ def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
        assert len(prompt_token_ids) == len(prompt_logprobs)


-def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_all_logprobs(example_prompts):
    """Engine should return all vocabulary logprobs and prompt logprobs

    Args:
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    runner = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=-1,
@@ -488,16 +474,13 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):


 @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
-def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch):
+def test_logprobs_mode(logprobs_mode: LogprobsMode):
    """Test with LLM engine with different logprobs_mode.
    For logprobs, we should have non-positive values.
    For logits, we should expect at least one positive values.
    """
    from vllm import LLM

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
    llm = LLM(
        "facebook/opt-125m",
        max_logprobs=5,

--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os

 import pytest

 from vllm import LLM, SamplingParams

-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
-
 MODEL = "meta-llama/Llama-3.2-1B"
 PROMPT = "Hello my name is Robert and I"

@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))


-def test_priority(llm):
-    """Check that we reject requests with priority."""
-
-    # Reject all allowed token ids
-    with pytest.raises(ValueError):
-        _ = llm.generate(PROMPT, priority=[1])
-
-
 def test_seed(llm):
    """Check that seed impacts randomness."""


--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -38,7 +38,6 @@ def test_eagle_max_len(
    monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():

--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
 @pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
 def test_basic(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    model: str,
    max_tokens: int,
    tensor_parallel_size: int,
@@ -55,9 +54,6 @@ def test_basic(
    )
    example_prompts = [prompt]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
    with vllm_runner(
        model,
        # Note: max_num_batched_tokens == 1024 is needed here to
@@ -82,7 +78,6 @@ def test_basic(
 @pytest.mark.parametrize("max_num_seqs", [16])
 def test_phi3(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    max_tokens: int,
    max_num_seqs: int,
 ) -> None:
@@ -99,9 +94,6 @@ def test_phi3(
    # test head dim = 96
    model = "microsoft/Phi-3-mini-128k-instruct"

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
    with vllm_runner(
        model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
    ) as vllm_model:
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
 )
 def test_gemma3_27b_with_text_input_and_tp(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    model = "google/gemma-3-27b-it"
    max_tokens = 16
@@ -140,9 +131,6 @@ def test_gemma3_27b_with_text_input_and_tp(
        " but in rising every time we fall.",
    ]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
    with vllm_runner(
        model,
        max_num_batched_tokens=256,
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
 )
 def test_w8a8_quantization(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
    max_tokens = 5
@@ -176,9 +163,6 @@ def test_w8a8_quantization(
    )
    example_prompts = [prompt]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
    with vllm_runner(
        model,
        max_num_batched_tokens=64,

--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
 @pytest.mark.parametrize("params", TEST_PARAMS)
 def test_perf(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    params: TestParams,
 ) -> None:
    tokenizer = get_tokenizer(
@@ -107,9 +106,6 @@ def test_perf(
        )
    )

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
    sampling_params = SamplingParams(
        max_tokens=params.decode_len, temperature=1.0, min_p=0.0
    )

--- a/tests/v1/tracing/test_tracing.py
+++ b/tests/v1/tracing/test_tracing.py
@@ -82,7 +82,7 @@ def test_traces(
 ):
    with monkeypatch.context() as m:
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "1")
+
        sampling_params = SamplingParams(
            temperature=0.01,
            top_p=0.1,

--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
        logger.info("Warming up model for the compilation...")
        # Only generate graph for the generic shape
        with _set_global_compilation_settings(self.vllm_config):
-            self._dummy_run(max(16, self.max_num_reqs))
+            self._dummy_run(
+                min(
+                    max(16, self.max_num_reqs),
+                    self.scheduler_config.max_num_batched_tokens,
+                )
+            )
+
        logger.info("Warming up done.")

    def _init_device_properties(self) -> None: