[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1e4ecca1 · Cyrus Leung · GitHub · c0a7b89d · 1e4ecca1 · 1e4ecca1
Unverified commit 1e4ecca1, authored Oct 07, 2025 by Cyrus Leung; committed by GitHub on Oct 07, 2025
11 changed files
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
 )
 def test_structured_output(
-    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
    unsupported_json_schema: dict[str, Any],
    sample_sql_ebnf: str,
@@ -115,8 +114,6 @@ def test_structured_output(
    model_name: str,
    speculative_config: dict[str, Any],
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")
@@ -620,15 +617,12 @@ Make the response as short as possible.
    ],
 )
 def test_structured_output_with_reasoning_matrices(
-    monkeypatch: pytest.MonkeyPatch,
    backend: str,
    tokenizer_mode: TokenizerMode,
    reasoning_parser: str,
    model_name: str,
    speculative_config: dict[str, Any] | None,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
 def test_structured_output_auto_mode(
-    monkeypatch: pytest.MonkeyPatch,
    unsupported_json_schema: dict[str, Any],
    model_name: str,
    tokenizer_mode: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    llm = LLM(
        model=model_name,
        max_model_len=1024,
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
 @pytest.mark.skip_global_cleanup
-def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
+def test_guidance_no_additional_properties():
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    llm = LLM(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        max_model_len=1024,
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
 @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
 def test_structured_output_batched_with_non_structured_outputs_requests(
-    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
    backend: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    # Don't use eager execution on TPUs because we want to test for no
    # recompilation at runtime
    enforce_eager = bool(not current_platform.is_tpu())

--- a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
@@ -53,7 +53,6 @@ cleanup() {
 launch_baseline() {
  BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  PJRT_DEVICE=TPU \
  VLLM_WORKER_MULTIPROC_METHOD=spawn \
  VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
@@ -73,7 +72,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
  VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
  PJRT_DEVICE=TPU \
@@ -93,7 +91,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  PJRT_DEVICE=TPU \
  VLLM_WORKER_MULTIPROC_METHOD=spawn \
  VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \

--- a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
@@ -55,7 +55,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
  VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
  PJRT_DEVICE=TPU \
@@ -75,7 +74,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  PJRT_DEVICE=TPU \
  VLLM_WORKER_MULTIPROC_METHOD=spawn \
  VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \

--- a/tests/v1/metrics/test_ray_metrics.py
+++ b/tests/v1/metrics/test_ray_metrics.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 import pytest
 import ray
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
 from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch):
-    """
-    The change relies on V1 APIs, so set VLLM_USE_V1=1.
-    """
-    monkeypatch.setenv("VLLM_USE_V1", "1")
 MODELS = [
    "distilbert/distilgpt2",
 ]
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
    @ray.remote(num_gpus=1)
    class EngineTestActor:
        async def run(self):
-            # Set environment variable inside the Ray actor since environment
-            # variables from pytest fixtures don't propagate to Ray actors
-            os.environ["VLLM_USE_V1"] = "1"
            engine_args = AsyncEngineArgs(
                model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
            )

--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
    batch_logprobs_composition: BatchLogprobsComposition,
    temperature: float,
    example_prompts: list[str],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test V1 Engine logprobs & prompt logprobs
@@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs(
      temperature: "temperature" sampling parameter
      example_prompts: example prompt fixture
    """
-    with monkeypatch.context() as m:
+    do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
-        m.setenv("VLLM_USE_V1", "1")
+    if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
-        do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
+        # Skip some test-cases to save time.
-        if do_apc and (
+        pytest.skip()
-            temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT
+    test_prompts = example_prompts
-        ):
-            # Skip some test-cases to save time.
+    max_tokens = 5
-            pytest.skip()
+    hf_outputs = hf_model.generate_greedy(
-        test_prompts = example_prompts
+        test_prompts,
+        max_tokens=max_tokens,
-        max_tokens = 5
+    )
-        hf_outputs = hf_model.generate_greedy(
+    hf_logprobs = hf_model.generate_greedy_logprobs(
-            test_prompts,
+        test_prompts,
+        max_tokens=max_tokens,
+    )
+    # Batch has mixed sample params
+    # (different logprobs/prompt logprobs combos)
+    logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
+    # Ensure that each test prompt has a logprob config for testing
+    logprob_prompt_logprob_list = _repeat_logprob_config(
+        test_prompts, logprob_prompt_logprob_list
+    )
+    # Generate SamplingParams
+    vllm_sampling_params = [
+        SamplingParams(
            max_tokens=max_tokens,
+            logprobs=num_lp,
+            prompt_logprobs=num_plp,
+            temperature=temperature,
+            seed=1984,
        )
-        hf_logprobs = hf_model.generate_greedy_logprobs(
+        for num_lp, num_plp in logprob_prompt_logprob_list
-            test_prompts,
+    ]
+    for _ in range(2 if do_apc else 1):
+        _run_and_validate(
+            vllm_model=vllm_model,
+            test_prompts=test_prompts,
+            vllm_sampling_params=vllm_sampling_params,
+            hf_logprobs=hf_logprobs,
+            hf_outputs=hf_outputs,
+            logprob_prompt_logprob_list=logprob_prompt_logprob_list,
+            temperature=temperature,
            max_tokens=max_tokens,
+            do_apc=do_apc,
        )
-        # Batch has mixed sample params
-        # (different logprobs/prompt logprobs combos)
-        logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
-        # Ensure that each test prompt has a logprob config for testing
+def test_max_logprobs():
-        logprob_prompt_logprob_list = _repeat_logprob_config(
-            test_prompts, logprob_prompt_logprob_list
-        )
-        # Generate SamplingParams
-        vllm_sampling_params = [
-            SamplingParams(
-                max_tokens=max_tokens,
-                logprobs=num_lp,
-                prompt_logprobs=num_plp,
-                temperature=temperature,
-                seed=1984,
-            )
-            for num_lp, num_plp in logprob_prompt_logprob_list
-        ]
-        for _ in range(2 if do_apc else 1):
-            _run_and_validate(
-                vllm_model=vllm_model,
-                test_prompts=test_prompts,
-                vllm_sampling_params=vllm_sampling_params,
-                hf_logprobs=hf_logprobs,
-                hf_outputs=hf_outputs,
-                logprob_prompt_logprob_list=logprob_prompt_logprob_list,
-                temperature=temperature,
-                max_tokens=max_tokens,
-                do_apc=do_apc,
-            )
-def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
    """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
    Should also fail for `prompt_logprobs > max_logprobs`
    APC should not matter as this test checks basic request validation.
    """
-    with monkeypatch.context() as m:
+    runner = VllmRunner(
-        m.setenv("VLLM_USE_V1", "1")
+        "facebook/opt-125m",
+        max_logprobs=1,
-        runner = VllmRunner(
+        enable_prefix_caching=False,
-            "facebook/opt-125m",
+        # 2 other llms alive during whole session
-            max_logprobs=1,
+        gpu_memory_utilization=0.15,
-            enable_prefix_caching=False,
+        max_model_len=256,
-            # 2 other llms alive during whole session
+    )
-            gpu_memory_utilization=0.15,
+    vllm_sampling_params = SamplingParams(logprobs=1)
-            max_model_len=256,
+    # should pass
-        )
+    runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
-        vllm_sampling_params = SamplingParams(logprobs=1)
-        # should pass
-        runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
-        bad_sampling_params = SamplingParams(logprobs=2)
+    bad_sampling_params = SamplingParams(logprobs=2)
-        with pytest.raises(ValueError):
+    with pytest.raises(ValueError):
-            runner.generate(["Hello world"], sampling_params=bad_sampling_params)
+        runner.generate(["Hello world"], sampling_params=bad_sampling_params)
-def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_none_logprobs(vllm_model, example_prompts):
    """Engine should return `logprobs` and `prompt_logprobs` as `None`
    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
+    max_tokens = 5
-        m.setenv("VLLM_USE_V1", "1")
-        max_tokens = 5
-        sampling_params_logprobs_none = SamplingParams(
+    sampling_params_logprobs_none = SamplingParams(
-            max_tokens=max_tokens,
+        max_tokens=max_tokens,
-            logprobs=None,
+        logprobs=None,
-            prompt_logprobs=None,
+        prompt_logprobs=None,
-            temperature=0.0,
+        temperature=0.0,
-        )
+    )
-        results_logprobs_none = vllm_model.llm.generate(
+    results_logprobs_none = vllm_model.llm.generate(
-            example_prompts,
+        example_prompts,
-            sampling_params=sampling_params_logprobs_none,
+        sampling_params=sampling_params_logprobs_none,
-        )
+    )
-        for i in range(len(results_logprobs_none)):
+    for i in range(len(results_logprobs_none)):
-            # Check sample logprobs are None
+        # Check sample logprobs are None
-            assert results_logprobs_none[i].outputs[0].logprobs is None
+        assert results_logprobs_none[i].outputs[0].logprobs is None
-            assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
+        assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
-            # Check prompt logprobs are None
+        # Check prompt logprobs are None
-            assert results_logprobs_none[i].prompt_logprobs is None
+        assert results_logprobs_none[i].prompt_logprobs is None
-def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_zero_logprobs(vllm_model, example_prompts):
    """Engine should return sampled token and prompt token logprobs
    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
+    max_tokens = 5
-        m.setenv("VLLM_USE_V1", "1")
-        max_tokens = 5
-        sampling_params_logprobs_zero = SamplingParams(
+    sampling_params_logprobs_zero = SamplingParams(
-            max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
+        max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
-        )
+    )
-        results_logprobs_zero = vllm_model.llm.generate(
+    results_logprobs_zero = vllm_model.llm.generate(
-            example_prompts, sampling_params=sampling_params_logprobs_zero
+        example_prompts, sampling_params=sampling_params_logprobs_zero
-        )
+    )
-        for i in range(len(results_logprobs_zero)):
+    for i in range(len(results_logprobs_zero)):
-            # Check that there is one sample logprob dict for each
+        # Check that there is one sample logprob dict for each
-            # sample token
+        # sample token
-            logprobs = results_logprobs_zero[i].outputs[0].logprobs
+        logprobs = results_logprobs_zero[i].outputs[0].logprobs
-            prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
+        prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
-            sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
+        sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
-            prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
+        prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
-            assert logprobs is not None
+        assert logprobs is not None
-            assert len(sampled_token_ids) == len(logprobs)
+        assert len(sampled_token_ids) == len(logprobs)
-            assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
+        assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
-            # Check that there is one prompt logprob dict for each
+        # Check that there is one prompt logprob dict for each
-            # prompt token
+        # prompt token
-            assert prompt_logprobs is not None
+        assert prompt_logprobs is not None
-            assert len(prompt_token_ids) == len(prompt_logprobs)
+        assert len(prompt_token_ids) == len(prompt_logprobs)
-def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_all_logprobs(example_prompts):
    """Engine should return all vocabulary logprobs and prompt logprobs
    Args:
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
+    runner = VllmRunner(
-        m.setenv("VLLM_USE_V1", "1")
+        "facebook/opt-125m",
-        runner = VllmRunner(
+        max_logprobs=-1,
-            "facebook/opt-125m",
+        enable_prefix_caching=False,
-            max_logprobs=-1,
+        # 2 other llms alive during whole session
-            enable_prefix_caching=False,
+        gpu_memory_utilization=0.15,
-            # 2 other llms alive during whole session
+        max_model_len=256,
-            gpu_memory_utilization=0.15,
+    )
-            max_model_len=256,
-        )
-        sampling_params_logprobs_all = SamplingParams(
+    sampling_params_logprobs_all = SamplingParams(
-            max_tokens=5, logprobs=-1, prompt_logprobs=-1
+        max_tokens=5, logprobs=-1, prompt_logprobs=-1
-        )
+    )
-        results_logprobs_all = runner.llm.generate(
+    results_logprobs_all = runner.llm.generate(
-            example_prompts, sampling_params=sampling_params_logprobs_all
+        example_prompts, sampling_params=sampling_params_logprobs_all
-        )
+    )
-        vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
+    vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
-        for i in range(len(results_logprobs_all)):
+    for i in range(len(results_logprobs_all)):
-            logprobs = results_logprobs_all[i].outputs[0].logprobs
+        logprobs = results_logprobs_all[i].outputs[0].logprobs
-            prompt_logprobs = results_logprobs_all[i].prompt_logprobs
+        prompt_logprobs = results_logprobs_all[i].prompt_logprobs
-            assert logprobs is not None
+        assert logprobs is not None
-            for logprob in logprobs:
+        for logprob in logprobs:
-                assert len(logprob) == vocab_size
+            assert len(logprob) == vocab_size
-            assert prompt_logprobs is not None
+        assert prompt_logprobs is not None
-            assert prompt_logprobs[0] is None
+        assert prompt_logprobs[0] is None
-            for prompt_logprob in prompt_logprobs[1:]:
+        for prompt_logprob in prompt_logprobs[1:]:
-                assert len(prompt_logprob) == vocab_size
+            assert len(prompt_logprob) == vocab_size
 @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
-def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch):
+def test_logprobs_mode(logprobs_mode: LogprobsMode):
    """Test with LLM engine with different logprobs_mode.
    For logprobs, we should have non-positive values.
    For logits, we should expect at least one positive values.
    """
    from vllm import LLM
-    with monkeypatch.context() as m:
+    llm = LLM(
-        m.setenv("VLLM_USE_V1", "1")
+        "facebook/opt-125m",
+        max_logprobs=5,
-        llm = LLM(
+        enable_prefix_caching=False,
-            "facebook/opt-125m",
+        # 2 other llms alive during whole session
-            max_logprobs=5,
+        gpu_memory_utilization=0.05,
-            enable_prefix_caching=False,
+        max_model_len=16,
-            # 2 other llms alive during whole session
+        logprobs_mode=logprobs_mode,
-            gpu_memory_utilization=0.05,
+    )
-            max_model_len=16,
+    vllm_sampling_params = SamplingParams(logprobs=1)
-            logprobs_mode=logprobs_mode,
+    results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)
-        )
-        vllm_sampling_params = SamplingParams(logprobs=1)
+    total_token_with_logprobs = 0
-        results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)
+    positive_values = 0
+    for output in results[0].outputs:
-        total_token_with_logprobs = 0
+        for logprobs in output.logprobs:
-        positive_values = 0
+            for token_id in logprobs:
-        for output in results[0].outputs:
+                logprob = logprobs[token_id]
-            for logprobs in output.logprobs:
+                if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
-                for token_id in logprobs:
+                    assert logprob.logprob <= 0
-                    logprob = logprobs[token_id]
+                if logprob.logprob > 0:
-                    if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
+                    positive_values = positive_values + 1
-                        assert logprob.logprob <= 0
+                total_token_with_logprobs = total_token_with_logprobs + 1
-                    if logprob.logprob > 0:
+    assert total_token_with_logprobs >= len(results[0].outputs)
-                        positive_values = positive_values + 1
+    if logprobs_mode in ("raw_logits", "processed_logits"):
-                    total_token_with_logprobs = total_token_with_logprobs + 1
+        assert positive_values > 0
-        assert total_token_with_logprobs >= len(results[0].outputs)
+    del llm
-        if logprobs_mode in ("raw_logits", "processed_logits"):
-            assert positive_values > 0
-        del llm
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 import pytest
 from vllm import LLM, SamplingParams
-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
 MODEL = "meta-llama/Llama-3.2-1B"
 PROMPT = "Hello my name is Robert and I"
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
-def test_priority(llm):
-    """Check that we reject requests with priority."""
-    # Reject all allowed token ids
-    with pytest.raises(ValueError):
-        _ = llm.generate(PROMPT, priority=[1])
 def test_seed(llm):
    """Check that seed impacts randomness."""

--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -38,7 +38,6 @@ def test_eagle_max_len(
    monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():

--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
 @pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
 def test_basic(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    model: str,
    max_tokens: int,
    tensor_parallel_size: int,
@@ -55,23 +54,20 @@ def test_basic(
    )
    example_prompts = [prompt]
-    with monkeypatch.context() as m:
+    with vllm_runner(
-        m.setenv("VLLM_USE_V1", "1")
+        model,
+        # Note: max_num_batched_tokens == 1024 is needed here to
+        # actually test chunked prompt
+        max_num_batched_tokens=1024,
+        max_model_len=8192,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    output = vllm_outputs[0][1]
-        with vllm_runner(
+    assert "1024" in output or "0, 1" in output
-            model,
-            # Note: max_num_batched_tokens == 1024 is needed here to
-            # actually test chunked prompt
-            max_num_batched_tokens=1024,
-            max_model_len=8192,
-            gpu_memory_utilization=0.7,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        output = vllm_outputs[0][1]
-        assert "1024" in output or "0, 1" in output
 @pytest.mark.skip(reason="Temporarily disabled due to timeout")
@@ -82,7 +78,6 @@ def test_basic(
 @pytest.mark.parametrize("max_num_seqs", [16])
 def test_phi3(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    max_tokens: int,
    max_num_seqs: int,
 ) -> None:
@@ -99,18 +94,15 @@ def test_phi3(
    # test head dim = 96
    model = "microsoft/Phi-3-mini-128k-instruct"
-    with monkeypatch.context() as m:
+    with vllm_runner(
-        m.setenv("VLLM_USE_V1", "1")
+        model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
+    ) as vllm_model:
-        with vllm_runner(
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-            model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
+    # vllm_outputs is a list of tuples whose first element is the token id
-        ) as vllm_model:
+    # and the second element is the output (including the prompt).
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+    for output, answer in zip(vllm_outputs, answers):
-        # vllm_outputs is a list of tuples whose first element is the token id
+        generated_text = output[1]
-        # and the second element is the output (including the prompt).
+        assert answer in generated_text
-        for output, answer in zip(vllm_outputs, answers):
-            generated_text = output[1]
-            assert answer in generated_text
 TP_SIZE_8 = 8
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
 )
 def test_gemma3_27b_with_text_input_and_tp(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    model = "google/gemma-3-27b-it"
    max_tokens = 16
@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
        " but in rising every time we fall.",
    ]
-    with monkeypatch.context() as m:
+    with vllm_runner(
-        m.setenv("VLLM_USE_V1", "1")
+        model,
+        max_num_batched_tokens=256,
-        with vllm_runner(
+        max_num_seqs=max_num_seqs,
-            model,
+        tensor_parallel_size=tensor_parallel_size,
-            max_num_batched_tokens=256,
+    ) as vllm_model:
-            max_num_seqs=max_num_seqs,
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-            tensor_parallel_size=tensor_parallel_size,
+    # vllm_outputs is a list of tuples whose first element is the token id
-        ) as vllm_model:
+    # and the second element is the output (including the prompt).
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+    for output, answer in zip(vllm_outputs, answers):
-        # vllm_outputs is a list of tuples whose first element is the token id
+        generated_text = output[1]
-        # and the second element is the output (including the prompt).
+        assert answer in generated_text
-        for output, answer in zip(vllm_outputs, answers):
-            generated_text = output[1]
-            assert answer in generated_text
 @pytest.mark.skipif(
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
 )
 def test_w8a8_quantization(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
    max_tokens = 5
@@ -176,18 +163,15 @@ def test_w8a8_quantization(
    )
    example_prompts = [prompt]
-    with monkeypatch.context() as m:
+    with vllm_runner(
-        m.setenv("VLLM_USE_V1", "1")
+        model,
+        max_num_batched_tokens=64,
-        with vllm_runner(
+        max_model_len=4096,
-            model,
+        gpu_memory_utilization=0.7,
-            max_num_batched_tokens=64,
+        max_num_seqs=max_num_seqs,
-            max_model_len=4096,
+        tensor_parallel_size=tensor_parallel_size,
-            gpu_memory_utilization=0.7,
+    ) as vllm_model:
-            max_num_seqs=max_num_seqs,
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-            tensor_parallel_size=tensor_parallel_size,
+    output = vllm_outputs[0][1]
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    assert "1024" in output or "0, 1" in output
-        output = vllm_outputs[0][1]
-        assert "1024" in output or "0, 1" in output
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
 @pytest.mark.parametrize("params", TEST_PARAMS)
 def test_perf(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    params: TestParams,
 ) -> None:
    tokenizer = get_tokenizer(
@@ -107,48 +106,45 @@ def test_perf(
        )
    )
-    with monkeypatch.context() as m:
+    sampling_params = SamplingParams(
-        m.setenv("VLLM_USE_V1", "1")
+        max_tokens=params.decode_len, temperature=1.0, min_p=0.0
+    )
-        sampling_params = SamplingParams(
+    with vllm_runner(
-            max_tokens=params.decode_len, temperature=1.0, min_p=0.0
+        params.model,
+        max_num_batched_tokens=MAX_MODEL_LEN,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_seqs=MAX_NUM_SEQS,
+        gpu_memory_utilization=GPU_UTIL,
+        enforce_eager=False,
+        tensor_parallel_size=1,
+    ) as vllm_model:
+        print("  -- Warmup / Compile")
+        for i in range(NUM_WARMUPS):
+            _ = vllm_model.generate(prompts, sampling_params)
+        print("  -- Benchmarking... ")
+        times = []
+        for i in range(NUM_RUNS):
+            start_time = time.time()
+            _ = vllm_model.generate(prompts, sampling_params)
+            times.append(time.time() - start_time)
+        avg_time = sum(times) / len(times)
+        print("  -- avg_time = {}".format(avg_time))
+        print(
+            "  -- expected_avg_time = {} with err_tol = {}".format(
+                params.expected_avg_time, params.err_tol
+            )
        )
+        diff = avg_time - params.expected_avg_time
-        with vllm_runner(
+        ok = diff < params.err_tol
-            params.model,
+        if diff < -params.err_tol:
-            max_num_batched_tokens=MAX_MODEL_LEN,
-            max_model_len=MAX_MODEL_LEN,
-            max_num_seqs=MAX_NUM_SEQS,
-            gpu_memory_utilization=GPU_UTIL,
-            enforce_eager=False,
-            tensor_parallel_size=1,
-        ) as vllm_model:
-            print("  -- Warmup / Compile")
-            for i in range(NUM_WARMUPS):
-                _ = vllm_model.generate(prompts, sampling_params)
-            print("  -- Benchmarking... ")
-            times = []
-            for i in range(NUM_RUNS):
-                start_time = time.time()
-                _ = vllm_model.generate(prompts, sampling_params)
-                times.append(time.time() - start_time)
-            avg_time = sum(times) / len(times)
-            print("  -- avg_time = {}".format(avg_time))
            print(
-                "  -- expected_avg_time = {} with err_tol = {}".format(
+                "  !! WARNING !! Performance has improved by {}, "
-                    params.expected_avg_time, params.err_tol
+                "it may be necessary to fine-tune the "
-                )
+                "expected_avg_time = {}".format(-diff, params.expected_avg_time)
            )
-            diff = avg_time - params.expected_avg_time
-            ok = diff < params.err_tol
+        assert ok, " !! ERROR !! Regression detected"
-            if diff < -params.err_tol:
-                print(
-                    "  !! WARNING !! Performance has improved by {}, "
-                    "it may be necessary to fine-tune the "
-                    "expected_avg_time = {}".format(-diff, params.expected_avg_time)
-                )
-            assert ok, " !! ERROR !! Regression detected"
--- a/tests/v1/tracing/test_tracing.py
+++ b/tests/v1/tracing/test_tracing.py
@@ -82,7 +82,7 @@ def test_traces(
 ):
    with monkeypatch.context() as m:
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "1")
        sampling_params = SamplingParams(
            temperature=0.01,
            top_p=0.1,

--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
        logger.info("Warming up model for the compilation...")
        # Only generate graph for the generic shape
        with _set_global_compilation_settings(self.vllm_config):
-            self._dummy_run(max(16, self.max_num_reqs))
+            self._dummy_run(
+                min(
+                    max(16, self.max_num_reqs),
+                    self.scheduler_config.max_num_batched_tokens,
+                )
+            )
        logger.info("Warming up done.")
    def _init_device_properties(self) -> None: