[Misc] Replace os environ to monkeypatch in test suite (#14516)

Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com> Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz>

[Misc] Replace os environ to monkeypatch in test suite (#14516)
Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com> Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
a73e183e · Sibi · GitHub · 1e799b7e · a73e183e · a73e183e
Unverified Commit a73e183e authored Mar 17, 2025 by Sibi Committed by GitHub Mar 16, 2025
20 changed files
--- a/tests/kv_transfer/module_test.py
+++ b/tests/kv_transfer/module_test.py
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -12,11 +12,10 @@ import pytest
 from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ...utils import check_logprobs_close

-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-

 @pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
@@ -55,45 +54,47 @@ def test_models(
    backend: str,
    tensor_parallel_size: int,
    disable_async_output_proc: bool,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
    Only checks log probs match to cover the discrepancy in
    numerical sensitive kernels.
    """
-    override_backend_env_variable(monkeypatch, backend)
-
-    MAX_MODEL_LEN = 1024
-    NUM_LOG_PROBS = 8
-
-    with vllm_runner(
-            base_model,
-            max_model_len=MAX_MODEL_LEN,
-            tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
-            kv_cache_dtype="auto",
-            disable_async_output_proc=disable_async_output_proc,
-    ) as vllm_model:
-        baseline_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, NUM_LOG_PROBS)
-
-    with vllm_runner(
-            test_model,
-            max_model_len=MAX_MODEL_LEN,
-            tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
-            kv_cache_dtype=kv_cache_dtype,
-            disable_async_output_proc=disable_async_output_proc,
-    ) as vllm_model:
-        test_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, NUM_LOG_PROBS)
-
-    check_logprobs_close(
-        outputs_0_lst=baseline_outputs,
-        outputs_1_lst=test_outputs,
-        name_0="fp16_kv_cache",
-        name_1="fp8_kv_cache",
-    )
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", 'true')
+        m.setenv(STR_BACKEND_ENV_VAR, backend)
+
+        MAX_MODEL_LEN = 1024
+        NUM_LOG_PROBS = 8
+
+        with vllm_runner(
+                base_model,
+                max_model_len=MAX_MODEL_LEN,
+                tensor_parallel_size=tensor_parallel_size,
+                enforce_eager=enforce_eager,
+                kv_cache_dtype="auto",
+                disable_async_output_proc=disable_async_output_proc,
+        ) as vllm_model:
+            baseline_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS)
+
+        with vllm_runner(
+                test_model,
+                max_model_len=MAX_MODEL_LEN,
+                tensor_parallel_size=tensor_parallel_size,
+                enforce_eager=enforce_eager,
+                kv_cache_dtype=kv_cache_dtype,
+                disable_async_output_proc=disable_async_output_proc,
+        ) as vllm_model:
+            test_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS)
+
+        check_logprobs_close(
+            outputs_0_lst=baseline_outputs,
+            outputs_1_lst=test_outputs,
+            name_0="fp16_kv_cache",
+            name_1="fp8_kv_cache",
+        )


 @pytest.mark.cpu_model
@@ -119,38 +120,41 @@ def test_cpu_models(
    test_model: str,
    max_tokens: int,
    disable_async_output_proc: bool,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
    Only checks log probs match to cover the discrepancy in
    numerical sensitive kernels.
    """
-
-    MAX_MODEL_LEN = 1024
-    NUM_LOG_PROBS = 8
-
-    with vllm_runner(
-            base_model,
-            max_model_len=MAX_MODEL_LEN,
-            dtype="bfloat16",
-            kv_cache_dtype="auto",
-            disable_async_output_proc=disable_async_output_proc,
-    ) as vllm_model:
-        baseline_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, NUM_LOG_PROBS)
-
-    with vllm_runner(
-            test_model,
-            max_model_len=MAX_MODEL_LEN,
-            dtype="bfloat16",
-            kv_cache_dtype=kv_cache_dtype,
-            disable_async_output_proc=disable_async_output_proc,
-    ) as vllm_model:
-        test_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, NUM_LOG_PROBS)
-
-    check_logprobs_close(
-        outputs_0_lst=baseline_outputs,
-        outputs_1_lst=test_outputs,
-        name_0="bf16_kv_cache",
-        name_1="fp8_kv_cache",
-    )
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", 'true')
+
+        MAX_MODEL_LEN = 1024
+        NUM_LOG_PROBS = 8
+
+        with vllm_runner(
+                base_model,
+                max_model_len=MAX_MODEL_LEN,
+                dtype="bfloat16",
+                kv_cache_dtype="auto",
+                disable_async_output_proc=disable_async_output_proc,
+        ) as vllm_model:
+            baseline_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS)
+
+        with vllm_runner(
+                test_model,
+                max_model_len=MAX_MODEL_LEN,
+                dtype="bfloat16",
+                kv_cache_dtype=kv_cache_dtype,
+                disable_async_output_proc=disable_async_output_proc,
+        ) as vllm_model:
+            test_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS)
+
+        check_logprobs_close(
+            outputs_0_lst=baseline_outputs,
+            outputs_1_lst=test_outputs,
+            name_0="bf16_kv_cache",
+            name_1="fp8_kv_cache",
+        )
--- a/tests/models/embedding/language/test_gritlm.py
+++ b/tests/models/embedding/language/test_gritlm.py
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations

 import importlib.util
 import math
@@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine

 import vllm
 import vllm.config
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ....utils import RemoteOpenAIServer

@@ -29,36 +31,34 @@ def _arr(arr):
    return array("i", arr)


-def test_find_array(monkeypatch):
+def test_find_array(monkeypatch: pytest.MonkeyPatch):
    # GritLM embedding implementation is only supported by XFormers backend.
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")

-    from vllm.model_executor.models.gritlm import GritLMPooler
+        from vllm.model_executor.models.gritlm import GritLMPooler

-    # Create an LLM object to get the model config.
-    llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
-    pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
+        # Create an LLM object to get the model config.
+        llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
+        pooler = GritLMPooler(model_config=llm.llm_engine.model_config)

-    arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

-    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
-    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
-    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
-    assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
+        assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
+        assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
+        assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
+        assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1

-    with pytest.raises(ValueError):
-        pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
+        with pytest.raises(ValueError):
+            pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)


 @pytest.fixture(scope="module")
 def server_embedding():
    # GritLM embedding implementation is only supported by XFormers backend.
-    with pytest.MonkeyPatch.context() as mp:
-        mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
-
-        args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
-        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-            yield remote_server
+    args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server


 @pytest.fixture(scope="module")
@@ -69,9 +69,12 @@ def server_generate():


 @pytest_asyncio.fixture
-async def client_embedding(server_embedding: RemoteOpenAIServer):
-    async with server_embedding.get_async_client() as async_client:
-        yield async_client
+async def client_embedding(monkeypatch: pytest.MonkeyPatch,
+                           server_embedding: RemoteOpenAIServer):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+        async with server_embedding.get_async_client() as async_client:
+            yield async_client


 @pytest_asyncio.fixture
@@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer):
        yield async_client


-def run_llm_encode(llm: vllm.LLM, queries: list[str],
-                   instruction: str) -> list[float]:
+def run_llm_encode(
+    llm: vllm.LLM,
+    queries: list[str],
+    instruction: str,
+) -> list[float]:
    outputs = llm.encode([instruction + q for q in queries], )
    return [output.outputs.embedding for output in outputs]


-async def run_client_embeddings(client: vllm.LLM, queries: list[str],
-                                instruction: str) -> list[float]:
+async def run_client_embeddings(
+    client: vllm.LLM,
+    queries: list[str],
+    instruction: str,
+) -> list[float]:
    outputs = await client.embeddings.create(
        model=MODEL_NAME,
        input=[instruction + q for q in queries],
@@ -106,7 +115,7 @@ def get_test_data():
    README.md in https://github.com/ContextualAI/gritlm
    """
    q_instruction = gritlm_instruction(
-        "Given a scientific paper title, retrieve the paper's abstract")
+        "Given a scientific paper title, retrieve the paper's abstract", )
    queries = [
        "Bitcoin: A Peer-to-Peer Electronic Cash System",
        "Generative Representational Instruction Tuning",
@@ -136,31 +145,32 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
    assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)


-def test_gritlm_offline_embedding(monkeypatch):
+def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
    # GritLM embedding implementation is only supported by XFormers backend.
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")

-    queries, q_instruction, documents, d_instruction = get_test_data()
+        queries, q_instruction, documents, d_instruction = get_test_data()

-    llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
+        llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)

-    d_rep = run_llm_encode(
-        llm,
-        documents,
-        d_instruction,
-    )
-    q_rep = run_llm_encode(
-        llm,
-        queries,
-        q_instruction,
-    )
+        d_rep = run_llm_encode(
+            llm,
+            documents,
+            d_instruction,
+        )
+        q_rep = run_llm_encode(
+            llm,
+            queries,
+            q_instruction,
+        )

-    validate_embed_output(q_rep, d_rep)
+        validate_embed_output(q_rep, d_rep)


 @pytest.mark.asyncio
 async def test_gritlm_api_server_embedding(
-        client_embedding: openai.AsyncOpenAI):
+    client_embedding: openai.AsyncOpenAI, ):
    queries, q_instruction, documents, d_instruction = get_test_data()

    d_rep = await run_client_embeddings(

--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
 # SPDX-License-Identifier: Apache-2.0

-import os
-
 import pytest

 from vllm import LLM, SamplingParams
@@ -11,76 +9,92 @@ from ..utils import fork_new_process_for_each_test


 @fork_new_process_for_each_test
-def test_plugin(dummy_opt_path, monkeypatch):
+def test_plugin(
+    monkeypatch: pytest.MonkeyPatch,
+    dummy_opt_path: str,
+):
    # V1 shuts down rather than raising an error here.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
-    os.environ["VLLM_PLUGINS"] = ""
-    with pytest.raises(Exception) as excinfo:
-        LLM(model=dummy_opt_path, load_format="dummy")
-    error_msg = "has no vLLM implementation and " \
-                "the Transformers implementation is not compatible with vLLM"
-    assert (error_msg in str(excinfo.value))
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        m.setenv("VLLM_PLUGINS", "")

+        with pytest.raises(Exception) as excinfo:
+            LLM(model=dummy_opt_path, load_format="dummy")
+        error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM"  # noqa: E501
+        assert (error_msg in str(excinfo.value))

-@fork_new_process_for_each_test
-def test_oot_registration_text_generation(dummy_opt_path):
-    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
-    prompts = ["Hello, my name is", "The text does not matter"]
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model=dummy_opt_path, load_format="dummy")
-    first_token = llm.get_tokenizer().decode(0)
-    outputs = llm.generate(prompts, sampling_params)

-    for output in outputs:
-        generated_text = output.outputs[0].text
-        # make sure only the first token is generated
-        rest = generated_text.replace(first_token, "")
-        assert rest == ""
+@fork_new_process_for_each_test
+def test_oot_registration_text_generation(
+    monkeypatch: pytest.MonkeyPatch,
+    dummy_opt_path: str,
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PLUGINS", "register_dummy_model")
+        prompts = ["Hello, my name is", "The text does not matter"]
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(model=dummy_opt_path, load_format="dummy")
+        first_token = llm.get_tokenizer().decode(0)
+        outputs = llm.generate(prompts, sampling_params)
+
+        for output in outputs:
+            generated_text = output.outputs[0].text
+            # make sure only the first token is generated
+            rest = generated_text.replace(first_token, "")
+            assert rest == ""


 @fork_new_process_for_each_test
-def test_oot_registration_embedding(dummy_gemma2_embedding_path):
-    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
-    prompts = ["Hello, my name is", "The text does not matter"]
-    llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
-    outputs = llm.embed(prompts)
+def test_oot_registration_embedding(
+    monkeypatch: pytest.MonkeyPatch,
+    dummy_gemma2_embedding_path: str,
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PLUGINS", "register_dummy_model")
+        prompts = ["Hello, my name is", "The text does not matter"]
+        llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
+        outputs = llm.embed(prompts)

-    for output in outputs:
-        assert all(v == 0 for v in output.outputs.embedding)
+        for output in outputs:
+            assert all(v == 0 for v in output.outputs.embedding)


 image = ImageAsset("cherry_blossom").pil_image.convert("RGB")


 @fork_new_process_for_each_test
-def test_oot_registration_multimodal(dummy_llava_path, monkeypatch):
-    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
-    prompts = [{
-        "prompt": "What's in the image?<image>",
-        "multi_modal_data": {
-            "image": image
-        },
-    }, {
-        "prompt": "Describe the image<image>",
-        "multi_modal_data": {
-            "image": image
-        },
-    }]
-
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model=dummy_llava_path,
-              load_format="dummy",
-              max_num_seqs=1,
-              trust_remote_code=True,
-              gpu_memory_utilization=0.98,
-              max_model_len=4096,
-              enforce_eager=True,
-              limit_mm_per_prompt={"image": 1})
-    first_token = llm.get_tokenizer().decode(0)
-    outputs = llm.generate(prompts, sampling_params)
-
-    for output in outputs:
-        generated_text = output.outputs[0].text
-        # make sure only the first token is generated
-        rest = generated_text.replace(first_token, "")
-        assert rest == ""
+def test_oot_registration_multimodal(
+    monkeypatch: pytest.MonkeyPatch,
+    dummy_llava_path: str,
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PLUGINS", "register_dummy_model")
+        prompts = [{
+            "prompt": "What's in the image?<image>",
+            "multi_modal_data": {
+                "image": image
+            },
+        }, {
+            "prompt": "Describe the image<image>",
+            "multi_modal_data": {
+                "image": image
+            },
+        }]
+
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(model=dummy_llava_path,
+                  load_format="dummy",
+                  max_num_seqs=1,
+                  trust_remote_code=True,
+                  gpu_memory_utilization=0.98,
+                  max_model_len=4096,
+                  enforce_eager=True,
+                  limit_mm_per_prompt={"image": 1})
+        first_token = llm.get_tokenizer().decode(0)
+        outputs = llm.generate(prompts, sampling_params)
+
+        for output in outputs:
+            generated_text = output.outputs[0].text
+            # make sure only the first token is generated
+            rest = generated_text.replace(first_token, "")
+            assert rest == ""
--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -235,25 +235,28 @@ async def test_bad_request(tmp_socket):


 @pytest.mark.asyncio
-async def test_mp_crash_detection(monkeypatch):
+async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:

-    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
-    parser = make_arg_parser(parser)
-    args = parser.parse_args([])
+        parser = FlexibleArgumentParser(
+            description="vLLM's remote OpenAI server.")
+        parser = make_arg_parser(parser)
+        args = parser.parse_args([])

-    # When LLMEngine is loaded, it will crash.
-    def mock_init():
-        raise ValueError
+        # When LLMEngine is loaded, it will crash.
+        def mock_init():
+            raise ValueError

-    monkeypatch.setattr(LLMEngine, "__init__", mock_init)
+        m.setattr(LLMEngine, "__init__", mock_init)

-    start = time.perf_counter()
-    async with build_async_engine_client(args):
-        pass
-    end = time.perf_counter()
+        start = time.perf_counter()
+        async with build_async_engine_client(args):
+            pass
+        end = time.perf_counter()

-    assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s "
-                              "if there is an error in the startup.")
+        assert end - start < 60, (
+            "Expected vLLM to gracefully shutdown in <60s "
+            "if there is an error in the startup.")


 @pytest.mark.asyncio

--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -5,7 +5,7 @@ from typing import Optional

 import pytest

-from tests.kernels.utils import override_backend_env_variable
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_logprobs_close
 from ..utils import (completions_with_server_args, get_client_text_generations,
@@ -52,7 +52,7 @@ async def test_multi_step(
    num_logprobs: Optional[int],
    attention_backend: str,
    enable_chunked_prefill: bool,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
    client/server environment.
@@ -82,67 +82,70 @@ async def test_multi_step(
        pytest.skip("Multi-step with Chunked-Prefill only supports"
                    "PP=1 and FLASH_ATTN backend")

-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    prompts = example_prompts
-    if len(prompts) < num_prompts:
-        prompts = prompts * ((num_prompts // len(prompts)) + 1)
-    prompts = prompts[:num_prompts]
-    assert len(prompts) == num_prompts
-
-    server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
-    ms_server_args = DEFAULT_SERVER_ARGS + \
-        ["--num-scheduler-steps", f"{num_scheduler_steps}"]
-
-    if not is_async:
-        ms_server_args += ["--disable-async-output-proc"]
-
-    if eager_mode:
-        ms_server_args.append("--enforce-eager")
-
-    if enable_chunked_prefill:
-        ms_server_args.append("--enable-chunked-prefill")
-
-    distributed_args = [
-        "--tensor-parallel-size",
-        str(tp_size),
-        "--pipeline-parallel-size",
-        str(pp_size),
-    ]
-
-    # Spin up client/server & issue completion API requests.
-    # Default `max_wait_seconds` is 240 but was empirically
-    # was raised 5x to 1200 *just for this test* due to
-    # observed timeouts in GHA CI
-    ref_completions = await completions_with_server_args(
-        prompts,
-        model,
-        server_args + distributed_args,
-        num_logprobs,
-        max_wait_seconds=5 * 240)
-    test_completions = await completions_with_server_args(
-        prompts,
-        model,
-        ms_server_args + distributed_args,
-        num_logprobs,
-        max_wait_seconds=5 * 240)
-
-    # Assert multi-step scheduling produces identical tokens
-    # to single-step scheduling.
-    ref_generations = get_client_text_generations(ref_completions)
-    test_generations = get_client_text_generations(test_completions)
-    assert ref_generations == test_generations
-
-    # Assert multi-step scheduling produces nearly-identical logprobs
-    # to single-step scheduling.
-    ref_text_logprobs = get_client_text_logprob_generations(ref_completions)
-    test_text_logprobs = get_client_text_logprob_generations(test_completions)
-    check_logprobs_close(
-        outputs_0_lst=ref_text_logprobs,
-        outputs_1_lst=test_text_logprobs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        prompts = example_prompts
+        if len(prompts) < num_prompts:
+            prompts = prompts * ((num_prompts // len(prompts)) + 1)
+        prompts = prompts[:num_prompts]
+        assert len(prompts) == num_prompts
+
+        server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
+        ms_server_args = DEFAULT_SERVER_ARGS + \
+            ["--num-scheduler-steps", f"{num_scheduler_steps}"]
+
+        if not is_async:
+            ms_server_args += ["--disable-async-output-proc"]
+
+        if eager_mode:
+            ms_server_args.append("--enforce-eager")
+
+        if enable_chunked_prefill:
+            ms_server_args.append("--enable-chunked-prefill")
+
+        distributed_args = [
+            "--tensor-parallel-size",
+            str(tp_size),
+            "--pipeline-parallel-size",
+            str(pp_size),
+        ]
+
+        # Spin up client/server & issue completion API requests.
+        # Default `max_wait_seconds` is 240 but was empirically
+        # was raised 5x to 1200 *just for this test* due to
+        # observed timeouts in GHA CI
+        ref_completions = await completions_with_server_args(
+            prompts,
+            model,
+            server_args + distributed_args,
+            num_logprobs,
+            max_wait_seconds=5 * 240)
+        test_completions = await completions_with_server_args(
+            prompts,
+            model,
+            ms_server_args + distributed_args,
+            num_logprobs,
+            max_wait_seconds=5 * 240)
+
+        # Assert multi-step scheduling produces identical tokens
+        # to single-step scheduling.
+        ref_generations = get_client_text_generations(ref_completions)
+        test_generations = get_client_text_generations(test_completions)
+        assert ref_generations == test_generations
+
+        # Assert multi-step scheduling produces nearly-identical logprobs
+        # to single-step scheduling.
+        ref_text_logprobs = get_client_text_logprob_generations(
+            ref_completions)
+        test_text_logprobs = get_client_text_logprob_generations(
+            test_completions)
+        check_logprobs_close(
+            outputs_0_lst=ref_text_logprobs,
+            outputs_1_lst=test_text_logprobs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @pytest.mark.parametrize(("tp_size, pp_size"), [
@@ -152,7 +155,7 @@ async def test_multi_step(
 async def test_multi_step_pp_smoke(
    tp_size: int,
    pp_size: int,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
    Smoke test for the vLLM engine with multi-step scheduling in an
@@ -174,54 +177,55 @@ async def test_multi_step_pp_smoke(
    attention_backend = "FLASH_ATTN"
    max_num_seqs = 3

-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    # Prompt from the ShareGPT dataset
-    prompts = [
-        "in the jtbd context whats a push?",  # codespell:ignore
-        "in the jtbd context whats a push?",  # codespell:ignore
-        "in the jtbd context whats a push?",  # codespell:ignore
-        "in the jtbd context whats a push?",  # codespell:ignore
-    ]
-    # Use varying max_tokens to introduce scheduling randomness.
-    max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
-    assert len(prompts) == len(max_tokens)
-
-    test_args = [
-        "--tensor-parallel-size",
-        str(tp_size), "--pipeline-parallel-size",
-        str(pp_size), "--max-num-seqs",
-        str(max_num_seqs)
-    ]
-
-    server_args = DEFAULT_SERVER_ARGS + test_args
-    ms_server_args = DEFAULT_SERVER_ARGS + \
-       ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
-       test_args
-
-    # Spin up client/server & issue completion API requests.
-    # Default `max_wait_seconds` is 240 but was empirically
-    # was raised 3x to 720 *just for this test* due to
-    # observed timeouts in GHA CI
-    ref_completions = await completions_with_server_args(
-        prompts=prompts,
-        model_name=model,
-        server_cli_args=server_args,
-        num_logprobs=None,
-        max_wait_seconds=5 * 240,
-        max_tokens=max_tokens)
-
-    test_completions = await completions_with_server_args(
-        prompts=prompts,
-        model_name=model,
-        server_cli_args=ms_server_args,
-        num_logprobs=None,
-        max_wait_seconds=5 * 240,
-        max_tokens=max_tokens)
-
-    # Assert multi-step scheduling produces identical tokens
-    # to single-step scheduling.
-    ref_generations = get_client_text_generations(ref_completions)
-    test_generations = get_client_text_generations(test_completions)
-
-    assert ref_generations == test_generations
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        # Prompt from the ShareGPT dataset
+        prompts = [
+            "in the jtbd context whats a push?",  # codespell:ignore
+            "in the jtbd context whats a push?",  # codespell:ignore
+            "in the jtbd context whats a push?",  # codespell:ignore
+            "in the jtbd context whats a push?",  # codespell:ignore
+        ]
+        # Use varying max_tokens to introduce scheduling randomness.
+        max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
+        assert len(prompts) == len(max_tokens)
+
+        test_args = [
+            "--tensor-parallel-size",
+            str(tp_size), "--pipeline-parallel-size",
+            str(pp_size), "--max-num-seqs",
+            str(max_num_seqs)
+        ]
+
+        server_args = DEFAULT_SERVER_ARGS + test_args
+        ms_server_args = DEFAULT_SERVER_ARGS + \
+          ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
+          test_args
+
+        # Spin up client/server & issue completion API requests.
+        # Default `max_wait_seconds` is 240 but was empirically
+        # was raised 3x to 720 *just for this test* due to
+        # observed timeouts in GHA CI
+        ref_completions = await completions_with_server_args(
+            prompts=prompts,
+            model_name=model,
+            server_cli_args=server_args,
+            num_logprobs=None,
+            max_wait_seconds=5 * 240,
+            max_tokens=max_tokens)
+
+        test_completions = await completions_with_server_args(
+            prompts=prompts,
+            model_name=model,
+            server_cli_args=ms_server_args,
+            num_logprobs=None,
+            max_wait_seconds=5 * 240,
+            max_tokens=max_tokens)
+
+        # Assert multi-step scheduling produces identical tokens
+        # to single-step scheduling.
+        ref_generations = get_client_text_generations(ref_completions)
+        test_generations = get_client_text_generations(test_completions)
+
+        assert ref_generations == test_generations
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -7,7 +7,7 @@ from typing import Optional

 import pytest

-from tests.kernels.utils import override_backend_env_variable
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_logprobs_close, check_outputs_equal

@@ -42,7 +42,7 @@ def test_multi_step_llm(
    num_prompts: int,
    num_logprobs: Optional[int],
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test vLLM engine with multi-step scheduling via sync LLM Engine.

@@ -70,48 +70,49 @@ def test_multi_step_llm(
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
                    completions endpoint; `None` -> 1 logprob returned.
    """
-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    prompts = example_prompts
-    if len(prompts) < num_prompts:
-        prompts = prompts * ((num_prompts // len(prompts)) + 1)
-    prompts = prompts[:num_prompts]
-    assert len(prompts) == num_prompts
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            enable_chunked_prefill=enable_chunked_prefill,
-            num_scheduler_steps=num_scheduler_steps,
-    ) as vllm_model:
-        vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
-                        if num_logprobs is None else
-                        vllm_model.generate_greedy_logprobs(
-                            prompts, max_tokens, num_logprobs))
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
-                      if num_logprobs is None else
-                      hf_model.generate_greedy_logprobs_limit(
-                          prompts, max_tokens, num_logprobs))
-
-    if num_logprobs is None:
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        prompts = example_prompts
+        if len(prompts) < num_prompts:
+            prompts = prompts * ((num_prompts // len(prompts)) + 1)
+        prompts = prompts[:num_prompts]
+        assert len(prompts) == num_prompts
+
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                enable_chunked_prefill=enable_chunked_prefill,
+                num_scheduler_steps=num_scheduler_steps,
+        ) as vllm_model:
+            vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
+                            if num_logprobs is None else
+                            vllm_model.generate_greedy_logprobs(
+                                prompts, max_tokens, num_logprobs))
+
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
+                          if num_logprobs is None else
+                          hf_model.generate_greedy_logprobs_limit(
+                              prompts, max_tokens, num_logprobs))
+
+        if num_logprobs is None:
+            check_outputs_equal(
+                outputs_0_lst=hf_outputs,
+                outputs_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+            )
+        else:
+            check_logprobs_close(
+                outputs_0_lst=hf_outputs,
+                outputs_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+            )


 @pytest.mark.parametrize("model", MODELS)
@@ -136,7 +137,7 @@ def test_multi_step_llm_w_prompt_logprobs(
    num_logprobs: Optional[int],
    num_prompt_logprobs: Optional[int],
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test prompt logprobs with multi-step scheduling via sync LLM Engine.

@@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs(
                           note that this argument is not supported by the
                           OpenAI completions endpoint.
    """
-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    prompts = example_prompts
-    if len(prompts) < num_prompts:
-        prompts = prompts * ((num_prompts // len(prompts)) + 1)
-    prompts = prompts[:num_prompts]
-    assert len(prompts) == num_prompts
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            num_scheduler_steps=num_scheduler_steps,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            prompts,
-            max_tokens,
-            num_logprobs,
-            num_prompt_logprobs=num_prompt_logprobs)
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-    ) as vllm_model:
-        single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
-            prompts,
-            max_tokens,
-            num_logprobs,
-            num_prompt_logprobs=num_prompt_logprobs)
-
-    check_logprobs_close(
-        outputs_0_lst=single_step_vllm_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        prompts = example_prompts
+        if len(prompts) < num_prompts:
+            prompts = prompts * ((num_prompts // len(prompts)) + 1)
+        prompts = prompts[:num_prompts]
+        assert len(prompts) == num_prompts
+
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                num_scheduler_steps=num_scheduler_steps,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs,
+                num_prompt_logprobs=num_prompt_logprobs)
+
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+        ) as vllm_model:
+            single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs,
+                num_prompt_logprobs=num_prompt_logprobs)
+
+        check_logprobs_close(
+            outputs_0_lst=single_step_vllm_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @pytest.mark.parametrize("model", MODELS)
@@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
    num_prompts: int,
    num_logprobs: Optional[int],
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.

@@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
    #
    # The Incorrect scheduling behavior - if it occurs - will cause an exception
    # in the model runner resulting from `do_sample=False`.
-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    assert len(example_prompts) >= 2
-    challenge_prompts = copy.deepcopy(example_prompts)
-    challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
-                            'inference and serving engine for LLMs.\n'
-                            )  # 24 tok
-    challenge_prompts[1] = (
-        'Briefly describe the major milestones in the '
-        'development of artificial intelligence from 1950 to 2020.\n'
-    )  # 30 tok
-
-    # If necessary, adjust the length of `challenge_prompts` to match
-    # `num_prompts`
-    if len(challenge_prompts) < num_prompts:
-        challenge_prompts = (challenge_prompts *
-                             ((num_prompts // len(challenge_prompts)) + 1))
-    challenge_prompts = challenge_prompts[:num_prompts]
-    assert len(challenge_prompts) == num_prompts
-
-    # Single-step scheduler baseline
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            num_scheduler_steps=num_scheduler_steps,
-            max_model_len=48,
-            max_num_batched_tokens=48,
-            max_num_seqs=4,
-            block_size=16,
-    ) as vllm_model:
-        outputs_baseline = (vllm_model.generate_greedy(
-            challenge_prompts, max_tokens) if num_logprobs is None else
-                            vllm_model.generate_greedy_logprobs(
-                                challenge_prompts, max_tokens, num_logprobs))
-
-    # multi-step+"single-step chunked prefill"+APC
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            enable_chunked_prefill=True,
-            enable_prefix_caching=True,
-            num_scheduler_steps=num_scheduler_steps,
-            max_model_len=48,
-            max_num_batched_tokens=48,
-            max_num_seqs=4,
-            block_size=16,
-    ) as vllm_model:
-        outputs_w_features = (vllm_model.generate_greedy(
-            challenge_prompts, max_tokens) if num_logprobs is None else
-                              vllm_model.generate_greedy_logprobs(
-                                  challenge_prompts, max_tokens, num_logprobs))
-
-    if num_logprobs is None:
-        # No-logprobs test
-        check_outputs_equal(
-            outputs_0_lst=outputs_baseline,
-            outputs_1_lst=outputs_w_features,
-            name_0="multi-step",
-            name_1="multi-step+features",
-        )
-    else:
-        # Yes-logprobs test
-        check_logprobs_close(
-            outputs_0_lst=outputs_baseline,
-            outputs_1_lst=outputs_w_features,
-            name_0="multi-step",
-            name_1="multi-step+features",
-        )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        assert len(example_prompts) >= 2
+        challenge_prompts = copy.deepcopy(example_prompts)
+        challenge_prompts[0] = (
+            'vLLM is a high-throughput and memory-efficient '
+            'inference and serving engine for LLMs.\n')  # 24 tok
+        challenge_prompts[1] = (
+            'Briefly describe the major milestones in the '
+            'development of artificial intelligence from 1950 to 2020.\n'
+        )  # 30 tok
+
+        # If necessary, adjust the length of `challenge_prompts` to match
+        # `num_prompts`
+        if len(challenge_prompts) < num_prompts:
+            challenge_prompts = (challenge_prompts *
+                                 ((num_prompts // len(challenge_prompts)) + 1))
+        challenge_prompts = challenge_prompts[:num_prompts]
+        assert len(challenge_prompts) == num_prompts
+
+        # Single-step scheduler baseline
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                num_scheduler_steps=num_scheduler_steps,
+                max_model_len=48,
+                max_num_batched_tokens=48,
+                max_num_seqs=4,
+                block_size=16,
+        ) as vllm_model:
+            outputs_baseline = (
+                vllm_model.generate_greedy(challenge_prompts, max_tokens) if
+                num_logprobs is None else vllm_model.generate_greedy_logprobs(
+                    challenge_prompts, max_tokens, num_logprobs))
+
+        # multi-step+"single-step chunked prefill"+APC
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                enable_chunked_prefill=True,
+                enable_prefix_caching=True,
+                num_scheduler_steps=num_scheduler_steps,
+                max_model_len=48,
+                max_num_batched_tokens=48,
+                max_num_seqs=4,
+                block_size=16,
+        ) as vllm_model:
+            outputs_w_features = (
+                vllm_model.generate_greedy(challenge_prompts, max_tokens) if
+                num_logprobs is None else vllm_model.generate_greedy_logprobs(
+                    challenge_prompts, max_tokens, num_logprobs))
+
+        if num_logprobs is None:
+            # No-logprobs test
+            check_outputs_equal(
+                outputs_0_lst=outputs_baseline,
+                outputs_1_lst=outputs_w_features,
+                name_0="multi-step",
+                name_1="multi-step+features",
+            )
+        else:
+            # Yes-logprobs test
+            check_logprobs_close(
+                outputs_0_lst=outputs_baseline,
+                outputs_1_lst=outputs_w_features,
+                name_0="multi-step",
+                name_1="multi-step+features",
+            )
--- a/tests/neuron/1_core/test_block_table.py
+++ b/tests/neuron/1_core/test_block_table.py
 # SPDX-License-Identifier: Apache-2.0
-import os

 import neuronxcc.nki.language as nl
 import pytest
@@ -99,6 +98,7 @@ def ref_block_tables_transform(
 )
 @torch.inference_mode()
 def test_load_and_transform_block_tables(
+    monkeypatch: pytest.MonkeyPatch,
    num_tiles,
    num_blocks_per_tile,
    q_head_per_kv_head,
@@ -108,46 +108,46 @@ def test_load_and_transform_block_tables(

    device = xm.xla_device()

-    compiler_flags = [
+    compiler_flags_str = " ".join([
        "-O1",
        "--retry_failed_compilation",
-    ]
-    compiler_flags_str = " ".join(compiler_flags)
-    os.environ["NEURON_CC_FLAGS"] = compiler_flags_str
-
-    torch.manual_seed(10000)
-    torch.set_printoptions(sci_mode=False)
-
-    # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
-    B_P_SIZE = 128
-    if num_blocks_per_tile < B_P_SIZE:
-        assert B_P_SIZE % num_blocks_per_tile == 0
-        block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
-    else:
-        block_size_tiling_factor = 1
-    max_num_blocks = 100000
-    block_tables = torch.randint(
-        0,
-        max_num_blocks,
-        (num_tiles * num_blocks_per_tile, ),
-        dtype=torch.int32,
-    )
-    nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
-        block_tables.to(device=device),
-        num_tiles,
-        num_blocks_per_tile,
-        q_head_per_kv_head,
-        head_id,
-        block_size_tiling_factor,
-    ).cpu()
-    ref_out = ref_block_tables_transform(
-        block_tables,
-        num_tiles,
-        num_blocks_per_tile,
-        q_head_per_kv_head,
-        head_id,
-        block_size_tiling_factor,
-    )
-    assert (nki_out.shape == ref_out.shape
-            ), f"{nki_out.shape=} != {ref_out.shape=}"
-    assert torch.all(nki_out == ref_out)
+    ])
+    with monkeypatch.context() as m:
+        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
+
+        torch.manual_seed(10000)
+        torch.set_printoptions(sci_mode=False)
+
+        # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
+        B_P_SIZE = 128
+        if num_blocks_per_tile < B_P_SIZE:
+            assert B_P_SIZE % num_blocks_per_tile == 0
+            block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
+        else:
+            block_size_tiling_factor = 1
+        max_num_blocks = 100000
+        block_tables = torch.randint(
+            0,
+            max_num_blocks,
+            (num_tiles * num_blocks_per_tile, ),
+            dtype=torch.int32,
+        )
+        nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
+            block_tables.to(device=device),
+            num_tiles,
+            num_blocks_per_tile,
+            q_head_per_kv_head,
+            head_id,
+            block_size_tiling_factor,
+        ).cpu()
+        ref_out = ref_block_tables_transform(
+            block_tables,
+            num_tiles,
+            num_blocks_per_tile,
+            q_head_per_kv_head,
+            head_id,
+            block_size_tiling_factor,
+        )
+        assert (nki_out.shape == ref_out.shape
+                ), f"{nki_out.shape=} != {ref_out.shape=}"
+        assert torch.all(nki_out == ref_out)
--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ b/tests/neuron/1_core/test_prefix_prefill.py
@@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
    ])
 @torch.inference_mode()
 def test_contexted_kv_attention(
+    monkeypatch: pytest.MonkeyPatch,
    prefill_batch_size: int,
    decode_batch_size: int,
    num_heads: int,
@@ -329,7 +330,6 @@ def test_contexted_kv_attention(
    large_tile_size,
    mixed_precision: bool,
 ) -> None:
-    import os

    import torch_xla.core.xla_model as xm

@@ -340,174 +340,178 @@ def test_contexted_kv_attention(

    device = xm.xla_device()

-    compiler_flags = [
+    compiler_flags_str = " ".join([
        "-O1",
        "--retry_failed_compilation",
-    ]
-    compiler_flags_str = " ".join(compiler_flags)
-    os.environ["NEURON_CC_FLAGS"] = compiler_flags_str
-
-    torch.manual_seed(0)
-    torch.set_printoptions(sci_mode=False)
-    torch.set_default_device("cpu")
-    dtype = torch.float32
-
-    min_ctx_len = 32
-    max_ctx_len = 1024
-    min_query_len = 16
-    max_query_len = 512
-    num_kv_heads = num_heads // num_queries_per_kv
-    (
-        query,
-        k_active,
-        v_active,
-        k_cache,
-        v_cache,
-        block_table,
-        key,
-        value,
-        query_lens,
-        seq_lens,
-    ) = sample_inputs(
-        prefill_batch_size=prefill_batch_size,
-        decode_batch_size=decode_batch_size,
-        min_query_len=min_query_len,
-        max_query_len=max_query_len,
-        min_ctx_len=min_ctx_len,
-        max_ctx_len=max_ctx_len,
-        block_size=block_size,
-        num_heads=num_heads,
-        num_kv_heads=num_kv_heads,
-        head_size=head_size,
-        dtype=dtype,
-    )
+    ])
+    with monkeypatch.context() as m:
+        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
+
+        torch.manual_seed(0)
+        torch.set_printoptions(sci_mode=False)
+        torch.set_default_device("cpu")
+        dtype = torch.float32
+
+        min_ctx_len = 32
+        max_ctx_len = 1024
+        min_query_len = 16
+        max_query_len = 512
+        num_kv_heads = num_heads // num_queries_per_kv
+        (
+            query,
+            k_active,
+            v_active,
+            k_cache,
+            v_cache,
+            block_table,
+            key,
+            value,
+            query_lens,
+            seq_lens,
+        ) = sample_inputs(
+            prefill_batch_size=prefill_batch_size,
+            decode_batch_size=decode_batch_size,
+            min_query_len=min_query_len,
+            max_query_len=max_query_len,
+            min_ctx_len=min_ctx_len,
+            max_ctx_len=max_ctx_len,
+            block_size=block_size,
+            num_heads=num_heads,
+            num_kv_heads=num_kv_heads,
+            head_size=head_size,
+            dtype=dtype,
+        )

-    output_ref = ref_context_attention(
-        query,
-        key,
-        value,
-        query_lens,
-        seq_lens,
-        head_size,
-        num_queries_per_kv,
-        return_max_reduce=False,
-    )
+        output_ref = ref_context_attention(
+            query,
+            key,
+            value,
+            query_lens,
+            seq_lens,
+            head_size,
+            num_queries_per_kv,
+            return_max_reduce=False,
+        )

-    # build neuron program
-    B_P_SIZE = 128
-    assert (large_tile_size >= B_P_SIZE
-            ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
-
-    def ceil_div(a, b):
-        return (a + b - 1) // b
-
-    def pad_to_multiple(a, b):
-        return ceil_div(a, b) * b
-
-    def pad_to_next_power_of_2(a):
-        assert a > 0
-        return 2**int(a - 1).bit_length()
-
-    # calculate input shapes
-    max_num_queries = pad_to_next_power_of_2(sum(query_lens))
-    context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
-    num_active_blocks = ceil_div(context_lens, block_size).sum().item()
-    num_active_blocks = pad_to_multiple(num_active_blocks,
-                                        large_tile_size // block_size)
-    context_kv_len = num_active_blocks * block_size
-    assert (context_kv_len %
-            large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
+        # build neuron program
+        B_P_SIZE = 128
+        assert (large_tile_size >= B_P_SIZE
+                ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"

-    # pad QKV tensors
-    pad_dims = (
-        0,
-        0,
-        0,
-        0,
-        0,
-        max_num_queries - query.shape[0],
-    )
-    query = F.pad(query, pad_dims, "constant", 0)
-    k = F.pad(k_active, pad_dims, "constant", 0)
-    v = F.pad(v_active, pad_dims, "constant", 0)
-
-    # permute QKV tensors
-    # query: (1, n_heads, d, seq_q)
-    # key:   (1, n_kv_heads, d, seq_k)
-    # value: (1, n_kv_heads, seq_v, d)
-    query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
-    k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
-    v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
-    k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
-    v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
-
-    # transform block table
-    active_block_table = get_active_block_tables(
-        block_table.cpu(),
-        torch.tensor(query_lens).cpu(),
-        torch.tensor(seq_lens).cpu(),
-        block_size,
-        num_active_blocks,
-    )
+        def ceil_div(a, b):
+            return (a + b - 1) // b

-    # Build attention masks
-    prior_mask, active_mask = (
-        BlockDiagonalCausalFromBottomRightMask.from_seqlens(
-            query_lens, seq_lens, block_size=block_size))
-    prior_mask_padded = F.pad(
-        prior_mask,
-        (
+        def pad_to_multiple(a, b):
+            return ceil_div(a, b) * b
+
+        def pad_to_next_power_of_2(a):
+            assert a > 0
+            return 2**int(a - 1).bit_length()
+
+        # calculate input shapes
+        max_num_queries = pad_to_next_power_of_2(sum(query_lens))
+        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
+        num_active_blocks = ceil_div(context_lens, block_size).sum().item()
+        num_active_blocks = pad_to_multiple(num_active_blocks,
+                                            large_tile_size // block_size)
+        context_kv_len = num_active_blocks * block_size
+        assert (
+            context_kv_len %
+            large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
+
+        # pad QKV tensors
+        pad_dims = (
            0,
-            context_kv_len - prior_mask.shape[1],
            0,
-            max_num_queries - prior_mask.shape[0],
-        ),
-        "constant",
-        0,
-    ).bool()
-    active_mask_padded = F.pad(
-        active_mask,
-        (
            0,
-            max_num_queries - active_mask.shape[1],
            0,
-            max_num_queries - active_mask.shape[0],
-        ),
-        "constant",
-        0,
-    ).bool()
-    attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1)
-
-    attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size)
-
-    input_args = (
-        query.to(device=device),
-        k.to(device=device),
-        v.to(device=device),
-        k_cache.to(device=device),
-        v_cache.to(device=device),
-        active_block_table.to(device=device),
-        attn_mask.to(device=device),
-    )
-    input_kwargs = dict(
-        n_kv_head=num_kv_heads,
-        head_size=head_size,
-        mixed_precision=mixed_precision,
-        LARGE_TILE_SZ=large_tile_size,
-    )
+            0,
+            max_num_queries - query.shape[0],
+        )
+        query = F.pad(query, pad_dims, "constant", 0)
+        k = F.pad(k_active, pad_dims, "constant", 0)
+        v = F.pad(v_active, pad_dims, "constant", 0)
+
+        # permute QKV tensors
+        # query: (1, n_heads, d, seq_q)
+        # key:   (1, n_kv_heads, d, seq_k)
+        # value: (1, n_kv_heads, seq_v, d)
+        query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
+        k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
+        v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
+        k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
+        v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
+
+        # transform block table
+        active_block_table = get_active_block_tables(
+            block_table.cpu(),
+            torch.tensor(query_lens).cpu(),
+            torch.tensor(seq_lens).cpu(),
+            block_size,
+            num_active_blocks,
+        )

-    output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
+        # Build attention masks
+        prior_mask, active_mask = (
+            BlockDiagonalCausalFromBottomRightMask.from_seqlens(
+                query_lens, seq_lens, block_size=block_size))
+        prior_mask_padded = F.pad(
+            prior_mask,
+            (
+                0,
+                context_kv_len - prior_mask.shape[1],
+                0,
+                max_num_queries - prior_mask.shape[0],
+            ),
+            "constant",
+            0,
+        ).bool()
+        active_mask_padded = F.pad(
+            active_mask,
+            (
+                0,
+                max_num_queries - active_mask.shape[1],
+                0,
+                max_num_queries - active_mask.shape[0],
+            ),
+            "constant",
+            0,
+        ).bool()
+        attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
+                                 dim=1)
+
+        attn_mask = reorder_context_mask(attn_mask, large_tile_size,
+                                         block_size)
+
+        input_args = (
+            query.to(device=device),
+            k.to(device=device),
+            v.to(device=device),
+            k_cache.to(device=device),
+            v_cache.to(device=device),
+            active_block_table.to(device=device),
+            attn_mask.to(device=device),
+        )
+        input_kwargs = dict(
+            n_kv_head=num_kv_heads,
+            head_size=head_size,
+            mixed_precision=mixed_precision,
+            LARGE_TILE_SZ=large_tile_size,
+        )

-    num_actual_tokens = sum(query_lens)
-    # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
-    output_nki = output_nki.cpu().permute(0, 2, 1, 3)
-    output_nki = output_nki[0, :num_actual_tokens, :, :]
-    output_ref_padded = F.pad(
-        output_ref,
-        (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
-        "constant",
-        0,
-    )
-    output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :]
+        output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
+
+        num_actual_tokens = sum(query_lens)
+        # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
+        output_nki = output_nki.cpu().permute(0, 2, 1, 3)
+        output_nki = output_nki[0, :num_actual_tokens, :, :]
+        output_ref_padded = F.pad(
+            output_ref,
+            (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
+            "constant",
+            0,
+        )
+        output_ref = output_ref_padded.transpose(
+            0, 1)[0, :num_actual_tokens, :, :]

-    torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
+        torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
 # SPDX-License-Identifier: Apache-2.0

+import pytest
 import torch

-from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import get_attn_backend
-from vllm.utils import STR_INVALID_VAL
+from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL


 def test_platform_plugins():
@@ -25,8 +25,9 @@ def test_platform_plugins():
        f" is loaded. The first import:\n{_init_trace}")


-def test_oot_attention_backend(monkeypatch):
+def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
    # ignore the backend env variable if it is set
-    override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
-    backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
-    assert backend.get_name() == "Dummy_Backend"
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
+        backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
+        assert backend.get_name() == "Dummy_Backend"
--- a/tests/plugins_tests/test_scheduler_plugins.py
+++ b/tests/plugins_tests/test_scheduler_plugins.py
@@ -22,43 +22,47 @@ class DummyV1Scheduler(V1Scheduler):
        raise Exception("Exception raised by DummyV1Scheduler")


-def test_scheduler_plugins_v0(monkeypatch):
-    monkeypatch.setenv("VLLM_USE_V1", "0")
-    with pytest.raises(Exception) as exception_info:
+def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        with pytest.raises(Exception) as exception_info:

-        engine_args = EngineArgs(
-            model="facebook/opt-125m",
-            enforce_eager=True,  # reduce test time
-            scheduler_cls=DummyV0Scheduler,
-        )
+            engine_args = EngineArgs(
+                model="facebook/opt-125m",
+                enforce_eager=True,  # reduce test time
+                scheduler_cls=DummyV0Scheduler,
+            )

-        engine = LLMEngine.from_engine_args(engine_args=engine_args)
+            engine = LLMEngine.from_engine_args(engine_args=engine_args)

-        sampling_params = SamplingParams(max_tokens=1)
-        engine.add_request("0", "foo", sampling_params)
-        engine.step()
+            sampling_params = SamplingParams(max_tokens=1)
+            engine.add_request("0", "foo", sampling_params)
+            engine.step()

-    assert str(exception_info.value) == "Exception raised by DummyV0Scheduler"
+        assert str(
+            exception_info.value) == "Exception raised by DummyV0Scheduler"


-def test_scheduler_plugins_v1(monkeypatch):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    # Explicitly turn off engine multiprocessing so that the scheduler runs in
-    # this process
-    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        # Explicitly turn off engine multiprocessing so
+        # that the scheduler runs in this process
+        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

-    with pytest.raises(Exception) as exception_info:
+        with pytest.raises(Exception) as exception_info:

-        engine_args = EngineArgs(
-            model="facebook/opt-125m",
-            enforce_eager=True,  # reduce test time
-            scheduler_cls=DummyV1Scheduler,
-        )
+            engine_args = EngineArgs(
+                model="facebook/opt-125m",
+                enforce_eager=True,  # reduce test time
+                scheduler_cls=DummyV1Scheduler,
+            )

-        engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
+            engine = V1LLMEngine.from_engine_args(engine_args=engine_args)

-        sampling_params = SamplingParams(max_tokens=1)
-        engine.add_request("0", "foo", sampling_params)
-        engine.step()
+            sampling_params = SamplingParams(max_tokens=1)
+            engine.add_request("0", "foo", sampling_params)
+            engine.step()

-    assert str(exception_info.value) == "Exception raised by DummyV1Scheduler"
+        assert str(
+            exception_info.value) == "Exception raised by DummyV1Scheduler"
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -4,25 +4,29 @@
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 """

+from __future__ import annotations
+
 import pytest

 from tests.conftest import VllmRunner
 from tests.core.utils import SchedulerProxy, create_dummy_prompt
-from tests.kernels.utils import override_backend_env_variable
 from vllm import SamplingParams, TokensPrompt
 from vllm.core.scheduler import Scheduler
 from vllm.engine.llm_engine import LLMEngine
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_outputs_equal


 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """
    This module relies on V0 internals, so set VLLM_USE_V1=0.
    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield


 MODELS = [
@@ -56,7 +60,7 @@ def test_mixed_requests(
    cached_position: int,
    enable_chunked_prefill: bool,
    block_size: int,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
    Test the case when some sequences have the prefix cache hit
@@ -67,72 +71,77 @@ def test_mixed_requests(
        pytest.skip("Flashinfer does not support ROCm/HIP.")
    if backend == "XFORMERS" and current_platform.is_rocm():
        pytest.skip("Xformers does not support ROCm/HIP.")
-    override_backend_env_variable(monkeypatch, backend)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    cached_prompt = example_prompts[cached_position]
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enable_prefix_caching=True,
-            enable_chunked_prefill=enable_chunked_prefill,
-            block_size=block_size,
-    ) as vllm_model:
-        # Run the first prompt so the cache is populated
-        vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)
-
-        # Run all the promopts
-        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        req_outputs = vllm_model.model.generate(example_prompts, greedy_params)
-
-        # Verify number of cached tokens
-        for i in range(len(req_outputs)):
-            if i == cached_position:
-                expected_num_cached_tokens = (
-                    len(req_outputs[i].prompt_token_ids) //
-                    block_size) * block_size
-            else:
-                expected_num_cached_tokens = 0
-            assert (
-                req_outputs[i].num_cached_tokens == expected_num_cached_tokens)
-
-        vllm_outputs = [(
-            output.prompt_token_ids + list(output.outputs[0].token_ids),
-            output.prompt + output.outputs[0].text,
-        ) for output in req_outputs]
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, backend)
+
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+        cached_prompt = example_prompts[cached_position]
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enable_prefix_caching=True,
+                enable_chunked_prefill=enable_chunked_prefill,
+                block_size=block_size,
+        ) as vllm_model:
+            # Run the first prompt so the cache is populated
+            vllm_outputs = vllm_model.generate_greedy([cached_prompt],
+                                                      max_tokens)
+
+            # Run all the promopts
+            greedy_params = SamplingParams(temperature=0.0,
+                                           max_tokens=max_tokens)
+            req_outputs = vllm_model.model.generate(example_prompts,
+                                                    greedy_params)
+
+            # Verify number of cached tokens
+            for i in range(len(req_outputs)):
+                if i == cached_position:
+                    expected_num_cached_tokens = (
+                        len(req_outputs[i].prompt_token_ids) //
+                        block_size) * block_size
+                else:
+                    expected_num_cached_tokens = 0
+                assert (req_outputs[i].num_cached_tokens ==
+                        expected_num_cached_tokens)
+
+            vllm_outputs = [(
+                output.prompt_token_ids + list(output.outputs[0].token_ids),
+                output.prompt + output.outputs[0].text,
+            ) for output in req_outputs]
+
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
 def test_unstable_prompt_sequence(
    vllm_runner,
    backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:

    if backend == "FLASHINFER" and current_platform.is_rocm():
        pytest.skip("Flashinfer does not support ROCm/HIP.")
    if backend == "XFORMERS" and current_platform.is_rocm():
        pytest.skip("Xformers does not support ROCm/HIP.")
-    override_backend_env_variable(monkeypatch, backend)
-
-    with vllm_runner(
-            "Qwen/Qwen2.5-0.5B-Instruct",
-            enable_chunked_prefill=True,
-            enable_prefix_caching=True,
-            max_model_len=4096,
-    ) as vllm_model:
-        for prompt in UNSTABLE_PROMPT_SEQUENCE:
-            vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
-                                SamplingParams(max_tokens=1))
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, backend)
+
+        with vllm_runner(
+                "Qwen/Qwen2.5-0.5B-Instruct",
+                enable_chunked_prefill=True,
+                enable_prefix_caching=True,
+                max_model_len=4096,
+        ) as vllm_model:
+            for prompt in UNSTABLE_PROMPT_SEQUENCE:
+                vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
+                                    SamplingParams(max_tokens=1))


 @pytest.mark.parametrize("model", MODELS)

--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -56,12 +56,11 @@ def test_gc():
    assert allocated < 50 * 1024 * 1024


-def test_model_from_modelscope(monkeypatch):
+def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
-    MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat"
-    monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True")
-    try:
-        llm = LLM(model=MODELSCOPE_MODEL_NAME)
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_MODELSCOPE", "True")
+        llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")

        prompts = [
            "Hello, my name is",
@@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch):

        outputs = llm.generate(prompts, sampling_params)
        assert len(outputs) == 4
-    finally:
-        monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False)
-
-
-if __name__ == "__main__":
-    import pytest
-    pytest.main([__file__])
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
 # SPDX-License-Identifier: Apache-2.0
+# ruff: noqa

 import asyncio
-import os
 import socket
 from collections.abc import AsyncIterator
 from unittest.mock import patch
@@ -112,16 +112,16 @@ def test_deprecate_kwargs_additional_message():
        dummy(old_arg=1)


-def test_get_open_port():
-    os.environ["VLLM_PORT"] = "5678"
-    # make sure we can get multiple ports, even if the env var is set
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
-        s1.bind(("localhost", get_open_port()))
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
-            s2.bind(("localhost", get_open_port()))
-            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
-                s3.bind(("localhost", get_open_port()))
-    os.environ.pop("VLLM_PORT")
+def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PORT", "5678")
+        # make sure we can get multiple ports, even if the env var is set
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
+            s1.bind(("localhost", get_open_port()))
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
+                s2.bind(("localhost", get_open_port()))
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
+                    s3.bind(("localhost", get_open_port()))


 # Tests for FlexibleArgumentParser
@@ -366,31 +366,32 @@ def test_bind_kv_cache_non_attention():
    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]


-def test_bind_kv_cache_encoder_decoder(monkeypatch):
+def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
    # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")

-    from vllm.attention import Attention, AttentionType
+        from vllm.attention import Attention, AttentionType

-    # example from bart
-    ctx = {
-        'encoder.layers.0.self_attn.attn':
-            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
-        'decoder.layers.0.encoder_attn.attn':
-            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
-        'decoder.layers.0.self_attn.attn':
-            Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
-    }
+        # example from bart
+        ctx = {
+            'encoder.layers.0.self_attn.attn':
+                Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
+            'decoder.layers.0.encoder_attn.attn':
+                Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
+            'decoder.layers.0.self_attn.attn':
+                Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
+        }

-    kv_cache = [
-        torch.zeros((1, )),
-    ]
-    encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
+        kv_cache = [
+            torch.zeros((1, )),
+        ]
+        encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache

-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
-    assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
+        bind_kv_cache(ctx, [kv_cache])
+        assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
+        assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
+        assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]


 def test_bind_kv_cache_pp():

--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
 # SPDX-License-Identifier: Apache-2.0

-import os
+import pytest

 from vllm.config import CompilationLevel

@@ -9,16 +9,17 @@ from ..utils import compare_two_settings
 # --enforce-eager on TPU causes graph compilation
 # this times out default Health Check in the MQLLMEngine,
 # so we set the timeout here to 30s
-os.environ["VLLM_RPC_TIMEOUT"] = "30000"


-def test_custom_dispatcher():
-    compare_two_settings(
-        "google/gemma-2b",
-        arg1=[
-            "--enforce-eager",
-            f"-O{CompilationLevel.DYNAMO_ONCE}",
-        ],
-        arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
-        env1={},
-        env2={})
+def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_RPC_TIMEOUT", "30000")
+        compare_two_settings(
+            "google/gemma-2b",
+            arg1=[
+                "--enforce-eager",
+                f"-O{CompilationLevel.DYNAMO_ONCE}",
+            ],
+            arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
+            env1={},
+            env2={})
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
 # SPDX-License-Identifier: Apache-2.0
+# ruff: noqa
+# type: ignore
+from __future__ import annotations

-import os
 import threading
 from collections.abc import Iterable
 from concurrent import futures
-from typing import Callable, Literal
+from typing import Callable, Generator, Literal

 import grpc
 import pytest
@@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes


 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """
    Since this module is V0 only, set VLLM_USE_V1=0 for
    all tests in the module.
    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield


 FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
@@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer):


 @pytest.fixture
-def trace_service():
+def trace_service() -> Generator[FakeTraceService, None, None]:
    """Fixture to set up a fake gRPC trace service"""
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    service = FakeTraceService()
@@ -80,136 +84,153 @@ def trace_service():
    server.stop(None)


-def test_traces(trace_service):
-    os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"
-
-    sampling_params = SamplingParams(temperature=0.01,
-                                     top_p=0.1,
-                                     max_tokens=256)
-    model = "facebook/opt-125m"
-    llm = LLM(
-        model=model,
-        otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-    )
-    prompts = ["This is a short prompt"]
-    outputs = llm.generate(prompts, sampling_params=sampling_params)
-
-    timeout = 5
-    if not trace_service.evt.wait(timeout):
-        raise TimeoutError(
-            f"The fake trace service didn't receive a trace within "
-            f"the {timeout} seconds timeout")
-
-    request = trace_service.request
-    assert len(request.resource_spans) == 1, (
-        f"Expected 1 resource span, "
-        f"but got {len(request.resource_spans)}")
-    assert len(request.resource_spans[0].scope_spans) == 1, (
-        f"Expected 1 scope span, "
-        f"but got {len(request.resource_spans[0].scope_spans)}")
-    assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
-        f"Expected 1 span, "
-        f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
-
-    attributes = decode_attributes(
-        request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
-    assert attributes.get(
-        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
-    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
-                          ) == sampling_params.temperature
-    assert attributes.get(
-        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
-        outputs[0].prompt_token_ids)
-    completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
-    assert attributes.get(
-        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
-    metrics = outputs[0].metrics
-    assert attributes.get(
-        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
-    ttft = metrics.first_token_time - metrics.arrival_time
-    assert attributes.get(
-        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
-    e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
-    assert metrics.scheduler_time > 0
-    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
-                          ) == metrics.scheduler_time
-    # Model forward and model execute should be none, since detailed traces is
-    # not enabled.
-    assert metrics.model_forward_time is None
-    assert metrics.model_execute_time is None
-
-
-def test_traces_with_detailed_steps(trace_service):
-    os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"
-
-    sampling_params = SamplingParams(temperature=0.01,
-                                     top_p=0.1,
-                                     max_tokens=256)
-    model = "facebook/opt-125m"
-    llm = LLM(
-        model=model,
-        otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-        collect_detailed_traces="all",
-    )
-    prompts = ["This is a short prompt"]
-    outputs = llm.generate(prompts, sampling_params=sampling_params)
-
-    timeout = 5
-    if not trace_service.evt.wait(timeout):
-        raise TimeoutError(
-            f"The fake trace service didn't receive a trace within "
-            f"the {timeout} seconds timeout")
-
-    request = trace_service.request
-    assert len(request.resource_spans) == 1, (
-        f"Expected 1 resource span, "
-        f"but got {len(request.resource_spans)}")
-    assert len(request.resource_spans[0].scope_spans) == 1, (
-        f"Expected 1 scope span, "
-        f"but got {len(request.resource_spans[0].scope_spans)}")
-    assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
-        f"Expected 1 span, "
-        f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
-
-    attributes = decode_attributes(
-        request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
-    assert attributes.get(
-        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
-    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
-                          ) == sampling_params.temperature
-    assert attributes.get(
-        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
-        outputs[0].prompt_token_ids)
-    completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
-    assert attributes.get(
-        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
-    metrics = outputs[0].metrics
-    assert attributes.get(
-        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
-    ttft = metrics.first_token_time - metrics.arrival_time
-    assert attributes.get(
-        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
-    e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
-    assert metrics.scheduler_time > 0
-    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
-                          ) == metrics.scheduler_time
-    assert metrics.model_forward_time > 0
-    assert attributes.get(
-        SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
-            metrics.model_forward_time / 1000)
-    assert metrics.model_execute_time > 0
-    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
-                          ) == metrics.model_execute_time
-    assert metrics.model_forward_time < 1000 * metrics.model_execute_time
+def test_traces(
+    monkeypatch: pytest.MonkeyPatch,
+    trace_service: FakeTraceService,
+):
+    with monkeypatch.context() as m:
+        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
+
+        sampling_params = SamplingParams(
+            temperature=0.01,
+            top_p=0.1,
+            max_tokens=256,
+        )
+        model = "facebook/opt-125m"
+        llm = LLM(
+            model=model,
+            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+        )
+        prompts = ["This is a short prompt"]
+        outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+        timeout = 5
+        if not trace_service.evt.wait(timeout):
+            raise TimeoutError(
+                f"The fake trace service didn't receive a trace within "
+                f"the {timeout} seconds timeout")
+
+        request = trace_service.request
+        assert len(request.resource_spans) == 1, (
+            f"Expected 1 resource span, "
+            f"but got {len(request.resource_spans)}")
+        assert len(request.resource_spans[0].scope_spans) == 1, (
+            f"Expected 1 scope span, "
+            f"but got {len(request.resource_spans[0].scope_spans)}")
+        assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+            f"Expected 1 span, "
+            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+
+        attributes = decode_attributes(
+            request.resource_spans[0].scope_spans[0].spans[0].attributes)
+        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                              ) == sampling_params.temperature
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
+                              ) == sampling_params.max_tokens
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+                outputs[0].prompt_token_ids)
+        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+        metrics = outputs[0].metrics
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
+                              ) == metrics.time_in_queue
+        ttft = metrics.first_token_time - metrics.arrival_time
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        e2e_time = metrics.finished_time - metrics.arrival_time
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
+        assert metrics.scheduler_time > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                              ) == metrics.scheduler_time
+        # Model forward and model execute should be none, since detailed traces is
+        # not enabled.
+        assert metrics.model_forward_time is None
+        assert metrics.model_execute_time is None
+
+
+def test_traces_with_detailed_steps(
+    monkeypatch: pytest.MonkeyPatch,
+    trace_service: FakeTraceService,
+):
+    with monkeypatch.context() as m:
+        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
+
+        sampling_params = SamplingParams(
+            temperature=0.01,
+            top_p=0.1,
+            max_tokens=256,
+        )
+        model = "facebook/opt-125m"
+        llm = LLM(
+            model=model,
+            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+            collect_detailed_traces="all",
+        )
+        prompts = ["This is a short prompt"]
+        outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+        timeout = 5
+        if not trace_service.evt.wait(timeout):
+            raise TimeoutError(
+                f"The fake trace service didn't receive a trace within "
+                f"the {timeout} seconds timeout")
+
+        request = trace_service.request
+        assert len(request.resource_spans) == 1, (
+            f"Expected 1 resource span, "
+            f"but got {len(request.resource_spans)}")
+        assert len(request.resource_spans[0].scope_spans) == 1, (
+            f"Expected 1 scope span, "
+            f"but got {len(request.resource_spans[0].scope_spans)}")
+        assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+            f"Expected 1 span, "
+            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+
+        attributes = decode_attributes(
+            request.resource_spans[0].scope_spans[0].spans[0].attributes)
+        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                              ) == sampling_params.temperature
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
+                              ) == sampling_params.max_tokens
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+                outputs[0].prompt_token_ids)
+        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+        metrics = outputs[0].metrics
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
+                              ) == metrics.time_in_queue
+        ttft = metrics.first_token_time - metrics.arrival_time
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        e2e_time = metrics.finished_time - metrics.arrival_time
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
+        assert metrics.scheduler_time > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                              ) == metrics.scheduler_time
+        assert metrics.model_forward_time > 0
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
+        ) == pytest.approx(metrics.model_forward_time / 1000)
+        assert metrics.model_execute_time > 0
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
+        ) == metrics.model_execute_time
+        assert metrics.model_forward_time < 1000 * metrics.model_execute_time
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -566,6 +566,7 @@ def init_test_distributed_environment(


 def multi_process_parallel(
+    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    pp_size: int,
    test_target: Any,
@@ -582,7 +583,13 @@ def multi_process_parallel(
    refs = []
    for rank in range(tp_size * pp_size):
        refs.append(
-            test_target.remote(tp_size, pp_size, rank, distributed_init_port))
+            test_target.remote(
+                monkeypatch,
+                tp_size,
+                pp_size,
+                rank,
+                distributed_init_port,
+            ), )
    ray.get(refs)

    ray.shutdown()
@@ -700,7 +707,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
    """
    Get a pytest mark, which skips the test if the GPU doesn't meet
    a minimum memory requirement in GB.
-    
+
    This can be leveraged via `@large_gpu_test` to skip tests in environments
    without enough resources, or called when filtering tests to run directly.
    """

--- a/tests/v1/e2e/test_ngram_spec_decode.py
+++ b/tests/v1/e2e/test_ngram_spec_decode.py
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
 import random
+from typing import Any

 import pytest

@@ -50,8 +53,12 @@ def model_name():
    return "meta-llama/Meta-Llama-3-8B-Instruct"


-def test_ngram_correctness(monkeypatch, test_prompts, sampling_config,
-                           model_name):
+def test_ngram_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    test_prompts: list[list[dict[str, Any]]],
+    sampling_config: SamplingParams,
+    model_name: str,
+):
    '''
    Compare the outputs of a original LLM and a speculative LLM
    should be the same when using ngram speculative decoding.

--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM,
                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
 @pytest.mark.asyncio
-async def test_load(monkeypatch, output_kind: RequestOutputKind,
-                    engine_args_and_prompt: tuple[AsyncEngineArgs,
-                                                  PromptType]):
+async def test_load(
+    monkeypatch: pytest.MonkeyPatch,
+    output_kind: RequestOutputKind,
+    engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType],
+):
    # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
    # so that in the future when we switch, we don't have to change all the
    # tests.
@@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind,
                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
 @pytest.mark.asyncio
-async def test_abort(monkeypatch, output_kind: RequestOutputKind,
+async def test_abort(monkeypatch: pytest.MonkeyPatch,
+                     output_kind: RequestOutputKind,
                     engine_args_and_prompt: tuple[AsyncEngineArgs,
                                                   PromptType]):


--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest:


 @fork_new_process_for_each_test
-def test_engine_core(monkeypatch):
+def test_engine_core(monkeypatch: pytest.MonkeyPatch):

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
@@ -159,10 +159,10 @@ def test_engine_core(monkeypatch):


 @fork_new_process_for_each_test
-def test_engine_core_advanced_sampling(monkeypatch):
+def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
    """
-    A basic end-to-end test to verify that the engine functions correctly 
-    when additional sampling parameters, such as top_p, min_tokens, and 
+    A basic end-to-end test to verify that the engine functions correctly
+    when additional sampling parameters, such as top_p, min_tokens, and
    presence_penalty, are set.
    """
    with monkeypatch.context() as m:
@@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch):


 @fork_new_process_for_each_test
-def test_engine_core_concurrent_batches(monkeypatch):
+def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
    """
    Test that the engine can handle multiple concurrent batches.
    """