[Misc] Replace os environ to monkeypatch in test suite (#14516)

Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com> Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz>

[Misc] Replace os environ to monkeypatch in test suite (#14516)
Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com> Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
a73e183e · Sibi · GitHub · 1e799b7e · a73e183e · a73e183e
Unverified Commit a73e183e authored Mar 17, 2025 by Sibi Committed by GitHub Mar 16, 2025
20 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -522,7 +522,7 @@ steps:
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py

 - label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
    hf_runner,
    model: str,
    backend: str,
@@ -63,31 +64,33 @@ def test_models(
        pytest.skip(
            f"{backend} does not support gemma2 with full context length.")

-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)

-    # 5042 tokens for gemma2
-    # gemma2 has alternating sliding window size of 4096
-    # we need a prompt with more than 4096 tokens to test the sliding window
-    prompt = "The following numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
-    example_prompts = [prompt]
+        # 5042 tokens for gemma2
+        # gemma2 has alternating sliding window size of 4096
+        # we need a prompt with more than 4096 tokens to test the sliding window
+        prompt = "The following numbers of the sequence " + ", ".join(
+            str(i) for i in range(1024)) + " are:"
+        example_prompts = [prompt]

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(model,
-                    max_model_len=8192,
-                    dtype=dtype,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        with VllmRunner(model,
+                        max_model_len=8192,
+                        dtype=dtype,
+                        enforce_eager=enforce_eager,
+                        gpu_memory_utilization=0.7) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)

-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @multi_gpu_test(num_gpus=2)
@@ -104,6 +107,7 @@ def test_models(
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
 def test_models_distributed(
+    monkeypatch: pytest.MonkeyPatch,
    hf_runner,
    vllm_runner,
    example_prompts,
@@ -116,34 +120,41 @@ def test_models_distributed(
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

-    if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-
-    dtype = "half"
-    max_tokens = 5
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as monkeypatch_context:
+        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+            # test Ray Compiled Graph
+            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+
+        if attention_backend:
+            monkeypatch_context.setenv(
+                "VLLM_ATTENTION_BACKEND",
+                attention_backend,
+            )
+
+        dtype = "half"
+        max_tokens = 5
+
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with fork method
+        # (the default method).
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=2,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -7,16 +7,22 @@ prefill requests are chunked.

 Run `pytest tests/models/test_chunked_prefill.py`.
 """
-import os
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING

 import pytest

-from tests.kernels.utils import override_backend_env_variable
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test

+if TYPE_CHECKING:
+    from .conftest import HfRunner, VllmRunner
+
 MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-3.2-1B-Instruct",
@@ -24,12 +30,14 @@ MODELS = [


 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """
    Since this module is V0 only, set VLLM_USE_V1=0 for
    all tests in the file.
    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield


 @pytest.mark.parametrize("model", MODELS)
@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    dtype: str,
@@ -52,37 +60,39 @@ def test_models(
    enforce_eager: bool,
    tensor_parallel_size: int,
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
    Checks exact match decode between huggingface model and vllm runner with
    chunked prefill.
    """
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)

-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
+        max_num_seqs = chunked_prefill_token_size
+        max_num_batched_tokens = chunked_prefill_token_size

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    with vllm_runner(
-            model,
-            dtype=dtype,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=True,
-            tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
-            max_num_seqs=max_num_seqs,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                max_num_batched_tokens=max_num_batched_tokens,
+                enable_chunked_prefill=True,
+                tensor_parallel_size=tensor_parallel_size,
+                enforce_eager=enforce_eager,
+                max_num_seqs=max_num_seqs,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)

-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @multi_gpu_test(num_gpus=2)
@@ -90,57 +100,61 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models_distributed(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    if (model == "meta-llama/Llama-3.2-1B-Instruct"
-            and distributed_executor_backend == "ray"):
-        # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-    dtype = "half"
-    max_tokens = 5
-    chunked_prefill_token_size = 16
-
-    # Add a chunked prefill config.
-    max_num_seqs = min(chunked_prefill_token_size, 256)
-    assert chunked_prefill_token_size != -1
-    enable_chunked_prefill = True
-    max_num_batched_tokens = chunked_prefill_token_size
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+        if (model == "meta-llama/Llama-3.2-1B-Instruct"
+                and distributed_executor_backend == "ray"):
+            # test Ray Compiled Graph
+            m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+
+        dtype = "half"
+        max_tokens = 5
+        chunked_prefill_token_size = 16
+
+        # Add a chunked prefill config.
+        max_num_seqs = min(chunked_prefill_token_size, 256)
+        assert chunked_prefill_token_size != -1
+        enable_chunked_prefill = True
+        max_num_batched_tokens = chunked_prefill_token_size
+
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with
+        # fork method (the default method).

-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            max_num_seqs=max_num_seqs,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=2,
+                max_num_seqs=max_num_seqs,
+                enable_chunked_prefill=enable_chunked_prefill,
+                max_num_batched_tokens=max_num_batched_tokens,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(
+                example_prompts,
+                max_tokens,
+            )

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @pytest.mark.parametrize(
@@ -158,7 +172,7 @@ def test_models_distributed(
 # the async postprocessor
 @pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models_with_fp8_kv_cache(
-    vllm_runner,
+    vllm_runner: VllmRunner,
    example_prompts,
    kv_cache_dtype: str,
    model: str,
@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("dtype", ["half"])
 def test_with_prefix_caching(
-    vllm_runner,
+    vllm_runner: VllmRunner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
@@ -254,8 +268,10 @@ def test_with_prefix_caching(
        ) as vllm_model:
            outputs[enable] = []
            for prompt in full_prompts:
-                outputs[enable] += vllm_model.generate_greedy([prompt],
-                                                              max_tokens)
+                outputs[enable] += vllm_model.generate_greedy(
+                    [prompt],
+                    max_tokens,
+                )

    check_outputs_equal(
        outputs_0_lst=outputs[False],
@@ -274,8 +290,8 @@ def test_with_prefix_caching(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_models_cpu(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    dtype: str,
@@ -283,7 +299,7 @@ def test_models_cpu(
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    test_models(
        hf_runner,
@@ -307,7 +323,7 @@ def test_models_cpu(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_with_prefix_caching_cpu(
-    vllm_runner,
+    vllm_runner: VllmRunner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,

--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -123,40 +123,38 @@ def test_cumem_with_cudagraph():
        # sleep mode with pytorch checkpoint
        ("facebook/opt-125m", False),
    ])
-def test_end_to_end(model: str, use_v1: bool):
-    import os
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
-    free, total = torch.cuda.mem_get_info()
-    used_bytes_baseline = total - free  # in case other process is running
-    llm = LLM(model, enable_sleep_mode=True)
-    prompt = "How are you?"
-    sampling_params = SamplingParams(temperature=0, max_tokens=10)
-    output = llm.generate(prompt, sampling_params)
-
-    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
-    # which is difficult to measure in the test. therefore, we only
-    # test sleep level 1 here.
-    llm.sleep(level=1)
-
-    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
-    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
-    # now the memory usage is mostly cudagraph memory pool,
-    # and it should be less than the model weights (1B model, 2GiB weights)
-
-    # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
-    # is captured but cannot be releasesd from PyTorch due to a known bug,
-    # therefore high memory usage after `llm.sleep` is called is expected.
-    # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
-    # in V1.
-    if use_v1:
-        assert used_bytes < 7 * GiB_bytes
-    else:
-        assert used_bytes < 2 * GiB_bytes
-
-    llm.wake_up()
-    output2 = llm.generate(prompt, sampling_params)
-
-    # cmp output
-    assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-    del os.environ["VLLM_USE_V1"]
+def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        free, total = torch.cuda.mem_get_info()
+        used_bytes_baseline = total - free  # in case other process is running
+        llm = LLM(model, enable_sleep_mode=True)
+        prompt = "How are you?"
+        sampling_params = SamplingParams(temperature=0, max_tokens=10)
+        output = llm.generate(prompt, sampling_params)
+
+        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+        # which is difficult to measure in the test. therefore, we only
+        # test sleep level 1 here.
+        llm.sleep(level=1)
+
+        free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+        # now the memory usage is mostly cudagraph memory pool,
+        # and it should be less than the model weights (1B model, 2GiB weights)
+
+        # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+        # is captured but cannot be releasesd from PyTorch due to a known bug,
+        # therefore high memory usage after `llm.sleep` is called is expected.
+        # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+        # in V1.
+        if use_v1:
+            assert used_bytes < 7 * GiB_bytes
+        else:
+            assert used_bytes < 2 * GiB_bytes
+
+        llm.wake_up()
+        output2 = llm.generate(prompt, sampling_params)
+
+        # cmp output
+        assert output[0].outputs[0].text == output2[0].outputs[0].text
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations

 import dataclasses
-from typing import Optional

 import pytest

@@ -22,75 +22,76 @@ class TestSetting:
    fullgraph: bool


-# representative settings for testing
-test_settings = [
-    # basic llama model
-    TestSetting(
-        model="meta-llama/Llama-3.2-1B-Instruct",
-        model_args=[],
-        pp_size=2,
-        tp_size=2,
-        attn_backend="FLASHINFER",
-        method="generate",
-        fullgraph=True,
-    ),
-    # llama model with quantization
-    TestSetting(
-        model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-        model_args=["--quantization", "gptq"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="generate",
-        fullgraph=True,
-    ),
-    # MoE model
-    TestSetting(
-        model="ibm/PowerMoE-3b",
-        model_args=[],
-        pp_size=1,
-        tp_size=2,
-        attn_backend="FLASH_ATTN",
-        method="generate",
-        fullgraph=True,
-    ),
-    # embedding model
-    TestSetting(
-        model="BAAI/bge-multilingual-gemma2",
-        model_args=["--task", "embed"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="encode",
-        fullgraph=True,
-    ),
-    # encoder-based embedding model (BERT)
-    TestSetting(
-        model="BAAI/bge-base-en-v1.5",
-        model_args=["--task", "embed"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="XFORMERS",
-        method="encode",
-        fullgraph=True,
-    ),
-    # vision language model
-    TestSetting(
-        model="microsoft/Phi-3.5-vision-instruct",
-        model_args=["--trust-remote-code", "--max-model-len", "2048"],
-        pp_size=2,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="generate_with_image",
-        fullgraph=False,
-    ),
-]
-
-
 # we cannot afford testing the full Catesian product
 # of all models and all levels
-@pytest.mark.parametrize("test_setting", test_settings)
-def test_compile_correctness(test_setting: TestSetting):
+@pytest.mark.parametrize(
+    "test_setting",
+    [
+        # basic llama model
+        TestSetting(
+            model="meta-llama/Llama-3.2-1B-Instruct",
+            model_args=[],
+            pp_size=2,
+            tp_size=2,
+            attn_backend="FLASHINFER",
+            method="generate",
+            fullgraph=True,
+        ),
+        # llama model with quantization
+        TestSetting(
+            model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            model_args=["--quantization", "gptq"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="generate",
+            fullgraph=True,
+        ),
+        # MoE model
+        TestSetting(
+            model="ibm/PowerMoE-3b",
+            model_args=[],
+            pp_size=1,
+            tp_size=2,
+            attn_backend="FLASH_ATTN",
+            method="generate",
+            fullgraph=True,
+        ),
+        # embedding model
+        TestSetting(
+            model="BAAI/bge-multilingual-gemma2",
+            model_args=["--task", "embed"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="encode",
+            fullgraph=True,
+        ),
+        # encoder-based embedding model (BERT)
+        TestSetting(
+            model="BAAI/bge-base-en-v1.5",
+            model_args=["--task", "embed"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="XFORMERS",
+            method="encode",
+            fullgraph=True,
+        ),
+        # vision language model
+        TestSetting(
+            model="microsoft/Phi-3.5-vision-instruct",
+            model_args=["--trust-remote-code", "--max-model-len", "2048"],
+            pp_size=2,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="generate_with_image",
+            fullgraph=False,
+        ),
+    ])
+def test_compile_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    test_setting: TestSetting,
+):
    # this test is run under multiple suits, with different GPUs.
    # make sure we only run the test with correct CUDA devices.
    # don't use "<", as it will duplicate the tests.
@@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting):
    fullgraph = test_setting.fullgraph
    if cuda_device_count_stateless() != pp_size * tp_size:
        pytest.skip("Not correct CUDA devices for the test.")
-    import os
-    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
-    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
-                ["-tp", str(tp_size)]

-    all_args: list[list[str]] = []
-    all_envs: list[Optional[dict[str, str]]] = []
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        final_args = [
+            "--enforce-eager", *model_args, "-pp",
+            str(pp_size), "-tp",
+            str(tp_size)
+        ]
+
+        all_args: list[list[str]] = []
+        all_envs: list[dict[str, str] | None] = []

-    for level in [
-            CompilationLevel.NO_COMPILATION,
-            CompilationLevel.PIECEWISE,
-    ]:
-        all_args.append(final_args + [f"-O{level}"])
-        all_envs.append({})
+        for level in [
+                CompilationLevel.NO_COMPILATION,
+                CompilationLevel.PIECEWISE,
+        ]:
+            all_args.append(final_args + [f"-O{level}"])
+            all_envs.append({})

-    # inductor will change the output, so we only compare if the output
-    # is close, not exactly the same.
-    compare_all_settings(
-        model,
-        all_args,
-        all_envs,
-        method=method if method != "generate" else "generate_close")
-    all_envs.clear()
-    all_args.clear()
+        # inductor will change the output, so we only compare if the output
+        # is close, not exactly the same.
+        compare_all_settings(
+            model,
+            all_args,
+            all_envs,
+            method=method if method != "generate" else "generate_close")
+        all_envs.clear()
+        all_args.clear()

-    for level in [
-            CompilationLevel.NO_COMPILATION,
-            CompilationLevel.DYNAMO_AS_IS,
-            CompilationLevel.DYNAMO_ONCE,
-    ]:
-        all_args.append(final_args + [f"-O{level}"])
-        all_envs.append({})
-        if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
-            # "DYNAMO_ONCE" will always use fullgraph
-            all_envs[-1][
-                "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore
+        for level in [
+                CompilationLevel.NO_COMPILATION,
+                CompilationLevel.DYNAMO_AS_IS,
+                CompilationLevel.DYNAMO_ONCE,
+        ]:
+            all_args.append(final_args + [f"-O{level}"])
+            all_envs.append({})
+            if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
+                # "DYNAMO_ONCE" will always use fullgraph
+                all_envs[-1][
+                    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore

-    compare_all_settings(model, all_args * 3, all_envs, method=method)
+        compare_all_settings(model, all_args * 3, all_envs, method=method)
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
 # SPDX-License-Identifier: Apache-2.0

+from __future__ import annotations
+
+from typing import Any
+
 import pytest
+import torch

+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
 from vllm.config import CompilationLevel
+from vllm.platforms import current_platform

 from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS, check_full_graph_support


-@pytest.mark.parametrize("model_info", TEST_MODELS)
+@pytest.fixture(params=None, name="model_info")
+def models_list_fixture(request):
+    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
+        ("facebook/opt-125m", {}),
+        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+            "quantization": "compressed-tensors"
+        }),
+        ("meta-llama/Llama-3.2-1B-Instruct", {}),
+    ]
+
+    if is_quant_method_supported("aqlm"):
+        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+            "quantization": "aqlm"
+        }))
+
+    # TODO: figure out why this fails.
+    if False and is_quant_method_supported("gguf"):  # noqa: SIM223
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+            "quantization": "gguf"
+        }))
+
+    if is_quant_method_supported("gptq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+            "quantization": "gptq"
+        }))
+
+    if is_quant_method_supported("gptq_marlin"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+            "quantization": "gptq_marlin"
+        }))
+
+    if is_quant_method_supported("gptq_marlin_24"):
+        TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+            "quantization": "gptq_marlin_24"
+        }))
+
+    if is_quant_method_supported("marlin"):
+        TEST_MODELS.append(
+            ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+                "quantization": "marlin"
+            }))
+
+    if not current_platform.is_rocm() and is_quant_method_supported("awq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+            "quantization": "AWQ"
+        }))
+
+    return TEST_MODELS
+
+
 @pytest.mark.parametrize(
    "optimization_level",
-    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
+    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+)
+@pytest.mark.parametrize("model_info", "", indirect=True)
 @fork_new_process_for_each_test
-def test_full_graph(model_info, optimization_level):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-    check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1)
+def test_full_graph(
+    monkeypatch: pytest.MonkeyPatch,
+    model_info: tuple[str, dict[str, Any]],
+    optimization_level: int,
+):
+    model, model_kwargs = model_info
+
+    with monkeypatch.context() as m:
+        # make sure these models can be captured in full graph mode
+        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+        print(f"MODEL={model}")
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(
+            model=model,
+            enforce_eager=True,
+            tensor_parallel_size=1,
+            disable_custom_all_reduce=True,
+            compilation_config=optimization_level,
+            **model_kwargs,
+        )
+        outputs = llm.generate(prompts, sampling_params)
+
+        # Print the outputs.
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-
-import torch
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm import LLM, SamplingParams
-from vllm.platforms import current_platform
-
-TEST_MODELS = [
-    ("facebook/opt-125m", {}),
-    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
-        "dtype": torch.float16,
-        "quantization": "compressed-tensors"
-    }),
-    ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
-        "dtype": torch.float16,
-        "quantization": "compressed-tensors"
-    }),
-    ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-        "quantization": "compressed-tensors"
-    }),
-    ("meta-llama/Llama-3.2-1B-Instruct", {}),
-]
-
-if is_quant_method_supported("aqlm"):
-    TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
-        "quantization": "aqlm"
-    }))
-
-# TODO: figure out why this fails.
-if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
-        "quantization": "gguf"
-    }))
-
-if is_quant_method_supported("gptq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
-        "quantization": "gptq"
-    }))
-
-if is_quant_method_supported("gptq_marlin"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
-        "quantization": "gptq_marlin"
-    }))
-
-if is_quant_method_supported("gptq_marlin_24"):
-    TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
-        "quantization": "gptq_marlin_24"
-    }))
-
-if is_quant_method_supported("marlin"):
-    TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
-        "quantization": "marlin"
-    }))
-
-if not current_platform.is_rocm() and is_quant_method_supported("awq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
-        "quantization": "AWQ"
-    }))
-
-
-def check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1):
-    # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
-
-    print(f"MODEL={model}")
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model=model,
-              enforce_eager=True,
-              tensor_parallel_size=tp_size,
-              disable_custom_all_reduce=True,
-              compilation_config=optimization_level,
-              **model_kwargs)
-
-    outputs = llm.generate(prompts, sampling_params)
-
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1110,4 +1110,4 @@ def pytest_collection_modifyitems(config, items):
    skip_optional = pytest.mark.skip(reason="need --optional option to run")
    for item in items:
        if "optional" in item.keywords:
-            item.add_marker(skip_optional)
+            item.add_marker(skip_optional)
\ No newline at end of file
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -3,7 +3,10 @@

 Run `pytest tests/distributed/test_comm_ops.py`.
 """
-import os
+
+from __future__ import annotations
+
+from typing import Any, Callable

 import pytest
 import ray
@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel


 @ray.remote(num_gpus=1, max_calls=1)
-def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_reduce_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,


 @ray.remote(num_gpus=1, max_calls=1)
-def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_gather_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,


 @ray.remote(num_gpus=1, max_calls=1)
-def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
+def broadcast_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,


 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,


 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
-                          distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
    all_reduce_test_worker, all_gather_test_worker,
    broadcast_tensor_dict_test_worker
 ])
-def test_multi_process_tensor_parallel(tp_size, test_target):
-    multi_process_parallel(tp_size, 1, test_target)
+def test_multi_process_tensor_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, tp_size, 1, test_target)


 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
 @pytest.mark.parametrize("pp_size", [2])
 @pytest.mark.parametrize(
    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
-def test_multi_process_pipeline_parallel(pp_size, test_target):
-    multi_process_parallel(1, pp_size, test_target)
+def test_multi_process_pipeline_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    pp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, 1, pp_size, test_target)


 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
    broadcast_tensor_dict_test_worker
 ])
 def test_multi_process_tensor_parallel_pipeline_parallel(
-        tp_size, pp_size, test_target):
-    multi_process_parallel(tp_size, pp_size, test_target)
+    tp_size: int,
+    pp_size: int,
+    test_target: Callable[..., Any],
+    monkeypatch: pytest.MonkeyPatch,
+):
+    multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
 # SPDX-License-Identifier: Apache-2.0

-import os
 import random

 import pytest
@@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes):


 @ray.remote(num_gpus=1, max_calls=1)
-def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
-    ensure_model_parallel_initialized(tp_size, pp_size)
-    group = get_tensor_model_parallel_group().device_group
-
-    # A small all_reduce for warmup.
-    # this is needed because device communicators might be created lazily
-    # (e.g. NCCL). This will ensure that the communicator is initialized
-    # before any communication happens, so that this group can be used for
-    # graph capture immediately.
-    data = torch.zeros(1)
-    data = data.to(device=device)
-    torch.distributed.all_reduce(data, group=group)
-    torch.cuda.synchronize()
-    del data
-
-    # we use the first group to communicate once
-    # and the second group to communicate twice
-    # and so on
-    # this is used to demonstrate that each group can
-    # communicate independently
-    num_communication = rank // tp_size + 1
-
-    for sz in test_sizes:
-        for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-            with graph_capture(device=device) as graph_capture_context:
-                # use integers so result matches NCCL exactly
-                inp1 = torch.randint(1,
-                                     16, (sz, ),
-                                     dtype=dtype,
-                                     device=torch.cuda.current_device())
-                inp2 = torch.randint(1,
-                                     16, (sz, ),
-                                     dtype=dtype,
-                                     device=torch.cuda.current_device())
-                torch.cuda.synchronize()
-                graph = torch.cuda.CUDAGraph()
-                with torch.cuda.graph(graph,
-                                      stream=graph_capture_context.stream):
-                    for i in range(num_communication):
-                        out1 = tensor_model_parallel_all_reduce(inp1)
-                        # the input buffer is immediately modified to test
-                        # synchronization
-                        dist.all_reduce(inp1, group=group)
-                        out2 = tensor_model_parallel_all_reduce(inp2)
-                        dist.all_reduce(inp2, group=group)
-            graph.replay()
-            torch.testing.assert_close(out1, inp1)
-            torch.testing.assert_close(out2, inp2)
+def graph_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        init_test_distributed_environment(tp_size, pp_size, rank,
+                                          distributed_init_port)
+        ensure_model_parallel_initialized(tp_size, pp_size)
+        group = get_tensor_model_parallel_group().device_group
+
+        # A small all_reduce for warmup.
+        # this is needed because device communicators might be created lazily
+        # (e.g. NCCL). This will ensure that the communicator is initialized
+        # before any communication happens, so that this group can be used for
+        # graph capture immediately.
+        data = torch.zeros(1)
+        data = data.to(device=device)
+        torch.distributed.all_reduce(data, group=group)
+        torch.cuda.synchronize()
+        del data
+
+        # we use the first group to communicate once
+        # and the second group to communicate twice
+        # and so on
+        # this is used to demonstrate that each group can
+        # communicate independently
+        num_communication = rank // tp_size + 1
+
+        for sz in test_sizes:
+            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+                with graph_capture(device=device) as graph_capture_context:
+                    # use integers so result matches NCCL exactly
+                    inp1 = torch.randint(1,
+                                         16, (sz, ),
+                                         dtype=dtype,
+                                         device=torch.cuda.current_device())
+                    inp2 = torch.randint(1,
+                                         16, (sz, ),
+                                         dtype=dtype,
+                                         device=torch.cuda.current_device())
+                    torch.cuda.synchronize()
+                    graph = torch.cuda.CUDAGraph()
+                    with torch.cuda.graph(graph,
+                                          stream=graph_capture_context.stream):
+                        for i in range(num_communication):
+                            out1 = tensor_model_parallel_all_reduce(inp1)
+                            # the input buffer is immediately modified to test
+                            # synchronization
+                            dist.all_reduce(inp1, group=group)
+                            out2 = tensor_model_parallel_all_reduce(inp2)
+                            dist.all_reduce(inp2, group=group)
+                graph.replay()
+                torch.testing.assert_close(out1, inp1)
+                torch.testing.assert_close(out2, inp2)


 @ray.remote(num_gpus=1, max_calls=1)
-def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
-
-    # we use the first group to communicate once
-    # and the second group to communicate twice
-    # and so on
-    # this is used to demonstrate that each group can
-    # communicate independently
-    num_communication = rank // tp_size + 1
-    sz = 1024
-    fa = get_tp_group().ca_comm
-    inp = torch.ones(sz, dtype=torch.float32, device=device)
-    out = inp
-    for _ in range(num_communication):
-        out = fa.all_reduce(out, registered=False)
-    torch.testing.assert_close(out, inp * (tp_size**num_communication))
-
-    inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
-    out = inp
-    for _ in range(num_communication):
-        out = fa.all_reduce(out, registered=False)
-    torch.testing.assert_close(out, inp * (tp_size**num_communication))
+def eager_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        init_test_distributed_environment(tp_size, pp_size, rank,
+                                          distributed_init_port)
+
+        # we use the first group to communicate once
+        # and the second group to communicate twice
+        # and so on
+        # this is used to demonstrate that each group can
+        # communicate independently
+        num_communication = rank // tp_size + 1
+        sz = 1024
+        fa = get_tp_group().ca_comm
+        inp = torch.ones(sz, dtype=torch.float32, device=device)
+        out = inp
+        for _ in range(num_communication):
+            out = fa.all_reduce(out, registered=False)
+        torch.testing.assert_close(out, inp * (tp_size**num_communication))
+
+        inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
+        out = inp
+        for _ in range(num_communication):
+            out = fa.all_reduce(out, registered=False)
+        torch.testing.assert_close(out, inp * (tp_size**num_communication))


 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
 @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
-def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
+def test_custom_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pipeline_parallel_size,
+    test_target,
+):
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
-    multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
+                           test_target)
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
@@ -7,33 +7,35 @@ import pytest
 from vllm.distributed.utils import get_pp_indices


-def test_custom_layer_partition():
-
-    def _verify(partition_str, num_layers, pp_size, goldens):
-        bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
-        os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
-        for pp_rank, golden in enumerate(goldens):
-            assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
-        if bak is not None:
-            os.environ["VLLM_PP_LAYER_PARTITION"] = bak
-
-    # Even partition
-    _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    # Balanced partition
-    _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
-    # Put reminder somewhere
-    _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
-    # Invalid partition strings
-    with pytest.raises(ValueError):
-        _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    with pytest.raises(ValueError):
-        _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    # Wrong number of partitions
-    with pytest.raises(ValueError):
-        _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    # Wrong number of layers
-    with pytest.raises(ValueError):
-        _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
+
+    with monkeypatch.context() as m:
+
+        def _verify(partition_str, num_layers, pp_size, goldens):
+            bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
+            m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
+            for pp_rank, golden in enumerate(goldens):
+                assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
+            if bak is not None:
+                m.setenv("VLLM_PP_LAYER_PARTITION", bak)
+
+        # Even partition
+        _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        # Balanced partition
+        _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
+        # Put reminder somewhere
+        _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
+        # Invalid partition strings
+        with pytest.raises(ValueError):
+            _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        with pytest.raises(ValueError):
+            _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        # Wrong number of partitions
+        with pytest.raises(ValueError):
+            _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        # Wrong number of layers
+        with pytest.raises(ValueError):
+            _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])


 @pytest.mark.parametrize(
@@ -55,6 +57,10 @@ def test_custom_layer_partition():
        (5, 3, 1, (2, 4)),
        (5, 3, 2, (4, 5)),
    ])
-def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
-                               pp_rank: int, indices: tuple[int, int]):
+def test_uneven_auto_partition(
+    num_hidden_layers: int,
+    pp_size: int,
+    pp_rank: int,
+    indices: tuple[int, int],
+):
    assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations

-import os
+from typing import TYPE_CHECKING

 import pytest

 from ..utils import compare_two_settings, fork_new_process_for_each_test

+if TYPE_CHECKING:
+    from typing_extensions import LiteralString
+

 @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, "JackFram/llama-160m"),
@@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
    "FLASHINFER",
 ])
 @fork_new_process_for_each_test
-def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
-    cudagraph_args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--pipeline-parallel-size",
-        str(PP_SIZE),
-        "--distributed-executor-backend",
-        "mp",
-    ]
-    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
-
-    eager_args = cudagraph_args + ["--enforce-eager"]
-
-    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
+def test_pp_cudagraph(
+    monkeypatch: pytest.MonkeyPatch,
+    PP_SIZE: int,
+    MODEL_NAME: str,
+    ATTN_BACKEND: LiteralString,
+):
+    with monkeypatch.context() as m:
+        cudagraph_args = [
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "float16",
+            "--pipeline-parallel-size",
+            str(PP_SIZE),
+            "--distributed-executor-backend",
+            "mp",
+        ]
+        m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
+
+        eager_args = cudagraph_args + ["--enforce-eager"]
+
+        compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = ""  #"tensor_parallel_size=4"
 @pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 is currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
        run_test(more_args)


-def test_lm_eval_accuracy_v0_engine(monkeypatch):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V0 Engine."""

    with monkeypatch.context() as m:

--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -53,32 +53,37 @@ def cache_models():

 @pytest.mark.skip_global_cleanup
 @pytest.mark.usefixtures("cache_models")
-def test_offline_mode(monkeypatch):
+def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
    # Set HF to offline mode and ensure we can still construct an LLM
-    try:
-        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
-        monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
+    with monkeypatch.context() as m:
+        try:
+            m.setenv("HF_HUB_OFFLINE", "1")
+            m.setenv("VLLM_NO_USAGE_STATS", "1")

-        def disable_connect(*args, **kwargs):
-            raise RuntimeError("No http calls allowed")
+            def disable_connect(*args, **kwargs):
+                raise RuntimeError("No http calls allowed")

-        monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
-                            disable_connect)
-        monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
-                            disable_connect)
+            m.setattr(
+                urllib3.connection.HTTPConnection,
+                "connect",
+                disable_connect,
+            )
+            m.setattr(
+                urllib3.connection.HTTPSConnection,
+                "connect",
+                disable_connect,
+            )

-        # Need to re-import huggingface_hub and friends to setup offline mode
-        _re_import_modules()
-        # Cached model files should be used in offline mode
-        for model_config in MODEL_CONFIGS:
-            LLM(**model_config)
-    finally:
-        # Reset the environment after the test
-        # NB: Assuming tests are run in online mode
-        monkeypatch.delenv("HF_HUB_OFFLINE")
-        monkeypatch.delenv("VLLM_NO_USAGE_STATS")
-        _re_import_modules()
-        pass
+            # Need to re-import huggingface_hub
+            # and friends to setup offline mode
+            _re_import_modules()
+            # Cached model files should be used in offline mode
+            for model_config in MODEL_CONFIGS:
+                LLM(**model_config)
+        finally:
+            # Reset the environment after the test
+            # NB: Assuming tests are run in online mode
+            _re_import_modules()


 def _re_import_modules():

--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@@ -70,7 +70,7 @@ def run_test(more_args):
 @pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):


 @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
-def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
+                                    more_args):
    """Run with the V0 Engine."""

    with monkeypatch.context() as m:

--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
 import pytest
 import torch

-from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.openvino import OpenVinoPlatform
 from vllm.platforms.rocm import RocmPlatform
-from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
+from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL


 @pytest.fixture(autouse=True)
@@ -25,87 +24,111 @@ def clear_cache():
    "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
 @pytest.mark.parametrize("use_v1", [True, False])
 @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
-def test_env(name: str, use_v1: bool, device: str, monkeypatch):
+def test_env(
+    name: str,
+    use_v1: bool,
+    device: str,
+    monkeypatch: pytest.MonkeyPatch,
+):
    """Test that the attention selector can be set via environment variable.
    Note that we do not test FlashAttn because it is the default backend.
    """

-    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-    override_backend_env_variable(monkeypatch, name)
-
-    if device == "cpu":
-        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
-            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
-                                       False)
-        assert backend.get_name() == "TORCH_SDPA"
-    elif device == "hip":
-        with patch("vllm.attention.selector.current_platform", RocmPlatform()):
-            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
-                                       False)
-        EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
-        assert backend.get_name() == EXPECTED
-    elif device == "openvino":
-        with patch("vllm.attention.selector.current_platform",
-                   OpenVinoPlatform()), patch.dict('sys.modules',
-                                                   {'openvino': Mock()}):
-            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
-                                       False)
-        assert backend.get_name() == "OPENVINO"
-    else:
-        if name in ["XFORMERS", "FLASHINFER"]:
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv(STR_BACKEND_ENV_VAR, name)
+
+        if device == "cpu":
+            with patch("vllm.attention.selector.current_platform",
+                       CpuPlatform()):
+                backend = get_attn_backend(16, torch.float16, torch.float16,
+                                           16, False)
+            assert backend.get_name() == "TORCH_SDPA"
+        elif device == "hip":
            with patch("vllm.attention.selector.current_platform",
-                       CudaPlatform()):
+                       RocmPlatform()):
                backend = get_attn_backend(16, torch.float16, torch.float16,
                                           16, False)
-            EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
+            EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
            assert backend.get_name() == EXPECTED
+        elif device == "openvino":
+            with patch("vllm.attention.selector.current_platform",
+                       OpenVinoPlatform()), patch.dict('sys.modules',
+                                                       {'openvino': Mock()}):
+                backend = get_attn_backend(16, torch.float16, torch.float16,
+                                           16, False)
+            assert backend.get_name() == "OPENVINO"
+        else:
+            if name in ["XFORMERS", "FLASHINFER"]:
+                with patch("vllm.attention.selector.current_platform",
+                           CudaPlatform()):
+                    backend = get_attn_backend(16, torch.float16,
+                                               torch.float16, 16, False)
+                EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
+                assert backend.get_name() == EXPECTED


-def test_flash_attn(monkeypatch):
+def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
    """Test FlashAttn validation."""
    # TODO: When testing for v1, pipe in `use_v1` as an argument to
    # get_attn_backend

-    override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)

-    # Unsupported CUDA arch
-    with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
+        # Unsupported CUDA arch
+        monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
+                            (7, 5))
        backend = get_attn_backend(16, torch.float16, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

-    # Unsupported data type
-    backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
-    assert backend.get_name() != STR_FLASH_ATTN_VAL
+        # Reset the monkeypatch for subsequent tests
+        monkeypatch.undo()

-    # Unsupported kv cache data type
-    backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
-    assert backend.get_name() != STR_FLASH_ATTN_VAL
+        # Unsupported data type
+        backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
+        assert backend.get_name() != STR_FLASH_ATTN_VAL

-    # Unsupported block size
-    backend = get_attn_backend(16, torch.float16, None, 8, False)
-    assert backend.get_name() != STR_FLASH_ATTN_VAL
+        # Unsupported kv cache data type
+        backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
+        assert backend.get_name() != STR_FLASH_ATTN_VAL

-    # flash-attn is not installed
-    with patch.dict('sys.modules', {'vllm_flash_attn': None}):
+        # Unsupported block size
+        backend = get_attn_backend(16, torch.float16, None, 8, False)
+        assert backend.get_name() != STR_FLASH_ATTN_VAL
+
+        # flash-attn is not installed
+        import sys
+        original_module = sys.modules.get('vllm_flash_attn')
+        monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
        backend = get_attn_backend(16, torch.float16, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

-    # Unsupported head size
-    backend = get_attn_backend(17, torch.float16, None, 16, False)
-    assert backend.get_name() != STR_FLASH_ATTN_VAL
+        # Restore the original module if it existed
+        if original_module is not None:
+            monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
+                                original_module)
+        else:
+            monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)

-    # Attention-free models should bypass env and use PlaceholderAttention
-    backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
-    assert backend.get_name() != STR_FLASH_ATTN_VAL
+        # Unsupported head size
+        backend = get_attn_backend(17, torch.float16, None, 16, False)
+        assert backend.get_name() != STR_FLASH_ATTN_VAL
+
+        # Attention-free models should bypass env and use PlaceholderAttention
+        backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
+        assert backend.get_name() != STR_FLASH_ATTN_VAL


 @pytest.mark.parametrize("use_v1", [True, False])
-def test_invalid_env(use_v1: bool, monkeypatch):
-    """Ignore the invalid env variable if it is set."""
-    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-    override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
+def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
+
+    with monkeypatch.context() as m, patch(
+            "vllm.attention.selector.current_platform", CudaPlatform()):
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)

-    with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+        # Test with head size 32
        backend = get_attn_backend(32, torch.float16, None, 16, False)
        EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
        assert backend.get_name() == EXPECTED

--- a/tests/kernels/test_awq.py
+++ b/tests/kernels/test_awq.py
 # SPDX-License-Identifier: Apache-2.0

-import os
-
 import pytest
 import torch

@@ -11,36 +9,38 @@ from vllm import _custom_ops as ops  # noqa: F401

 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
                    reason="AWQ is not supported on this GPU type.")
-def test_awq_dequantize_opcheck():
-    os.environ["VLLM_USE_TRITON_AWQ"] = "0"
-    qweight = torch.randint(-2000000000,
-                            2000000000, (8192, 256),
-                            device='cuda',
-                            dtype=torch.int32)
-    scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
-    zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
-    split_k_iters = 0
-    thx = 0
-    thy = 0
-    opcheck(torch.ops._C.awq_dequantize,
-            (qweight, scales, zeros, split_k_iters, thx, thy))
+def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_TRITON_AWQ", "0")
+        qweight = torch.randint(-2000000000,
+                                2000000000, (8192, 256),
+                                device='cuda',
+                                dtype=torch.int32)
+        scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
+        zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
+        split_k_iters = 0
+        thx = 0
+        thy = 0
+        opcheck(torch.ops._C.awq_dequantize,
+                (qweight, scales, zeros, split_k_iters, thx, thy))


 @pytest.mark.skip(reason="Not working; needs investigation.")
 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
                    reason="AWQ is not supported on this GPU type.")
-def test_awq_gemm_opcheck():
-    os.environ["VLLM_USE_TRITON_AWQ"] = "0"
-    input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
-    qweight = torch.randint(-2000000000,
-                            2000000000, (8192, 256),
-                            device='cuda',
-                            dtype=torch.int32)
-    scales = torch.randint(-2000000000,
-                           2000000000, (64, 256),
-                           device='cuda',
-                           dtype=torch.int32)
-    qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
-    split_k_iters = 8
-    opcheck(torch.ops._C.awq_gemm,
-            (input, qweight, qzeros, scales, split_k_iters))
+def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_TRITON_AWQ", "0")
+        input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
+        qweight = torch.randint(-2000000000,
+                                2000000000, (8192, 256),
+                                device='cuda',
+                                dtype=torch.int32)
+        scales = torch.randint(-2000000000,
+                               2000000000, (64, 256),
+                               device='cuda',
+                               dtype=torch.int32)
+        qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
+        split_k_iters = 8
+        opcheck(torch.ops._C.awq_gemm,
+                (input, qweight, qzeros, scales, split_k_iters))
--- a/tests/kernels/test_rocm_attention_selector.py
+++ b/tests/kernels/test_rocm_attention_selector.py
 # SPDX-License-Identifier: Apache-2.0

-from unittest.mock import patch
-
 import pytest
 import torch

-from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.platforms.rocm import RocmPlatform
+from vllm.utils import STR_BACKEND_ENV_VAR


 @pytest.fixture(autouse=True)
@@ -17,15 +15,19 @@ def clear_cache():
    _cached_get_attn_backend.cache_clear()


-def test_selector(monkeypatch):
-    """Test that the attention selector for ROCm.
-    """
-    override_backend_env_variable(monkeypatch, "ROCM_FLASH")
+def test_selector(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")

-    with patch("vllm.attention.selector.current_platform", RocmPlatform()):
+        # Set the current platform to ROCm using monkeypatch
+        monkeypatch.setattr("vllm.attention.selector.current_platform",
+                            RocmPlatform())
+
+        # Test standard ROCm attention
        backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
        assert (backend.get_name() == "ROCM_FLASH"
                or backend.get_name() == "ROCM_ATTN_VLLM_V1")
+
        # mla test for deepseek related
        backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
                                   False, True)

--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -36,12 +36,12 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = (

 class QKVInputs(NamedTuple):
    '''
-    Data structure for representing unpacked attention inputs, 
+    Data structure for representing unpacked attention inputs,
    query/key/values and their sequence lengths.

    Attributes:

-        * {query,key,value}: unpacked (batch_size x padded_seq_len x 
+        * {query,key,value}: unpacked (batch_size x padded_seq_len x
                             num_heads x head_size) attention inputs
        * q_seq_lens: query sequence lengths list
        * kv_seq_lens: shared key/value sequence lengths list
@@ -56,14 +56,14 @@ class QKVInputs(NamedTuple):

 class QKVO(NamedTuple):
    '''
-    Data structure for representing unpacked attention inputs, 
+    Data structure for representing unpacked attention inputs,
    alongside unpacked known-correct attention output

    Attributes:

-        * qkv: unpacked (batch_size x padded_seq_len x 
+        * qkv: unpacked (batch_size x padded_seq_len x
                             num_heads x head_size) attention inputs
-        * ideal_output: unpacked (batch_size x padded_seq_len x 
+        * ideal_output: unpacked (batch_size x padded_seq_len x
                        num_heads x head_size) known-correct attention output
    '''

@@ -77,7 +77,7 @@ class PackedQKVInputs(NamedTuple):

    Attributes:

-        * {query,key,value}: packed (number_of_tokens x num_heads 
+        * {query,key,value}: packed (number_of_tokens x num_heads
                             x head_size) attention inputs
        * q_start_loc_list: list of query start locations within packed tensor
        * kv_start_loc_list: shared list of key/value start locations within
@@ -97,14 +97,14 @@ class PackedQKVInputs(NamedTuple):

 class PackedQKVO(NamedTuple):
    '''
-    Data structure for representing packed attention inputs, 
+    Data structure for representing packed attention inputs,
    alongside packed known-correct attention output

    Attributes:

-        * packed_qkv: packed (number_of_tokens x num_heads 
+        * packed_qkv: packed (number_of_tokens x num_heads
                      x head_size) attention inputs
-        * ideal_output: packed (number_of_tokens x num_heads 
+        * ideal_output: packed (number_of_tokens x num_heads
                        x head_size) known-correct attention output
    '''

@@ -134,7 +134,7 @@ class PhaseTestParameters(NamedTuple):

    Attributes:

-        * packed_qkvo: packed (number_of_tokens x num_heads 
+        * packed_qkvo: packed (number_of_tokens x num_heads
                       x head_size) attention inputs & known-correct
                       output
        * kv_mmap: KV cache memory mapping, specific to this test phase &
@@ -195,7 +195,7 @@ def make_causal_mask(
    Create a q_max_seq_len x kv_max_seq_len causal mask

    Arguments:
-    
+
    * q_max_seq_len: query max seq len
    * kv_max_seq_len: key/value max seq len

@@ -320,9 +320,9 @@ def make_qkv(
    * max_kv_seq_len: max key/value seq len
    * num_heads
    * head_size
-    * is_encoder_decoder_attn: if True, query seqlen may differ from 
-      key/value seqlen (as is often the case for cross-attention); 
-      o/w, query/key/value seqlens match at each batch index 
+    * is_encoder_decoder_attn: if True, query seqlen may differ from
+      key/value seqlen (as is often the case for cross-attention);
+      o/w, query/key/value seqlens match at each batch index
      (max_kv_seq_len is unused)
    * force_kv_seq_lens: if not None, overrides kv sequence lengths
    * attn_type: encoder, decoder self, or enc/dec cross attention
@@ -469,7 +469,7 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device,
    Individually pack each of Q, K and V, each with dimensions batch_size x
    padded_seq_len x num_heads x head_size, into respective number_of_tokens x
    num_heads x head_size tensors.
-    
+
    For Q, number_of_tokens = sum(q_seq_lens).

    For K and V, number_of_tokens = sum(kv_seq_lens)
@@ -619,9 +619,9 @@ def make_kv_cache(num_blocks: int,
    Returns:

    * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size)
-    *     for backend 'XFORMERS' 
+    *     for backend 'XFORMERS'
    * kv_cache: 2 x num_blocks x block_size x num_heads x head_size
-    *     for backend 'FLASH_ATTN'  
+    *     for backend 'FLASH_ATTN'
    '''
    if backend == 'XFORMERS':
        kv_cache = torch.rand(
@@ -662,20 +662,20 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],
    Context:
    * Your goal is to test (1) prefill of N prompts, with prompt-lengths
      {K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token
-      for all N prompts (N tokens total); the resultant sequence lengths 
+      for all N prompts (N tokens total); the resultant sequence lengths
      after decode would be {K_i + 1 for i \\in [0,N)}
-    * The test you want to do requires (1) having the prefill slot mapping 
-      for all tokens present during prefill, the number of which is 
-      M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N 
+    * The test you want to do requires (1) having the prefill slot mapping
+      for all tokens present during prefill, the number of which is
+      M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N
      decoded tokens
-    
-    This function consumes a single 1D slot mapping, which is the 
+
+    This function consumes a single 1D slot mapping, which is the
    concatenation of N slot mappings each of length K_i + 1 (corresponding
    to the  sequence lengths after decode), with a total length of
    P = \\sum_i{K_i + 1} = M + N

    The prefill-phase slot mapping results from excising the (K_i + 1)-th entry
-    from each of the N subsequences in the slot mapping (i.e. omitting the 
+    from each of the N subsequences in the slot mapping (i.e. omitting the
    decoded token's mapping.)

    The N excised entries are appended to obtain the decode-phase slot mapping
@@ -684,15 +684,15 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],

    * slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N
      post-decode sequences
-    * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the 
+    * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the
      description above)
    * device: cuda, cpu, etc.

    Returns:

-    * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) 
+    * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor)
      reflecting all N prefill prompts
-    * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting 
+    * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting
      all N decoded tokens
    '''

@@ -725,7 +725,7 @@ def make_block_tables_slot_mapping(

    Then the minimum KV cache size in blocks is

-    total_cache_blocks = sum(num_blocks for all seqs) 
+    total_cache_blocks = sum(num_blocks for all seqs)

    Then, the blocktable mapping counts downward from

@@ -734,7 +734,7 @@ def make_block_tables_slot_mapping(
    to

    block_base_addr
-    
+

    The constructed block-tables and slot-mapping are sized to the
    lengths of the sequences in their entirety (as reflected by seq_lens),
@@ -749,7 +749,7 @@ def make_block_tables_slot_mapping(

    Return:

-    * block_tables_tensor: block table for sequence   
+    * block_tables_tensor: block table for sequence
    * slot_mapping_list: slot mapping for sequence
    * max_block_idx: the highest block address within this block table
    '''
@@ -807,7 +807,7 @@ def make_test_metadata(
    encoder_test_params and cross_test_params arguments allow encoder
    attention and enc/dec cross-attention (respectively) to use distinct
    metadata values from decoder self-attention (decoder_test_params.)
-    
+
    if encoder_test_params and cross_test_params are None, the attention
    metadata will support decoder-only scenario.

@@ -820,7 +820,7 @@ def make_test_metadata(
    * attn_backend_name: Backend for sourcing attention kernels
    * is_prompt: prefill if True, o/w decode
    * seq_lens: list of token counts for each sequence
-    * decoder_test_params: decoder self-attention test params; 
+    * decoder_test_params: decoder self-attention test params;
                           this function requires
                           kv_mmap (memory mapping) field
    * device: CPU or CUDA device

--- a/tests/kv_transfer/disagg_test.py
+++ b/tests/kv_transfer/disagg_test.py