[Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE #2969 (#25090)

Signed-off-by: Lucas Kabela <lucaskabela@meta.com>

[Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE #2969 (#25090)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
3da17c2c · Lucas Kabela · GitHub · 14c14327 · 3da17c2c · 3da17c2c
Unverified commit 3da17c2c, authored Sep 19, 2025 by Lucas Kabela; committed by GitHub on Sep 19, 2025
6 changed files
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -20,7 +20,6 @@ class TestSetting:
    tp_size: int
    attn_backend: str
    method: str
-    fullgraph: bool
 # we cannot afford testing the full Cartesian product
@@ -36,7 +35,6 @@ class TestSetting:
            tp_size=2,
            attn_backend="FLASH_ATTN",
            method="generate",
-            fullgraph=True,
        ),
        # llama model with quantization
        TestSetting(
@@ -46,7 +44,6 @@ class TestSetting:
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="generate",
-            fullgraph=True,
        ),
        # MoE model
        TestSetting(
@@ -56,7 +53,6 @@ class TestSetting:
            tp_size=2,
            attn_backend="FLASH_ATTN",
            method="generate",
-            fullgraph=True,
        ),
        # embedding model
        TestSetting(
@@ -73,7 +69,6 @@ class TestSetting:
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="encode",
-            fullgraph=True,
        ),
        TestSetting(
            model="BAAI/bge-base-en-v1.5",
@@ -82,7 +77,6 @@ class TestSetting:
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="encode",
-            fullgraph=True,
        ),
        # vision language model
        TestSetting(
@@ -92,7 +86,6 @@ class TestSetting:
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="generate_with_image",
-            fullgraph=False,
        ),
    ],
 )
@@ -109,9 +102,8 @@ def test_compile_correctness(
    tp_size = test_setting.tp_size
    attn_backend = test_setting.attn_backend
    method = test_setting.method
-    fullgraph = test_setting.fullgraph
+    if cuda_device_count_stateless() < pp_size * tp_size:
-    if cuda_device_count_stateless() != pp_size * tp_size:
+        pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
-        pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got "
                    f"{cuda_device_count_stateless()}")
    with monkeypatch.context() as m:
@@ -149,9 +141,5 @@ def test_compile_correctness(
        ]:
            all_args.append(final_args + [f"-O{level}"])
            all_envs.append({})
-            if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
-                # "DYNAMO_ONCE" will always use fullgraph
-                all_envs[-1][
-                    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore
        compare_all_settings(model, all_args * 3, all_envs, method=method)
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -79,9 +79,7 @@ def test_full_graph(
 ):
    model, model_kwargs = model_info
-    with monkeypatch.context() as m:
+    with monkeypatch.context():
-        # make sure these models can be captured in full graph mode
-        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
        print(f"MODEL={model}")
        run_model(optimization_level, model, model_kwargs)

--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -10,7 +10,6 @@ from typing import Callable, Optional
 import torch
-import vllm.envs as envs
 from vllm.config import (CompilationLevel, CUDAGraphMode,
                         get_current_vllm_config)
 from vllm.logger import init_logger
@@ -47,9 +46,8 @@ class TorchCompileWrapperWithCustomDispatcher:
                options = get_current_vllm_config(
                ).compilation_config.inductor_compile_config
-            compiled_callable = torch.compile(
+            compiled_callable = torch.compile(self.forward,
-                self.forward,
+                                              fullgraph=True,
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
                                              backend=backend,
                                              options=options)

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_FLASH_ATTN_VERSION":
    lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),
-    # Internal flag to enable Dynamo fullgraph capture
-    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
-    lambda: bool(
-        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
    # Feature flag to enable/disable Inductor standalone compile.
    # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
    # enabled by default.

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            backend = self.vllm_config.compilation_config.init_backend(
                self.vllm_config)
            compilation_counter.dynamo_as_is_count += 1
-            self.model.compile(
+            self.model.compile(fullgraph=True, backend=backend)
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend)
            return
        # for other compilation levels, cudagraph behavior is controlled by
        # CudagraphWraper and CudagraphDispatcher of vllm.

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -18,7 +18,6 @@ import torch.distributed
 import torch.nn as nn
 from tqdm.auto import tqdm
-import vllm.envs as envs
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.attention.backends.abstract import AttentionState
 from vllm.attention.backends.utils import CommonAttentionState
@@ -1099,9 +1098,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
            backend = self.vllm_config.compilation_config.init_backend(
                self.vllm_config)
            compilation_counter.dynamo_as_is_count += 1
-            self.model = torch.compile(
+            self.model = torch.compile(self.model,
-                self.model,
+                                       fullgraph=True,
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
                                       backend=backend)
    def get_model(self) -> nn.Module: