[compile] Initialize passes at VllmBackend init (#35216)

Signed-off-by: angelayi <yiangela7@gmail.com>

[compile] Initialize passes at VllmBackend init (#35216)
Signed-off-by: angelayi <yiangela7@gmail.com>
12fd17eb · Angela Yi · GitHub · 37aadf62 · 12fd17eb · 12fd17eb
Unverified commit 12fd17eb, authored Mar 20, 2026 by Angela Yi; committed by GitHub on Mar 20, 2026
Showing 3 changed files with 19 additions and 5 deletions

tests/test_config.py tests/test_config.py +2 -2

vllm/compilation/backends.py vllm/compilation/backends.py +12 -3

vllm/compilation/decorators.py vllm/compilation/decorators.py +5 -0

No files found.
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -32,9 +32,9 @@ from vllm.platforms import current_platform

 def test_compile_config_repr_succeeds():
    # setup: VllmBackend mutates the config object
+    # Note: VllmBackend.__init__ already calls configure_post_pass()
    config = VllmConfig()
-    backend = VllmBackend(config)
-    backend.configure_post_pass()
+    _ = VllmBackend(config)

    # test that repr(config) succeeds
    val = repr(config)

--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -836,8 +836,18 @@ class VllmBackend:
        # in future we need PostGradPassManager.uuid() to be executed
        # only at compile time.
        self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config)
-        # `torch.compile` is JIT compiled, so we don't need to
-        # do anything here
+
+        # Configure post-grad passes (including AllReduceFusionPass) during
+        # backend init rather than at torch.compile time, so that expensive
+        # one-time setup (e.g. FlashInfer workspace allocation) is not
+        # attributed to compilation latency.
+        start = time.time()
+        self.configure_post_pass()
+        logger.info_once(
+            "Post-grad pass configuration time: %.2f s",
+            time.time() - start,
+            scope="local",
+        )

    def collect_standalone_compile_artifacts(
        self,
@@ -1118,7 +1128,6 @@ class VllmBackend:
        assert not self._called, "VllmBackend can only be called once"

        self.graph = graph
-        self.configure_post_pass()

        if self.compilation_config.use_inductor_graph_partition:
            # Let Inductor decide partitioning; avoid FX-level pre-splitting.

--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -380,6 +380,11 @@ def _support_torch_compile(
        compilation_counter.num_models_seen += 1
        self.compiled = False

+        # Skip if a parent class's @support_torch_compile already
+        # initialized the compile wrapper
+        if hasattr(self, "_compiled_callable"):
+            return
+
        # Handled by monkeypatching `TorchCompileWithNoGuardsWrapper` into base class
        TorchCompileWithNoGuardsWrapper.__init__(
            self,