Revert "[compile] Initialize passes at VllmBackend init" (#37733)

1fa1e53a · Simon Mo · GitHub · 3ffa5200 · 1fa1e53a · 1fa1e53a
Unverified commit 1fa1e53a, authored Mar 20, 2026 by Simon Mo and committed by GitHub on Mar 20, 2026.
Showing 3 changed files with 5 additions and 19 deletions

tests/test_config.py +2 -2

vllm/compilation/backends.py +3 -12

vllm/compilation/decorators.py +0 -5

No files found.
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -32,9 +32,9 @@ from vllm.platforms import current_platform
 def test_compile_config_repr_succeeds():
    # setup: VllmBackend mutates the config object
-    # Note: VllmBackend.__init__ already calls configure_post_pass()
    config = VllmConfig()
-    _ = VllmBackend(config)
+    backend = VllmBackend(config)
+    backend.configure_post_pass()
    # test that repr(config) succeeds
    val = repr(config)

--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -836,18 +836,8 @@ class VllmBackend:
        # in future we need PostGradPassManager.uuid() to be executed
        # only at compile time.
        self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config)
+        # `torch.compile` is JIT compiled, so we don't need to
-        # Configure post-grad passes (including AllReduceFusionPass) during
+        # do anything here
-        # backend init rather than at torch.compile time, so that expensive
-        # one-time setup (e.g. FlashInfer workspace allocation) is not
-        # attributed to compilation latency.
-        start = time.time()
-        self.configure_post_pass()
-        logger.info_once(
-            "Post-grad pass configuration time: %.2f s",
-            time.time() - start,
-            scope="local",
-        )
    def collect_standalone_compile_artifacts(
        self,
@@ -1128,6 +1118,7 @@ class VllmBackend:
        assert not self._called, "VllmBackend can only be called once"
        self.graph = graph
+        self.configure_post_pass()
        if self.compilation_config.use_inductor_graph_partition:
            # Let Inductor decide partitioning; avoid FX-level pre-splitting.

--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -380,11 +380,6 @@ def _support_torch_compile(
        compilation_counter.num_models_seen += 1
        self.compiled = False
-        # Skip if a parent class's @support_torch_compile already
-        # initialized the compile wrapper
-        if hasattr(self, "_compiled_callable"):
-            return
        # Handled by monkeypatching `TorchCompileWithNoGuardsWrapper` into base class
        TorchCompileWithNoGuardsWrapper.__init__(
            self,