[compile] Apply stored functorch config while finalizing loaded artifacts. (#36582)

Signed-off-by: zhxchen17 <zhxchen17@fb.com>

[compile] Apply stored functorch config while finalizing loaded artifacts. (#36582)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
bdd8981d · Zhengxu Chen · GitHub · f088a831 · bdd8981d · bdd8981d
Unverified commit bdd8981d, authored Mar 10, 2026 by Zhengxu Chen; committed by GitHub on Mar 10, 2026.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 15 additions and 25 deletions

vllm/compilation/caching.py +7 -1

vllm/compilation/piecewise_backend.py +8 -24

No files found.
--- a/vllm/compilation/caching.py
+++ b/vllm/compilation/caching.py
@@ -369,8 +369,14 @@ class VllmSerializableFunction(SerializableCallable):  # type: ignore[misc]

        from vllm.compilation.backends import VllmBackend

+        saved_aot_autograd_config = self.aot_autograd_config
+        if saved_aot_autograd_config is not None:
+            functorch_ctx = torch._functorch.config.patch(saved_aot_autograd_config)
+        else:
+            functorch_ctx = contextlib.nullcontext()
+
        vllm_backend = VllmBackend(vllm_config, self.prefix, self.is_encoder)
-        with tracing(TracingContext(self._fake_mode)):
+        with tracing(TracingContext(self._fake_mode)), functorch_ctx:
            result = vllm_backend(self.graph_module, list(self.example_inputs))
            self.optimized_call = result.optimized_call
            self.vllm_backend = vllm_backend

--- a/vllm/compilation/piecewise_backend.py
+++ b/vllm/compilation/piecewise_backend.py
@@ -258,31 +258,15 @@ class PiecewiseBackend:
            else:
                args_list = get_fake_args_from_graph(self.graph)

-            # TODO(https://github.com/vllm-project/vllm/issues/35766)
-            # Can we remove strict_autograd_cache and
-            # force_non_lazy_backward_lowering overrides?
-            # I added them explicitly because this is what they are
-            # set to before the refactor
-            # (https://github.com/vllm-project/vllm/pull/35472).
-            # They affect the aotautograd cache key computation
-            # but they shouldn't have any effect on the actual
-            # compilation.
-            config_patches = dict(
-                bundled_autograd_cache=True,
-                strict_autograd_cache=False,
+            range_entry.runnable = self.vllm_backend.compiler_manager.compile(
+                self.graph,
+                args_list,
+                self.vllm_backend.inductor_config,
+                self.compilation_config,
+                compile_range=range_entry.compile_range,
+                graph_index=self.piecewise_compile_index,
+                num_graphs=self.total_piecewise_compiles,
            )
-            if hasattr(torch._functorch.config, "force_non_lazy_backward_lowering"):
-                config_patches["force_non_lazy_backward_lowering"] = False
-            with torch._functorch.config.patch(**config_patches):
-                range_entry.runnable = self.vllm_backend.compiler_manager.compile(
-                    self.graph,
-                    args_list,
-                    self.vllm_backend.inductor_config,
-                    self.compilation_config,
-                    compile_range=range_entry.compile_range,
-                    graph_index=self.piecewise_compile_index,
-                    num_graphs=self.total_piecewise_compiles,
-                )

            range_entry.compiled = True