Disable remote caching when calling compile_fx (#16611)

Signed-off-by: rzou <zou3519@gmail.com>

Disable remote caching when calling compile_fx (#16611)
Signed-off-by: rzou <zou3519@gmail.com>
966c742e · Richard Zou · GitHub · 0d7d05f4 · 966c742e
Unverified Commit 966c742e authored Apr 16, 2025 by Richard Zou Committed by GitHub Apr 15, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 0 deletions

vllm/compilation/compiler_interface.py vllm/compilation/compiler_interface.py +13 -0

No files found.
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -290,6 +290,19 @@ class InductorAdaptor(CompilerInterface):
            # Dynamo metrics context, see method for more details.
            stack.enter_context(self.metrics_context())
+            # Disable remote caching. When it is enabled, a remote cache hit
+            # means the monkey-patched functions are never actually called.
+            # vLLM today assumes and requires the monkey-patched functions to
+            # be called.
+            # TODO(zou3519): we're going to replace this all with
+            # standalone_compile sometime.
+            if is_torch_equal_or_newer("2.6"):
+                stack.enter_context(
+                    torch._inductor.config.patch(fx_graph_remote_cache=False))
+                stack.enter_context(
+                    torch._functorch.config.patch(
+                        enable_remote_autograd_cache=False))
            compiled_graph = compile_fx(
                graph,
                example_inputs,