[torch.compile] Use custom ops when use_inductor=False (#19618)

aafbbd98 · Woosuk Kwon · GitHub · 0f087451 · aafbbd98
Unverified Commit aafbbd98 authored Jun 13, 2025 by Woosuk Kwon Committed by GitHub Jun 13, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 3 deletions

vllm/config.py vllm/config.py +15 -3

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4450,15 +4450,27 @@ class VllmConfig:
            self.compilation_config.custom_ops.append("+rms_norm")
        if envs.VLLM_USE_V1 and self.model_config is not None and \
            not self.model_config.enforce_eager:
-            # FIXME(rob): Add function to set all of these.
-            if not self.compilation_config.custom_ops:
-                self.compilation_config.custom_ops = ["none"]
+            # By default, V1 uses piecewise CUDA graphs. If full_cuda_graph
+            # is set to True, full CUDA graphs will be used.
            self.compilation_config.cudagraph_num_of_warmups = 1
            self.compilation_config.pass_config.enable_fusion = False
            self.compilation_config.pass_config.enable_noop = False
            self.compilation_config.level = CompilationLevel.PIECEWISE
            self.compilation_config.set_splitting_ops_for_v1()
+            # The behavior of custom ops with inductor depends on the config:
+            # - If use_inductor=True and custom_ops is empty:
+            #   Inductor generates Triton kernels for all registered custom ops
+            #   (default behavior)
+            # - If use_inductor=True and custom_ops is non-empty:
+            #   Custom CUDA kernels are used for specified ops while inductor
+            #   generates Triton kernels for remaining ops, including misc torch
+            #   ops in the model.
+            if (not self.compilation_config.custom_ops
+                    and self.compilation_config.use_inductor):
+                # Let inductor generate Triton kernels for the custom ops.
+                self.compilation_config.custom_ops = ["none"]
        self._set_cudagraph_sizes()
        if self.cache_config.cpu_offload_gb > 0 and \