[BE] Fix compile time message to be consistent (use monitoring) (#40641)

Signed-off-by: Lucas Kabela <lucaskabela@meta.com>

[BE] Fix compile time message to be consistent (use monitoring) (#40641)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
0283f303 · Lucas Kabela · GitHub · ac58e2a1 · 0283f303 · 0283f303
Unverified commit 0283f303, authored Apr 22, 2026 by Lucas Kabela; committed by GitHub on Apr 23, 2026
Showing 3 changed files with 14 additions and 15 deletions

vllm/compilation/backends.py +4 -13

vllm/compilation/decorators.py +5 -2

vllm/compilation/monitor.py +5 -0

No files found.
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -283,10 +283,6 @@ class CompilerManager:
                # after loading the last graph for this shape, record the time.
                # there can be multiple graphs due to piecewise compilation.
                elapsed = time.perf_counter() - compilation_start_time
-                if is_encoder:
-                    compilation_config.encoder_compilation_time += elapsed
-                else:
-                    compilation_config.compilation_time += elapsed
                logger.info_once(
                    "Directly load the compiled graph(s) for compile range %s "
                    "from the cache, took %.3f s",
@@ -388,10 +384,6 @@ class CompilerManager:
        # after compiling the last graph, record the end time
        if graph_index == num_graphs - 1:
            elapsed = time.perf_counter() - compilation_start_time
-            if is_encoder:
-                compilation_config.encoder_compilation_time += elapsed
-            else:
-                compilation_config.compilation_time += elapsed
            logger.info_once(
                "Compiling a graph for compile range %s takes %.2f s",
                str(compile_range),
@@ -1129,11 +1121,10 @@ class VllmBackend:
        from .monitor import torch_compile_start_time

        dynamo_time = time.perf_counter() - torch_compile_start_time
-        logger.info_once("Dynamo bytecode transform time: %.2f s", dynamo_time)
-        if self.is_encoder:
-            self.compilation_config.encoder_compilation_time += dynamo_time
-        else:
-            self.compilation_config.compilation_time += dynamo_time
+        logger.info_once(
+            "Dynamo bytecode transform time: %.2f s",
+            dynamo_time,
+        )

        # Record Dynamo time in tracing if available
        start_time = int(torch_compile_start_time * 1e9)

--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -285,7 +285,7 @@ def _try_load_aot_compiled_fn(
    Re-raises on failure when ``VLLM_FORCE_AOT_LOAD`` is set.
    """
    try:
-        with monitor_torch_compile(model.vllm_config):
+        with monitor_torch_compile(model.vllm_config, is_encoder=model._is_encoder):
            with (
                set_current_vllm_config(model.vllm_config),
                open(aot_compilation_path, "rb") as f,
@@ -617,7 +617,9 @@ def _support_torch_compile(
                # store the path for saving after warmup
                self._aot_compilation_path = aot_compilation_path
                self._aot_cache_dir = cache_dir
-                with monitor_torch_compile(self.vllm_config):
+                with monitor_torch_compile(
+                    self.vllm_config, is_encoder=self._is_encoder
+                ):
                    self.aot_compiled_fn = self.aot_compile(*args, **kwargs)
                    compilation_counter.num_aot_compiles += 1
                    # All compilation is done at this point, save the
@@ -631,6 +633,7 @@ def _support_torch_compile(
                    self.vllm_config,
                    "torch.compile and initial profiling/warmup "
                    "run together took %.2f s in total",
+                    is_encoder=self._is_encoder,
                ):
                    output = TorchCompileWithNoGuardsWrapper.__call__(
                        self,  # type: ignore[arg-type]

--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -18,6 +18,7 @@ torch_compile_start_time: float = 0.0
 def monitor_torch_compile(
    vllm_config: VllmConfig,
    message: str = "torch.compile took %.2f s in total",
+    is_encoder: bool = False,
 ) -> Generator[None, None, None]:
    """Context manager that times torch.compile and manages depyf debugging.

@@ -45,6 +46,10 @@ def monitor_torch_compile(
    else:
        total_compile_time = time.perf_counter() - torch_compile_start_time
        if compilation_config.mode == CompilationMode.VLLM_COMPILE:
+            if is_encoder:
+                compilation_config.encoder_compilation_time += total_compile_time
+            else:
+                compilation_config.compilation_time += total_compile_time
            logger.info_once(message, total_compile_time)
    finally:
        if depyf_cm is not None: