[Debugging] Add annotation for easier trace analysis (#22496)

17676585 · Dayeol Lee · GitHub · efe73e9b · 17676585
Unverified Commit 17676585 authored Nov 05, 2025 by Dayeol Lee Committed by GitHub Nov 05, 2025
Show whitespace changes
Inline Side-by-side

Showing with 19 additions and 3 deletions

vllm/v1/worker/gpu_worker.py vllm/v1/worker/gpu_worker.py +19 -3

No files found.
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -509,6 +509,19 @@ class Worker(WorkerBase):
    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        return self.model_runner.get_supported_tasks()
+    def annotate_profile(self, scheduler_output):
+        # Label each iteration in the profiler trace with its counts of new
+        # and cached requests, so iterations are easy to tell apart.
+        if not self.profiler:
+            return nullcontext()
+        num_new = len(scheduler_output.scheduled_new_reqs)
+        num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids)
+        return torch.profiler.record_function(
+            f"execute_new_{num_new}_cached_{num_cached}"
+        )
    @torch.inference_mode()
    def sample_tokens(
        self, grammar_output: "GrammarOutput | None"
@@ -536,7 +549,10 @@ class Worker(WorkerBase):
                )
            )
-        output = self.model_runner.execute_model(scheduler_output, intermediate_tensors)
+        with self.annotate_profile(scheduler_output):
+            output = self.model_runner.execute_model(
+                scheduler_output, intermediate_tensors
+            )
            if isinstance(output, (ModelRunnerOutput, NoneType)):
                return output