[Cleanup] Refactor profiling env vars into a CLI config (#29912)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

[Cleanup] Refactor profiling env vars into a CLI config (#29912)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
e858bfe0 · Benjamin Chislett · GitHub · d471b2af · e858bfe0 · e858bfe0
Unverified Commit e858bfe0 authored Dec 09, 2025 by Benjamin Chislett Committed by GitHub Dec 09, 2025
Show whitespace changes
Inline Side-by-side

Showing with 11 additions and 35 deletions

vllm/v1/worker/tpu_worker.py vllm/v1/worker/tpu_worker.py +2 -2

vllm/v1/worker/xpu_worker.py vllm/v1/worker/xpu_worker.py +9 -33

No files found.
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -98,10 +98,10 @@ class TPUWorker:
        # MP runtime is initialized.
        self.profiler = None
        self.profile_dir = None
-        if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1:
+        if vllm_config.profiler_config.profiler == "torch" and self.rank < 1:
            # For TPU, we can only have 1 active profiler session for 1 profiler
            # server. So we only profile on rank0.
-            self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR
+            self.profile_dir = vllm_config.profiler_config.torch_profiler_dir
            logger.info(
                "Profiling enabled. Traces will be saved to: %s", self.profile_dir
            )

--- a/vllm/v1/worker/xpu_worker.py
+++ b/vllm/v1/worker/xpu_worker.py
@@ -6,12 +6,12 @@ from typing import Any
 import torch
 import torch.distributed

-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import get_world_group
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
+from vllm.profiler.wrapper import TorchProfilerWrapper
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 from vllm.v1.worker.xpu_model_runner import XPUModelRunner

@@ -36,41 +36,17 @@ class XPUWorker(Worker):
        assert device_config.device_type == "xpu"
        assert current_platform.is_xpu()

-        # Torch profiler. Enabled and configured through env vars:
-        # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
+        # Torch profiler. Enabled and configured through profiler_config.
        self.profiler: Any | None = None
-        if envs.VLLM_TORCH_PROFILER_DIR:
-            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+        profiler_config = vllm_config.profiler_config
+        if profiler_config.profiler == "torch":
            worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
-            logger.info(
-                "Profiling enabled. Traces will be saved to: %s",
-                torch_profiler_trace_dir,
-            )
-            logger.debug(
-                "Profiler config: record_shapes=%s,"
-                "profile_memory=%s,with_stack=%s,with_flops=%s",
-                envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
-                envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
-                envs.VLLM_TORCH_PROFILER_WITH_STACK,
-                envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
-            )
-            self.profiler = torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                    torch.profiler.ProfilerActivity.XPU,
-                ],
-                record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
-                profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
-                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
-                with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
-                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    torch_profiler_trace_dir,
+            self.profiler = TorchProfilerWrapper(
+                profiler_config,
                worker_name=worker_name,
-                    use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
-                ),
+                local_rank=self.local_rank,
+                activities=["CPU", "XPU"],
            )
-        else:
-            self.profiler = None

    # we provide this function due to `torch.xpu.mem_get_info()` doesn't
    # return correct free_gpu_memory on intel client GPU. We need to