Commit 76852017 (unverified), authored by wang.yuqi, committed by GitHub
Browse files

[MISC] Rename the torch profiler filename as instance_id+rank_id for merging...


[MISC] Rename the torch profiler filename as instance_id+rank_id for merging the Profiler results of each Rank (#25867)
Signed-off-by: wang.yuqi <noooop@126.com>
parent 82e64c7a
...@@ -5,6 +5,7 @@ import copy ...@@ -5,6 +5,7 @@ import copy
import hashlib import hashlib
import json import json
import os import os
import time
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import field, replace from dataclasses import field, replace
from functools import lru_cache from functools import lru_cache
...@@ -270,6 +271,9 @@ class VllmConfig: ...@@ -270,6 +271,9 @@ class VllmConfig:
def __post_init__(self): def __post_init__(self):
"""Verify configs are valid & consistent with each other.""" """Verify configs are valid & consistent with each other."""
# To give each torch profile run a unique instance name.
self.instance_id = f"{time.time_ns()}"
self.try_verify_and_update_config() self.try_verify_and_update_config()
if self.model_config is not None: if self.model_config is not None:
......
...@@ -79,6 +79,7 @@ class Worker(WorkerBase): ...@@ -79,6 +79,7 @@ class Worker(WorkerBase):
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR: if envs.VLLM_TORCH_PROFILER_DIR:
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
logger.info( logger.info(
"Profiling enabled. Traces will be saved to: %s", "Profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir, torch_profiler_trace_dir,
...@@ -101,7 +102,7 @@ class Worker(WorkerBase): ...@@ -101,7 +102,7 @@ class Worker(WorkerBase):
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
on_trace_ready=torch.profiler.tensorboard_trace_handler( on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, use_gzip=True torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
), ),
) )
else: else:
......
...@@ -39,6 +39,7 @@ class XPUWorker(Worker): ...@@ -39,6 +39,7 @@ class XPUWorker(Worker):
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR: if envs.VLLM_TORCH_PROFILER_DIR:
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
logger.info( logger.info(
"Profiling enabled. Traces will be saved to: %s", "Profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir, torch_profiler_trace_dir,
...@@ -61,7 +62,7 @@ class XPUWorker(Worker): ...@@ -61,7 +62,7 @@ class XPUWorker(Worker):
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
on_trace_ready=torch.profiler.tensorboard_trace_handler( on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, use_gzip=True torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
), ),
) )
else: else:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment