"git@developer.sourcefind.cn:liming6/dcu-process-montor.git" did not exist on "c06b9d4e283ddafa4790d253a0f9b5812ebf86cb"
Unverified Commit c142eced authored by Zaida Zhou's avatar Zaida Zhou Committed by GitHub
Browse files

[Feature] Add PyTorch Profiler to analyze training process (#937)

* [Feature] Add PyTorch Profiler to analyze training process

* [Feature] Add PyTorch Profiler to analyze training process

* pring log and fix docstring

* add by_epoch

* modify docstring according to comment
parent 5d75c7a8
...@@ -448,6 +448,16 @@ class BaseRunner(metaclass=ABCMeta): ...@@ -448,6 +448,16 @@ class BaseRunner(metaclass=ABCMeta):
hook = timer_config hook = timer_config
self.register_hook(hook) self.register_hook(hook)
def register_profiler_hook(self, profiler_config):
if profiler_config is None:
return
if isinstance(profiler_config, dict):
profiler_config.setdefault('type', 'ProfilerHook')
hook = mmcv.build_from_cfg(profiler_config, HOOKS)
else:
hook = profiler_config
self.register_hook(hook)
def register_training_hooks(self, def register_training_hooks(self,
lr_config, lr_config,
optimizer_config=None, optimizer_config=None,
......
...@@ -11,6 +11,7 @@ from .lr_updater import LrUpdaterHook ...@@ -11,6 +11,7 @@ from .lr_updater import LrUpdaterHook
from .memory import EmptyCacheHook from .memory import EmptyCacheHook
from .momentum_updater import MomentumUpdaterHook from .momentum_updater import MomentumUpdaterHook
from .optimizer import Fp16OptimizerHook, OptimizerHook from .optimizer import Fp16OptimizerHook, OptimizerHook
from .profiler import ProfilerHook
from .sampler_seed import DistSamplerSeedHook from .sampler_seed import DistSamplerSeedHook
from .sync_buffer import SyncBuffersHook from .sync_buffer import SyncBuffersHook
...@@ -20,5 +21,5 @@ __all__ = [ ...@@ -20,5 +21,5 @@ __all__ = [
'DistSamplerSeedHook', 'EmptyCacheHook', 'LoggerHook', 'MlflowLoggerHook', 'DistSamplerSeedHook', 'EmptyCacheHook', 'LoggerHook', 'MlflowLoggerHook',
'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook', 'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook',
'WandbLoggerHook', 'MomentumUpdaterHook', 'SyncBuffersHook', 'EMAHook', 'WandbLoggerHook', 'MomentumUpdaterHook', 'SyncBuffersHook', 'EMAHook',
'EvalHook', 'DistEvalHook' 'EvalHook', 'DistEvalHook', 'ProfilerHook'
] ]
# Copyright (c) Open-MMLab. All rights reserved.
import warnings
from typing import Callable, List, Optional, Union
import torch
from ..dist_utils import master_only
from .hook import HOOKS, Hook
@HOOKS.register_module()
class ProfilerHook(Hook):
"""Profiler to analyze perfromance during training.
PyTorch Profiler is a tool that allows the collection of the performance
metrics during the training. More details on Profiler can be found at
https://pytorch.org/docs/1.8.1/profiler.html#torch.profiler.profile
Args:
by_epoch (bool): Profile performance by epoch or by iteration.
Default: True.
profile_iters (int): Number of iterations for profiling.
If ``by_epoch=True``, profile_iters indicates that they are the
first profile_iters epochs at the beginning of the
training, otherwise it indicates the first profile_iters
iterations. Default: 1.
activities (list[str]): List of activity groups (CPU, CUDA) to use in
profiling. Default: ['cpu', 'cuda'].
schedule (dict, optional): Config of generating the callable schedule.
if schedule is None, profiler will not add step markers into the
trace and table view. Default: None.
on_trace_ready (callable, dict): Either a handler or a dict of generate
handler. Default: None.
record_shapes (bool): Save information about operator's input shapes.
Default: False.
profile_memory (bool): Track tensor memory allocation/deallocation.
Default: False.
with_stack (bool): Record source information (file and line number)
for the ops. Default: False.
with_flops (bool): Use formula to estimate the FLOPS of specific
operators (matrix multiplication and 2D convolution).
Default: False.
json_trace_path (str, optional): Exports the collected trace in Chrome
JSON format. Default: None.
Example:
>>> runner = ... # instantiate a Runner
>>> # tensorboard trace
>>> trace_config = dict(type='tb_trace', dir_name='work_dir')
>>> profiler_config = dict(on_trace_ready=trace_config)
>>> runner.register_profiler_hook(profiler_config)
>>> runner.run(data_loaders=[trainloader], workflow=[('train', 1)])
"""
def __init__(self,
by_epoch: bool = True,
profile_iters: int = 1,
activities: List[str] = ['cpu', 'cuda'],
schedule: Optional[dict] = None,
on_trace_ready: Optional[Union[Callable, dict]] = None,
record_shapes: bool = False,
profile_memory: bool = False,
with_stack: bool = False,
with_flops: bool = False,
json_trace_path: Optional[str] = None) -> None:
try:
from torch import profiler # torch version >= 1.8.1
except ImportError:
raise ImportError('profiler is the new feature of torch1.8.1, '
f'but your verison is {torch.__version__}')
assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.'
self.by_epoch = by_epoch
if profile_iters < 1:
raise ValueError('profile_iters should be greater than 0, but got '
f'{profile_iters}')
self.profile_iters = profile_iters
if not isinstance(activities, list):
raise ValueError(
f'activities should be list, but got {type(activities)}')
self.activities = []
for activity in activities:
activity = activity.lower()
if activity == 'cpu':
self.activities.append(profiler.ProfilerActivity.CPU)
elif activity == 'cuda':
self.activities.append(profiler.ProfilerActivity.CUDA)
else:
raise ValueError(
f'activity should be "cpu" or "cuda", but got {activity}')
if schedule is not None:
self.schedule = profiler.schedule(**schedule)
else:
self.schedule = None
self.on_trace_ready = on_trace_ready
self.record_shapes = record_shapes
self.profile_memory = profile_memory
self.with_stack = with_stack
self.with_flops = with_flops
self.json_trace_path = json_trace_path
@master_only
def before_run(self, runner):
if self.by_epoch and runner.max_epochs < self.profile_iters:
raise ValueError('self.profile_iters should not be greater than '
f'{runner.max_epochs}')
if not self.by_epoch and runner.max_iters < self.profile_iters:
raise ValueError('self.profile_iters should not be greater than '
f'{runner.max_iters}')
if callable(self.on_trace_ready): # handler
_on_trace_ready = self.on_trace_ready
elif isinstance(self.on_trace_ready, dict): # config of handler
trace_cfg = self.on_trace_ready.copy()
trace_type = trace_cfg.pop('type') # log_trace handler
if trace_type == 'log_trace':
def _log_hanlder(prof):
print(prof.key_averages().table(**trace_cfg))
_on_trace_ready = _log_hanlder
elif trace_type == 'tb_trace': # tensorboard_trace handler
try:
import torch_tb_profiler # noqa: F401
except ImportError:
raise ImportError('please run "pip install '
'torch-tb-profiler" to install '
'torch_tb_profiler')
_on_trace_ready = torch.profiler.tensorboard_trace_handler(
**trace_cfg)
else:
raise ValueError('trace_type should be "log_trace" or '
f'"tb_trace", but got {trace_type}')
elif self.on_trace_ready is None:
_on_trace_ready = None # type: ignore
else:
raise ValueError('on_trace_ready should be handler, dict or None, '
f'but got {type(self.on_trace_ready)}')
if runner.max_epochs > 1:
warnings.warn(f'profiler will profile {runner.max_epochs} epochs '
'instead of 1 epoch. Since profiler will slow down '
'the training, it is recommended to train 1 epoch '
'with ProfilerHook and adjust your setting according'
' to the profiler summary. During normal training '
'(epoch > 1), you may disable the ProfilerHook.')
self.profiler = torch.profiler.profile(
activities=self.activities,
schedule=self.schedule,
on_trace_ready=_on_trace_ready,
record_shapes=self.record_shapes,
profile_memory=self.profile_memory,
with_stack=self.with_stack,
with_flops=self.with_flops)
self.profiler.__enter__()
runner.logger.info('profiler is profiling...')
@master_only
def after_train_epoch(self, runner):
if self.by_epoch and runner.epoch == self.profile_iters - 1:
runner.logger.info('profiler may take a few minutes...')
self.profiler.__exit__(None, None, None)
if self.json_trace_path is not None:
self.profiler.export_chrome_trace(self.json_trace_path)
@master_only
def after_train_iter(self, runner):
self.profiler.step()
if not self.by_epoch and runner.iter == self.profile_iters - 1:
runner.logger.info('profiler may take a few minutes...')
self.profiler.__exit__(None, None, None)
if self.json_trace_path is not None:
self.profiler.export_chrome_trace(self.json_trace_path)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment