Unverified commit 9185eee8, authored by Zaida Zhou, committed by GitHub

Remove runner, parallel, engine and device (#2216)

* Remove runner, parallel, engine and device

* fix format

* remove outdated docs
parent 19a02415
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.utils import Registry, is_method_overridden
HOOKS = Registry('hook')
class Hook:
stages = ('before_run', 'before_train_epoch', 'before_train_iter',
'after_train_iter', 'after_train_epoch', 'before_val_epoch',
'before_val_iter', 'after_val_iter', 'after_val_epoch',
'after_run')
def before_run(self, runner):
pass
def after_run(self, runner):
pass
def before_epoch(self, runner):
pass
def after_epoch(self, runner):
pass
def before_iter(self, runner):
pass
def after_iter(self, runner):
pass
def before_train_epoch(self, runner):
self.before_epoch(runner)
def before_val_epoch(self, runner):
self.before_epoch(runner)
def after_train_epoch(self, runner):
self.after_epoch(runner)
def after_val_epoch(self, runner):
self.after_epoch(runner)
def before_train_iter(self, runner):
self.before_iter(runner)
def before_val_iter(self, runner):
self.before_iter(runner)
def after_train_iter(self, runner):
self.after_iter(runner)
def after_val_iter(self, runner):
self.after_iter(runner)
def every_n_epochs(self, runner, n):
return (runner.epoch + 1) % n == 0 if n > 0 else False
def every_n_inner_iters(self, runner, n):
return (runner.inner_iter + 1) % n == 0 if n > 0 else False
def every_n_iters(self, runner, n):
return (runner.iter + 1) % n == 0 if n > 0 else False
def end_of_epoch(self, runner):
return runner.inner_iter + 1 == len(runner.data_loader)
def is_last_epoch(self, runner):
return runner.epoch + 1 == runner._max_epochs
def is_last_iter(self, runner):
return runner.iter + 1 == runner._max_iters
def get_triggered_stages(self):
trigger_stages = set()
for stage in Hook.stages:
if is_method_overridden(stage, Hook, self):
trigger_stages.add(stage)
# some methods will be triggered in multi stages
# use this dict to map method to stages.
method_stages_map = {
'before_epoch': ['before_train_epoch', 'before_val_epoch'],
'after_epoch': ['after_train_epoch', 'after_val_epoch'],
'before_iter': ['before_train_iter', 'before_val_iter'],
'after_iter': ['after_train_iter', 'after_val_iter'],
}
for method, map_stages in method_stages_map.items():
if is_method_overridden(method, Hook, self):
trigger_stages.update(map_stages)
return [stage for stage in Hook.stages if stage in trigger_stages]
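# A minimal sketch of a user-defined hook registered through the registry
# above. The class name, the `interval` argument and the NaN check are
# illustrative (modelled on downstream hooks), and it assumes the runner
# exposes the latest model outputs as `runner.outputs['loss']`.
import torch


@HOOKS.register_module()
class CheckInvalidLossHook(Hook):
    """Abort training when the loss becomes NaN or infinite."""

    def __init__(self, interval=50):
        self.interval = interval

    def after_train_iter(self, runner):
        # only check every `interval` training iterations
        if self.every_n_iters(runner, self.interval):
            assert torch.isfinite(runner.outputs['loss']), \
                'loss became infinite or NaN!'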
# Copyright (c) OpenMMLab. All rights reserved.
import time
from .hook import HOOKS, Hook
@HOOKS.register_module()
class IterTimerHook(Hook):
def before_epoch(self, runner):
self.t = time.time()
def before_iter(self, runner):
runner.log_buffer.update({'data_time': time.time() - self.t})
def after_iter(self, runner):
runner.log_buffer.update({'time': time.time() - self.t})
self.t = time.time()
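# Usage sketch: IterTimerHook is normally registered together with the other
# default training hooks; registering it by hand would look like the line
# below (assuming `runner` is an already-built runner instance). The
# 'data_time' and 'time' values it writes to `runner.log_buffer` are averaged
# and printed by the logger hooks that follow.
#
#     runner.register_hook(IterTimerHook())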
# Copyright (c) OpenMMLab. All rights reserved.
from .base import LoggerHook
from .clearml import ClearMLLoggerHook
from .dvclive import DvcliveLoggerHook
from .mlflow import MlflowLoggerHook
from .neptune import NeptuneLoggerHook
from .pavi import PaviLoggerHook
from .segmind import SegmindLoggerHook
from .tensorboard import TensorboardLoggerHook
from .text import TextLoggerHook
from .wandb import WandbLoggerHook
__all__ = [
'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook',
'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook',
'NeptuneLoggerHook', 'DvcliveLoggerHook', 'SegmindLoggerHook',
'ClearMLLoggerHook'
]
# Copyright (c) OpenMMLab. All rights reserved.
import numbers
from abc import ABCMeta, abstractmethod
from typing import Dict
import numpy as np
import torch
from ..hook import Hook
class LoggerHook(Hook):
"""Base class for logger hooks.
Args:
interval (int): Logging interval (every k iterations). Default 10.
ignore_last (bool): Ignore the log of last iterations in each epoch
if less than `interval`. Default True.
reset_flag (bool): Whether to clear the output buffer after logging.
Default False.
by_epoch (bool): Whether EpochBasedRunner is used. Default True.
"""
__metaclass__ = ABCMeta
def __init__(self,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = False,
by_epoch: bool = True):
self.interval = interval
self.ignore_last = ignore_last
self.reset_flag = reset_flag
self.by_epoch = by_epoch
@abstractmethod
def log(self, runner):
pass
@staticmethod
def is_scalar(val,
include_np: bool = True,
include_torch: bool = True) -> bool:
"""Tell the input variable is a scalar or not.
Args:
val: Input variable.
include_np (bool): Whether include 0-d np.ndarray as a scalar.
include_torch (bool): Whether include 0-d torch.Tensor as a scalar.
Returns:
bool: True or False.
"""
if isinstance(val, numbers.Number):
return True
elif include_np and isinstance(val, np.ndarray) and val.ndim == 0:
return True
elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1:
return True
else:
return False
def get_mode(self, runner) -> str:
if runner.mode == 'train':
if 'time' in runner.log_buffer.output:
mode = 'train'
else:
mode = 'val'
elif runner.mode == 'val':
mode = 'val'
else:
raise ValueError(f"runner mode should be 'train' or 'val', "
f'but got {runner.mode}')
return mode
def get_epoch(self, runner) -> int:
if runner.mode == 'train':
epoch = runner.epoch + 1
elif runner.mode == 'val':
# normal val mode
# runner.epoch += 1 has been done before val workflow
epoch = runner.epoch
else:
raise ValueError(f"runner mode should be 'train' or 'val', "
f'but got {runner.mode}')
return epoch
def get_iter(self, runner, inner_iter: bool = False) -> int:
"""Get the current training iteration step."""
if self.by_epoch and inner_iter:
current_iter = runner.inner_iter + 1
else:
current_iter = runner.iter + 1
return current_iter
def get_lr_tags(self, runner) -> Dict[str, float]:
tags = {}
lrs = runner.current_lr()
if isinstance(lrs, dict):
for name, value in lrs.items():
tags[f'learning_rate/{name}'] = value[0]
else:
tags['learning_rate'] = lrs[0]
return tags
def get_momentum_tags(self, runner) -> Dict[str, float]:
tags = {}
momentums = runner.current_momentum()
if isinstance(momentums, dict):
for name, value in momentums.items():
tags[f'momentum/{name}'] = value[0]
else:
tags['momentum'] = momentums[0]
return tags
def get_loggable_tags(
self,
runner,
allow_scalar: bool = True,
allow_text: bool = False,
add_mode: bool = True,
tags_to_skip: tuple = ('time', 'data_time')
) -> Dict:
tags = {}
for var, val in runner.log_buffer.output.items():
if var in tags_to_skip:
continue
if self.is_scalar(val) and not allow_scalar:
continue
if isinstance(val, str) and not allow_text:
continue
if add_mode:
var = f'{self.get_mode(runner)}/{var}'
tags[var] = val
tags.update(self.get_lr_tags(runner))
tags.update(self.get_momentum_tags(runner))
return tags
def before_run(self, runner) -> None:
for hook in runner.hooks[::-1]:
if isinstance(hook, LoggerHook):
hook.reset_flag = True
break
def before_epoch(self, runner) -> None:
runner.log_buffer.clear() # clear logs of last epoch
def after_train_iter(self, runner) -> None:
if self.by_epoch and self.every_n_inner_iters(runner, self.interval):
runner.log_buffer.average(self.interval)
elif not self.by_epoch and self.every_n_iters(runner, self.interval):
runner.log_buffer.average(self.interval)
elif self.end_of_epoch(runner) and not self.ignore_last:
# not precise but more stable
runner.log_buffer.average(self.interval)
if runner.log_buffer.ready:
self.log(runner)
if self.reset_flag:
runner.log_buffer.clear_output()
def after_train_epoch(self, runner) -> None:
if runner.log_buffer.ready:
self.log(runner)
if self.reset_flag:
runner.log_buffer.clear_output()
def after_val_epoch(self, runner) -> None:
runner.log_buffer.average()
self.log(runner)
if self.reset_flag:
runner.log_buffer.clear_output()
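# A minimal sketch of a concrete logger built on LoggerHook. The class name
# and the stdout output are assumptions for illustration, not part of mmcv;
# the real loggers in the modules below forward the same tags to an external
# backend instead of printing them.
from ..hook import HOOKS  # same relative import used by the sibling loggers


@HOOKS.register_module()
class StdoutLoggerHook(LoggerHook):
    """Print scalar tags every ``interval`` iterations."""

    def log(self, runner):
        tags = self.get_loggable_tags(runner)
        if tags:
            print(f'[{self.get_mode(runner)}] '
                  f'iter {self.get_iter(runner)}: {tags}')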
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, Optional
from ...dist_utils import master_only
from ..hook import HOOKS
from .base import LoggerHook
@HOOKS.register_module()
class ClearMLLoggerHook(LoggerHook):
"""Class to log metrics with clearml.
It requires `clearml`_ to be installed.
Args:
init_kwargs (dict): A dict containing the `clearml.Task.init`
initialization keys. See `taskinit`_ for more details.
interval (int): Logging interval (every k iterations). Default 10.
ignore_last (bool): Ignore the log of last iterations in each epoch
if less than `interval`. Default: True.
reset_flag (bool): Whether to clear the output buffer after logging.
Default: False.
by_epoch (bool): Whether EpochBasedRunner is used. Default: True.
.. _clearml:
https://clear.ml/docs/latest/docs/
.. _taskinit:
https://clear.ml/docs/latest/docs/references/sdk/task/#taskinit
"""
def __init__(self,
init_kwargs: Optional[Dict] = None,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = False,
by_epoch: bool = True):
super().__init__(interval, ignore_last, reset_flag, by_epoch)
self.import_clearml()
self.init_kwargs = init_kwargs
def import_clearml(self):
try:
import clearml
except ImportError:
raise ImportError(
'Please run "pip install clearml" to install clearml')
self.clearml = clearml
@master_only
def before_run(self, runner) -> None:
super().before_run(runner)
task_kwargs = self.init_kwargs if self.init_kwargs else {}
self.task = self.clearml.Task.init(**task_kwargs)
self.task_logger = self.task.get_logger()
@master_only
def log(self, runner) -> None:
tags = self.get_loggable_tags(runner)
for tag, val in tags.items():
self.task_logger.report_scalar(tag, tag, val,
self.get_iter(runner))
# Copyright (c) OpenMMLab. All rights reserved.
from pathlib import Path
from typing import Optional
from ...dist_utils import master_only
from ..hook import HOOKS
from .base import LoggerHook
@HOOKS.register_module()
class DvcliveLoggerHook(LoggerHook):
"""Class to log metrics with dvclive.
It requires `dvclive`_ to be installed.
Args:
model_file (str): Default None. If not None, after each epoch the
model will be saved to {model_file}.
interval (int): Logging interval (every k iterations). Default 10.
ignore_last (bool): Ignore the log of last iterations in each epoch
if less than `interval`. Default: True.
reset_flag (bool): Whether to clear the output buffer after logging.
Default: False.
by_epoch (bool): Whether EpochBasedRunner is used. Default: True.
kwargs: Arguments for instantiating `Live`_.
.. _dvclive:
https://dvc.org/doc/dvclive
.. _Live:
https://dvc.org/doc/dvclive/api-reference/live#parameters
"""
def __init__(self,
model_file: Optional[str] = None,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = False,
by_epoch: bool = True,
**kwargs):
super().__init__(interval, ignore_last, reset_flag, by_epoch)
self.model_file = model_file
self.import_dvclive(**kwargs)
def import_dvclive(self, **kwargs) -> None:
try:
from dvclive import Live
except ImportError:
raise ImportError(
'Please run "pip install dvclive" to install dvclive')
self.dvclive = Live(**kwargs)
@master_only
def log(self, runner) -> None:
tags = self.get_loggable_tags(runner)
if tags:
self.dvclive.set_step(self.get_iter(runner))
for k, v in tags.items():
self.dvclive.log(k, v)
@master_only
def after_train_epoch(self, runner) -> None:
super().after_train_epoch(runner)
if self.model_file is not None:
runner.save_checkpoint(
Path(self.model_file).parent,
filename_tmpl=Path(self.model_file).name,
create_symlink=False,
)
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, Optional
from mmcv.utils import TORCH_VERSION
from ...dist_utils import master_only
from ..hook import HOOKS
from .base import LoggerHook
@HOOKS.register_module()
class MlflowLoggerHook(LoggerHook):
"""Class to log metrics and (optionally) a trained model to MLflow.
It requires `MLflow`_ to be installed.
Args:
exp_name (str, optional): Name of the experiment to be used.
Default None. If not None, set the active experiment.
If experiment does not exist, an experiment with provided name
will be created.
tags (Dict[str], optional): Tags for the current run.
Default None. If not None, set tags for the current run.
log_model (bool, optional): Whether to log an MLflow artifact.
Default True. If True, log runner.model as an MLflow artifact
for the current run.
interval (int): Logging interval (every k iterations). Default: 10.
ignore_last (bool): Ignore the log of last iterations in each epoch
if less than `interval`. Default: True.
reset_flag (bool): Whether to clear the output buffer after logging.
Default: False.
by_epoch (bool): Whether EpochBasedRunner is used. Default: True.
.. _MLflow:
https://www.mlflow.org/docs/latest/index.html
"""
def __init__(self,
exp_name: Optional[str] = None,
tags: Optional[Dict] = None,
log_model: bool = True,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = False,
by_epoch: bool = True):
super().__init__(interval, ignore_last, reset_flag, by_epoch)
self.import_mlflow()
self.exp_name = exp_name
self.tags = tags
self.log_model = log_model
def import_mlflow(self) -> None:
try:
import mlflow
import mlflow.pytorch as mlflow_pytorch
except ImportError:
raise ImportError(
'Please run "pip install mlflow" to install mlflow')
self.mlflow = mlflow
self.mlflow_pytorch = mlflow_pytorch
@master_only
def before_run(self, runner) -> None:
super().before_run(runner)
if self.exp_name is not None:
self.mlflow.set_experiment(self.exp_name)
if self.tags is not None:
self.mlflow.set_tags(self.tags)
@master_only
def log(self, runner) -> None:
tags = self.get_loggable_tags(runner)
if tags:
self.mlflow.log_metrics(tags, step=self.get_iter(runner))
@master_only
def after_run(self, runner) -> None:
if self.log_model:
self.mlflow_pytorch.log_model(
runner.model,
'models',
pip_requirements=[f'torch=={TORCH_VERSION}'])
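# Configuration sketch: the hook above is usually enabled through the
# standard `log_config` dict that the runner turns into logger hooks. The
# experiment name and tags below are illustrative values.
log_config = dict(
    interval=10,
    hooks=[
        dict(
            type='MlflowLoggerHook',
            exp_name='demo_experiment',
            tags=dict(backbone='resnet50'),
            log_model=True),
    ])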
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, Optional
from ...dist_utils import master_only
from ..hook import HOOKS
from .base import LoggerHook
@HOOKS.register_module()
class NeptuneLoggerHook(LoggerHook):
"""Class to log metrics to NeptuneAI.
It requires `Neptune`_ to be installed.
Args:
init_kwargs (dict): A dict containing the initialization keys as below:
- project (str): Name of a project in a form of
namespace/project_name. If None, the value of NEPTUNE_PROJECT
environment variable will be taken.
- api_token (str): User’s API token. If None, the value of
NEPTUNE_API_TOKEN environment variable will be taken. Note: It is
strongly recommended to use NEPTUNE_API_TOKEN environment
variable rather than placing your API token in plain text in your
source code.
- name (str, optional, default is 'Untitled'): Editable name of the
run. Name is displayed in the run's Details and in Runs table as
a column.
Check https://docs.neptune.ai/api-reference/neptune#init for more
init arguments.
interval (int): Logging interval (every k iterations). Default: 10.
ignore_last (bool): Ignore the log of last iterations in each epoch
if less than ``interval``. Default: True.
reset_flag (bool): Whether to clear the output buffer after logging.
Default: True.
with_step (bool): If True, the step will be logged from
``self.get_iter``. Otherwise, step will not be logged.
Default: True.
by_epoch (bool): Whether EpochBasedRunner is used. Default: True.
.. _Neptune:
https://docs.neptune.ai
"""
def __init__(self,
init_kwargs: Optional[Dict] = None,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = True,
with_step: bool = True,
by_epoch: bool = True):
super().__init__(interval, ignore_last, reset_flag, by_epoch)
self.import_neptune()
self.init_kwargs = init_kwargs
self.with_step = with_step
def import_neptune(self) -> None:
try:
import neptune.new as neptune
except ImportError:
raise ImportError(
'Please run "pip install neptune-client" to install neptune')
self.neptune = neptune
self.run = None
@master_only
def before_run(self, runner) -> None:
if self.init_kwargs:
self.run = self.neptune.init(**self.init_kwargs)
else:
self.run = self.neptune.init()
@master_only
def log(self, runner) -> None:
tags = self.get_loggable_tags(runner)
if tags:
for tag_name, tag_value in tags.items():
if self.with_step:
self.run[tag_name].log( # type: ignore
tag_value, step=self.get_iter(runner))
else:
tags['global_step'] = self.get_iter(runner)
self.run[tag_name].log(tags) # type: ignore
@master_only
def after_run(self, runner) -> None:
self.run.stop() # type: ignore
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import os.path as osp
from typing import Dict, Optional
import mmengine
import torch
import yaml
import mmcv
from ....parallel.utils import is_module_wrapper
from ...dist_utils import master_only
from ..hook import HOOKS
from .base import LoggerHook
@HOOKS.register_module()
class PaviLoggerHook(LoggerHook):
"""Class to visual model, log metrics (for internal use).
Args:
init_kwargs (dict): A dict containing the initialization keys as below:
- name (str, optional): Custom training name. Defaults to None,
which means current work_dir.
- project (str, optional): Project name. Defaults to "default".
- model (str, optional): Training model name. Defaults to current
model.
- session_text (str, optional): Session string in YAML format.
Defaults to current config.
- training_id (int, optional): Training ID in PAVI, if you want to
use an existing training. Defaults to None.
- compare_id (int, optional): Compare ID in PAVI, if you want to
add the task to an existing compare. Defaults to None.
- overwrite_last_training (bool, optional): Whether to upload data
to the training with the same name in the same project, rather
than creating a new one. Defaults to False.
add_graph (bool): Whether to visualize the model graph. Default: False.
add_last_ckpt (bool): Whether to save checkpoint after run.
Default: False.
interval (int): Logging interval (every k iterations). Default: 10.
ignore_last (bool): Ignore the log of last iterations in each epoch
if less than `interval`. Default: True.
reset_flag (bool): Whether to clear the output buffer after logging.
Default: False.
by_epoch (bool): Whether EpochBasedRunner is used. Default: True.
img_key (string): Get image data from Dataset. Default: 'img_info'.
"""
def __init__(self,
init_kwargs: Optional[Dict] = None,
add_graph: bool = False,
add_last_ckpt: bool = False,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = False,
by_epoch: bool = True,
img_key: str = 'img_info'):
super().__init__(interval, ignore_last, reset_flag, by_epoch)
self.init_kwargs = init_kwargs
self.add_graph = add_graph
self.add_last_ckpt = add_last_ckpt
self.img_key = img_key
@master_only
def before_run(self, runner) -> None:
super().before_run(runner)
try:
from pavi import SummaryWriter
except ImportError:
raise ImportError(
'No module named pavi, please contact pavi team or visit '
'the documentation for pavi installation instructions.')
self.run_name = runner.work_dir.split('/')[-1]
if not self.init_kwargs:
self.init_kwargs = dict()
self.init_kwargs.setdefault('name', self.run_name)
self.init_kwargs.setdefault('model', runner._model_name)
if runner.meta is not None:
if 'config_dict' in runner.meta:
config_dict = runner.meta['config_dict']
assert isinstance(
config_dict,
dict), ('meta["config_dict"] has to be a dict, '
f'but got {type(config_dict)}')
elif 'config_file' in runner.meta:
config_file = runner.meta['config_file']
config_dict = dict(mmcv.Config.fromfile(config_file))
else:
config_dict = None
if config_dict is not None:
# 'max_.*iter' is parsed in pavi sdk as the maximum iterations
# to properly set up the progress bar.
config_dict = config_dict.copy()
config_dict.setdefault('max_iter', runner.max_iters)
# non-serializable values are first converted in
# mmengine.dump to json
config_dict = json.loads(
mmengine.dump(config_dict, file_format='json'))
session_text = yaml.dump(config_dict)
self.init_kwargs.setdefault('session_text', session_text)
self.writer = SummaryWriter(**self.init_kwargs)
def get_step(self, runner) -> int:
"""Get the total training step/epoch."""
if self.get_mode(runner) == 'val' and self.by_epoch:
return self.get_epoch(runner)
else:
return self.get_iter(runner)
@master_only
def log(self, runner) -> None:
tags = self.get_loggable_tags(runner, add_mode=False)
if tags:
self.writer.add_scalars(
self.get_mode(runner), tags, self.get_step(runner))
@master_only
def after_run(self, runner) -> None:
if self.add_last_ckpt:
ckpt_path = osp.join(runner.work_dir, 'latest.pth')
if osp.islink(ckpt_path):
ckpt_path = osp.join(runner.work_dir, os.readlink(ckpt_path))
if osp.isfile(ckpt_path):
# runner.epoch += 1 has been done before `after_run`.
iteration = runner.epoch if self.by_epoch else runner.iter
return self.writer.add_snapshot_file(
tag=self.run_name,
snapshot_file_path=ckpt_path,
iteration=iteration)
# flush the buffer and send a task ending signal to Pavi
self.writer.close()
@master_only
def before_epoch(self, runner) -> None:
if runner.epoch == 0 and self.add_graph:
if is_module_wrapper(runner.model):
_model = runner.model.module
else:
_model = runner.model
device = next(_model.parameters()).device
data = next(iter(runner.data_loader))
image = data[self.img_key][0:1].to(device)
with torch.no_grad():
self.writer.add_graph(_model, image)
# Copyright (c) OpenMMLab. All rights reserved.
from ...dist_utils import master_only
from ..hook import HOOKS
from .base import LoggerHook
@HOOKS.register_module()
class SegmindLoggerHook(LoggerHook):
"""Class to log metrics to Segmind.
It requires `Segmind`_ to be installed.
Args:
interval (int): Logging interval (every k iterations). Default: 10.
ignore_last (bool): Ignore the log of last iterations in each epoch
if less than `interval`. Default True.
reset_flag (bool): Whether to clear the output buffer after logging.
Default False.
by_epoch (bool): Whether EpochBasedRunner is used. Default True.
.. _Segmind:
https://docs.segmind.com/python-library
"""
def __init__(self,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = False,
by_epoch=True):
super().__init__(interval, ignore_last, reset_flag, by_epoch)
self.import_segmind()
def import_segmind(self) -> None:
try:
import segmind
except ImportError:
raise ImportError(
"Please run 'pip install segmind' to install segmind")
self.log_metrics = segmind.tracking.fluent.log_metrics
self.mlflow_log = segmind.utils.logging_utils.try_mlflow_log
@master_only
def log(self, runner) -> None:
tags = self.get_loggable_tags(runner)
if tags:
# logging metrics to segmind
self.mlflow_log(
self.log_metrics, tags, step=runner.epoch, epoch=runner.epoch)
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Optional
from mmcv.utils import TORCH_VERSION, digit_version
from ...dist_utils import master_only
from ..hook import HOOKS
from .base import LoggerHook
@HOOKS.register_module()
class TensorboardLoggerHook(LoggerHook):
"""Class to log metrics to Tensorboard.
Args:
log_dir (string): Save directory location. Default: None. If default
values are used, directory location is ``runner.work_dir``/tf_logs.
interval (int): Logging interval (every k iterations). Default: 10.
ignore_last (bool): Ignore the log of last iterations in each epoch
if less than `interval`. Default: True.
reset_flag (bool): Whether to clear the output buffer after logging.
Default: False.
by_epoch (bool): Whether EpochBasedRunner is used. Default: True.
"""
def __init__(self,
log_dir: Optional[str] = None,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = False,
by_epoch: bool = True):
super().__init__(interval, ignore_last, reset_flag, by_epoch)
self.log_dir = log_dir
@master_only
def before_run(self, runner) -> None:
super().before_run(runner)
if (TORCH_VERSION == 'parrots'
or digit_version(TORCH_VERSION) < digit_version('1.1')):
try:
from tensorboardX import SummaryWriter
except ImportError:
raise ImportError('Please install tensorboardX to use '
'TensorboardLoggerHook.')
else:
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
raise ImportError(
'Please run "pip install future tensorboard" to install '
'the dependencies to use torch.utils.tensorboard '
'(applicable to PyTorch 1.1 or higher)')
if self.log_dir is None:
self.log_dir = osp.join(runner.work_dir, 'tf_logs')
self.writer = SummaryWriter(self.log_dir)
@master_only
def log(self, runner) -> None:
tags = self.get_loggable_tags(runner, allow_text=True)
for tag, val in tags.items():
if isinstance(val, str):
self.writer.add_text(tag, val, self.get_iter(runner))
else:
self.writer.add_scalar(tag, val, self.get_iter(runner))
@master_only
def after_run(self, runner) -> None:
self.writer.close()
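# Configuration sketch: pair the plain-text logger with Tensorboard. With
# `log_dir` left unset, the event files are written to
# ``runner.work_dir``/tf_logs as described in the docstring above.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ])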
# Copyright (c) OpenMMLab. All rights reserved.
import datetime
import os
import os.path as osp
from collections import OrderedDict
from typing import Dict, Optional, Union
import mmengine
import torch
import torch.distributed as dist
from mmengine.fileio.file_client import FileClient
from mmcv.utils import is_tuple_of, scandir
from ..hook import HOOKS
from .base import LoggerHook
@HOOKS.register_module()
class TextLoggerHook(LoggerHook):
"""Logger hook in text.
In this logger hook, the information will be printed on terminal and
saved in json file.
Args:
by_epoch (bool, optional): Whether EpochBasedRunner is used.
Default: True.
interval (int, optional): Logging interval (every k iterations).
Default: 10.
ignore_last (bool, optional): Ignore the log of last iterations in each
epoch if less than :attr:`interval`. Default: True.
reset_flag (bool, optional): Whether to clear the output buffer after
logging. Default: False.
interval_exp_name (int, optional): Logging interval for experiment
name. This feature is to help users conveniently get the experiment
information from screen or log file. Default: 1000.
out_dir (str, optional): Logs are saved in ``runner.work_dir`` default.
If ``out_dir`` is specified, logs will be copied to a new directory
which is the concatenation of ``out_dir`` and the last level
directory of ``runner.work_dir``. Default: None.
`New in version 1.3.16.`
out_suffix (str or tuple[str], optional): Those filenames ending with
``out_suffix`` will be copied to ``out_dir``.
Default: ('.log.json', '.log', '.py').
`New in version 1.3.16.`
keep_local (bool, optional): Whether to keep local log when
:attr:`out_dir` is specified. If False, the local log will be
removed. Default: True.
`New in version 1.3.16.`
file_client_args (dict, optional): Arguments to instantiate a
FileClient. See :class:`mmengine.fileio.FileClient` for details.
Default: None.
`New in version 1.3.16.`
"""
def __init__(self,
by_epoch: bool = True,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = False,
interval_exp_name: int = 1000,
out_dir: Optional[str] = None,
out_suffix: Union[str, tuple] = ('.log.json', '.log', '.py'),
keep_local: bool = True,
file_client_args: Optional[Dict] = None):
super().__init__(interval, ignore_last, reset_flag, by_epoch)
self.by_epoch = by_epoch
self.time_sec_tot = 0
self.interval_exp_name = interval_exp_name
if out_dir is None and file_client_args is not None:
raise ValueError(
'file_client_args should be "None" when `out_dir` is not '
'specified.')
self.out_dir = out_dir
if not (out_dir is None or isinstance(out_dir, str)
or is_tuple_of(out_dir, str)):
raise TypeError('out_dir should be "None" or string or tuple of '
f'string, but got {out_dir}')
self.out_suffix = out_suffix
self.keep_local = keep_local
self.file_client_args = file_client_args
if self.out_dir is not None:
self.file_client = FileClient.infer_client(file_client_args,
self.out_dir)
def before_run(self, runner) -> None:
super().before_run(runner)
if self.out_dir is not None:
self.file_client = FileClient.infer_client(self.file_client_args,
self.out_dir)
# The final `self.out_dir` is the concatenation of `self.out_dir`
# and the last level directory of `runner.work_dir`
basename = osp.basename(runner.work_dir.rstrip(osp.sep))
self.out_dir = self.file_client.join_path(self.out_dir, basename)
runner.logger.info(
f'Text logs will be saved to {self.out_dir} by '
f'{self.file_client.name} after the training process.')
self.start_iter = runner.iter
self.json_log_path = osp.join(runner.work_dir,
f'{runner.timestamp}.log.json')
if runner.meta is not None:
self._dump_log(runner.meta, runner)
def _get_max_memory(self, runner) -> int:
device = getattr(runner.model, 'output_device', None)
mem = torch.cuda.max_memory_allocated(device=device)
mem_mb = torch.tensor([int(mem) // (1024 * 1024)],
dtype=torch.int,
device=device)
if runner.world_size > 1:
dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX)
return mem_mb.item()
def _log_info(self, log_dict: Dict, runner) -> None:
# print exp name for users to distinguish experiments
# at every ``interval_exp_name`` iterations and the end of each epoch
if runner.meta is not None and 'exp_name' in runner.meta:
if (self.every_n_iters(runner, self.interval_exp_name)) or (
self.by_epoch and self.end_of_epoch(runner)):
exp_info = f'Exp name: {runner.meta["exp_name"]}'
runner.logger.info(exp_info)
if log_dict['mode'] == 'train':
if isinstance(log_dict['lr'], dict):
lr_str = []
for k, val in log_dict['lr'].items():
lr_str.append(f'lr_{k}: {val:.3e}')
lr_str = ' '.join(lr_str) # type: ignore
else:
lr_str = f'lr: {log_dict["lr"]:.3e}' # type: ignore
# by epoch: Epoch [4][100/1000]
# by iter: Iter [100/100000]
if self.by_epoch:
log_str = f'Epoch [{log_dict["epoch"]}]' \
f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t'
else:
log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t'
log_str += f'{lr_str}, '
if 'time' in log_dict.keys():
self.time_sec_tot += (log_dict['time'] * self.interval)
time_sec_avg = self.time_sec_tot / (
runner.iter - self.start_iter + 1)
eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1)
eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
log_str += f'eta: {eta_str}, '
log_str += f'time: {log_dict["time"]:.3f}, ' \
f'data_time: {log_dict["data_time"]:.3f}, '
# statistic memory
if torch.cuda.is_available():
log_str += f'memory: {log_dict["memory"]}, '
else:
# val/test time
# here 1000 is the length of the val dataloader
# by epoch: Epoch[val] [4][1000]
# by iter: Iter[val] [1000]
if self.by_epoch:
log_str = f'Epoch({log_dict["mode"]}) ' \
f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t'
else:
log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t'
log_items = []
for name, val in log_dict.items():
# TODO: resolve this hack
# these items have been in log_str
if name in [
'mode', 'Epoch', 'iter', 'lr', 'time', 'data_time',
'memory', 'epoch'
]:
continue
if isinstance(val, float):
val = f'{val:.4f}'
log_items.append(f'{name}: {val}')
log_str += ', '.join(log_items)
runner.logger.info(log_str)
def _dump_log(self, log_dict: Dict, runner) -> None:
# dump log in json format
json_log = OrderedDict()
for k, v in log_dict.items():
json_log[k] = self._round_float(v)
# only append log at last line
if runner.rank == 0:
with open(self.json_log_path, 'a+') as f:
mmengine.dump(json_log, f, file_format='json')
f.write('\n')
def _round_float(self, items):
if isinstance(items, list):
return [self._round_float(item) for item in items]
elif isinstance(items, float):
return round(items, 5)
else:
return items
def log(self, runner) -> OrderedDict:
if 'eval_iter_num' in runner.log_buffer.output:
# this doesn't modify runner.iter and is regardless of by_epoch
cur_iter = runner.log_buffer.output.pop('eval_iter_num')
else:
cur_iter = self.get_iter(runner, inner_iter=True)
log_dict = OrderedDict(
mode=self.get_mode(runner),
epoch=self.get_epoch(runner),
iter=cur_iter)
# only record lr of the first param group
cur_lr = runner.current_lr()
if isinstance(cur_lr, list):
log_dict['lr'] = cur_lr[0]
else:
assert isinstance(cur_lr, dict)
log_dict['lr'] = {}
for k, lr_ in cur_lr.items():
assert isinstance(lr_, list)
log_dict['lr'].update({k: lr_[0]})
if 'time' in runner.log_buffer.output:
# statistic memory
if torch.cuda.is_available():
log_dict['memory'] = self._get_max_memory(runner)
log_dict = dict(log_dict, **runner.log_buffer.output) # type: ignore
self._log_info(log_dict, runner)
self._dump_log(log_dict, runner)
return log_dict
def after_run(self, runner) -> None:
# copy or upload logs to self.out_dir
if self.out_dir is not None:
for filename in scandir(runner.work_dir, self.out_suffix, True):
local_filepath = osp.join(runner.work_dir, filename)
out_filepath = self.file_client.join_path(
self.out_dir, filename)
with open(local_filepath) as f:
self.file_client.put_text(f.read(), out_filepath)
runner.logger.info(
f'The file {local_filepath} has been uploaded to '
f'{out_filepath}.')
if not self.keep_local:
os.remove(local_filepath)
runner.logger.info(
f'{local_filepath} was removed due to the '
'`self.keep_local=False`')
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Dict, Optional, Union
from mmcv.utils import scandir
from ...dist_utils import master_only
from ..hook import HOOKS
from .base import LoggerHook
@HOOKS.register_module()
class WandbLoggerHook(LoggerHook):
"""Class to log metrics with wandb.
It requires `wandb`_ to be installed.
Args:
init_kwargs (dict): A dict containing the initialization keys. Check
https://docs.wandb.ai/ref/python/init for more init arguments.
interval (int): Logging interval (every k iterations).
Default 10.
ignore_last (bool): Ignore the log of last iterations in each epoch
if less than `interval`.
Default: True.
reset_flag (bool): Whether to clear the output buffer after logging.
Default: False.
commit (bool): Save the metrics dict to the wandb server and increment
the step. If false ``wandb.log`` just updates the current metrics
dict with the row argument and metrics won't be saved until
``wandb.log`` is called with ``commit=True``.
Default: True.
by_epoch (bool): Whether EpochBasedRunner is used.
Default: True.
with_step (bool): If True, the step will be logged from
``self.get_iter``. Otherwise, step will not be logged.
Default: True.
log_artifact (bool): If True, artifacts in {work_dir} will be uploaded
to wandb after training ends.
Default: True
`New in version 1.4.3.`
out_suffix (str or tuple[str], optional): Those filenames ending with
``out_suffix`` will be uploaded to wandb.
Default: ('.log.json', '.log', '.py').
`New in version 1.4.3.`
.. _wandb:
https://docs.wandb.ai
"""
def __init__(self,
init_kwargs: Optional[Dict] = None,
interval: int = 10,
ignore_last: bool = True,
reset_flag: bool = False,
commit: bool = True,
by_epoch: bool = True,
with_step: bool = True,
log_artifact: bool = True,
out_suffix: Union[str, tuple] = ('.log.json', '.log', '.py')):
super().__init__(interval, ignore_last, reset_flag, by_epoch)
self.import_wandb()
self.init_kwargs = init_kwargs
self.commit = commit
self.with_step = with_step
self.log_artifact = log_artifact
self.out_suffix = out_suffix
def import_wandb(self) -> None:
try:
import wandb
except ImportError:
raise ImportError(
'Please run "pip install wandb" to install wandb')
self.wandb = wandb
@master_only
def before_run(self, runner) -> None:
super().before_run(runner)
if self.wandb is None:
self.import_wandb()
if self.init_kwargs:
self.wandb.init(**self.init_kwargs) # type: ignore
else:
self.wandb.init() # type: ignore
@master_only
def log(self, runner) -> None:
tags = self.get_loggable_tags(runner)
if tags:
if self.with_step:
self.wandb.log(
tags, step=self.get_iter(runner), commit=self.commit)
else:
tags['global_step'] = self.get_iter(runner)
self.wandb.log(tags, commit=self.commit)
@master_only
def after_run(self, runner) -> None:
if self.log_artifact:
wandb_artifact = self.wandb.Artifact(
name='artifacts', type='model')
for filename in scandir(runner.work_dir, self.out_suffix, True):
local_filepath = osp.join(runner.work_dir, filename)
wandb_artifact.add_file(local_filepath)
self.wandb.log_artifact(wandb_artifact)
self.wandb.join()
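# Configuration sketch: `init_kwargs` is forwarded verbatim to `wandb.init`;
# the project and run names below are illustrative.
log_config = dict(
    interval=10,
    hooks=[
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(project='demo-project', name='baseline-run'),
            commit=True,
            with_step=True,
            log_artifact=True),
    ])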
# Copyright (c) OpenMMLab. All rights reserved.
import numbers
from math import cos, pi
from typing import Callable, List, Optional, Union
import mmcv
from mmcv import runner
from .hook import HOOKS, Hook
class LrUpdaterHook(Hook):
"""LR Scheduler in MMCV.
Args:
by_epoch (bool): LR changes epoch by epoch.
warmup (string): Type of warmup used. It can be None (use no warmup),
'constant', 'linear' or 'exp'.
warmup_iters (int): The number of iterations or epochs that warmup
lasts.
warmup_ratio (float): LR used at the beginning of warmup equals to
warmup_ratio * initial_lr.
warmup_by_epoch (bool): When warmup_by_epoch == True, warmup_iters
means the number of epochs that warmup lasts, otherwise means the
number of iterations that warmup lasts.
"""
def __init__(self,
by_epoch: bool = True,
warmup: Optional[str] = None,
warmup_iters: int = 0,
warmup_ratio: float = 0.1,
warmup_by_epoch: bool = False) -> None:
# validate the "warmup" argument
if warmup is not None:
if warmup not in ['constant', 'linear', 'exp']:
raise ValueError(
f'"{warmup}" is not a supported type for warming up, valid'
' types are "constant", "linear" and "exp"')
if warmup is not None:
assert warmup_iters > 0, \
'"warmup_iters" must be a positive integer'
assert 0 < warmup_ratio <= 1.0, \
'"warmup_ratio" must be in range (0,1]'
self.by_epoch = by_epoch
self.warmup = warmup
self.warmup_iters: Optional[int] = warmup_iters
self.warmup_ratio = warmup_ratio
self.warmup_by_epoch = warmup_by_epoch
if self.warmup_by_epoch:
self.warmup_epochs: Optional[int] = self.warmup_iters
self.warmup_iters = None
else:
self.warmup_epochs = None
self.base_lr: Union[list, dict] = [] # initial lr for all param groups
self.regular_lr: list = [] # expected lr if no warming up is performed
def _set_lr(self, runner, lr_groups):
if isinstance(runner.optimizer, dict):
for k, optim in runner.optimizer.items():
for param_group, lr in zip(optim.param_groups, lr_groups[k]):
param_group['lr'] = lr
else:
for param_group, lr in zip(runner.optimizer.param_groups,
lr_groups):
param_group['lr'] = lr
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
raise NotImplementedError
def get_regular_lr(self, runner: 'runner.BaseRunner'):
if isinstance(runner.optimizer, dict):
lr_groups = {}
for k in runner.optimizer.keys():
_lr_group = [
self.get_lr(runner, _base_lr)
for _base_lr in self.base_lr[k]
]
lr_groups.update({k: _lr_group})
return lr_groups
else:
return [self.get_lr(runner, _base_lr) for _base_lr in self.base_lr]
def get_warmup_lr(self, cur_iters: int):
def _get_warmup_lr(cur_iters, regular_lr):
if self.warmup == 'constant':
warmup_lr = [_lr * self.warmup_ratio for _lr in regular_lr]
elif self.warmup == 'linear':
k = (1 - cur_iters / self.warmup_iters) * (1 -
self.warmup_ratio)
warmup_lr = [_lr * (1 - k) for _lr in regular_lr]
elif self.warmup == 'exp':
k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters)
warmup_lr = [_lr * k for _lr in regular_lr]
return warmup_lr
if isinstance(self.regular_lr, dict):
lr_groups = {}
for key, regular_lr in self.regular_lr.items():
lr_groups[key] = _get_warmup_lr(cur_iters, regular_lr)
return lr_groups
else:
return _get_warmup_lr(cur_iters, self.regular_lr)
def before_run(self, runner: 'runner.BaseRunner'):
# NOTE: when resuming from a checkpoint, if 'initial_lr' is not saved,
# it will be set according to the optimizer params
if isinstance(runner.optimizer, dict):
self.base_lr = {}
for k, optim in runner.optimizer.items():
for group in optim.param_groups:
group.setdefault('initial_lr', group['lr'])
_base_lr = [
group['initial_lr'] for group in optim.param_groups
]
self.base_lr.update({k: _base_lr})
else:
for group in runner.optimizer.param_groups: # type: ignore
group.setdefault('initial_lr', group['lr'])
self.base_lr = [
group['initial_lr']
for group in runner.optimizer.param_groups # type: ignore
]
def before_train_epoch(self, runner: 'runner.BaseRunner'):
if self.warmup_iters is None:
epoch_len = len(runner.data_loader) # type: ignore
self.warmup_iters = self.warmup_epochs * epoch_len # type: ignore
if not self.by_epoch:
return
self.regular_lr = self.get_regular_lr(runner)
self._set_lr(runner, self.regular_lr)
def before_train_iter(self, runner: 'runner.BaseRunner'):
cur_iter = runner.iter
assert isinstance(self.warmup_iters, int)
if not self.by_epoch:
self.regular_lr = self.get_regular_lr(runner)
if self.warmup is None or cur_iter >= self.warmup_iters:
self._set_lr(runner, self.regular_lr)
else:
warmup_lr = self.get_warmup_lr(cur_iter)
self._set_lr(runner, warmup_lr)
elif self.by_epoch:
if self.warmup is None or cur_iter > self.warmup_iters:
return
elif cur_iter == self.warmup_iters:
self._set_lr(runner, self.regular_lr)
else:
warmup_lr = self.get_warmup_lr(cur_iter)
self._set_lr(runner, warmup_lr)
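# Configuration sketch: an `lr_config` dict such as the one below is expanded
# by the runner into one of the updater hooks defined in this file (the
# 'step' policy maps to StepLrUpdaterHook); the warmup values are
# illustrative.
lr_config = dict(
    policy='step',       # -> StepLrUpdaterHook (defined below)
    warmup='linear',     # linear warmup during the first 500 iterations
    warmup_iters=500,
    warmup_ratio=0.001,  # start warmup at 0.001 * initial_lr
    step=[8, 11])        # decay the LR at epoch 8 and epoch 11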
@HOOKS.register_module()
class FixedLrUpdaterHook(LrUpdaterHook):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def get_lr(self, runner, base_lr):
return base_lr
@HOOKS.register_module()
class StepLrUpdaterHook(LrUpdaterHook):
"""Step LR scheduler with min_lr clipping.
Args:
step (int | list[int]): Step to decay the LR. If an int value is given,
regard it as the decay interval. If a list is given, decay LR at
these steps.
gamma (float): Decay LR ratio. Defaults to 0.1.
min_lr (float, optional): Minimum LR value to keep. If LR after decay
is lower than `min_lr`, it will be clipped to this value. If None
is given, we don't perform lr clipping. Default: None.
"""
def __init__(self,
step: Union[int, List[int]],
gamma: float = 0.1,
min_lr: Optional[float] = None,
**kwargs) -> None:
if isinstance(step, list):
assert mmcv.is_list_of(step, int)
assert all([s > 0 for s in step])
elif isinstance(step, int):
assert step > 0
else:
raise TypeError('"step" must be a list or integer')
self.step = step
self.gamma = gamma
self.min_lr = min_lr
super().__init__(**kwargs)
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
progress = runner.epoch if self.by_epoch else runner.iter
# calculate exponential term
if isinstance(self.step, int):
exp = progress // self.step
else:
exp = len(self.step)
for i, s in enumerate(self.step):
if progress < s:
exp = i
break
lr = base_lr * (self.gamma**exp)
if self.min_lr is not None:
# clip to a minimum value
lr = max(lr, self.min_lr)
return lr
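# Worked example for the hook above with by_epoch=True, base_lr=0.1,
# step=[8, 11] and gamma=0.1:
#   epochs 0-7  -> lr = 0.1    (exp = 0, progress < 8)
#   epochs 8-10 -> lr = 0.01   (exp = 1, 8 <= progress < 11)
#   epochs 11+  -> lr = 0.001  (exp = 2, progress >= 11)
# With `min_lr` set, the decayed value would additionally be clipped from
# below.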
@HOOKS.register_module()
class ExpLrUpdaterHook(LrUpdaterHook):
def __init__(self, gamma: float, **kwargs) -> None:
self.gamma = gamma
super().__init__(**kwargs)
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
progress = runner.epoch if self.by_epoch else runner.iter
return base_lr * self.gamma**progress
@HOOKS.register_module()
class PolyLrUpdaterHook(LrUpdaterHook):
def __init__(self,
power: float = 1.,
min_lr: float = 0.,
**kwargs) -> None:
self.power = power
self.min_lr = min_lr
super().__init__(**kwargs)
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
if self.by_epoch:
progress = runner.epoch
max_progress = runner.max_epochs
else:
progress = runner.iter
max_progress = runner.max_iters
coeff = (1 - progress / max_progress)**self.power
return (base_lr - self.min_lr) * coeff + self.min_lr
@HOOKS.register_module()
class InvLrUpdaterHook(LrUpdaterHook):
def __init__(self, gamma: float, power: float = 1., **kwargs) -> None:
self.gamma = gamma
self.power = power
super().__init__(**kwargs)
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
progress = runner.epoch if self.by_epoch else runner.iter
return base_lr * (1 + self.gamma * progress)**(-self.power)
@HOOKS.register_module()
class CosineAnnealingLrUpdaterHook(LrUpdaterHook):
"""CosineAnnealing LR scheduler.
Args:
min_lr (float, optional): The minimum lr. Default: None.
min_lr_ratio (float, optional): The ratio of minimum lr to the base lr.
Either `min_lr` or `min_lr_ratio` should be specified.
Default: None.
"""
def __init__(self,
min_lr: Optional[float] = None,
min_lr_ratio: Optional[float] = None,
**kwargs) -> None:
assert (min_lr is None) ^ (min_lr_ratio is None)
self.min_lr = min_lr
self.min_lr_ratio = min_lr_ratio
super().__init__(**kwargs)
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
if self.by_epoch:
progress = runner.epoch
max_progress = runner.max_epochs
else:
progress = runner.iter
max_progress = runner.max_iters
if self.min_lr_ratio is not None:
target_lr = base_lr * self.min_lr_ratio
else:
target_lr = self.min_lr # type:ignore
return annealing_cos(base_lr, target_lr, progress / max_progress)
@HOOKS.register_module()
class FlatCosineAnnealingLrUpdaterHook(LrUpdaterHook):
"""Flat + Cosine lr schedule.
Modified from https://github.com/fastai/fastai/blob/master/fastai/callback/schedule.py#L128 # noqa: E501
Args:
start_percent (float): When to start annealing the learning rate
after the percentage of the total training steps.
The value should be in range [0, 1).
Default: 0.75
min_lr (float, optional): The minimum lr. Default: None.
min_lr_ratio (float, optional): The ratio of minimum lr to the base lr.
Either `min_lr` or `min_lr_ratio` should be specified.
Default: None.
"""
def __init__(self,
start_percent: float = 0.75,
min_lr: Optional[float] = None,
min_lr_ratio: Optional[float] = None,
**kwargs) -> None:
assert (min_lr is None) ^ (min_lr_ratio is None)
if start_percent < 0 or start_percent > 1 or not isinstance(
start_percent, float):
raise ValueError(
'expected float between 0 and 1 start_percent, but '
f'got {start_percent}')
self.start_percent = start_percent
self.min_lr = min_lr
self.min_lr_ratio = min_lr_ratio
super().__init__(**kwargs)
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
if self.by_epoch:
start = round(runner.max_epochs * self.start_percent)
progress = runner.epoch - start
max_progress = runner.max_epochs - start
else:
start = round(runner.max_iters * self.start_percent)
progress = runner.iter - start
max_progress = runner.max_iters - start
if self.min_lr_ratio is not None:
target_lr = base_lr * self.min_lr_ratio
else:
target_lr = self.min_lr # type:ignore
if progress < 0:
return base_lr
else:
return annealing_cos(base_lr, target_lr, progress / max_progress)
@HOOKS.register_module()
class CosineRestartLrUpdaterHook(LrUpdaterHook):
"""Cosine annealing with restarts learning rate scheme.
Args:
periods (list[int]): Periods for each cosine annealing cycle.
restart_weights (list[float]): Restart weights at each
restart iteration. Defaults to [1].
min_lr (float, optional): The minimum lr. Default: None.
min_lr_ratio (float, optional): The ratio of minimum lr to the base lr.
Either `min_lr` or `min_lr_ratio` should be specified.
Default: None.
"""
def __init__(self,
periods: List[int],
restart_weights: List[float] = [1],
min_lr: Optional[float] = None,
min_lr_ratio: Optional[float] = None,
**kwargs) -> None:
assert (min_lr is None) ^ (min_lr_ratio is None)
self.periods = periods
self.min_lr = min_lr
self.min_lr_ratio = min_lr_ratio
self.restart_weights = restart_weights
assert (len(self.periods) == len(self.restart_weights)
), 'periods and restart_weights should have the same length.'
super().__init__(**kwargs)
self.cumulative_periods = [
sum(self.periods[0:i + 1]) for i in range(0, len(self.periods))
]
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
if self.by_epoch:
progress = runner.epoch
else:
progress = runner.iter
if self.min_lr_ratio is not None:
target_lr = base_lr * self.min_lr_ratio
else:
target_lr = self.min_lr # type:ignore
idx = get_position_from_periods(progress, self.cumulative_periods)
current_weight = self.restart_weights[idx]
nearest_restart = 0 if idx == 0 else self.cumulative_periods[idx - 1]
current_periods = self.periods[idx]
alpha = min((progress - nearest_restart) / current_periods, 1)
return annealing_cos(base_lr, target_lr, alpha, current_weight)
def get_position_from_periods(iteration: int, cumulative_periods: List[int]):
"""Get the position from a period list.
It will return the index of the right-closest number in the period list.
For example, the cumulative_periods = [100, 200, 300, 400],
if iteration == 50, return 0;
if iteration == 210, return 2;
if iteration == 300, return 3.
Args:
iteration (int): Current iteration.
cumulative_periods (list[int]): Cumulative period list.
Returns:
int: The position of the right-closest number in the period list.
"""
for i, period in enumerate(cumulative_periods):
if iteration < period:
return i
raise ValueError(f'Current iteration {iteration} exceeds '
f'cumulative_periods {cumulative_periods}')
@HOOKS.register_module()
class CyclicLrUpdaterHook(LrUpdaterHook):
"""Cyclic LR Scheduler.
Implement the cyclical learning rate policy (CLR) described in
https://arxiv.org/pdf/1506.01186.pdf
Different from the original paper, we use cosine annealing rather than
triangular policy inside a cycle. This improves the performance in the
3D detection area.
Args:
by_epoch (bool, optional): Whether to update LR by epoch.
target_ratio (tuple[float], optional): Relative ratio of the highest LR
and the lowest LR to the initial LR.
cyclic_times (int, optional): Number of cycles during training
step_ratio_up (float, optional): The ratio of the increasing process of
LR in the total cycle.
anneal_strategy (str, optional): {'cos', 'linear'}
Specifies the annealing strategy: 'cos' for cosine annealing,
'linear' for linear annealing. Default: 'cos'.
gamma (float, optional): Cycle decay ratio. Default: 1.
It takes values in the range (0, 1]. The difference between the
maximum learning rate and the minimum learning rate decreases
periodically when it is less than 1. `New in version 1.4.4.`
"""
def __init__(self,
by_epoch: bool = False,
target_ratio: Union[float, tuple] = (10, 1e-4),
cyclic_times: int = 1,
step_ratio_up: float = 0.4,
anneal_strategy: str = 'cos',
gamma: float = 1,
**kwargs) -> None:
if isinstance(target_ratio, float):
target_ratio = (target_ratio, target_ratio / 1e5)
elif isinstance(target_ratio, tuple):
target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \
if len(target_ratio) == 1 else target_ratio
else:
raise ValueError('target_ratio should be either float '
f'or tuple, got {type(target_ratio)}')
assert len(target_ratio) == 2, \
'"target_ratio" must be list or tuple of two floats'
assert 0 <= step_ratio_up < 1.0, \
'"step_ratio_up" must be in range [0,1)'
assert 0 < gamma <= 1, \
'"gamma" must be in range (0, 1]'
self.target_ratio = target_ratio
self.cyclic_times = cyclic_times
self.step_ratio_up = step_ratio_up
self.gamma = gamma
self.max_iter_per_phase = None
self.lr_phases: list = [] # init lr_phases
# validate anneal_strategy
if anneal_strategy not in ['cos', 'linear']:
raise ValueError('anneal_strategy must be one of "cos" or '
f'"linear", instead got {anneal_strategy}')
elif anneal_strategy == 'cos':
self.anneal_func: Callable[[float, float, float],
float] = annealing_cos
elif anneal_strategy == 'linear':
self.anneal_func = annealing_linear
assert not by_epoch, \
'currently only support "by_epoch" = False'
super().__init__(by_epoch, **kwargs)
def before_run(self, runner: 'runner.BaseRunner'):
super().before_run(runner)
# initiate lr_phases
# total lr_phases are separated as up and down
self.max_iter_per_phase = runner.max_iters // self.cyclic_times
iter_up_phase = int(self.step_ratio_up *
self.max_iter_per_phase) # type: ignore
self.lr_phases.append([0, iter_up_phase, 1, self.target_ratio[0]])
self.lr_phases.append([
iter_up_phase, self.max_iter_per_phase, self.target_ratio[0],
self.target_ratio[1]
])
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
curr_iter = runner.iter % self.max_iter_per_phase # type: ignore
curr_cycle = runner.iter // self.max_iter_per_phase # type: ignore
# Update weight decay
scale = self.gamma**curr_cycle
for (start_iter, end_iter, start_ratio, end_ratio) in self.lr_phases:
if start_iter <= curr_iter < end_iter:
# Apply cycle scaling to gradually reduce the difference
# between max_lr and base lr. The target end_ratio can be
# expressed as:
# end_ratio = (base_lr + scale * (max_lr - base_lr)) / base_lr
# iteration: 0-iter_up_phase:
if start_iter == 0:
end_ratio = 1 - scale + end_ratio * scale
# iteration: iter_up_phase-self.max_iter_per_phase
else:
start_ratio = 1 - scale + start_ratio * scale
progress = curr_iter - start_iter
return self.anneal_func(base_lr * start_ratio,
base_lr * end_ratio,
progress / (end_iter - start_iter))
@HOOKS.register_module()
class OneCycleLrUpdaterHook(LrUpdaterHook):
"""One Cycle LR Scheduler.
The 1cycle learning rate policy changes the learning rate after every
batch. The one cycle learning rate policy is described in
https://arxiv.org/pdf/1708.07120.pdf
Args:
max_lr (float or list): Upper learning rate boundaries in the cycle
for each parameter group.
total_steps (int, optional): The total number of steps in the cycle.
Note that if a value is not provided here, it will be the max_iter
of runner. Default: None.
pct_start (float): The percentage of the cycle (in number of steps)
spent increasing the learning rate.
Default: 0.3
anneal_strategy (str): {'cos', 'linear'}
Specifies the annealing strategy: 'cos' for cosine annealing,
'linear' for linear annealing.
Default: 'cos'
div_factor (float): Determines the initial learning rate via
initial_lr = max_lr/div_factor
Default: 25
final_div_factor (float): Determines the minimum learning rate via
min_lr = initial_lr/final_div_factor
Default: 1e4
three_phase (bool): If three_phase is True, use a third phase of the
schedule to annihilate the learning rate according to
final_div_factor instead of modifying the second phase (the first
two phases will be symmetrical about the step indicated by
pct_start).
Default: False
"""
def __init__(self,
max_lr: Union[float, List],
total_steps: Optional[int] = None,
pct_start: float = 0.3,
anneal_strategy: str = 'cos',
div_factor: float = 25,
final_div_factor: float = 1e4,
three_phase: bool = False,
**kwargs) -> None:
# validate by_epoch, currently only support by_epoch = False
if 'by_epoch' not in kwargs:
kwargs['by_epoch'] = False
else:
assert not kwargs['by_epoch'], \
'currently only support "by_epoch" = False'
if not isinstance(max_lr, (numbers.Number, list, dict)):
raise ValueError('the type of max_lr must be one of number, '
f'list or dict, but got {type(max_lr)}')
self._max_lr = max_lr
if total_steps is not None:
if not isinstance(total_steps, int):
raise ValueError('the type of total_steps must be int, but '
f'got {type(total_steps)}')
self.total_steps = total_steps
# validate pct_start
if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float):
raise ValueError('expected float between 0 and 1 pct_start, but '
f'got {pct_start}')
self.pct_start = pct_start
# validate anneal_strategy
if anneal_strategy not in ['cos', 'linear']:
raise ValueError('anneal_strategy must be one of "cos" or '
f'"linear", instead got {anneal_strategy}')
elif anneal_strategy == 'cos':
self.anneal_func: Callable[[float, float, float],
float] = annealing_cos
elif anneal_strategy == 'linear':
self.anneal_func = annealing_linear
self.div_factor = div_factor
self.final_div_factor = final_div_factor
self.three_phase = three_phase
self.lr_phases: list = [] # init lr_phases
super().__init__(**kwargs)
def before_run(self, runner: 'runner.BaseRunner'):
if hasattr(self, 'total_steps'):
total_steps = self.total_steps
else:
total_steps = runner.max_iters
if total_steps < runner.max_iters:
raise ValueError(
'The total steps must be greater than or equal to max '
f'iterations {runner.max_iters} of runner, but total steps '
f'is {total_steps}.')
if isinstance(runner.optimizer, dict):
self.base_lr = {}
for k, optim in runner.optimizer.items():
_max_lr = format_param(k, optim, self._max_lr)
self.base_lr[k] = [lr / self.div_factor for lr in _max_lr]
for group, lr in zip(optim.param_groups, self.base_lr[k]):
group.setdefault('initial_lr', lr)
else:
k = type(runner.optimizer).__name__
_max_lr = format_param(k, runner.optimizer, self._max_lr)
self.base_lr = [lr / self.div_factor for lr in _max_lr]
optim_param_groups = runner.optimizer.param_groups # type: ignore
for group, lr in zip(optim_param_groups, self.base_lr):
group.setdefault('initial_lr', lr)
if self.three_phase:
self.lr_phases.append(
[float(self.pct_start * total_steps) - 1, 1, self.div_factor])
self.lr_phases.append([
float(2 * self.pct_start * total_steps) - 2, self.div_factor, 1
])
self.lr_phases.append(
[total_steps - 1, 1, 1 / self.final_div_factor])
else:
self.lr_phases.append(
[float(self.pct_start * total_steps) - 1, 1, self.div_factor])
self.lr_phases.append(
[total_steps - 1, self.div_factor, 1 / self.final_div_factor])
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
curr_iter = runner.iter
start_iter = 0
for i, (end_iter, start_lr, end_lr) in enumerate(self.lr_phases):
if curr_iter <= end_iter:
pct = (curr_iter - start_iter) / (end_iter - start_iter)
lr = self.anneal_func(base_lr * start_lr, base_lr * end_lr,
pct)
break
start_iter = end_iter
return lr
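# Configuration sketch for the one-cycle policy above (iteration-based, so
# by_epoch is forced to False); the max_lr and pct_start values are
# illustrative.
lr_config = dict(
    policy='OneCycle',   # -> OneCycleLrUpdaterHook
    max_lr=0.01,
    pct_start=0.3,
    anneal_strategy='cos',
    div_factor=25,
    final_div_factor=1e4)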
@HOOKS.register_module()
class LinearAnnealingLrUpdaterHook(LrUpdaterHook):
"""Linear annealing LR Scheduler decays the learning rate of each parameter
group linearly.
Args:
min_lr (float, optional): The minimum lr. Default: None.
min_lr_ratio (float, optional): The ratio of minimum lr to the base lr.
Either `min_lr` or `min_lr_ratio` should be specified.
Default: None.
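Example:
>>> # A minimal, illustrative sketch; the ratio below is a placeholder,
>>> # not a recommended setting.
>>> lr_hook = LinearAnnealingLrUpdaterHook(
...     min_lr_ratio=0.1, by_epoch=False)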
"""
def __init__(self,
min_lr: Optional[float] = None,
min_lr_ratio: Optional[float] = None,
**kwargs):
assert (min_lr is None) ^ (min_lr_ratio is None)
self.min_lr = min_lr
self.min_lr_ratio = min_lr_ratio
super().__init__(**kwargs)
def get_lr(self, runner: 'runner.BaseRunner', base_lr: float):
if self.by_epoch:
progress = runner.epoch
max_progress = runner.max_epochs
else:
progress = runner.iter
max_progress = runner.max_iters
if self.min_lr_ratio is not None:
target_lr = base_lr * self.min_lr_ratio
else:
target_lr = self.min_lr # type:ignore
return annealing_linear(base_lr, target_lr, progress / max_progress)
def annealing_cos(start: float,
end: float,
factor: float,
weight: float = 1.) -> float:
"""Calculate annealing cos learning rate.
Cosine anneal from `weight * start + (1 - weight) * end` to `end` as
percentage goes from 0.0 to 1.0.
Args:
start (float): The starting learning rate of the cosine annealing.
end (float): The ending learning rate of the cosine annealing.
factor (float): The coefficient of `pi` when calculating the current
percentage. Range from 0.0 to 1.0.
weight (float, optional): The combination factor of `start` and `end`
when calculating the actual starting learning rate. Defaults to 1.
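Example:
>>> # Halfway through annealing (factor=0.5) with the default weight, the
>>> # result is the midpoint between ``start`` and ``end``.
>>> round(annealing_cos(0.1, 0.0, 0.5), 8)
0.05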
"""
cos_out = cos(pi * factor) + 1
return end + 0.5 * weight * (start - end) * cos_out
def annealing_linear(start: float, end: float, factor: float) -> float:
"""Calculate annealing linear learning rate.
Linear anneal from `start` to `end` as percentage goes from 0.0 to 1.0.
Args:
start (float): The starting learning rate of the linear annealing.
end (float): The ending learning rate of the linear annealing.
factor (float): The current annealing percentage. Range from 0.0 to 1.0.
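Example:
>>> # Halfway through annealing (factor=0.5) the result is the midpoint
>>> # between ``start`` and ``end``.
>>> round(annealing_linear(0.1, 0.0, 0.5), 8)
0.05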
"""
return start + (end - start) * factor
def format_param(name, optim, param):
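"""Broadcast a hyper-parameter setting to every param group of an optimizer.
A number is repeated for each param group, a list or tuple must match the
number of param groups, and a dict is indexed by the optimizer name.
"""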
if isinstance(param, numbers.Number):
return [param] * len(optim.param_groups)
elif isinstance(param, (list, tuple)): # multi param groups
if len(param) != len(optim.param_groups):
raise ValueError(f'expected {len(optim.param_groups)} '
f'values for {name}, got {len(param)}')
return param
else: # multi optimizers
if name not in param:
raise KeyError(f'{name} is not found in {param.keys()}')
return param[name]
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from .hook import HOOKS, Hook
@HOOKS.register_module()
class EmptyCacheHook(Hook):
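"""Release unoccupied cached GPU memory via ``torch.cuda.empty_cache()``
at the configured stages.
Args:
before_epoch (bool): Whether to empty the cache before each epoch.
Default: False.
after_epoch (bool): Whether to empty the cache after each epoch.
Default: True.
after_iter (bool): Whether to empty the cache after each iteration.
Default: False.
"""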
def __init__(self,
before_epoch: bool = False,
after_epoch: bool = True,
after_iter: bool = False):
self._before_epoch = before_epoch
self._after_epoch = after_epoch
self._after_iter = after_iter
def after_iter(self, runner):
if self._after_iter:
torch.cuda.empty_cache()
def before_epoch(self, runner):
if self._before_epoch:
torch.cuda.empty_cache()
def after_epoch(self, runner):
if self._after_epoch:
torch.cuda.empty_cache()
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Callable, Dict, List, Optional, Tuple, Union
import mmcv
from .hook import HOOKS, Hook
from .lr_updater import annealing_cos, annealing_linear, format_param
class MomentumUpdaterHook(Hook):
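"""Base hook for updating the momentum of an optimizer during training.
Subclasses should implement ``get_momentum``. A warmup strategy
('constant', 'linear' or 'exp') can be applied for the first
``warmup_iters`` iterations.
"""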
def __init__(self,
by_epoch: bool = True,
warmup: Optional[str] = None,
warmup_iters: int = 0,
warmup_ratio: float = 0.9):
# validate the "warmup" argument
if warmup is not None:
if warmup not in ['constant', 'linear', 'exp']:
raise ValueError(
f'"{warmup}" is not a supported type for warming up, valid'
' types are "constant" and "linear"')
if warmup is not None:
assert warmup_iters > 0, \
'"warmup_iters" must be a positive integer'
assert 0 < warmup_ratio <= 1.0, \
'"warmup_momentum" must be in range (0,1]'
self.by_epoch = by_epoch
self.warmup = warmup
self.warmup_iters = warmup_iters
self.warmup_ratio = warmup_ratio
# initial momentum for all param groups
self.base_momentum: Union[list, dict] = []
# expected momentum if no warming up is performed
self.regular_momentum: Union[list, dict] = []
def _set_momentum(self, runner, momentum_groups):
if isinstance(runner.optimizer, dict):
for k, optim in runner.optimizer.items():
for param_group, mom in zip(optim.param_groups,
momentum_groups[k]):
if 'momentum' in param_group.keys():
param_group['momentum'] = mom
elif 'betas' in param_group.keys():
param_group['betas'] = (mom, param_group['betas'][1])
else:
for param_group, mom in zip(runner.optimizer.param_groups,
momentum_groups):
if 'momentum' in param_group.keys():
param_group['momentum'] = mom
elif 'betas' in param_group.keys():
param_group['betas'] = (mom, param_group['betas'][1])
def get_momentum(self, runner, base_momentum) -> float:
raise NotImplementedError
def get_regular_momentum(self, runner) -> Union[list, Dict[str, list]]:
if isinstance(runner.optimizer, dict):
assert isinstance(self.base_momentum, dict)
momentum_groups: Dict[str, List[float]] = {}
for k in runner.optimizer.keys():
_momentum_group: List[float] = [
self.get_momentum(runner, _base_momentum)
for _base_momentum in self.base_momentum[k]
]
momentum_groups.update({k: _momentum_group})
return momentum_groups
else:
assert isinstance(self.base_momentum, list)
return [
self.get_momentum(runner, _base_momentum)
for _base_momentum in self.base_momentum
]
def get_warmup_momentum(
self,
cur_iters: int) -> Union[List[float], Dict[str, List[float]]]:
def _get_warmup_momentum(cur_iters, regular_momentum):
if self.warmup == 'constant':
warmup_momentum = [
_momentum / self.warmup_ratio
for _momentum in regular_momentum
]
elif self.warmup == 'linear':
k = (1 - cur_iters / self.warmup_iters) * (1 -
self.warmup_ratio)
warmup_momentum = [
_momentum / (1 - k) for _momentum in regular_momentum
]
elif self.warmup == 'exp':
k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters)
warmup_momentum = [
_momentum / k for _momentum in regular_momentum
]
else:
raise ValueError(
'Expected values of `self.warmup` to be "constant", '
f'"linear", or "exp", got {self.warmup}')
return warmup_momentum
if isinstance(self.regular_momentum, dict):
momentum_groups = {}
for key, regular_momentum in self.regular_momentum.items():
momentum_groups[key] = _get_warmup_momentum(
cur_iters, regular_momentum)
return momentum_groups
else:
return _get_warmup_momentum(cur_iters, self.regular_momentum)
def before_run(self, runner):
# NOTE: when resuming from a checkpoint,
# if 'initial_momentum' is not saved,
# it will be set according to the optimizer params
if isinstance(runner.optimizer, dict):
self.base_momentum = {}
for k, optim in runner.optimizer.items():
for group in optim.param_groups:
if 'momentum' in group.keys():
group.setdefault('initial_momentum', group['momentum'])
else:
group.setdefault('initial_momentum', group['betas'][0])
_base_momentum = [
group['initial_momentum'] for group in optim.param_groups
]
self.base_momentum.update({k: _base_momentum})
else:
for group in runner.optimizer.param_groups:
if 'momentum' in group.keys():
group.setdefault('initial_momentum', group['momentum'])
else:
group.setdefault('initial_momentum', group['betas'][0])
self.base_momentum = [
group['initial_momentum']
for group in runner.optimizer.param_groups
]
def before_train_epoch(self, runner):
if not self.by_epoch:
return
self.regular_momentum = self.get_regular_momentum(runner)
self._set_momentum(runner, self.regular_momentum)
def before_train_iter(self, runner):
cur_iter = runner.iter
if not self.by_epoch:
self.regular_momentum = self.get_regular_momentum(runner)
if self.warmup is None or cur_iter >= self.warmup_iters:
self._set_momentum(runner, self.regular_momentum)
else:
warmup_momentum = self.get_warmup_momentum(cur_iter)
self._set_momentum(runner, warmup_momentum)
elif self.by_epoch:
if self.warmup is None or cur_iter > self.warmup_iters:
return
elif cur_iter == self.warmup_iters:
self._set_momentum(runner, self.regular_momentum)
else:
warmup_momentum = self.get_warmup_momentum(cur_iter)
self._set_momentum(runner, warmup_momentum)
@HOOKS.register_module()
class StepMomentumUpdaterHook(MomentumUpdaterHook):
"""Step momentum scheduler with min value clipping.
Args:
step (int | list[int]): Step to decay the momentum. If an int value is
given, regard it as the decay interval. If a list is given, decay
momentum at these steps.
gamma (float, optional): Decay momentum ratio. Default: 0.5.
min_momentum (float, optional): Minimum momentum value to keep. If
momentum after decay is lower than this value, it will be clipped
accordingly. If None is given, momentum will not be clipped.
Default: None.
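Example:
>>> # A minimal, illustrative sketch: halve the momentum at epochs 8 and
>>> # 11, never dropping below 0.5 (values are placeholders).
>>> momentum_hook = StepMomentumUpdaterHook(
...     step=[8, 11], gamma=0.5, min_momentum=0.5)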
"""
def __init__(self,
step: Union[int, List[int]],
gamma: float = 0.5,
min_momentum: Optional[float] = None,
**kwargs):
if isinstance(step, list):
assert mmcv.is_list_of(step, int)
assert all([s > 0 for s in step])
elif isinstance(step, int):
assert step > 0
else:
raise TypeError('"step" must be a list or integer')
self.step = step
self.gamma = gamma
self.min_momentum = min_momentum
super().__init__(**kwargs)
def get_momentum(self, runner, base_momentum: float) -> float:
progress = runner.epoch if self.by_epoch else runner.iter
# calculate exponential term
if isinstance(self.step, int):
exp = progress // self.step
else:
exp = len(self.step)
for i, s in enumerate(self.step):
if progress < s:
exp = i
break
momentum = base_momentum * (self.gamma**exp)
if self.min_momentum is not None:
# clip to a minimum value
momentum = max(momentum, self.min_momentum)
return momentum
@HOOKS.register_module()
class CosineAnnealingMomentumUpdaterHook(MomentumUpdaterHook):
"""Cosine annealing LR Momentum decays the Momentum of each parameter group
linearly.
Args:
min_momentum (float, optional): The minimum momentum. Default: None.
min_momentum_ratio (float, optional): The ratio of minimum momentum to
the base momentum. Either `min_momentum` or `min_momentum_ratio`
should be specified. Default: None.
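Example:
>>> # A minimal, illustrative sketch: anneal the momentum to 90% of its
>>> # base value over the schedule (the ratio is a placeholder).
>>> momentum_hook = CosineAnnealingMomentumUpdaterHook(
...     min_momentum_ratio=0.9, by_epoch=False)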
"""
def __init__(self,
min_momentum: Optional[float] = None,
min_momentum_ratio: Optional[float] = None,
**kwargs):
assert (min_momentum is None) ^ (min_momentum_ratio is None)
self.min_momentum = min_momentum
self.min_momentum_ratio = min_momentum_ratio
super().__init__(**kwargs)
def get_momentum(self, runner, base_momentum: float) -> float:
if self.by_epoch:
progress = runner.epoch
max_progress = runner.max_epochs
else:
progress = runner.iter
max_progress = runner.max_iters
if self.min_momentum_ratio is not None:
target_momentum = base_momentum * self.min_momentum_ratio
else:
assert self.min_momentum is not None
target_momentum = self.min_momentum
return annealing_cos(base_momentum, target_momentum,
progress / max_progress)
@HOOKS.register_module()
class LinearAnnealingMomentumUpdaterHook(MomentumUpdaterHook):
"""Linear annealing LR Momentum decays the Momentum of each parameter group
linearly.
Args:
min_momentum (float, optional): The minimum momentum. Default: None.
min_momentum_ratio (float, optional): The ratio of minimum momentum to
the base momentum. Either `min_momentum` or `min_momentum_ratio`
should be specified. Default: None.
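Example:
>>> # A minimal, illustrative sketch mirroring the cosine variant, using
>>> # a fixed minimum momentum instead of a ratio (the value is a
>>> # placeholder).
>>> momentum_hook = LinearAnnealingMomentumUpdaterHook(
...     min_momentum=0.85, by_epoch=False)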
"""
def __init__(self,
min_momentum: Optional[float] = None,
min_momentum_ratio: Optional[float] = None,
**kwargs):
assert (min_momentum is None) ^ (min_momentum_ratio is None)
self.min_momentum = min_momentum
self.min_momentum_ratio = min_momentum_ratio
super().__init__(**kwargs)
def get_momentum(self, runner, base_momentum: float) -> float:
if self.by_epoch:
progress = runner.epoch
max_progress = runner.max_epochs
else:
progress = runner.iter
max_progress = runner.max_iters
if self.min_momentum_ratio is not None:
target_momentum = base_momentum * self.min_momentum_ratio
else:
assert self.min_momentum is not None
target_momentum = self.min_momentum
return annealing_linear(base_momentum, target_momentum,
progress / max_progress)
@HOOKS.register_module()
class CyclicMomentumUpdaterHook(MomentumUpdaterHook):
"""Cyclic momentum Scheduler.
Implement the cyclical momentum scheduler policy described in
https://arxiv.org/pdf/1708.07120.pdf
This momentum scheduler usually used together with the CyclicLRUpdater
to improve the performance in the 3D detection area.
Args:
target_ratio (tuple[float]): Relative ratio of the lowest momentum and
the highest momentum to the initial momentum.
cyclic_times (int): Number of cycles during training
step_ratio_up (float): The ratio of the increasing process of momentum
in the total cycle.
by_epoch (bool): Whether to update momentum by epoch.
anneal_strategy (str, optional): {'cos', 'linear'}
Specifies the annealing strategy: 'cos' for cosine annealing,
'linear' for linear annealing. Default: 'cos'.
gamma (float, optional): Cycle decay ratio. Default: 1.
It takes values in the range (0, 1]. The difference between the
maximum momentum and the minimum momentum decreases
periodically when it is less than 1. `New in version 1.4.4.`
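Example:
>>> # A minimal, illustrative sketch: a single cycle whose lowest momentum
>>> # is 0.85/0.95 of the initial value (ratios are placeholders).
>>> momentum_hook = CyclicMomentumUpdaterHook(
...     target_ratio=(0.85 / 0.95, 1.), cyclic_times=1, step_ratio_up=0.4)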
"""
def __init__(self,
by_epoch: bool = False,
target_ratio: Tuple[float, float] = (0.85 / 0.95, 1.),
cyclic_times: int = 1,
step_ratio_up: float = 0.4,
anneal_strategy: str = 'cos',
gamma: float = 1.,
**kwargs):
if isinstance(target_ratio, float):
target_ratio = (target_ratio, target_ratio / 1e5)
elif isinstance(target_ratio, tuple):
target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \
if len(target_ratio) == 1 else target_ratio
else:
raise ValueError('target_ratio should be either float '
f'or tuple, got {type(target_ratio)}')
assert len(target_ratio) == 2, \
'"target_ratio" must be list or tuple of two floats'
assert 0 <= step_ratio_up < 1.0, \
'"step_ratio_up" must be in range [0,1)'
self.target_ratio = target_ratio
self.cyclic_times = cyclic_times
self.step_ratio_up = step_ratio_up
self.gamma = gamma
self.momentum_phases: List[list] = [] # init momentum_phases
self.anneal_func: Callable[[float, float, float], float]
if anneal_strategy not in ['cos', 'linear']:
raise ValueError('anneal_strategy must be one of "cos" or '
f'"linear", instead got {anneal_strategy}')
elif anneal_strategy == 'cos':
self.anneal_func = annealing_cos
elif anneal_strategy == 'linear':
self.anneal_func = annealing_linear
# currently only support by_epoch=False
assert not by_epoch, \
'currently only support "by_epoch" = False'
super().__init__(by_epoch, **kwargs)
def before_run(self, runner):
super().before_run(runner)
# initiate momentum_phases
# total momentum_phases are separated as up and down
max_iter_per_phase = runner.max_iters // self.cyclic_times
iter_up_phase = int(self.step_ratio_up * max_iter_per_phase)
self.max_iter_per_phase = max_iter_per_phase
self.momentum_phases.append(
[0, iter_up_phase, 1, self.target_ratio[0]])
self.momentum_phases.append([
iter_up_phase, max_iter_per_phase, self.target_ratio[0],
self.target_ratio[1]
])
def get_momentum(self, runner, base_momentum: float) -> float:
curr_iter = runner.iter % self.max_iter_per_phase
curr_cycle = runner.iter // self.max_iter_per_phase
scale = self.gamma**curr_cycle
for (start_iter, end_iter, start_ratio, end_ratio) \
in self.momentum_phases:
if start_iter <= curr_iter < end_iter:
# Apply cycle scaling to gradually reduce the difference
# between max_momentum and base momentum. The target end_ratio
# can be expressed as:
# end_ratio = (base_momentum + scale * \
# (max_momentum - base_momentum)) / base_momentum
# iteration: 0-iter_up_phase:
if start_iter == 0:
end_ratio = 1 - scale + end_ratio * scale
# iteration: iter_up_phase-self.max_iter_per_phase
else:
start_ratio = 1 - scale + start_ratio * scale
progress = curr_iter - start_iter
return self.anneal_func(base_momentum * start_ratio,
base_momentum * end_ratio,
progress / (end_iter - start_iter))
raise RuntimeError('The method should return within the for-loop '
'and should never reach this line')
@HOOKS.register_module()
class OneCycleMomentumUpdaterHook(MomentumUpdaterHook):
"""OneCycle momentum Scheduler.
This momentum scheduler is usually used together with the OneCycleLrUpdater
to improve performance.
Args:
base_momentum (float or list): Lower momentum boundaries in the cycle
for each parameter group. Note that momentum is cycled inversely
to learning rate; at the peak of a cycle, momentum is
'base_momentum' and learning rate is 'max_lr'.
Default: 0.85
max_momentum (float or list): Upper momentum boundaries in the cycle
for each parameter group. Functionally,
it defines the cycle amplitude (max_momentum - base_momentum).
Note that momentum is cycled inversely
to learning rate; at the start of a cycle, momentum is
'max_momentum' and learning rate is 'base_lr'
Default: 0.95
pct_start (float): The percentage of the cycle (in number of steps)
spent increasing the learning rate.
Default: 0.3
anneal_strategy (str): {'cos', 'linear'}
Specifies the annealing strategy: 'cos' for cosine annealing,
'linear' for linear annealing.
Default: 'cos'
three_phase (bool): If three_phase is True, use a third phase of the
schedule to annihilate the learning rate according to
final_div_factor instead of modifying the second phase (the first
two phases will be symmetrical about the step indicated by
pct_start).
Default: False
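Example:
>>> # A minimal, illustrative sketch: cycle the momentum between 0.85 and
>>> # 0.95, inversely to a one-cycle lr schedule (values are the defaults).
>>> momentum_hook = OneCycleMomentumUpdaterHook(
...     base_momentum=0.85, max_momentum=0.95, pct_start=0.3)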
"""
def __init__(self,
base_momentum: Union[float, list, dict] = 0.85,
max_momentum: Union[float, list, dict] = 0.95,
pct_start: float = 0.3,
anneal_strategy: str = 'cos',
three_phase: bool = False,
**kwargs):
# validate by_epoch, currently only support by_epoch=False
if 'by_epoch' not in kwargs:
kwargs['by_epoch'] = False
else:
assert not kwargs['by_epoch'], \
'currently only support "by_epoch" = False'
if not isinstance(base_momentum, (float, list, dict)):
raise ValueError('base_momentum must be of type float, '
f'list or dict, but got {type(base_momentum)}')
self._base_momentum = base_momentum
if not isinstance(max_momentum, (float, list, dict)):
raise ValueError('max_momentum must be of type float, '
f'list or dict, but got {type(max_momentum)}')
self._max_momentum = max_momentum
# validate pct_start
if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float):
raise ValueError('expected float between 0 and 1 for pct_start, but '
f'got {pct_start}')
self.pct_start = pct_start
# validate anneal_strategy
self.anneal_func: Callable[[float, float, float], float]
if anneal_strategy not in ['cos', 'linear']:
raise ValueError('anneal_strategy must be one of "cos" or '
f'"linear", instead got {anneal_strategy}')
elif anneal_strategy == 'cos':
self.anneal_func = annealing_cos
elif anneal_strategy == 'linear':
self.anneal_func = annealing_linear
self.three_phase = three_phase
self.momentum_phases: List[dict] = [] # init momentum_phases
super().__init__(**kwargs)
def before_run(self, runner):
if isinstance(runner.optimizer, dict):
for k, optim in runner.optimizer.items():
if ('momentum' not in optim.defaults
and 'betas' not in optim.defaults):
raise ValueError('optimizer must support momentum with '
'option enabled')
self.use_beta1 = 'betas' in optim.defaults
_base_momentum = format_param(k, optim, self._base_momentum)
_max_momentum = format_param(k, optim, self._max_momentum)
for group, b_momentum, m_momentum in zip(
optim.param_groups, _base_momentum, _max_momentum):
if self.use_beta1:
_, beta2 = group['betas']
group['betas'] = (m_momentum, beta2)
else:
group['momentum'] = m_momentum
group['base_momentum'] = b_momentum
group['max_momentum'] = m_momentum
else:
optim = runner.optimizer
if ('momentum' not in optim.defaults
and 'betas' not in optim.defaults):
raise ValueError('optimizer must support momentum with '
'option enabled')
self.use_beta1 = 'betas' in optim.defaults
k = type(optim).__name__
_base_momentum = format_param(k, optim, self._base_momentum)
_max_momentum = format_param(k, optim, self._max_momentum)
for group, b_momentum, m_momentum in zip(optim.param_groups,
_base_momentum,
_max_momentum):
if self.use_beta1:
_, beta2 = group['betas']
group['betas'] = (m_momentum, beta2)
else:
group['momentum'] = m_momentum
group['base_momentum'] = b_momentum
group['max_momentum'] = m_momentum
if self.three_phase:
self.momentum_phases.append({
'end_iter':
float(self.pct_start * runner.max_iters) - 1,
'start_momentum':
'max_momentum',
'end_momentum':
'base_momentum'
})
self.momentum_phases.append({
'end_iter':
float(2 * self.pct_start * runner.max_iters) - 2,
'start_momentum':
'base_momentum',
'end_momentum':
'max_momentum'
})
self.momentum_phases.append({
'end_iter': runner.max_iters - 1,
'start_momentum': 'max_momentum',
'end_momentum': 'max_momentum'
})
else:
self.momentum_phases.append({
'end_iter':
float(self.pct_start * runner.max_iters) - 1,
'start_momentum':
'max_momentum',
'end_momentum':
'base_momentum'
})
self.momentum_phases.append({
'end_iter': runner.max_iters - 1,
'start_momentum': 'base_momentum',
'end_momentum': 'max_momentum'
})
def _set_momentum(self, runner, momentum_groups):
if isinstance(runner.optimizer, dict):
for k, optim in runner.optimizer.items():
for param_group, mom in zip(optim.param_groups,
momentum_groups[k]):
if 'momentum' in param_group.keys():
param_group['momentum'] = mom
elif 'betas' in param_group.keys():
param_group['betas'] = (mom, param_group['betas'][1])
else:
for param_group, mom in zip(runner.optimizer.param_groups,
momentum_groups):
if 'momentum' in param_group.keys():
param_group['momentum'] = mom
elif 'betas' in param_group.keys():
param_group['betas'] = (mom, param_group['betas'][1])
def get_momentum(self, runner, param_group: Dict[str, float]) -> float:
curr_iter = runner.iter
start_iter = 0
momentum = 0.
for i, phase in enumerate(self.momentum_phases):
end_iter = phase['end_iter']
if curr_iter <= end_iter or i == len(self.momentum_phases) - 1:
pct = (curr_iter - start_iter) / (end_iter - start_iter)
momentum = self.anneal_func(
param_group[phase['start_momentum']],
param_group[phase['end_momentum']], pct)
break
start_iter = end_iter
return momentum
def get_regular_momentum(self, runner):
if isinstance(runner.optimizer, dict):
momentum_groups = {}
for k, optim in runner.optimizer.items():
_momentum_group = [
self.get_momentum(runner, param_group)
for param_group in optim.param_groups
]
momentum_groups.update({k: _momentum_group})
return momentum_groups
else:
momentum_groups = []
for param_group in runner.optimizer.param_groups:
momentum_groups.append(self.get_momentum(runner, param_group))
return momentum_groups
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import logging
from collections import defaultdict
from itertools import chain
from typing import Optional, Union
import torch.nn as nn
from torch import Tensor
from torch.nn.utils import clip_grad
from mmcv.utils import TORCH_VERSION, _BatchNorm, digit_version
from ..dist_utils import allreduce_grads
from ..fp16_utils import LossScaler, wrap_fp16_model
from .hook import HOOKS, Hook
try:
# If PyTorch version >= 1.6.0, torch.cuda.amp.GradScaler would be imported
# and used; otherwise, auto fp16 will adopt mmcv's implementation.
from torch.cuda.amp import GradScaler
except ImportError:
pass
@HOOKS.register_module()
class OptimizerHook(Hook):
"""A hook contains custom operations for the optimizer.
Args:
grad_clip (dict, optional): A config dict to control the clip_grad.
Default: None.
detect_anomalous_params (bool): This option is only used for
debugging and will slow down the training speed.
Detect anomalous parameters that are not included in
the computational graph with `loss` as the root.
There are two cases:
- Parameters were not used during the forward pass.
- Parameters were not used to produce the loss.
Default: False.
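Example:
>>> # A minimal, illustrative sketch: clip gradients to a maximum L2 norm
>>> # of 35 before every optimizer step. The dict is passed as keyword
>>> # arguments to ``torch.nn.utils.clip_grad_norm_``.
>>> optimizer_hook = OptimizerHook(
...     grad_clip=dict(max_norm=35, norm_type=2))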
"""
def __init__(self,
grad_clip: Optional[dict] = None,
detect_anomalous_params: bool = False):
self.grad_clip = grad_clip
self.detect_anomalous_params = detect_anomalous_params
def clip_grads(self, params):
params = list(
filter(lambda p: p.requires_grad and p.grad is not None, params))
if len(params) > 0:
return clip_grad.clip_grad_norm_(params, **self.grad_clip)
def after_train_iter(self, runner):
runner.optimizer.zero_grad()
if self.detect_anomalous_params:
self.detect_anomalous_parameters(runner.outputs['loss'], runner)
runner.outputs['loss'].backward()
if self.grad_clip is not None:
grad_norm = self.clip_grads(runner.model.parameters())
if grad_norm is not None:
# Add grad norm to the logger
runner.log_buffer.update({'grad_norm': float(grad_norm)},
runner.outputs['num_samples'])
runner.optimizer.step()
def detect_anomalous_parameters(self, loss: Tensor, runner) -> None:
logger = runner.logger
parameters_in_graph = set()
visited = set()
def traverse(grad_fn):
if grad_fn is None:
return
if grad_fn not in visited:
visited.add(grad_fn)
if hasattr(grad_fn, 'variable'):
parameters_in_graph.add(grad_fn.variable)
parents = grad_fn.next_functions
if parents is not None:
for parent in parents:
grad_fn = parent[0]
traverse(grad_fn)
traverse(loss.grad_fn)
for n, p in runner.model.named_parameters():
if p not in parameters_in_graph and p.requires_grad:
logger.log(
level=logging.ERROR,
msg=f'{n} with shape {p.size()} is not '
f'in the computational graph \n')
@HOOKS.register_module()
class GradientCumulativeOptimizerHook(OptimizerHook):
"""Optimizer Hook implements multi-iters gradient cumulating.
Args:
cumulative_iters (int, optional): Num of gradient cumulative iters.
The optimizer will step every `cumulative_iters` iters.
Defaults to 1.
Examples:
>>> # Use cumulative_iters to simulate a large batch size
>>> # It is helpful when the hardware cannot handle a large batch size.
>>> loader = DataLoader(data, batch_size=64)
>>> optim_hook = GradientCumulativeOptimizerHook(cumulative_iters=4)
>>> # almost equals to
>>> loader = DataLoader(data, batch_size=256)
>>> optim_hook = OptimizerHook()
"""
def __init__(self, cumulative_iters: int = 1, **kwargs):
super().__init__(**kwargs)
assert isinstance(cumulative_iters, int) and cumulative_iters > 0, \
f'cumulative_iters only accepts positive int, but got ' \
f'{type(cumulative_iters)} instead.'
self.cumulative_iters = cumulative_iters
self.divisible_iters = 0
self.remainder_iters = 0
self.initialized = False
def has_batch_norm(self, module: nn.Module) -> bool:
if isinstance(module, _BatchNorm):
return True
for m in module.children():
if self.has_batch_norm(m):
return True
return False
def _init(self, runner):
if runner.iter % self.cumulative_iters != 0:
runner.logger.warning(
'Resume iter number is not divisible by cumulative_iters in '
'GradientCumulativeOptimizerHook, which means the gradient of '
'some iters is lost and the result may be influenced slightly.'
)
if self.has_batch_norm(runner.model) and self.cumulative_iters > 1:
runner.logger.warning(
'GradientCumulativeOptimizerHook may slightly decrease '
'performance if the model has BatchNorm layers.')
residual_iters = runner.max_iters - runner.iter
self.divisible_iters = (
residual_iters // self.cumulative_iters * self.cumulative_iters)
self.remainder_iters = residual_iters - self.divisible_iters
self.initialized = True
def after_train_iter(self, runner):
if not self.initialized:
self._init(runner)
if runner.iter < self.divisible_iters:
loss_factor = self.cumulative_iters
else:
loss_factor = self.remainder_iters
loss = runner.outputs['loss']
loss = loss / loss_factor
loss.backward()
if (self.every_n_iters(runner, self.cumulative_iters)
or self.is_last_iter(runner)):
if self.grad_clip is not None:
grad_norm = self.clip_grads(runner.model.parameters())
if grad_norm is not None:
# Add grad norm to the logger
runner.log_buffer.update({'grad_norm': float(grad_norm)},
runner.outputs['num_samples'])
runner.optimizer.step()
runner.optimizer.zero_grad()
if (TORCH_VERSION != 'parrots'
and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
@HOOKS.register_module()
class Fp16OptimizerHook(OptimizerHook):
"""FP16 optimizer hook (using PyTorch's implementation).
If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend,
to take care of the optimization procedure.
Args:
loss_scale (float | str | dict): Scale factor configuration.
If loss_scale is a float, static loss scaling will be used with
the specified scale. If loss_scale is a string, it must be
'dynamic', then dynamic loss scaling will be used.
It can also be a dict containing arguments of GradScaler.
Defaults to 512. For PyTorch >= 1.6, mmcv uses official
implementation of GradScaler. If you use a dict version of
loss_scale to create GradScaler, please refer to:
https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler
for the parameters.
Examples:
>>> loss_scale = dict(
... init_scale=65536.0,
... growth_factor=2.0,
... backoff_factor=0.5,
... growth_interval=2000
... )
>>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale)
"""
def __init__(self,
grad_clip: Optional[dict] = None,
coalesce: bool = True,
bucket_size_mb: int = -1,
loss_scale: Union[float, str, dict] = 512.,
distributed: bool = True):
self.grad_clip = grad_clip
self.coalesce = coalesce
self.bucket_size_mb = bucket_size_mb
self.distributed = distributed
self._scale_update_param = None
if loss_scale == 'dynamic':
self.loss_scaler = GradScaler()
elif isinstance(loss_scale, float):
self._scale_update_param = loss_scale
self.loss_scaler = GradScaler(init_scale=loss_scale)
elif isinstance(loss_scale, dict):
self.loss_scaler = GradScaler(**loss_scale)
else:
raise ValueError('loss_scale must be of type float, dict, or '
f'"dynamic", got {loss_scale}')
def before_run(self, runner) -> None:
"""Preparing steps before Mixed Precision Training."""
# wrap model mode to fp16
wrap_fp16_model(runner.model)
# resume from state dict
if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']:
scaler_state_dict = runner.meta['fp16']['loss_scaler']
self.loss_scaler.load_state_dict(scaler_state_dict)
def copy_grads_to_fp32(self, fp16_net: nn.Module,
fp32_weights: Tensor) -> None:
"""Copy gradients from fp16 model to fp32 weight copy."""
for fp32_param, fp16_param in zip(fp32_weights,
fp16_net.parameters()):
if fp16_param.grad is not None:
if fp32_param.grad is None:
fp32_param.grad = fp32_param.data.new(
fp32_param.size())
fp32_param.grad.copy_(fp16_param.grad)
def copy_params_to_fp16(self, fp16_net: nn.Module,
fp32_weights: Tensor) -> None:
"""Copy updated params from fp32 weight copy to fp16 model."""
for fp16_param, fp32_param in zip(fp16_net.parameters(),
fp32_weights):
fp16_param.data.copy_(fp32_param.data)
def after_train_iter(self, runner) -> None:
"""Backward optimization steps for Mixed Precision Training. For
dynamic loss scaling, please refer to
https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.
1. Scale the loss by a scale factor.
2. Backward the loss to obtain the gradients.
3. Unscale the optimizer’s gradient tensors.
4. Call optimizer.step() and update scale factor.
5. Save loss_scaler state_dict for resume purpose.
"""
# clear grads of last iteration
runner.model.zero_grad()
runner.optimizer.zero_grad()
self.loss_scaler.scale(runner.outputs['loss']).backward()
self.loss_scaler.unscale_(runner.optimizer)
# grad clip
if self.grad_clip is not None:
grad_norm = self.clip_grads(runner.model.parameters())
if grad_norm is not None:
# Add grad norm to the logger
runner.log_buffer.update({'grad_norm': float(grad_norm)},
runner.outputs['num_samples'])
# backward and update scaler
self.loss_scaler.step(runner.optimizer)
self.loss_scaler.update(self._scale_update_param)
# save state_dict of loss_scaler
runner.meta.setdefault(
'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()
@HOOKS.register_module()
class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook,
Fp16OptimizerHook):
"""Fp16 optimizer Hook (using PyTorch's implementation) implements
multi-iters gradient cumulating.
If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend,
to take care of the optimization procedure.
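Example:
>>> # A minimal, illustrative sketch: accumulate gradients over 4
>>> # iterations while using dynamic loss scaling (values are
>>> # placeholders).
>>> optimizer_hook = GradientCumulativeFp16OptimizerHook(
...     cumulative_iters=4, loss_scale='dynamic')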
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def after_train_iter(self, runner) -> None:
if not self.initialized:
self._init(runner)
if runner.iter < self.divisible_iters:
loss_factor = self.cumulative_iters
else:
loss_factor = self.remainder_iters
loss = runner.outputs['loss']
loss = loss / loss_factor
self.loss_scaler.scale(loss).backward()
if (self.every_n_iters(runner, self.cumulative_iters)
or self.is_last_iter(runner)):
# copy fp16 grads in the model to fp32 params in the optimizer
self.loss_scaler.unscale_(runner.optimizer)
if self.grad_clip is not None:
grad_norm = self.clip_grads(runner.model.parameters())
if grad_norm is not None:
# Add grad norm to the logger
runner.log_buffer.update(
{'grad_norm': float(grad_norm)},
runner.outputs['num_samples'])
# backward and update scaler
self.loss_scaler.step(runner.optimizer)
self.loss_scaler.update(self._scale_update_param)
# save state_dict of loss_scaler
runner.meta.setdefault(
'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()
# clear grads
runner.model.zero_grad()
runner.optimizer.zero_grad()
else:
@HOOKS.register_module()
class Fp16OptimizerHook(OptimizerHook): # type: ignore
"""FP16 optimizer hook (mmcv's implementation).
The steps of the fp16 optimizer are as follows.
1. Scale the loss value.
2. Backpropagate in the fp16 model.
3. Copy gradients from the fp16 model to the fp32 weights.
4. Update the fp32 weights.
5. Copy the updated parameters from the fp32 weights back to the fp16 model.
Refer to https://arxiv.org/abs/1710.03740 for more details.
Args:
loss_scale (float | str | dict): Scale factor configuration.
If loss_scale is a float, static loss scaling will be used with
the specified scale. If loss_scale is a string, it must be
'dynamic', then dynamic loss scaling will be used.
It can also be a dict containing arguments of LossScaler.
Defaults to 512.
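Example:
>>> # A minimal, illustrative sketch: dynamic loss scaling combined with
>>> # gradient clipping (the grad_clip dict is a placeholder).
>>> optimizer_hook = Fp16OptimizerHook(
...     loss_scale='dynamic', grad_clip=dict(max_norm=35, norm_type=2))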
"""
def __init__(self,
grad_clip: Optional[dict] = None,
coalesce: bool = True,
bucket_size_mb: int = -1,
loss_scale: Union[float, str, dict] = 512.,
distributed: bool = True):
self.grad_clip = grad_clip
self.coalesce = coalesce
self.bucket_size_mb = bucket_size_mb
self.distributed = distributed
if loss_scale == 'dynamic':
self.loss_scaler = LossScaler(mode='dynamic')
elif isinstance(loss_scale, float):
self.loss_scaler = LossScaler(
init_scale=loss_scale, mode='static')
elif isinstance(loss_scale, dict):
self.loss_scaler = LossScaler(**loss_scale)
else:
raise ValueError('loss_scale must be of type float, dict, or '
f'"dynamic", got {loss_scale}')
def before_run(self, runner) -> None:
"""Preparing steps before Mixed Precision Training.
1. Make a master copy of fp32 weights for optimization.
2. Convert the main model from fp32 to fp16.
"""
# keep a copy of fp32 weights
old_groups = runner.optimizer.param_groups
runner.optimizer.param_groups = copy.deepcopy(
runner.optimizer.param_groups)
state: defaultdict = defaultdict(dict)
p_map = {
old_p: p
for old_p, p in zip(
chain(*(g['params'] for g in old_groups)),
chain(*(g['params']
for g in runner.optimizer.param_groups)))
}
for k, v in runner.optimizer.state.items():
state[p_map[k]] = v
runner.optimizer.state = state
# convert model to fp16
wrap_fp16_model(runner.model)
# resume from state dict
if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']:
scaler_state_dict = runner.meta['fp16']['loss_scaler']
self.loss_scaler.load_state_dict(scaler_state_dict)
def copy_grads_to_fp32(self, fp16_net: nn.Module,
fp32_weights: Tensor) -> None:
"""Copy gradients from fp16 model to fp32 weight copy."""
for fp32_param, fp16_param in zip(fp32_weights,
fp16_net.parameters()):
if fp16_param.grad is not None:
if fp32_param.grad is None:
fp32_param.grad = fp32_param.data.new(
fp32_param.size())
fp32_param.grad.copy_(fp16_param.grad)
def copy_params_to_fp16(self, fp16_net: nn.Module,
fp32_weights: Tensor) -> None:
"""Copy updated params from fp32 weight copy to fp16 model."""
for fp16_param, fp32_param in zip(fp16_net.parameters(),
fp32_weights):
fp16_param.data.copy_(fp32_param.data)
def after_train_iter(self, runner) -> None:
"""Backward optimization steps for Mixed Precision Training. For
dynamic loss scaling, please refer to the ``LossScaler`` class.
1. Scale the loss by a scale factor.
2. Backward the loss to obtain the gradients (fp16).
3. Copy gradients from the model to the fp32 weight copy.
4. Scale the gradients back and update the fp32 weight copy.
5. Copy back the params from fp32 weight copy to the fp16 model.
6. Save loss_scaler state_dict for resume purpose.
"""
# clear grads of last iteration
runner.model.zero_grad()
runner.optimizer.zero_grad()
# scale the loss value
scaled_loss = runner.outputs['loss'] * self.loss_scaler.loss_scale
scaled_loss.backward()
# copy fp16 grads in the model to fp32 params in the optimizer
fp32_weights = []
for param_group in runner.optimizer.param_groups:
fp32_weights += param_group['params']
self.copy_grads_to_fp32(runner.model, fp32_weights)
# allreduce grads
if self.distributed:
allreduce_grads(fp32_weights, self.coalesce,
self.bucket_size_mb)
has_overflow = self.loss_scaler.has_overflow(fp32_weights)
# if has overflow, skip this iteration
if not has_overflow:
# scale the gradients back
for param in fp32_weights:
if param.grad is not None:
param.grad.div_(self.loss_scaler.loss_scale)
if self.grad_clip is not None:
grad_norm = self.clip_grads(fp32_weights)
if grad_norm is not None:
# Add grad norm to the logger
runner.log_buffer.update(
{'grad_norm': float(grad_norm)},
runner.outputs['num_samples'])
# update fp32 params
runner.optimizer.step()
# copy fp32 params to the fp16 model
self.copy_params_to_fp16(runner.model, fp32_weights)
self.loss_scaler.update_scale(has_overflow)
if has_overflow:
runner.logger.warning('Check overflow, downscale loss scale '
f'to {self.loss_scaler.cur_scale}')
# save state_dict of loss_scaler
runner.meta.setdefault(
'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()
@HOOKS.register_module()
class GradientCumulativeFp16OptimizerHook( # type: ignore
GradientCumulativeOptimizerHook, Fp16OptimizerHook):
"""Fp16 optimizer Hook (using mmcv implementation) implements multi-
iters gradient cumulating."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def after_train_iter(self, runner) -> None:
if not self.initialized:
self._init(runner)
if runner.iter < self.divisible_iters:
loss_factor = self.cumulative_iters
else:
loss_factor = self.remainder_iters
loss = runner.outputs['loss']
loss = loss / loss_factor
# scale the loss value
scaled_loss = loss * self.loss_scaler.loss_scale
scaled_loss.backward()
if (self.every_n_iters(runner, self.cumulative_iters)
or self.is_last_iter(runner)):
# copy fp16 grads in the model to fp32 params in the optimizer
fp32_weights = []
for param_group in runner.optimizer.param_groups:
fp32_weights += param_group['params']
self.copy_grads_to_fp32(runner.model, fp32_weights)
# allreduce grads
if self.distributed:
allreduce_grads(fp32_weights, self.coalesce,
self.bucket_size_mb)
has_overflow = self.loss_scaler.has_overflow(fp32_weights)
# if has overflow, skip this iteration
if not has_overflow:
# scale the gradients back
for param in fp32_weights:
if param.grad is not None:
param.grad.div_(self.loss_scaler.loss_scale)
if self.grad_clip is not None:
grad_norm = self.clip_grads(fp32_weights)
if grad_norm is not None:
# Add grad norm to the logger
runner.log_buffer.update(
{'grad_norm': float(grad_norm)},
runner.outputs['num_samples'])
# update fp32 params
runner.optimizer.step()
# copy fp32 params to the fp16 model
self.copy_params_to_fp16(runner.model, fp32_weights)
else:
runner.logger.warning(
'Check overflow, downscale loss scale '
f'to {self.loss_scaler.cur_scale}')
self.loss_scaler.update_scale(has_overflow)
# save state_dict of loss_scaler
runner.meta.setdefault(
'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()
# clear grads
runner.model.zero_grad()
runner.optimizer.zero_grad()
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import warnings
from typing import Callable, List, Optional, Union
import torch
from ..dist_utils import master_only
from .hook import HOOKS, Hook
@HOOKS.register_module()
class ProfilerHook(Hook):
"""Profiler to analyze performance during training.
PyTorch Profiler is a tool that allows the collection of performance
metrics during training. More details on Profiler can be found at
https://pytorch.org/docs/1.8.1/profiler.html#torch.profiler.profile
Args:
by_epoch (bool): Profile performance by epoch or by iteration.
Default: True.
profile_iters (int): Number of iterations for profiling.
If ``by_epoch=True``, profile_iters indicates the number of epochs
to profile at the beginning of the training; otherwise, it
indicates the first profile_iters iterations. Default: 1.
activities (list[str]): List of activity groups (CPU, CUDA) to use in
profiling. Default: ['cpu', 'cuda'].
schedule (dict, optional): Config of generating the callable schedule.
if schedule is None, profiler will not add step markers into the
trace and table view. Default: None.
on_trace_ready (callable, dict): Either a handler or a dict used to
generate a handler. Default: None.
record_shapes (bool): Save information about operator's input shapes.
Default: False.
profile_memory (bool): Track tensor memory allocation/deallocation.
Default: False.
with_stack (bool): Record source information (file and line number)
for the ops. Default: False.
with_flops (bool): Use formula to estimate the FLOPS of specific
operators (matrix multiplication and 2D convolution).
Default: False.
json_trace_path (str, optional): Exports the collected trace in Chrome
JSON format. Default: None.
Example:
>>> runner = ... # instantiate a Runner
>>> # tensorboard trace
>>> trace_config = dict(type='tb_trace', dir_name='work_dir')
>>> profiler_config = dict(on_trace_ready=trace_config)
>>> runner.register_profiler_hook(profiler_config)
>>> runner.run(data_loaders=[trainloader], workflow=[('train', 1)])
"""
def __init__(self,
by_epoch: bool = True,
profile_iters: int = 1,
activities: List[str] = ['cpu', 'cuda'],
schedule: Optional[dict] = None,
on_trace_ready: Optional[Union[Callable, dict]] = None,
record_shapes: bool = False,
profile_memory: bool = False,
with_stack: bool = False,
with_flops: bool = False,
json_trace_path: Optional[str] = None) -> None:
try:
from torch import profiler # torch version >= 1.8.1
except ImportError:
raise ImportError('profiler is a new feature of torch 1.8.1, '
f'but your version is {torch.__version__}')
assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.'
self.by_epoch = by_epoch
if profile_iters < 1:
raise ValueError('profile_iters should be greater than 0, but got '
f'{profile_iters}')
self.profile_iters = profile_iters
if not isinstance(activities, list):
raise ValueError(
f'activities should be list, but got {type(activities)}')
self.activities = []
for activity in activities:
activity = activity.lower()
if activity == 'cpu':
self.activities.append(profiler.ProfilerActivity.CPU)
elif activity == 'cuda':
self.activities.append(profiler.ProfilerActivity.CUDA)
else:
raise ValueError(
f'activity should be "cpu" or "cuda", but got {activity}')
if schedule is not None:
self.schedule = profiler.schedule(**schedule)
else:
self.schedule = None
self.on_trace_ready = on_trace_ready
self.record_shapes = record_shapes
self.profile_memory = profile_memory
self.with_stack = with_stack
self.with_flops = with_flops
self.json_trace_path = json_trace_path
@master_only
def before_run(self, runner):
if self.by_epoch and runner.max_epochs < self.profile_iters:
raise ValueError('self.profile_iters should not be greater than '
f'{runner.max_epochs}')
if not self.by_epoch and runner.max_iters < self.profile_iters:
raise ValueError('self.profile_iters should not be greater than '
f'{runner.max_iters}')
if callable(self.on_trace_ready): # handler
_on_trace_ready = self.on_trace_ready
elif isinstance(self.on_trace_ready, dict): # config of handler
trace_cfg = self.on_trace_ready.copy()
trace_type = trace_cfg.pop('type') # log_trace handler
if trace_type == 'log_trace':
def _log_handler(prof):
print(prof.key_averages().table(**trace_cfg))
_on_trace_ready = _log_handler
elif trace_type == 'tb_trace': # tensorboard_trace handler
try:
import torch_tb_profiler # noqa: F401
except ImportError:
raise ImportError('please run "pip install '
'torch-tb-profiler" to install '
'torch_tb_profiler')
if 'dir_name' not in trace_cfg:
trace_cfg['dir_name'] = osp.join(runner.work_dir,
'tf_tracing_logs')
elif not osp.isabs(trace_cfg['dir_name']):
trace_cfg['dir_name'] = osp.join(runner.work_dir,
trace_cfg['dir_name'])
runner.logger.info(
'tracing files of ProfilerHook will be saved to '
f"{trace_cfg['dir_name']}.")
_on_trace_ready = torch.profiler.tensorboard_trace_handler(
**trace_cfg)
else:
raise ValueError('trace_type should be "log_trace" or '
f'"tb_trace", but got {trace_type}')
elif self.on_trace_ready is None:
_on_trace_ready = None # type: ignore
else:
raise ValueError('on_trace_ready should be handler, dict or None, '
f'but got {type(self.on_trace_ready)}')
if self.by_epoch and runner.max_epochs > 1:
warnings.warn(f'profiler will profile {runner.max_epochs} epochs '
'instead of 1 epoch. Since profiler will slow down '
'the training, it is recommended to train 1 epoch '
'with ProfilerHook and adjust your setting according'
' to the profiler summary. During normal training '
'(epoch > 1), you may disable the ProfilerHook.')
self.profiler = torch.profiler.profile(
activities=self.activities,
schedule=self.schedule,
on_trace_ready=_on_trace_ready,
record_shapes=self.record_shapes,
profile_memory=self.profile_memory,
with_stack=self.with_stack,
with_flops=self.with_flops)
self.profiler.__enter__()
runner.logger.info('profiler is profiling...')
@master_only
def after_train_epoch(self, runner):
if self.by_epoch and runner.epoch == self.profile_iters - 1:
runner.logger.info('profiler may take a few minutes...')
self.profiler.__exit__(None, None, None)
if self.json_trace_path is not None:
self.profiler.export_chrome_trace(self.json_trace_path)
@master_only
def after_train_iter(self, runner):
self.profiler.step()
if not self.by_epoch and runner.iter == self.profile_iters - 1:
runner.logger.info('profiler may take a few minutes...')
self.profiler.__exit__(None, None, None)
if self.json_trace_path is not None:
self.profiler.export_chrome_trace(self.json_trace_path)
# Copyright (c) OpenMMLab. All rights reserved.
from .hook import HOOKS, Hook
@HOOKS.register_module()
class DistSamplerSeedHook(Hook):
"""Data-loading sampler for distributed training.
In distributed training, it is only useful in conjunction with
:obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same
purpose with :obj:`IterLoader`.
"""
def before_epoch(self, runner):
if hasattr(runner.data_loader.sampler, 'set_epoch'):
# in case the data loader uses `SequentialSampler` in PyTorch
runner.data_loader.sampler.set_epoch(runner.epoch)
elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'):
# the batch sampler in PyTorch wraps the sampler as one of its attributes.
runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch)
# Copyright (c) OpenMMLab. All rights reserved.
from ..dist_utils import allreduce_params
from .hook import HOOKS, Hook
@HOOKS.register_module()
class SyncBuffersHook(Hook):
"""Synchronize model buffers such as running_mean and running_var in BN at
the end of each epoch.
Args:
distributed (bool): Whether distributed training is used. It is
effective only for distributed training. Defaults to True.
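Example:
>>> # A minimal, illustrative sketch; assumes ``runner`` is an already
>>> # constructed runner instance.
>>> runner.register_hook(SyncBuffersHook(distributed=True))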
"""
def __init__(self, distributed: bool = True):
self.distributed = distributed
def after_epoch(self, runner):
"""All-reduce model buffers at the end of each epoch."""
if self.distributed:
allreduce_params(runner.model.buffers())