Commit 918abe42 authored by Valentin Andrei, committed by Facebook GitHub Bot

Add multi-tensor optimizer version for SGD

Summary:
Added a multi-tensor optimizer implementation for SGD, from `torch.optim._multi_tensor`. It can potentially provide a ~5% QPS improvement by using the `foreach` API to speed up the optimizer step.
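For intuition, here is a minimal sketch (not part of this commit) of how a `foreach` update differs from a plain per-parameter update; the tensors and learning rate are made up, and the real optimizer also handles momentum, weight decay, and so on:

```python
import torch

# Toy stand-ins for model parameters and their gradients.
params = [torch.randn(4, 4) for _ in range(3)]
grads = [torch.randn(4, 4) for _ in range(3)]
lr = 0.1

# Plain SGD update: one kernel launch per parameter tensor.
for p, g in zip(params, grads):
    p.add_(g, alpha=-lr)

# Multi-tensor update: a single batched call over the whole list,
# which amortizes kernel-launch overhead across all parameters.
torch._foreach_add_(params, grads, alpha=-lr)
```

The batched call operates per parameter group, which is why the docstring below notes that the number of parameter groups should first be reduced via `reduce_param_groups`.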

Using it is optional: select `SGD_MT` for the `SOLVER.OPTIMIZER` setting in the configuration file.
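For example, a sketch of opting in programmatically, mirroring the test setup in this diff (the runner import path is assumed):

```python
from d2go.runner import default_runner  # import path assumed

runner = default_runner.Detectron2GoRunner()
cfg = runner.get_default_cfg()
cfg.SOLVER.OPTIMIZER = "SGD_MT"  # opt in to the multi-tensor SGD implementation
```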

Reviewed By: zhanghang1989

Differential Revision: D30377761

fbshipit-source-id: 06107f1b91e9807c1db5d1b0ca6be09fcbb13e67
parent 9a9d53fb
@@ -210,6 +210,33 @@ def adamw(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
)
@D2GO_OPTIM_MAPPER_REGISTRY.register()
def sgd_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
    """
    Build a multi-tensor SGD optimizer that works significantly faster.
    This version is expected to become the default implementation for the
    SGD optimizer by the end of H1'21. To benefit from the speedup, the
    number of parameter groups needs to be reduced using `reduce_param_groups`.
    """
    params = get_default_optimizer_params(
        model,
        base_lr=cfg.SOLVER.BASE_LR,
        weight_decay=cfg.SOLVER.WEIGHT_DECAY,
        weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
        weight_decay_embed=cfg.SOLVER.WEIGHT_DECAY_EMBED,
        bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
        use_param_group_reduction=True,
        weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
        lr_multipliers_overwrite=_merge_dict(cfg.SOLVER.LR_MULTIPLIER_OVERWRITE),
    )
    return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.SGD)(
        params,
        cfg.SOLVER.BASE_LR,
        momentum=cfg.SOLVER.MOMENTUM,
        nesterov=cfg.SOLVER.NESTEROV,
    )

@D2GO_OPTIM_MAPPER_REGISTRY.register()
def adamw_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
"""
......
@@ -35,12 +35,12 @@ def _test_each_optimizer(cfg):
class TestOptimizer(unittest.TestCase):
-    def test_all_optimiers(self):
+    def test_all_optimizers(self):
        runner = default_runner.Detectron2GoRunner()
        cfg = runner.get_default_cfg()
        multipliers = [None, [{'conv': 0.1}]]
-        for optimizer_name in ["SGD", "AdamW"]:
+        for optimizer_name in ["SGD", "AdamW", "SGD_MT", "AdamW_MT"]:
            for mult in multipliers:
                cfg.SOLVER.OPTIMIZER = optimizer_name
                cfg.SOLVER.MULTIPLIERS = mult
@@ -50,10 +50,9 @@ class TestOptimizer(unittest.TestCase):
        runner = default_runner.Detectron2GoRunner()
        cfg = runner.get_default_cfg()
-        for optimizer_name in ["SGD", "AdamW"]:
+        for optimizer_name in ["SGD", "AdamW", "SGD_MT", "AdamW_MT"]:
            cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.2
            cfg.SOLVER.CLIP_GRADIENTS.ENABLED = True
            cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
            cfg.SOLVER.OPTIMIZER = optimizer_name
            _test_each_optimizer(cfg)