Unverified commit 43f27332 authored by Calogero Zarbo, committed by GitHub

Add "zero_allow_untested_optimizer" option in conf file (#173)

* added zero_allow_untested_optimizer flag helpers

* add zero_allow_untested_optimizer config constants

* zero_allow_untested_optimizer logic with assertion

* Added unit test and CustomOptimizer helper class
parent 20557f70
@@ -165,6 +165,12 @@ def get_optimizer_legacy_fusion(param_dict):
    return LEGACY_FUSION_DEFAULT


def get_zero_allow_untested_optimizer(param_dict):
    return get_scalar_param(param_dict,
                            ZERO_ALLOW_UNTESTED_OPTIMIZER,
                            ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT)


def get_scheduler_name(param_dict):
    if SCHEDULER in param_dict.keys() and \
            TYPE in param_dict[SCHEDULER].keys():
...
@@ -271,6 +277,9 @@ class DeepSpeedConfig(object):
        self.optimizer_params = get_optimizer_params(param_dict)
        self.optimizer_legacy_fusion = get_optimizer_legacy_fusion(param_dict)
        self.zero_allow_untested_optimizer = get_zero_allow_untested_optimizer(
            param_dict)

        self.scheduler_name = get_scheduler_name(param_dict)
        self.scheduler_params = get_scheduler_params(param_dict)
...
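
The new getter follows the same pattern as the surrounding config readers: get_scalar_param looks the key up at the top level of the user-supplied JSON dictionary and falls back to the default when it is absent, and DeepSpeedConfig then exposes the resolved value as an attribute. A minimal sketch of that behaviour, assuming get_scalar_param is equivalent to a plain dict.get with a fallback (an assumption for illustration, not the actual DeepSpeed implementation):

# Sketch only: get_scalar_param is assumed to behave like dict.get(key, default).
ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer"
ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False


def get_zero_allow_untested_optimizer(param_dict):
    return param_dict.get(ZERO_ALLOW_UNTESTED_OPTIMIZER,
                          ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT)


assert get_zero_allow_untested_optimizer({}) is False
assert get_zero_allow_untested_optimizer({"zero_allow_untested_optimizer": True}) is True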
@@ -31,6 +31,12 @@ SCHEDULER_TYPE_DEFAULT = None
SCHEDULER_PARAMS = "params"
MAX_GRAD_NORM = 'max_grad_norm'

#############################################
# Optimizer and lr scheduler
#############################################
ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer"
ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False

#############################################
# Torch distributed constants
#############################################
...
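
For reference, the flag lives at the top level of the DeepSpeed configuration, next to "zero_optimization". An illustrative config dictionary, written in the same style as the unit tests below (the exact set of other keys is up to the user), that opts in to an untested optimizer under ZeRO:

# Illustrative DeepSpeed config: ZeRO enabled, untested optimizer explicitly allowed.
config_dict = {
    "train_batch_size": 4,
    "fp16": {
        "enabled": True
    },
    "zero_optimization": True,
    "zero_allow_untested_optimizer": True
}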
@@ -271,6 +271,9 @@ class DeepSpeedLight(Module):
    def zero_optimization(self):
        return self._config.zero_enabled

    def zero_allow_untested_optimizer(self):
        return self._config.zero_allow_untested_optimizer

    def allgather_size(self):
        return self._config.allgather_size
...
@@ -444,7 +447,10 @@ class DeepSpeedLight(Module):
        logging.info('DeepSpeed Basic Optimizer = {}'.format(basic_optimizer))

        if self.zero_optimization():
            if self.optimizer_name() != ADAM_OPTIMIZER:
                assert self.zero_allow_untested_optimizer(), \
                    'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.'
                logging.warning(
                    "**** You are using ZeRO with an untested optimizer, proceed with caution *****"
                )
...
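
The gating logic above boils down to: with ZeRO enabled and an optimizer other than the tested Adam path, the engine refuses to start unless the flag is set, and even then it only downgrades the failure to a warning. A standalone sketch of that check, using a hypothetical check_zero_optimizer helper that is not part of the actual engine:

import logging

ADAM_OPTIMIZER = 'adam'


def check_zero_optimizer(optimizer_name, zero_enabled, allow_untested):
    # Hypothetical helper mirroring the engine check: only the Adam optimizer
    # is treated as tested with ZeRO; anything else needs the explicit opt-in.
    if zero_enabled and optimizer_name != ADAM_OPTIMIZER:
        assert allow_untested, \
            'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.'
        logging.warning(
            "**** You are using ZeRO with an untested optimizer, proceed with caution *****")


check_zero_optimizer('lamb', zero_enabled=True, allow_untested=True)     # warns only
# check_zero_optimizer('lamb', zero_enabled=True, allow_untested=False)  # raises AssertionError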
@@ -18,6 +18,29 @@ class SimpleModel(torch.nn.Module):
        return self.cross_entropy_loss(hidden_dim, y)


class SimpleOptimizer(torch.optim.Optimizer):
    def __init__(self, params, lr=0.11072018):
        defaults = dict(lr=lr)
        super(SimpleOptimizer, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SimpleOptimizer, self).__setstate__(state)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                p.data.add_(-group['lr'], d_p)

        return loss


def random_dataloader(model, total_samples, hidden_dim, device):
    batch_size = model.train_micro_batch_size_per_gpu()
    train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
...
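
SimpleOptimizer is a bare-bones SGD variant (p = p - lr * grad, no momentum or weight decay) whose only purpose is to stand in for an optimizer DeepSpeed has not tested with ZeRO. Outside of DeepSpeed it behaves like any torch.optim.Optimizer; a minimal standalone usage sketch (plain FP32, no DeepSpeed involved):

import torch

from simple_model import SimpleOptimizer  # test helper defined above

# One plain gradient-descent step on a toy linear model.
model = torch.nn.Linear(10, 1)
optimizer = SimpleOptimizer(model.parameters(), lr=0.1)

loss = model(torch.randn(4, 10)).sum()
loss.backward()
optimizer.step()         # p := p - lr * grad for every parameter with a gradient
optimizer.zero_grad()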
@@ -5,7 +5,7 @@ import pytest
import json
import os
from common import distributed_test
from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict


def test_lamb_fp16_basic(tmpdir):
...
@@ -289,3 +289,29 @@ def test_zero_static_scale(tmpdir):
        model.step()

    _test_zero_static_scale(args)


def test_zero_allow_untested_optimizer(tmpdir):
    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True,
        },
        "zero_optimization": True,
        "zero_allow_untested_optimizer": False
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_zero_allow_untested_optimizer(args):
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=True)
        optimizer = SimpleOptimizer(model.parameters())
        with pytest.raises(AssertionError):
            model, optim, _, _ = deepspeed.initialize(args=args,
                                                      model=model,
                                                      optimizer=optimizer,
                                                      model_parameters=model.parameters())

    _test_zero_allow_untested_optimizer(args)
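
The test above pins down the failure path: with "zero_allow_untested_optimizer" left at False, deepspeed.initialize is expected to trip the new assertion. A complementary success-path test is not part of this diff; the sketch below is illustrative only, under the assumption that ZeRO's FP16 optimizer wrapper accepts this simple optimizer once the flag is set:

# Hypothetical companion test (not in this commit): same setup with the flag
# set to True, so initialization is expected to proceed with only a warning.
def test_zero_allow_untested_optimizer_enabled(tmpdir):
    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True,
        },
        "zero_optimization": True,
        "zero_allow_untested_optimizer": True
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_enabled(args):
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=True)
        optimizer = SimpleOptimizer(model.parameters())
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  optimizer=optimizer,
                                                  model_parameters=model.parameters())

    _test_enabled(args)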