Unverified commit 41db1c2f authored by Jeff Rasley, committed by GitHub
parent 79093d74
import argparse
import torch
import time
import numpy as np
import pytest
import copy
from deepspeed.ops.adam import DeepSpeedCPUAdam


def check_equal(first, second, atol=1e-2, verbose=False):
    x = first.detach().numpy()
    y = second.detach().numpy()
    if verbose:
        print("x = {}".format(x.flatten()))
        print("y = {}".format(y.flatten()))
        print('-' * 80)
    np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol)


@pytest.mark.parametrize('model_size',
                         [
                             (64),
                             (22),
                             (55),
                             (127),
                             (1024),
                             (1048576),
                         ]) # yapf: disable
def test_adam_opt(model_size):
    device = 'cpu'
    rng_state = torch.get_rng_state()
    param = torch.nn.Parameter(torch.randn(model_size, device=device))
    torch.set_rng_state(rng_state)
    param1 = torch.nn.Parameter(torch.randn(model_size, device=device))

    optimizer1 = torch.optim.Adam([param1])
    optimizer = DeepSpeedCPUAdam([param])

    for i in range(10):
        rng_state = torch.get_rng_state()
        param.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param1.grad = torch.randn(model_size, device=device)

        optimizer.step()
        optimizer1.step()

        check_equal(param, param1, atol=1e-2, verbose=True)
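Note: the test above treats DeepSpeedCPUAdam as a drop-in replacement for torch.optim.Adam on CPU tensors; it feeds both optimizers identical gradients (by restoring the RNG state) and compares the resulting parameter updates within atol=1e-2. A minimal standalone sketch of that drop-in usage follows; the hyperparameter values are illustrative assumptions, not taken from this commit.

import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

# Toy CPU parameter; DeepSpeedCPUAdam applies the Adam update to CPU tensors in place.
param = torch.nn.Parameter(torch.randn(1024))

# Keyword arguments mirror torch.optim.Adam; the values here are assumptions for illustration.
optimizer = DeepSpeedCPUAdam([param],
                             lr=1e-3,
                             betas=(0.9, 0.999),
                             eps=1e-8,
                             weight_decay=0.0)

# One optimization step looks exactly like the torch.optim API.
param.grad = torch.randn_like(param)
optimizer.step()
optimizer.zero_grad()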
@@ -28,25 +28,25 @@ def compare_model_states(saved_model, loaded_model):
     compare_deepspeed_states(saved_model, loaded_model)
     for p0, p1 in zip(saved_model.module.parameters(), loaded_model.module.parameters()):
-        assert torch.allclose(p0,p1,atol=1e-07), f"FP16 model state {p0} is not equal to {p1}"
+        assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}"
     if isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer):
         for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups):
-            assert torch.allclose(p0,p1,atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}"
+            assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}"
     elif isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer_Stage1):
         for partition0, partition1 in zip(saved_model.optimizer.local_sub_partitions_of_fp32_groups, loaded_model.optimizer.local_sub_partitions_of_fp32_groups):
             for p0, p1 in zip(partition0, partition1):
-                assert torch.allclose(p0,p1,atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}"
+                assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}"
     elif isinstance(saved_model.optimizer, FP16_Optimizer):
         for p0, p1 in zip(saved_model.optimizer.fp32_groups_flat, loaded_model.optimizer.fp32_groups_flat):
-            assert torch.allclose(p0,p1,atol=1e-07), f"FP32 model states {p0} is not equal to {p1}"
+            assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}"
     elif isinstance(saved_model.optimizer, FP16_UnfusedOptimizer):
         for params0, params1 in zip(saved_model.optimizer.fp32_groups, loaded_model.optimizer.fp32_groups):
             for p0, p1 in zip(params0, params1):
-                assert torch.allclose(p0,p1,atol=1e-07), f"FP32 model states {p0} is not equal to {p1}"
+                assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}"
     elif isinstance(saved_model.optimizer, torch.optim.Optimizer):
         pass
     else:
@@ -97,7 +97,7 @@ def checkpoint_correctness_verification(args,
                                             load_lr_scheduler_states=False,
                                             fp16=True):
     dtype = torch.half if fp16 else torch.float32
-    ds_model, _, _,_ = deepspeed.initialize(args=args,
+    ds_model, _, _, _ = deepspeed.initialize(args=args,
                                             model=model,
                                             model_parameters=model.parameters())
     data_loader = random_dataloader(model=ds_model,
@@ -117,7 +117,7 @@ def checkpoint_correctness_verification(args,
     trained_model.save_checkpoint(save_folder, save_tag)
-    loaded_model, _, _,_ = deepspeed.initialize(args=args,
+    loaded_model, _, _, _ = deepspeed.initialize(args=args,
                                                 model=model,
                                                 model_parameters=model.parameters())
@@ -235,13 +235,24 @@ def test_checkpoint_fused_optimizer(tmpdir):
                                         load_optimizer_states=False)
-@pytest.mark.parametrize("zero_stage", [1, 2])
-def test_checkpoint_zero_optimizer(tmpdir, zero_stage):
+@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer',
+                         [
+                             (1,
+                              False,
+                              'Adam'),
+                             (2,
+                              False,
+                              'Adam'),
+                             (2,
+                              True,
+                              'deepspeed_adam'),
+                         ])
+def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
         "optimizer": {
-            "type": "Adam",
+            "type": adam_optimizer,
             "params": {
                 "lr": 0.00015,
                 "betas": [0.8,
@@ -254,8 +265,9 @@ def test_checkpoint_zero_optimizer(tmpdir, zero_stage):
             "enabled": True
         },
         "zero_optimization": {
-            "stage": zero_stage
-        },
+            "stage": zero_stage,
+            "cpu_offload": use_cpu_offload
+        }
     }
     args = args_from_dict(tmpdir, config_dict)
     hidden_dim = 10
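For reference, when the new (2, True, 'deepspeed_adam') case is selected, the config this test builds resolves to ZeRO stage 2 with CPU offload and the DeepSpeed CPU Adam optimizer. A sketch of the resolved dict is below; the fp16 block and the truncated optimizer params are assumed from the unchanged parts of the config that this hunk elides.

config_dict = {
    "train_batch_size": 2,
    "steps_per_print": 1,
    "optimizer": {
        "type": "deepspeed_adam",   # substituted via the adam_optimizer parameter
        "params": {
            "lr": 0.00015
            # remaining optimizer params are unchanged and elided by the hunk
        }
    },
    "fp16": {
        "enabled": True             # assumed from the elided context
    },
    "zero_optimization": {
        "stage": 2,                 # zero_stage parameter
        "cpu_offload": True         # use_cpu_offload parameter
    }
}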
@@ -276,13 +288,27 @@ def test_checkpoint_zero_optimizer(tmpdir, zero_stage):
                                         load_optimizer_states=True)
-@pytest.mark.parametrize("zero_stage", [1, 2])
-def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage):
+@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer',
+                         [
+                             (1,
+                              False,
+                              "Adam"),
+                             (2,
+                              False,
+                              "Adam"),
+                             (2,
+                              True,
+                              'deepspeed_adam'),
+                         ])
+def test_checkpoint_zero_no_optimizer(tmpdir,
+                                      zero_stage,
+                                      use_cpu_offload,
+                                      adam_optimizer):
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
         "optimizer": {
-            "type": "Adam",
+            "type": adam_optimizer,
             "params": {
                 "lr": 0.00015,
                 "betas": [0.8,
@@ -295,8 +321,9 @@ def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage):
             "enabled": True
         },
         "zero_optimization": {
-            "stage": zero_stage
-        },
+            "stage": zero_stage,
+            "cpu_offload": use_cpu_offload
+        }
     }
     args = args_from_dict(tmpdir, config_dict)
     hidden_dim = 10
@@ -320,13 +347,27 @@ def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage):
                                         load_optimizer_states=False)
-@pytest.mark.parametrize("zero_stage", [0, 1, 2])
-def test_checkpoint_lr_scheduler(tmpdir, zero_stage):
+@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer',
+                         [
+                             (0,
+                              False,
+                              'Adam'),
+                             (1,
+                              False,
+                              'Adam'),
+                             (2,
+                              False,
+                              'Adam'),
+                             (2,
+                              True,
+                              'deepspeed_adam'),
+                         ])
+def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
         "optimizer": {
-            "type": "Adam",
+            "type": adam_optimizer,
             "params": {
                 "lr": 0.00015,
                 "betas": [0.8,
@@ -339,7 +380,8 @@ def test_checkpoint_lr_scheduler(tmpdir, zero_stage):
             "enabled": True
         },
         "zero_optimization": {
-            "stage": zero_stage
+            "stage": zero_stage,
+            "cpu_offload": use_cpu_offload
         },
         "scheduler": {
             "type": "WarmupLR",
@@ -376,13 +418,27 @@ def test_checkpoint_lr_scheduler(tmpdir, zero_stage):
                                         load_lr_scheduler_states=True)
-@pytest.mark.parametrize("zero_stage", [0, 1, 2])
-def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage):
+@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer',
+                         [
+                             (0,
+                              False,
+                              'Adam'),
+                             (1,
+                              False,
+                              'Adam'),
+                             (2,
+                              False,
+                              'Adam'),
+                             (2,
+                              True,
+                              'deepspeed_adam'),
+                         ])
+def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
         "optimizer": {
-            "type": "Adam",
+            "type": adam_optimizer,
             "params": {
                 "lr": 1e-5
             }
@@ -391,7 +447,8 @@ def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage):
             "enabled": True
         },
         "zero_optimization": {
-            "stage": zero_stage
+            "stage": zero_stage,
+            "cpu_offload": use_cpu_offload
         },
         "scheduler": {
             "type": "WarmupLR",
@@ -400,7 +457,7 @@ def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage):
                 "warmup_max_lr": 0.001,
                 "warmup_num_steps": 1000
             }
-        }
+        },
     }
     args = args_from_dict(tmpdir, config_dict)
     hidden_dim = 10
@@ -191,7 +191,6 @@ def test_unfused_no_overflow(tmpdir):
         model, optim, _, _ = deepspeed.initialize(args=args,
                                                   model=model,
                                                   model_parameters=model.parameters())
-
         expected_loss_scale = 2**8
         expected_scale_window = 2
         # Ensure the dynamic loss scaler is correctly configured.
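The expected values in this hunk follow from the dynamic loss scaling settings the test configures. Below is a hedged sketch of an fp16 block that would yield an initial loss scale of 2**8 and a scale window of 2; the actual dict is not visible in this hunk, so the keys and surrounding values are assumptions based on DeepSpeed's fp16 config options.

config_dict = {
    "train_batch_size": 2,          # illustrative; not shown in the hunk
    "fp16": {
        "enabled": True,
        "initial_scale_power": 8,   # dynamic loss scale starts at 2**8
        "loss_scale_window": 2      # scale is re-evaluated every 2 steps
    }
}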
 import torch
+import apex
 import deepspeed
 import argparse
 import pytest
@@ -217,8 +218,16 @@ def test_adamw_fp16_empty_grad(tmpdir):
     _test_adamw_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
-@pytest.mark.parametrize("zero_stage", [0, 1, 2])
-def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage):
+@pytest.mark.parametrize('zero_stage, use_cpu_offload',
+                         [
+                             (1,
+                              False),
+                             (2,
+                              False),
+                             (2,
+                              True),
+                         ])
+def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_batch_size": 1,
         "steps_per_print": 1,
@@ -246,7 +255,8 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage):
             "enabled": True
         },
         "zero_optimization": {
-            "stage": zero_stage
+            "stage": zero_stage,
+            "cpu_offload": use_cpu_offload
         }
     }
@@ -274,8 +284,16 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage):
                                                  hidden_dim=hidden_dim)
-@pytest.mark.parametrize("zero_stage", [1, 2])
-def test_zero_static_scale(tmpdir, zero_stage):
+@pytest.mark.parametrize('zero_stage, use_cpu_offload',
+                         [
+                             (1,
+                              False),
+                             (2,
+                              False),
+                             (2,
+                              True),
+                         ])
+def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_batch_size": 4,
         "steps_per_print": 1,
@@ -290,7 +308,8 @@ def test_zero_static_scale(tmpdir, zero_stage):
             "loss_scale": 138.
         },
         "zero_optimization": {
-            "stage": zero_stage
+            "stage": zero_stage,
+            "cpu_offload": use_cpu_offload
         }
     }
     args = args_from_dict(tmpdir, config_dict)
@@ -363,8 +382,16 @@ def test_zero_static_scale_deprecated_format(tmpdir):
     _test_zero_static_scale(args)
-@pytest.mark.parametrize("zero_stage", [1, 2])
-def test_zero_allow_untested_optimizer(tmpdir, zero_stage):
+@pytest.mark.parametrize('zero_stage, use_cpu_offload',
+                         [
+                             (1,
+                              False),
+                             (2,
+                              False),
+                             (2,
+                              True),
+                         ])
+def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_batch_size": 4,
         "steps_per_print": 1,
@@ -372,7 +399,8 @@ def test_zero_allow_untested_optimizer(tmpdir, zero_stage):
             "enabled": True,
         },
         "zero_optimization": {
-            "stage": zero_stage
+            "stage": zero_stage,
+            "cpu_offload": use_cpu_offload
         },
         "zero_allow_untested_optimizer": False
     }
@@ -392,8 +420,16 @@ def test_zero_allow_untested_optimizer(tmpdir, zero_stage):
     _test_zero_allow_untested_optimizer(args)
-@pytest.mark.parametrize("zero_stage", [1, 2])
-def test_zero_empty_partition(tmpdir, zero_stage):
+@pytest.mark.parametrize('zero_stage, use_cpu_offload',
+                         [
+                             (1,
+                              False),
+                             (2,
+                              False),
+                             (2,
+                              True),
+                         ])
+def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_micro_batch_size_per_gpu": 1,
         "gradient_accumulation_steps": 1,
@@ -408,7 +444,8 @@ def test_zero_empty_partition(tmpdir, zero_stage):
             }
         },
         "zero_optimization": {
-            "stage": zero_stage
+            "stage": zero_stage,
+            "cpu_offload": use_cpu_offload
         }
     }
     args = args_from_dict(tmpdir, config_dict)
@@ -572,3 +609,83 @@ def test_adam_amp_o2_empty_grad(tmpdir):
         model.step()
     _test_adam_amp_o2_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
+
+@pytest.mark.parametrize('zero_stage, optimizer_constructor',
+                         [(1,
+                           apex.optimizers.FusedAdam),
+                          (2,
+                           torch.optim.Adam),
+                          (2,
+                           apex.optimizers.FusedAdam)])
+def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
+    config_dict = {
+        "train_batch_size": 2,
+        "steps_per_print": 1,
+        "fp16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": zero_stage
+        }
+    }
+    args = args_from_dict(tmpdir, config_dict)
+    hidden_dim = 10
+
+    model = SimpleModel(hidden_dim, empty_grad=False)
+
+    @distributed_test(world_size=[1])
+    def _test_zero_supported_client_optimizer(args, model, optimizer_constructor):
+        client_optimizer = optimizer_constructor(params=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              optimizer=client_optimizer)
+
+    _test_zero_supported_client_optimizer(args=args,
+                                          model=model,
+                                          optimizer_constructor=optimizer_constructor)
+
+
+def test_zero2_reduce_scatter_off(tmpdir):
+    config_dict = {
+        "train_batch_size": 2,
+        "steps_per_print": 1,
+        "optimizer": {
+            "type": "Adam",
+            "params": {
+                "lr": 0.00015
+            }
+        },
+        "gradient_clipping": 1.0,
+        "zero_optimization": {
+            "stage": 2,
+            "contiguous_gradients": True,
+            "allgather_bucket_size": 2000000000,
+            "reduce_bucket_size": 200000000,
+            "overlap_comm": False,
+            "reduce_scatter": False
+        },
+        "fp16": {
+            "enabled": True
+        }
+    }
+    args = args_from_dict(tmpdir, config_dict)
+    hidden_dim = 10
+
+    model = SimpleModel(hidden_dim, rank=args.local_rank)
+
+    @distributed_test(world_size=[2])
+    def _helper(args, model, hidden_dim):
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
+        data_loader = random_dataloader(model=model,
+                                        total_samples=50,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device)
+        for n, batch in enumerate(data_loader):
+            loss = model(batch[0], batch[1])
+            model.backward(loss)
+            model.step()
+
+    _helper(args=args, model=model, hidden_dim=hidden_dim)
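The new test_zero_supported_client_optimizer exercises handing a pre-constructed ("client") optimizer to deepspeed.initialize instead of declaring one in the config; with ZeRO enabled, that optimizer must be a supported type (the parametrization above uses torch.optim.Adam and apex.optimizers.FusedAdam), otherwise zero_allow_untested_optimizer has to be enabled, as the test_zero_allow_untested_optimizer change above checks. A condensed sketch of the pattern, reusing names from the test (SimpleModel and args come from the test utilities and are assumptions here):

import torch
import deepspeed

model = SimpleModel(10, empty_grad=False)

# Build the optimizer on the client side, then let DeepSpeed wrap it; no "optimizer"
# section is needed in the config when an optimizer instance is passed in.
client_optimizer = torch.optim.Adam(params=model.parameters())
model_engine, optimizer, _, _ = deepspeed.initialize(args=args,
                                                     model=model,
                                                     optimizer=client_optimizer)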