Commit c25a91b6 authored by aiss's avatar aiss
Browse files

Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad
......@@ -28,8 +31,7 @@ def _main():
param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size]
torch_time = _test_perf(param, torch.optim.Adagrad)
ds_time = _test_perf(param, DeepSpeedCPUAdagrad)
#print(f"Step time: {torch_time=} {ds_time=}")
print("Step time: {torch_time=%s} {ds_time=%s}" %(torch_time, ds_time))
print(f"Step time: {torch_time=} {ds_time=}")
_main()
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
......@@ -28,8 +31,7 @@ def _main():
param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size]
torch_time = _test_perf(param, torch.optim.Adam)
ds_time = _test_perf(param, DeepSpeedCPUAdam)
#print(f"Step time: {torch_time=} {ds_time=}")
print("Step time: {torch_time=%s} {ds_time=%s}" %(torch_time, ds_time))
print(f"Step time: {torch_time=} {ds_time=}")
_main()
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
......@@ -8,10 +11,7 @@ from deepspeed.accelerator import get_accelerator
device = 'cpu'
model_size = 1 * 1024**3
param = torch.nn.Parameter(torch.ones(model_size, device=device))
param_fp16 = torch.nn.Parameter(
torch.ones(model_size,
dtype=torch.half,
device=get_accelerator().device_name(0)))
param_fp16 = torch.nn.Parameter(torch.ones(model_size, dtype=torch.half, device=get_accelerator().device_name(0)))
optimizer = DeepSpeedCPUAdam([param])
#torch.set_num_threads(128)
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
......@@ -10,6 +13,7 @@ import deepspeed
class VerboseLinear(torch.nn.Linear):
def __init__(self, **kwargs):
print(f'Begin VerboseLinear.__init__')
super().__init__(**kwargs)
......@@ -17,21 +21,19 @@ class VerboseLinear(torch.nn.Linear):
class LinearStack(torch.nn.Module):
def __init__(self, input_dim=2, hidden_dim=4, output_dim=4, num_layers=2):
super().__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.hidden_dim = hidden_dim
self.input_layer = VerboseLinear(in_features=self.input_dim,
out_features=self.hidden_dim)
self.input_layer = VerboseLinear(in_features=self.input_dim, out_features=self.hidden_dim)
self.layers = torch.nn.ModuleList([
torch.nn.Linear(in_features=self.hidden_dim,
out_features=self.hidden_dim,
bias=False) for x in range(num_layers)
torch.nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim, bias=False)
for x in range(num_layers)
])
self.output_layer = torch.nn.Linear(in_features=self.hidden_dim,
out_features=self.output_dim)
self.output_layer = torch.nn.Linear(in_features=self.hidden_dim, out_features=self.output_dim)
self.identity = torch.nn.Identity()
def forward(self, x):
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from deepspeed.pt.deepspeed_linear import LinearModuleForZeroStage3
......@@ -28,10 +31,7 @@ def see_memory_usage(message):
)
tens = torch.rand(1024,
16384,
dtype=torch.half,
device=torch.device(get_accelerator().device_name()))
tens = torch.rand(1024, 16384, dtype=torch.half, device=torch.device(get_accelerator().device_name()))
tens_back = tens.detach().clone()
#linear_bk = torch.nn.functional.linear
......@@ -45,9 +45,7 @@ y = model(tens)
see_memory_usage("After forward")
model.weight.data = torch.zeros(1,
dtype=torch.half,
device=torch.device(get_accelerator().device_name()))
model.weight.data = torch.zeros(1, dtype=torch.half, device=torch.device(get_accelerator().device_name()))
see_memory_usage("After weight zero")
......
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
import os
import json
import argparse
import torch
import deepspeed
from torch.utils.data.distributed import DistributedSampler
import deepspeed.comm as dist
class SimpleModel(torch.nn.Module):
    """Tiny square linear model whose forward pass returns the training loss.

    Args:
        hidden_dim: width of the single hidden_dim x hidden_dim linear layer.
        empty_grad: when True, registers an extra (unused) linear layer so some
            parameters never receive gradients — used to exercise the engine's
            handling of parameters with empty grads.
    """

    def __init__(self, hidden_dim, empty_grad=False):
        super().__init__()
        self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
        if empty_grad:
            # Extra module that forward() never touches -> its params get no grads.
            self.layers2 = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim)])
        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

    def forward(self, x, y):
        """Return cross-entropy loss of linear(x) against integer targets y."""
        return self.cross_entropy_loss(self.linear(x), y)
def create_config_from_dict(tmpdir, config_dict):
    """Write *config_dict* as JSON to <tmpdir>/temp_config.json and return that path."""
    path = os.path.join(tmpdir, 'temp_config.json')
    with open(path, 'w') as handle:
        handle.write(json.dumps(config_dict))
    return path
def get_data_loader(model, total_samples, hidden_dim, device):
    """Build a DataLoader of random fp16 features with random integer labels.

    Batch size comes from the DeepSpeed engine's micro-batch setting, and the
    dataset is sharded across ranks with DistributedSampler (requires an
    initialized process group).
    """
    micro_bs = model.train_micro_batch_size_per_gpu()
    features = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
    # Labels are class indices in [0, hidden_dim) to match the model's output width.
    labels = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim)
    dataset = torch.utils.data.TensorDataset(features, labels)
    shard_sampler = DistributedSampler(dataset)
    return torch.utils.data.DataLoader(dataset, batch_size=micro_bs, sampler=shard_sampler)
def get_args(tmpdir, config_dict):
    """Parse launcher CLI flags and materialize the DeepSpeed JSON config.

    Reads --local_rank and --zero from sys.argv, overwrites the ZeRO stage in
    *config_dict* with the --zero value, writes the config under *tmpdir*, and
    attaches its path to the returned namespace as ``deepspeed_config``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--zero', type=int, default=3)
    args = parser.parse_args()  # parses the real process argv
    # CLI wins over whatever stage the caller put in the dict.
    config_dict["zero_optimization"]["stage"] = args.zero
    args.deepspeed_config = create_config_from_dict(tmpdir, config_dict)
    return args
def print0(msg):
    """Print *msg* (flushed) on global rank 0 only; no-op on other ranks."""
    if dist.get_rank() != 0:
        return
    print(msg, flush=True)
# Per-rank deterministic seeding: RANK is set by the distributed launcher.
rank = int(os.environ['RANK'])
print('seed:', 2222 + rank)
torch.random.manual_seed(2222 + rank)

# DeepSpeed engine config exercising MiCS sharding on ZeRO stage 3
# (mics_shard_size / hierarchical gather) with fp16 training.
config_dict = {
    "train_batch_size": 8,
    "steps_per_print": 1,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 0.00015,
        }
    },
    "fp16": {
        "enabled": True,
        "initial_scale_power": 15
    },
    "zero_optimization": {
        "stage": 3,
        # Small bucket to force frequent reduction during this smoke test.
        "reduce_bucket_size": 20,
        "mics_shard_size": 4,
        "mics_hierarchical_params_gather": True,
        "stage3_model_persistence_threshold": 10
    }
}
# "initial_scale_power": 15
args = get_args('/tmp/', config_dict)
hidden_dim = 32
# with deepspeed.zero.Init():
model = SimpleModel(hidden_dim, empty_grad=False)
# print('------> init model with deepspeed.zero.Init()')
model, _, _, _ = deepspeed.initialize(args=args,
                                      model=model,
                                      model_parameters=model.parameters(),
                                      dist_init_required=True)


def print_params(tag, model):
    # Debug helper: dump all named parameters from rank 0.
    if dist.get_rank() == 0:
        for n, p in model.named_parameters():
            print0("{} {}:{}".format(tag, n, p))


data_loader = get_data_loader(model=model, total_samples=1000, hidden_dim=hidden_dim, device=model.device)
#print_params('pre-train', model)
# Short training loop: 6 steps (n == 5 break) is enough to smoke-test
# forward/backward/step under MiCS partitioning.
for n, batch in enumerate(data_loader):
    loss = model(batch[0], batch[1])
    if dist.get_rank() == 0:
        print("LOSS:", loss.item())
    model.backward(loss)
    model.step()
    #print_params('step={}'.format(n), model)
    if n == 5: break
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import os
import json
......@@ -10,6 +13,7 @@ import deepspeed.comm as dist
class SimpleModel(torch.nn.Module):
def __init__(self, hidden_dim, empty_grad=False):
super(SimpleModel, self).__init__()
self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
......@@ -33,14 +37,10 @@ def create_config_from_dict(tmpdir, config_dict):
def get_data_loader(model, total_samples, hidden_dim, device):
batch_size = model.train_micro_batch_size_per_gpu()
train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
train_label = torch.empty(total_samples,
dtype=torch.long,
device=device).random_(hidden_dim)
train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim)
train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
sampler = DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=batch_size,
sampler=sampler)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
return train_loader
......@@ -82,16 +82,17 @@ config_dict = {
},
"zero_optimization": {
"stage": 0,
"reduce_bucket_size": 20
"reduce_bucket_size": 20,
"stage3_model_persistence_threshold": 10
}
}
# "initial_scale_power": 15
args = get_args('/tmp/', config_dict)
hidden_dim = 4
hidden_dim = 32
model = SimpleModel(hidden_dim, empty_grad=False)
model, _, _,_ = deepspeed.initialize(args=args,
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters(),
dist_init_required=True)
......@@ -103,10 +104,7 @@ def print_params(tag, model):
print0("{} {}:{}".format(tag, n, p))
data_loader = get_data_loader(model=model,
total_samples=1000,
hidden_dim=hidden_dim,
device=model.device)
data_loader = get_data_loader(model=model, total_samples=1000, hidden_dim=hidden_dim, device=model.device)
#print_params('pre-train', model)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
......
#!/bin/bash
# This script assumes it is run from the test directory, i.e. pwd=path/to/ds/test
# accelerator abstraction tests
cd accelerator/
pytest ./
cd ..
# unit test modules (run one file at a time so a crash in one suite
# does not abort the rest; NOTE(review): failures are not aggregated —
# the script's exit code only reflects the last pytest invocation)
pytest ./unit/profiling/flops_profiler/test_flops_profiler.py
pytest unit/test_ds_config.py
pytest unit/test_cpu_adam.py
pytest unit/pipe/test_pipe_module.py
pytest unit/autotuning/test_autotuning.py
# ROCm: pin rocBLAS fp16 compute type before the transformer-kernel tests —
# presumably needed for numerical agreement on ROCm; TODO confirm
export ROCBLAS_COMPUTETYPE_FP16R=0
pytest unit/ops/accelerators/test_accelerator_forward.py
pytest unit/ops/accelerators/test_accelerator_backward.py
pytest unit/test_cuda_forward.py
pytest unit/test_cuda_backward.py
pytest unit/test_get_optim_files.py
pytest unit/test_autotuning.py
pytest unit/test_csr.py
pytest unit/test_run.py
pytest unit/comm/test_dist.py
pytest unit/runtime/half_precision/test_fp16.py
pytest unit/runtime/half_precision/test_bf16.py
pytest unit/runtime/half_precision/onebit/test_onebit.py
pytest unit/runtime/half_precision/test_dynamic_loss_scale.py
pytest unit/runtime/test_ds_config_dict.py
pytest unit/runtime/test_ds_config_model.py
pytest unit/runtime/pipe/test_pipe.py
pytest unit/runtime/pipe/test_topology.py
pytest unit/runtime/pipe/test_pipe_schedule.py
pytest unit/runtime/test_lr_schedulers.py
pytest unit/runtime/activation_checkpointing/test_activation_checkpointing.py
pytest unit/runtime/test_ds_initialize.py
pytest unit/runtime/test_pld.py
pytest unit/runtime/test_runtime_utils.py
pytest unit/runtime/zero/test_ignore_unused_parameters.py
pytest unit/runtime/zero/test_zero_context_ancestry.py
pytest unit/runtime/zero/test_zero_tensor_fragment.py
pytest unit/runtime/zero/test_zero_context_return.py
pytest unit/runtime/zero/test_zero.py
pytest unit/runtime/zero/test_zero_context.py
pytest unit/runtime/zero/test_zero_config.py
pytest unit/runtime/zero/test_zero_tiled.py
pytest unit/runtime/sparse_tensor/test_sparse_grads.py
pytest unit/runtime/sparse_tensor/test_csr.py
pytest unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py
pytest unit/runtime/test_data_efficiency.py
pytest unit/runtime/test_multi_output_model.py
pytest unit/runtime/comm/test_coalesced_collectives.py
pytest unit/runtime/test_autocast.py
pytest unit/runtime/utils/test_partition.py
pytest unit/runtime/test_data.py
# NOTE(review): several paths below duplicate suites already run above under
# their newer locations (e.g. unit/test_pipe_schedule.py vs
# unit/runtime/pipe/test_pipe_schedule.py) — likely legacy layout; verify
# which set still exists in this tree.
pytest unit/test_ds_arguments.py
pytest unit/test_pipe_schedule.py
pytest unit/moe/test_moe.py
pytest unit/moe/test_moe_tp.py
pytest unit/checkpoint/test_sparse.py
pytest unit/checkpoint/test_zero_optimizer.py
pytest unit/checkpoint/test_other_optimizer.py
pytest unit/checkpoint/test_pipeline.py
pytest unit/checkpoint/test_latest_checkpoint.py
pytest unit/checkpoint/test_tag_validation.py
pytest unit/checkpoint/test_moe_checkpoint.py
pytest unit/checkpoint/test_reshape_checkpoint.py
pytest unit/checkpoint/test_lr_scheduler.py
pytest unit/test_autocast.py
pytest unit/monitor/test_monitor.py
pytest unit/utils/test_get_optim_files.py
pytest unit/utils/test_init_on_device.py
pytest unit/utils/test_groups.py
pytest unit/test_cpu_adagrad.py
pytest unit/ops/adagrad/test_cpu_adagrad.py
pytest unit/ops/spatial/test_nhwc_bias_add.py
pytest unit/ops/quantizer/test_quantize.py
pytest unit/ops/quantizer/test_dequantize.py
pytest unit/ops/quantizer/test_fake_quantization.py
pytest unit/ops/aio/test_aio.py
pytest unit/ops/adam/test_cpu_adam.py
pytest unit/ops/adam/test_adamw.py
pytest unit/ops/transformer/inference/test_bias_relu.py
pytest unit/ops/transformer/inference/test_bias_add.py
pytest unit/ops/transformer/inference/test_residual_add.py
pytest unit/ops/transformer/inference/test_moe_res_matmult.py
pytest unit/ops/transformer/inference/test_bias_gelu.py
pytest unit/ops/transformer/inference/test_layer_norm.py
pytest unit/ops/transformer/inference/test_bias_geglu.py
pytest unit/launcher/test_multinode_runner.py
pytest unit/launcher/test_run.py
pytest unit/launcher/test_ds_arguments.py
pytest unit/elasticity/test_elastic.py
pytest unit/test_zero_tiled.py
pytest unit/test_groups.py
#!/bin/bash
# Runs the full test tree (model/e2e, onebit, small-model debugging, and unit
# suites) one pytest file at a time from the repository's test directory.
# NOTE(review): exit status reflects only the final pytest call; failures in
# earlier suites are not aggregated.
pytest ./lightning/test_simple.py
pytest ./model/BingBertSquad/test_e2e_squad.py
pytest ./model/Megatron_GPT2/test_common.py
# one-bit optimizer backends — the MPI variants presumably require an MPI
# launch environment; verify before running standalone
pytest ./onebit/test_nccl_backend.py
pytest ./onebit/test_mpi_backend.py
pytest ./onebit/test_mpi_perf.py
pytest ./onebit/test_nccl_perf.py
pytest ./small_model_debugging/test_mics_config.py
pytest ./small_model_debugging/test_model.py
pytest ./unit/autotuning/test_autotuning.py
pytest ./unit/comm/test_dist.py
pytest ./unit/compression/test_compression.py
pytest ./unit/moe/test_moe_tp.py
pytest ./unit/moe/test_moe.py
pytest ./unit/monitor/test_monitor.py
pytest ./unit/pipe/test_pipe_module.py
pytest ./unit/profiling/flops_profiler/test_flops_profiler.py
pytest ./unit/checkpoint/test_latest_checkpoint.py
pytest ./unit/checkpoint/test_lr_scheduler.py
pytest ./unit/checkpoint/test_moe_checkpoint.py
pytest ./unit/checkpoint/test_other_optimizer.py
pytest ./unit/checkpoint/test_reshape_checkpoint.py
pytest ./unit/checkpoint/test_tag_validation.py
pytest ./unit/checkpoint/test_pipeline.py
pytest ./unit/checkpoint/test_sparse.py
pytest ./unit/checkpoint/test_zero_optimizer.py
pytest ./unit/elasticity/test_elastic.py
pytest ./unit/inference/test_inference_config.py
pytest ./unit/inference/test_checkpoint_sharding.py
pytest ./unit/inference/test_inference.py
pytest ./unit/inference/test_model_profiling.py
pytest ./unit/launcher/test_ds_arguments.py
pytest ./unit/launcher/test_multinode_runner.py
pytest ./unit/launcher/test_run.py
pytest ./unit/model_parallelism/test_configurable_parallel_mp.py
pytest ./unit/model_parallelism/test_configurable_parallel_pp.py
pytest ./unit/ops/adam/test_adamw.py
pytest ./unit/ops/adam/test_cpu_adam.py
pytest ./unit/ops/aio/test_aio.py
pytest ./unit/ops/quantizer/test_fake_quantization.py
pytest ./unit/ops/quantizer/test_quantize.py
pytest ./unit/ops/spatial/test_nhwc_bias_add.py
pytest ./unit/ops/transformer/inference/test_bias_add.py
pytest ./unit/ops/transformer/inference/test_bias_geglu.py
pytest ./unit/ops/transformer/inference/test_residual_add.py
pytest ./unit/ops/transformer/inference/test_bias_gelu.py
pytest ./unit/ops/transformer/inference/test_bias_relu.py
pytest ./unit/ops/transformer/inference/test_layer_norm.py
pytest ./unit/ops/transformer/inference/test_moe_res_matmult.py
pytest ./unit/ops/accelerators/test_accelerator_backward.py
pytest ./unit/ops/accelerators/test_accelerator_forward.py
pytest ./unit/ops/adagrad/test_cpu_adagrad.py
pytest ./unit/ops/sparse_attention/test_sparse_attention.py
pytest ./unit/runtime/half_precision/onebit/test_onebit.py
pytest ./unit/runtime/half_precision/test_dynamic_loss_scale.py
pytest ./unit/runtime/half_precision/test_bf16.py
pytest ./unit/runtime/half_precision/test_fp16.py
pytest ./unit/runtime/pipe/test_pipe.py
pytest ./unit/runtime/pipe/test_topology.py
pytest ./unit/runtime/pipe/test_pipe_schedule.py
pytest ./unit/runtime/test_data_efficiency.py
pytest ./unit/runtime/test_ds_config_dict.py
pytest ./unit/runtime/test_ds_initialize.py
pytest ./unit/runtime/test_multi_output_model.py
pytest ./unit/runtime/test_pld.py
pytest ./unit/runtime/utils/test_partition.py
pytest ./unit/runtime/zero/test_ignore_unused_parameters.py
pytest ./unit/runtime/zero/test_zero.py
pytest ./unit/runtime/zero/test_zero_config.py
pytest ./unit/runtime/zero/test_zero_context_ancestry.py
pytest ./unit/runtime/zero/test_zero_tensor_fragment.py
pytest ./unit/runtime/zero/test_zero_context.py
pytest ./unit/runtime/zero/test_zero_context_return.py
pytest ./unit/runtime/zero/test_zero_dynamic_class.py
pytest ./unit/runtime/zero/test_zero_nesting_init.py
pytest ./unit/runtime/zero/test_zero_tiled.py
pytest ./unit/runtime/activation_checkpointing/test_activation_checkpointing.py
pytest ./unit/runtime/comm/test_coalesced_collectives.py
pytest ./unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py
pytest ./unit/runtime/sparse_tensor/test_csr.py
pytest ./unit/runtime/sparse_tensor/test_sparse_grads.py
pytest ./unit/runtime/test_autocast.py
pytest ./unit/runtime/test_data.py
pytest ./unit/runtime/test_ds_config_model.py
pytest ./unit/runtime/test_lr_schedulers.py
pytest ./unit/runtime/test_runtime_utils.py
pytest ./unit/utils/test_init_on_device.py
pytest ./unit/utils/test_get_optim_files.py
pytest ./unit/utils/test_groups.py
pytest ./accelerator/test_ds_init.py
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
'''Copyright The Microsoft DeepSpeed Team'''
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import pytest
import torch
......@@ -12,41 +15,23 @@ from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec
class AlexNet(nn.Module):
def __init__(self, num_classes=10):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3,
64,
kernel_size=11,
stride=4,
padding=5),
nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=5),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2,
stride=2),
nn.Conv2d(64,
192,
kernel_size=5,
padding=2),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(64, 192, kernel_size=5, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2,
stride=2),
nn.Conv2d(192,
384,
kernel_size=3,
padding=1),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(192, 384, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(384,
256,
kernel_size=3,
padding=1),
nn.Conv2d(384, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256,
256,
kernel_size=3,
padding=1),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2,
stride=2),
nn.MaxPool2d(kernel_size=2, stride=2),
)
self.classifier = nn.Linear(256, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
......@@ -59,12 +44,14 @@ class AlexNet(nn.Module):
class AlexNetPipe(AlexNet):
def to_layers(self):
layers = [*self.features, lambda x: x.view(x.size(0), -1), self.classifier]
return layers
class AlexNetPipeSpec(PipelineModule):
def __init__(self, num_classes=10, **kwargs):
self.num_classes = num_classes
specs = [
......@@ -81,7 +68,6 @@ class AlexNetPipeSpec(PipelineModule):
LayerSpec(nn.Conv2d, 256, 256, kernel_size=3, padding=1),
F.relu,
LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2),
lambda x: x.view(x.size(0), -1),
LayerSpec(nn.Linear, 256, self.num_classes), # classifier
]
......@@ -99,12 +85,7 @@ def cifar_trainset(fp16=False):
transform_list = [
transforms.ToTensor(),
transforms.Normalize((0.5,
0.5,
0.5),
(0.5,
0.5,
0.5)),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
if fp16:
transform_list.append(torchvision.transforms.Lambda(cast_to_half))
......@@ -117,23 +98,14 @@ def cifar_trainset(fp16=False):
dist.barrier()
if local_rank != 0:
dist.barrier()
trainset = torchvision.datasets.CIFAR10(root='/blob/cifar10-data',
train=True,
download=True,
transform=transform)
trainset = torchvision.datasets.CIFAR10(root='/blob/cifar10-data', train=True, download=True, transform=transform)
if local_rank == 0:
dist.barrier()
return trainset
def train_cifar(model,
config,
num_steps=400,
average_dp_losses=True,
fp16=True,
seed=123):
with get_accelerator().random().fork_rng(
devices=[get_accelerator().current_device_name()]):
def train_cifar(model, config, num_steps=400, average_dp_losses=True, fp16=True, seed=123):
with get_accelerator().random().fork_rng(devices=[get_accelerator().current_device_name()]):
ds_utils.set_random_seed(seed)
# disable dropout
......@@ -142,8 +114,7 @@ def train_cifar(model,
trainset = cifar_trainset(fp16=fp16)
config['local_rank'] = dist.get_rank()
engine, _, _, _ = deepspeed.initialize(
config=config,
engine, _, _, _ = deepspeed.initialize(config=config,
model=model,
model_parameters=[p for p in model.parameters()],
training_data=trainset)
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import os
import pytest
......@@ -14,13 +17,11 @@ TUNE_OPTION = 'tune'
def test_command_line():
'''Validate handling of command line arguments'''
for opt in [RUN_OPTION, TUNE_OPTION]:
dsrun.parse_args(
args=f"--num_nodes 1 --num_gpus 1 --autotuning {opt} foo.py".split())
dsrun.parse_args(args=f"--num_nodes 1 --num_gpus 1 --autotuning {opt} foo.py".split())
for error_opts in [
"--autotuning --num_nodes 1 --num_gpus 1 foo.py".split(),
"--autotuning test --num_nodes 1 -- num_gpus 1 foo.py".split(),
"--autotuning".split()
"--autotuning test --num_nodes 1 -- num_gpus 1 foo.py".split(), "--autotuning".split()
]:
with pytest.raises(SystemExit):
dsrun.parse_args(args=error_opts)
......@@ -65,18 +66,9 @@ def test_resource_manager_arg_mappings(arg_mappings):
]
) # yapf: disable
def test_autotuner_resources(tmpdir, active_resources):
config_dict = {
"autotuning": {
"enabled": True,
"exps_dir": os.path.join(tmpdir,
'exps_dir'),
"arg_mappings": {}
}
}
config_dict = {"autotuning": {"enabled": True, "exps_dir": os.path.join(tmpdir, 'exps_dir'), "arg_mappings": {}}}
config_path = create_config_from_dict(tmpdir, config_dict)
args = dsrun.parse_args(
args=f'--autotuning {TUNE_OPTION} foo.py --deepspeed_config {config_path}'.split(
))
args = dsrun.parse_args(args=f'--autotuning {TUNE_OPTION} foo.py --deepspeed_config {config_path}'.split())
tuner = Autotuner(args=args, active_resources=active_resources)
expected_num_nodes = len(list(active_resources.keys()))
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import os
import torch
......@@ -9,6 +12,7 @@ from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer
from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
from unit.simple_model import *
......@@ -22,13 +26,18 @@ def compare_deepspeed_states(saved_model, loaded_model):
assert saved_model.global_steps == loaded_model.global_steps
def compare_model_states(saved_model,
loaded_model,
compare_optimizer=True,
load_module_only=False):
def zero3_params_to_fetch(param_list):
return [p for p in param_list if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE]
def compare_model_states(saved_model, loaded_model, compare_optimizer=True, load_module_only=False):
if not load_module_only:
compare_deepspeed_states(saved_model, loaded_model)
params_to_fetch = zero3_params_to_fetch(
list(saved_model.module.named_parameters()) + list(loaded_model.module.named_parameters()))
enable_gather = len(params_to_fetch) > 0
with deepspeed.zero.GatheredParameters(params_to_fetch, enabled=enable_gather):
for p0, p1 in zip(saved_model.module.named_parameters(), loaded_model.module.named_parameters()):
np0, p0 = p0
np1, p1 = p1
......@@ -38,7 +47,8 @@ def compare_model_states(saved_model,
p0 = p0.half()
assert id(p0) != id(p1), f'Comparing fp16 model state tensor against itself : {id(p0)} <====> {id(p1)}'
try:
assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}"
assert torch.allclose(p0, p1,
atol=1e-07), f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}"
except RuntimeError as err:
print(f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}")
raise err
......@@ -46,14 +56,14 @@ def compare_model_states(saved_model,
if not compare_optimizer:
return
if DeepSpeedZeroOptimizer_Stage3 is not None and isinstance(
saved_model.optimizer,
DeepSpeedZeroOptimizer_Stage3):
for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, loaded_model.optimizer.fp32_partitioned_groups_flat):
if DeepSpeedZeroOptimizer_Stage3 is not None and isinstance(saved_model.optimizer, DeepSpeedZeroOptimizer_Stage3):
for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat,
loaded_model.optimizer.fp32_partitioned_groups_flat):
assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}"
elif isinstance(saved_model.optimizer, DeepSpeedZeroOptimizer):
for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups):
for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups,
loaded_model.optimizer.single_partition_of_fp32_groups):
assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}'
assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}"
......@@ -89,8 +99,7 @@ def compare_optimizer_states(saved_model, loaded_model, hidden_dim, fp16=True):
saved_optimizer = saved_model.optimizer.optimizer if fp16 else saved_model.optimizer
loaded_optimizer = loaded_model.optimizer.optimizer if fp16 else loaded_model.optimizer
for state0, state1 in zip(saved_optimizer.state.values(),
loaded_optimizer.state.values()):
for state0, state1 in zip(saved_optimizer.state.values(), loaded_optimizer.state.values()):
compare_state_dicts(state0, state1)
......@@ -130,6 +139,7 @@ def create_deepspeed_model(config_dict, model, base_optimizer):
model=model,
model_parameters=create_moe_param_groups(model),
optimizer=base_optimizer)
ds_model.empty_partition_cache()
return ds_model
......@@ -141,15 +151,12 @@ def checkpoint_correctness_verification(config_dict,
load_lr_scheduler_states=False,
fp16=True,
train_batch=False,
base_optimizers=[None,
None],
base_optimizers=[None, None],
empty_tag=False,
seq_dataloader=False,
load_module_only=False):
dtype = torch.half if fp16 else torch.float32
ds_model = create_deepspeed_model(config_dict=config_dict,
model=models[0],
base_optimizer=base_optimizers[0])
ds_model = create_deepspeed_model(config_dict=config_dict, model=models[0], base_optimizer=base_optimizers[0])
if seq_dataloader:
data_loader = sequence_dataloader(model=ds_model,
......@@ -174,6 +181,9 @@ def checkpoint_correctness_verification(config_dict,
ds_model.backward(loss)
ds_model.step()
# Flush zero stage 3 cache
ds_model.empty_partition_cache()
trained_model = ds_model
save_folder = os.path.join(tmpdir, 'saved_checkpoint')
......@@ -196,11 +206,8 @@ def checkpoint_correctness_verification(config_dict,
stored = sum(v for _, v in storages.items())
assert needed == stored, f"MoE expert checkpoint uses more storage than required: {f}"
loaded_model = create_deepspeed_model(config_dict=config_dict,
model=models[1],
base_optimizer=base_optimizers[1])
assert list(trained_model.parameters())[0].dtype == list(
loaded_model.parameters())[0].dtype
loaded_model = create_deepspeed_model(config_dict=config_dict, model=models[1], base_optimizer=base_optimizers[1])
assert list(trained_model.parameters())[0].dtype == list(loaded_model.parameters())[0].dtype
loaded_model.load_checkpoint(save_folder,
tag=save_tag,
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import deepspeed
......@@ -46,8 +49,6 @@ class TestLatestCheckpoint(DistributedTest):
}
hidden_dim = 10
model = SimpleModel(hidden_dim)
model, _, _,_ = deepspeed.initialize(config=config_dict,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
# should be no-op, since latest doesn't exist
model.load_checkpoint(tmpdir)
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import deepspeed
from deepspeed.ops.op_builder import CPUAdamBuilder
......@@ -11,19 +14,8 @@ from unit.checkpoint.common import checkpoint_correctness_verification
import pytest
@pytest.mark.parametrize('zero_stage, use_cpu_offload',
[(0,
False),
(1,
False),
(2,
False),
(2,
True),
(3,
False),
(3,
True)])
@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(0, False), (1, False), (2, False), (2, True), (3, False),
(3, True)])
class TestLRSchedulerCheckpoint(DistributedTest):
world_size = 2
......@@ -38,8 +30,7 @@ class TestLRSchedulerCheckpoint(DistributedTest):
"type": 'Adam',
"params": {
"lr": 0.00015,
"betas": [0.8,
0.999],
"betas": [0.8, 0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer
......@@ -19,20 +22,10 @@ class TestMoECheckpoint(DistributedTest):
if not required_torch_version():
pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")
config_dict = {
"train_batch_size": 8,
"steps_per_print": 1,
"fp16": {
"enabled": True
}
}
config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}}
hidden_dim = 16
models = [
SimpleMoEModel(hidden_dim=hidden_dim,
num_experts=ep_size,
ep_size=ep_size) for _ in range(2)
]
models = [SimpleMoEModel(hidden_dim=hidden_dim, num_experts=ep_size, ep_size=ep_size) for _ in range(2)]
optimizers = [torch.optim.AdamW(params=model.parameters()) for model in models]
checkpoint_correctness_verification(config_dict,
models=models,
......@@ -45,15 +38,7 @@ class TestMoECheckpoint(DistributedTest):
base_optimizers=optimizers,
seq_dataloader=True)
@pytest.mark.parametrize("ep_size, load_optim_states",
[(4,
True),
(4,
False),
(2,
True),
(2,
False)])
@pytest.mark.parametrize("ep_size, load_optim_states", [(4, True), (4, False), (2, True), (2, False)])
def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states):
if not required_torch_version():
pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")
......@@ -65,8 +50,7 @@ class TestMoECheckpoint(DistributedTest):
"type": 'Adam',
"params": {
"lr": 0.00015,
"betas": [0.8,
0.999],
"betas": [0.8, 0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
......@@ -81,21 +65,11 @@ class TestMoECheckpoint(DistributedTest):
}
hidden_dim = 16
models = [
SimpleMoEModel(hidden_dim=hidden_dim,
num_experts=ep_size,
ep_size=ep_size) for _ in range(2)
]
models = [SimpleMoEModel(hidden_dim=hidden_dim, num_experts=ep_size, ep_size=ep_size) for _ in range(2)]
# param group must have a random unique name (for now)
# TODO: clean-up this requirement, the unique name should not be required here
param_groups = [{
'params': [p for p in model.parameters()],
'name': 'random-unique-name'
} for model in models]
params = [
split_params_into_different_moe_groups_for_optimizer(group)
for group in param_groups
]
param_groups = [{'params': [p for p in model.parameters()], 'name': 'random-unique-name'} for model in models]
params = [split_params_into_different_moe_groups_for_optimizer(group) for group in param_groups]
optimizers = [torch.optim.AdamW(params=param) for param in params]
checkpoint_correctness_verification(config_dict,
models=models,
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import deepspeed
from deepspeed.ops.op_builder import FusedLambBuilder
......@@ -14,8 +17,7 @@ import pytest
class TestOtherOptimizerCheckpoint(DistributedTest):
world_size = 2
@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME],
reason="lamb is not compatible")
@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
def test_checkpoint_unfused_optimizer(self, tmpdir):
config_dict = {
"train_batch_size": 2,
......@@ -74,8 +76,7 @@ class TestOtherOptimizerCheckpoint(DistributedTest):
"type": "Adam",
"params": {
"lr": 0.00015,
"betas": [0.8,
0.999],
"betas": [0.8, 0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
......@@ -111,8 +112,7 @@ class TestOtherOptimizerCheckpoint(DistributedTest):
"type": "Adam",
"params": {
"lr": 0.00015,
"betas": [0.8,
0.999],
"betas": [0.8, 0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine
from unit.common import DistributedTest
from unit.simple_model import *
from unit.checkpoint.common import checkpoint_correctness_verification
from unit.util import skip_on_arch
import pytest
......@@ -14,6 +17,8 @@ class TestPipelineCheckpoint(DistributedTest):
@pytest.mark.parametrize("zero_stage", [0, 1])
def test_checkpoint_pipe_engine(self, zero_stage, tmpdir):
skip_on_arch(min_arch=7)
config_dict = {
"train_batch_size": 2,
"train_micro_batch_size_per_gpu": 1,
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from deepspeed.checkpoint import model_3d_desc
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment