Unverified commit 65c2f974, authored by Shaden Smith, committed by GitHub

Pipeline parallel training engine. (#392)


Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent 41db1c2f
@@ -24,6 +24,8 @@ release = '0.1.0'
master_doc = 'index'

autodoc_member_order = 'bysource'

# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
...
@@ -34,6 +34,14 @@ Transformer Kernel API
   kernel

Pipeline Parallelism
--------------------
.. toctree::
   :maxdepth: 2

   pipeline
   pipeline-extending

Indices and tables
------------------
...
Extending Pipeline Parallelism
==============================

.. automodule:: deepspeed.runtime.pipe.schedule
   :members:

Pipeline Parallelism
====================

.. automodule:: deepspeed.runtime.pipe.engine
   :members:

.. automodule:: deepspeed.runtime.pipe.topology
   :members:
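For orientation, here is a minimal sketch of how the pieces documented above fit together. It follows the same PipelineModule / LayerSpec / train_batch() pattern exercised by the tests in this commit; the layer sizes, stage count, and the `args` / `trainset` objects are illustrative assumptions (a real run also needs a distributed launch and a DeepSpeed JSON config).

import torch.nn as nn
import deepspeed
from deepspeed.pipe import PipelineModule, LayerSpec

def build_pipeline_engine(args, trainset):
    # LayerSpec defers construction, so each pipeline stage only materializes
    # the layers it owns; plain callables (e.g. lambdas) are also accepted.
    layers = [
        LayerSpec(nn.Linear, 128, 128),
        LayerSpec(nn.ReLU, inplace=True),
        LayerSpec(nn.Linear, 128, 10),
    ]
    model = PipelineModule(layers=layers,
                           loss_fn=nn.CrossEntropyLoss(),
                           num_stages=2)

    # args must carry a valid deepspeed_config (cf. args_from_dict in the tests);
    # trainset is a map-style dataset of (input, label) pairs.
    engine, _, _, _ = deepspeed.initialize(args=args,
                                           model=model,
                                           model_parameters=[p for p in model.parameters()],
                                           training_data=trainset)
    return engine

# A single call schedules forward, backward, and optimizer steps for one full
# training batch across all pipeline stages:
#     loss = engine.train_batch()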
@@ -156,7 +156,7 @@ fi
if [ "$pip_mirror" != "" ]; then
    PIP_INSTALL="pip install --use-feature=2020-resolver -v -i $pip_mirror"
else
-    PIP_INSTALL="pip install -v"
+    PIP_INSTALL="pip install --use-feature=2020-resolver -v"
fi

if [ ! -f $hostfile ]; then
...
@@ -32,7 +32,7 @@ def distributed_test(world_size=2, backend='nccl'):
    def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
        """Initialize torch.distributed and execute the user function. """
        os.environ['MASTER_ADDR'] = '127.0.0.1'
-        os.environ['MASTER_PORT'] = '29500'
+        os.environ['MASTER_PORT'] = '29503'
        dist.init_process_group(backend=backend,
                                init_method='env://',
                                rank=local_rank,
...
@@ -3,6 +3,8 @@ import json
import argparse
import torch

from deepspeed.pipe import PipelineModule, LayerSpec


class SimpleModel(torch.nn.Module):
    def __init__(self, hidden_dim, empty_grad=False, rank=0):
@@ -23,6 +25,59 @@ class SimpleModel(torch.nn.Module):
        return self.cross_entropy_loss(hidden_dim, y)
class LinearStack(torch.nn.Module):
def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4):
super().__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.hidden_dim = hidden_dim
self.input_layer = VerboseLinear(in_features=self.input_dim,
out_features=self.hidden_dim)
self.layers = torch.nn.ModuleList([
torch.nn.Linear(in_features=self.hidden_dim,
out_features=self.hidden_dim,
bias=False) for x in range(num_layers)
])
self.output_layer = torch.nn.Linear(in_features=self.hidden_dim,
out_features=self.output_dim)
self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
def forward(self, x, y):
x = self.input_layer(x)
for layer in self.layers:
x = layer(x)
x = self.output_layer(x)
return x
class LinearStackPipe(PipelineModule):
def __init__(self,
input_dim=128,
hidden_dim=128,
output_dim=128,
num_layers=4,
**kwargs):
self.input_dim = input_dim
self.output_dim = output_dim
self.hidden_dim = hidden_dim
self.num_layers = num_layers
layers = []
layers.append(LayerSpec(torch.nn.Linear, self.input_dim, self.hidden_dim))
for x in range(self.num_layers):
layers.append(
LayerSpec(torch.nn.Linear,
self.hidden_dim,
self.hidden_dim,
bias=False))
layers.append(lambda x: x)
layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.output_dim))
super().__init__(layers=layers, loss_fn=torch.nn.CrossEntropyLoss(), **kwargs)
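A note on the layer list above: PipelineModule accepts a mix of LayerSpec entries and plain callables, so the bare `lambda x: x` becomes a parameter-free identity stage. As a rough, hypothetical sketch of how the two test models relate (assuming torch.distributed is already initialized, e.g. via the @distributed_test decorator used below):

# Illustrative only, not part of the commit.
dense = LinearStack()                  # ordinary module: every layer lives on this process
pipe = LinearStackPipe(num_stages=2)   # the 7 layer entries are partitioned across 2 stages

# Each rank only materializes its own stage's layers, so summing the per-rank
# parameter counts of `pipe` across ranks should match `dense`.
local_params = sum(p.numel() for p in pipe.parameters())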
class SimpleOptimizer(torch.optim.Optimizer):
    def __init__(self, params, lr=0.11072018):
        defaults = dict(lr=lr)
@@ -70,5 +125,10 @@ def args_from_dict(tmpdir, config_dict):
    args = parser.parse_args(args='')
    args.deepspeed = True
    args.deepspeed_config = config_path
    if torch.distributed.is_initialized():
        # We assume up to one full node executing unit tests
        assert torch.distributed.get_world_size() <= torch.cuda.device_count()
        args.local_rank = torch.distributed.get_rank()
    else:
        args.local_rank = 0
    return args
import torch
import torch.distributed as dist

import deepspeed
from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer
from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1
@@ -6,13 +8,16 @@ from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1
from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer

from deepspeed.runtime.pipe.topology import *
PipeTopo = PipeDataParallelTopology

import argparse
import pytest
import json
import os
import numbers
from common import distributed_test
-from simple_model import SimpleModel, random_dataloader, args_from_dict
+from simple_model import *
def compare_deepspeed_states(saved_model, loaded_model):
@@ -24,12 +29,15 @@ def compare_deepspeed_states(saved_model, loaded_model):
    assert saved_model.global_steps == loaded_model.global_steps

-def compare_model_states(saved_model, loaded_model):
+def compare_model_states(saved_model, loaded_model, compare_optimizer=True):
    compare_deepspeed_states(saved_model, loaded_model)

    for p0, p1 in zip(saved_model.module.parameters(), loaded_model.module.parameters()):
        assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}"

    if not compare_optimizer:
        return

    if isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer):
        for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups):
            assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}"
@@ -95,7 +103,8 @@ def checkpoint_correctness_verification(args,
                                        tmpdir,
                                        load_optimizer_states=False,
                                        load_lr_scheduler_states=False,
-                                        fp16=True):
+                                        fp16=True,
+                                        train_batch=False):
    dtype = torch.half if fp16 else torch.float32
    ds_model, _, _, _ = deepspeed.initialize(args=args,
                                             model=model,
@@ -105,8 +114,15 @@
                                    hidden_dim=hidden_dim,
                                    device=ds_model.device,
                                    dtype=dtype)

    if train_batch:
        # PipelineEngine.train_batch() runs forward, backward, and step internally.
        ds_model.set_dataloader(data_loader)
        for n, batch in enumerate(data_loader):
            loss = ds_model.train_batch()
    else:
        for n, batch in enumerate(data_loader):
            loss = ds_model(batch[0], batch[1])
            print(loss)
            ds_model.backward(loss)
            ds_model.step()
@@ -514,3 +530,108 @@ def test_checkpoint_fp32_optimizer(tmpdir):
        checkpoint_correctness_verification(args, model, hidden_dim, tmpdir, fp16=False)

    _test_checkpoint_fp32_optimizer(args=args, model=model, hidden_dim=hidden_dim)
@pytest.mark.parametrize("zero_stage", [0, 1])
def test_checkpoint_pipe_engine(zero_stage, tmpdir, stages=2):
config_dict = {
"train_batch_size": 2,
"train_micro_batch_size_per_gpu": 1,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 1e-5
}
},
"zero_optimization": {
"stage": zero_stage
},
"fp16": {
"enabled": zero_stage > 0
},
"scheduler": {
"type": "OneCycle",
"params": {
"cycle_first_step_size": 1000,
"cycle_first_stair_count": 500,
"cycle_second_step_size": 1000,
"cycle_second_stair_count": 500,
"decay_step_size": 1000,
"cycle_min_lr": 0.0001,
"cycle_max_lr": 0.0010,
"decay_lr_rate": 0.001,
"cycle_min_mom": 0.85,
"cycle_max_mom": 0.99,
"decay_mom_rate": 0.0
}
}
}
@distributed_test(world_size=4)
def _test(save_folder, num_stages):
args = args_from_dict(tmpdir, config_dict)
model = LinearStackPipe(num_stages=num_stages)
checkpoint_correctness_verification(args=args,
model=model,
hidden_dim=model.hidden_dim,
tmpdir=save_folder,
fp16=config_dict['fp16']['enabled'],
load_optimizer_states=True,
load_lr_scheduler_states=True,
train_batch=True)
_test(tmpdir, num_stages=stages)
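A note on the batch arithmetic in this test (inferred, not stated in the commit): with world_size=4 and num_stages=2, the remaining data-parallel degree is 4 / 2 = 2, and train_batch_size = micro_batch_per_gpu × gradient_accumulation_steps × data_parallel_degree gives 2 = 1 × 1 × 2, i.e. a single micro-batch per train_batch() call on each pipeline replica.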
@pytest.mark.parametrize("base_topo,test_topo",
[
(PipeTopo(num_pp=1,
num_dp=4),
PipeTopo(num_pp=4,
num_dp=1)),
(PipeTopo(num_pp=2,
num_dp=2),
PipeTopo(num_pp=2,
num_dp=2)),
(PipeTopo(num_pp=4,
num_dp=1),
PipeTopo(num_pp=2,
num_dp=2)),
])
def test_checkpoint_pipe_module(base_topo, test_topo, tmpdir):
@distributed_test(world_size=4)
def _test(base_topo, test_topo, save_folder):
base_model = LinearStackPipe(topology=base_topo)
base_model.save_state_dict(save_folder)
dist.barrier()
test_model = LinearStackPipe(topology=test_topo)
test_model.load_state_dir(save_folder)
        # Base and test may be partitioned into different numbers of local layers,
        # so iterate over the smaller model (A below) and look each layer up in
        # the larger one (B).
if len(base_model.forward_funcs) < len(test_model.forward_funcs):
A = base_model
B = test_model
else:
A = test_model
B = base_model
# Compare layers individually since partitions are different
for idx, A_layer in enumerate(A.forward_funcs):
if not hasattr(A_layer, 'parameters'):
# Skip functionals, etc.
continue
            # Find the corresponding layer in B by converting A's local index into a
            # global layer index and then into B's local index space. For example, if
            # A owns global layers [4, 8), its local layer 1 is global layer 5, which
            # sits at local index 5 in a B that owns layers [0, 8).
            global_idx = idx + A._local_start
            B_local_idx = global_idx - B._local_start
B_layer = B.forward_funcs[B_local_idx]
# Compare layer parameters
for p0, p1 in zip(A_layer.parameters(), B_layer.parameters()):
assert torch.allclose(p0, p1, atol=1e-07), f"Model state {p0} is not equal to {p1}"
_test(base_topo, test_topo, save_folder=tmpdir)
from deepspeed.utils import RepeatingLoader
def test_repeating_loader():
loader = [1, 2, 3]
loader = RepeatingLoader(loader)
for idx in range(50):
assert next(loader) == 1
assert next(loader) == 2
assert next(loader) == 3
import pytest
import torch
import torch.distributed as dist
from deepspeed.runtime.utils import partition_uniform
from deepspeed.runtime.utils import partition_balanced
from deepspeed.runtime.utils import prefix_sum_inc
from deepspeed.runtime.utils import PartitionedTensor
from common import distributed_test
@distributed_test(world_size=4)
def test_partitioned_tensor():
world = dist.get_world_size()
rank = dist.get_rank()
group = dist.new_group(ranks=list(range(world)))
rows = world * 4
cols = 3
full = torch.rand(rows, cols).cuda()
dist.broadcast(full, src=0, group=group)
part = PartitionedTensor(full, group=group)
assert len(part.local_size()) == 1
assert part.local_size()[0] * world == full.numel()
reconstructed = part.full()
assert torch.equal(full, reconstructed)
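A quick size check (illustrative, not part of the test): with world_size=4 the full tensor holds (4 · 4) × 3 = 48 elements, so the assertion above implies each rank's local partition holds exactly 48 / 4 = 12 of them.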
@distributed_test(world_size=4)
def test_partitioned_tensor_meta():
world = dist.get_world_size()
rank = dist.get_rank()
group = dist.new_group(ranks=list(range(world)))
rows = world * 7
cols = 3
full = torch.rand(rows, cols).cuda()
dist.broadcast(full, src=0, group=group)
part = PartitionedTensor(full, group=group)
my_meta = PartitionedTensor.from_meta(part.to_meta(), part.local_data, group)
assert torch.equal(full, my_meta.full())
def assert_valid_partition(weights, parts, P):
N = len(weights)
assert len(parts) == P + 1
assert parts[0] == 0
assert parts[P] == N
for idx in range(P):
assert parts[idx] <= parts[idx + 1]
def get_partition_weights(weights, parts):
""" Return the amount of weight in each partition. """
costs = [0] * (len(parts) - 1)
P = len(parts) - 1
for p in range(P):
start = parts[p]
stop = parts[p + 1]
costs[p] = sum(weights[start:stop])
return costs
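To make the partition format concrete, here is a small worked example using the same weights as test_int_balanced below; the values follow directly from that test's assertions.

# parts holds P + 1 boundary indices: partition p covers weights[parts[p]:parts[p + 1]].
weights = [0, 1, 2, 3, 3, 3]
parts = [0, 3, 4, 5, 6]                        # what partition_balanced(weights, P=4) asserts below
print(get_partition_weights(weights, parts))   # -> [3, 3, 3, 3]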
def test_prefix_sum():
x = [3, 4, 5]
psum = prefix_sum_inc(x)
assert psum == [3, 7, 12]
def test_valid_partition():
N = 10
P = 1
weights = [1] * N
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
def test_short_partition_uniform():
N = 2
P = 4
weights = [1] * N
parts = partition_uniform(len(weights), P)
assert_valid_partition(weights, parts, P)
def test_short_partition():
N = 2
P = 4
weights = [1] * N
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
def test_easy_balance_uniform():
weights = [1] * 8
P = 4
parts = partition_uniform(len(weights), P)
assert_valid_partition(weights, parts, P)
costs = get_partition_weights(weights, parts)
assert all(c == 2 for c in costs)
def test_easy_balance_balanced():
weights = [1] * 8
P = 4
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
costs = get_partition_weights(weights, parts)
assert all(c == 2 for c in costs), costs
def test_int_balanced():
weights = [0, 1, 2, 3, 3, 3]
P = 4
parts = partition_balanced(weights, P)
assert parts == [0, 3, 4, 5, 6]
assert_valid_partition(weights, parts, P)
costs = get_partition_weights(weights, parts)
assert all(c == 3 for c in costs)
def test_float_balanced():
weights = [0., 1.1, 1.9, 3., 3., 3.]
P = 4
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
assert parts == [0, 3, 4, 5, 6]
@pytest.mark.skip(reason="Variance-minimizing partitioning returns different result.")
def test_float_lastheavy():
weights = [0., 1.1, 1.9, 3., 30.]
P = 2
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
assert parts == [0, 4, 5]
def test_float_midheavy():
weights = [0., 1.1, 30, 3.]
P = 3
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
assert parts == [0, 2, 3, 4]
def test_balance_bert():
    # Parameters per layer for a transformer model with 24 transformer layers and hidden dim 1024
weights = [
52559872,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
0,
52559872
]
P = 8
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
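For scale (an illustrative note): these weights sum to 2 × 52,559,872 + 24 × 12,596,224 = 407,429,120, so an ideal 8-way split would carry about 50.9 M parameters per partition; the two heavy entries at the ends each exceed that average, which is exactly the kind of imbalance partition_balanced has to accommodate.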
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist
import pytest
import deepspeed
import deepspeed.runtime.utils as ds_utils
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology
PipeTopo = PipeDataParallelTopology
import deepspeed.runtime.pipe.module as PipelineModule
from deepspeed.runtime.pipe.module import LayerSpec
from common import distributed_test
def rel_diff(A, B):
return abs(A - B) / abs(A)
# All models
from simple_model import args_from_dict
class AlexNet(nn.Module):
def __init__(self, num_classes=10):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3,
64,
kernel_size=11,
stride=4,
padding=5),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2,
stride=2),
nn.Conv2d(64,
192,
kernel_size=5,
padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2,
stride=2),
nn.Conv2d(192,
384,
kernel_size=3,
padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(384,
256,
kernel_size=3,
padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256,
256,
kernel_size=3,
padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2,
stride=2),
)
self.classifier = nn.Linear(256, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
def forward(self, x, y):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
return self.loss_fn(x, y)
class AlexNetPipe(PipelineModule.PipelineModule):
def __init__(self, num_classes=10, **kwargs):
self.num_classes = num_classes
specs = [
LayerSpec(nn.Conv2d, 3, 64, kernel_size=11, stride=4, padding=5),
LayerSpec(nn.ReLU, inplace=True),
LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2),
LayerSpec(nn.Conv2d, 64, 192, kernel_size=5, padding=2),
F.relu,
LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2),
LayerSpec(nn.Conv2d, 192, 384, kernel_size=3, padding=1),
F.relu,
LayerSpec(nn.Conv2d, 384, 256, kernel_size=3, padding=1),
F.relu,
LayerSpec(nn.Conv2d, 256, 256, kernel_size=3, padding=1),
F.relu,
LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2),
lambda x: x.view(x.size(0), -1),
LayerSpec(nn.Linear, 256, self.num_classes), # classifier
]
super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
def cifar_trainset(fp16=False):
import torchvision
import torchvision.transforms as transforms
transform_list = [
transforms.ToTensor(),
transforms.Normalize((0.5,
0.5,
0.5),
(0.5,
0.5,
0.5)),
]
if fp16:
transform_list.append(torchvision.transforms.Lambda(lambda x: x.half()))
transform = transforms.Compose(transform_list)
local_rank = torch.cuda.current_device()
    # Only one rank per machine downloads; the other ranks wait at the barrier
    # below until rank 0 has populated the dataset cache, then read from it.
    dist.barrier()
    if local_rank != 0:
        dist.barrier()
    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10-data',
                                            train=True,
                                            download=True,
                                            transform=transform)
    if local_rank == 0:
        dist.barrier()
return trainset
def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, seed=123):
with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
ds_utils.set_random_seed(seed)
trainset = cifar_trainset(fp16=fp16)
args.local_rank = dist.get_rank()
engine, _, _, _ = deepspeed.initialize(
args=args,
model=model,
model_parameters=[p for p in model.parameters()],
training_data=trainset)
losses = []
for step in range(num_steps):
loss = engine.train_batch()
losses.append(loss.item())
if step % 50 == 0:
print(f'STEP={step} LOSS={loss.item()}')
if average_dp_losses:
loss_tensor = torch.tensor(losses).cuda()
dist.all_reduce(loss_tensor)
loss_tensor /= dist.get_world_size()
losses = loss_tensor.tolist()
return losses
@pytest.mark.parametrize('base_topo,test_topo',
[
(PipeTopo(num_pp=1,
num_dp=4),
PipeTopo(num_pp=2,
num_dp=2)),
(PipeTopo(num_pp=1,
num_dp=4),
PipeTopo(num_pp=4,
num_dp=1)),
])
def test_pipe_cifar10_seedlayers(base_topo, test_topo, tmpdir):
config_dict = {
"train_batch_size": 16,
"train_micro_batch_size_per_gpu": 4,
"steps_per_print": 20,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.001,
"betas": [0.9,
0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
},
"zero_optimization": {
"stage": 0
},
"fp16": {
"enabled": False
},
"pipeline": {
"seed_layers": True,
"activation_checkpoint_interval": 1
}
}
args = args_from_dict(tmpdir, config_dict)
@distributed_test(world_size=4)
def _helper(base_topo, test_topo, tmpdir, steps=500):
assert steps >= 100
base_model = AlexNetPipe(num_classes=10,
topology=base_topo,
seed_layers=config_dict['pipeline']['seed_layers'])
base_losses = train_cifar(base_model,
args,
num_steps=steps,
fp16=config_dict['fp16']['enabled'])
test_model = AlexNetPipe(num_classes=10,
topology=test_topo,
seed_layers=config_dict['pipeline']['seed_layers'])
test_losses = train_cifar(test_model,
args,
num_steps=steps,
fp16=config_dict['fp16']['enabled'])
abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)]
rel_diffs = [rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)]
if dist.get_rank() == 0:
print(
f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}'
)
print(
f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}'
)
print(
f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}'
)
for lastX in [1, 10, 100]:
base_avg = sum(base_losses[-lastX:]) / lastX
test_avg = sum(test_losses[-lastX:]) / lastX
print(
f'last-{lastX}: base={base_avg} test={test_avg} abs={base_avg - test_avg} rel={rel_diff(base_avg, test_avg)}'
)
lastX = 100
base = base_losses[-lastX:]
base_avg = sum(base) / len(base)
test = test_losses[-lastX:]
test_avg = sum(test) / len(test)
assert rel_diff(base_avg, test_avg) < 0.03
_helper(base_topo, test_topo, tmpdir)
import copy
import torch
import torch.nn as nn
import torch.distributed as dist
import pytest
import deepspeed
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology
PipeTopo = PipeDataParallelTopology
from deepspeed.pipe import PipelineModule, LayerSpec
from deepspeed.utils import RepeatingLoader
from common import distributed_test
from simple_model import args_from_dict
HIDDEN_DIM = 32
LAYERS = 8
@pytest.fixture
def sequential_model():
model = torch.nn.Sequential(
*[nn.Linear(HIDDEN_DIM,
HIDDEN_DIM) for _ in range(LAYERS)],
nn.Linear(HIDDEN_DIM,
1),
)
return model
@pytest.fixture
def simple_args(tmpdir):
config_dict = {
"train_batch_size": 1,
"train_micro_batch_size_per_gpu": 1,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.001,
"betas": [0.9,
0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
},
"pipeline": {
"activation_checkpoint_interval": 1
}
}
args = args_from_dict(tmpdir, config_dict)
return args
def test_pipe_module_sequential(sequential_model, simple_args):
batch_input = torch.randn(1, HIDDEN_DIM)
@distributed_test(world_size=4)
def _helper():
base_model = copy.deepcopy(sequential_model)
base_input = batch_input.clone().detach()
base_output = base_model(base_input)
base_output = base_output
base_params = sum(p.numel() for p in base_model.parameters())
pipe_model = copy.deepcopy(sequential_model)
pipe_model = PipelineModule(layers=pipe_model, num_stages=4)
# Ensure all parameters are accounted for.
my_params = sum(p.numel() for p in pipe_model.parameters())
total_pipe_params = torch.LongTensor([my_params]).to('cuda')
dist.all_reduce(total_pipe_params)
total_pipe_params = total_pipe_params.item()
assert total_pipe_params == base_params
pipe_model, _, _, _ = deepspeed.initialize(
args=simple_args,
model=pipe_model,
model_parameters=[p for p in pipe_model.parameters()])
        # Only the first stage (inputs) and the last stage (labels) pull from the
        # data iterator; intermediate stages receive activations from their peers.
        if pipe_model.is_first_stage or pipe_model.is_last_stage:
            pipe_input = base_input.clone().detach().to('cuda')
            # label 0 is meaningless
            dataset = [(pipe_input, 0)]
            loader = RepeatingLoader(dataset)
            data_iter = iter(loader)
        else:
            data_iter = None
pipe_output = pipe_model.eval_batch(data_iter=data_iter)
base_output = base_output.to('cpu')
pipe_output = pipe_output.to('cpu')
assert torch.allclose(base_output, pipe_output)
_helper()
This diff is collapsed.
import pytest
import deepspeed.runtime.utils as ds_utils
def test_call_to_str():
c2s = ds_utils.call_to_str
assert c2s('int') == 'int()'
assert c2s('int', 3) == 'int(3)'
assert c2s('int', 3, 'jeff') == 'int(3, \'jeff\')'
assert c2s('hello', val=3) == 'hello(val=3)'
assert c2s('hello', 1138, val=3) == 'hello(1138, val=3)'
This diff is collapsed.