Unverified commit 65c2f974, authored by Shaden Smith, committed by GitHub

Pipeline parallel training engine. (#392)


Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent 41db1c2f
@@ -24,6 +24,8 @@ release = '0.1.0'
master_doc = 'index'

autodoc_member_order = 'bysource'

# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
...
@@ -34,6 +34,14 @@ Transformer Kernel API
   kernel

Pipeline Parallelism
--------------------
.. toctree::
   :maxdepth: 2

   pipeline
   pipeline-extending

Indices and tables
------------------
...
Extending Pipeline Parallelism
==============================

.. automodule:: deepspeed.runtime.pipe.schedule
   :members:

Pipeline Parallelism
====================

.. automodule:: deepspeed.runtime.pipe.engine
   :members:

.. automodule:: deepspeed.runtime.pipe.topology
   :members:
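For orientation, here is a minimal sketch of how the pieces documented above fit together. It follows the same PipelineModule / LayerSpec / train_batch() pattern exercised by the tests in this commit; the layer sizes, stage count, and the `args` / `trainset` objects are illustrative assumptions (a real run also needs a distributed launch and a DeepSpeed JSON config).

import torch.nn as nn
import deepspeed
from deepspeed.pipe import PipelineModule, LayerSpec

def build_pipeline_engine(args, trainset):
    # LayerSpec defers construction, so each pipeline stage only materializes
    # the layers it owns; plain callables (e.g. lambdas) are also accepted.
    layers = [
        LayerSpec(nn.Linear, 128, 128),
        LayerSpec(nn.ReLU, inplace=True),
        LayerSpec(nn.Linear, 128, 10),
    ]
    model = PipelineModule(layers=layers,
                           loss_fn=nn.CrossEntropyLoss(),
                           num_stages=2)

    # args must carry a valid deepspeed_config (cf. args_from_dict in the tests);
    # trainset is a map-style dataset of (input, label) pairs.
    engine, _, _, _ = deepspeed.initialize(args=args,
                                           model=model,
                                           model_parameters=[p for p in model.parameters()],
                                           training_data=trainset)
    return engine

# A single call schedules forward, backward, and optimizer steps for one full
# training batch across all pipeline stages:
#     loss = engine.train_batch()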
@@ -156,7 +156,7 @@ fi
if [ "$pip_mirror" != "" ]; then
    PIP_INSTALL="pip install --use-feature=2020-resolver -v -i $pip_mirror"
else
-    PIP_INSTALL="pip install -v"
+    PIP_INSTALL="pip install --use-feature=2020-resolver -v"
fi

if [ ! -f $hostfile ]; then
...
@@ -32,7 +32,7 @@ def distributed_test(world_size=2, backend='nccl'):
    def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
        """Initialize torch.distributed and execute the user function. """
        os.environ['MASTER_ADDR'] = '127.0.0.1'
-        os.environ['MASTER_PORT'] = '29500'
+        os.environ['MASTER_PORT'] = '29503'
        dist.init_process_group(backend=backend,
                                init_method='env://',
                                rank=local_rank,
...
@@ -3,6 +3,8 @@ import json
import argparse
import torch

from deepspeed.pipe import PipelineModule, LayerSpec


class SimpleModel(torch.nn.Module):
    def __init__(self, hidden_dim, empty_grad=False, rank=0):
@@ -23,6 +25,59 @@ class SimpleModel(torch.nn.Module):
        return self.cross_entropy_loss(hidden_dim, y)
class LinearStack(torch.nn.Module):
def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4):
super().__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.hidden_dim = hidden_dim
self.input_layer = VerboseLinear(in_features=self.input_dim,
out_features=self.hidden_dim)
self.layers = torch.nn.ModuleList([
torch.nn.Linear(in_features=self.hidden_dim,
out_features=self.hidden_dim,
bias=False) for x in range(num_layers)
])
self.output_layer = torch.nn.Linear(in_features=self.hidden_dim,
out_features=self.output_dim)
self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
def forward(self, x, y):
x = self.input_layer(x)
for layer in self.layers:
x = layer(x)
x = self.output_layer(x)
return x
class LinearStackPipe(PipelineModule):
def __init__(self,
input_dim=128,
hidden_dim=128,
output_dim=128,
num_layers=4,
**kwargs):
self.input_dim = input_dim
self.output_dim = output_dim
self.hidden_dim = hidden_dim
self.num_layers = num_layers
layers = []
layers.append(LayerSpec(torch.nn.Linear, self.input_dim, self.hidden_dim))
for x in range(self.num_layers):
layers.append(
LayerSpec(torch.nn.Linear,
self.hidden_dim,
self.hidden_dim,
bias=False))
layers.append(lambda x: x)
layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.output_dim))
super().__init__(layers=layers, loss_fn=torch.nn.CrossEntropyLoss(), **kwargs)
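A note on the layer list above: PipelineModule accepts a mix of LayerSpec entries and plain callables, so the bare `lambda x: x` becomes a parameter-free identity stage. As a rough, hypothetical sketch of how the two test models relate (assuming torch.distributed is already initialized, e.g. via the @distributed_test decorator used below):

# Illustrative only, not part of the commit.
dense = LinearStack()                  # ordinary module: every layer lives on this process
pipe = LinearStackPipe(num_stages=2)   # the 7 layer entries are partitioned across 2 stages

# Each rank only materializes its own stage's layers, so summing the per-rank
# parameter counts of `pipe` across ranks should match `dense`.
local_params = sum(p.numel() for p in pipe.parameters())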
class SimpleOptimizer(torch.optim.Optimizer):
    def __init__(self, params, lr=0.11072018):
        defaults = dict(lr=lr)
@@ -70,5 +125,10 @@ def args_from_dict(tmpdir, config_dict):
    args = parser.parse_args(args='')
    args.deepspeed = True
    args.deepspeed_config = config_path
    if torch.distributed.is_initialized():
        # We assume up to one full node executing unit tests
        assert torch.distributed.get_world_size() <= torch.cuda.device_count()
        args.local_rank = torch.distributed.get_rank()
    else:
        args.local_rank = 0
    return args
import torch
import torch.distributed as dist

import deepspeed
from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer
from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1
@@ -6,13 +8,16 @@ from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1
from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer

from deepspeed.runtime.pipe.topology import *
PipeTopo = PipeDataParallelTopology

import argparse
import pytest
import json
import os
import numbers
from common import distributed_test
-from simple_model import SimpleModel, random_dataloader, args_from_dict
+from simple_model import *
def compare_deepspeed_states(saved_model, loaded_model):
@@ -24,12 +29,15 @@ def compare_deepspeed_states(saved_model, loaded_model):
    assert saved_model.global_steps == loaded_model.global_steps

-def compare_model_states(saved_model, loaded_model):
+def compare_model_states(saved_model, loaded_model, compare_optimizer=True):
    compare_deepspeed_states(saved_model, loaded_model)

    for p0, p1 in zip(saved_model.module.parameters(), loaded_model.module.parameters()):
        assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}"

    if not compare_optimizer:
        return

    if isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer):
        for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups):
            assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}"
@@ -95,7 +103,8 @@ def checkpoint_correctness_verification(args,
                                        tmpdir,
                                        load_optimizer_states=False,
                                        load_lr_scheduler_states=False,
-                                        fp16=True):
+                                        fp16=True,
+                                        train_batch=False):
    dtype = torch.half if fp16 else torch.float32
    ds_model, _, _, _ = deepspeed.initialize(args=args,
                                             model=model,
@@ -105,8 +114,15 @@
                                    hidden_dim=hidden_dim,
                                    device=ds_model.device,
                                    dtype=dtype)

    if train_batch:
        # PipelineEngine.train_batch() runs forward, backward, and step internally.
        ds_model.set_dataloader(data_loader)
        for n, batch in enumerate(data_loader):
            loss = ds_model.train_batch()
    else:
        for n, batch in enumerate(data_loader):
            loss = ds_model(batch[0], batch[1])
            print(loss)
            ds_model.backward(loss)
            ds_model.step()
@@ -514,3 +530,108 @@ def test_checkpoint_fp32_optimizer(tmpdir):
        checkpoint_correctness_verification(args, model, hidden_dim, tmpdir, fp16=False)

    _test_checkpoint_fp32_optimizer(args=args, model=model, hidden_dim=hidden_dim)
@pytest.mark.parametrize("zero_stage", [0, 1])
def test_checkpoint_pipe_engine(zero_stage, tmpdir, stages=2):
config_dict = {
"train_batch_size": 2,
"train_micro_batch_size_per_gpu": 1,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 1e-5
}
},
"zero_optimization": {
"stage": zero_stage
},
"fp16": {
"enabled": zero_stage > 0
},
"scheduler": {
"type": "OneCycle",
"params": {
"cycle_first_step_size": 1000,
"cycle_first_stair_count": 500,
"cycle_second_step_size": 1000,
"cycle_second_stair_count": 500,
"decay_step_size": 1000,
"cycle_min_lr": 0.0001,
"cycle_max_lr": 0.0010,
"decay_lr_rate": 0.001,
"cycle_min_mom": 0.85,
"cycle_max_mom": 0.99,
"decay_mom_rate": 0.0
}
}
}
@distributed_test(world_size=4)
def _test(save_folder, num_stages):
args = args_from_dict(tmpdir, config_dict)
model = LinearStackPipe(num_stages=num_stages)
checkpoint_correctness_verification(args=args,
model=model,
hidden_dim=model.hidden_dim,
tmpdir=save_folder,
fp16=config_dict['fp16']['enabled'],
load_optimizer_states=True,
load_lr_scheduler_states=True,
train_batch=True)
_test(tmpdir, num_stages=stages)
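A note on the batch arithmetic in this test (inferred, not stated in the commit): with world_size=4 and num_stages=2, the remaining data-parallel degree is 4 / 2 = 2, and train_batch_size = micro_batch_per_gpu × gradient_accumulation_steps × data_parallel_degree gives 2 = 1 × 1 × 2, i.e. a single micro-batch per train_batch() call on each pipeline replica.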
@pytest.mark.parametrize("base_topo,test_topo",
[
(PipeTopo(num_pp=1,
num_dp=4),
PipeTopo(num_pp=4,
num_dp=1)),
(PipeTopo(num_pp=2,
num_dp=2),
PipeTopo(num_pp=2,
num_dp=2)),
(PipeTopo(num_pp=4,
num_dp=1),
PipeTopo(num_pp=2,
num_dp=2)),
])
def test_checkpoint_pipe_module(base_topo, test_topo, tmpdir):
@distributed_test(world_size=4)
def _test(base_topo, test_topo, save_folder):
base_model = LinearStackPipe(topology=base_topo)
base_model.save_state_dict(save_folder)
dist.barrier()
test_model = LinearStackPipe(topology=test_topo)
test_model.load_state_dir(save_folder)
        # Base and test may be partitioned into different numbers of local layers,
        # so iterate over the smaller model (A below) and look each layer up in
        # the larger one (B).
if len(base_model.forward_funcs) < len(test_model.forward_funcs):
A = base_model
B = test_model
else:
A = test_model
B = base_model
# Compare layers individually since partitions are different
for idx, A_layer in enumerate(A.forward_funcs):
if not hasattr(A_layer, 'parameters'):
# Skip functionals, etc.
continue
            # Find the corresponding layer in B by converting A's local index into a
            # global layer index and then into B's local index space. For example, if
            # A owns global layers [4, 8), its local layer 1 is global layer 5, which
            # sits at local index 5 in a B that owns layers [0, 8).
            global_idx = idx + A._local_start
            B_local_idx = global_idx - B._local_start
B_layer = B.forward_funcs[B_local_idx]
# Compare layer parameters
for p0, p1 in zip(A_layer.parameters(), B_layer.parameters()):
assert torch.allclose(p0, p1, atol=1e-07), f"Model state {p0} is not equal to {p1}"
_test(base_topo, test_topo, save_folder=tmpdir)
from deepspeed.utils import RepeatingLoader
def test_repeating_loader():
loader = [1, 2, 3]
loader = RepeatingLoader(loader)
for idx in range(50):
assert next(loader) == 1
assert next(loader) == 2
assert next(loader) == 3
import pytest
import torch
import torch.distributed as dist
from deepspeed.runtime.utils import partition_uniform
from deepspeed.runtime.utils import partition_balanced
from deepspeed.runtime.utils import prefix_sum_inc
from deepspeed.runtime.utils import PartitionedTensor
from common import distributed_test
@distributed_test(world_size=4)
def test_partitioned_tensor():
world = dist.get_world_size()
rank = dist.get_rank()
group = dist.new_group(ranks=list(range(world)))
rows = world * 4
cols = 3
full = torch.rand(rows, cols).cuda()
dist.broadcast(full, src=0, group=group)
part = PartitionedTensor(full, group=group)
assert len(part.local_size()) == 1
assert part.local_size()[0] * world == full.numel()
reconstructed = part.full()
assert torch.equal(full, reconstructed)
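A quick size check (illustrative, not part of the test): with world_size=4 the full tensor holds (4 · 4) × 3 = 48 elements, so the assertion above implies each rank's local partition holds exactly 48 / 4 = 12 of them.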
@distributed_test(world_size=4)
def test_partitioned_tensor_meta():
world = dist.get_world_size()
rank = dist.get_rank()
group = dist.new_group(ranks=list(range(world)))
rows = world * 7
cols = 3
full = torch.rand(rows, cols).cuda()
dist.broadcast(full, src=0, group=group)
part = PartitionedTensor(full, group=group)
my_meta = PartitionedTensor.from_meta(part.to_meta(), part.local_data, group)
assert torch.equal(full, my_meta.full())
def assert_valid_partition(weights, parts, P):
N = len(weights)
assert len(parts) == P + 1
assert parts[0] == 0
assert parts[P] == N
for idx in range(P):
assert parts[idx] <= parts[idx + 1]
def get_partition_weights(weights, parts):
""" Return the amount of weight in each partition. """
costs = [0] * (len(parts) - 1)
P = len(parts) - 1
for p in range(P):
start = parts[p]
stop = parts[p + 1]
costs[p] = sum(weights[start:stop])
return costs
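To make the partition format concrete, here is a small worked example using the same weights as test_int_balanced below; the values follow directly from that test's assertions.

# parts holds P + 1 boundary indices: partition p covers weights[parts[p]:parts[p + 1]].
weights = [0, 1, 2, 3, 3, 3]
parts = [0, 3, 4, 5, 6]                        # what partition_balanced(weights, P=4) asserts below
print(get_partition_weights(weights, parts))   # -> [3, 3, 3, 3]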
def test_prefix_sum():
x = [3, 4, 5]
psum = prefix_sum_inc(x)
assert psum == [3, 7, 12]
def test_valid_partition():
N = 10
P = 1
weights = [1] * N
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
def test_short_partition_uniform():
N = 2
P = 4
weights = [1] * N
parts = partition_uniform(len(weights), P)
assert_valid_partition(weights, parts, P)
def test_short_partition():
N = 2
P = 4
weights = [1] * N
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
def test_easy_balance_uniform():
weights = [1] * 8
P = 4
parts = partition_uniform(len(weights), P)
assert_valid_partition(weights, parts, P)
costs = get_partition_weights(weights, parts)
assert all(c == 2 for c in costs)
def test_easy_balance_balanced():
weights = [1] * 8
P = 4
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
costs = get_partition_weights(weights, parts)
assert all(c == 2 for c in costs), costs
def test_int_balanced():
weights = [0, 1, 2, 3, 3, 3]
P = 4
parts = partition_balanced(weights, P)
assert parts == [0, 3, 4, 5, 6]
assert_valid_partition(weights, parts, P)
costs = get_partition_weights(weights, parts)
assert all(c == 3 for c in costs)
def test_float_balanced():
weights = [0., 1.1, 1.9, 3., 3., 3.]
P = 4
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
assert parts == [0, 3, 4, 5, 6]
@pytest.mark.skip(reason="Variance-minimizing partitioning returns different result.")
def test_float_lastheavy():
weights = [0., 1.1, 1.9, 3., 30.]
P = 2
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
assert parts == [0, 4, 5]
def test_float_midheavy():
weights = [0., 1.1, 30, 3.]
P = 3
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
assert parts == [0, 2, 3, 4]
def test_balance_bert():
    # Parameters per layer for a transformer model with 24 transformer layers and hidden dim 1024
weights = [
52559872,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
12596224,
0,
52559872
]
P = 8
parts = partition_balanced(weights, P)
assert_valid_partition(weights, parts, P)
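For scale (an illustrative note): these weights sum to 2 × 52,559,872 + 24 × 12,596,224 = 407,429,120, so an ideal 8-way split would carry about 50.9 M parameters per partition; the two heavy entries at the ends each exceed that average, which is exactly the kind of imbalance partition_balanced has to accommodate.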
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist
import pytest
import deepspeed
import deepspeed.runtime.utils as ds_utils
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology
PipeTopo = PipeDataParallelTopology
import deepspeed.runtime.pipe.module as PipelineModule
from deepspeed.runtime.pipe.module import LayerSpec
from common import distributed_test
def rel_diff(A, B):
return abs(A - B) / abs(A)
# All models
from simple_model import args_from_dict
class AlexNet(nn.Module):
def __init__(self, num_classes=10):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3,
64,
kernel_size=11,
stride=4,
padding=5),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2,
stride=2),
nn.Conv2d(64,
192,
kernel_size=5,
padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2,
stride=2),
nn.Conv2d(192,
384,
kernel_size=3,
padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(384,
256,
kernel_size=3,
padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256,
256,
kernel_size=3,
padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2,
stride=2),
)
self.classifier = nn.Linear(256, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
def forward(self, x, y):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
return self.loss_fn(x, y)
class AlexNetPipe(PipelineModule.PipelineModule):
def __init__(self, num_classes=10, **kwargs):
self.num_classes = num_classes
specs = [
LayerSpec(nn.Conv2d, 3, 64, kernel_size=11, stride=4, padding=5),
LayerSpec(nn.ReLU, inplace=True),
LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2),
LayerSpec(nn.Conv2d, 64, 192, kernel_size=5, padding=2),
F.relu,
LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2),
LayerSpec(nn.Conv2d, 192, 384, kernel_size=3, padding=1),
F.relu,
LayerSpec(nn.Conv2d, 384, 256, kernel_size=3, padding=1),
F.relu,
LayerSpec(nn.Conv2d, 256, 256, kernel_size=3, padding=1),
F.relu,
LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2),
lambda x: x.view(x.size(0), -1),
LayerSpec(nn.Linear, 256, self.num_classes), # classifier
]
super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
def cifar_trainset(fp16=False):
import torchvision
import torchvision.transforms as transforms
transform_list = [
transforms.ToTensor(),
transforms.Normalize((0.5,
0.5,
0.5),
(0.5,
0.5,
0.5)),
]
if fp16:
transform_list.append(torchvision.transforms.Lambda(lambda x: x.half()))
transform = transforms.Compose(transform_list)
local_rank = torch.cuda.current_device()
    # Only one rank per machine downloads; the other ranks wait at the barrier
    # below until rank 0 has populated the dataset cache, then read from it.
    dist.barrier()
    if local_rank != 0:
        dist.barrier()
    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10-data',
                                            train=True,
                                            download=True,
                                            transform=transform)
    if local_rank == 0:
        dist.barrier()
return trainset
def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, seed=123):
with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
ds_utils.set_random_seed(seed)
trainset = cifar_trainset(fp16=fp16)
args.local_rank = dist.get_rank()
engine, _, _, _ = deepspeed.initialize(
args=args,
model=model,
model_parameters=[p for p in model.parameters()],
training_data=trainset)
losses = []
for step in range(num_steps):
loss = engine.train_batch()
losses.append(loss.item())
if step % 50 == 0:
print(f'STEP={step} LOSS={loss.item()}')
if average_dp_losses:
loss_tensor = torch.tensor(losses).cuda()
dist.all_reduce(loss_tensor)
loss_tensor /= dist.get_world_size()
losses = loss_tensor.tolist()
return losses
@pytest.mark.parametrize('base_topo,test_topo',
[
(PipeTopo(num_pp=1,
num_dp=4),
PipeTopo(num_pp=2,
num_dp=2)),
(PipeTopo(num_pp=1,
num_dp=4),
PipeTopo(num_pp=4,
num_dp=1)),
])
def test_pipe_cifar10_seedlayers(base_topo, test_topo, tmpdir):
config_dict = {
"train_batch_size": 16,
"train_micro_batch_size_per_gpu": 4,
"steps_per_print": 20,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.001,
"betas": [0.9,
0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
},
"zero_optimization": {
"stage": 0
},
"fp16": {
"enabled": False
},
"pipeline": {
"seed_layers": True,
"activation_checkpoint_interval": 1
}
}
args = args_from_dict(tmpdir, config_dict)
@distributed_test(world_size=4)
def _helper(base_topo, test_topo, tmpdir, steps=500):
assert steps >= 100
base_model = AlexNetPipe(num_classes=10,
topology=base_topo,
seed_layers=config_dict['pipeline']['seed_layers'])
base_losses = train_cifar(base_model,
args,
num_steps=steps,
fp16=config_dict['fp16']['enabled'])
test_model = AlexNetPipe(num_classes=10,
topology=test_topo,
seed_layers=config_dict['pipeline']['seed_layers'])
test_losses = train_cifar(test_model,
args,
num_steps=steps,
fp16=config_dict['fp16']['enabled'])
abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)]
rel_diffs = [rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)]
if dist.get_rank() == 0:
print(
f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}'
)
print(
f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}'
)
print(
f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}'
)
for lastX in [1, 10, 100]:
base_avg = sum(base_losses[-lastX:]) / lastX
test_avg = sum(test_losses[-lastX:]) / lastX
print(
f'last-{lastX}: base={base_avg} test={test_avg} abs={base_avg - test_avg} rel={rel_diff(base_avg, test_avg)}'
)
lastX = 100
base = base_losses[-lastX:]
base_avg = sum(base) / len(base)
test = test_losses[-lastX:]
test_avg = sum(test) / len(test)
assert rel_diff(base_avg, test_avg) < 0.03
_helper(base_topo, test_topo, tmpdir)
import copy
import torch
import torch.nn as nn
import torch.distributed as dist
import pytest
import deepspeed
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology
PipeTopo = PipeDataParallelTopology
from deepspeed.pipe import PipelineModule, LayerSpec
from deepspeed.utils import RepeatingLoader
from common import distributed_test
from simple_model import args_from_dict
HIDDEN_DIM = 32
LAYERS = 8
@pytest.fixture
def sequential_model():
model = torch.nn.Sequential(
*[nn.Linear(HIDDEN_DIM,
HIDDEN_DIM) for _ in range(LAYERS)],
nn.Linear(HIDDEN_DIM,
1),
)
return model
@pytest.fixture
def simple_args(tmpdir):
config_dict = {
"train_batch_size": 1,
"train_micro_batch_size_per_gpu": 1,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.001,
"betas": [0.9,
0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
},
"pipeline": {
"activation_checkpoint_interval": 1
}
}
args = args_from_dict(tmpdir, config_dict)
return args
def test_pipe_module_sequential(sequential_model, simple_args):
batch_input = torch.randn(1, HIDDEN_DIM)
@distributed_test(world_size=4)
def _helper():
base_model = copy.deepcopy(sequential_model)
base_input = batch_input.clone().detach()
base_output = base_model(base_input)
base_output = base_output
base_params = sum(p.numel() for p in base_model.parameters())
pipe_model = copy.deepcopy(sequential_model)
pipe_model = PipelineModule(layers=pipe_model, num_stages=4)
# Ensure all parameters are accounted for.
my_params = sum(p.numel() for p in pipe_model.parameters())
total_pipe_params = torch.LongTensor([my_params]).to('cuda')
dist.all_reduce(total_pipe_params)
total_pipe_params = total_pipe_params.item()
assert total_pipe_params == base_params
pipe_model, _, _, _ = deepspeed.initialize(
args=simple_args,
model=pipe_model,
model_parameters=[p for p in pipe_model.parameters()])
        # Only the first stage (inputs) and the last stage (labels) pull from the
        # data iterator; intermediate stages receive activations from their peers.
        if pipe_model.is_first_stage or pipe_model.is_last_stage:
            pipe_input = base_input.clone().detach().to('cuda')
            # label 0 is meaningless
            dataset = [(pipe_input, 0)]
            loader = RepeatingLoader(dataset)
            data_iter = iter(loader)
        else:
            data_iter = None
pipe_output = pipe_model.eval_batch(data_iter=data_iter)
base_output = base_output.to('cpu')
pipe_output = pipe_output.to('cpu')
assert torch.allclose(base_output, pipe_output)
_helper()
This diff is collapsed.
import pytest
import deepspeed.runtime.utils as ds_utils
def test_call_to_str():
c2s = ds_utils.call_to_str
assert c2s('int') == 'int()'
assert c2s('int', 3) == 'int(3)'
assert c2s('int', 3, 'jeff') == 'int(3, \'jeff\')'
assert c2s('hello', val=3) == 'hello(val=3)'
assert c2s('hello', 1138, val=3) == 'hello(1138, val=3)'
This diff is collapsed.