Single-process control via PipeRPCWrapper (#156)

Adds support for: * Reused layers (e.g. for weight sharing) * Lazily-constructed layers * Single-process control via PipeRPCWrapper * PipelineStyle.AsyncSchedule, which lays the foundation for asynchronous pipeline work by introducing an event loop for each rank/worker to process either activations or gradients as they arrive. Also added examples for multi-process and PipeRPCWrapper.

Single-process control via PipeRPCWrapper (#156)
Adds support for: * Reused layers (e.g. for weight sharing) * Lazily-constructed layers * Single-process control via PipeRPCWrapper * PipelineStyle.AsyncSchedule, which lays the foundation for asynchronous pipeline work by introducing an event loop for each rank/worker to process either activations or gradients as they arrive. Also added examples for multi-process and PipeRPCWrapper.
5d4f50fb · Tom Birch · GitHub · 543d5693 · 5d4f50fb · 5d4f50fb
Unverified Commit 5d4f50fb authored Nov 10, 2020 by Tom Birch Committed by GitHub Nov 10, 2020
18 changed files
--- a/stubs/torch/cuda/__init__.pyi
+++ b/stubs/torch/cuda/__init__.pyi
@@ -29,7 +29,7 @@ _device_t = Union[_device, int, str]
 def check_error(res: int) -> None: ...
 def device_count() -> int: ...
 def empty_cache() -> None: ...
-def synchronize(device: _device_t) -> None: ...
+def synchronize(device: Optional[_device_t]=None) -> None: ...
 def set_device(device: _device_t) -> None: ...
 def get_device_capability(device: Optional[_device_t]=...) -> Tuple[int, int]: ...
 def get_device_name(device: Optional[_device_t]=...) -> str: ...

--- a/stubs/torch/distributed/__init__.pyi
+++ b/stubs/torch/distributed/__init__.pyi
@@ -5,6 +5,7 @@ from torch import Tensor
 import datetime
 from . import rpc as rpc
+from . import distributed_c10d as distributed_c10d
 class Backend:
    GLOO: str

--- a/stubs/torch/distributed/distributed_c10d.pyi
+++ b/stubs/torch/distributed/distributed_c10d.pyi
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+from typing import Any, List, Union, Optional
+from . import ProcessGroup
+def _get_global_rank(group: ProcessGroup, rank: int) -> int: ...
--- a/stubs/torch/distributed/rpc/__init__.pyi
+++ b/stubs/torch/distributed/rpc/__init__.pyi
 # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 from typing import Union, Callable, Optional
+from torch.futures import Future
 class RRef:
@@ -17,7 +18,7 @@ def rpc_async(
    args: Optional[tuple] = None,
    kwargs: Optional[dict] = None,
    timeout=-1.0,
-) -> None:
+) -> Future:
    ...

--- a/stubs/torch/futures.pyi
+++ b/stubs/torch/futures.pyi
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+from typing import Any
+class Future:
+    def wait(self) -> Any: ...
--- a/stubs/torch/nn/__init__.pyi
+++ b/stubs/torch/nn/__init__.pyi
--- a/tests/nn/model_parallel/commons.py
+++ b/tests/nn/model_parallel/commons.py
@@ -21,6 +21,7 @@
 import functools
 import inspect
+import multiprocessing
 import os
 import random
@@ -100,17 +101,32 @@ def spawn_for_all_world_sizes(test_func, world_sizes=get_world_sizes(), args=[])
        mp.spawn(test_func, args=(world_size, *args), nprocs=world_size, join=True)
-def helper(rank, world_size, func, args):
+def worker_process(rank, world_size, func, args, error_queue):
+    """Main function for unit tests launced with torch_spawn"""
    dist_init(rank, world_size)
-    initialize_model_parallel(1, world_size)
+    kwargs = {}
+    if "OMPI_COMM_WORLD_RANK" not in os.environ:
+        kwargs["pipeline_backend"] = "gloo"
+    initialize_model_parallel(1, world_size, **kwargs)
+    try:
        func(*args)
+    except BaseException as e:
+        # If the function raises 'Skipped', this indicates pytest.skip(), so
+        # forward it to parent so we can call pytest.skip() there
+        if e.__class__.__name__ == "Skipped":
+            error_queue.put(str(e))
+            return
+        raise e
 def torch_spawn(world_sizes=None):
    if world_sizes is None:
        world_sizes = get_world_sizes()
-    def fixer(func):
+    def prepare_test(func):
+        """Function called with the test function as the argument. Generates a
+        replacement which serves as the actual test function."""
        name = func.__name__
        parameters = inspect.signature(func).parameters
@@ -128,21 +144,39 @@ def torch_spawn(world_sizes=None):
                kwargs[p] for p in parameters if p != "rank"
            )  # converting named parameters to positional parameters to pass to `spawn`
+            error_queue = multiprocessing.get_context("spawn").SimpleQueue()
            if "OMPI_COMM_WORLD_RANK" in os.environ:
+                os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
+                os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"]
+                os.environ["MASTER_ADDR"] = "localhost"
+                os.environ["MASTER_PORT"] = "10638"
                torch.distributed.init_process_group("mpi")
                world_size = torch.distributed.get_world_size()
                initialize_model_parallel(1, world_size)
                torch.cuda.set_device(torch.distributed.get_rank() % torch.cuda.device_count())
                if world_size in world_sizes:
+                    try:
                        func(*args)
+                    except BaseException as e:
+                        print(f"got exception {e} from test")
+                        import traceback
+                        print(f"{traceback.format_exc()}")
+                        raise e
                else:
                    pytest.skip(f"requested world size doesn't match current world size")
            else:
-                spawn_for_all_world_sizes(helper, world_sizes, (func, args))
+                spawn_for_all_world_sizes(worker_process, world_sizes, (func, args, error_queue))
+            if not error_queue.empty():
+                msg = error_queue.get()
+                pytest.skip(msg)
+        # Register a function with the same name, prefixed with "test_" in the
+        # calling module, so it will be picked up by pytest
        caller_module = inspect.getmodule(inspect.currentframe().f_back)
        setattr(caller_module, f"test_{name}", replacement)
        return func
-    return fixer
+    return prepare_test
--- a/tests/nn/model_parallel/test_initialize.py
+++ b/tests/nn/model_parallel/test_initialize.py
@@ -110,7 +110,7 @@ def test_adjacency(monkeypatch):
        def get_world_size(self):
            return data_parallel_size * pipeline_length * model_parallel_size
-        def new_group(self, args):
+        def new_group(self, args, backend=None):
            new_groups.append(args.copy())
            return ()

--- a/tests/nn/model_parallel/test_layers.py
+++ b/tests/nn/model_parallel/test_layers.py
@@ -436,6 +436,7 @@ def run_test_pipe(rank, world_size, skip_dist_init=False):
    model[2].weight.data = saved_weight_2
    worker_map = {i: f"Test{i}" for i in range(torch.distributed.get_world_size())}
+    style = Pipe.MultiProcess  # Pipe.AsyncSchedule
    if pipe_world_size == 2:
        print(f"actually doing pipe stuff now")
@@ -444,7 +445,7 @@ def run_test_pipe(rank, world_size, skip_dist_init=False):
        pipe_model = Pipe(
            model,
            [2, 1],
-            style=Pipe.MultiProcess,
+            style=style,
            group=pipeline_devices,
            worker_map=worker_map,
            input_device=torch.cuda.current_device(),
@@ -511,6 +512,7 @@ def run_test_pipe(rank, world_size, skip_dist_init=False):
            failed = False
            with torch.autograd.profiler.profile() as prof:
                try:
+                    if style == Pipe.MultiProcess:
                        pipe_model.back_helper(pipe_output)
                except Exception as e:
                    failed = True
@@ -527,6 +529,7 @@ def run_test_pipe(rank, world_size, skip_dist_init=False):
        pipe_model.zero_grad()
        torch.distributed.barrier()
+        pipe_model.eval()
        pipe_output = pipe_model(identity())
        updated_ref_output = forward_model(reference, target)
        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:

--- a/tests/nn/moe/test_moe_layer.py
+++ b/tests/nn/moe/test_moe_layer.py
@@ -23,16 +23,17 @@ else:
 os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "29501"
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
-    dist.init_process_group(backend=dist.Backend.MPI)
+    pass  # dist.init_process_group(backend=dist.Backend.MPI)
 def setup_module(module):
    if "OMPI_COMM_WORLD_SIZE" not in os.environ:
        dist.init_process_group(backend=BACKEND, rank=0, world_size=1)
+    else:
+        dist.init_process_group(backend=dist.Backend.MPI)
 def teardown_module(module):
-    if "OMPI_COMM_WORLD_SIZE" not in os.environ:
    torch.distributed.destroy_process_group()

--- a/tests/nn/pipe_process/conftest.py
+++ b/tests/nn/pipe_process/conftest.py
@@ -65,3 +65,7 @@ def pytest_runtest_teardown(item):
        destroy_model_parallel()
        if torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()
+        try:
+            torch.distributed.rpc.shutdown()
+        except Exception:
+            pass
--- a/tests/nn/pipe_process/skip/test_gpipe.py
+++ b/tests/nn/pipe_process/skip/test_gpipe.py
@@ -23,7 +23,7 @@ import pytest
 import torch
 from torch import nn
-from fairscale.nn.pipe import Pipe
+from fairscale.nn.pipe import LazyModule, Pipe
 from fairscale.nn.pipe.skip import pop, skippable, stash
 from fairscale.nn.pipe.skip.portal import PortalBlue, PortalCopy, PortalOrange
 from tests.nn.model_parallel.commons import get_worker_map, torch_spawn
@@ -33,10 +33,15 @@ from tests.nn.model_parallel.commons import get_worker_map, torch_spawn
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
 @pytest.mark.parametrize("balance", [[3], [1, 2], [2, 1], [1, 1, 1]], ids=["3", "1:2", "2:1", "1:1:1"])
 @pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"])
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
 @pytest.mark.skipif("OMPI_COMM_WORLD_RANK" in os.environ, reason="broken on mpi")
-def x1to3(balance, checkpoint):
+def x1to3(balance, checkpoint, pipeline_style):
    torch.manual_seed(0)
+    if pipeline_style == Pipe.AsyncSchedule and len(balance) > 1:
+        print(f"skipping yarg")
+        pytest.skip("Skip tensors NYI for AsyncSchedule")
    @skippable(stash=["1to3"])
    class Layer1(nn.Module):
        def __init__(self):
@@ -75,7 +80,7 @@ def x1to3(balance, checkpoint):
        chunks=3,
        checkpoint=checkpoint,
        input_device=torch.cuda.current_device(),
-        style=Pipe.MultiProcess,
+        style=pipeline_style,
        worker_map=get_worker_map(),
        pipelined_backward=False,
    ).cuda()
@@ -101,7 +106,11 @@ def x1to3(balance, checkpoint):
 @torch_spawn([2])
 @pytest.mark.skipif("OMPI_COMM_WORLD_RANK" in os.environ, reason="broken on mpi")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
-def none_skip():
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def none_skip(pipeline_style):
+    if pipeline_style == Pipe.AsyncSchedule:
+        pytest.skip("Skip tensors NYI for AsyncSchedule")
    @skippable(stash=["none"])
    class Stash(nn.Module):
        def forward(self, input):
@@ -119,7 +128,7 @@ def none_skip():
    model = Pipe(
        model,
        [1, 1],
-        style=Pipe.MultiProcess,
+        style=pipeline_style,
        worker_map=get_worker_map(),
        input_device=torch.cuda.current_device(),
        chunks=5,
@@ -151,7 +160,8 @@ def none_skip():
 @torch_spawn([2])
-def lazy_skippable_error():
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def lazy_skippable_error(pipeline_style):
    """Using skippable layers in combination with lazy construction is currently
    not supported, check that it raises an Exception"""
@@ -163,9 +173,13 @@ def lazy_skippable_error():
    class Layer3(nn.Linear):
        pass
-    model = [lambda: Layer1(10, 10), lambda: nn.Linear(10, 10), lambda: Layer3(10, 10)]
+    model = [
+        LazyModule(lambda: Layer1(10, 10)),
+        LazyModule(lambda: nn.Linear(10, 10)),
+        LazyModule(lambda: Layer3(10, 10)),
+    ]
    with pytest.raises(ValueError, match="Can't use Skippable layers with multi-process pipe and lazy construction"):
        Pipe(
-            model, [2, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(),
+            model, [2, 1], style=pipeline_style, worker_map=get_worker_map(),
        )
--- a/tests/nn/pipe_process/skip/test_leak.py
+++ b/tests/nn/pipe_process/skip/test_leak.py
@@ -46,9 +46,10 @@ class Pop(nn.Module):
 @torch_spawn([2])
 @pytest.mark.parametrize("train", [True, False], ids=["train", "eval"])
 @pytest.mark.parametrize("checkpoint", ["always", "except_last", "never"])
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
 @pytest.mark.skipif("OMPI_COMM_WORLD_RANK" in os.environ, reason="broken on mpi")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
-def delete_portal_tensor(train, checkpoint):
+def delete_portal_tensor(train, checkpoint, pipeline_style):
    # Without checkpointing:
    # +- Stash --+  +--- Pop ----+ - - - layers
    # | 2,blue,1 |--| 1,orange,0 | - - - tensor_life and portal function
@@ -59,6 +60,9 @@ def delete_portal_tensor(train, checkpoint):
    # | 3,blue,2 |--| 2,orange,1 |--| 1,orange,0 |--| 1,blue,0 |
    # +----------+  +------------+  +------------+  +----------+
+    if pipeline_style == Pipe.AsyncSchedule:
+        pytest.skip("Skip tensors NYI for AsyncSchedule")
    def portal_tensor_life_is(tensor_life, skip_tracker=None):
        if skip_tracker is None:
            skip_tracker = current_skip_tracker()
@@ -111,7 +115,7 @@ def delete_portal_tensor(train, checkpoint):
    model = nn.Sequential(NoPortalTensorAtBackward(), stash_, pop_)
    model = Pipe(
-        model, balance=[2, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), chunks=2, checkpoint=checkpoint,
+        model, balance=[2, 1], style=pipeline_style, worker_map=get_worker_map(), chunks=2, checkpoint=checkpoint,
    )
    input = torch.rand(10, requires_grad=True)

--- a/tests/nn/pipe_process/test_bugs.py
+++ b/tests/nn/pipe_process/test_bugs.py
@@ -28,7 +28,9 @@ from tests.nn.model_parallel.commons import get_worker_map, torch_spawn
 @torch_spawn([2])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
-def python_autograd_function():
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def python_autograd_function(pipeline_style):
+    # FIXME deadlock with Pipe.AsyncSchedule?
    # A Python autograd function might fail with this error:
    #
    #   RuntimeError: Returning Variables sharing storage with other Variables
@@ -55,7 +57,8 @@ def python_autograd_function():
            return Identity.apply(input)
    model = nn.Sequential(M(), M())
-    model = Pipe(model, [1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always").cuda()
+    model = Pipe(model, [1, 1], style=pipeline_style, worker_map=get_worker_map(), checkpoint="always").cuda()
+    model.eval()
    x = torch.rand(42)
    y = model(x)
@@ -67,7 +70,8 @@ def python_autograd_function():
 @torch_spawn([3])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
-def exception_no_hang():
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def exception_no_hang(pipeline_style):
    # In v0.0.2, once a failed partition receives a normal message
    # (non-closing) for the next micro-batch, a hang occured. The reason was
    # that a failed partition didn't call in_queue.task_done() on a normal
@@ -85,7 +89,8 @@ def exception_no_hang():
            raise ExpectedException()
    model = nn.Sequential(Pass(), Pass(), Raise())
-    model = Pipe(model, [1, 1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), chunks=3)
+    model = Pipe(model, [1, 1, 1], style=pipeline_style, worker_map=get_worker_map(), chunks=3)
+    model.eval()
    if model.group.rank() == 2:
        with pytest.raises(ExpectedException):
@@ -98,7 +103,8 @@ def exception_no_hang():
 @torch_spawn([2])
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="2 cuda devices required")
-def tuple_wait(cuda_sleep):
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def tuple_wait(cuda_sleep, pipeline_style):
    # In v0.0.3, Wait is applied to only the first tensor on a micro-batch.
    # Under this behavior, if checkpointing was disabled, there's a possibility
    # that gradient accumulations on other tensors are not synchronized
@@ -129,7 +135,7 @@ def tuple_wait(cuda_sleep):
    model = Pipe(
        model,
        [1, 1],
-        style=Pipe.MultiProcess,
+        style=pipeline_style,
        worker_map=get_worker_map(),
        input_device=torch.cuda.current_device(),
        chunks=32,
@@ -151,7 +157,8 @@ def tuple_wait(cuda_sleep):
 @torch_spawn([2])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
-def parallel_randoms():
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def parallel_randoms(pipeline_style):
    class Dropouts(nn.Module):
        def forward(self, x):
            for _ in range(100):
@@ -165,7 +172,7 @@ def parallel_randoms():
    model = Pipe(
        model,
        [1, 1],
-        style=Pipe.MultiProcess,
+        style=pipeline_style,
        input_device=torch.cuda.current_device(),
        worker_map=get_worker_map(),
        chunks=10,

--- a/tests/nn/pipe_process/test_inplace.py
+++ b/tests/nn/pipe_process/test_inplace.py
@@ -27,11 +27,17 @@ from tests.nn.model_parallel.commons import get_worker_map, torch_spawn
 @torch_spawn([2])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
-def inplace_on_requires_grad():
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def inplace_on_requires_grad(pipeline_style):
    model = nn.Sequential(nn.Linear(1, 1), nn.ReLU(inplace=True))
-    model = Pipe(model, [1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always")
+    model = Pipe(model, [1, 1], style=pipeline_style, worker_map=get_worker_map(), checkpoint="always")
    x = torch.rand(1)
+    if pipeline_style == Pipe.AsyncSchedule and model.group.rank() == 0:
+        # With AsyncSchedule, model will wait forever for gradients if not eval
+        model.eval()
    y = model(x)
    message = r"a leaf Variable that requires grad .* used in an in-place operation."
@@ -44,11 +50,12 @@ def inplace_on_requires_grad():
 @torch_spawn([1])
 @pytest.mark.xfail(strict=True)
-def inplace_on_not_requires_grad():
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def inplace_on_not_requires_grad(pipeline_style):
    # In-place operation on a tensor not requiring grad doesn't cause a
    # RuntimeError. Currently, we cannot detect this case.
    model = nn.Sequential(nn.ReLU(inplace=True))
-    model = Pipe(model, [1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always")
+    model = Pipe(model, [1], style=pipeline_style, worker_map=get_worker_map(), checkpoint="always")
    x = torch.rand(1)
    y = model(x)
@@ -63,7 +70,8 @@ def inplace_on_not_requires_grad():
 @torch_spawn([1])
 @pytest.mark.xfail(strict=True)
-def inplace_incorrect_grad():
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def inplace_incorrect_grad(pipeline_style):
    class M(nn.Module):
        def forward(self, foo_bar):
            # 'foo' requires grad but 'bar' does not. In-place operation on
@@ -80,7 +88,7 @@ def inplace_incorrect_grad():
            return foo * bar
    model = nn.Sequential(M())
-    model = Pipe(model, [1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always")
+    model = Pipe(model, [1], style=pipeline_style, worker_map=get_worker_map(), checkpoint="always")
    foo = torch.tensor([1.0], requires_grad=True)
    bar = torch.tensor([1.0])

--- a/tests/nn/pipe_process/test_pipe.py
+++ b/tests/nn/pipe_process/test_pipe.py
--- a/tests/nn/pipe_process/test_rpc.py
+++ b/tests/nn/pipe_process/test_rpc.py
+import copy
+import os
+import pytest
+import torch
+from torch import nn
+from torch.distributed import rpc
+from fairscale.nn.model_parallel.initialize import get_pipeline_parallel_group
+from fairscale.nn.pipe import PipeRPCWrapper
+from tests.nn.model_parallel.commons import get_worker_map, torch_spawn
+def init_rpc():
+    os.environ["MASTER_PORT"] = "10639"
+    init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
+    rpc.init_rpc(
+        f"Test{torch.distributed.get_rank()}",
+        rank=torch.distributed.get_rank(),
+        world_size=torch.distributed.get_world_size(),
+        backend=rpc.BackendType.TENSORPIPE,
+        rpc_backend_options=rpc.TensorPipeRpcBackendOptions(init_method=init_method),
+    )
+@torch_spawn([2])
+@pytest.mark.skipif("OMPI_COMM_WORLD_RANK" not in os.environ, reason="mpi required")
+def basic_rpc():
+    init_rpc()
+    if torch.distributed.get_rank() != 0:
+        rpc.shutdown()
+        torch.distributed.barrier()
+        return
+    model = [nn.Linear(10, 10), nn.ReLU()]
+    pipe = PipeRPCWrapper(model, [1, 1], input_device=torch.cuda.current_device(), worker_map=get_worker_map())
+    pipe.foreach_worker(register_optimizer, include_self=True)
+    inputs = torch.rand(10).cuda()
+    output = pipe(inputs)
+    loss = output.mean()
+    loss.backward()
+    pipe.foreach_worker(step_optimizer, include_self=True)
+    pipe.eval()
+    rpc.shutdown()
+    torch.distributed.barrier()
+def register_optimizer(ctx, model):
+    if len(list(model.parameters())) > 0:
+        model.optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
+    else:
+        model.optimizer = None
+def step_optimizer(ctx, model):
+    if model.optimizer:
+        model.optimizer.step()
+def check_pipe_against_reference(balance, model_constructor, checkpoint="except_last", custom_inputs=None):
+    model = model_constructor()
+    reference_model = model_constructor()
+    for src, dst in zip(model, reference_model):
+        dst.load_state_dict(copy.deepcopy(src.state_dict()))
+    reference_model = nn.Sequential(*reference_model).cuda()
+    pipe = PipeRPCWrapper(
+        model, balance, input_device=torch.cuda.current_device(), worker_map=get_worker_map(), checkpoint=checkpoint,
+    )
+    pipe.foreach_worker(register_optimizer, include_self=True)
+    register_optimizer(None, reference_model)
+    inputs = torch.rand(10).cuda()
+    target = torch.rand(10).cuda()
+    cloned = inputs.clone()
+    output = pipe(inputs)
+    ref_out = reference_model(inputs)
+    assert torch.equal(ref_out.cpu(), output.cpu())
+    for out in output, ref_out:
+        target = target.to(out.device)
+        loss = nn.MSELoss()(out, target)
+        loss.backward()
+    pipe.foreach_worker(step_optimizer, include_self=True)
+    step_optimizer(None, reference_model.cuda())
+    pipe.eval()
+    reference_model.eval()
+    final_output = pipe(inputs)
+    final_ref = reference_model(inputs.cuda())
+    assert torch.equal(final_output.cpu(), final_ref.cpu())
+@torch_spawn([3])
+@pytest.mark.skipif("OMPI_COMM_WORLD_RANK" not in os.environ, reason="mpi required")
+def rpc_optimizer():
+    init_rpc()
+    if torch.distributed.get_rank() != 0:
+        rpc.shutdown()
+        torch.distributed.barrier()
+        return
+    def model_with_reuse():
+        reused_1 = nn.Linear(10, 10)
+        return [reused_1, nn.ReLU(), reused_1, nn.ReLU(), reused_1, nn.ReLU()]
+    check_pipe_against_reference(
+        [2, 2, 2], lambda: [nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU()],
+    )
+    check_pipe_against_reference([2, 1, 1], model_with_reuse)
+    rpc.shutdown()
+    torch.distributed.barrier()
+@torch_spawn([6])
+@pytest.mark.skipif("OMPI_COMM_WORLD_RANK" not in os.environ, reason="mpi required")
+def rpc_megatron_reuse():
+    from fairscale.nn.model_parallel import layers
+    from fairscale.nn.model_parallel.initialize import destroy_model_parallel, initialize_model_parallel
+    def make_model_simple():
+        return [
+            layers.ColumnParallelLinear(10, 10),
+            nn.ReLU(),
+            layers.RowParallelLinear(10, 10),
+            nn.ReLU(),
+            layers.ColumnParallelLinear(10, 10),
+            nn.ReLU(),
+            layers.RowParallelLinear(10, 10),
+            nn.ReLU(),
+            nn.Linear(10, 10),
+            nn.ReLU(),
+        ]
+    def make_model_with_reuse():
+        column = layers.ColumnParallelLinear(10, 10)
+        row = layers.RowParallelLinear(10, 10)
+        return [
+            column,
+            nn.ReLU(),
+            row,
+            nn.ReLU(),
+            column,
+            nn.ReLU(),
+            row,
+            nn.ReLU(),
+            nn.Linear(10, 10),
+            nn.ReLU(),
+        ]
+    destroy_model_parallel()
+    torch.distributed.destroy_process_group()
+    torch.distributed.init_process_group("gloo", rank=int(os.environ["RANK"]), world_size=int(os.environ["WORLD_SIZE"]))
+    initialize_model_parallel(2, 3, model_parallel_backend="nccl", pipeline_backend="mpi")
+    init_rpc()
+    if get_pipeline_parallel_group().rank() != 0:
+        rpc.shutdown()
+        torch.distributed.barrier()
+        return
+    check_pipe_against_reference([4, 4, 2], make_model_simple, "always")
+    check_pipe_against_reference([4, 2, 2], make_model_with_reuse)
+    rpc.shutdown()
+    torch.distributed.barrier()
+@torch_spawn([3])
+@pytest.mark.skipif("OMPI_COMM_WORLD_RANK" not in os.environ, reason="mpi required")
+def rpc_reuse_in_final_stage():
+    # 'reused' and 'reused2' are located on stage 2, so the backward pass for
+    # the final stage will need to first send gradients to stage 2, then receive
+    # gradients from stage 2. This tests custom logic to handle reuse of layers
+    # in the final stage of the pipeline.
+    reused = nn.Linear(10, 10)
+    reused2 = nn.Linear(10, 10)
+    model = [
+        nn.Linear(10, 10),
+        nn.ReLU(),
+        nn.Linear(10, 10),
+        reused2,
+        nn.ReLU(),
+        reused,
+        nn.ReLU(),
+        reused,
+        reused2,
+        nn.ReLU(),
+        reused,
+        nn.ReLU(),
+    ]
+    balance = [2, 3, 4]
+    init_rpc()
+    if torch.distributed.get_rank() != 0:
+        rpc.shutdown()
+        torch.distributed.barrier()
+        return
+    pipe = PipeRPCWrapper(model, balance, worker_map=get_worker_map())
+    inputs = torch.rand(10).cuda()
+    target = torch.rand(10).cuda()
+    output = pipe(inputs)
+    nn.MSELoss()(output, target).backward()
+    output = pipe(inputs)
+    nn.MSELoss()(output, target).backward()
+    rpc.shutdown()
+    torch.distributed.barrier()
+@torch_spawn([3])
+@pytest.mark.skipif("OMPI_COMM_WORLD_RANK" not in os.environ, reason="mpi required")
+def rpc_multiple_tensors():
+    class FuseTwo(nn.Module):
+        def forward(self, left, right):
+            return left + right
+    class SplitTwo(nn.Module):
+        def forward(self, inputs):
+            return (inputs, 2 * inputs)
+@torch_spawn([2])
+@pytest.mark.skipif("OMPI_COMM_WORLD_RANK" in os.environ, reason="no mpi")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
+def construct_only_rank_zero():
+    model = [nn.Linear(10, 10), nn.ReLU()]
+    if torch.distributed.get_rank() == 0:
+        PipeRPCWrapper(model, [1, 1], worker_map=get_worker_map())
+        rpc.shutdown()
+    else:
+        # Must enter rpc loop to complete PipeRPCWrapper constructor above
+        rpc.shutdown()
+        with pytest.raises(AssertionError):
+            PipeRPCWrapper(model, [1, 1], worker_map=get_worker_map())
--- a/tests/nn/pipe_process/test_transparency.py
+++ b/tests/nn/pipe_process/test_transparency.py
@@ -27,7 +27,8 @@ from tests.nn.model_parallel.commons import get_worker_map, set_random_seed, tor
 @torch_spawn([2])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
-def simple_linears():
+@pytest.mark.parametrize("pipeline_style", [Pipe.MultiProcess, Pipe.AsyncSchedule])
+def simple_linears(pipeline_style):
    def sum_grad(parameters):
        return sum([p.grad.sum() for p in parameters if p.grad is not None])
@@ -54,19 +55,19 @@ def simple_linears():
    zero_grad(model.parameters())
    # With Pipe
-    model = Pipe(model, [2, 2], style=Pipe.MultiProcess, worker_map=get_worker_map(), chunks=4)
+    model = Pipe(model, [2, 2], style=pipeline_style, worker_map=get_worker_map(), chunks=4)
    outputs = model(inputs)
    if model.group.rank() == 1:
        loss = outputs.mean()
        loss.backward()
-        grad_with_pipe = sum_grad(model.pipeline.partitions[0].parameters())
+        grad_with_pipe = sum_grad(model.pipeline.mp_partitions[0].module.parameters())
        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[1])
    else:
        model.back_helper(outputs)
-        grad_with_pipe = sum_grad(model.pipeline.partitions[0].parameters())
+        grad_with_pipe = sum_grad(model.pipeline.mp_partitions[0].module.parameters())
        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[0])