Unverified Commit 63f7796a authored by Tom Birch, committed by GitHub

Multi-process pipe (#90)

Adds support for distributing pipeline stages across multiple processes (and therefore multiple machines):
* Adds a style argument to the Pipe constructor, defaulting to PipelineStyle.SingleProcess but also supporting PipelineStyle.MultiProcess (a usage sketch follows the commit metadata below)
* Adds support for lazy construction of modules (see lazy_construction for an example)
* Adds two implementations of inter-process communication: one based on rpc with globally visible queues, and one based on send/recv
* Copies all the relevant tests from tests/pipe to tests/pipe_process and modifies them to exercise PipelineStyle.MultiProcess
parent 49a198c9
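
A minimal, illustrative sketch of the new multi-process style, pieced together from the tests in this commit; the distributed process-group setup (handled by the torch_spawn helper in the tests) is assumed to already be in place, and the layer sizes and chunk count here are arbitrary:

import torch
from torch import nn

from fairscale.nn.pipe import Pipe
from tests.nn.model_parallel.commons import get_worker_map

# Two pipeline stages, one per process/rank.
model = nn.Sequential(nn.Linear(10, 10), nn.ReLU())
model = Pipe(
    model,
    [1, 1],                       # balance: one layer per stage
    style=Pipe.MultiProcess,      # new: stages run in separate processes
    worker_map=get_worker_map(),  # rank-to-worker mapping, as used by the tests below
    chunks=4,                     # number of micro-batches
)

y = model(torch.rand(8, 10))
if model.group.rank() == 1:
    # Only the last stage holds the real output and starts the backward pass.
    y.sum().backward()
else:
    # Earlier stages join the backward pass via back_helper.
    model.back_helper(y)
torch.distributed.barrier()
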
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# Copyright 2019 Kakao Brain
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from torch import nn
import torch.nn.functional as F
from fairscale.nn.pipe import Pipe
from tests.nn.model_parallel.commons import get_worker_map, torch_spawn
@torch_spawn([2])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def python_autograd_function():
    # A Python autograd function might fail with this error:
    #
    # RuntimeError: Returning Variables sharing storage with other Variables
    # that require grad is not supported in Python functions. Please submit a
    # feature request if you hit this error.
    #
    # This doesn't look like an essential restriction, but it happens on the
    # current PyTorch version. To avoid it, identity autograd functions such
    # as Wait, Fork, and Join should detach the tensor before returning it.
    torch.manual_seed(0)

    class Identity(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input):
            return input

        @staticmethod
        def backward(ctx, grad):
            return grad

    class M(nn.Module):
        def forward(self, input):
            return Identity.apply(input)

    model = nn.Sequential(M(), M())
    model = Pipe(model, [1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always").cuda()

    x = torch.rand(42)
    y = model(x)
    if model.group.rank() == 1:
        assert torch.allclose(x, y)

    torch.distributed.barrier()
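
# Illustrative sketch of the detach workaround described in the comment above:
# an identity-style autograd function (like Pipe's Wait, Fork, and Join) can
# avoid the "Returning Variables sharing storage" error by returning a
# detached alias of its input instead of the input itself.
class DetachingIdentity(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        # Detach so the output does not share autograd metadata with an
        # input that requires grad.
        return input.detach()

    @staticmethod
    def backward(ctx, grad):
        return grad
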
@torch_spawn([3])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def exception_no_hang():
    # In v0.0.2, once a failed partition received a normal message
    # (non-closing) for the next micro-batch, a hang occurred. The reason was
    # that a failed partition didn't call in_queue.task_done() on a normal
    # message, so the preceding partition was blocked at out_queue.join() for
    # the micro-batch after the next one.
    class ExpectedException(Exception):
        pass

    class Pass(nn.Module):
        def forward(self, x):
            return x

    class Raise(nn.Module):
        def forward(self, x):
            raise ExpectedException()

    model = nn.Sequential(Pass(), Pass(), Raise())
    model = Pipe(model, [1, 1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), chunks=3)

    if model.group.rank() == 2:
        with pytest.raises(ExpectedException):
            model(torch.rand(3))
    else:
        model(torch.rand(3))

    torch.distributed.barrier()
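
# Illustrative sketch (standard-library queues only) of the mechanics behind
# the hang described in the comment above: queue.Queue.join() returns only
# after task_done() has been called for every put() item, so a partition that
# drops a message without calling task_done() leaves the upstream partition
# blocked in join() forever.
def _task_done_sketch():
    import queue

    q = queue.Queue()
    q.put("micro-batch")
    q.get()
    q.task_done()  # acknowledge even messages that will not be processed
    q.join()  # returns immediately because every item was acknowledged
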
@torch_spawn([2])
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="2 cuda devices required")
def tuple_wait(cuda_sleep):
    # In v0.0.3, Wait is applied to only the first tensor of a micro-batch.
    # Under this behavior, if checkpointing was disabled, there's a possibility
    # that gradient accumulations on other tensors are not synchronized
    # properly to the copy stream.
    class Sleep(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            return x.detach()

        @staticmethod
        def backward(ctx, grad):
            with torch.cuda.device(grad.device):
                cuda_sleep(0.05)
            return grad

    class Layer1(nn.Module):
        def forward(self, pair):
            a, b = pair
            return a * 1, b * 2, b * 3

    class Layer2(nn.Module):
        def forward(self, triple):
            a, b, c = triple
            b = Sleep.apply(b)
            return a + b + c

    model = nn.Sequential(Layer1(), Layer2())
    model = Pipe(
        model,
        [1, 1],
        style=Pipe.MultiProcess,
        worker_map=get_worker_map(),
        input_device=torch.cuda.current_device(),
        chunks=32,
        checkpoint="never",
    ).cuda()

    a = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)
    b = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)

    y = model((a, b))
    if model.group.rank() == 1:
        y.norm().backward()
    else:
        model.back_helper(y)

    if model.group.rank() == 0:
        assert torch.isclose(b.grad.norm().cpu(), torch.tensor(5.000))
@torch_spawn([2])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def parallel_randoms():
    class Dropouts(nn.Module):
        def forward(self, x):
            for _ in range(100):
                x = F.dropout(x, p=0.001)
            return x

    model = nn.Sequential(Dropouts(), Dropouts())

    x = torch.rand(10, 10, requires_grad=True).cuda()
    x.retain_grad()
    model = Pipe(
        model,
        [1, 1],
        style=Pipe.MultiProcess,
        input_device=torch.cuda.current_device(),
        worker_map=get_worker_map(),
        chunks=10,
        checkpoint="always",
    ).cuda()
    y = model(x)

    tensor_list = [torch.empty_like(x) for _ in range(2)]
    if model.group.rank() == 1:
        y.norm().backward()
        torch.distributed.barrier()
        tensor_list[model.group.rank()] = y
        torch.distributed.all_gather(tensor_list, y, group=model.group)
        assert tensor_list[0].to(torch.bool).tolist() == tensor_list[1].to(torch.bool).tolist()
    else:
        model.back_helper(y)
        torch.distributed.barrier()
        tensor_list[model.group.rank()] = x.grad
        torch.distributed.all_gather(tensor_list, x.grad, group=model.group)
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# Copyright 2019 Kakao Brain
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from torch import nn
from fairscale.nn.pipe import Pipe
from tests.nn.model_parallel.commons import get_worker_map, torch_spawn
@torch_spawn([2])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def inplace_on_requires_grad():
    model = nn.Sequential(nn.Linear(1, 1), nn.ReLU(inplace=True))
    model = Pipe(model, [1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always")

    x = torch.rand(1)
    y = model(x)

    message = r"a leaf Variable that requires grad .* used in an in-place operation."
    if model.group.rank() == 1:
        with pytest.raises(RuntimeError, match=message):
            y.backward()

    torch.distributed.barrier()
@torch_spawn([1])
@pytest.mark.xfail(strict=True)
def inplace_on_not_requires_grad():
    # An in-place operation on a tensor not requiring grad doesn't cause a
    # RuntimeError. Currently, we cannot detect this case.
    model = nn.Sequential(nn.ReLU(inplace=True))
    model = Pipe(model, [1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always")

    x = torch.rand(1)
    y = model(x)
    del model

    message = r"a leaf Variable that requires grad .* used in an in-place operation."
    with pytest.raises(RuntimeError, match=message):
        y.backward()

    torch.distributed.barrier()
@torch_spawn([1])
@pytest.mark.xfail(strict=True)
def inplace_incorrect_grad():
    class M(nn.Module):
        def forward(self, foo_bar):
            # 'foo' requires grad but 'bar' does not. An in-place operation on
            # 'bar' won't cause a RuntimeError.
            foo, bar = foo_bar

            # add_(1) is not idempotent, in contrast to relu_(). If it is
            # executed multiple times, it will accumulate each difference onto
            # 'bar'.
            bar.add_(1)

            # 'bar' is still captured by checkpointing. 'foo' will get an
            # incorrect grad.
            return foo * bar

    model = nn.Sequential(M())
    model = Pipe(model, [1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always")

    foo = torch.tensor([1.0], requires_grad=True)
    bar = torch.tensor([1.0])

    output = model((foo, bar))
    del model
    output.backward()

    # The gradient of 'foo' is d(foo*bar)/dfoo = bar, so it should be 2, but it
    # is actually 3 because bar.add_(1) was executed twice (once in the forward
    # pass and once in the checkpoint recomputation).
    assert foo.grad.item() == 2.0
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# Copyright 2019 Kakao Brain
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from torch import nn
from fairscale.nn import Pipe
from tests.nn.model_parallel.commons import get_worker_map, set_random_seed, torch_spawn
@torch_spawn([2])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def simple_linears():
    def sum_grad(parameters):
        return sum([p.grad.sum() for p in parameters if p.grad is not None])

    def zero_grad(parameters):
        for p in parameters:
            p.grad = None

    set_random_seed(12345)
    inputs = torch.rand(8, 1)
    model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4), nn.Linear(4, 2), nn.Linear(2, 1),)

    # Without Pipe
    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()

    grad_without_pipe = [
        sum_grad([*model[0].parameters(), *model[1].parameters()]),
        sum_grad([*model[2].parameters(), *model[3].parameters()]),
    ]
    ref_without_pipe = [p.grad for p in model.parameters()]

    zero_grad(model.parameters())

    # With Pipe
    model = Pipe(model, [2, 2], style=Pipe.MultiProcess, worker_map=get_worker_map(), chunks=4)

    outputs = model(inputs)
    if model.group.rank() == 1:
        loss = outputs.mean()
        loss.backward()
        grad_with_pipe = sum_grad(model.pipeline.partitions[0].parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[1])
    else:
        model.back_helper(outputs)
        grad_with_pipe = sum_grad(model.pipeline.partitions[0].parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[0])

    torch.distributed.barrier()