Unverified Commit 63f7796a authored by Tom Birch, committed by GitHub

Multi-process pipe (#90)

Adds support for distributing pipeline stages across multiple processes (and therefore multiple machines):
* Adds a style argument to the Pipe constructor, defaulting to PipelineStyle.SingleProcess but also supporting PipelineStyle.MultiProcess (a usage sketch follows the commit metadata below)
* Adds support for lazy construction of modules (see lazy_construction for an example)
* Adds two implementations of inter-process communication: one based on rpc with globally visible queues, and one based on send/recv
* Copies all the relevant tests from tests/pipe to tests/pipe_process and modifies them to exercise PipelineStyle.MultiProcess
parent 49a198c9
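
A minimal, illustrative sketch of the new multi-process style, pieced together from the tests in this commit; the distributed process-group setup (handled by the torch_spawn helper in the tests) is assumed to already be in place, and the layer sizes and chunk count here are arbitrary:

import torch
from torch import nn

from fairscale.nn.pipe import Pipe
from tests.nn.model_parallel.commons import get_worker_map

# Two pipeline stages, one per process/rank.
model = nn.Sequential(nn.Linear(10, 10), nn.ReLU())
model = Pipe(
    model,
    [1, 1],                       # balance: one layer per stage
    style=Pipe.MultiProcess,      # new: stages run in separate processes
    worker_map=get_worker_map(),  # rank-to-worker mapping, as used by the tests below
    chunks=4,                     # number of micro-batches
)

y = model(torch.rand(8, 10))
if model.group.rank() == 1:
    # Only the last stage holds the real output and starts the backward pass.
    y.sum().backward()
else:
    # Earlier stages join the backward pass via back_helper.
    model.back_helper(y)
torch.distributed.barrier()
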
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# Copyright 2019 Kakao Brain
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from torch import nn
import torch.nn.functional as F
from fairscale.nn.pipe import Pipe
from tests.nn.model_parallel.commons import get_worker_map, torch_spawn
@torch_spawn([2])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def python_autograd_function():
    # A Python autograd function might fail with this error:
    #
    # RuntimeError: Returning Variables sharing storage with other Variables
    # that require grad is not supported in Python functions. Please submit a
    # feature request if you hit this error.
    #
    # This doesn't look like an essential restriction, but it happens on the
    # current PyTorch version. To avoid it, identity autograd functions such
    # as Wait, Fork, and Join should detach the tensor before returning it.
    torch.manual_seed(0)

    class Identity(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input):
            return input

        @staticmethod
        def backward(ctx, grad):
            return grad

    class M(nn.Module):
        def forward(self, input):
            return Identity.apply(input)

    model = nn.Sequential(M(), M())
    model = Pipe(model, [1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always").cuda()

    x = torch.rand(42)
    y = model(x)
    if model.group.rank() == 1:
        assert torch.allclose(x, y)

    torch.distributed.barrier()
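
# Illustrative sketch of the detach workaround described in the comment above:
# an identity-style autograd function (like Pipe's Wait, Fork, and Join) can
# avoid the "Returning Variables sharing storage" error by returning a
# detached alias of its input instead of the input itself.
class DetachingIdentity(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        # Detach so the output does not share autograd metadata with an
        # input that requires grad.
        return input.detach()

    @staticmethod
    def backward(ctx, grad):
        return grad
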
@torch_spawn([3])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def exception_no_hang():
    # In v0.0.2, once a failed partition received a normal message
    # (non-closing) for the next micro-batch, a hang occurred. The reason was
    # that a failed partition didn't call in_queue.task_done() on a normal
    # message, so the preceding partition was blocked at out_queue.join() for
    # the micro-batch after the next one.
    class ExpectedException(Exception):
        pass

    class Pass(nn.Module):
        def forward(self, x):
            return x

    class Raise(nn.Module):
        def forward(self, x):
            raise ExpectedException()

    model = nn.Sequential(Pass(), Pass(), Raise())
    model = Pipe(model, [1, 1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), chunks=3)

    if model.group.rank() == 2:
        with pytest.raises(ExpectedException):
            model(torch.rand(3))
    else:
        model(torch.rand(3))

    torch.distributed.barrier()
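
# Illustrative sketch (standard-library queues only) of the mechanics behind
# the hang described in the comment above: queue.Queue.join() returns only
# after task_done() has been called for every put() item, so a partition that
# drops a message without calling task_done() leaves the upstream partition
# blocked in join() forever.
def _task_done_sketch():
    import queue

    q = queue.Queue()
    q.put("micro-batch")
    q.get()
    q.task_done()  # acknowledge even messages that will not be processed
    q.join()  # returns immediately because every item was acknowledged
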
@torch_spawn([2])
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="2 cuda devices required")
def tuple_wait(cuda_sleep):
    # In v0.0.3, Wait is applied to only the first tensor of a micro-batch.
    # Under this behavior, if checkpointing was disabled, there's a possibility
    # that gradient accumulations on other tensors are not synchronized
    # properly to the copy stream.
    class Sleep(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            return x.detach()

        @staticmethod
        def backward(ctx, grad):
            with torch.cuda.device(grad.device):
                cuda_sleep(0.05)
            return grad

    class Layer1(nn.Module):
        def forward(self, pair):
            a, b = pair
            return a * 1, b * 2, b * 3

    class Layer2(nn.Module):
        def forward(self, triple):
            a, b, c = triple
            b = Sleep.apply(b)
            return a + b + c

    model = nn.Sequential(Layer1(), Layer2())
    model = Pipe(
        model,
        [1, 1],
        style=Pipe.MultiProcess,
        worker_map=get_worker_map(),
        input_device=torch.cuda.current_device(),
        chunks=32,
        checkpoint="never",
    ).cuda()

    a = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)
    b = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)

    y = model((a, b))
    if model.group.rank() == 1:
        y.norm().backward()
    else:
        model.back_helper(y)

    if model.group.rank() == 0:
        assert torch.isclose(b.grad.norm().cpu(), torch.tensor(5.000))
@torch_spawn([2])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def parallel_randoms():
    class Dropouts(nn.Module):
        def forward(self, x):
            for _ in range(100):
                x = F.dropout(x, p=0.001)
            return x

    model = nn.Sequential(Dropouts(), Dropouts())

    x = torch.rand(10, 10, requires_grad=True).cuda()
    x.retain_grad()
    model = Pipe(
        model,
        [1, 1],
        style=Pipe.MultiProcess,
        input_device=torch.cuda.current_device(),
        worker_map=get_worker_map(),
        chunks=10,
        checkpoint="always",
    ).cuda()
    y = model(x)

    tensor_list = [torch.empty_like(x) for _ in range(2)]
    if model.group.rank() == 1:
        y.norm().backward()
        torch.distributed.barrier()
        tensor_list[model.group.rank()] = y
        torch.distributed.all_gather(tensor_list, y, group=model.group)
        assert tensor_list[0].to(torch.bool).tolist() == tensor_list[1].to(torch.bool).tolist()
    else:
        model.back_helper(y)
        torch.distributed.barrier()
        tensor_list[model.group.rank()] = x.grad
        torch.distributed.all_gather(tensor_list, x.grad, group=model.group)
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# Copyright 2019 Kakao Brain
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from torch import nn
from fairscale.nn.pipe import Pipe
from tests.nn.model_parallel.commons import get_worker_map, torch_spawn
@torch_spawn([2])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def inplace_on_requires_grad():
    model = nn.Sequential(nn.Linear(1, 1), nn.ReLU(inplace=True))
    model = Pipe(model, [1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always")

    x = torch.rand(1)
    y = model(x)

    message = r"a leaf Variable that requires grad .* used in an in-place operation."
    if model.group.rank() == 1:
        with pytest.raises(RuntimeError, match=message):
            y.backward()

    torch.distributed.barrier()
@torch_spawn([1])
@pytest.mark.xfail(strict=True)
def inplace_on_not_requires_grad():
    # An in-place operation on a tensor not requiring grad doesn't cause a
    # RuntimeError. Currently, we cannot detect this case.
    model = nn.Sequential(nn.ReLU(inplace=True))
    model = Pipe(model, [1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always")

    x = torch.rand(1)
    y = model(x)
    del model

    message = r"a leaf Variable that requires grad .* used in an in-place operation."
    with pytest.raises(RuntimeError, match=message):
        y.backward()

    torch.distributed.barrier()
@torch_spawn([1])
@pytest.mark.xfail(strict=True)
def inplace_incorrect_grad():
    class M(nn.Module):
        def forward(self, foo_bar):
            # 'foo' requires grad but 'bar' does not. An in-place operation on
            # 'bar' won't cause a RuntimeError.
            foo, bar = foo_bar

            # add_(1) is not idempotent, in contrast to relu_(). If it is
            # executed multiple times, it will accumulate each difference onto
            # 'bar'.
            bar.add_(1)

            # 'bar' is still captured by checkpointing. 'foo' will get an
            # incorrect grad.
            return foo * bar

    model = nn.Sequential(M())
    model = Pipe(model, [1], style=Pipe.MultiProcess, worker_map=get_worker_map(), checkpoint="always")

    foo = torch.tensor([1.0], requires_grad=True)
    bar = torch.tensor([1.0])

    output = model((foo, bar))
    del model
    output.backward()

    # The gradient of 'foo' is d(foo*bar)/dfoo = bar, so it should be 2, but it
    # is actually 3 because bar.add_(1) was executed twice (once in the forward
    # pass and once in the checkpoint recomputation).
    assert foo.grad.item() == 2.0
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# Copyright 2019 Kakao Brain
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from torch import nn
from fairscale.nn import Pipe
from tests.nn.model_parallel.commons import get_worker_map, set_random_seed, torch_spawn
@torch_spawn([2])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def simple_linears():
    def sum_grad(parameters):
        return sum([p.grad.sum() for p in parameters if p.grad is not None])

    def zero_grad(parameters):
        for p in parameters:
            p.grad = None

    set_random_seed(12345)
    inputs = torch.rand(8, 1)
    model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4), nn.Linear(4, 2), nn.Linear(2, 1),)

    # Without Pipe
    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()

    grad_without_pipe = [
        sum_grad([*model[0].parameters(), *model[1].parameters()]),
        sum_grad([*model[2].parameters(), *model[3].parameters()]),
    ]
    ref_without_pipe = [p.grad for p in model.parameters()]

    zero_grad(model.parameters())

    # With Pipe
    model = Pipe(model, [2, 2], style=Pipe.MultiProcess, worker_map=get_worker_map(), chunks=4)

    outputs = model(inputs)
    if model.group.rank() == 1:
        loss = outputs.mean()
        loss.backward()
        grad_with_pipe = sum_grad(model.pipeline.partitions[0].parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[1])
    else:
        model.back_helper(outputs)
        grad_with_pipe = sum_grad(model.pipeline.partitions[0].parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[0])

    torch.distributed.barrier()