# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. # # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. # Copyright 2019 Kakao Brain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import pytest import torch from torch import nn from fairscale.nn.pipe import Pipe from fairscale.nn.pipe.skip import pop, skippable, stash from fairscale.nn.pipe.skip.portal import PortalBlue, PortalCopy, PortalOrange from tests.nn.model_parallel.commons import get_worker_map, torch_spawn @torch_spawn([3]) @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required") @pytest.mark.parametrize("balance", [[3], [1, 2], [2, 1], [1, 1, 1]], ids=["3", "1:2", "2:1", "1:1:1"]) @pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"]) @pytest.mark.skipif("OMPI_COMM_WORLD_RANK" in os.environ, reason="broken on mpi") def x1to3(balance, checkpoint): torch.manual_seed(0) @skippable(stash=["1to3"]) class Layer1(nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv2d(3, 3, 1) def forward(self, input): yield stash("1to3", input) output = self.conv(input) return output class Layer2(nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv2d(3, 3, 1) def forward(self, input): output = self.conv(input) return output @skippable(pop=["1to3"]) class Layer3(nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv2d(3, 3, 1) def forward(self, input): skip_1to3 = yield pop("1to3") output = self.conv(input) + skip_1to3 return output model = nn.Sequential(Layer1(), Layer2(), Layer3()) model = Pipe( model, balance, chunks=3, checkpoint=checkpoint, input_device=torch.cuda.current_device(), style=Pipe.MultiProcess, worker_map=get_worker_map(), pipelined_backward=False, ).cuda() input = torch.rand(30, 3, 224, 224, requires_grad=True).cuda() input.retain_grad() output = model(input) if model.group.rank() == len(balance) - 1: loss = output.mean() loss.backward() elif model.group.rank() < len(balance) - 1: model.back_helper(output) if model.group.rank() == len(balance) - 1: # TODO(tom) the single-process test uses 2e-1 but for some reason # mutli-process is more noisy, need to investigate why assert torch.allclose(output.norm(), torch.tensor(1039.0).cuda(), atol=4e-1) if model.group.rank() == 0: assert torch.allclose(input.grad.norm(), torch.tensor(0.0004533053).cuda()) torch.distributed.barrier() @torch_spawn([2]) @pytest.mark.skipif("OMPI_COMM_WORLD_RANK" in os.environ, reason="broken on mpi") @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required") def none_skip(): @skippable(stash=["none"]) class Stash(nn.Module): def forward(self, input): yield stash("none", None) return input @skippable(pop=["none"]) class Pop(nn.Module): def forward(self, input): none = yield pop("none") assert none is None return input model = nn.Sequential(Stash(), Pop()) model = Pipe( model, [1, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), input_device=torch.cuda.current_device(), chunks=5, ).cuda() input = torch.rand(10, requires_grad=True).cuda() input.retain_grad() output = model(input) def assert_grad_fn_is_not_portal(grad_fn, visited=set()): if grad_fn in visited or grad_fn is None: return assert not isinstance(grad_fn, PortalBlue._backward_cls) assert not isinstance(grad_fn, PortalCopy._backward_cls) assert not isinstance(grad_fn, PortalOrange._backward_cls) visited.add(grad_fn) for next_grad_fn, _ in grad_fn.next_functions: assert_grad_fn_is_not_portal(next_grad_fn, visited) if model.group.rank() == 1: assert_grad_fn_is_not_portal(output.grad_fn) output.sum().backward() else: model.back_helper(output) assert input.grad.mean().item() == 1 @torch_spawn([2]) def lazy_skippable_error(): """Using skippable layers in combination with lazy construction is currently not supported, check that it raises an Exception""" @skippable(stash=["1to3"]) class Layer1(nn.Linear): pass @skippable(pop=["1to3"]) class Layer3(nn.Linear): pass model = [lambda: Layer1(10, 10), lambda: nn.Linear(10, 10), lambda: Layer3(10, 10)] with pytest.raises(ValueError, match="Can't use Skippable layers with multi-process pipe and lazy construction"): Pipe( model, [2, 1], style=Pipe.MultiProcess, worker_map=get_worker_map(), )