# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

# Copyright 2019 Kakao Brain
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import pytest
import torch
from torch import nn

from fairscale.nn.pipe import AsyncPipe, LazyModule, MultiProcessPipe
from fairscale.nn.pipe.skip import pop, skippable, stash
from fairscale.nn.pipe.skip.portal import PortalBlue, PortalCopy, PortalOrange
from fairscale.utils.testing import get_worker_map, torch_spawn


@torch_spawn([3])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
@pytest.mark.parametrize("balance", [[3], [1, 2], [2, 1], [1, 1, 1]], ids=["3", "1:2", "2:1", "1:1:1"])
@pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"])
@pytest.mark.parametrize("pipe_class", [MultiProcessPipe, AsyncPipe])
@pytest.mark.skipif("OMPI_COMM_WORLD_RANK" in os.environ, reason="broken on mpi")
def x1to3(balance, checkpoint, pipe_class):
    """Stash a skip tensor in the first layer and pop it in the third, then
    check that the skip connection survives every tested partitioning of the
    model across ranks: the last rank verifies the output norm and rank 0
    verifies the input-gradient norm against reference values.
    """
    torch.manual_seed(0)

    if pipe_class == AsyncPipe and len(balance) > 1:
        # Removed leftover debug print (f-string with no placeholders).
        pytest.skip("Skip tensors NYI for AsyncPipe")

    @skippable(stash=["1to3"])
    class Layer1(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 3, 1)

        def forward(self, input):
            yield stash("1to3", input)
            return self.conv(input)

    class Layer2(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 3, 1)

        def forward(self, input):
            return self.conv(input)

    @skippable(pop=["1to3"])
    class Layer3(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 3, 1)

        def forward(self, input):
            skip_1to3 = yield pop("1to3")
            return self.conv(input) + skip_1to3

    model = nn.Sequential(Layer1(), Layer2(), Layer3())
    model = pipe_class(
        model,
        balance,
        chunks=3,
        checkpoint=checkpoint,
        input_device=torch.cuda.current_device(),
        worker_map=get_worker_map(),
        pipelined_backward=False,
    ).cuda()

    input = torch.rand(30, 3, 224, 224, requires_grad=True).cuda()
    input.retain_grad()
    output = model(input)
    if model.group.rank() == len(balance) - 1:
        # Only the last rank holds the real output, so it drives backward.
        loss = output.mean()
        loss.backward()
    elif model.group.rank() < len(balance) - 1:
        # Intermediate ranks participate in backward via the helper.
        model.back_helper(output)
    if model.group.rank() == len(balance) - 1:
        # TODO(tom) the single-process test uses 2e-1 but for some reason
        # multi-process is more noisy, need to investigate why
        assert torch.allclose(output.norm(), torch.tensor(1039.0).cuda(), atol=4e-1)
    if model.group.rank() == 0:
        assert torch.allclose(input.grad.norm(), torch.tensor(0.0004533053).cuda())

    torch.distributed.barrier()


@torch_spawn([2])
@pytest.mark.skipif("OMPI_COMM_WORLD_RANK" in os.environ, reason="broken on mpi")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
@pytest.mark.parametrize("pipe_class", [MultiProcessPipe, AsyncPipe])
@pytest.mark.skip(reason="flaky test")
def none_skip(pipe_class):
    """Stash/pop a ``None`` skip value across a two-rank pipe and verify that
    no Portal autograd nodes leak into the output's backward graph, and that
    the input gradient on rank 0 is the expected all-ones gradient.
    """
    if pipe_class == AsyncPipe:
        pytest.skip("Skip tensors NYI for AsyncPipe")

    @skippable(stash=["none"])
    class Stash(nn.Module):
        def forward(self, input):
            yield stash("none", None)
            return input

    @skippable(pop=["none"])
    class Pop(nn.Module):
        def forward(self, input):
            none = yield pop("none")
            assert none is None
            return input

    model = nn.Sequential(Stash(), Pop())
    model = pipe_class(
        model, [1, 1], worker_map=get_worker_map(), input_device=torch.cuda.current_device(), chunks=5,
    ).cuda()

    input = torch.rand(10, requires_grad=True).cuda()
    input.retain_grad()
    output = model(input)

    def assert_grad_fn_is_not_portal(grad_fn, visited=None):
        # Fixed: the original used a mutable default (``visited=set()``),
        # which would be shared between independent top-level calls.
        if visited is None:
            visited = set()
        if grad_fn in visited or grad_fn is None:
            return

        assert not isinstance(grad_fn, PortalBlue._backward_cls)
        assert not isinstance(grad_fn, PortalCopy._backward_cls)
        assert not isinstance(grad_fn, PortalOrange._backward_cls)

        visited.add(grad_fn)
        for next_grad_fn, _ in grad_fn.next_functions:
            assert_grad_fn_is_not_portal(next_grad_fn, visited)

    if model.group.rank() == 1:
        assert_grad_fn_is_not_portal(output.grad_fn)

        output.sum().backward()
    else:
        model.back_helper(output)
        assert input.grad.mean().item() == 1


@torch_spawn([2])
@pytest.mark.parametrize("pipe_class", [MultiProcessPipe, AsyncPipe])
def lazy_skippable_error(pipe_class):
    """Skippable layers combined with lazy construction are not supported;
    building the pipe must raise a ValueError."""

    @skippable(stash=["1to3"])
    class Layer1(nn.Linear):
        pass

    @skippable(pop=["1to3"])
    class Layer3(nn.Linear):
        pass

    # Three lazily-built stages; the outer two carry skip annotations.
    # ``cls=cls`` binds each class eagerly to avoid late-binding closures.
    stage_classes = [Layer1, nn.Linear, Layer3]
    model = [LazyModule(lambda cls=cls: cls(10, 10)) for cls in stage_classes]

    expected_message = "Can't use Skippable layers with multi-process pipe and lazy construction"
    with pytest.raises(ValueError, match=expected_message):
        pipe_class(
            model, [2, 1], worker_map=get_worker_map(),
        )