test_parity.py 3.28 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

"""
Testing Pipe Module Parity
"""

import contextlib
import copy

import numpy as np
import pytest
import torch

17
from fair_dev.testing.testing import skip_if_single_gpu
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from fairscale.nn import Pipe


def _get_model(num_inputs=2, num_hidden=20, num_outputs=2):
    num_layers = torch.cuda.device_count() - 2
    model = torch.nn.Sequential(
        torch.nn.Linear(num_inputs, num_hidden),
        *([torch.nn.Linear(num_hidden, num_hidden) for _ in range(num_layers)]),
        torch.nn.Linear(num_hidden, num_outputs),
    )
    return model


def _check_parity(rmodel, pmodel, ropt, popt, rloss, ploss):

    for pparams, rparams in zip(pmodel.parameters(), rmodel.parameters()):
        assert torch.allclose(pparams.cuda(), rparams, atol=1e-2), f"Model params are different {oparams} {rparams}"

    for p_pg, reg_pg in zip(popt.param_groups, ropt.param_groups):
        for p_pg, reg_pg in zip(p_pg["params"], reg_pg["params"]):
            assert torch.allclose(
                p_pg.cuda(), reg_pg, atol=1e-2
            ), f"Model parameters differ in between Pipe and Vanilla {[o_pg]} {reg_pg}"

        for p_buf, reg_buf in zip(pmodel.buffers(), rmodel.buffers()):
            assert torch.allclose(p_buf.cuda(), reg_buf, atol=1e-2), "Model buffers differ in between Pipe and Vanilla."


def _get_fp16_context(use_fp16=False):
    if use_fp16:
        return torch.cuda.amp.autocast()
    else:
        return contextlib.nullcontext()


def _train(model, optimizer, use_fp16):
    """Run one training step on a fixed all-ones batch.

    A 32x2 batch of ones is fed through ``model`` (inputs are placed on the
    current CUDA device), a sum-reduced MSE loss against all-ones labels is
    computed on the prediction's device, and the backward pass runs inside
    the context chosen by ``use_fp16``. The optimizer step happens after the
    context has exited.

    Returns:
        The ``(model, optimizer, loss)`` triple after the step.
    """
    batch = torch.ones(32, 2).cuda()
    targets = torch.ones(32, 2)
    criterion = torch.nn.MSELoss(reduction="sum")

    model.train()
    with _get_fp16_context(use_fp16):
        output = model(batch)
        # The output may live on a different device than the inputs.
        targets_on_device = targets.to(output.device)
        loss = criterion(output, targets_on_device)
        loss.backward()

    optimizer.step()
    return model, optimizer, loss


def _train_reg_model(model, use_fp16=False):
    """Train a CUDA copy of ``model`` for one step with plain SGD.

    The input model is deep-copied first, so the caller's instance is left
    untouched. Returns the ``(model, optimizer, loss)`` triple from ``_train``.
    """
    replica = copy.deepcopy(model).cuda()
    sgd = torch.optim.SGD(replica.parameters(), lr=0.001)
    return _train(replica, sgd, use_fp16)


def _train_pipe_model(model, use_fp16=False, checkpoint="never", chunks=1):
    """Train a Pipe-wrapped copy of ``model`` for one step with plain SGD.

    The input model is deep-copied, then wrapped in ``Pipe`` with a balance
    of one layer per visible CUDA device and the device list
    ``0 .. device_count - 1``. ``chunks`` and ``checkpoint`` are forwarded to
    ``Pipe`` unchanged. Returns the ``(model, optimizer, loss)`` triple from
    ``_train``.
    """
    replica = copy.deepcopy(model)
    pipe_kwargs = {
        "balance": [1] * torch.cuda.device_count(),
        "devices": list(range(torch.cuda.device_count())),
        "chunks": chunks,
        "checkpoint": checkpoint,
    }
    piped = Pipe(replica, **pipe_kwargs)
    sgd = torch.optim.SGD(piped.parameters(), lr=0.001)
    return _train(piped, sgd, use_fp16)


@skip_if_single_gpu
@pytest.mark.parametrize("use_fp16", [True, False])
@pytest.mark.parametrize("checkpoint", ["always", "except_last", "never"])
@pytest.mark.parametrize("chunks", [1, 4])
def test_correctness(use_fp16, checkpoint, chunks):
    """Check that one training step through Pipe matches a regular model.

    A single seed model is built, then trained once as a plain CUDA model and
    once wrapped in Pipe (with the parametrized fp16 / checkpoint / chunks
    settings). The resulting parameters, optimizer state tensors and buffers
    are compared by ``_check_parity``.
    """
    torch.manual_seed(0)
    np.random.seed(0)

    if use_fp16 and not hasattr(torch.cuda.amp, "custom_fwd"):
        pytest.skip(f"AMP APIs are not supported in torch version {torch.__version__}")

    model = _get_model()
    # NOTE(review): the reference run always uses the default (no fp16), even
    # when the Pipe run uses fp16 — presumably intentional given the loose
    # tolerance in _check_parity; confirm.
    rmodel, ropt, rloss = _train_reg_model(model)
    pmodel, popt, ploss = _train_pipe_model(
        model,
        use_fp16=use_fp16,
        checkpoint=checkpoint,
        chunks=chunks,
    )
    _check_parity(rmodel, pmodel, ropt, popt, rloss, ploss)