# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch


import torch
from torch._six import inf

24
25
26
27
28
29
try:
    from apex.multi_tensor_apply import multi_tensor_applier
    import amp_C

except Exception as e:
    print('WARNING: APEX is not installed, multi_tensor_applier will not be available.')
30

31
from .initialize import is_inter_layer_first_stage
32
from .initialize import get_model_parallel_group
33
from .initialize import get_intra_layer_model_parallel_rank
34
35


36
37
38
39
def l2_grad_clipper(parameters, max_norm):
    """Efficient L2 norm gradient clipping using fused multi-tensor kernels.

    The gradients are modified in place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor whose gradients will be clipped. Entries whose
            ``grad`` is None are ignored.
        max_norm (float or int): max norm of the gradients.

    Returns:
        Total L2 norm of the gradients (a Python float) after summing the
        squared norms across the model parallel group.
    """
    # Make sure we have an iterable.
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]

    # Only parameters that actually carry a gradient take part.
    parameters_with_grads = [p for p in parameters if p.grad is not None]
    grads = [p.grad for p in parameters_with_grads]

    # Gradients contributing to the norm on this rank: parameters flagged
    # `intra_layer_model_parallel` always count; all others count only on
    # intra-layer rank 0 (presumably because they are replicated across the
    # group and must not be summed more than once -- confirm with callers).
    mp_rank_is_zero = (get_intra_layer_model_parallel_rank() == 0)
    grads_for_norm = [
        p.grad for p in parameters_with_grads
        if p.intra_layer_model_parallel or mp_rank_is_zero]

    overflow_buf = torch.zeros(1, dtype=torch.int, device='cuda')

    # Calculate the local L2 norm of the *gradients* (not of the parameter
    # values themselves). Skip the fused kernel when this rank has nothing
    # to contribute, but still take part in the collective below.
    if grads_for_norm:
        norm, _ = multi_tensor_applier(
            amp_C.multi_tensor_l2norm,
            overflow_buf,
            [grads_for_norm],
            False  # no per-parameter norm
        )
    else:
        norm = torch.zeros(1, dtype=torch.float, device='cuda')

    # Sum squared norms across all model parallel GPUs.
    norm_2 = norm * norm
    torch.distributed.all_reduce(norm_2,
                                 op=torch.distributed.ReduceOp.SUM,
                                 group=get_model_parallel_group())
    total_norm = norm_2.item() ** 0.5

    # Scale to get max_norm; the epsilon avoids division by zero.
    clip_coef = float(max_norm) / (total_norm + 1.0e-6)
    if grads and clip_coef < 1.0:
        # Passing the same list as source and destination scales in place.
        multi_tensor_applier(
            amp_C.multi_tensor_scale,
            overflow_buf,
            [grads, grads],
            clip_coef)
    return total_norm


75
def clip_grad_norm(parameters, max_norm, norm_type=2, parameter_names=None):
    """Clips gradient norm of an iterable of parameters.

    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.
        parameter_names (sequence of str, optional): names matching
            ``parameters`` one to one. When given, a parameter whose name
            contains ``"embedding"`` contributes to the norm only if
            ``is_inter_layer_first_stage()`` is true; its gradient is still
            clipped on every stage.

    Returns:
        Total norm of the parameters (viewed as a single vector).
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    # Materialize once so generators support len() and repeated iteration.
    parameters = list(parameters)

    if parameter_names is not None:
        assert len(parameters) == len(parameter_names), \
            'length of parameters and parameter_names should be the same'
        # TODO: Bit hacky; is there a cleaner way to do this?
        # Count embedding layer only once (in first stage).
        # Don't count the weights a second time in the last stage.
        params_for_norm = [
            p for p, n in zip(parameters, parameter_names)
            if p.grad is not None
            and ("embedding" not in n or is_inter_layer_first_stage())]
    else:
        params_for_norm = [p for p in parameters if p.grad is not None]
    # Every gradient is clipped, including those excluded from the norm
    # above; otherwise the excluded gradients would stay unscaled.
    params_to_clip = [p for p in parameters if p.grad is not None]

    max_norm = float(max_norm)
    norm_type = float(norm_type)
    if norm_type == inf:
        # default=0.0 keeps ranks without any gradient from raising while
        # still letting them take part in the collective below.
        local_norm = max(
            (p.grad.data.abs().max() for p in params_for_norm), default=0.0)
        total_norm_cuda = torch.cuda.FloatTensor([float(local_norm)])
        # Take max across all model-parallel GPUs.
        torch.distributed.all_reduce(total_norm_cuda,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=get_model_parallel_group())
        total_norm = total_norm_cuda[0].item()
    else:
        # Parameters not flagged `intra_layer_model_parallel` are counted
        # only on intra-layer rank 0; the rank check is loop-invariant.
        mp_rank_is_zero = (get_intra_layer_model_parallel_rank() == 0)
        local_norm = sum(
            p.grad.data.norm(norm_type).item() ** norm_type
            for p in params_for_norm
            if p.intra_layer_model_parallel or mp_rank_is_zero)
        # Sum across all model-parallel GPUs.
        total_norm_cuda = torch.cuda.FloatTensor([float(local_norm)])
        torch.distributed.all_reduce(total_norm_cuda,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_model_parallel_group())
        total_norm = total_norm_cuda[0].item() ** (1. / norm_type)

    # Scale gradients in place only when the norm exceeds max_norm; the
    # epsilon avoids division by zero.
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for p in params_to_clip:
            p.grad.data.mul_(clip_coef)
    return total_norm