# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch


import torch
from torch._six import inf

from apex.multi_tensor_apply import multi_tensor_applier
import amp_C

from .initialize import get_model_parallel_group
from .initialize import get_model_parallel_rank


def l2_grad_clipper(parameters, max_norm):
    """Efficient L2 norm gradient clipping."""

    overflow_buf = torch.zeros(1, dtype=torch.int, device='cuda')
    # Make sure we have an iterable.
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    # Filter parameters with gradients.
    parameters_with_grads = list(filter(
        lambda p: p.grad is not None, parameters))
    # Filter parameters for norm calculations.
    mp_rank_is_zero = (get_model_parallel_rank() == 0)
    parameters_for_norm = list(filter(
        lambda p: p.model_parallel or mp_rank_is_zero, parameters_with_grads))
    # Calculate the L2 norm of the gradients.
    norm, _ = multi_tensor_applier(
        amp_C.multi_tensor_l2norm,
        overflow_buf,
        [[p.grad for p in parameters_for_norm]],
        False  # no per-parameter norm
    )
    # Sum across all model parallel GPUs.
    norm_2 = norm * norm
    torch.distributed.all_reduce(norm_2,
                                 op=torch.distributed.ReduceOp.SUM,
                                 group=get_model_parallel_group())
    total_norm = norm_2.item() ** 0.5
    # Scale the gradients if their total norm exceeds max_norm.
    clip_coef = float(max_norm) / (total_norm + 1.0e-6)
    grads = [p.grad for p in parameters_with_grads]
    if clip_coef < 1.0:
        multi_tensor_applier(
            amp_C.multi_tensor_scale,
            overflow_buf,
            [grads, grads],
            clip_coef)
    return total_norm
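
# Illustrative usage (not part of the original file): l2_grad_clipper assumes a
# backward pass has already populated .grad on the parameters and that the
# model parallel group from .initialize is set up; `model` below is a
# hypothetical torch.nn.Module whose parameters carry the `model_parallel`
# attribute used by this package.
#
#     loss = model(batch).sum()
#     loss.backward()
#     grad_norm = l2_grad_clipper(model.parameters(), max_norm=1.0)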


def clip_grad_norm(parameters, max_norm, norm_type=2):
    """Clips gradient norm of an iterable of parameters.

    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ with
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.

    Returns:
        Total norm of the parameters (viewed as a single vector).
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    max_norm = float(max_norm)
    norm_type = float(norm_type)
    if norm_type == inf:
        total_norm = max(p.grad.data.abs().max() for p in parameters)
        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
        # Take max across all GPUs.
        torch.distributed.all_reduce(total_norm_cuda,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=get_model_parallel_group())
        total_norm = total_norm_cuda[0].item()
        clip_coef = max_norm / (total_norm + 1e-6)
        if clip_coef < 1:
            for p in parameters:
                p.grad.data.mul_(clip_coef)
    #elif norm_type == 2:
    #    total_norm = l2_grad_clipper(parameters, max_norm)
    else:
        total_norm = 0
        for p in parameters:
            if p.model_parallel or (get_model_parallel_rank() == 0):
                param_norm = p.grad.data.norm(norm_type)
                total_norm += param_norm.item() ** norm_type
        # Sum across all model parallel GPUs.
        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
        torch.distributed.all_reduce(total_norm_cuda,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_model_parallel_group())
        total_norm = total_norm_cuda[0].item() ** (1. / norm_type)
        clip_coef = max_norm / (total_norm + 1e-6)
        if clip_coef < 1:
            for p in parameters:
                p.grad.data.mul_(clip_coef)
    return total_norm
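

# Illustrative sketch, not part of the original module: a hypothetical training
# step showing where clip_grad_norm fits. It assumes torch.distributed and the
# model parallel group (see .initialize) are already initialized, and that the
# parameters of `model` carry the `model_parallel` attribute used by this
# package; `model`, `optimizer`, `loss_fn`, and `batch` are placeholders
# supplied by the caller.
def _example_training_step(model, optimizer, loss_fn, batch, max_norm=1.0):
    """Hypothetical helper: backward pass, model parallel grad clip, step."""
    optimizer.zero_grad()
    loss = loss_fn(model(batch))
    loss.backward()
    # clip_grad_norm rescales gradients in place when their total norm across
    # all model parallel GPUs exceeds max_norm, and returns that total norm.
    grad_norm = clip_grad_norm(model.parameters(), max_norm)
    optimizer.step()
    return grad_norm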