# fp_optimizers.py
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
import math

import torch
from torch.nn.utils import clip_grad_norm_
import apex.amp._amp_state
from apex import amp


class FP16Optimizer:
    """
    Mixed precision optimizer with dynamic loss scaling and backoff.
    https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#scalefactor
    """
    @staticmethod
    def set_grads(params, params_with_grad):
        """
        Copies gradients from params_with_grad to params

        :param params: dst parameters
        :param params_with_grad: src parameters
        """
        for param, param_w_grad in zip(params, params_with_grad):
            if param.grad is None:
                param.grad = torch.nn.Parameter(torch.empty_like(param))
            param.grad.data.copy_(param_w_grad.grad.data)

    @staticmethod
    def set_weights(params, new_params):
        """
        Copies parameters from new_params to params

        :param params: dst parameters
        :param new_params: src parameters
        """
        for param, new_param in zip(params, new_params):
            param.data.copy_(new_param.data)

    def __init__(self, model, grad_clip=float('inf'), loss_scale=8192,
                 dls_downscale=2, dls_upscale=2, dls_upscale_interval=128):
        """
        Constructor for the FP16Optimizer.

        :param model: model
        :param grad_clip: coefficient for gradient clipping, max L2 norm of the
            gradients
        :param loss_scale: initial loss scale
        :param dls_downscale: loss downscale factor, loss scale is divided by
            this factor when NaN/INF occurs in the gradients
        :param dls_upscale: loss upscale factor, loss scale is multiplied by
            this factor if previous dls_upscale_interval batches finished
            successfully
        :param dls_upscale_interval: interval for loss scale upscaling
        """
        logging.info('Initializing fp16 optimizer')
        self.initialize_model(model)

        self.since_last_invalid = 0
        self.loss_scale = loss_scale
        self.dls_downscale = dls_downscale
        self.dls_upscale = dls_upscale
        self.dls_upscale_interval = dls_upscale_interval
        self.grad_clip = grad_clip

    def initialize_model(self, model):
        """
        Initializes internal state and builds the fp32 master copy of the weights.

        :param model: fp16 model
        """
        logging.info('Converting model to half precision')
        model.half()
        logging.info('Initializing fp32 clone weights')
        self.model = model
        self.model.zero_grad()
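        # fp32 master copy of the weights; the wrapped torch optimizer is
        # expected to update these, and set_weights() copies them back into
        # the fp16 model after each successful step.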
        self.fp32_params = [param.to(torch.float32).detach()
                            for param in model.parameters()]

        for param in self.fp32_params:
            param.requires_grad = True

    def step(self, loss, optimizer, scheduler, update=True):
        """
        Performs one step of the optimizer.
        Applies loss scaling, computes gradients in fp16, converts gradients to
        fp32, inverts scaling and applies optional gradient norm clipping.
        If gradients are finite, it applies update to fp32 master weights and
        copies updated parameters to fp16 model for the next iteration. If
        gradients are not finite, it skips the batch and adjusts scaling factor
        for the next iteration.

        :param loss: value of loss function
        :param optimizer: optimizer
        :param scheduler: learning rate scheduler
        :param update: if True executes weight update
        """
        loss *= self.loss_scale
        loss.backward()

        if update:
            self.set_grads(self.fp32_params, self.model.parameters())
            if self.loss_scale != 1.0:
                for param in self.fp32_params:
                    param.grad.data /= self.loss_scale

            norm = clip_grad_norm_(self.fp32_params, self.grad_clip)

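            # A finite gradient norm means the update can be applied and the
            # new fp32 weights copied back into the fp16 model; a NaN/Inf norm
            # means the batch is skipped and the loss scale backed off.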
            if math.isfinite(norm):
                scheduler.step()
                optimizer.step()
                self.set_weights(self.model.parameters(),
                                 self.fp32_params)
                self.since_last_invalid += 1
            else:
                self.loss_scale /= self.dls_downscale
                self.since_last_invalid = 0
                logging.info(f'Gradient norm: {norm}')
                logging.info(f'Skipped batch, new scale: {self.loss_scale}')

            if self.since_last_invalid >= self.dls_upscale_interval:
                self.loss_scale *= self.dls_upscale
                self.loss_scale = min(self.loss_scale, 8192.0)
                logging.info(f'Upscaling, new scale: {self.loss_scale}')
                self.since_last_invalid = 0

            self.model.zero_grad()


class FP32Optimizer:
    """
    Standard optimizer, computes backward and applies weight update.
    """
    def __init__(self, model, grad_clip=float('inf')):
        """
        Constructor for the FP32Optimizer.

        :param model: model
        :param grad_clip: coefficient for gradient clipping, max L2 norm of the
            gradients
        """
        logging.info('Initializing fp32 optimizer')
        self.initialize_model(model)
        self.grad_clip = grad_clip

    def initialize_model(self, model):
        """
        Initializes state of the model.

        :param model: model
        """
        self.model = model
        self.model.zero_grad()

    def step(self, loss, optimizer, scheduler, update=True):
        """
        Performs one step of the optimizer.

        :param loss: value of loss function
        :param optimizer: optimizer
        :param scheduler: learning rate scheduler
        :param update: if True executes weight update
        """
        loss.backward()
        if update:
            if self.grad_clip != float('inf'):
                clip_grad_norm_(self.model.parameters(), self.grad_clip)
            scheduler.step()
            optimizer.step()
            self.model.zero_grad()


class AMPOptimizer:
    """
    Optimizer compatible with AMP.
    Uses AMP to apply loss scaling, computes backward and applies weight
    update.
    """
    def __init__(self, model, grad_clip=float('inf'), loss_scale=8192,
                 dls_upscale_interval=128):
        """
        Constructor for the AMPOptimizer

        :param model: model
        :param grad_clip: coefficient for gradient clipping, max L2 norm of the
            gradients
        :param loss_scale: initial loss scale
        :param dls_upscale_interval: interval for loss scale upscaling
        """
        logging.info('Initializing amp optimizer')
        self.initialize_model(model)
        self.grad_clip = grad_clip

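        # Override the initial loss scale and the upscale interval of amp's
        # dynamic loss scaler. This reaches into private apex state and
        # assumes amp.initialize() was called with dynamic loss scaling.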
        loss_scaler = apex.amp._amp_state.loss_scalers[0]
        loss_scaler._loss_scale = loss_scale
        loss_scaler._scale_seq_len = dls_upscale_interval

    def initialize_model(self, model):
        """
        Initializes state of the model.

        :param model: model
        """
        self.model = model
        self.model.zero_grad()

    def step(self, loss, optimizer, scheduler, update=True):
        """
        Performs one step of the optimizer.

        :param loss: value of loss function
        :param optimizer: optimizer
        :param scheduler: learning rate scheduler
        :param update: if True executes weight update
        """
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        if update:
            if self.grad_clip != float('inf'):
                clip_grad_norm_(amp.master_params(optimizer), self.grad_clip)
            scheduler.step()
            optimizer.step()
            self.model.zero_grad()