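"""Unit tests for DeepSpeed FP16 training: LAMB/AdamW/Adam optimizers, loss
scaling, and ZeRO configuration handling."""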
import torch
import deepspeed
import argparse
import pytest
import json
import os
from common import distributed_test
from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict


def test_lamb_fp16_basic(tmpdir):
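    """FP16 training with the Lamb optimizer should run on world sizes 1 and 2."""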
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015,
                "max_grad_norm": 1.0
            }
        },
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1, 2])
    def _test_lamb_fp16_basic(args, model, hidden_dim):
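        # deepspeed.initialize returns (engine, optimizer, dataloader, lr_scheduler);
        # only the wrapped engine is needed here.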
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)


def test_lamb_fp16_empty_grad(tmpdir):
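    """FP16 training with the Lamb optimizer and SimpleModel(empty_grad=True) on a single rank."""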
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015,
                "max_grad_norm": 1.0
            }
        },
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[1])
    def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)


def test_adamw_fp16_basic(tmpdir):
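    """FP16 training with a client-constructed torch.optim.AdamW passed directly to deepspeed.initialize."""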
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_adamw_fp16_basic(args, model, hidden_dim):
        optimizer = torch.optim.AdamW(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)


def test_adamw_fp16_empty_grad(tmpdir):
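    """FP16 training with a client torch.optim.AdamW and SimpleModel(empty_grad=True)."""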
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[1])
    def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
        optimizer = torch.optim.AdamW(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adamw_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)


@pytest.mark.parametrize("zero_stage", [0, 1, 2])
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage):
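    """Adam plus the OneCycle LR scheduler should train under FP16 with ZeRO stages 0, 1, and 2."""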
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "scheduler": {
            "type": "OneCycle",
            "params": {
                "cycle_first_step_size": 16000,
                "cycle_first_stair_count": 8000,
                "decay_step_size": 16000,
                "cycle_min_lr": 1e-06,
                "cycle_max_lr": 3e-05,
                "decay_lr_rate": 1e-07,
                "cycle_min_mom": 0.85,
                "cycle_max_mom": 0.99,
                "decay_mom_rate": 0.0
            }
        },
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[1])
    def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_fp16_zero_onecycle_compatibility(args=args,
                                                model=model,
                                                hidden_dim=hidden_dim)


@pytest.mark.parametrize("zero_stage", [1, 2])
def test_zero_static_scale(tmpdir, zero_stage):
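    """A fixed fp16 "loss_scale" should disable dynamic loss scaling under ZeRO stages 1 and 2."""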
    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 138.
        },
        "zero_optimization": {
            "stage": zero_stage
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=2)
    def _test_zero_static_scale(args):
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=True)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        # Ensure the static scaler is configured.
        assert optim.dynamic_loss_scale == False
        assert optim.loss_scaler.loss_scale == 138.

        # Now make sure things work.
        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_zero_static_scale(args)


def test_zero_static_scale_deprecated_format(tmpdir):
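    """Static loss scaling should also work with the deprecated boolean "zero_optimization" config format."""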
    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 138.
        },
        "zero_optimization": True
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=2)
    def _test_zero_static_scale(args):
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=True)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        # Ensure the static scaler is configured.
        assert optim.dynamic_loss_scale == False
        assert optim.loss_scaler.loss_scale == 138.

        # Now make sure things work.
        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_zero_static_scale(args)


@pytest.mark.parametrize("zero_stage", [1, 2])
def test_zero_allow_untested_optimizer(tmpdir, zero_stage):
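    """With zero_allow_untested_optimizer disabled, ZeRO should reject an unrecognized optimizer with an AssertionError."""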
    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True,
        },
        "zero_optimization": {
            "stage": zero_stage
        },
        "zero_allow_untested_optimizer": False
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_zero_allow_untested_optimizer(args):
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=True)
        optimizer = SimpleOptimizer(model.parameters())
        with pytest.raises(AssertionError):
            model, optim, _, _ = deepspeed.initialize(args=args,
                                                      model=model,
                                                      optimizer=optimizer,
                                                      model_parameters=model.parameters())

    _test_zero_allow_untested_optimizer(args)


# @pytest.mark.parametrize("zero_stage", [1])
# def test_zero_empty_partition(tmpdir, zero_stage):
#     config_dict = {
#         "train_batch_size": 3,
#         "fp16": {
#             "enabled": True
#         },
#         "optimizer": {
#             "type": "Adam",
#             "params": {
#                 "lr": 0.00015
#             }
#         },
#         "zero_optimization": {
#             "stage": zero_stage
#         }
#     }
#     args = args_from_dict(tmpdir, config_dict)

#     @distributed_test(world_size=[3])
#     def _test_zero_empty_partition(args):
#         hidden_dim = 1
#         model = SimpleModel(hidden_dim)
#         # Ensure model has 2 parameters, to cause empty partition with DP=3
#         assert len(list(model.parameters())) == 2
#         model, _, _, _ = deepspeed.initialize(args=args,
#                                               model=model,
#                                               model_parameters=model.parameters())
#         model.step()

#     _test_zero_empty_partition(args)