test_optimizer.py 10.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# Copyright (c) OpenMMLab. All rights reserved.
import functools
from collections import OrderedDict
from copy import deepcopy
from typing import Iterable

import torch
import torch.nn as nn
from mmcv.runner import build_optimizer
from mmcv.runner.optimizer.builder import OPTIMIZERS
from mmcv.utils.registry import build_from_cfg
from torch.autograd import Variable
from torch.optim.optimizer import Optimizer

import mmcls.core  # noqa: F401

base_lr = 0.01
base_wd = 0.0001


def assert_equal(x, y):
    if isinstance(x, torch.Tensor) and isinstance(y, torch.Tensor):
        torch.testing.assert_allclose(x, y.to(x.device))
    elif isinstance(x, OrderedDict) and isinstance(y, OrderedDict):
        for x_value, y_value in zip(x.values(), y.values()):
            assert_equal(x_value, y_value)
    elif isinstance(x, dict) and isinstance(y, dict):
        assert x.keys() == y.keys()
        for key in x.keys():
            assert_equal(x[key], y[key])
    elif isinstance(x, str) and isinstance(y, str):
        assert x == y
    elif isinstance(x, Iterable) and isinstance(y, Iterable):
        assert len(x) == len(y)
        for x_item, y_item in zip(x, y):
            assert_equal(x_item, y_item)
    else:
        assert x == y


class SubModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(2, 2, kernel_size=1, groups=2)
        self.gn = nn.GroupNorm(2, 2)
        self.fc = nn.Linear(2, 2)
        self.param1 = nn.Parameter(torch.ones(1))

    def forward(self, x):
        return x


class ExampleModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.param1 = nn.Parameter(torch.ones(1))
        self.conv1 = nn.Conv2d(3, 4, kernel_size=1, bias=False)
        self.conv2 = nn.Conv2d(4, 2, kernel_size=1)
        self.bn = nn.BatchNorm2d(2)
        self.sub = SubModel()
        self.fc = nn.Linear(2, 1)

    def forward(self, x):
        return x


def check_lamb_optimizer(optimizer,
                         model,
                         bias_lr_mult=1,
                         bias_decay_mult=1,
                         norm_decay_mult=1,
                         dwconv_decay_mult=1):
    param_groups = optimizer.param_groups
    assert isinstance(optimizer, Optimizer)
    assert optimizer.defaults['lr'] == base_lr
    assert optimizer.defaults['weight_decay'] == base_wd
    model_parameters = list(model.parameters())
    assert len(param_groups) == len(model_parameters)
    for i, param in enumerate(model_parameters):
        param_group = param_groups[i]
        assert torch.equal(param_group['params'][0], param)
    # param1
    param1 = param_groups[0]
    assert param1['lr'] == base_lr
    assert param1['weight_decay'] == base_wd
    # conv1.weight
    conv1_weight = param_groups[1]
    assert conv1_weight['lr'] == base_lr
    assert conv1_weight['weight_decay'] == base_wd
    # conv2.weight
    conv2_weight = param_groups[2]
    assert conv2_weight['lr'] == base_lr
    assert conv2_weight['weight_decay'] == base_wd
    # conv2.bias
    conv2_bias = param_groups[3]
    assert conv2_bias['lr'] == base_lr * bias_lr_mult
    assert conv2_bias['weight_decay'] == base_wd * bias_decay_mult
    # bn.weight
    bn_weight = param_groups[4]
    assert bn_weight['lr'] == base_lr
    assert bn_weight['weight_decay'] == base_wd * norm_decay_mult
    # bn.bias
    bn_bias = param_groups[5]
    assert bn_bias['lr'] == base_lr
    assert bn_bias['weight_decay'] == base_wd * norm_decay_mult
    # sub.param1
    sub_param1 = param_groups[6]
    assert sub_param1['lr'] == base_lr
    assert sub_param1['weight_decay'] == base_wd
    # sub.conv1.weight
    sub_conv1_weight = param_groups[7]
    assert sub_conv1_weight['lr'] == base_lr
    assert sub_conv1_weight['weight_decay'] == base_wd * dwconv_decay_mult
    # sub.conv1.bias
    sub_conv1_bias = param_groups[8]
    assert sub_conv1_bias['lr'] == base_lr * bias_lr_mult
    assert sub_conv1_bias['weight_decay'] == base_wd * dwconv_decay_mult
    # sub.gn.weight
    sub_gn_weight = param_groups[9]
    assert sub_gn_weight['lr'] == base_lr
    assert sub_gn_weight['weight_decay'] == base_wd * norm_decay_mult
    # sub.gn.bias
    sub_gn_bias = param_groups[10]
    assert sub_gn_bias['lr'] == base_lr
    assert sub_gn_bias['weight_decay'] == base_wd * norm_decay_mult
    # sub.fc1.weight
    sub_fc_weight = param_groups[11]
    assert sub_fc_weight['lr'] == base_lr
    assert sub_fc_weight['weight_decay'] == base_wd
    # sub.fc1.bias
    sub_fc_bias = param_groups[12]
    assert sub_fc_bias['lr'] == base_lr * bias_lr_mult
    assert sub_fc_bias['weight_decay'] == base_wd * bias_decay_mult
    # fc1.weight
    fc_weight = param_groups[13]
    assert fc_weight['lr'] == base_lr
    assert fc_weight['weight_decay'] == base_wd
    # fc1.bias
    fc_bias = param_groups[14]
    assert fc_bias['lr'] == base_lr * bias_lr_mult
    assert fc_bias['weight_decay'] == base_wd * bias_decay_mult


def _test_state_dict(weight, bias, input, constructor):
    weight = Variable(weight, requires_grad=True)
    bias = Variable(bias, requires_grad=True)
    inputs = Variable(input)

    def fn_base(optimizer, weight, bias):
        optimizer.zero_grad()
        i = input_cuda if weight.is_cuda else inputs
        loss = (weight.mv(i) + bias).pow(2).sum()
        loss.backward()
        return loss

    optimizer = constructor(weight, bias)
    fn = functools.partial(fn_base, optimizer, weight, bias)

    # Prime the optimizer
    for _ in range(20):
        optimizer.step(fn)
    # Clone the weights and construct new optimizer for them
    weight_c = Variable(weight.data.clone(), requires_grad=True)
    bias_c = Variable(bias.data.clone(), requires_grad=True)
    optimizer_c = constructor(weight_c, bias_c)
    fn_c = functools.partial(fn_base, optimizer_c, weight_c, bias_c)
    # Load state dict
    state_dict = deepcopy(optimizer.state_dict())
    state_dict_c = deepcopy(optimizer.state_dict())
    optimizer_c.load_state_dict(state_dict_c)
    # Run both optimizations in parallel
    for _ in range(20):
        optimizer.step(fn)
        optimizer_c.step(fn_c)
        assert_equal(weight, weight_c)
        assert_equal(bias, bias_c)
    # Make sure state dict wasn't modified
    assert_equal(state_dict, state_dict_c)
    # Make sure state dict is deterministic with equal
    # but not identical parameters
    # NOTE: The state_dict of optimizers in PyTorch 1.5 have random keys,
    state_dict = deepcopy(optimizer.state_dict())
    state_dict_c = deepcopy(optimizer_c.state_dict())
    keys = state_dict['param_groups'][-1]['params']
    keys_c = state_dict_c['param_groups'][-1]['params']
    for key, key_c in zip(keys, keys_c):
        assert_equal(optimizer.state_dict()['state'][key],
                     optimizer_c.state_dict()['state'][key_c])
    # Make sure repeated parameters have identical representation in state dict
    optimizer_c.param_groups.extend(optimizer_c.param_groups)
    assert_equal(optimizer_c.state_dict()['param_groups'][0],
                 optimizer_c.state_dict()['param_groups'][1])

    # Check that state dict can be loaded even when we cast parameters
    # to a different type and move to a different device.
    if not torch.cuda.is_available():
        return

    input_cuda = Variable(inputs.data.float().cuda())
    weight_cuda = Variable(weight.data.float().cuda(), requires_grad=True)
    bias_cuda = Variable(bias.data.float().cuda(), requires_grad=True)
    optimizer_cuda = constructor(weight_cuda, bias_cuda)
    fn_cuda = functools.partial(fn_base, optimizer_cuda, weight_cuda,
                                bias_cuda)

    state_dict = deepcopy(optimizer.state_dict())
    state_dict_c = deepcopy(optimizer.state_dict())
    optimizer_cuda.load_state_dict(state_dict_c)

    # Make sure state dict wasn't modified
    assert_equal(state_dict, state_dict_c)

    for _ in range(20):
        optimizer.step(fn)
        optimizer_cuda.step(fn_cuda)
        assert_equal(weight, weight_cuda)
        assert_equal(bias, bias_cuda)

    # validate deepcopy() copies all public attributes
    def getPublicAttr(obj):
        return set(k for k in obj.__dict__ if not k.startswith('_'))

    assert_equal(getPublicAttr(optimizer), getPublicAttr(deepcopy(optimizer)))


def _test_basic_cases_template(weight, bias, inputs, constructor,
                               scheduler_constructors):
    """Copied from PyTorch."""
    weight = Variable(weight, requires_grad=True)
    bias = Variable(bias, requires_grad=True)
    inputs = Variable(inputs)
    optimizer = constructor(weight, bias)
    schedulers = []
    for scheduler_constructor in scheduler_constructors:
        schedulers.append(scheduler_constructor(optimizer))

    # to check if the optimizer can be printed as a string
    optimizer.__repr__()

    def fn():
        optimizer.zero_grad()
        y = weight.mv(inputs)
        if y.is_cuda and bias.is_cuda and y.get_device() != bias.get_device():
            y = y.cuda(bias.get_device())
        loss = (y + bias).pow(2).sum()
        loss.backward()
        return loss

    initial_value = fn().item()
    for _ in range(200):
        for scheduler in schedulers:
            scheduler.step()
        optimizer.step(fn)

    assert fn().item() < initial_value


def _test_basic_cases(constructor,
                      scheduler_constructors=None,
                      ignore_multidevice=False):
    """Copied from PyTorch."""
    if scheduler_constructors is None:
        scheduler_constructors = []
    _test_state_dict(
        torch.randn(10, 5), torch.randn(10), torch.randn(5), constructor)
    _test_basic_cases_template(
        torch.randn(10, 5), torch.randn(10), torch.randn(5), constructor,
        scheduler_constructors)
    # non-contiguous parameters
    _test_basic_cases_template(
        torch.randn(10, 5, 2)[..., 0],
        torch.randn(10, 2)[..., 0], torch.randn(5), constructor,
        scheduler_constructors)
    # CUDA
    if not torch.cuda.is_available():
        return
    _test_basic_cases_template(
        torch.randn(10, 5).cuda(),
        torch.randn(10).cuda(),
        torch.randn(5).cuda(), constructor, scheduler_constructors)
    # Multi-GPU
    if not torch.cuda.device_count() > 1 or ignore_multidevice:
        return
    _test_basic_cases_template(
        torch.randn(10, 5).cuda(0),
        torch.randn(10).cuda(1),
        torch.randn(5).cuda(0), constructor, scheduler_constructors)


def test_lamb_optimizer():
    model = ExampleModel()
    optimizer_cfg = dict(
        type='Lamb',
        lr=base_lr,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=base_wd,
        paramwise_cfg=dict(
            bias_lr_mult=2,
            bias_decay_mult=0.5,
            norm_decay_mult=0,
            dwconv_decay_mult=0.1))
    optimizer = build_optimizer(model, optimizer_cfg)
    check_lamb_optimizer(optimizer, model, **optimizer_cfg['paramwise_cfg'])

    _test_basic_cases(lambda weight, bias: build_from_cfg(
        dict(type='Lamb', params=[weight, bias], lr=base_lr), OPTIMIZERS))