# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from paddle import optimizer as optim
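
# Each class below is a lightweight, configuration-driven wrapper around the
# corresponding paddle.optimizer class: construct it with keyword arguments
# taken from the config, then call the instance with a model to obtain a
# Paddle optimizer bound to that model's trainable parameters.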


class Momentum(object):
    """
    Simple Momentum optimizer with velocity state.
    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        weight_decay (float|WeightDecayRegularizer, optional) - The weight decay (L2 penalty) strategy.
        grad_clip (GradientClipBase, optional) - The gradient clipping strategy.
    """

    def __init__(self,
                 learning_rate,
                 momentum,
                 weight_decay=None,
                 grad_clip=None,
                 **args):
        super(Momentum, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip

    def __call__(self, model):
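        # Optimize only the parameters marked as trainable; frozen parameters
        # are excluded from the optimizer.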
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.Momentum(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            parameters=train_params)
        return opt


class Adam(object):
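    """
    Adam optimizer wrapper; mirrors the arguments of paddle.optimizer.Adam.
    """
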
    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-08,
                 parameter_list=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 lazy_mode=False,
                 **kwargs):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.parameter_list = parameter_list
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.name = name
        self.lazy_mode = lazy_mode

    def __call__(self, model):
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.Adam(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            parameters=train_params)
        return opt


class RMSProp(object):
    """
    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method.
    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        rho (float) - The decay rate of the moving average of squared gradients.
        epsilon (float) - A small value added to avoid division by zero, default is 1e-6.
        weight_decay (float|WeightDecayRegularizer, optional) - The weight decay (L2 penalty) strategy.
        grad_clip (GradientClipBase, optional) - The gradient clipping strategy.
    """

    def __init__(self,
                 learning_rate,
                 momentum=0.0,
                 rho=0.95,
                 epsilon=1e-6,
                 weight_decay=None,
                 grad_clip=None,
                 **args):
        super(RMSProp, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.rho = rho
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip

    def __call__(self, model):
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.RMSProp(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            rho=self.rho,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            parameters=train_params)
        return opt


class Adadelta(object):
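    """
    Adadelta optimizer wrapper; mirrors the arguments of paddle.optimizer.Adadelta.
    """
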
    def __init__(self,
                 learning_rate=0.001,
                 epsilon=1e-08,
                 rho=0.95,
                 parameter_list=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 **kwargs):
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.rho = rho
        self.parameter_list = parameter_list
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.name = name

    def __call__(self, model):
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.Adadelta(
            learning_rate=self.learning_rate,
            epsilon=self.epsilon,
            rho=self.rho,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            name=self.name,
            parameters=train_params)
        return opt


class AdamW(object):
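    """
    AdamW optimizer wrapper; mirrors paddle.optimizer.AdamW and can exclude
    parameters from weight decay by name substring (no_weight_decay_name) or
    by shape (one_dim_param_no_weight_decay, all 1-D parameters).
    """
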
    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 weight_decay=0.01,
                 multi_precision=False,
                 grad_clip=None,
                 no_weight_decay_name=None,
                 one_dim_param_no_weight_decay=False,
                 name=None,
                 lazy_mode=False,
                 **args):
        super().__init__()
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.weight_decay = 0.01 if weight_decay is None else weight_decay
        self.grad_clip = grad_clip
        self.name = name
        self.lazy_mode = lazy_mode
        self.multi_precision = multi_precision
        self.no_weight_decay_name_list = no_weight_decay_name.split(
        ) if no_weight_decay_name else []
        self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay

    def __call__(self, model):
        parameters = [
            param for param in model.parameters() if param.trainable is True
        ]

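        # Parameters whose names contain any of the configured substrings
        # (no_weight_decay_name, e.g. "norm" or "bias") are excluded from
        # weight decay.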
        self.no_weight_decay_param_name_list = [
            p.name for n, p in model.named_parameters()
            if any(nd in n for nd in self.no_weight_decay_name_list)
        ]

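        # Optionally also exclude every 1-D parameter (typically biases and
        # normalization scales) from weight decay.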
        if self.one_dim_param_no_weight_decay:
            self.no_weight_decay_param_name_list += [
                p.name for n, p in model.named_parameters() if len(p.shape) == 1
            ]

        opt = optim.AdamW(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            parameters=parameters,
            weight_decay=self.weight_decay,
            multi_precision=self.multi_precision,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            apply_decay_param_fun=self._apply_decay_param_fun)
        return opt

    def _apply_decay_param_fun(self, name):
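        # Passed to paddle.optimizer.AdamW as apply_decay_param_fun: weight
        # decay is applied only to parameters for which this returns True.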
        return name not in self.no_weight_decay_param_name_list
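

if __name__ == "__main__":
    # Minimal usage sketch (illustration only, not part of the training
    # pipeline): each wrapper is configured first and then called with a
    # paddle.nn.Layer to obtain the underlying Paddle optimizer. The Linear
    # layer below is a hypothetical stand-in for a real model.
    import paddle

    model = paddle.nn.Linear(10, 2)

    # Plain SGD with momentum over the model's trainable parameters.
    momentum_opt = Momentum(learning_rate=0.001, momentum=0.9)(model)

    # AdamW with bias and 1-D (e.g. normalization) parameters excluded from
    # weight decay; names are matched by substring.
    adamw_opt = AdamW(
        learning_rate=0.001,
        weight_decay=0.05,
        no_weight_decay_name="bias norm",
        one_dim_param_no_weight_decay=True)(model)

    print(type(momentum_opt).__name__, type(adamw_opt).__name__)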