Commit 8de66223 authored by maming's avatar maming
Browse files

Initial commit

parents
Pipeline #3358 canceled with stages
from __future__ import absolute_import
from keras.optimizers import Optimizer
from keras import backend as K
class FTML(Optimizer):
    """FTML optimizer (Follow The Moving Leader).

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 0.5.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
        decay: float >= 0. Learning rate decay over each update.

    # References
        - [FTML - Follow the Moving Leader in Deep Learning](
          http://www.cse.ust.hk/~szhengac/papers/icml17.pdf)
    """

    def __init__(self, lr=0.0025, beta_1=0.6, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(FTML, self).__init__(**kwargs)
        # Hyperparameters live in backend variables so they can be changed at
        # run time, grouped in a name scope like the other optimizers in this
        # module.  (The former `self.__dict__.update(locals())` hack also
        # stored spurious attributes such as `self.self` and `self.kwargs`,
        # and is removed here.)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
        self.epsilon = epsilon
        # Plain-float copy of the decay used to decide whether to apply the
        # schedule at all (fixes the former `inital_decay` typo).
        self.initial_decay = decay

    def get_updates(self, loss, params):
        """Build the symbolic parameter-update ops for `params`."""
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
        t = self.iterations + 1
        # Bias-corrected step size (Algorithm 1 of the paper).
        lr_t = lr / (1. - K.pow(self.beta_1, t))

        shapes = [K.int_shape(p) for p in params]
        zs = [K.zeros(shape) for shape in shapes]  # z: accumulated linear term
        vs = [K.zeros(shape) for shape in shapes]  # v: second-moment estimate
        ds = [K.zeros(shape) for shape in shapes]  # d: moving denominator
        self.weights = [self.iterations] + zs + vs + ds

        for p, g, z, v, d in zip(params, grads, zs, vs, ds):
            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
            d_t = (K.sqrt(v_t / (1. - K.pow(self.beta_2, t)))
                   + self.epsilon) / lr_t
            sigma_t = d_t - self.beta_1 * d
            z_t = self.beta_1 * z + (1. - self.beta_1) * g - sigma_t * p
            # FTML closed-form minimizer of the moving-leader objective.
            p_t = - z_t / d_t

            self.updates.append(K.update(z, z_t))
            self.updates.append(K.update(v, v_t))
            self.updates.append(K.update(d, d_t))

            new_p = p_t
            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        """Return a JSON-serializable config (backend variables as floats)."""
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(FTML, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
from keras import backend as K
from keras.optimizers import Optimizer
class LARS(Optimizer):
    """Layer-wise Adaptive Rate Scaling for large batch training.

    Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
    I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)

    Implements the LARS learning rate scheme presented in the paper above. This
    optimizer is useful when scaling the batch size to up to 32K without
    significant performance degradation. It is recommended to use the optimizer
    in conjunction with:
        - Gradual learning rate warm-up
        - Linear learning rate scaling
        - Poly rule learning rate decay

    Note, LARS scaling is currently only enabled for dense tensors.

    Args:
        lr: A `Tensor` or floating point value. The base learning rate.
        momentum: A floating point value. Momentum hyperparameter.
        weight_decay: A floating point value. Weight decay hyperparameter.
        eeta: LARS coefficient as used in the paper. Default set to LARS
            coefficient from the paper. (eeta / weight_decay) determines the
            highest scaling factor in LARS.
        epsilon: Optional epsilon parameter to be set in models that have very
            small gradients. Default set to 0.0.
        nesterov: when set to True, nesterov momentum will be enabled
    """

    def __init__(self,
                 lr,
                 momentum=0.9,
                 weight_decay=0.0001,
                 eeta=0.001,
                 epsilon=0.0,
                 nesterov=False,
                 **kwargs):
        # Validate plain-float hyperparameters before any backend state is made.
        if momentum < 0.0:
            raise ValueError("momentum should be positive: %s" % momentum)
        if weight_decay < 0.0:
            raise ValueError("weight_decay is not positive: %s" % weight_decay)
        super(LARS, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.momentum = K.variable(momentum, name='momentum')
            self.weight_decay = K.variable(weight_decay, name='weight_decay')
            self.eeta = K.variable(eeta, name='eeta')
        # epsilon and nesterov stay plain Python values; epsilon stabilizes
        # the trust-ratio denominator below.
        self.epsilon = epsilon
        self.nesterov = nesterov

    def get_updates(self, loss, params):
        """Build the symbolic LARS + momentum update ops for `params`."""
        grads = self.get_gradients(loss, params)
        # NOTE(review): `get_weights()` returns the numpy values of the
        # optimizer's own `self.weights` state (which is not populated until
        # later in this method), not the model parameters -- verify this is
        # intended rather than computing the norm from `params`.
        weights = self.get_weights()
        self.updates = [K.update_add(self.iterations, 1)]
        scaled_lr = self.lr
        # Global norms across ALL weights/gradients.  NOTE(review): the paper
        # describes a layer-wise ratio (one per layer); this produces a single
        # scaling shared by every parameter -- confirm this is deliberate.
        w_norm = K.sqrt(K.sum([K.sum(K.square(weight))
                               for weight in weights]))
        g_norm = K.sqrt(K.sum([K.sum(K.square(grad))
                               for grad in grads]))
        # Trust ratio: eeta * ||w|| / (||g|| + wd * ||w|| + eps) times the base
        # lr; falls back to the plain lr when either norm is zero.
        scaled_lr = K.switch(K.greater(w_norm * g_norm, K.zeros([1])),
                             K.expand_dims((self.eeta * w_norm /
                                            (g_norm + self.weight_decay * w_norm +
                                             self.epsilon)) * self.lr),
                             K.ones([1]) * self.lr)
        if K.backend() == 'theano':
            scaled_lr = scaled_lr[0]  # otherwise theano raise broadcasting error
        # momentum
        moments = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
                   for param in params]
        self.weights = [self.iterations] + moments
        for param, grad, moment in zip(params, grads, moments):
            v0 = (moment * self.momentum)
            v1 = scaled_lr * grad  # velocity
            veloc = v0 - v1
            self.updates.append(K.update(moment, veloc))
            if self.nesterov:
                # Nesterov look-ahead: apply momentum to the new velocity and
                # subtract the raw gradient step again.
                new_param = param + (veloc * self.momentum) - v1
            else:
                new_param = param + veloc
            # Apply constraints.
            if getattr(param, 'constraint', None) is not None:
                new_param = param.constraint(new_param)
            self.updates.append(K.update(param, new_param))
        return self.updates

    def get_config(self):
        """Return a JSON-serializable config (backend variables as floats)."""
        config = {'lr': float(K.get_value(self.lr)),
                  'momentum': float(K.get_value(self.momentum)),
                  'weight_decay': float(K.get_value(self.weight_decay)),
                  'epsilon': self.epsilon,
                  'eeta': float(K.get_value(self.eeta)),
                  'nesterov': self.nesterov}
        base_config = super(LARS, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
from keras import backend as K
from keras.optimizers import Optimizer
class Padam(Optimizer):
    """Partially adaptive momentum estimation optimizer.

    Interpolates between SGD-with-momentum and AMSGrad by raising the
    adaptive denominator to the power `2 * partial`.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.
        amsgrad: boolean. Whether to apply the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and Beyond".
        partial: float, 0 <= partial <= 0.5. Controls the degree of partial
            momentum adaption: `partial=0` behaves like SGD, `partial=0.5`
            behaves like AMSGrad.

    # References
        - [Closing the Generalization Gap of Adaptive Gradient Methods
          in Training Deep Neural Networks](https://arxiv.org/pdf/1806.06763.pdf)
    """

    def __init__(self, lr=1e-1, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., amsgrad=False, partial=1. / 8., **kwargs):
        # Reject exponents that are known to diverge before building any state.
        if partial < 0 or partial > 0.5:
            raise ValueError(
                "Padam: 'partial' must be a positive float with a maximum "
                "value of `0.5`, since higher values will cause divergence "
                "during training."
            )
        super(Padam, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
        self.epsilon = K.epsilon() if epsilon is None else epsilon
        self.partial = partial
        self.initial_decay = decay
        self.amsgrad = amsgrad

    def get_updates(self, loss, params):
        """Build the symbolic Padam update ops for `params`."""
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            decay_factor = 1. / (1. + self.decay * K.cast(self.iterations,
                                                          K.dtype(self.decay)))
            lr = lr * decay_factor
        step = K.cast(self.iterations, K.floatx()) + 1
        # Adam-style bias correction of the step size.
        correction = (K.sqrt(1. - K.pow(self.beta_2, step)) /
                      (1. - K.pow(self.beta_1, step)))
        lr_t = lr * correction

        ms = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
              for param in params]
        vs = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
              for param in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
                     for param in params]
        else:
            # Dummy slots so the weight list keeps a fixed layout.
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for param, grad, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = self.beta_1 * m + (1. - self.beta_1) * grad
            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(grad)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                self.updates.append(K.update(vhat, vhat_t))
                denom = K.sqrt(vhat_t) + self.epsilon
            else:
                denom = K.sqrt(v_t) + self.epsilon
            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            # Partial momentum adaption: denominator raised to 2 * partial.
            new_param = param - lr_t * (m_t / (denom ** (self.partial * 2)))
            # Apply constraints.
            if getattr(param, 'constraint', None) is not None:
                new_param = param.constraint(new_param)
            self.updates.append(K.update(param, new_param))
        return self.updates

    def get_config(self):
        """Return a JSON-serializable config (backend variables as floats)."""
        config = dict(super(Padam, self).get_config())
        config.update({
            'lr': float(K.get_value(self.lr)),
            'beta_1': float(K.get_value(self.beta_1)),
            'beta_2': float(K.get_value(self.beta_2)),
            'decay': float(K.get_value(self.decay)),
            'epsilon': self.epsilon,
            'amsgrad': self.amsgrad,
            'partial': self.partial,
        })
        return config
from keras import backend as K
from keras.optimizers import Optimizer
class Yogi(Optimizer):
    """Yogi optimizer.

    A variation of Adam that controls the growth of the effective learning
    rate through a sign-based second-moment update, which (per the paper)
    improves on Adam while keeping similar convergence guarantees.
    Default parameters follow those provided in the original paper, Tab. 1.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.

    # References
        - [Adaptive Methods for Nonconvex Optimization](
          https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization)

    If you open an issue or a pull request about the Yogi optimizer,
    please add 'cc @MarcoAndreaBuchmann' to notify him.
    """

    def __init__(self, lr=0.01, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-3, decay=0., **kwargs):
        super(Yogi, self).__init__(**kwargs)
        # Reject out-of-range moment coefficients up front.
        if not 0 < beta_1 < 1:
            raise ValueError("beta_1 has to be in ]0, 1[")
        if not 0 < beta_2 < 1:
            raise ValueError("beta_2 has to be in ]0, 1[")
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
        if epsilon is None:
            epsilon = K.epsilon()
        if epsilon <= 0:
            raise ValueError("epsilon has to be larger than 0")
        self.epsilon = epsilon
        self.initial_decay = decay

    def get_updates(self, loss, params):
        """Build the symbolic Yogi update ops for `params`."""
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))
        step = K.cast(self.iterations, K.floatx()) + 1
        # Adam-style bias correction of the step size.
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, step)) /
                     (1. - K.pow(self.beta_1, step)))

        ms = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
              for param in params]
        vs = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
              for param in params]
        # These slots are never read or written below; presumably kept so the
        # optimizer weight layout matches Adam-style optimizers -- verify.
        vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for param, grad, m, v in zip(params, grads, ms, vs):
            grad_sq = K.square(grad)
            m_t = self.beta_1 * m + (1. - self.beta_1) * grad
            # Yogi second moment: step toward grad^2 by a fixed-size signed
            # increment instead of Adam's exponential average.
            v_t = v - (1. - self.beta_2) * K.sign(v - grad_sq) * grad_sq
            new_param = param - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            # Apply constraints.
            if getattr(param, 'constraint', None) is not None:
                new_param = param.constraint(new_param)
            self.updates.append(K.update(param, new_param))
        return self.updates

    def get_config(self):
        """Return a JSON-serializable config (backend variables as floats)."""
        config = dict(super(Yogi, self).get_config())
        config.update({
            'lr': float(K.get_value(self.lr)),
            'beta_1': float(K.get_value(self.beta_1)),
            'beta_2': float(K.get_value(self.beta_2)),
            'decay': float(K.get_value(self.decay)),
            'epsilon': self.epsilon,
        })
        return config
from __future__ import absolute_import
import numpy as np
from keras import backend as K
def get_standard_values():
    """Return a fixed batch of floats shared by the activation-function tests."""
    values = [[0, 0.1, 0.5, 0.9, 1.0]]
    return np.array(values, dtype=K.floatx())
def validate_activation(activation):
    """Smoke-test `activation` by applying it to the standard values."""
    _ = activation(get_standard_values())
import numpy as np
from keras import backend as K
# Module-level registries of metric callables; populated and consumed
# elsewhere -- their usage is not visible in this chunk.
all_metrics = []
all_sparse_metrics = []
def validate_metric(metric):
    """Check that `metric` reduces two random (6, 7) tensors to a scalar."""
    left = K.variable(np.random.random((6, 7)))
    right = K.variable(np.random.random((6, 7)))
    result = metric(left, right)
    assert K.eval(result).shape == ()
from __future__ import print_function
import numpy as np
from keras_contrib.utils import test_utils
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical
def get_test_data():
    """Return a deterministic two-class toy dataset with one-hot labels."""
    np.random.seed(1337)
    (x_train, y_train), _ = test_utils.get_test_data(num_train=1000,
                                                     num_test=200,
                                                     input_shape=(10,),
                                                     classification=True,
                                                     num_classes=2)
    return x_train, to_categorical(y_train)
def get_model(input_dim, num_hidden, output_dim):
    """Build a small ReLU MLP with a softmax classification head."""
    layers = [
        Dense(num_hidden, input_shape=(input_dim,)),
        Activation('relu'),
        Dense(output_dim),
        Activation('softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    return model
def _test_optimizer(optimizer, target=0.75):
    """Train a small MLP with `optimizer`; assert accuracy >= `target` and
    that serialize -> deserialize -> serialize is lossless."""
    x_train, y_train = get_test_data()
    model = get_model(x_train.shape[1], 10, y_train.shape[1])
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
    assert history.history['acc'][-1] >= target

    # Serialization round-trip must reproduce the exact same config.
    config = optimizers.serialize(optimizer)
    custom_objects = {optimizer.__class__.__name__: optimizer.__class__}
    restored = optimizers.deserialize(config, custom_objects)
    assert config == optimizers.serialize(restored)
import numpy as np
from keras.datasets import mnist
from keras.layers import Activation
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import np_utils
# Seed for reproducible runs of the MNIST-based tests below.
np.random.seed(1337)
# MNIST has 10 digit classes.
nb_classes = 10
batch_size = 128
nb_epoch = 5
# Class index singled out by get_data(); standard_weight / high_weight are
# presumably the per-class sample weights applied by tests outside this
# chunk -- their usage is not visible here.
weighted_class = 9
standard_weight = 1
high_weight = 5
# Truncate MNIST so the tests stay fast.
max_train_samples = 5000
max_test_samples = 1000
def get_data():
    """Return truncated, normalized MNIST splits plus the test-set indices
    belonging to `weighted_class`."""
    # The data, shuffled and split between train and test sets.
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    X_train = X_train.reshape(60000, 784)[:max_train_samples].astype('float32') / 255
    X_test = X_test.reshape(10000, 784)[:max_test_samples].astype('float32') / 255

    # Convert class vectors to binary class matrices.
    y_train = y_train[:max_train_samples]
    y_test = y_test[:max_test_samples]
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    test_ids = np.where(y_test == np.array(weighted_class))[0]
    return (X_train, Y_train), (X_test, Y_test), test_ids
def validate_regularizer(weight_reg=None, activity_reg=None):
    """Build a small MLP whose final Dense layer carries the given
    weight and activity regularizers.

    # Arguments
        weight_reg: regularizer applied to the layer's kernel (weights).
        activity_reg: regularizer applied to the layer's output.

    # Returns
        An uncompiled `Sequential` model.
    """
    model = Sequential()
    model.add(Dense(50, input_shape=(784,)))
    model.add(Activation('relu'))
    # `kernel_regularizer` is the Keras 2 name; `W_regularizer` is the
    # deprecated Keras 1 legacy kwarg.
    model.add(Dense(10, kernel_regularizer=weight_reg,
                    activity_regularizer=activity_reg))
    model.add(Activation('softmax'))
    return model
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment