Commit 8de66223 authored by maming's avatar maming
Browse files

Initial commit

parents
Pipeline #3358 canceled with stages
from __future__ import absolute_import
from keras.optimizers import Optimizer
from keras import backend as K
class FTML(Optimizer):
    """FTML optimizer (Follow The Moving Leader).

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 0.5.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
        decay: float >= 0. Learning rate decay over each update.

    # References
        - [FTML - Follow the Moving Leader in Deep Learning](
          http://www.cse.ust.hk/~szhengac/papers/icml17.pdf)
    """

    def __init__(self, lr=0.0025, beta_1=0.6, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(FTML, self).__init__(**kwargs)
        # Hyperparameters live in backend variables so they can be changed at
        # run time, grouped in a name scope like the other optimizers in this
        # module.  (The former `self.__dict__.update(locals())` hack also
        # stored spurious attributes such as `self.self` and `self.kwargs`,
        # and is removed here.)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
        self.epsilon = epsilon
        # Plain-float copy of the decay used to decide whether to apply the
        # schedule at all (fixes the former `inital_decay` typo).
        self.initial_decay = decay

    def get_updates(self, loss, params):
        """Build the symbolic parameter-update ops for `params`."""
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
        t = self.iterations + 1
        # Bias-corrected step size (Algorithm 1 of the paper).
        lr_t = lr / (1. - K.pow(self.beta_1, t))

        shapes = [K.int_shape(p) for p in params]
        zs = [K.zeros(shape) for shape in shapes]  # z: accumulated linear term
        vs = [K.zeros(shape) for shape in shapes]  # v: second-moment estimate
        ds = [K.zeros(shape) for shape in shapes]  # d: moving denominator
        self.weights = [self.iterations] + zs + vs + ds

        for p, g, z, v, d in zip(params, grads, zs, vs, ds):
            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
            d_t = (K.sqrt(v_t / (1. - K.pow(self.beta_2, t)))
                   + self.epsilon) / lr_t
            sigma_t = d_t - self.beta_1 * d
            z_t = self.beta_1 * z + (1. - self.beta_1) * g - sigma_t * p
            # FTML closed-form minimizer of the moving-leader objective.
            p_t = - z_t / d_t

            self.updates.append(K.update(z, z_t))
            self.updates.append(K.update(v, v_t))
            self.updates.append(K.update(d, d_t))

            new_p = p_t
            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        """Return a JSON-serializable config (backend variables as floats)."""
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(FTML, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
from keras import backend as K
from keras.optimizers import Optimizer
class LARS(Optimizer):
    """Layer-wise Adaptive Rate Scaling for large batch training.

    Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
    I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)

    Implements the LARS learning rate scheme presented in the paper above. This
    optimizer is useful when scaling the batch size to up to 32K without
    significant performance degradation. It is recommended to use the optimizer
    in conjunction with:
        - Gradual learning rate warm-up
        - Linear learning rate scaling
        - Poly rule learning rate decay

    Note, LARS scaling is currently only enabled for dense tensors.

    Args:
        lr: A `Tensor` or floating point value. The base learning rate.
        momentum: A floating point value. Momentum hyperparameter.
        weight_decay: A floating point value. Weight decay hyperparameter.
        eeta: LARS coefficient as used in the paper. Default set to LARS
            coefficient from the paper. (eeta / weight_decay) determines the
            highest scaling factor in LARS.
        epsilon: Optional epsilon parameter to be set in models that have very
            small gradients. Default set to 0.0.
        nesterov: when set to True, nesterov momentum will be enabled
    """

    def __init__(self,
                 lr,
                 momentum=0.9,
                 weight_decay=0.0001,
                 eeta=0.001,
                 epsilon=0.0,
                 nesterov=False,
                 **kwargs):
        # Validate plain-float hyperparameters before any backend state is made.
        if momentum < 0.0:
            raise ValueError("momentum should be positive: %s" % momentum)
        if weight_decay < 0.0:
            raise ValueError("weight_decay is not positive: %s" % weight_decay)
        super(LARS, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.momentum = K.variable(momentum, name='momentum')
            self.weight_decay = K.variable(weight_decay, name='weight_decay')
            self.eeta = K.variable(eeta, name='eeta')
        # epsilon and nesterov stay plain Python values; epsilon stabilizes
        # the trust-ratio denominator below.
        self.epsilon = epsilon
        self.nesterov = nesterov

    def get_updates(self, loss, params):
        """Build the symbolic LARS + momentum update ops for `params`."""
        grads = self.get_gradients(loss, params)
        # NOTE(review): `get_weights()` returns the numpy values of the
        # optimizer's own `self.weights` state (which is not populated until
        # later in this method), not the model parameters -- verify this is
        # intended rather than computing the norm from `params`.
        weights = self.get_weights()
        self.updates = [K.update_add(self.iterations, 1)]
        scaled_lr = self.lr
        # Global norms across ALL weights/gradients.  NOTE(review): the paper
        # describes a layer-wise ratio (one per layer); this produces a single
        # scaling shared by every parameter -- confirm this is deliberate.
        w_norm = K.sqrt(K.sum([K.sum(K.square(weight))
                               for weight in weights]))
        g_norm = K.sqrt(K.sum([K.sum(K.square(grad))
                               for grad in grads]))
        # Trust ratio: eeta * ||w|| / (||g|| + wd * ||w|| + eps) times the base
        # lr; falls back to the plain lr when either norm is zero.
        scaled_lr = K.switch(K.greater(w_norm * g_norm, K.zeros([1])),
                             K.expand_dims((self.eeta * w_norm /
                                            (g_norm + self.weight_decay * w_norm +
                                             self.epsilon)) * self.lr),
                             K.ones([1]) * self.lr)
        if K.backend() == 'theano':
            scaled_lr = scaled_lr[0]  # otherwise theano raise broadcasting error
        # momentum
        moments = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
                   for param in params]
        self.weights = [self.iterations] + moments
        for param, grad, moment in zip(params, grads, moments):
            v0 = (moment * self.momentum)
            v1 = scaled_lr * grad  # velocity
            veloc = v0 - v1
            self.updates.append(K.update(moment, veloc))
            if self.nesterov:
                # Nesterov look-ahead: apply momentum to the new velocity and
                # subtract the raw gradient step again.
                new_param = param + (veloc * self.momentum) - v1
            else:
                new_param = param + veloc
            # Apply constraints.
            if getattr(param, 'constraint', None) is not None:
                new_param = param.constraint(new_param)
            self.updates.append(K.update(param, new_param))
        return self.updates

    def get_config(self):
        """Return a JSON-serializable config (backend variables as floats)."""
        config = {'lr': float(K.get_value(self.lr)),
                  'momentum': float(K.get_value(self.momentum)),
                  'weight_decay': float(K.get_value(self.weight_decay)),
                  'epsilon': self.epsilon,
                  'eeta': float(K.get_value(self.eeta)),
                  'nesterov': self.nesterov}
        base_config = super(LARS, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
from keras import backend as K
from keras.optimizers import Optimizer
class Padam(Optimizer):
    """Partially adaptive momentum estimation optimizer.

    Interpolates between SGD-with-momentum and AMSGrad by raising the
    adaptive denominator to the power `2 * partial`.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.
        amsgrad: boolean. Whether to apply the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and Beyond".
        partial: float, 0 <= partial <= 0.5. Controls the degree of partial
            momentum adaption: `partial=0` behaves like SGD, `partial=0.5`
            behaves like AMSGrad.

    # References
        - [Closing the Generalization Gap of Adaptive Gradient Methods
          in Training Deep Neural Networks](https://arxiv.org/pdf/1806.06763.pdf)
    """

    def __init__(self, lr=1e-1, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., amsgrad=False, partial=1. / 8., **kwargs):
        # Reject exponents that are known to diverge before building any state.
        if partial < 0 or partial > 0.5:
            raise ValueError(
                "Padam: 'partial' must be a positive float with a maximum "
                "value of `0.5`, since higher values will cause divergence "
                "during training."
            )
        super(Padam, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
        self.epsilon = K.epsilon() if epsilon is None else epsilon
        self.partial = partial
        self.initial_decay = decay
        self.amsgrad = amsgrad

    def get_updates(self, loss, params):
        """Build the symbolic Padam update ops for `params`."""
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            decay_factor = 1. / (1. + self.decay * K.cast(self.iterations,
                                                          K.dtype(self.decay)))
            lr = lr * decay_factor
        step = K.cast(self.iterations, K.floatx()) + 1
        # Adam-style bias correction of the step size.
        correction = (K.sqrt(1. - K.pow(self.beta_2, step)) /
                      (1. - K.pow(self.beta_1, step)))
        lr_t = lr * correction

        ms = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
              for param in params]
        vs = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
              for param in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
                     for param in params]
        else:
            # Dummy slots so the weight list keeps a fixed layout.
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for param, grad, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = self.beta_1 * m + (1. - self.beta_1) * grad
            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(grad)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                self.updates.append(K.update(vhat, vhat_t))
                denom = K.sqrt(vhat_t) + self.epsilon
            else:
                denom = K.sqrt(v_t) + self.epsilon
            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            # Partial momentum adaption: denominator raised to 2 * partial.
            new_param = param - lr_t * (m_t / (denom ** (self.partial * 2)))
            # Apply constraints.
            if getattr(param, 'constraint', None) is not None:
                new_param = param.constraint(new_param)
            self.updates.append(K.update(param, new_param))
        return self.updates

    def get_config(self):
        """Return a JSON-serializable config (backend variables as floats)."""
        config = dict(super(Padam, self).get_config())
        config.update({
            'lr': float(K.get_value(self.lr)),
            'beta_1': float(K.get_value(self.beta_1)),
            'beta_2': float(K.get_value(self.beta_2)),
            'decay': float(K.get_value(self.decay)),
            'epsilon': self.epsilon,
            'amsgrad': self.amsgrad,
            'partial': self.partial,
        })
        return config
from keras import backend as K
from keras.optimizers import Optimizer
class Yogi(Optimizer):
    """Yogi optimizer.

    A variation of Adam that controls the growth of the effective learning
    rate through a sign-based second-moment update, which (per the paper)
    improves on Adam while keeping similar convergence guarantees.
    Default parameters follow those provided in the original paper, Tab. 1.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.

    # References
        - [Adaptive Methods for Nonconvex Optimization](
          https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization)

    If you open an issue or a pull request about the Yogi optimizer,
    please add 'cc @MarcoAndreaBuchmann' to notify him.
    """

    def __init__(self, lr=0.01, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-3, decay=0., **kwargs):
        super(Yogi, self).__init__(**kwargs)
        # Reject out-of-range moment coefficients up front.
        if not 0 < beta_1 < 1:
            raise ValueError("beta_1 has to be in ]0, 1[")
        if not 0 < beta_2 < 1:
            raise ValueError("beta_2 has to be in ]0, 1[")
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
        if epsilon is None:
            epsilon = K.epsilon()
        if epsilon <= 0:
            raise ValueError("epsilon has to be larger than 0")
        self.epsilon = epsilon
        self.initial_decay = decay

    def get_updates(self, loss, params):
        """Build the symbolic Yogi update ops for `params`."""
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))
        step = K.cast(self.iterations, K.floatx()) + 1
        # Adam-style bias correction of the step size.
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, step)) /
                     (1. - K.pow(self.beta_1, step)))

        ms = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
              for param in params]
        vs = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
              for param in params]
        # These slots are never read or written below; presumably kept so the
        # optimizer weight layout matches Adam-style optimizers -- verify.
        vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for param, grad, m, v in zip(params, grads, ms, vs):
            grad_sq = K.square(grad)
            m_t = self.beta_1 * m + (1. - self.beta_1) * grad
            # Yogi second moment: step toward grad^2 by a fixed-size signed
            # increment instead of Adam's exponential average.
            v_t = v - (1. - self.beta_2) * K.sign(v - grad_sq) * grad_sq
            new_param = param - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            # Apply constraints.
            if getattr(param, 'constraint', None) is not None:
                new_param = param.constraint(new_param)
            self.updates.append(K.update(param, new_param))
        return self.updates

    def get_config(self):
        """Return a JSON-serializable config (backend variables as floats)."""
        config = dict(super(Yogi, self).get_config())
        config.update({
            'lr': float(K.get_value(self.lr)),
            'beta_1': float(K.get_value(self.beta_1)),
            'beta_2': float(K.get_value(self.beta_2)),
            'decay': float(K.get_value(self.decay)),
            'epsilon': self.epsilon,
        })
        return config
from __future__ import absolute_import
import numpy as np
from keras import backend as K
def get_standard_values():
    """Return a fixed batch of floats shared by the activation-function tests."""
    values = [[0, 0.1, 0.5, 0.9, 1.0]]
    return np.array(values, dtype=K.floatx())
def validate_activation(activation):
    """Smoke-test `activation` by applying it to the standard values."""
    _ = activation(get_standard_values())
import numpy as np
from keras import backend as K
# Module-level registries of metric callables; populated and consumed
# elsewhere -- their usage is not visible in this chunk.
all_metrics = []
all_sparse_metrics = []
def validate_metric(metric):
    """Check that `metric` reduces two random (6, 7) tensors to a scalar."""
    left = K.variable(np.random.random((6, 7)))
    right = K.variable(np.random.random((6, 7)))
    result = metric(left, right)
    assert K.eval(result).shape == ()
from __future__ import print_function
import numpy as np
from keras_contrib.utils import test_utils
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical
def get_test_data():
    """Return a deterministic two-class toy dataset with one-hot labels."""
    np.random.seed(1337)
    (x_train, y_train), _ = test_utils.get_test_data(num_train=1000,
                                                     num_test=200,
                                                     input_shape=(10,),
                                                     classification=True,
                                                     num_classes=2)
    return x_train, to_categorical(y_train)
def get_model(input_dim, num_hidden, output_dim):
    """Build a small ReLU MLP with a softmax classification head."""
    layers = [
        Dense(num_hidden, input_shape=(input_dim,)),
        Activation('relu'),
        Dense(output_dim),
        Activation('softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    return model
def _test_optimizer(optimizer, target=0.75):
    """Train a small MLP with `optimizer`; assert accuracy >= `target` and
    that serialize -> deserialize -> serialize is lossless."""
    x_train, y_train = get_test_data()
    model = get_model(x_train.shape[1], 10, y_train.shape[1])
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
    assert history.history['acc'][-1] >= target

    # Serialization round-trip must reproduce the exact same config.
    config = optimizers.serialize(optimizer)
    custom_objects = {optimizer.__class__.__name__: optimizer.__class__}
    restored = optimizers.deserialize(config, custom_objects)
    assert config == optimizers.serialize(restored)
import numpy as np
from keras.datasets import mnist
from keras.layers import Activation
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import np_utils
# Seed for reproducible runs of the MNIST-based tests below.
np.random.seed(1337)
# MNIST has 10 digit classes.
nb_classes = 10
batch_size = 128
nb_epoch = 5
# Class index singled out by get_data(); standard_weight / high_weight are
# presumably the per-class sample weights applied by tests outside this
# chunk -- their usage is not visible here.
weighted_class = 9
standard_weight = 1
high_weight = 5
# Truncate MNIST so the tests stay fast.
max_train_samples = 5000
max_test_samples = 1000
def get_data():
    """Return truncated, normalized MNIST splits plus the test-set indices
    belonging to `weighted_class`."""
    # The data, shuffled and split between train and test sets.
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    X_train = X_train.reshape(60000, 784)[:max_train_samples].astype('float32') / 255
    X_test = X_test.reshape(10000, 784)[:max_test_samples].astype('float32') / 255

    # Convert class vectors to binary class matrices.
    y_train = y_train[:max_train_samples]
    y_test = y_test[:max_test_samples]
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    test_ids = np.where(y_test == np.array(weighted_class))[0]
    return (X_train, Y_train), (X_test, Y_test), test_ids
def validate_regularizer(weight_reg=None, activity_reg=None):
    """Build a small MLP whose final Dense layer carries the given
    weight and activity regularizers.

    # Arguments
        weight_reg: regularizer applied to the layer's kernel (weights).
        activity_reg: regularizer applied to the layer's output.

    # Returns
        An uncompiled `Sequential` model.
    """
    model = Sequential()
    model.add(Dense(50, input_shape=(784,)))
    model.add(Activation('relu'))
    # `kernel_regularizer` is the Keras 2 name; `W_regularizer` is the
    # deprecated Keras 1 legacy kwarg.
    model.add(Dense(10, kernel_regularizer=weight_reg,
                    activity_regularizer=activity_reg))
    model.add(Activation('softmax'))
    return model
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment