Initial commit

8de66223 · maming · 8de66223 · 8de66223 · 8de66223 · 8de66223
Commit 8de66223 authored Feb 04, 2026 by maming
20 changed files
--- a/code/keras_contrib/activations/squash.py
+++ b/code/keras_contrib/activations/squash.py
+from keras import backend as K
+def squash(x, axis=-1):
+    """Squash activation function (generally used in Capsule layers).
+    # Arguments
+        x: input tensor.
+        axis: axis along which the sum of squares is taken (default: -1).
+    # Returns
+        `x` multiplied by `sqrt(n) / (0.5 + n)`, where `n` is the sum of
+        squares of `x` along `axis` (dims kept) plus `K.epsilon()`.
+    """
+    sum_of_squares = K.sum(K.square(x), axis, keepdims=True)
+    # Epsilon keeps the square root away from an exact zero.
+    squared_norm = sum_of_squares + K.epsilon()
+    norm = K.sqrt(squared_norm)
+    return (norm / (0.5 + squared_norm)) * x
--- a/code/keras_contrib/applications/__init__.py
+++ b/code/keras_contrib/applications/__init__.py
+from .densenet import DenseNet
+from .resnet import ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152
+from .wide_resnet import WideResidualNetwork
+from .nasnet import NASNet, NASNetLarge, NASNetMobile
--- a/code/keras_contrib/applications/densenet.py
+++ b/code/keras_contrib/applications/densenet.py
+# -*- coding: utf-8 -*-
+'''DenseNet and DenseNet-FCN models for Keras.
+DenseNet is a network architecture where each layer is directly connected
+to every other layer in a feed-forward fashion (within each dense block).
+For each layer, the feature maps of all preceding layers are treated as
+separate inputs whereas its own feature maps are passed on as inputs to
+all subsequent layers. This connectivity pattern yields state-of-the-art
+accuracies on CIFAR10/100 (with or without data augmentation) and SVHN.
+On the large scale ILSVRC 2012 (ImageNet) dataset, DenseNet achieves a
+similar accuracy as ResNet, but using less than half the amount of
+parameters and roughly half the number of FLOPs.
+DenseNets support any input image size of 32x32 or greater, and are thus
+suited for CIFAR-10 or CIFAR-100 datasets. There are two types of DenseNets,
+one suited for smaller images (DenseNet) and one suited for ImageNet,
+called DenseNetImageNet. They are differentiated by the strided convolution
+and pooling operations prior to the initial dense block.
+The following table describes the size and error rate of DenseNetImageNet models
+on the ImageNet dataset (single crop). Pre-trained weights are provided for the
+121, 161 and 169 variants:
+------------------------------------------------------------------------------------
+|   Model type      | ImageNet Err (Top 1)  |  ImageNet Err (Top 5) |  Params (M)  |
+------------------------------------------------------------------------------------
+|   DenseNet-121    |    25.02 %            |        7.71 %         |     8.0      |
+|   DenseNet-169    |    23.80 %            |        6.85 %         |     14.3     |
+|   DenseNet-201    |    22.58 %            |        6.34 %         |     20.2     |
+|   DenseNet-161    |    22.20 %            |         -   %         |     28.9     |
+------------------------------------------------------------------------------------
+DenseNets can be extended to image segmentation tasks as described in the
+paper "The One Hundred Layers Tiramisu: Fully Convolutional DenseNets for
+Semantic Segmentation". Here, the dense blocks are arranged and concatenated
+with long skip connections for state of the art performance on the CamVid dataset.
+# Reference
+- [Densely Connected Convolutional Networks](https://arxiv.org/pdf/1608.06993.pdf)
+- [The One Hundred Layers Tiramisu: Fully Convolutional DenseNets for Semantic
+   Segmentation](https://arxiv.org/pdf/1611.09326.pdf)
+This implementation is based on the following reference code:
+ - https://github.com/gpleiss/efficient_densenet_pytorch
+ - https://github.com/liuzhuang13/DenseNet
+'''
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import division
+import warnings
+from keras.models import Model
+from keras.layers import Dense
+from keras.layers import Dropout
+from keras.layers import Activation
+from keras.layers import Reshape
+from keras.layers import Conv2D
+from keras.layers import Conv2DTranspose
+from keras.layers import UpSampling2D
+from keras.layers import MaxPooling2D
+from keras.layers import AveragePooling2D
+from keras.layers import GlobalMaxPooling2D
+from keras.layers import GlobalAveragePooling2D
+from keras.layers import Input
+from keras.layers import concatenate
+from keras.layers import BatchNormalization
+from keras.regularizers import l2
+from keras.utils.layer_utils import convert_all_kernels_in_model
+from keras.utils.data_utils import get_file
+from keras.engine.topology import get_source_inputs
+from keras_applications.imagenet_utils import _obtain_input_shape
+from keras.applications.imagenet_utils import preprocess_input as _preprocess_input
+import keras.backend as K
+from keras_contrib.layers import SubPixelUpscaling
+# Download URLs of the pre-trained DenseNet-BC checkpoints used when
+# `weights='imagenet'` is requested (121-, 161- and 169-layer variants).
+# The `*_NO_TOP` files are the ones fetched when `include_top` is False.
+DENSENET_121_WEIGHTS_PATH = (r'https://github.com/titu1994/DenseNet/releases/download'
+                             r'/v3.0/DenseNet-BC-121-32.h5')
+DENSENET_161_WEIGHTS_PATH = (r'https://github.com/titu1994/DenseNet/releases/download'
+                             r'/v3.0/DenseNet-BC-161-48.h5')
+DENSENET_169_WEIGHTS_PATH = (r'https://github.com/titu1994/DenseNet/releases/download'
+                             r'/v3.0/DenseNet-BC-169-32.h5')
+DENSENET_121_WEIGHTS_PATH_NO_TOP = (r'https://github.com/titu1994/DenseNet/releases/'
+                                    r'download/v3.0/DenseNet-BC-121-32-no-top.h5')
+DENSENET_161_WEIGHTS_PATH_NO_TOP = (r'https://github.com/titu1994/DenseNet/releases/'
+                                    r'download/v3.0/DenseNet-BC-161-48-no-top.h5')
+DENSENET_169_WEIGHTS_PATH_NO_TOP = (r'https://github.com/titu1994/DenseNet/releases/'
+                                    r'download/v3.0/DenseNet-BC-169-32-no-top.h5')
+def preprocess_input(x, data_format=None):
+    """Preprocess a tensor encoding a batch of images.
+    Runs the `preprocess_input` imported from
+    `keras.applications.imagenet_utils` and then multiplies the result
+    in place by 0.017.
+    # Arguments
+        x: input Numpy tensor, 4D.
+        data_format: data format of the image tensor.
+    # Returns
+        Preprocessed tensor.
+    """
+    scaled = _preprocess_input(x, data_format=data_format)
+    scaled *= 0.017  # scale values
+    return scaled
+def DenseNet(input_shape=None,
+             depth=40,
+             nb_dense_block=3,
+             growth_rate=12,
+             nb_filter=-1,
+             nb_layers_per_block=-1,
+             bottleneck=False,
+             reduction=0.0,
+             dropout_rate=0.0,
+             weight_decay=1e-4,
+             subsample_initial_block=False,
+             include_top=True,
+             weights=None,
+             input_tensor=None,
+             pooling=None,
+             classes=10,
+             activation='softmax',
+             transition_pooling='avg'):
+    '''Instantiate the DenseNet architecture.
+    The model and the weights are compatible with both
+    TensorFlow and Theano. The dimension ordering
+    convention used by the model is the one
+    specified in your Keras config file.
+    # Arguments
+        input_shape: optional shape tuple. It is validated by
+            `_obtain_input_shape` with a default size of 32 and a
+            minimum width/height of 8, using the backend's image
+            data format.
+            E.g. `(224, 224, 3)` would be one valid value
+            (with `channels_last` dim ordering).
+        depth: number of layers in the DenseNet
+        nb_dense_block: number of dense blocks to add to end
+        growth_rate: number of filters to add per dense block
+        nb_filter: initial number of filters. -1 indicates initial
+            number of filters will default to 2 * growth_rate
+        nb_layers_per_block: number of layers in each dense block.
+            Can be a -1, positive integer or a list.
+            If -1, calculates nb_layer_per_block from the network depth.
+            If positive integer, a set number of layers per dense block.
+            If list, nb_layer is used as provided. Note that list size must
+            be nb_dense_block
+        bottleneck: flag to add bottleneck blocks in between dense blocks
+        reduction: reduction factor of transition blocks.
+            Note : reduction value is inverted to compute compression.
+        dropout_rate: dropout rate
+        weight_decay: weight decay rate
+        subsample_initial_block: Changes model type to suit different datasets.
+            Should be set to True for ImageNet, and False for CIFAR datasets.
+            When set to True, the initial convolution will be strided and
+            adds a MaxPooling2D before the initial dense block.
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: one of `None` (random initialization) or
+            'imagenet' (pre-training on ImageNet). Pre-trained files
+            exist only for the DenseNet-BC 121, 161 and 169
+            configurations (4 dense blocks, `bottleneck=True`,
+            `reduction=0.5`, `subsample_initial_block=True`, and the
+            matching `depth`/`growth_rate`/`nb_filter`). For any other
+            configuration a warning is issued and the model keeps its
+            random initialization.
+        input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        activation: Type of activation at the top layer. Can be one of
+            'softmax' or 'sigmoid'. Note that if sigmoid is used,
+             classes must be 1.
+        transition_pooling: `avg` for avg pooling (default), `max` for max pooling,
+            None for no pooling during scale transition blocks. Please note that this
+            default differs from the DenseNetFCN paper in accordance with the DenseNet
+            paper.
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            `classes` or `activation`, or invalid input shape.
+    '''
+    if weights not in {'imagenet', None}:
+        raise ValueError('The `weights` argument should be either '
+                         '`None` (random initialization) or `imagenet` '
+                         '(pre-training on ImageNet).')
+    if weights == 'imagenet' and include_top and classes != 1000:
+        raise ValueError('If using `weights` as ImageNet with `include_top` '
+                         'as true, `classes` should be 1000')
+    if activation not in ['softmax', 'sigmoid']:
+        raise ValueError('activation must be one of "softmax" or "sigmoid"')
+    if activation == 'sigmoid' and classes != 1:
+        raise ValueError('sigmoid activation can only be used when classes = 1')
+    # Determine proper input shape
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=32,
+                                      min_size=8,
+                                      data_format=K.image_data_format(),
+                                      require_flatten=include_top)
+    if input_tensor is None:
+        img_input = Input(shape=input_shape)
+    elif not K.is_keras_tensor(input_tensor):
+        img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+        img_input = input_tensor
+    x = __create_dense_net(classes, img_input, include_top, depth, nb_dense_block,
+                           growth_rate, nb_filter, nb_layers_per_block, bottleneck,
+                           reduction, dropout_rate, weight_decay,
+                           subsample_initial_block, pooling, activation,
+                           transition_pooling)
+    # Ensure that the model takes into account
+    # any potential predecessors of `input_tensor`.
+    if input_tensor is not None:
+        inputs = get_source_inputs(input_tensor)
+    else:
+        inputs = img_input
+    # Create model.
+    model = Model(inputs, x, name='densenet')
+    if weights != 'imagenet':
+        return model
+    # Published checkpoints, keyed by (depth, growth_rate, nb_filter). Each
+    # entry holds (file name, URL, md5) for the full model and for the
+    # variant without the classifier top.
+    pretrained = {
+        (121, 32, 64): (
+            ('DenseNet-BC-121-32.h5', DENSENET_121_WEIGHTS_PATH,
+             'a439dd41aa672aef6daba4ee1fd54abd'),
+            ('DenseNet-BC-121-32-no-top.h5', DENSENET_121_WEIGHTS_PATH_NO_TOP,
+             '55e62a6358af8a0af0eedf399b5aea99')),
+        (161, 48, 96): (
+            ('DenseNet-BC-161-48.h5', DENSENET_161_WEIGHTS_PATH,
+             '6c326cf4fbdb57d31eff04333a23fcca'),
+            ('DenseNet-BC-161-48-no-top.h5', DENSENET_161_WEIGHTS_PATH_NO_TOP,
+             '1a9476b79f6b7673acaa2769e6427b92')),
+        (169, 32, 64): (
+            ('DenseNet-BC-169-32.h5', DENSENET_169_WEIGHTS_PATH,
+             '914869c361303d2e39dec640b4e606a6'),
+            ('DenseNet-BC-169-32-no-top.h5', DENSENET_169_WEIGHTS_PATH_NO_TOP,
+             '89c19e8276cfd10585d5fadc1df6859e')),
+    }
+    checkpoint = None
+    # Every published checkpoint shares these structural settings.
+    if ((nb_dense_block == 4) and (bottleneck is True) and (reduction == 0.5) and
+            subsample_initial_block):
+        checkpoint = pretrained.get((depth, growth_rate, nb_filter))
+    if checkpoint is None:
+        # Previously this case fell through silently, handing back a randomly
+        # initialized model although ImageNet weights had been requested.
+        warnings.warn('`weights="imagenet"` was requested, but no pre-trained '
+                      'checkpoint matches this configuration (depth=%s, '
+                      'nb_dense_block=%s, growth_rate=%s, nb_filter=%s, '
+                      'bottleneck=%s, reduction=%s, subsample_initial_block=%s). '
+                      'The model keeps its random initialization.'
+                      % (depth, nb_dense_block, growth_rate, nb_filter,
+                         bottleneck, reduction, subsample_initial_block))
+        return model
+    with_top, without_top = checkpoint
+    file_name, origin, md5_hash = with_top if include_top else without_top
+    weights_path = get_file(file_name,
+                            origin,
+                            cache_subdir='models',
+                            md5_hash=md5_hash)
+    model.load_weights(weights_path, by_name=True)
+    if K.backend() == 'theano':
+        convert_all_kernels_in_model(model)
+    if ((K.image_data_format() == 'channels_first') and
+            (K.backend() == 'tensorflow')):
+        warnings.warn('You are using the TensorFlow backend, yet you '
+                      'are using the Theano '
+                      'image data format convention '
+                      '(`image_data_format="channels_first"`). '
+                      'For best performance, set '
+                      '`image_data_format="channels_last"` in '
+                      'your Keras config '
+                      'at ~/.keras/keras.json.')
+    print("Weights for the model were loaded successfully")
+    return model
+def DenseNetFCN(input_shape, nb_dense_block=5, growth_rate=16, nb_layers_per_block=4,
+                reduction=0.0, dropout_rate=0.0, weight_decay=1E-4,
+                init_conv_filters=48, include_top=True, weights=None, input_tensor=None,
+                classes=1, activation='softmax', upsampling_conv=128,
+                upsampling_type='deconv', early_transition=False,
+                transition_pooling='max', initial_kernel_size=(3, 3)):
+    '''Instantiate the DenseNet FCN architecture.
+    Note that when using TensorFlow,
+    for best performance you should set
+    `image_data_format='channels_last'` in your Keras config
+    at ~/.keras/keras.json.
+    # Arguments
+        input_shape: shape tuple, required (must not be None). Rows and
+            columns that are not None must be at least
+            `2 ** nb_dense_block`.
+            E.g. `(200, 200, 3)` would be one valid value
+            (with `channels_last` dim ordering).
+        nb_dense_block: number of dense blocks to add to end (generally = 3)
+        growth_rate: number of filters to add per dense block
+        nb_layers_per_block: number of layers in each dense block.
+            Can be a positive integer or a list.
+            If positive integer, a set number of layers per dense block.
+            If list, nb_layer is used as provided. Note that list size must
+            be (nb_dense_block + 1)
+        reduction: reduction factor of transition blocks.
+            Note : reduction value is inverted to compute compression.
+        dropout_rate: dropout rate
+        weight_decay: weight decay factor
+        init_conv_filters: number of filters in the initial convolution layer
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: must be `None` (random initialization); no pre-trained
+            weights are provided for this model.
+        input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+            to use as image input for the model.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True.
+        activation: Type of activation at the top layer. Can be one of 'softmax'
+            or 'sigmoid'. Note that if sigmoid is used, classes must be 1.
+        upsampling_conv: number of convolutional layers in upsampling via subpixel
+            convolution
+        upsampling_type: Can be one of 'deconv', 'upsampling' and
+            'subpixel' (case-insensitive). Defines type of upsampling
+            algorithm used.
+        early_transition: Start with an extra initial transition down and end with
+            an extra transition up to reduce the network size.
+        transition_pooling: pooling mode forwarded to the network builder;
+            defaults to `max`. Presumably accepts the same values as the
+            `DenseNet` argument of the same name (`avg`, `max` or None).
+        initial_kernel_size: The first Conv2D kernel might vary in size based on the
+            application, this parameter makes it configurable.
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: if `weights` is not None, `upsampling_type` or
+            `activation` is unsupported, `input_shape` is None or too
+            small, `nb_dense_block` is below 1 (checked when
+            `nb_layers_per_block` is not a list), or sigmoid is used
+            with `classes != 1`.
+    '''
+    if weights is not None:
+        raise ValueError('The `weights` argument should be '
+                         '`None` (random initialization) as no '
+                         'model weights are provided.')
+    upsampling_type = upsampling_type.lower()
+    if upsampling_type not in ['upsampling', 'deconv', 'subpixel']:
+        raise ValueError('Parameter "upsampling_type" must be one of "upsampling", '
+                         '"deconv" or "subpixel".')
+    if input_shape is None:
+        raise ValueError('For fully convolutional models, '
+                         'input shape must be supplied.')
+    # The guard is kept exactly as before (only applied for non-list
+    # `nb_layers_per_block`); the message now reports the value that is
+    # actually tested instead of %d-formatting `nb_layers_per_block`.
+    if type(nb_layers_per_block) is not list and nb_dense_block < 1:
+        raise ValueError('Number of dense blocks must be at least 1. '
+                         'Argument value was %d.' % nb_dense_block)
+    if activation not in ['softmax', 'sigmoid']:
+        raise ValueError('activation must be one of "softmax" or "sigmoid"')
+    if activation == 'sigmoid' and classes != 1:
+        raise ValueError('sigmoid activation can only be used when classes = 1')
+    # Determine proper input shape. `input_shape` cannot be None here (it was
+    # rejected above), so only the spatial size needs validating.
+    min_size = 2 ** nb_dense_block
+    if K.image_data_format() == 'channels_first':
+        rows, cols = input_shape[1], input_shape[2]
+    else:
+        rows, cols = input_shape[0], input_shape[1]
+    if ((rows is not None and rows < min_size) or
+            (cols is not None and cols < min_size)):
+        raise ValueError('Input size must be at least ' +
+                         str(min_size) + 'x' + str(min_size) +
+                         ', got `input_shape=' + str(input_shape) + '`')
+    if input_tensor is None:
+        img_input = Input(shape=input_shape)
+    elif not K.is_keras_tensor(input_tensor):
+        img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+        img_input = input_tensor
+    x = __create_fcn_dense_net(classes, img_input, include_top, nb_dense_block,
+                               growth_rate, reduction, dropout_rate, weight_decay,
+                               nb_layers_per_block, upsampling_conv, upsampling_type,
+                               init_conv_filters, input_shape, activation,
+                               early_transition, transition_pooling,
+                               initial_kernel_size)
+    # Ensure that the model takes into account
+    # any potential predecessors of `input_tensor`.
+    if input_tensor is not None:
+        inputs = get_source_inputs(input_tensor)
+    else:
+        inputs = img_input
+    # Create model.
+    model = Model(inputs, x, name='fcn-densenet')
+    return model
+def DenseNetImageNet121(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights='imagenet',
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    '''DenseNet-121 preset.
+    Forwards every argument to `DenseNet`, fixing depth 121, four dense
+    blocks of [6, 12, 24, 16] layers, growth rate 32, 64 initial filters
+    and a subsampled initial block. `weights` defaults to 'imagenet'.
+    # Returns
+        The value returned by `DenseNet`.
+    '''
+    preset = dict(depth=121,
+                  nb_dense_block=4,
+                  growth_rate=32,
+                  nb_filter=64,
+                  nb_layers_per_block=[6, 12, 24, 16],
+                  subsample_initial_block=True)
+    return DenseNet(input_shape=input_shape,
+                    bottleneck=bottleneck,
+                    reduction=reduction,
+                    dropout_rate=dropout_rate,
+                    weight_decay=weight_decay,
+                    include_top=include_top,
+                    weights=weights,
+                    input_tensor=input_tensor,
+                    pooling=pooling,
+                    classes=classes,
+                    activation=activation,
+                    **preset)
+def DenseNetImageNet169(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights='imagenet',
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    '''DenseNet-169 preset.
+    Forwards every argument to `DenseNet`, fixing depth 169, four dense
+    blocks of [6, 12, 32, 32] layers, growth rate 32, 64 initial filters
+    and a subsampled initial block. `weights` defaults to 'imagenet'.
+    # Returns
+        The value returned by `DenseNet`.
+    '''
+    preset = dict(depth=169,
+                  nb_dense_block=4,
+                  growth_rate=32,
+                  nb_filter=64,
+                  nb_layers_per_block=[6, 12, 32, 32],
+                  subsample_initial_block=True)
+    return DenseNet(input_shape=input_shape,
+                    bottleneck=bottleneck,
+                    reduction=reduction,
+                    dropout_rate=dropout_rate,
+                    weight_decay=weight_decay,
+                    include_top=include_top,
+                    weights=weights,
+                    input_tensor=input_tensor,
+                    pooling=pooling,
+                    classes=classes,
+                    activation=activation,
+                    **preset)
+def DenseNetImageNet201(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights=None,
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    '''DenseNet-201 preset.
+    Forwards every argument to `DenseNet`, fixing depth 201, four dense
+    blocks of [6, 12, 48, 32] layers, growth rate 32, 64 initial filters
+    and a subsampled initial block. `weights` defaults to None.
+    # Returns
+        The value returned by `DenseNet`.
+    '''
+    preset = dict(depth=201,
+                  nb_dense_block=4,
+                  growth_rate=32,
+                  nb_filter=64,
+                  nb_layers_per_block=[6, 12, 48, 32],
+                  subsample_initial_block=True)
+    return DenseNet(input_shape=input_shape,
+                    bottleneck=bottleneck,
+                    reduction=reduction,
+                    dropout_rate=dropout_rate,
+                    weight_decay=weight_decay,
+                    include_top=include_top,
+                    weights=weights,
+                    input_tensor=input_tensor,
+                    pooling=pooling,
+                    classes=classes,
+                    activation=activation,
+                    **preset)
+def DenseNetImageNet264(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights=None,
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    '''DenseNet-264 preset.
+    Forwards every argument to `DenseNet`, fixing depth 264, four dense
+    blocks of [6, 12, 64, 48] layers, growth rate 32, 64 initial filters
+    and a subsampled initial block. `weights` defaults to None.
+    # Returns
+        The value returned by `DenseNet`.
+    '''
+    preset = dict(depth=264,
+                  nb_dense_block=4,
+                  growth_rate=32,
+                  nb_filter=64,
+                  nb_layers_per_block=[6, 12, 64, 48],
+                  subsample_initial_block=True)
+    return DenseNet(input_shape=input_shape,
+                    bottleneck=bottleneck,
+                    reduction=reduction,
+                    dropout_rate=dropout_rate,
+                    weight_decay=weight_decay,
+                    include_top=include_top,
+                    weights=weights,
+                    input_tensor=input_tensor,
+                    pooling=pooling,
+                    classes=classes,
+                    activation=activation,
+                    **preset)
+def DenseNetImageNet161(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights='imagenet',
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    '''DenseNet-161 preset.
+    Forwards every argument to `DenseNet`, fixing depth 161, four dense
+    blocks of [6, 12, 36, 24] layers, growth rate 48, 96 initial filters
+    and a subsampled initial block. `weights` defaults to 'imagenet'.
+    # Returns
+        The value returned by `DenseNet`.
+    '''
+    preset = dict(depth=161,
+                  nb_dense_block=4,
+                  growth_rate=48,
+                  nb_filter=96,
+                  nb_layers_per_block=[6, 12, 36, 24],
+                  subsample_initial_block=True)
+    return DenseNet(input_shape=input_shape,
+                    bottleneck=bottleneck,
+                    reduction=reduction,
+                    dropout_rate=dropout_rate,
+                    weight_decay=weight_decay,
+                    include_top=include_top,
+                    weights=weights,
+                    input_tensor=input_tensor,
+                    pooling=pooling,
+                    classes=classes,
+                    activation=activation,
+                    **preset)
+def name_or_none(prefix, name):
+    '''Return `prefix + name`, or None when either argument is None.'''
+    if prefix is None or name is None:
+        return None
+    return prefix + name
+def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None,
+                 weight_decay=1e-4, block_prefix=None):
+    '''
+    Applies batch normalization, relu and a 3x3 convolution to `ip`,
+    optionally preceded by a 1x1 bottleneck convolution.
+    # Arguments
+        ip: Input tensor
+        nb_filter: integer, number of filters of the 3x3 convolution.
+            The bottleneck convolution uses `nb_filter * 4` filters.
+        bottleneck: if True, inserts the 1x1 convolution (followed by
+            its own batch normalization and relu) before the 3x3 one
+        dropout_rate: dropout rate; a Dropout layer is appended only
+            when this value is truthy
+        weight_decay: l2 factor applied to the bottleneck convolution kernel
+        block_prefix: str, for unique layer naming. When None the named
+            layers receive `name=None`.
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+    # Output shape
+        4D tensor with `nb_filter` channels. Both convolutions use
+        `padding='same'` and are given no stride argument.
+    # Returns
+        output tensor of block
+    '''
+    with K.name_scope('ConvBlock'):
+        channel_axis = -1
+        if K.image_data_format() == 'channels_first':
+            channel_axis = 1
+        out = BatchNormalization(axis=channel_axis, epsilon=1.1e-5,
+                                 name=name_or_none(block_prefix, '_bn'))(ip)
+        out = Activation('relu')(out)
+        if bottleneck:
+            bottleneck_filters = nb_filter * 4
+            out = Conv2D(bottleneck_filters, (1, 1),
+                         kernel_initializer='he_normal',
+                         padding='same', use_bias=False,
+                         kernel_regularizer=l2(weight_decay),
+                         name=name_or_none(block_prefix,
+                                           '_bottleneck_conv2D'))(out)
+            out = BatchNormalization(axis=channel_axis, epsilon=1.1e-5,
+                                     name=name_or_none(block_prefix,
+                                                       '_bottleneck_bn'))(out)
+            out = Activation('relu')(out)
+        # NOTE(review): unlike the bottleneck convolution, this 3x3 convolution
+        # has no kernel_regularizer, so `weight_decay` is unused when
+        # `bottleneck` is False. Confirm whether that is intended before
+        # changing it, since adding one alters the training loss.
+        out = Conv2D(nb_filter, (3, 3),
+                     kernel_initializer='he_normal',
+                     padding='same',
+                     use_bias=False,
+                     name=name_or_none(block_prefix, '_conv2D'))(out)
+        if dropout_rate:
+            out = Dropout(dropout_rate)(out)
+    return out
+def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False,
+                  dropout_rate=None, weight_decay=1e-4, grow_nb_filters=True,
+                  return_concat_list=False, block_prefix=None):
+    '''
+    Stack `nb_layers` conv_blocks, concatenating the output of each one
+    onto the running tensor so that it feeds all subsequent conv_blocks
+    # Arguments
+        x: input keras tensor
+        nb_layers: the number of conv_blocks to append to the model
+        nb_filter: integer, running filter count; increased by
+            `growth_rate` per conv_block when `grow_nb_filters` is True
+        growth_rate: number of filters produced by each conv_block
+        bottleneck: if True, adds a bottleneck convolution block to
+            each conv_block
+        dropout_rate: dropout rate
+        weight_decay: weight decay factor
+        grow_nb_filters: if True, allows number of filters to grow
+        return_concat_list: set to True to also return the list of
+            feature maps (the input tensor followed by every
+            conv_block output)
+        block_prefix: str, for block unique naming; conv_block `i`
+            is given the prefix `block_prefix + '_i'`
+    # Return
+        If return_concat_list is True, a tuple of the output keras
+        tensor, the number of filters and the list of feature maps.
+        If return_concat_list is False, a tuple of the output keras
+        tensor and the number of filters.
+    '''
+    with K.name_scope('DenseBlock'):
+        channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+        feature_maps = [x]
+        for layer_idx in range(nb_layers):
+            layer_prefix = name_or_none(block_prefix, '_%i' % layer_idx)
+            new_maps = __conv_block(x, growth_rate, bottleneck, dropout_rate,
+                                    weight_decay, block_prefix=layer_prefix)
+            feature_maps.append(new_maps)
+            # The concatenation becomes the input of the next conv_block.
+            x = concatenate([x, new_maps], axis=channel_axis)
+            if grow_nb_filters:
+                nb_filter += growth_rate
+        if not return_concat_list:
+            return x, nb_filter
+        return x, nb_filter, feature_maps
+def __transition_block(ip, nb_filter, compression=1.0, weight_decay=1e-4,
+                       block_prefix=None, transition_pooling='max'):
+    '''
+    Adds batch normalization, relu and a pointwise (1x1) convolution,
+    optionally followed by a 2x2 pooling layer with stride 2. The number
+    of output convolution filters can be reduced by appropriately reducing
+    the compression parameter.
+    # Arguments
+        ip: input keras tensor
+        nb_filter: integer, number of filters before compression; the
+            convolution uses int(nb_filter * compression) filters
+        compression: calculated as 1 - reduction. Reduces the number
+            of feature maps in the transition block.
+        weight_decay: weight decay factor
+        block_prefix: str, for block unique naming
+        transition_pooling: 'max' for max pooling (default), 'avg' for
+            average pooling; any other value adds no pooling layer
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+    # Output shape
+        When a pooling layer is added, 4D tensor with shape:
+        `(samples, nb_filter * compression, rows / 2, cols / 2)`
+        if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows / 2, cols / 2, nb_filter * compression)`
+        if data_format='channels_last'.
+        Without pooling, rows and cols are left unchanged.
+    # Returns
+        a keras tensor
+    '''
+    with K.name_scope('Transition'):
+        if K.image_data_format() == 'channels_first':
+            channel_axis = 1
+        else:
+            channel_axis = -1
+        bn_name = name_or_none(block_prefix, '_bn')
+        conv_name = name_or_none(block_prefix, '_conv2D')
+
+        x = BatchNormalization(axis=channel_axis, epsilon=1.1e-5, name=bn_name)(ip)
+        x = Activation('relu')(x)
+        nb_out_filters = int(nb_filter * compression)
+        x = Conv2D(nb_out_filters, (1, 1), kernel_initializer='he_normal',
+                   padding='same', use_bias=False,
+                   kernel_regularizer=l2(weight_decay), name=conv_name)(x)
+        # Spatial downsampling is optional: unknown modes leave x unpooled.
+        if transition_pooling == 'max':
+            x = MaxPooling2D((2, 2), strides=(2, 2))(x)
+        elif transition_pooling == 'avg':
+            x = AveragePooling2D((2, 2), strides=(2, 2))(x)
+        return x
+def __transition_up_block(ip, nb_filters, type='deconv', weight_decay=1E-4,
+                          block_prefix=None):
+    '''Adds an upsampling block. The upsampling operation depends on the
+    `type` parameter.
+    # Arguments
+        ip: input keras tensor
+        nb_filters: integer, the dimensionality of the output space
+            (i.e. the number output of filters in the convolution).
+            Not used when type is 'upsampling', which adds no convolution.
+        type: can be 'upsampling', 'subpixel', 'deconv'. Determines
+            type of upsampling performed. Any value other than
+            'upsampling' or 'subpixel' selects the transposed convolution.
+        weight_decay: weight decay factor
+        block_prefix: str, for block unique naming
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+    # Output shape
+        4D tensor with shape:
+        `(samples, nb_filter, rows * 2, cols * 2)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows * 2, cols * 2, nb_filter)` if data_format='channels_last'.
+    # Returns
+        a keras tensor
+    '''
+    with K.name_scope('TransitionUp'):
+        if type == 'upsampling':
+            x = UpSampling2D(name=name_or_none(block_prefix, '_upsampling'))(ip)
+        elif type == 'subpixel':
+            x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same',
+                       kernel_regularizer=l2(weight_decay), use_bias=False,
+                       kernel_initializer='he_normal',
+                       name=name_or_none(block_prefix, '_conv2D'))(ip)
+            x = SubPixelUpscaling(scale_factor=2,
+                                  name=name_or_none(block_prefix, '_subpixel'))(x)
+            # The second convolution needs its own name: Keras rejects a model
+            # in which two layers share a name, and both convolutions of this
+            # branch previously used the '_conv2D' suffix.
+            x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same',
+                       kernel_regularizer=l2(weight_decay), use_bias=False,
+                       kernel_initializer='he_normal',
+                       name=name_or_none(block_prefix, '_conv2D_2'))(x)
+        else:
+            x = Conv2DTranspose(nb_filters, (3, 3), activation='relu', padding='same',
+                                strides=(2, 2), kernel_initializer='he_normal',
+                                kernel_regularizer=l2(weight_decay),
+                                name=name_or_none(block_prefix, '_conv2DT'))(ip)
+        return x
+def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_block=3,
+                       growth_rate=12, nb_filter=-1, nb_layers_per_block=-1,
+                       bottleneck=False, reduction=0.0, dropout_rate=None,
+                       weight_decay=1e-4, subsample_initial_block=False, pooling=None,
+                       activation='softmax', transition_pooling='avg'):
+    ''' Build the DenseNet model
+    # Arguments
+        nb_classes: number of classes
+        img_input: input keras tensor the initial convolution is applied to
+        include_top: flag to include the final Dense layer
+        depth: number of layers. Only used when nb_layers_per_block is -1.
+        nb_dense_block: number of dense blocks to add to end (generally = 3)
+        growth_rate: number of filters added by each layer of a dense block
+        nb_filter: initial number of filters. Any value <= 0 (default -1)
+            sets the initial number of filters to 2 * growth_rate
+        nb_layers_per_block: number of layers in each dense block.
+                Can be a -1, positive integer or a list (or tuple).
+                If -1, calculates nb_layer_per_block from the depth of the network.
+                If positive integer, a set number of layers per dense block.
+                If list, nb_layer is used as provided. Note that list size must
+                be nb_dense_block; the last entry configures the final
+                dense block.
+        bottleneck: add bottleneck blocks
+        reduction: reduction factor of transition blocks. Note : reduction value is
+            inverted to compute compression
+        dropout_rate: dropout rate
+        weight_decay: weight decay rate
+        subsample_initial_block: Changes model type to suit different datasets.
+            Should be set to True for ImageNet, and False for CIFAR datasets.
+            When set to True, the initial convolution will be strided and
+            adds a MaxPooling2D before the initial dense block.
+        pooling: Optional pooling mode applied after the final activation
+            (and before the final Dense layer when `include_top` is `True`).
+            - `None` means that no pooling is applied and
+                the 4D tensor output of the last
+                convolutional block is used as is.
+            - `avg` means that global average pooling
+                will be applied, producing a 2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        activation: Type of activation at the top layer. Can be one of 'softmax' or
+            'sigmoid'. Note that if sigmoid is used, classes must be 1.
+        transition_pooling: `avg` for avg pooling (default), `max` for max pooling,
+            None for no pooling during scale transition blocks. Please note that this
+            default differs from the DenseNetFCN paper in accordance with the DenseNet
+            paper.
+    # Returns
+        a keras tensor
+    # Raises
+        ValueError: in case of invalid argument for `reduction`
+            or a `nb_layers_per_block` list whose length differs from
+            `nb_dense_block`
+        AssertionError: if `nb_layers_per_block` is -1 and `depth` is not
+            of the form 3 N + 4
+    '''
+    with K.name_scope('DenseNet'):
+        concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+        if not 0.0 <= reduction <= 1.0:
+            raise ValueError('`reduction` value must lie between 0.0 and 1.0')
+        # layers in each dense block
+        if isinstance(nb_layers_per_block, (list, tuple)):
+            nb_layers = list(nb_layers_per_block)  # Convert tuple to list
+            if len(nb_layers) != nb_dense_block:
+                raise ValueError('If `nb_layers_per_block` is a list, its length '
+                                 'must be equal to `nb_dense_block`.')
+            # The last entry belongs to the final dense block, which is built
+            # separately because it is not followed by a transition block.
+            final_nb_layer = nb_layers[-1]
+            nb_layers = nb_layers[:-1]
+        elif nb_layers_per_block == -1:
+            assert (depth - 4) % 3 == 0, ('Depth must be 3 N + 4 '
+                                          'if nb_layers_per_block == -1')
+            count = int((depth - 4) / 3)
+            if bottleneck:
+                count = count // 2
+            nb_layers = [count] * nb_dense_block
+            final_nb_layer = count
+        else:
+            final_nb_layer = nb_layers_per_block
+            nb_layers = [nb_layers_per_block] * nb_dense_block
+        # compute initial nb_filter if not positive, else accept the user's value
+        if nb_filter <= 0:
+            nb_filter = 2 * growth_rate
+        # compute compression factor
+        compression = 1.0 - reduction
+        # Initial convolution
+        if subsample_initial_block:
+            initial_kernel = (7, 7)
+            initial_strides = (2, 2)
+        else:
+            initial_kernel = (3, 3)
+            initial_strides = (1, 1)
+        x = Conv2D(nb_filter, initial_kernel, kernel_initializer='he_normal',
+                   padding='same', name='initial_conv2D', strides=initial_strides,
+                   use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
+        if subsample_initial_block:
+            x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5,
+                                   name='initial_bn')(x)
+            x = Activation('relu')(x)
+            x = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
+        # Add dense blocks, each followed by a transition block
+        for block_idx in range(nb_dense_block - 1):
+            x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter,
+                                         growth_rate, bottleneck=bottleneck,
+                                         dropout_rate=dropout_rate,
+                                         weight_decay=weight_decay,
+                                         block_prefix='dense_%i' % block_idx)
+            x = __transition_block(x, nb_filter, compression=compression,
+                                   weight_decay=weight_decay,
+                                   block_prefix='tr_%i' % block_idx,
+                                   transition_pooling=transition_pooling)
+            # keep the running count in step with the transition convolution
+            nb_filter = int(nb_filter * compression)
+        # The last dense_block does not have a transition_block
+        x, nb_filter = __dense_block(x, final_nb_layer, nb_filter, growth_rate,
+                                     bottleneck=bottleneck, dropout_rate=dropout_rate,
+                                     weight_decay=weight_decay,
+                                     block_prefix='dense_%i' % (nb_dense_block - 1))
+        x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='final_bn')(x)
+        x = Activation('relu')(x)
+        # Global pooling is applied the same way with or without the top layer.
+        if pooling == 'avg':
+            x = GlobalAveragePooling2D()(x)
+        elif pooling == 'max':
+            x = GlobalMaxPooling2D()(x)
+        if include_top:
+            # NOTE(review): with pooling=None the Dense layer is applied to the
+            # un-pooled 4D tensor - confirm that this is intended.
+            x = Dense(nb_classes, activation=activation)(x)
+        return x
+def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5,
+                           growth_rate=12, reduction=0.0, dropout_rate=None,
+                           weight_decay=1e-4, nb_layers_per_block=4,
+                           nb_upsampling_conv=128, upsampling_type='deconv',
+                           init_conv_filters=48, input_shape=None, activation='softmax',
+                           early_transition=False, transition_pooling='max',
+                           initial_kernel_size=(3, 3)):
+    ''' Build the DenseNet-FCN model
+    # Arguments
+        nb_classes: number of classes
+        img_input: input keras tensor the initial convolution is applied to
+        include_top: flag to include the top of the network: a 1x1
+            convolution producing `nb_classes` channels followed by the
+            `activation`
+        nb_dense_block: number of dense blocks on the downsampling path;
+            the same number is mirrored on the upsampling path
+        growth_rate: number of filters added by each layer of a dense block
+        reduction: reduction factor of transition blocks. Note : reduction value
+            is inverted to compute compression
+        dropout_rate: dropout rate
+        weight_decay: weight decay
+        nb_layers_per_block: number of layers in each dense block.
+            Can be a positive integer or a list (or tuple).
+            If positive integer, a set number of layers per dense block.
+            If list, nb_layer is used as provided. Note that list size must
+            be (nb_dense_block + 1); the list is mirrored to configure the
+            upsampling path.
+        nb_upsampling_conv: must be greater than 12 and divisible by 4.
+            It is only validated in this function.
+        upsampling_type: Can be one of 'upsampling', 'deconv' and 'subpixel'. Defines
+            type of upsampling algorithm used.
+        init_conv_filters: number of filters of the initial convolution
+        input_shape: shape tuple of the input, without the batch dimension.
+            Only read when `include_top` is True, to reshape the output
+            around the top activation.
+        activation: Type of activation at the top layer. Can be one of 'softmax' or
+            'sigmoid'. Note that if sigmoid is used, classes must be 1.
+        early_transition: Start with an extra initial transition down and end with an
+            extra transition up to reduce the network size.
+        transition_pooling: 'max' for max pooling (default), 'avg' for average pooling,
+            None for no pooling. Please note that this default differs from the DenseNet
+            paper in accordance with the DenseNetFCN paper.
+        initial_kernel_size: The first Conv2D kernel might vary in size based on the
+            application, this parameter makes it configurable.
+    # Returns
+        a keras tensor
+    # Raises
+        ValueError: in case of invalid argument for `reduction`,
+            `nb_layers_per_block` or `nb_upsampling_conv`.
+    '''
+    with K.name_scope('DenseNetFCN'):
+        concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+        if not 0.0 <= reduction <= 1.0:
+            raise ValueError('`reduction` value must lie between 0.0 and 1.0')
+        # Check that nb_upsampling_conv has the minimum number of filters. The
+        # minimum is set to 12, as at least 3 color channels are needed for
+        # correct upsampling.
+        if not (nb_upsampling_conv > 12 and nb_upsampling_conv % 4 == 0):
+            raise ValueError('Parameter `nb_upsampling_conv` number of channels must '
+                             'be a positive number divisible by 4 and greater than 12')
+        # layers in each dense block
+        if isinstance(nb_layers_per_block, (list, tuple)):
+            nb_layers = list(nb_layers_per_block)  # Convert tuple to list
+            if len(nb_layers) != (nb_dense_block + 1):
+                raise ValueError('If `nb_layers_per_block` is a list, its length '
+                                 'must be (`nb_dense_block` + 1)')
+            bottleneck_nb_layers = nb_layers[-1]
+            # Mirror the down path (excluding the bottleneck entry) so the
+            # list also describes the dense blocks of the up path.
+            rev_layers = nb_layers[::-1]
+            nb_layers.extend(rev_layers[1:])
+        else:
+            bottleneck_nb_layers = nb_layers_per_block
+            nb_layers = [nb_layers_per_block] * (2 * nb_dense_block + 1)
+        # compute compression factor
+        compression = 1.0 - reduction
+        # Initial convolution
+        x = Conv2D(init_conv_filters, initial_kernel_size,
+                   kernel_initializer='he_normal', padding='same',
+                   name='initial_conv2D', use_bias=False,
+                   kernel_regularizer=l2(weight_decay))(img_input)
+        x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='initial_bn')(x)
+        x = Activation('relu')(x)
+        nb_filter = init_conv_filters
+        skip_list = []
+        if early_transition:
+            # NOTE(review): unlike the transition blocks in the loop below,
+            # nb_filter is not scaled by `compression` after this block -
+            # confirm that this is intended.
+            x = __transition_block(x, nb_filter, compression=compression,
+                                   weight_decay=weight_decay, block_prefix='tr_early',
+                                   transition_pooling=transition_pooling)
+        # Add dense blocks and transition down block
+        for block_idx in range(nb_dense_block):
+            x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter,
+                                         growth_rate, dropout_rate=dropout_rate,
+                                         weight_decay=weight_decay,
+                                         block_prefix='dense_%i' % block_idx)
+            # Skip connection
+            skip_list.append(x)
+            # add transition_block
+            x = __transition_block(x, nb_filter, compression=compression,
+                                   weight_decay=weight_decay,
+                                   block_prefix='tr_%i' % block_idx,
+                                   transition_pooling=transition_pooling)
+            # keep the running count in step with the transition convolution
+            nb_filter = int(nb_filter * compression)
+        # The last dense_block does not have a transition_down_block
+        # return the concatenated feature maps without the concatenation of the input
+        block_prefix = 'dense_%i' % nb_dense_block
+        _, nb_filter, concat_list = __dense_block(x, bottleneck_nb_layers, nb_filter,
+                                                  growth_rate,
+                                                  dropout_rate=dropout_rate,
+                                                  weight_decay=weight_decay,
+                                                  return_concat_list=True,
+                                                  block_prefix=block_prefix)
+        skip_list = skip_list[::-1]  # reverse the skip list
+        # Add dense blocks and transition up block
+        for block_idx in range(nb_dense_block):
+            n_filters_keep = growth_rate * nb_layers[nb_dense_block + block_idx]
+            # upsampling block must upsample only the feature maps (concat_list[1:]),
+            # not the concatenation of the input with the feature maps
+            # (concat_list[0]).
+            new_features = concatenate(concat_list[1:], axis=concat_axis)
+            t = __transition_up_block(new_features, nb_filters=n_filters_keep,
+                                      type=upsampling_type, weight_decay=weight_decay,
+                                      block_prefix='tr_up_%i' % block_idx)
+            # concatenate the skip connection with the transition block
+            x = concatenate([t, skip_list[block_idx]], axis=concat_axis)
+            # Dont allow the feature map size to grow in upsampling dense blocks
+            block_layer_index = nb_dense_block + 1 + block_idx
+            block_prefix = 'dense_%i' % (block_layer_index)
+            x_up, nb_filter, concat_list = __dense_block(x,
+                                                         nb_layers[block_layer_index],
+                                                         nb_filter=growth_rate,
+                                                         growth_rate=growth_rate,
+                                                         dropout_rate=dropout_rate,
+                                                         weight_decay=weight_decay,
+                                                         return_concat_list=True,
+                                                         grow_nb_filters=False,
+                                                         block_prefix=block_prefix)
+        if early_transition:
+            x_up = __transition_up_block(x_up, nb_filters=nb_filter,
+                                         type=upsampling_type,
+                                         weight_decay=weight_decay,
+                                         block_prefix='tr_up_early')
+        if include_top:
+            x = Conv2D(nb_classes, (1, 1), activation='linear', padding='same',
+                       use_bias=False)(x_up)
+            if K.image_data_format() == 'channels_first':
+                channel, row, col = input_shape
+            else:
+                row, col, channel = input_shape
+            # Flatten the spatial dimensions so the activation is applied over
+            # the last (class) axis, then restore the 2D layout.
+            # NOTE(review): both reshapes assume a channels-last layout of the
+            # convolution output - confirm the behavior for 'channels_first'.
+            x = Reshape((row * col, nb_classes))(x)
+            x = Activation(activation)(x)
+            x = Reshape((row, col, nb_classes))(x)
+        else:
+            x = x_up
+        return x
--- a/code/keras_contrib/applications/nasnet.py
+++ b/code/keras_contrib/applications/nasnet.py
+"""Collection of NASNet models
+The reference paper:
+ - [Learning Transferable Architectures for Scalable Image Recognition]
+    (https://arxiv.org/abs/1707.07012)
+The reference implementation:
+1. TF Slim
+ - https://github.com/tensorflow/models/blob/master/research/slim/nets/
+   nasnet/nasnet.py
+2. TensorNets
+ - https://github.com/taehoonlee/tensornets/blob/master/tensornets/nasnets.py
+3. Weights
+ - https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet
+"""
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import division
+import warnings
+from keras.models import Model
+from keras.layers import Input
+from keras.layers import Activation
+from keras.layers import Dense
+from keras.layers import Flatten
+from keras.layers import Dropout
+from keras.layers import BatchNormalization
+from keras.layers import MaxPooling2D
+from keras.layers import AveragePooling2D
+from keras.layers import GlobalAveragePooling2D
+from keras.layers import GlobalMaxPooling2D
+from keras.layers import Conv2D
+from keras.layers import SeparableConv2D
+from keras.layers import ZeroPadding2D
+from keras.layers import Cropping2D
+from keras.layers import concatenate
+from keras.layers import add
+from keras.regularizers import l2
+from keras.utils.data_utils import get_file
+from keras.engine.topology import get_source_inputs
+from keras_applications.imagenet_utils import _obtain_input_shape
+from keras import backend as K
+# Batch-normalization settings: NASNet passes these as the `momentum` and
+# `epsilon` arguments of its BatchNormalization layers.
+_BN_DECAY = 0.9997
+_BN_EPSILON = 1e-3
+# URLs of the released weight files (GitHub releases of titu1994/Keras-NASNet).
+# There is one URL per variant: mobile or large, with or without the
+# auxiliary branch, and the full file or the "-no-top" file.
+# NOTE: "AUXULARY" in the mobile constant names is a misspelling of
+# "auxiliary"; the names are referenced elsewhere in this module, so the
+# spelling is kept.
+NASNET_MOBILE_WEIGHT_PATH = (
+    "https://github.com/titu1994/Keras-NASNet/"
+    "releases/download/v1.0/NASNet-mobile.h5")
+NASNET_MOBILE_WEIGHT_PATH_NO_TOP = (
+    "https://github.com/titu1994/Keras-NASNet/"
+    "releases/download/v1.0/NASNet-mobile-no-top.h5")
+NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY = (
+    "https://github.com/titu1994/Keras-NASNet/"
+    "releases/download/v1.0/NASNet-auxiliary-mobile.h5")
+NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY_NO_TOP = (
+    "https://github.com/titu1994/Keras-NASNet/"
+    "releases/download/v1.0/NASNet-auxiliary-mobile-no-top.h5")
+NASNET_LARGE_WEIGHT_PATH = (
+    "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-large.h5")
+NASNET_LARGE_WEIGHT_PATH_NO_TOP = (
+    "https://github.com/titu1994/Keras-NASNet/"
+    "releases/download/v1.1/NASNet-large-no-top.h5")
+NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary = (
+    "https://github.com/titu1994/Keras-NASNet/"
+    "releases/download/v1.1/NASNet-auxiliary-large.h5")
+NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary_NO_TOP = (
+    "https://github.com/titu1994/Keras-NASNet/"
+    "releases/download/v1.1/NASNet-auxiliary-large-no-top.h5")
+def NASNet(input_shape=None,
+           penultimate_filters=4032,
+           nb_blocks=6,
+           stem_filters=96,
+           initial_reduction=True,
+           skip_reduction_layer_input=True,
+           use_auxiliary_branch=False,
+           filters_multiplier=2,
+           dropout=0.5,
+           weight_decay=5e-5,
+           include_top=True,
+           weights=None,
+           input_tensor=None,
+           pooling=None,
+           classes=1000,
+           default_size=None,
+           activation='softmax'):
+    """Instantiates a NASNet architecture.
+    Note that only TensorFlow is supported for now,
+    therefore it only works with the data format
+    `image_data_format='channels_last'` in your Keras config
+    at `~/.keras/keras.json`.
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(331, 331, 3)` for NASNetLarge or
+            `(224, 224, 3)` for NASNetMobile
+            It should have exactly 3 inputs channels,
+            and width and height should be no smaller than 32.
+            E.g. `(224, 224, 3)` would be one valid value.
+        penultimate_filters: number of filters in the penultimate layer.
+            NASNet models use the notation `NASNet (N @ P)`, where:
+                -   N is the number of blocks
+                -   P is the number of penultimate filters
+        nb_blocks: number of repeated blocks of the NASNet model.
+            NASNet models use the notation `NASNet (N @ P)`, where:
+                -   N is the number of blocks
+                -   P is the number of penultimate filters
+        stem_filters: number of filters in the initial stem block
+        initial_reduction: Whether to perform the reduction step at the beginning
+            end of the network. Set to `True` for CIFAR models.
+        skip_reduction_layer_input: Determines whether to skip the reduction layers
+            when calculating the previous layer to connect to.
+        use_auxiliary_branch: Whether to use the auxiliary branch during
+            training or evaluation.
+        filters_multiplier: controls the width of the network.
+            - If `filters_multiplier` < 1.0, proportionally decreases the number
+                of filters in each layer.
+            - If `filters_multiplier` > 1.0, proportionally increases the number
+                of filters in each layer.
+            - If `filters_multiplier` = 1, default number of filters from the paper
+                 are used at each layer.
+        dropout: dropout rate
+        weight_decay: l2 regularization weight
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights)
+        input_tensor: optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        default_size: specifies the default image size of the model
+        activation: Type of activation at the top layer.
+            Can be one of 'softmax' or 'sigmoid'.
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+    if K.backend() != 'tensorflow':
+        raise RuntimeError('Only Tensorflow backend is currently supported, '
+                           'as other backends do not support '
+                           'separable convolution.')
+    if weights not in {'imagenet', None}:
+        raise ValueError('The `weights` argument should be either '
+                         '`None` (random initialization) or `imagenet` '
+                         '(pre-training on ImageNet).')
+    if weights == 'imagenet' and include_top and classes != 1000:
+        raise ValueError('If using `weights` as ImageNet with `include_top` '
+                         'as true, `classes` should be 1000')
+    if default_size is None:
+        default_size = 331
+    # Determine proper input shape and default size.
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=default_size,
+                                      min_size=32,
+                                      data_format=K.image_data_format(),
+                                      require_flatten=include_top or weights)
+    if K.image_data_format() != 'channels_last':
+        warnings.warn('The NASNet family of models is only available '
+                      'for the input data format "channels_last" '
+                      '(width, height, channels). '
+                      'However your settings specify the default '
+                      'data format "channels_first" (channels, width, height).'
+                      ' You should set `image_data_format="channels_last"` '
+                      'in your Keras config located at ~/.keras/keras.json. '
+                      'The model being returned right now will expect inputs '
+                      'to follow the "channels_last" data format.')
+        K.set_image_data_format('channels_last')
+        old_data_format = 'channels_first'
+    else:
+        old_data_format = None
+    if input_tensor is None:
+        img_input = Input(shape=input_shape)
+    else:
+        if not K.is_keras_tensor(input_tensor):
+            img_input = Input(tensor=input_tensor, shape=input_shape)
+        else:
+            img_input = input_tensor
+    assert penultimate_filters % 24 == 0, "`penultimate_filters` needs to be " \
+                                          "divisible by 24."
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+    filters = penultimate_filters // 24
+    if initial_reduction:
+        x = Conv2D(stem_filters, (3, 3), strides=(2, 2), padding='valid',
+                   use_bias=False, name='stem_conv1', kernel_initializer='he_normal',
+                   kernel_regularizer=l2(weight_decay))(img_input)
+    else:
+        x = Conv2D(stem_filters, (3, 3), strides=(1, 1), padding='same', use_bias=False,
+                   name='stem_conv1', kernel_initializer='he_normal',
+                   kernel_regularizer=l2(weight_decay))(img_input)
+    x = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                           name='stem_bn1')(x)
+    p = None
+    if initial_reduction:  # imagenet / mobile mode
+        x, p = _reduction_A(x, p, filters // (filters_multiplier ** 2), weight_decay,
+                            id='stem_1')
+        x, p = _reduction_A(x, p, filters // filters_multiplier, weight_decay,
+                            id='stem_2')
+    for i in range(nb_blocks):
+        x, p = _normal_A(x, p, filters, weight_decay, id='%d' % i)
+    x, p0 = _reduction_A(x, p, filters * filters_multiplier, weight_decay,
+                         id='reduce_%d' % nb_blocks)
+    p = p0 if not skip_reduction_layer_input else p
+    for i in range(nb_blocks):
+        x, p = _normal_A(x, p, filters * filters_multiplier, weight_decay,
+                         id='%d' % (nb_blocks + i + 1))
+    auxiliary_x = None
+    if not initial_reduction:  # imagenet / mobile mode
+        if use_auxiliary_branch:
+            auxiliary_x = _add_auxiliary_head(x, classes, weight_decay, pooling,
+                                              include_top, activation)
+    x, p0 = _reduction_A(x, p, filters * filters_multiplier ** 2, weight_decay,
+                         id='reduce_%d' % (2 * nb_blocks))
+    if initial_reduction:  # CIFAR mode
+        if use_auxiliary_branch:
+            auxiliary_x = _add_auxiliary_head(x, classes, weight_decay, pooling,
+                                              include_top, activation)
+    p = p0 if not skip_reduction_layer_input else p
+    for i in range(nb_blocks):
+        x, p = _normal_A(x, p, filters * filters_multiplier ** 2, weight_decay,
+                         id='%d' % (2 * nb_blocks + i + 1))
+    x = Activation('relu')(x)
+    if include_top:
+        x = GlobalAveragePooling2D()(x)
+        x = Dropout(dropout)(x)
+        x = Dense(classes, activation=activation,
+                  kernel_regularizer=l2(weight_decay), name='predictions')(x)
+    else:
+        if pooling == 'avg':
+            x = GlobalAveragePooling2D()(x)
+        elif pooling == 'max':
+            x = GlobalMaxPooling2D()(x)
+    # Ensure that the model takes into account
+    # any potential predecessors of `input_tensor`.
+    if input_tensor is not None:
+        inputs = get_source_inputs(input_tensor)
+    else:
+        inputs = img_input
+    # Create model.
+    if use_auxiliary_branch:
+        model = Model(inputs, [x, auxiliary_x], name='NASNet_with_auxiliary')
+    else:
+        model = Model(inputs, x, name='NASNet')
+    # load weights
+    if weights == 'imagenet':
+        if default_size == 224:  # mobile version
+            if include_top:
+                if use_auxiliary_branch:
+                    weight_path = NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY
+                    model_name = 'nasnet_mobile_with_aux.h5'
+                else:
+                    weight_path = NASNET_MOBILE_WEIGHT_PATH
+                    model_name = 'nasnet_mobile.h5'
+            else:
+                if use_auxiliary_branch:
+                    weight_path = NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY_NO_TOP
+                    model_name = 'nasnet_mobile_with_aux_no_top.h5'
+                else:
+                    weight_path = NASNET_MOBILE_WEIGHT_PATH_NO_TOP
+                    model_name = 'nasnet_mobile_no_top.h5'
+            weights_file = get_file(model_name, weight_path, cache_subdir='models')
+            model.load_weights(weights_file, by_name=True)
+        elif default_size == 331:  # large version
+            if include_top:
+                if use_auxiliary_branch:
+                    weight_path = NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary
+                    model_name = 'nasnet_large_with_aux.h5'
+                else:
+                    weight_path = NASNET_LARGE_WEIGHT_PATH
+                    model_name = 'nasnet_large.h5'
+            else:
+                if use_auxiliary_branch:
+                    weight_path = NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary_NO_TOP
+                    model_name = 'nasnet_large_with_aux_no_top.h5'
+                else:
+                    weight_path = NASNET_LARGE_WEIGHT_PATH_NO_TOP
+                    model_name = 'nasnet_large_no_top.h5'
+            weights_file = get_file(model_name, weight_path, cache_subdir='models')
+            model.load_weights(weights_file, by_name=True)
+        else:
+            raise ValueError('ImageNet weights can only be loaded on NASNetLarge '
+                             'or NASNetMobile')
+    if old_data_format:
+        K.set_image_data_format(old_data_format)
+    return model
+def NASNetLarge(input_shape=(331, 331, 3),
+                dropout=0.5,
+                weight_decay=5e-5,
+                use_auxiliary_branch=False,
+                include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                pooling=None,
+                classes=1000,
+                activation='softmax'):
+    """Instantiates a NASNet architecture in ImageNet mode.
+    Note that only TensorFlow is supported for now,
+    therefore it only works with the data format
+    `image_data_format='channels_last'` in your Keras config
+    at `~/.keras/keras.json`.
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(331, 331, 3)` for NASNetLarge.
+            It should have exactly 3 inputs channels,
+            and width and height should be no smaller than 32.
+            E.g. `(224, 224, 3)` would be one valid value.
+        use_auxiliary_branch: Whether to use the auxiliary branch during
+            training or evaluation.
+        dropout: dropout rate
+        weight_decay: l2 regularization weight
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights)
+        input_tensor: optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        default_size: specifies the default image size of the model
+         activation: Type of activation at the top layer.
+             Can be one of 'softmax' or 'sigmoid'.
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+    global _BN_DECAY, _BN_EPSILON
+    _BN_DECAY = 0.9997
+    _BN_EPSILON = 1e-3
+    return NASNet(input_shape,
+                  penultimate_filters=4032,
+                  nb_blocks=6,
+                  stem_filters=96,
+                  initial_reduction=True,
+                  skip_reduction_layer_input=True,
+                  use_auxiliary_branch=use_auxiliary_branch,
+                  filters_multiplier=2,
+                  dropout=dropout,
+                  weight_decay=weight_decay,
+                  include_top=include_top,
+                  weights=weights,
+                  input_tensor=input_tensor,
+                  pooling=pooling,
+                  classes=classes,
+                  default_size=331,
+                  activation=activation)
+def NASNetMobile(input_shape=(224, 224, 3),
+                 dropout=0.5,
+                 weight_decay=4e-5,
+                 use_auxiliary_branch=False,
+                 include_top=True,
+                 weights='imagenet',
+                 input_tensor=None,
+                 pooling=None,
+                 classes=1000,
+                 activation='softmax'):
+    """Instantiates a NASNet architecture in Mobile ImageNet mode.
+    Note that only TensorFlow is supported for now,
+    therefore it only works with the data format
+    `image_data_format='channels_last'` in your Keras config
+    at `~/.keras/keras.json`.
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(224, 224, 3)` for NASNetMobile
+            It should have exactly 3 inputs channels,
+            and width and height should be no smaller than 32.
+            E.g. `(224, 224, 3)` would be one valid value.
+        use_auxiliary_branch: Whether to use the auxiliary branch during
+            training or evaluation.
+        dropout: dropout rate
+        weight_decay: l2 regularization weight
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights)
+        input_tensor: optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        default_size: specifies the default image size of the model
+         activation: Type of activation at the top layer.
+             Can be one of 'softmax' or 'sigmoid'.
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+    global _BN_DECAY, _BN_EPSILON
+    _BN_DECAY = 0.9997
+    _BN_EPSILON = 1e-3
+    return NASNet(input_shape,
+                  penultimate_filters=1056,
+                  nb_blocks=4,
+                  stem_filters=32,
+                  initial_reduction=True,
+                  skip_reduction_layer_input=False,
+                  use_auxiliary_branch=use_auxiliary_branch,
+                  filters_multiplier=2,
+                  dropout=dropout,
+                  weight_decay=weight_decay,
+                  include_top=include_top,
+                  weights=weights,
+                  input_tensor=input_tensor,
+                  pooling=pooling,
+                  classes=classes,
+                  default_size=224)
+def NASNetCIFAR(input_shape=(32, 32, 3),
+                dropout=0.0,
+                weight_decay=5e-4,
+                use_auxiliary_branch=False,
+                include_top=True,
+                weights=None,
+                input_tensor=None,
+                pooling=None,
+                classes=10,
+                activation='softmax'):
+    """Instantiates a NASNet architecture in CIFAR mode.
+    Note that only TensorFlow is supported for now,
+    therefore it only works with the data format
+    `image_data_format='channels_last'` in your Keras config
+    at `~/.keras/keras.json`.
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(32, 32, 3)` for NASNetMobile
+            It should have exactly 3 inputs channels,
+            and width and height should be no smaller than 32.
+            E.g. `(32, 32, 3)` would be one valid value.
+        use_auxiliary_branch: Whether to use the auxiliary branch during
+            training or evaluation.
+        dropout: dropout rate
+        weight_decay: l2 regularization weight
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights)
+        input_tensor: optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        default_size: specifies the default image size of the model
+         activation: Type of activation at the top layer.
+             Can be one of 'softmax' or 'sigmoid'.
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+    global _BN_DECAY, _BN_EPSILON
+    _BN_DECAY = 0.9
+    _BN_EPSILON = 1e-5
+    return NASNet(input_shape,
+                  penultimate_filters=768,
+                  nb_blocks=6,
+                  stem_filters=32,
+                  initial_reduction=False,
+                  skip_reduction_layer_input=False,
+                  use_auxiliary_branch=use_auxiliary_branch,
+                  filters_multiplier=2,
+                  dropout=dropout,
+                  weight_decay=weight_decay,
+                  include_top=include_top,
+                  weights=weights,
+                  input_tensor=input_tensor,
+                  pooling=pooling,
+                  classes=classes,
+                  default_size=224,
+                  activation=activation)
+def _separable_conv_block(ip, filters, kernel_size=(3, 3), strides=(1, 1),
+                          weight_decay=5e-5, id=None):
+    '''Adds 2 blocks of [relu-separable conv-batchnorm]
+    # Arguments:
+        ip: input tensor
+        filters: number of output filters per layer
+        kernel_size: kernel size of separable convolutions
+        strides: strided convolution for downsampling
+        weight_decay: l2 regularization weight
+        id: string id
+    # Returns:
+        a Keras tensor
+    '''
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+    with K.name_scope('separable_conv_block_%s' % id):
+        x = Activation('relu')(ip)
+        x = SeparableConv2D(filters, kernel_size, strides=strides,
+                            name='separable_conv_1_%s' % id, padding='same',
+                            use_bias=False, kernel_initializer='he_normal',
+                            kernel_regularizer=l2(weight_decay))(x)
+        x = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY,
+                               epsilon=_BN_EPSILON,
+                               name="separable_conv_1_bn_%s" % id)(x)
+        x = Activation('relu')(x)
+        x = SeparableConv2D(filters, kernel_size, name='separable_conv_2_%s' % id,
+                            padding='same', use_bias=False,
+                            kernel_initializer='he_normal',
+                            kernel_regularizer=l2(weight_decay))(x)
+        x = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY,
+                               epsilon=_BN_EPSILON,
+                               name="separable_conv_2_bn_%s" % id)(x)
+    return x
+def _adjust_block(p, ip, filters, weight_decay=5e-5, id=None):
+    '''
+    Adjusts the input `p` to match the shape of the input `ip`,
+    for situations where the spatial size or the number of filters
+    needs to be changed.
+    Exactly one of three cases applies:
+    - `p` is None: `ip` is returned in its place.
+    - `p` and `ip` differ along the compared image axis: `p` is
+      downsampled by two stride-2 paths whose outputs are concatenated.
+    - `p` has a channel count different from `filters`: `p` is projected
+      with a 1x1 convolution.
+    Otherwise `p` is returned unchanged.
+    # Arguments:
+        p: input tensor which needs to be modified
+        ip: input tensor whose shape needs to be matched
+        filters: number of output filters to be matched
+        weight_decay: l2 regularization weight
+        id: string id
+    # Returns:
+        an adjusted Keras tensor
+    '''
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+    # Index of one image axis used to detect a spatial size mismatch
+    # (assumes 4D image tensors -- TODO confirm).
+    img_dim = 2 if K.image_data_format() == 'channels_first' else -2
+    with K.name_scope('adjust_block'):
+        if p is None:
+            # No previous tensor is available: reuse the current input.
+            p = ip
+        elif p._keras_shape[img_dim] != ip._keras_shape[img_dim]:
+            # Spatial mismatch: halve the resolution of `p` with two paths.
+            with K.name_scope('adjust_reduction_block_%s' % id):
+                p = Activation('relu', name='adjust_relu_1_%s' % id)(p)
+                # Path 1: a 1x1 pool with stride 2 keeps every other pixel.
+                p1 = AveragePooling2D((1, 1), strides=(2, 2), padding='valid',
+                                      name='adjust_avg_pool_1_%s' % id)(p)
+                p1 = Conv2D(filters // 2, (1, 1), padding='same', use_bias=False,
+                            kernel_regularizer=l2(weight_decay),
+                            name='adjust_conv_1_%s' % id,
+                            kernel_initializer='he_normal')(p1)
+                # Path 2: pad at the end and crop at the start to shift the
+                # grid by one pixel, so the stride-2 pool samples the pixels
+                # that path 1 skipped.
+                p2 = ZeroPadding2D(padding=((0, 1), (0, 1)))(p)
+                p2 = Cropping2D(cropping=((1, 0), (1, 0)))(p2)
+                p2 = AveragePooling2D((1, 1), strides=(2, 2), padding='valid',
+                                      name='adjust_avg_pool_2_%s' % id)(p2)
+                p2 = Conv2D(filters // 2, (1, 1), padding='same', use_bias=False,
+                            kernel_regularizer=l2(weight_decay),
+                            name='adjust_conv_2_%s' % id,
+                            kernel_initializer='he_normal')(p2)
+                # Each path contributes `filters // 2` channels.
+                p = concatenate([p1, p2], axis=channel_dim)
+                p = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY,
+                                       epsilon=_BN_EPSILON,
+                                       name='adjust_bn_%s' % id)(p)
+        elif p._keras_shape[channel_dim] != filters:
+            # Same spatial size but wrong channel count: 1x1 projection.
+            with K.name_scope('adjust_projection_block_%s' % id):
+                p = Activation('relu')(p)
+                p = Conv2D(filters, (1, 1), strides=(1, 1), padding='same',
+                           name='adjust_conv_projection_%s' % id, use_bias=False,
+                           kernel_regularizer=l2(weight_decay),
+                           kernel_initializer='he_normal')(p)
+                # Shares its layer name with the reduction branch above; only
+                # one of the two branches runs per call.
+                p = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY,
+                                       epsilon=_BN_EPSILON,
+                                       name='adjust_bn_%s' % id)(p)
+    return p
+def _normal_A(ip, p, filters, weight_decay=5e-5, id=None):
+    '''Adds a Normal cell for NASNet-A (Fig. 4 in the paper)
+    The cell keeps the spatial resolution: every pooling layer here uses
+    stride (1, 1) and the separable blocks use their default strides.
+    # Arguments:
+        ip: input tensor `x`
+        p: input tensor `p` (may be None; it is first passed through
+            `_adjust_block` together with `ip`)
+        filters: number of filters used by the convolutions of the cell
+        weight_decay: l2 regularization weight
+        id: string id
+    # Returns:
+        a tuple `(x, ip)`: the concatenated cell output and the
+        unmodified input `ip`
+    '''
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+    with K.name_scope('normal_A_block_%s' % id):
+        # Make `p` compatible with `ip` before combining the two.
+        p = _adjust_block(p, ip, filters, weight_decay, id)
+        # `h`: relu -> 1x1 conv -> batchnorm applied to the current input.
+        h = Activation('relu')(ip)
+        h = Conv2D(filters, (1, 1), strides=(1, 1), padding='same',
+                   name='normal_conv_1_%s' % id, use_bias=False,
+                   kernel_initializer='he_normal',
+                   kernel_regularizer=l2(weight_decay))(h)
+        h = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY,
+                               epsilon=_BN_EPSILON, name='normal_bn_1_%s' % id)(h)
+        # Block 1: 5x5 separable conv on `h` + 3x3 separable conv on `p`.
+        with K.name_scope('block_1'):
+            x1_1 = _separable_conv_block(h, filters, kernel_size=(5, 5),
+                                         weight_decay=weight_decay,
+                                         id='normal_left1_%s' % id)
+            x1_2 = _separable_conv_block(p, filters, weight_decay=weight_decay,
+                                         id='normal_right1_%s' % id)
+            x1 = add([x1_1, x1_2], name='normal_add_1_%s' % id)
+        # Block 2: 5x5 and 3x3 separable convs, both on `p`.
+        with K.name_scope('block_2'):
+            x2_1 = _separable_conv_block(p, filters, (5, 5), weight_decay=weight_decay,
+                                         id='normal_left2_%s' % id)
+            x2_2 = _separable_conv_block(p, filters, (3, 3), weight_decay=weight_decay,
+                                         id='normal_right2_%s' % id)
+            x2 = add([x2_1, x2_2], name='normal_add_2_%s' % id)
+        # Block 3: 3x3 average pool of `h` added to `p`.
+        with K.name_scope('block_3'):
+            x3 = AveragePooling2D((3, 3), strides=(1, 1), padding='same',
+                                  name='normal_left3_%s' % id)(h)
+            x3 = add([x3, p], name='normal_add_3_%s' % id)
+        # Block 4: two 3x3 average pools, both on `p`, added together.
+        with K.name_scope('block_4'):
+            x4_1 = AveragePooling2D((3, 3), strides=(1, 1), padding='same',
+                                    name='normal_left4_%s' % id)(p)
+            x4_2 = AveragePooling2D((3, 3), strides=(1, 1), padding='same',
+                                    name='normal_right4_%s' % id)(p)
+            x4 = add([x4_1, x4_2], name='normal_add_4_%s' % id)
+        # Block 5: 3x3 separable conv on `h` with a residual add of `h`.
+        with K.name_scope('block_5'):
+            x5 = _separable_conv_block(h, filters, weight_decay=weight_decay,
+                                       id='normal_left5_%s' % id)
+            x5 = add([x5, h], name='normal_add_5_%s' % id)
+        # The cell output is the adjusted `p` plus all five block outputs,
+        # concatenated along the channel axis.
+        x = concatenate([p, x1, x2, x3, x4, x5], axis=channel_dim,
+                        name='normal_concat_%s' % id)
+    # `ip` is returned unchanged alongside the cell output.
+    return x, ip
+def _reduction_A(ip, p, filters, weight_decay=5e-5, id=None):
+    '''Adds a Reduction cell for NASNet-A (Fig. 4 in the paper)
+    # Arguments:
+        ip: input tensor `x`
+        p: input tensor `p`
+        filters: number of output filters
+        weight_decay: l2 regularization weight
+        id: string id
+    # Returns:
+        a Keras tensor
+    '''
+    """"""
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+    with K.name_scope('reduction_A_block_%s' % id):
+        p = _adjust_block(p, ip, filters, weight_decay, id)
+        h = Activation('relu')(ip)
+        h = Conv2D(filters, (1, 1), strides=(1, 1), padding='same',
+                   name='reduction_conv_1_%s' % id, use_bias=False,
+                   kernel_initializer='he_normal',
+                   kernel_regularizer=l2(weight_decay))(h)
+        h = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY,
+                               epsilon=_BN_EPSILON,
+                               name='reduction_bn_1_%s' % id)(h)
+        with K.name_scope('block_1'):
+            x1_1 = _separable_conv_block(h, filters, (5, 5), strides=(2, 2),
+                                         weight_decay=weight_decay,
+                                         id='reduction_left1_%s' % id)
+            x1_2 = _separable_conv_block(p, filters, (7, 7), strides=(2, 2),
+                                         weight_decay=weight_decay,
+                                         id='reduction_1_%s' % id)
+            x1 = add([x1_1, x1_2], name='reduction_add_1_%s' % id)
+        with K.name_scope('block_2'):
+            x2_1 = MaxPooling2D((3, 3), strides=(2, 2), padding='same',
+                                name='reduction_left2_%s' % id)(h)
+            x2_2 = _separable_conv_block(p, filters, (7, 7), strides=(2, 2),
+                                         weight_decay=weight_decay,
+                                         id='reduction_right2_%s' % id)
+            x2 = add([x2_1, x2_2], name='reduction_add_2_%s' % id)
+        with K.name_scope('block_3'):
+            x3_1 = AveragePooling2D((3, 3), strides=(2, 2), padding='same',
+                                    name='reduction_left3_%s' % id)(h)
+            x3_2 = _separable_conv_block(p, filters, (5, 5), strides=(2, 2),
+                                         weight_decay=weight_decay,
+                                         id='reduction_right3_%s' % id)
+            x3 = add([x3_1, x3_2], name='reduction_add3_%s' % id)
+        with K.name_scope('block_4'):
+            x4 = AveragePooling2D((3, 3), strides=(1, 1), padding='same',
+                                  name='reduction_left4_%s' % id)(x1)
+            x4 = add([x2, x4])
+        with K.name_scope('block_5'):
+            x5_1 = _separable_conv_block(x1, filters, (3, 3),
+                                         weight_decay=weight_decay,
+                                         id='reduction_left4_%s' % id)
+            x5_2 = MaxPooling2D((3, 3), strides=(2, 2), padding='same',
+                                name='reduction_right5_%s' % id)(h)
+            x5 = add([x5_1, x5_2], name='reduction_add4_%s' % id)
+        x = concatenate([x2, x3, x4, x5], axis=channel_dim,
+                        name='reduction_concat_%s' % id)
+        return x, ip
+def _add_auxiliary_head(x, classes, weight_decay, pooling, include_top, activation):
+    '''Adds an auxiliary head for training the model
+    From section A.7 "Training of ImageNet models" of the paper, all NASNet models are
+    trained using an auxiliary classifier around 2/3 of the depth of the network, with
+    a loss weight of 0.4
+    # Arguments
+        x: input tensor
+        classes: number of output classes
+        weight_decay: l2 regularization weight
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        activation: Type of activation at the top layer.
+            Can be one of 'softmax' or 'sigmoid'.
+    # Returns
+        a keras Tensor
+    '''
+    img_height = 1 if K.image_data_format() == 'channels_last' else 2
+    img_width = 2 if K.image_data_format() == 'channels_last' else 3
+    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+    with K.name_scope('auxiliary_branch'):
+        auxiliary_x = Activation('relu')(x)
+        auxiliary_x = AveragePooling2D((5, 5), strides=(3, 3), padding='valid',
+                                       name='aux_pool')(auxiliary_x)
+        auxiliary_x = Conv2D(128, (1, 1), padding='same', use_bias=False,
+                             name='aux_conv_projection', kernel_initializer='he_normal',
+                             kernel_regularizer=l2(weight_decay))(auxiliary_x)
+        auxiliary_x = BatchNormalization(axis=channel_axis, momentum=_BN_DECAY,
+                                         epsilon=_BN_EPSILON,
+                                         name='aux_bn_projection')(auxiliary_x)
+        auxiliary_x = Activation('relu')(auxiliary_x)
+        auxiliary_x = Conv2D(768, (auxiliary_x._keras_shape[img_height],
+                                   auxiliary_x._keras_shape[img_width]),
+                             padding='valid', use_bias=False,
+                             kernel_initializer='he_normal',
+                             kernel_regularizer=l2(weight_decay),
+                             name='aux_conv_reduction')(auxiliary_x)
+        auxiliary_x = BatchNormalization(axis=channel_axis, momentum=_BN_DECAY,
+                                         epsilon=_BN_EPSILON,
+                                         name='aux_bn_reduction')(auxiliary_x)
+        auxiliary_x = Activation('relu')(auxiliary_x)
+        if include_top:
+            auxiliary_x = Flatten()(auxiliary_x)
+            auxiliary_x = Dense(classes, activation=activation,
+                                kernel_regularizer=l2(weight_decay),
+                                name='aux_predictions')(auxiliary_x)
+        else:
+            if pooling == 'avg':
+                auxiliary_x = GlobalAveragePooling2D()(auxiliary_x)
+            elif pooling == 'max':
+                auxiliary_x = GlobalMaxPooling2D()(auxiliary_x)
+    return auxiliary_x
--- a/code/keras_contrib/applications/resnet.py
+++ b/code/keras_contrib/applications/resnet.py
+"""ResNet v1, v2, and segmentation models for Keras.
+# Reference
+- [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)
+- [Identity Mappings in Deep Residual Networks](https://arxiv.org/abs/1603.05027)
+Reference material for extended functionality:
+- [ResNeXt](https://arxiv.org/abs/1611.05431) for Tiny ImageNet support.
+- [Dilated Residual Networks](https://arxiv.org/pdf/1705.09914) for segmentation support
+- [Deep Residual Learning for Instrument Segmentation in
+   Robotic Surgery](https://arxiv.org/abs/1703.08580)
+  for segmentation support.
+Implementation Adapted from: github.com/raghakot/keras-resnet
+"""  # pylint: disable=E501
+from __future__ import division
+import six
+from keras.models import Model
+from keras.layers import Input
+from keras.layers import Activation
+from keras.layers import Reshape
+from keras.layers import Dense
+from keras.layers import Conv2D
+from keras.layers import MaxPooling2D
+from keras.layers import GlobalMaxPooling2D
+from keras.layers import GlobalAveragePooling2D
+from keras.layers import Dropout
+from keras.layers.merge import add
+from keras.layers.normalization import BatchNormalization
+from keras.regularizers import l2
+from keras import backend as K
+from keras_applications.imagenet_utils import _obtain_input_shape
+def _bn_relu(x, bn_name=None, relu_name=None):
+    """Helper to build a BN -> relu block
+    """
+    norm = BatchNormalization(axis=CHANNEL_AXIS, name=bn_name)(x)
+    return Activation("relu", name=relu_name)(norm)
+def _conv_bn_relu(**conv_params):
+    """Helper to build a conv -> BN -> relu residual unit activation function.
+       This is the original ResNet v1 scheme in https://arxiv.org/abs/1512.03385
+    """
+    filters = conv_params["filters"]
+    kernel_size = conv_params["kernel_size"]
+    strides = conv_params.setdefault("strides", (1, 1))
+    dilation_rate = conv_params.setdefault("dilation_rate", (1, 1))
+    conv_name = conv_params.setdefault("conv_name", None)
+    bn_name = conv_params.setdefault("bn_name", None)
+    relu_name = conv_params.setdefault("relu_name", None)
+    kernel_initializer = conv_params.setdefault("kernel_initializer", "he_normal")
+    padding = conv_params.setdefault("padding", "same")
+    kernel_regularizer = conv_params.setdefault("kernel_regularizer", l2(1.e-4))
+    def f(x):
+        x = Conv2D(filters=filters, kernel_size=kernel_size,
+                   strides=strides, padding=padding,
+                   dilation_rate=dilation_rate,
+                   kernel_initializer=kernel_initializer,
+                   kernel_regularizer=kernel_regularizer,
+                   name=conv_name)(x)
+        return _bn_relu(x, bn_name=bn_name, relu_name=relu_name)
+    return f
+def _bn_relu_conv(**conv_params):
+    """Helper to build a BN -> relu -> conv residual unit with full pre-activation
+    function. This is the ResNet v2 scheme proposed in
+    http://arxiv.org/pdf/1603.05027v2.pdf
+    """
+    filters = conv_params["filters"]
+    kernel_size = conv_params["kernel_size"]
+    strides = conv_params.setdefault("strides", (1, 1))
+    dilation_rate = conv_params.setdefault("dilation_rate", (1, 1))
+    conv_name = conv_params.setdefault("conv_name", None)
+    bn_name = conv_params.setdefault("bn_name", None)
+    relu_name = conv_params.setdefault("relu_name", None)
+    kernel_initializer = conv_params.setdefault("kernel_initializer", "he_normal")
+    padding = conv_params.setdefault("padding", "same")
+    kernel_regularizer = conv_params.setdefault("kernel_regularizer", l2(1.e-4))
+    def f(x):
+        activation = _bn_relu(x, bn_name=bn_name, relu_name=relu_name)
+        return Conv2D(filters=filters, kernel_size=kernel_size,
+                      strides=strides, padding=padding,
+                      dilation_rate=dilation_rate,
+                      kernel_initializer=kernel_initializer,
+                      kernel_regularizer=kernel_regularizer,
+                      name=conv_name)(activation)
+    return f
+def _shortcut(input_feature, residual, conv_name_base=None, bn_name_base=None):
+    """Adds a shortcut between input and residual block and merges them with "sum"
+    """
+    # Expand channels of shortcut to match residual.
+    # Stride appropriately to match residual (width, height)
+    # Should be int if network architecture is correctly configured.
+    input_shape = K.int_shape(input_feature)
+    residual_shape = K.int_shape(residual)
+    stride_width = int(round(input_shape[ROW_AXIS] / residual_shape[ROW_AXIS]))
+    stride_height = int(round(input_shape[COL_AXIS] / residual_shape[COL_AXIS]))
+    equal_channels = input_shape[CHANNEL_AXIS] == residual_shape[CHANNEL_AXIS]
+    shortcut = input_feature
+    # 1 X 1 conv if shape is different. Else identity.
+    if stride_width > 1 or stride_height > 1 or not equal_channels:
+        print('reshaping via a convolution...')
+        if conv_name_base is not None:
+            conv_name_base = conv_name_base + '1'
+        shortcut = Conv2D(filters=residual_shape[CHANNEL_AXIS],
+                          kernel_size=(1, 1),
+                          strides=(stride_width, stride_height),
+                          padding="valid",
+                          kernel_initializer="he_normal",
+                          kernel_regularizer=l2(0.0001),
+                          name=conv_name_base)(input_feature)
+        if bn_name_base is not None:
+            bn_name_base = bn_name_base + '1'
+        shortcut = BatchNormalization(axis=CHANNEL_AXIS,
+                                      name=bn_name_base)(shortcut)
+    return add([shortcut, residual])
+def _residual_block(block_function, filters, blocks, stage,
+                    transition_strides=None, transition_dilation_rates=None,
+                    dilation_rates=None, is_first_layer=False, dropout=None,
+                    residual_unit=_bn_relu_conv):
+    """Builds a residual block with repeating bottleneck blocks.
+       stage: integer, current stage label, used for generating layer names
+       blocks: number of blocks 'a','b'..., current block label, used for generating
+            layer names
+       transition_strides: a list of tuples for the strides of each transition
+       transition_dilation_rates: a list of tuples for the dilation rate of each
+            transition
+    """
+    if transition_dilation_rates is None:
+        transition_dilation_rates = [(1, 1)] * blocks
+    if transition_strides is None:
+        transition_strides = [(1, 1)] * blocks
+    if dilation_rates is None:
+        dilation_rates = [1] * blocks
+    def f(x):
+        for i in range(blocks):
+            is_first_block = is_first_layer and i == 0
+            x = block_function(filters=filters, stage=stage, block=i,
+                               transition_strides=transition_strides[i],
+                               dilation_rate=dilation_rates[i],
+                               is_first_block_of_first_layer=is_first_block,
+                               dropout=dropout,
+                               residual_unit=residual_unit)(x)
+        return x
+    return f
+def _block_name_base(stage, block):
+    """Get the convolution name base and batch normalization name base defined by
+    stage and block.
+    If there are less than 26 blocks they will be labeled 'a', 'b', 'c' to match the
+    paper and keras and beyond 26 blocks they will simply be numbered.
+    """
+    if block < 27:
+        block = '%c' % (block + 97)  # 97 is the ascii number for lowercase 'a'
+    conv_name_base = 'res' + str(stage) + block + '_branch'
+    bn_name_base = 'bn' + str(stage) + block + '_branch'
+    return conv_name_base, bn_name_base
+def basic_block(filters, stage, block, transition_strides=(1, 1),
+                dilation_rate=(1, 1), is_first_block_of_first_layer=False, dropout=None,
+                residual_unit=_bn_relu_conv):
+    """Basic 3 X 3 convolution blocks for use on resnets with layers <= 34.
+    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
+    """
+    def f(input_features):
+        conv_name_base, bn_name_base = _block_name_base(stage, block)
+        if is_first_block_of_first_layer:
+            # don't repeat bn->relu since we just did bn->relu->maxpool
+            x = Conv2D(filters=filters, kernel_size=(3, 3),
+                       strides=transition_strides,
+                       dilation_rate=dilation_rate,
+                       padding="same",
+                       kernel_initializer="he_normal",
+                       kernel_regularizer=l2(1e-4),
+                       name=conv_name_base + '2a')(input_features)
+        else:
+            x = residual_unit(filters=filters, kernel_size=(3, 3),
+                              strides=transition_strides,
+                              dilation_rate=dilation_rate,
+                              conv_name_base=conv_name_base + '2a',
+                              bn_name_base=bn_name_base + '2a')(input_features)
+        if dropout is not None:
+            x = Dropout(dropout)(x)
+        x = residual_unit(filters=filters, kernel_size=(3, 3),
+                          conv_name_base=conv_name_base + '2b',
+                          bn_name_base=bn_name_base + '2b')(x)
+        return _shortcut(input_features, x)
+    return f
+def bottleneck(filters, stage, block, transition_strides=(1, 1),
+               dilation_rate=(1, 1), is_first_block_of_first_layer=False, dropout=None,
+               residual_unit=_bn_relu_conv):
+    """Bottleneck architecture for > 34 layer resnet.
+    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
+    Returns:
+        A final conv layer of filters * 4
+    """
+    def f(input_feature):
+        conv_name_base, bn_name_base = _block_name_base(stage, block)
+        if is_first_block_of_first_layer:
+            # don't repeat bn->relu since we just did bn->relu->maxpool
+            x = Conv2D(filters=filters, kernel_size=(1, 1),
+                       strides=transition_strides,
+                       dilation_rate=dilation_rate,
+                       padding="same",
+                       kernel_initializer="he_normal",
+                       kernel_regularizer=l2(1e-4),
+                       name=conv_name_base + '2a')(input_feature)
+        else:
+            x = residual_unit(filters=filters, kernel_size=(1, 1),
+                              strides=transition_strides,
+                              dilation_rate=dilation_rate,
+                              conv_name_base=conv_name_base + '2a',
+                              bn_name_base=bn_name_base + '2a')(input_feature)
+        if dropout is not None:
+            x = Dropout(dropout)(x)
+        x = residual_unit(filters=filters, kernel_size=(3, 3),
+                          conv_name_base=conv_name_base + '2b',
+                          bn_name_base=bn_name_base + '2b')(x)
+        if dropout is not None:
+            x = Dropout(dropout)(x)
+        x = residual_unit(filters=filters * 4, kernel_size=(1, 1),
+                          conv_name_base=conv_name_base + '2c',
+                          bn_name_base=bn_name_base + '2c')(x)
+        return _shortcut(input_feature, x)
+    return f
+def _handle_dim_ordering():
+    global ROW_AXIS
+    global COL_AXIS
+    global CHANNEL_AXIS
+    if K.image_data_format() == 'channels_last':
+        ROW_AXIS = 1
+        COL_AXIS = 2
+        CHANNEL_AXIS = 3
+    else:
+        CHANNEL_AXIS = 1
+        ROW_AXIS = 2
+        COL_AXIS = 3
+def _string_to_function(identifier):
+    if isinstance(identifier, six.string_types):
+        res = globals().get(identifier)
+        if not res:
+            raise ValueError('Invalid {}'.format(identifier))
+        return res
+    return identifier
+def ResNet(input_shape=None, classes=10, block='bottleneck', residual_unit='v2',
+           repetitions=None, initial_filters=64, activation='softmax', include_top=True,
+           input_tensor=None, dropout=None, transition_dilation_rate=(1, 1),
+           initial_strides=(2, 2), initial_kernel_size=(7, 7), initial_pooling='max',
+           final_pooling=None, top='classification'):
+    """Builds a custom ResNet like architecture. Defaults to ResNet50 v2.
+    Args:
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(224, 224, 3)` (with `channels_last` dim ordering)
+            or `(3, 224, 224)` (with `channels_first` dim ordering).
+            It should have exactly 3 dimensions,
+            and width and height should be no smaller than 8.
+            E.g. `(224, 224, 3)` would be one valid value.
+        classes: The number of outputs at final softmax layer
+        block: The block function to use. This is either `'basic'` or `'bottleneck'`.
+            The original paper used `basic` for layers < 50.
+        repetitions: Number of repetitions of various block units.
+            At each block unit, the number of filters are doubled and the input size
+            is halved. Default of None implies the ResNet50v2 values of [3, 4, 6, 3].
+        residual_unit: the basic residual unit, 'v1' for conv bn relu, 'v2' for bn relu
+            conv. See [Identity Mappings in
+            Deep Residual Networks](https://arxiv.org/abs/1603.05027)
+            for details.
+        dropout: None for no dropout, otherwise rate of dropout from 0 to 1.
+            Based on [Wide Residual Networks.(https://arxiv.org/pdf/1605.07146) paper.
+        transition_dilation_rate: Dilation rate for transition layers. For semantic
+            segmentation of images use a dilation rate of (2, 2).
+        initial_strides: Stride of the very first residual unit and MaxPooling2D call,
+            with default (2, 2), set to (1, 1) for small images like cifar.
+        initial_kernel_size: kernel size of the very first convolution, (7, 7) for
+            imagenet and (3, 3) for small image datasets like tiny imagenet and cifar.
+            See [ResNeXt](https://arxiv.org/abs/1611.05431) paper for details.
+        initial_pooling: Determine if there will be an initial pooling layer,
+            'max' for imagenet and None for small image datasets.
+            See [ResNeXt](https://arxiv.org/abs/1611.05431) paper for details.
+        final_pooling: Optional pooling mode for feature extraction at the final
+            model layer when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        top: Defines final layers to evaluate based on a specific problem type. Options
+            are 'classification' for ImageNet style problems, 'segmentation' for
+            problems like the Pascal VOC dataset, and None to exclude these layers
+            entirely.
+    Returns:
+        The keras `Model`.
+    """
+    if activation not in ['softmax', 'sigmoid', None]:
+        raise ValueError('activation must be one of "softmax", "sigmoid", or None')
+    if activation == 'sigmoid' and classes != 1:
+        raise ValueError('sigmoid activation can only be used when classes = 1')
+    if repetitions is None:
+        repetitions = [3, 4, 6, 3]
+    # Determine proper input shape
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=32,
+                                      min_size=8,
+                                      data_format=K.image_data_format(),
+                                      require_flatten=include_top)
+    _handle_dim_ordering()
+    if len(input_shape) != 3:
+        raise Exception("Input shape should be a tuple (nb_channels, nb_rows, nb_cols)")
+    if block == 'basic':
+        block_fn = basic_block
+    elif block == 'bottleneck':
+        block_fn = bottleneck
+    elif isinstance(block, six.string_types):
+        block_fn = _string_to_function(block)
+    else:
+        block_fn = block
+    if residual_unit == 'v2':
+        residual_unit = _bn_relu_conv
+    elif residual_unit == 'v1':
+        residual_unit = _conv_bn_relu
+    elif isinstance(residual_unit, six.string_types):
+        residual_unit = _string_to_function(residual_unit)
+    else:
+        residual_unit = residual_unit
+    # Permute dimension order if necessary
+    if K.image_data_format() == 'channels_first':
+        input_shape = (input_shape[1], input_shape[2], input_shape[0])
+    # Determine proper input shape
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=32,
+                                      min_size=8,
+                                      data_format=K.image_data_format(),
+                                      require_flatten=include_top)
+    img_input = Input(shape=input_shape, tensor=input_tensor)
+    x = _conv_bn_relu(filters=initial_filters, kernel_size=initial_kernel_size,
+                      strides=initial_strides)(img_input)
+    if initial_pooling == 'max':
+        x = MaxPooling2D(pool_size=(3, 3), strides=initial_strides, padding="same")(x)
+    block = x
+    filters = initial_filters
+    for i, r in enumerate(repetitions):
+        transition_dilation_rates = [transition_dilation_rate] * r
+        transition_strides = [(1, 1)] * r
+        if transition_dilation_rate == (1, 1):
+            transition_strides[0] = (2, 2)
+        block = _residual_block(block_fn, filters=filters,
+                                stage=i, blocks=r,
+                                is_first_layer=(i == 0),
+                                dropout=dropout,
+                                transition_dilation_rates=transition_dilation_rates,
+                                transition_strides=transition_strides,
+                                residual_unit=residual_unit)(block)
+        filters *= 2
+    # Last activation
+    x = _bn_relu(block)
+    # Classifier block
+    if include_top and top is 'classification':
+        x = GlobalAveragePooling2D()(x)
+        x = Dense(units=classes, activation=activation,
+                  kernel_initializer="he_normal")(x)
+    elif include_top and top is 'segmentation':
+        x = Conv2D(classes, (1, 1), activation='linear', padding='same')(x)
+        if K.image_data_format() == 'channels_first':
+            channel, row, col = input_shape
+        else:
+            row, col, channel = input_shape
+        x = Reshape((row * col, classes))(x)
+        x = Activation(activation)(x)
+        x = Reshape((row, col, classes))(x)
+    elif final_pooling == 'avg':
+        x = GlobalAveragePooling2D()(x)
+    elif final_pooling == 'max':
+        x = GlobalMaxPooling2D()(x)
+    model = Model(inputs=img_input, outputs=x)
+    return model
+def ResNet18(input_shape, classes):
+    """ResNet with 18 layers and v2 residual units
+    """
+    return ResNet(input_shape, classes, basic_block, repetitions=[2, 2, 2, 2])
+def ResNet34(input_shape, classes):
+    """ResNet with 34 layers and v2 residual units
+    """
+    return ResNet(input_shape, classes, basic_block, repetitions=[3, 4, 6, 3])
+def ResNet50(input_shape, classes):
+    """ResNet with 50 layers and v2 residual units
+    """
+    return ResNet(input_shape, classes, bottleneck, repetitions=[3, 4, 6, 3])
+def ResNet101(input_shape, classes):
+    """ResNet with 101 layers and v2 residual units
+    """
+    return ResNet(input_shape, classes, bottleneck, repetitions=[3, 4, 23, 3])
+def ResNet152(input_shape, classes):
+    """ResNet with 152 layers and v2 residual units
+    """
+    return ResNet(input_shape, classes, bottleneck, repetitions=[3, 8, 36, 3])
--- a/code/keras_contrib/applications/wide_resnet.py
+++ b/code/keras_contrib/applications/wide_resnet.py
+# -*- coding: utf-8 -*-
+"""Wide Residual Network models for Keras.
+# Reference
+- [Wide Residual Networks](https://arxiv.org/abs/1605.07146)
+"""
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import division
+import warnings
+from keras.models import Model
+from keras.layers.core import Dense, Dropout, Activation
+from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling2D
+from keras.layers import Input, Conv2D
+from keras.layers.merge import add
+from keras.layers.normalization import BatchNormalization
+from keras.utils.layer_utils import convert_all_kernels_in_model
+from keras.utils.data_utils import get_file
+from keras.engine.topology import get_source_inputs
+from keras_applications.imagenet_utils import _obtain_input_shape
+import keras.backend as K
+# Download URLs of the weight files fetched by WideResidualNetwork when
+# weights='cifar10' is requested with depth 28, width 8 and no dropout.
+# TH_* are used for the 'th' dimension ordering and TF_* otherwise; the
+# *_NO_TOP variants are used when include_top is False.
+TH_WEIGHTS_PATH = ('https://github.com/titu1994/Wide-Residual-Networks/'
+                   'releases/download/v1.2/wrn_28_8_th_kernels_th_dim_ordering.h5')
+TF_WEIGHTS_PATH = ('https://github.com/titu1994/Wide-Residual-Networks/'
+                   'releases/download/v1.2/wrn_28_8_tf_kernels_tf_dim_ordering.h5')
+TH_WEIGHTS_PATH_NO_TOP = ('https://github.com/titu1994/Wide-Residual-Networks/releases/'
+                          'download/v1.2/wrn_28_8_th_kernels_th_dim_ordering_no_top.h5')
+TF_WEIGHTS_PATH_NO_TOP = ('https://github.com/titu1994/Wide-Residual-Networks/releases/'
+                          'download/v1.2/wrn_28_8_tf_kernels_tf_dim_ordering_no_top.h5')
+def WideResidualNetwork(depth=28, width=8, dropout_rate=0.0,
+                        include_top=True, weights='cifar10',
+                        input_tensor=None, input_shape=None,
+                        classes=10, activation='softmax'):
+    """Instantiate the Wide Residual Network architecture,
+        optionally loading weights pre-trained
+        on CIFAR-10. Note that when using TensorFlow,
+        for best performance you should set
+        `image_dim_ordering="tf"` in your Keras config
+        at ~/.keras/keras.json.
+        The model and the weights are compatible with both
+        TensorFlow and Theano. The dimension ordering
+        convention used by the model is the one
+        specified in your Keras config file.
+        # Arguments
+            depth: number or layers in the DenseNet
+            width: multiplier to the ResNet width (number of filters)
+            dropout_rate: dropout rate
+            include_top: whether to include the fully-connected
+                layer at the top of the network.
+            weights: one of `None` (random initialization) or
+                "cifar10" (pre-training on CIFAR-10)..
+            input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+                to use as image input for the model.
+            input_shape: optional shape tuple, only to be specified
+                if `include_top` is False (otherwise the input shape
+                has to be `(32, 32, 3)` (with `tf` dim ordering)
+                or `(3, 32, 32)` (with `th` dim ordering).
+                It should have exactly 3 inputs channels,
+                and width and height should be no smaller than 8.
+                E.g. `(200, 200, 3)` would be one valid value.
+            classes: optional number of classes to classify images
+                into, only to be specified if `include_top` is True, and
+                if no `weights` argument is specified.
+        # Returns
+            A Keras model instance.
+        """
+    if weights not in {'cifar10', None}:
+        raise ValueError('The `weights` argument should be either '
+                         '`None` (random initialization) or `cifar10` '
+                         '(pre-training on CIFAR-10).')
+    if weights == 'cifar10' and include_top and classes != 10:
+        raise ValueError('If using `weights` as CIFAR 10 with `include_top`'
+                         ' as true, `classes` should be 10')
+    if (depth - 4) % 6 != 0:
+        raise ValueError('Depth of the network must be such that (depth - 4)'
+                         'should be divisible by 6.')
+    # Determine proper input shape
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=32,
+                                      min_size=8,
+                                      data_format=K.image_dim_ordering(),
+                                      require_flatten=include_top)
+    if input_tensor is None:
+        img_input = Input(shape=input_shape)
+    else:
+        if not K.is_keras_tensor(input_tensor):
+            img_input = Input(tensor=input_tensor, shape=input_shape)
+        else:
+            img_input = input_tensor
+    x = __create_wide_residual_network(classes, img_input, include_top, depth, width,
+                                       dropout_rate, activation)
+    # Ensure that the model takes into account
+    # any potential predecessors of `input_tensor`.
+    if input_tensor is not None:
+        inputs = get_source_inputs(input_tensor)
+    else:
+        inputs = img_input
+    # Create model.
+    model = Model(inputs, x, name='wide-resnet')
+    # load weights
+    if weights == 'cifar10':
+        if (depth == 28) and (width == 8) and (dropout_rate == 0.0):
+            # Default parameters match. Weights for this model exist:
+            if K.image_dim_ordering() == 'th':
+                if include_top:
+                    h5_file = 'wide_resnet_28_8_th_dim_ordering_th_kernels.h5'
+                    weights_path = get_file(h5_file,
+                                            TH_WEIGHTS_PATH,
+                                            cache_subdir='models')
+                else:
+                    h5_file = 'wide_resnet_28_8_th_dim_ordering_th_kernels_no_top.h5'
+                    weights_path = get_file(h5_file,
+                                            TH_WEIGHTS_PATH_NO_TOP,
+                                            cache_subdir='models')
+                model.load_weights(weights_path)
+                if K.backend() == 'tensorflow':
+                    warnings.warn('You are using the TensorFlow backend, yet you '
+                                  'are using the Theano '
+                                  'image dimension ordering convention '
+                                  '(`image_dim_ordering="th"`). '
+                                  'For best performance, set '
+                                  '`image_dim_ordering="tf"` in '
+                                  'your Keras config '
+                                  'at ~/.keras/keras.json.')
+                    convert_all_kernels_in_model(model)
+            else:
+                if include_top:
+                    h5_file = 'wide_resnet_28_8_tf_dim_ordering_tf_kernels.h5'
+                    weights_path = get_file(h5_file,
+                                            TF_WEIGHTS_PATH,
+                                            cache_subdir='models')
+                else:
+                    h5_file = 'wide_resnet_28_8_tf_dim_ordering_tf_kernels_no_top.h5'
+                    weights_path = get_file(h5_file,
+                                            TF_WEIGHTS_PATH_NO_TOP,
+                                            cache_subdir='models')
+                model.load_weights(weights_path)
+                if K.backend() == 'theano':
+                    convert_all_kernels_in_model(model)
+    return model
+def __conv1_block(input):
+    x = Conv2D(16, (3, 3), padding='same')(input)
+    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+    x = BatchNormalization(axis=channel_axis)(x)
+    x = Activation('relu')(x)
+    return x
+def __conv2_block(input, k=1, dropout=0.0):
+    init = input
+    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+    # Check if input number of filters is same as 16 * k, else create
+    # convolution2d for this input
+    if K.image_data_format() == 'channels_first':
+        if init._keras_shape[1] != 16 * k:
+            init = Conv2D(16 * k, (1, 1), activation='linear', padding='same')(init)
+    else:
+        if init._keras_shape[-1] != 16 * k:
+            init = Conv2D(16 * k, (1, 1), activation='linear', padding='same')(init)
+    x = Conv2D(16 * k, (3, 3), padding='same')(input)
+    x = BatchNormalization(axis=channel_axis)(x)
+    x = Activation('relu')(x)
+    if dropout > 0.0:
+        x = Dropout(dropout)(x)
+    x = Conv2D(16 * k, (3, 3), padding='same')(x)
+    x = BatchNormalization(axis=channel_axis)(x)
+    x = Activation('relu')(x)
+    m = add([init, x])
+    return m
+def __conv3_block(input, k=1, dropout=0.0):
+    init = input
+    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+    # Check if input number of filters is same as 32 * k, else
+    # create convolution2d for this input
+    if K.image_data_format() == 'channels_first':
+        if init._keras_shape[1] != 32 * k:
+            init = Conv2D(32 * k, (1, 1), activation='linear', padding='same')(init)
+    else:
+        if init._keras_shape[-1] != 32 * k:
+            init = Conv2D(32 * k, (1, 1), activation='linear', padding='same')(init)
+    x = Conv2D(32 * k, (3, 3), padding='same')(input)
+    x = BatchNormalization(axis=channel_axis)(x)
+    x = Activation('relu')(x)
+    if dropout > 0.0:
+        x = Dropout(dropout)(x)
+    x = Conv2D(32 * k, (3, 3), padding='same')(x)
+    x = BatchNormalization(axis=channel_axis)(x)
+    x = Activation('relu')(x)
+    m = add([init, x])
+    return m
+def ___conv4_block(input, k=1, dropout=0.0):
+    init = input
+    channel_axis = 1 if K.image_dim_ordering() == 'th' else -1
+    # Check if input number of filters is same as 64 * k, else
+    # create convolution2d for this input
+    if K.image_dim_ordering() == 'th':
+        if init._keras_shape[1] != 64 * k:
+            init = Conv2D(64 * k, (1, 1), activation='linear', padding='same')(init)
+    else:
+        if init._keras_shape[-1] != 64 * k:
+            init = Conv2D(64 * k, (1, 1), activation='linear', padding='same')(init)
+    x = Conv2D(64 * k, (3, 3), padding='same')(input)
+    x = BatchNormalization(axis=channel_axis)(x)
+    x = Activation('relu')(x)
+    if dropout > 0.0:
+        x = Dropout(dropout)(x)
+    x = Conv2D(64 * k, (3, 3), padding='same')(x)
+    x = BatchNormalization(axis=channel_axis)(x)
+    x = Activation('relu')(x)
+    m = add([init, x])
+    return m
+def __create_wide_residual_network(nb_classes, img_input, include_top, depth=28,
+                                   width=8, dropout=0.0, activation='softmax'):
+    ''' Creates a Wide Residual Network with specified parameters
+    Args:
+        nb_classes: Number of output classes
+        img_input: Input tensor or layer
+        include_top: Flag to include the last dense layer
+        depth: Depth of the network. Compute N = (n - 4) / 6.
+               For a depth of 16, n = 16, N = (16 - 4) / 6 = 2
+               For a depth of 28, n = 28, N = (28 - 4) / 6 = 4
+               For a depth of 40, n = 40, N = (40 - 4) / 6 = 6
+        width: Width of the network.
+        dropout: Adds dropout if value is greater than 0.0
+    Returns:a Keras Model
+    '''
+    N = (depth - 4) // 6
+    x = __conv1_block(img_input)
+    nb_conv = 4
+    for i in range(N):
+        x = __conv2_block(x, width, dropout)
+        nb_conv += 2
+    x = MaxPooling2D((2, 2))(x)
+    for i in range(N):
+        x = __conv3_block(x, width, dropout)
+        nb_conv += 2
+    x = MaxPooling2D((2, 2))(x)
+    for i in range(N):
+        x = ___conv4_block(x, width, dropout)
+        nb_conv += 2
+    if include_top:
+        x = GlobalAveragePooling2D()(x)
+        x = Dense(nb_classes, activation=activation)(x)
+    return x
--- a/code/keras_contrib/backend/__init__.py
+++ b/code/keras_contrib/backend/__init__.py
+from keras import backend as K
+# We import all keras backend functions here,
+# so that files in this repo can import both
+# core and contrib backend functions with a
+# single import statement.
+if K.backend() == 'theano':
+    from .theano_backend import *
+elif K.backend() == 'tensorflow':
+    from .tensorflow_backend import *
+elif K.backend() == 'cntk':
+    from .cntk_backend import *
--- a/code/keras_contrib/backend/__pycache__/__init__.cpython-310.pyc
+++ b/code/keras_contrib/backend/__pycache__/__init__.cpython-310.pyc
--- a/code/keras_contrib/backend/__pycache__/tensorflow_backend.cpython-310.pyc
+++ b/code/keras_contrib/backend/__pycache__/tensorflow_backend.cpython-310.pyc
--- a/code/keras_contrib/backend/cntk_backend.py
+++ b/code/keras_contrib/backend/cntk_backend.py
+from keras.backend import cntk_backend as KCN
+def moments(x, axes, shift=None, keep_dims=False):
+    ''' Calculates and returns the mean and variance of the input '''
+    mean, variant = KCN._moments(x, axes=axes, shift=shift, keep_dims=keep_dims)
+    return mean, variant
--- a/code/keras_contrib/backend/numpy_backend.py
+++ b/code/keras_contrib/backend/numpy_backend.py
+import numpy as np
+from keras import backend as K
+def extract_image_patches(X, ksizes, strides,
+                          padding='valid',
+                          data_format='channels_first'):
+    raise NotImplementedError
+def depth_to_space(input, scale, data_format=None):
+    raise NotImplementedError
+def moments(x, axes, shift=None, keep_dims=False):
+    mean_batch = np.mean(x, axis=tuple(axes), keepdims=keep_dims)
+    var_batch = np.var(x, axis=tuple(axes), keepdims=keep_dims)
+    return mean_batch, var_batch
--- a/code/keras_contrib/backend/tensorflow_backend.py
+++ b/code/keras_contrib/backend/tensorflow_backend.py
+import tensorflow as tf
+try:
+    from tensorflow.python.ops import ctc_ops as ctc
+except ImportError:
+    import tensorflow.contrib.ctc as ctc
+import keras.backend as K
+py_all = all
+def _preprocess_conv2d_input(x, data_format):
+    """Transpose and cast the input before the conv2d.
+    # Arguments
+        x: input tensor.
+        data_format: string, `"channels_last"` or `"channels_first"`.
+    # Returns
+        A tensor.
+    """
+    if K.dtype(x) == 'float64':
+        x = tf.cast(x, 'float32')
+    if data_format == 'channels_first':
+        # TF uses the last dimension as channel dimension,
+        # instead of the 2nd one.
+        # TH input shape: (samples, input_depth, rows, cols)
+        # TF input shape: (samples, rows, cols, input_depth)
+        x = tf.transpose(x, (0, 2, 3, 1))
+    return x
+def _postprocess_conv2d_output(x, data_format):
+    """Transpose and cast the output from conv2d if needed.
+    # Arguments
+        x: A tensor.
+        data_format: string, `"channels_last"` or `"channels_first"`.
+    # Returns
+        A tensor.
+    """
+    if data_format == 'channels_first':
+        x = tf.transpose(x, (0, 3, 1, 2))
+    if K.floatx() == 'float64':
+        x = tf.cast(x, 'float64')
+    return x
+def _preprocess_padding(padding):
+    """Convert keras' padding to tensorflow's padding.
+    # Arguments
+        padding: string, `"same"` or `"valid"`.
+    # Returns
+        a string, `"SAME"` or `"VALID"`.
+    # Raises
+        ValueError: if `padding` is invalid.
+    """
+    if padding == 'same':
+        padding = 'SAME'
+    elif padding == 'valid':
+        padding = 'VALID'
+    else:
+        raise ValueError('Invalid padding:', padding)
+    return padding
+def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_first',
+           image_shape=None, filter_shape=None):
+    """2D convolution.
+    # Arguments
+        x: Input tensor
+        kernel: kernel tensor.
+        strides: strides tuple.
+        padding: string, "same" or "valid".
+        data_format: 'channels_first' or 'channels_last'.
+            Whether to use Theano or TensorFlow dimension
+            ordering in inputs/kernels/ouputs.
+        image_shape: Optional, the input tensor shape
+        filter_shape: Optional, the kernel shape.
+    # Returns
+        x convolved with the kernel.
+    # Raises
+        Exception: In case of invalid border mode or data format.
+    """
+    return K.conv2d(x, kernel, strides, padding, data_format)
+def extract_image_patches(x, ksizes, ssizes, padding='same',
+                          data_format='channels_last'):
+    """Extract the patches from an image.
+    # Arguments
+        x: The input image
+        ksizes: 2-d tuple with the kernel size
+        ssizes: 2-d tuple with the strides size
+        padding: 'same' or 'valid'
+        data_format: 'channels_last' or 'channels_first'
+    # Returns
+        The (k_w,k_h) patches extracted
+        TF ==> (batch_size,w,h,k_w,k_h,c)
+        TH ==> (batch_size,w,h,c,k_w,k_h)
+    """
+    kernel = [1, ksizes[0], ksizes[1], 1]
+    strides = [1, ssizes[0], ssizes[1], 1]
+    padding = _preprocess_padding(padding)
+    if data_format == 'channels_first':
+        x = K.permute_dimensions(x, (0, 2, 3, 1))
+    bs_i, w_i, h_i, ch_i = K.int_shape(x)
+    patches = tf.extract_image_patches(x, kernel, strides, [1, 1, 1, 1],
+                                       padding)
+    # Reshaping to fit Theano
+    bs, w, h, ch = K.int_shape(patches)
+    reshaped = tf.reshape(patches, [-1, w, h, tf.floordiv(ch, ch_i), ch_i])
+    final_shape = [-1, w, h, ch_i, ksizes[0], ksizes[1]]
+    patches = tf.reshape(tf.transpose(reshaped, [0, 1, 2, 4, 3]), final_shape)
+    if data_format == 'channels_last':
+        patches = K.permute_dimensions(patches, [0, 1, 2, 4, 5, 3])
+    return patches
+def depth_to_space(input, scale, data_format=None):
+    """ Uses phase shift algorithm to convert channels/depth for spatial resolution.
+    # Arguments
+        input: Input tensor
+        scale: n `int` that is `>= 2`. The size of the spatial block.
+        data_format: 'channels_first' or 'channels_last'.
+            Whether to use Theano or TensorFlow dimension
+            ordering in inputs/kernels/ouputs.
+    # Returns
+        TODO (PR welcome): Filling this section.
+    """
+    if data_format is None:
+        data_format = K.image_data_format()
+    data_format = data_format.lower()
+    input = _preprocess_conv2d_input(input, data_format)
+    out = tf.depth_to_space(input, scale)
+    out = _postprocess_conv2d_output(out, data_format)
+    return out
+def moments(x, axes, shift=None, keep_dims=False):
+    '''Wrapper over the TensorFlow call `tf.nn.moments`.
+    All arguments are forwarded unchanged (`shift` and `keep_dims` by
+    keyword) and the call's result is returned as is.
+    '''
+    return tf.nn.moments(x, axes, shift=shift, keep_dims=keep_dims)
--- a/code/keras_contrib/backend/theano_backend.py
+++ b/code/keras_contrib/backend/theano_backend.py
+from theano import tensor as T
+from theano.sandbox.neighbours import images2neibs
+try:
+    import theano.sparse as th_sparse_module
+except ImportError:
+    th_sparse_module = None
+try:
+    from theano.tensor.nnet.nnet import softsign as T_softsign
+except ImportError:
+    from theano.sandbox.softsign import softsign as T_softsign
+from keras.backend import theano_backend as KTH
+from keras.backend.common import image_data_format
+from keras.backend.theano_backend import _preprocess_conv2d_input
+from keras.backend.theano_backend import _postprocess_conv2d_output
+py_all = all
+def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_first',
+           image_shape=None, filter_shape=None):
+    '''2D convolution built on `T.nnet.conv2d`.
+    # Arguments
+        x: input tensor.
+        kernel: kernel tensor; `kernel.eval()` is called when `padding`
+            is "same", so it must be evaluable at that point.
+        strides: strides tuple, passed to Theano as `subsample`.
+        padding: string, "same" or "valid".
+        data_format: 'channels_first' or 'channels_last'.
+        image_shape: optional input shape, given in `data_format` order.
+        filter_shape: optional kernel shape, given in `data_format` order.
+    # Returns
+        The convolution output, laid out according to `data_format`.
+    # Raises
+        Exception: if `data_format` or `padding` is not supported.
+    '''
+    if data_format not in {'channels_first', 'channels_last'}:
+        raise Exception('Unknown data_format ' + str(data_format))
+    if data_format == 'channels_last':
+        # TF uses the last dimension as channel dimension,
+        # instead of the 2nd one.
+        # TH input shape: (samples, input_depth, rows, cols)
+        # TF input shape: (samples, rows, cols, input_depth)
+        # TH kernel shape: (depth, input_depth, rows, cols)
+        # TF kernel shape: (rows, cols, input_depth, depth)
+        x = x.dimshuffle((0, 3, 1, 2))
+        kernel = kernel.dimshuffle((3, 2, 0, 1))
+        # Reorder the optional static shapes the same way as the tensors.
+        if image_shape:
+            image_shape = (image_shape[0], image_shape[3],
+                           image_shape[1], image_shape[2])
+        if filter_shape:
+            filter_shape = (filter_shape[3], filter_shape[2],
+                            filter_shape[0], filter_shape[1])
+    if padding == 'same':
+        th_padding = 'half'
+        # Concrete kernel values are needed below to test whether the
+        # kernel's spatial sizes are even.
+        np_kernel = kernel.eval()
+    elif padding == 'valid':
+        th_padding = 'valid'
+    else:
+        raise Exception('Border mode not supported: ' + str(padding))
+    # Theano might not accept long type
+    def int_or_none(value):
+        # Entries for which int() raises TypeError (e.g. None) become None.
+        try:
+            return int(value)
+        except TypeError:
+            return None
+    if image_shape is not None:
+        image_shape = tuple(int_or_none(v) for v in image_shape)
+    if filter_shape is not None:
+        filter_shape = tuple(int_or_none(v) for v in filter_shape)
+    conv_out = T.nnet.conv2d(x, kernel,
+                             border_mode=th_padding,
+                             subsample=strides,
+                             input_shape=image_shape,
+                             filter_shape=filter_shape)
+    if padding == 'same':
+        # For an even kernel size, crop the output along that axis to
+        # ceil(input_size / stride) entries.
+        if np_kernel.shape[2] % 2 == 0:
+            end = (x.shape[2] + strides[0] - 1) // strides[0]
+            conv_out = conv_out[:, :, :end, :]
+        if np_kernel.shape[3] % 2 == 0:
+            end = (x.shape[3] + strides[1] - 1) // strides[1]
+            conv_out = conv_out[:, :, :, :end]
+    if data_format == 'channels_last':
+        # Restore the caller's channels-last layout.
+        conv_out = conv_out.dimshuffle((0, 2, 3, 1))
+    return conv_out
+def extract_image_patches(X, ksizes, strides,
+                          padding='valid',
+                          data_format='channels_first'):
+    '''
+    Extract the patches from an image
+    Parameters
+    ----------
+    X : The input image
+    ksizes : 2-d tuple with the kernel size
+    strides : 2-d tuple with the strides size
+    padding : 'same' or 'valid'
+    data_format : 'channels_last' or 'channels_first'
+    Returns
+    -------
+    The (k_w,k_h) patches extracted
+    TF ==> (batch_size,w,h,k_w,k_h,c)
+    TH ==> (batch_size,w,h,c,k_w,k_h)
+    '''
+    patch_size = ksizes[1]
+    if padding == 'same':
+        padding = 'ignore_borders'
+    if data_format == 'channels_last':
+        X = KTH.permute_dimensions(X, [0, 3, 1, 2])
+    # Thanks to https://github.com/awentzonline for the help!
+    batch, c, w, h = KTH.shape(X)
+    xs = KTH.shape(X)
+    num_rows = 1 + (xs[-2] - patch_size) // strides[1]
+    num_cols = 1 + (xs[-1] - patch_size) // strides[1]
+    num_channels = xs[-3]
+    patches = images2neibs(X, ksizes, strides, padding)
+    # Theano is sorting by channel
+    new_shape = (batch, num_channels, num_rows * num_cols, patch_size, patch_size)
+    patches = KTH.reshape(patches, new_shape)
+    patches = KTH.permute_dimensions(patches, (0, 2, 1, 3, 4))
+    # arrange in a 2d-grid (rows, cols, channels, px, py)
+    new_shape = (batch, num_rows, num_cols, num_channels, patch_size, patch_size)
+    patches = KTH.reshape(patches, new_shape)
+    if data_format == 'channels_last':
+        patches = KTH.permute_dimensions(patches, [0, 1, 2, 4, 5, 3])
+    return patches
+def depth_to_space(input, scale, data_format=None):
+    """Uses phase shift algorithm to convert
+    channels/depth for spatial resolution
+    """
+    if data_format is None:
+        data_format = image_data_format()
+    data_format = data_format.lower()
+    input = _preprocess_conv2d_input(input, data_format)
+    b, k, row, col = input.shape
+    out_channels = k // (scale ** 2)
+    x = T.reshape(input, (b, scale, scale, out_channels, row, col))
+    x = T.transpose(x, (0, 3, 4, 1, 5, 2))
+    out = T.reshape(x, (b, out_channels, row * scale, col * scale))
+    out = _postprocess_conv2d_output(out, input, None, None, None, data_format)
+    return out
+def moments(x, axes, shift=None, keep_dims=False):
+    ''' Calculates and returns the mean and variance of the input '''
+    mean_batch = KTH.mean(x, axis=axes, keepdims=keep_dims)
+    var_batch = KTH.var(x, axis=axes, keepdims=keep_dims)
+    return mean_batch, var_batch
--- a/code/keras_contrib/callbacks/__init__.py
+++ b/code/keras_contrib/callbacks/__init__.py
+from .snapshot import SnapshotCallbackBuilder, SnapshotModelCheckpoint
+from .dead_relu_detector import DeadReluDetector
+from .cyclical_learning_rate import CyclicLR
+from .tensorboard import TensorBoardGrouped
--- a/code/keras_contrib/callbacks/__pycache__/__init__.cpython-310.pyc
+++ b/code/keras_contrib/callbacks/__pycache__/__init__.cpython-310.pyc
--- a/code/keras_contrib/callbacks/__pycache__/cyclical_learning_rate.cpython-310.pyc
+++ b/code/keras_contrib/callbacks/__pycache__/cyclical_learning_rate.cpython-310.pyc
--- a/code/keras_contrib/callbacks/__pycache__/dead_relu_detector.cpython-310.pyc
+++ b/code/keras_contrib/callbacks/__pycache__/dead_relu_detector.cpython-310.pyc
--- a/code/keras_contrib/callbacks/__pycache__/snapshot.cpython-310.pyc
+++ b/code/keras_contrib/callbacks/__pycache__/snapshot.cpython-310.pyc
--- a/code/keras_contrib/callbacks/__pycache__/tensorboard.cpython-310.pyc
+++ b/code/keras_contrib/callbacks/__pycache__/tensorboard.cpython-310.pyc
--- a/code/keras_contrib/callbacks/cyclical_learning_rate.py
+++ b/code/keras_contrib/callbacks/cyclical_learning_rate.py
+from keras.callbacks import Callback
+from keras import backend as K
+import numpy as np
+class CyclicLR(Callback):
+    """This callback implements a cyclical learning rate policy (CLR).
+    The method cycles the learning rate between two boundaries with
+    some constant frequency.
+    # Arguments
+        base_lr: initial learning rate which is the
+            lower boundary in the cycle.
+        max_lr: upper boundary in the cycle. Functionally,
+            it defines the cycle amplitude (max_lr - base_lr).
+            The lr at any cycle is the sum of base_lr
+            and some scaling of the amplitude; therefore
+            max_lr may not actually be reached depending on
+            scaling function.
+        step_size: number of training iterations per
+            half cycle. Authors suggest setting step_size
+            2-8 x training iterations in epoch.
+        mode: one of {triangular, triangular2, exp_range}.
+            Default 'triangular'.
+            Values correspond to policies detailed above.
+            If scale_fn is not None, this argument is ignored.
+        gamma: constant in 'exp_range' scaling function:
+            gamma**(cycle iterations)
+        scale_fn: Custom scaling policy defined by a single
+            argument lambda function, where
+            0 <= scale_fn(x) <= 1 for all x >= 0.
+            mode parameter is ignored
+        scale_mode: {'cycle', 'iterations'}.
+            Defines whether scale_fn is evaluated on
+            cycle number or cycle iterations (training
+            iterations since start of cycle). Default is 'cycle'.
+    The amplitude of the cycle can be scaled on a per-iteration or
+    per-cycle basis.
+    This class has three built-in policies, as put forth in the paper.
+    "triangular":
+        A basic triangular cycle w/ no amplitude scaling.
+    "triangular2":
+        A basic triangular cycle that scales initial amplitude by half each cycle.
+    "exp_range":
+        A cycle that scales initial amplitude by gamma**(cycle iterations) at each
+        cycle iteration.
+    For more detail, please see paper.
+    # Example for CIFAR-10 w/ batch size 100:
+        ```python
+            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
+                                step_size=2000., mode='triangular')
+            model.fit(X_train, Y_train, callbacks=[clr])
+        ```
+    Class also supports custom scaling functions:
+        ```python
+            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
+            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
+                                step_size=2000., scale_fn=clr_fn,
+                                scale_mode='cycle')
+            model.fit(X_train, Y_train, callbacks=[clr])
+        ```
+    # References
+      - [Cyclical Learning Rates for Training Neural Networks](
+      https://arxiv.org/abs/1506.01186)
+    """
+    def __init__(
+            self,
+            base_lr=0.001,
+            max_lr=0.006,
+            step_size=2000.,
+            mode='triangular',
+            gamma=1.,
+            scale_fn=None,
+            scale_mode='cycle'):
+        super(CyclicLR, self).__init__()
+        if mode not in ['triangular', 'triangular2',
+                        'exp_range']:
+            raise KeyError("mode must be one of 'triangular', "
+                           "'triangular2', or 'exp_range'")
+        self.base_lr = base_lr
+        self.max_lr = max_lr
+        self.step_size = step_size
+        self.mode = mode
+        self.gamma = gamma
+        if scale_fn is None:
+            if self.mode == 'triangular':
+                self.scale_fn = lambda x: 1.
+                self.scale_mode = 'cycle'
+            elif self.mode == 'triangular2':
+                self.scale_fn = lambda x: 1 / (2.**(x - 1))
+                self.scale_mode = 'cycle'
+            elif self.mode == 'exp_range':
+                self.scale_fn = lambda x: gamma ** x
+                self.scale_mode = 'iterations'
+        else:
+            self.scale_fn = scale_fn
+            self.scale_mode = scale_mode
+        self.clr_iterations = 0.
+        self.trn_iterations = 0.
+        self.history = {}
+        self._reset()
+    def _reset(self, new_base_lr=None, new_max_lr=None,
+               new_step_size=None):
+        """Resets cycle iterations.
+        Optional boundary/step size adjustment.
+        """
+        if new_base_lr is not None:
+            self.base_lr = new_base_lr
+        if new_max_lr is not None:
+            self.max_lr = new_max_lr
+        if new_step_size is not None:
+            self.step_size = new_step_size
+        self.clr_iterations = 0.
+    def clr(self):
+        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
+        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)
+        if self.scale_mode == 'cycle':
+            return self.base_lr + (self.max_lr - self.base_lr) * \
+                np.maximum(0, (1 - x)) * self.scale_fn(cycle)
+        else:
+            return self.base_lr + (self.max_lr - self.base_lr) * \
+                np.maximum(0, (1 - x)) * self.scale_fn(self.clr_iterations)
+    def on_train_begin(self, logs={}):
+        logs = logs or {}
+        if self.clr_iterations == 0:
+            K.set_value(self.model.optimizer.lr, self.base_lr)
+        else:
+            K.set_value(self.model.optimizer.lr, self.clr())
+    def on_batch_end(self, epoch, logs=None):
+        logs = logs or {}
+        self.trn_iterations += 1
+        self.clr_iterations += 1
+        K.set_value(self.model.optimizer.lr, self.clr())
+        self.history.setdefault(
+            'lr', []).append(
+            K.get_value(
+                self.model.optimizer.lr))
+        self.history.setdefault('iterations', []).append(self.trn_iterations)
+        for k, v in logs.items():
+            self.history.setdefault(k, []).append(v)
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        logs['lr'] = K.get_value(self.model.optimizer.lr)