Commit 1e2486af authored by sunxx1

Add inception_v3 test code

import torch
import torch.nn as nn
__all__ = ['mnasnet']
class _InvertedResidual(nn.Module):
def __init__(self, in_ch, out_ch, kernel_size, stride, expansion_factor):
super(_InvertedResidual, self).__init__()
assert stride in [1, 2]
assert kernel_size in [3, 5]
mid_ch = in_ch * expansion_factor
self.apply_residual = (in_ch == out_ch and stride == 1)
self.layers = nn.Sequential(
# Pointwise
nn.Conv2d(in_ch, mid_ch, 1, bias=False),
nn.BatchNorm2d(mid_ch),
nn.ReLU(inplace=True),
# Depthwise
nn.Conv2d(mid_ch,
mid_ch,
kernel_size,
padding=kernel_size // 2,
stride=stride,
groups=mid_ch,
bias=False),
nn.BatchNorm2d(mid_ch),
nn.ReLU(inplace=True),
# Linear pointwise. Note that there's no activation.
nn.Conv2d(mid_ch, out_ch, 1, bias=False),
nn.BatchNorm2d(out_ch))
def forward(self, input):
if self.apply_residual:
return self.layers(input) + input
else:
return self.layers(input)
def _stack(in_ch, out_ch, kernel_size, stride, exp_factor, repeats):
""" Creates a stack of inverted residuals. """
assert repeats >= 1
# First one has no skip, because feature map size changes.
first = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor)
remaining = []
for _ in range(1, repeats):
remaining.append(
_InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor))
return nn.Sequential(first, *remaining)
def _round_to_multiple_of(val, divisor, round_up_bias=0.9):
""" Asymmetric rounding to make `val` divisible by `divisor`. With default
bias, will round up, unless the number is no more than 10% greater than the
smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88. """
assert 0.0 < round_up_bias < 1.0
new_val = max(divisor, int(val + divisor / 2) // divisor * divisor)
return new_val if new_val >= round_up_bias * val else new_val + divisor
def _get_depths(scale):
""" Scales tensor depths as in reference MobileNet code, prefers rouding up
rather than down. """
depths = [32, 16, 24, 40, 80, 96, 192, 320]
return [_round_to_multiple_of(depth * scale, 8) for depth in depths]
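# Worked example of the asymmetric rounding above (values checked by hand; an
# executable check is included in the smoke test after mnasnet() below):
#   _round_to_multiple_of(83, 8) -> 80   # 83 is within 10% of 80, round down
#   _round_to_multiple_of(84, 8) -> 88   # 84 is more than 10% above 80, round up
#   _get_depths(1.0)             -> [32, 16, 24, 40, 80, 96, 192, 320]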
class MNASNet(torch.nn.Module):
# Version 2 adds depth scaling in the initial stages of the network.
_version = 2
def __init__(self, scale, num_classes=1000, dropout=0.2):
super(MNASNet, self).__init__()
assert scale > 0.0
self.scale = scale
self.num_classes = num_classes
depths = _get_depths(scale)
layers = [
# First layer: regular conv.
nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
nn.BatchNorm2d(depths[0]),
nn.ReLU(inplace=True),
# Depthwise separable, no skip.
nn.Conv2d(depths[0],
depths[0],
3,
padding=1,
stride=1,
groups=depths[0],
bias=False),
nn.BatchNorm2d(depths[0]),
nn.ReLU(inplace=True),
nn.Conv2d(depths[0], depths[1], 1, padding=0, stride=1,
bias=False),
nn.BatchNorm2d(depths[1]),
# MNASNet blocks: stacks of inverted residuals.
_stack(depths[1], depths[2], 3, 2, 3, 3),
_stack(depths[2], depths[3], 5, 2, 3, 3),
_stack(depths[3], depths[4], 5, 2, 6, 3),
_stack(depths[4], depths[5], 3, 1, 6, 2),
_stack(depths[5], depths[6], 5, 2, 6, 4),
_stack(depths[6], depths[7], 3, 1, 6, 1),
# Final mapping to classifier input.
nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False),
nn.BatchNorm2d(1280),
nn.ReLU(inplace=True),
]
self.layers = nn.Sequential(*layers)
self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True),
nn.Linear(1280, num_classes))
self._initialize_weights()
def forward(self, x):
x = self.layers(x)
# Equivalent to global avgpool and removing H and W dimensions.
x = x.mean([2, 3])
return self.classifier(x)
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight,
mode="fan_out",
nonlinearity="relu")
if m.bias is not None:
nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
nn.init.ones_(m.weight)
nn.init.zeros_(m.bias)
elif isinstance(m, nn.Linear):
nn.init.kaiming_uniform_(m.weight,
mode="fan_out",
nonlinearity="sigmoid")
nn.init.zeros_(m.bias)
def mnasnet(**kwargs):
model = MNASNet(**kwargs)
return model
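# Hedged smoke test, not part of the original commit: it verifies the rounding
# examples above and runs one dummy 224x224 batch (an assumed input size)
# through the model to confirm the classifier output shape.
if __name__ == '__main__':
    assert _round_to_multiple_of(83, 8) == 80
    assert _round_to_multiple_of(84, 8) == 88
    net = mnasnet(scale=1.0, num_classes=1000)
    with torch.no_grad():
        out = net(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])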
import torch.nn as nn
from torch.nn import init
__all__ = ["mobile_v1"]
class MobileNetV1(nn.Module):
def __init__(self, scale=1.0, num_classes=1000, bn_group=None):
super(MobileNetV1, self).__init__()
BN = nn.BatchNorm2d
self.scale = scale
def conv_bn(inp, oup, stride):
return nn.Sequential(nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
BN(oup), nn.ReLU(inplace=True))
def conv_dw(inp, oup, stride):
inp = int(inp * scale)
oup = int(oup * scale)
return nn.Sequential(
nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
BN(inp),
nn.ReLU(inplace=True),
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
BN(oup),
nn.ReLU(inplace=True),
)
self.model = nn.Sequential(
conv_bn(3, int(32 * scale), 2),
conv_dw(32, 64, 1),
conv_dw(64, 128, 2),
conv_dw(128, 128, 1),
conv_dw(128, 256, 2),
conv_dw(256, 256, 1),
conv_dw(256, 512, 2),
conv_dw(512, 512, 1),
conv_dw(512, 512, 1),
conv_dw(512, 512, 1),
conv_dw(512, 512, 1),
conv_dw(512, 512, 1),
conv_dw(512, 1024, 2),
conv_dw(1024, 1024, 1),
nn.AvgPool2d(7),
)
self.fc = nn.Linear(int(1024 * scale), num_classes)
self.init_params()
def init_params(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
init.constant_(m.weight, 1)
init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
init.normal_(m.weight, std=0.01)
if m.bias is not None:
init.constant_(m.bias, 0)
def forward(self, x):
x = self.model(x)
x = x.view(-1, int(1024 * self.scale))
x = self.fc(x)
return x
def mobile_v1(**kwargs):
model = MobileNetV1(**kwargs)
return model
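# Hedged smoke test, not part of the original file: MobileNetV1 as written ends
# with a fixed 7x7 average pool, so a 224x224 input is assumed here.
if __name__ == '__main__':
    import torch
    net = mobile_v1(scale=1.0, num_classes=1000)
    with torch.no_grad():
        out = net(torch.randn(2, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([2, 1000])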
import torch.nn as nn
__all__ = ['mobile_v2']
def conv_bn(inp, oup, stride):
return nn.Sequential(nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
nn.BatchNorm2d(oup), nn.ReLU6(inplace=False))
def conv_1x1_bn(inp, oup):
return nn.Sequential(nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup), nn.ReLU6(inplace=False))
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride, expand_ratio):
super(InvertedResidual, self).__init__()
self.stride = stride
assert stride in [1, 2]
hidden_dim = round(inp * expand_ratio)
self.use_res_connect = self.stride == 1 and inp == oup
if expand_ratio == 1:
self.conv = nn.Sequential(
nn.Conv2d(hidden_dim,
hidden_dim,
3,
stride,
1,
groups=hidden_dim,
bias=False),
nn.BatchNorm2d(hidden_dim),
nn.ReLU6(inplace=False),
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
)
else:
self.conv = nn.Sequential(
nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
nn.BatchNorm2d(hidden_dim),
nn.ReLU6(inplace=False),
nn.Conv2d(hidden_dim,
hidden_dim,
3,
stride,
1,
groups=hidden_dim,
bias=False),
nn.BatchNorm2d(hidden_dim),
nn.ReLU6(inplace=False),
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
class MobileNetV2(nn.Module):
def __init__(self,
scale=1,
num_classes=1000,
input_size=224,
width_mult=1.):
super(MobileNetV2, self).__init__()
block = InvertedResidual
input_channel = 32
last_channel = 1280
# each row: expansion factor t, output channels c, number of blocks n, stride s
inverted_residual_setting = [
[1, 16, 1, 1],
[6, 24, 2, 2],
[6, 32, 3, 2],
[6, 64, 4, 2],
[6, 96, 3, 1],
[6, 160, 3, 2],
[6, 320, 1, 1],
]
assert input_size % 32 == 0
input_channel = int(input_channel * width_mult)
self.last_channel = int(
last_channel * width_mult) if width_mult > 1.0 else last_channel
self.features = [conv_bn(3, input_channel, 2)]
for t, c, n, s in inverted_residual_setting:
output_channel = int(c * width_mult)
for i in range(n):
if i == 0:
self.features.append(
block(input_channel, output_channel, s,
expand_ratio=t))
else:
self.features.append(
block(input_channel, output_channel, 1,
expand_ratio=t))
input_channel = output_channel
self.features.append(conv_1x1_bn(input_channel, self.last_channel))
self.features = nn.Sequential(*self.features)
self.classifier = nn.Sequential(
nn.Dropout(0.2),
# nn.Conv2d(self.last_channel, num_classes, kernel_size=1))
nn.Linear(self.last_channel, num_classes))
self._initialize_weights()
def forward(self, x):
x = self.features(x)
x = x.mean(3).mean(2)
x = self.classifier(x)
# x = x.view(x.size(0), -1)
return x
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
nn.init.normal_(m.weight, std=0.001)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
def mobile_v2(**kwargs):
model = MobileNetV2(**kwargs)
return model
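# Hedged smoke test, not part of the original file: the assert in __init__
# requires input_size to be a multiple of 32, so 224 is used here.
if __name__ == '__main__':
    import torch
    net = mobile_v2(num_classes=1000, input_size=224)
    with torch.no_grad():
        out = net(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])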
import torch.nn as nn
import torch.nn.functional as F
__all__ = ['mobile_v3']
def _make_divisible(v, divisor, min_value=None):
"""
This function is taken from the original tf repo.
It ensures that all layers have a channel number that is divisible by 8
It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
:param v:
:param divisor:
:param min_value:
:return:
"""
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
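# Worked example of the rounding above (values checked by hand; an executable
# check is included in the smoke test after mobile_v3() below):
#   _make_divisible(12, 8)   -> 16  (a tie between 8 and 16 rounds up)
#   _make_divisible(19, 8)   -> 24  (16 would be a >10% drop, so round up)
#   _make_divisible(37.5, 8) -> 40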
def conv_bn(inp, oup, stride, activation=nn.ReLU):
return nn.Sequential(nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
nn.BatchNorm2d(oup), activation(inplace=True))
def conv_1x1_bn(inp, oup, activation=nn.ReLU):
return nn.Sequential(nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup), activation(inplace=True))
class Hswish(nn.Module):
def __init__(self, inplace=True):
super(Hswish, self).__init__()
self.inplace = inplace
def forward(self, x):
return x * F.relu6(x + 3., inplace=self.inplace) / 6.
class Hsigmoid(nn.Module):
def __init__(self, inplace=True):
super(Hsigmoid, self).__init__()
self.inplace = inplace
def forward(self, x):
return F.relu6(x + 3., inplace=self.inplace) / 6.
class SEModule(nn.Module):
def __init__(self, channel, reduction=4):
super(SEModule, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(channel, channel // reduction, bias=False),
nn.ReLU(inplace=True),
nn.Linear(channel // reduction, channel, bias=False), Hsigmoid())
def forward(self, x):
b, c, _, _ = x.size()
y = self.avg_pool(x).view(b, c)
y = self.fc(y).view(b, c, 1, 1)
return x * y.expand_as(x)
class Identity(nn.Module):
def __init__(self, channel):
super(Identity, self).__init__()
def forward(self, x):
return x
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, kernel, stride, exp, se=False, nl='RE'):
super(InvertedResidual, self).__init__()
assert stride in [1, 2]
assert kernel in [3, 5]
padding = (kernel - 1) // 2
self.use_res_connect = stride == 1 and inp == oup
if nl == 'RE':
activation = nn.ReLU
elif nl == 'HS':
activation = Hswish
else:
raise NotImplementedError
SELayer = SEModule if se else Identity
layers = []
if inp != exp:
# pw
layers.extend([
nn.Conv2d(inp, exp, 1, 1, 0, bias=False),
nn.BatchNorm2d(exp),
activation(inplace=True),
])
layers.extend([
# dw
nn.Conv2d(exp,
exp,
kernel,
stride,
padding,
groups=exp,
bias=False),
nn.BatchNorm2d(exp),
SELayer(exp),
activation(inplace=True),
# pw-linear
nn.Conv2d(exp, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
])
self.conv = nn.Sequential(*layers)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
class MobileNetV3(nn.Module):
def __init__(self,
num_classes=1000,
scale=1.0,
dropout=0.8,
round_nearest=8,
mode='small',
bn=None):
super(MobileNetV3, self).__init__()
input_channel = 16
last_channel = 1280
# mobile_setting rows: kernel size k, expansion channels exp, output channels c,
# squeeze-and-excitation flag se, nonlinearity nl ('RE'/'HS'), stride s
if mode == 'large':
mobile_setting = [
[3, 16, 16, False, 'RE', 1],
[3, 64, 24, False, 'RE', 2],
[3, 72, 24, False, 'RE', 1],
[5, 72, 40, True, 'RE', 2],
[5, 120, 40, True, 'RE', 1],
[5, 120, 40, True, 'RE', 1],
[3, 240, 80, False, 'HS', 2],
[3, 200, 80, False, 'HS', 1],
[3, 184, 80, False, 'HS', 1],
[3, 184, 80, False, 'HS', 1],
[3, 480, 112, True, 'HS', 1],
[3, 672, 112, True, 'HS', 1],
[5, 672, 160, True, 'HS', 2],
[5, 960, 160, True, 'HS', 1],
[5, 960, 160, True, 'HS', 1],
]
elif mode == 'small':
mobile_setting = [
[3, 16, 16, True, 'RE', 2],
[3, 72, 24, False, 'RE', 2],
[3, 88, 24, False, 'RE', 1],
[5, 96, 40, True, 'HS', 2],
[5, 240, 40, True, 'HS', 1],
[5, 240, 40, True, 'HS', 1],
[5, 120, 48, True, 'HS', 1],
[5, 144, 48, True, 'HS', 1],
[5, 288, 96, True, 'HS', 2],
[5, 576, 96, True, 'HS', 1],
[5, 576, 96, True, 'HS', 1],
]
else:
raise NotImplementedError
# building first layer
last_channel = _make_divisible(
last_channel *
scale, round_nearest) if scale > 1.0 else last_channel
self.features = [conv_bn(3, input_channel, 2, activation=Hswish)]
self.classifier = []
# building mobile blocks
for k, exp, c, se, nl, s in mobile_setting:
output_channel = _make_divisible(c * scale, round_nearest)
exp_channel = _make_divisible(exp * scale, round_nearest)
self.features.append(
InvertedResidual(input_channel, output_channel, k, s,
exp_channel, se, nl))
input_channel = output_channel
# building last several layers
if mode == 'large':
last_conv = _make_divisible(960 * scale, round_nearest)
self.features.append(
conv_1x1_bn(input_channel, last_conv, activation=Hswish))
self.features.append(nn.AdaptiveAvgPool2d(1))
self.features.append(nn.Conv2d(last_conv, last_channel, 1, 1, 0))
self.features.append(Hswish(inplace=True))
elif mode == 'small':
last_conv = _make_divisible(576 * scale, round_nearest)
self.features.append(
conv_1x1_bn(input_channel, last_conv, activation=Hswish))
self.features.append(nn.AdaptiveAvgPool2d(1))
self.features.append(nn.Conv2d(last_conv, last_channel, 1, 1, 0))
self.features.append(Hswish(inplace=True))
else:
raise NotImplementedError
self.features = nn.Sequential(*self.features)
self.classifier = nn.Sequential(
nn.Dropout(p=dropout),
nn.Linear(last_channel, num_classes),
)
self.init_params()
def forward(self, x):
x = self.features(x)
x = x.mean([2, 3])
x = self.classifier(x)
return x
def init_params(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
if m.weight is not None:
nn.init.constant_(m.weight, 1)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.normal_(m.weight, std=0.001)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
def mobile_v3(**kwargs):
model = MobileNetV3(**kwargs)
return model
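# Hedged smoke test, not part of the original file: checks the rounding example
# above and runs both variants on a dummy 224x224 batch (an assumed input size;
# the network pools adaptively, so other sizes would also work).
if __name__ == '__main__':
    import torch
    assert _make_divisible(19, 8) == 24
    for mode in ('small', 'large'):
        net = mobile_v3(mode=mode, num_classes=1000)
        with torch.no_grad():
            out = net(torch.randn(1, 3, 224, 224))
        print(mode, out.shape)  # expected: torch.Size([1, 1000])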
import torch
import torch.nn as nn
__all__ = ['nasnetamobile', 'nasnetalarge']
class MaxPoolPad(nn.Module):
def __init__(self):
super(MaxPoolPad, self).__init__()
self.pad = nn.ZeroPad2d((1, 0, 1, 0))
self.pool = nn.MaxPool2d(3, stride=2, padding=1)
def forward(self, x):
x = self.pad(x)
x = self.pool(x)
x = x[:, :, 1:, 1:]
return x
class AvgPoolPad(nn.Module):
def __init__(self, stride=2, padding=1):
super(AvgPoolPad, self).__init__()
self.pad = nn.ZeroPad2d((1, 0, 1, 0))
self.pool = nn.AvgPool2d(3,
stride=stride,
padding=padding,
count_include_pad=False)
def forward(self, x):
x = self.pad(x)
x = self.pool(x)
x = x[:, :, 1:, 1:]
return x
class SeparableConv2d(nn.Module):
def __init__(self,
in_channels,
out_channels,
dw_kernel,
dw_stride,
dw_padding,
bias=False):
super(SeparableConv2d, self).__init__()
self.depthwise_conv2d = nn.Conv2d(in_channels,
in_channels,
dw_kernel,
stride=dw_stride,
padding=dw_padding,
bias=bias,
groups=in_channels)
self.pointwise_conv2d = nn.Conv2d(in_channels,
out_channels,
1,
stride=1,
bias=bias)
def forward(self, x):
x = self.depthwise_conv2d(x)
x = self.pointwise_conv2d(x)
return x
class BranchSeparables(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False):
super(BranchSeparables, self).__init__()
self.relu = nn.ReLU()
self.separable_1 = SeparableConv2d(in_channels,
in_channels,
kernel_size,
stride,
padding,
bias=bias)
self.bn_sep_1 = nn.BatchNorm2d(in_channels,
eps=0.001,
momentum=0.1,
affine=True)
self.relu1 = nn.ReLU()
self.separable_2 = SeparableConv2d(in_channels,
out_channels,
kernel_size,
1,
padding,
bias=bias)
self.bn_sep_2 = nn.BatchNorm2d(out_channels,
eps=0.001,
momentum=0.1,
affine=True)
def forward(self, x):
x = self.relu(x)
x = self.separable_1(x)
x = self.bn_sep_1(x)
x = self.relu1(x)
x = self.separable_2(x)
x = self.bn_sep_2(x)
return x
class BranchSeparablesStem(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False):
super(BranchSeparablesStem, self).__init__()
self.relu = nn.ReLU()
self.separable_1 = SeparableConv2d(in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=bias)
self.bn_sep_1 = nn.BatchNorm2d(out_channels,
eps=0.001,
momentum=0.1,
affine=True)
self.relu1 = nn.ReLU()
self.separable_2 = SeparableConv2d(out_channels,
out_channels,
kernel_size,
1,
padding,
bias=bias)
self.bn_sep_2 = nn.BatchNorm2d(out_channels,
eps=0.001,
momentum=0.1,
affine=True)
def forward(self, x):
x = self.relu(x)
x = self.separable_1(x)
x = self.bn_sep_1(x)
x = self.relu1(x)
x = self.separable_2(x)
x = self.bn_sep_2(x)
return x
class BranchSeparablesReduction(BranchSeparables):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
z_padding=1,
bias=False):
BranchSeparables.__init__(self, in_channels, out_channels, kernel_size,
stride, padding, bias)
self.padding = nn.ZeroPad2d((z_padding, 0, z_padding, 0))
def forward(self, x):
x = self.relu(x)
x = self.padding(x)
x = self.separable_1(x)
x = x[:, :, 1:, 1:].contiguous()
x = self.bn_sep_1(x)
x = self.relu1(x)
x = self.separable_2(x)
x = self.bn_sep_2(x)
return x
class CellStem0(nn.Module):
def __init__(self, stem_filters, num_filters=42):
super(CellStem0, self).__init__()
self.num_filters = num_filters
self.stem_filters = stem_filters
self.conv_1x1 = nn.Sequential()
self.conv_1x1.add_module('relu', nn.ReLU())
self.conv_1x1.add_module(
'conv',
nn.Conv2d(self.stem_filters,
self.num_filters,
1,
stride=1,
bias=False))
self.conv_1x1.add_module(
'bn',
nn.BatchNorm2d(self.num_filters,
eps=0.001,
momentum=0.1,
affine=True))
self.comb_iter_0_left = BranchSeparables(self.num_filters,
self.num_filters, 5, 2, 2)
self.comb_iter_0_right = BranchSeparablesStem(self.stem_filters,
self.num_filters,
7,
2,
3,
bias=False)
self.comb_iter_1_left = nn.MaxPool2d(3, stride=2, padding=1)
self.comb_iter_1_right = BranchSeparablesStem(self.stem_filters,
self.num_filters,
7,
2,
3,
bias=False)
self.comb_iter_2_left = nn.AvgPool2d(3,
stride=2,
padding=1,
count_include_pad=False)
self.comb_iter_2_right = BranchSeparablesStem(self.stem_filters,
self.num_filters,
5,
2,
2,
bias=False)
self.comb_iter_3_right = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_4_left = BranchSeparables(self.num_filters,
self.num_filters,
3,
1,
1,
bias=False)
self.comb_iter_4_right = nn.MaxPool2d(3, stride=2, padding=1)
def forward(self, x):
x1 = self.conv_1x1(x)
x_comb_iter_0_left = self.comb_iter_0_left(x1)
x_comb_iter_0_right = self.comb_iter_0_right(x)
x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
x_comb_iter_1_left = self.comb_iter_1_left(x1)
x_comb_iter_1_right = self.comb_iter_1_right(x)
x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
x_comb_iter_2_left = self.comb_iter_2_left(x1)
x_comb_iter_2_right = self.comb_iter_2_right(x)
x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right
x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1
x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
x_comb_iter_4_right = self.comb_iter_4_right(x1)
x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
x_out = torch.cat(
[x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
return x_out
class CellStem1(nn.Module):
def __init__(self, stem_filters, num_filters):
super(CellStem1, self).__init__()
self.num_filters = num_filters
self.stem_filters = stem_filters
self.conv_1x1 = nn.Sequential()
self.conv_1x1.add_module('relu', nn.ReLU())
self.conv_1x1.add_module(
'conv',
nn.Conv2d(2 * self.num_filters,
self.num_filters,
1,
stride=1,
bias=False))
self.conv_1x1.add_module(
'bn',
nn.BatchNorm2d(self.num_filters,
eps=0.001,
momentum=0.1,
affine=True))
self.relu = nn.ReLU()
self.path_1 = nn.Sequential()
self.path_1.add_module(
'avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
self.path_1.add_module(
'conv',
nn.Conv2d(self.stem_filters,
self.num_filters // 2,
1,
stride=1,
bias=False))
self.path_2 = nn.ModuleList()
self.path_2.add_module('pad', nn.ZeroPad2d((0, 1, 0, 1)))
self.path_2.add_module(
'avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
self.path_2.add_module(
'conv',
nn.Conv2d(self.stem_filters,
self.num_filters // 2,
1,
stride=1,
bias=False))
self.final_path_bn = nn.BatchNorm2d(self.num_filters,
eps=0.001,
momentum=0.1,
affine=True)
self.comb_iter_0_left = BranchSeparables(self.num_filters,
self.num_filters,
5,
2,
2,
bias=False)
self.comb_iter_0_right = BranchSeparables(self.num_filters,
self.num_filters,
7,
2,
3,
bias=False)
self.comb_iter_1_left = nn.MaxPool2d(3, stride=2, padding=1)
self.comb_iter_1_right = BranchSeparables(self.num_filters,
self.num_filters,
7,
2,
3,
bias=False)
self.comb_iter_2_left = nn.AvgPool2d(3,
stride=2,
padding=1,
count_include_pad=False)
self.comb_iter_2_right = BranchSeparables(self.num_filters,
self.num_filters,
5,
2,
2,
bias=False)
self.comb_iter_3_right = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_4_left = BranchSeparables(self.num_filters,
self.num_filters,
3,
1,
1,
bias=False)
self.comb_iter_4_right = nn.MaxPool2d(3, stride=2, padding=1)
def forward(self, x_conv0, x_stem_0):
x_left = self.conv_1x1(x_stem_0)
x_relu = self.relu(x_conv0)
# path 1
x_path1 = self.path_1(x_relu)
# path 2
x_path2 = self.path_2.pad(x_relu)
x_path2 = x_path2[:, :, 1:, 1:]
x_path2 = self.path_2.avgpool(x_path2)
x_path2 = self.path_2.conv(x_path2)
# final path
x_right = self.final_path_bn(torch.cat([x_path1, x_path2], 1))
x_comb_iter_0_left = self.comb_iter_0_left(x_left)
x_comb_iter_0_right = self.comb_iter_0_right(x_right)
x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
x_comb_iter_1_left = self.comb_iter_1_left(x_left)
x_comb_iter_1_right = self.comb_iter_1_right(x_right)
x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
x_comb_iter_2_left = self.comb_iter_2_left(x_left)
x_comb_iter_2_right = self.comb_iter_2_right(x_right)
x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right
x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1
x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
x_comb_iter_4_right = self.comb_iter_4_right(x_left)
x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
x_out = torch.cat(
[x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
return x_out
class FirstCell(nn.Module):
def __init__(self, in_channels_left, out_channels_left, in_channels_right,
out_channels_right):
super(FirstCell, self).__init__()
self.conv_1x1 = nn.Sequential()
self.conv_1x1.add_module('relu', nn.ReLU())
self.conv_1x1.add_module(
'conv',
nn.Conv2d(in_channels_right,
out_channels_right,
1,
stride=1,
bias=False))
self.conv_1x1.add_module(
'bn',
nn.BatchNorm2d(out_channels_right,
eps=0.001,
momentum=0.1,
affine=True))
self.relu = nn.ReLU()
self.path_1 = nn.Sequential()
self.path_1.add_module(
'avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
self.path_1.add_module(
'conv',
nn.Conv2d(in_channels_left,
out_channels_left,
1,
stride=1,
bias=False))
self.path_2 = nn.ModuleList()
self.path_2.add_module('pad', nn.ZeroPad2d((0, 1, 0, 1)))
self.path_2.add_module(
'avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
self.path_2.add_module(
'conv',
nn.Conv2d(in_channels_left,
out_channels_left,
1,
stride=1,
bias=False))
self.final_path_bn = nn.BatchNorm2d(out_channels_left * 2,
eps=0.001,
momentum=0.1,
affine=True)
self.comb_iter_0_left = BranchSeparables(out_channels_right,
out_channels_right,
5,
1,
2,
bias=False)
self.comb_iter_0_right = BranchSeparables(out_channels_right,
out_channels_right,
3,
1,
1,
bias=False)
self.comb_iter_1_left = BranchSeparables(out_channels_right,
out_channels_right,
5,
1,
2,
bias=False)
self.comb_iter_1_right = BranchSeparables(out_channels_right,
out_channels_right,
3,
1,
1,
bias=False)
self.comb_iter_2_left = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_3_left = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_3_right = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_4_left = BranchSeparables(out_channels_right,
out_channels_right,
3,
1,
1,
bias=False)
def forward(self, x, x_prev):
x_relu = self.relu(x_prev)
# path 1
x_path1 = self.path_1(x_relu)
# path 2
x_path2 = self.path_2.pad(x_relu)
x_path2 = x_path2[:, :, 1:, 1:]
x_path2 = self.path_2.avgpool(x_path2)
x_path2 = self.path_2.conv(x_path2)
# final path
x_left = self.final_path_bn(torch.cat([x_path1, x_path2], 1))
x_right = self.conv_1x1(x)
x_comb_iter_0_left = self.comb_iter_0_left(x_right)
x_comb_iter_0_right = self.comb_iter_0_right(x_left)
x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
x_comb_iter_1_left = self.comb_iter_1_left(x_left)
x_comb_iter_1_right = self.comb_iter_1_right(x_left)
x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
x_comb_iter_2_left = self.comb_iter_2_left(x_right)
x_comb_iter_2 = x_comb_iter_2_left + x_left
x_comb_iter_3_left = self.comb_iter_3_left(x_left)
x_comb_iter_3_right = self.comb_iter_3_right(x_left)
x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right
x_comb_iter_4_left = self.comb_iter_4_left(x_right)
x_comb_iter_4 = x_comb_iter_4_left + x_right
x_out = torch.cat([
x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3,
x_comb_iter_4
], 1)
return x_out
class NormalCell(nn.Module):
def __init__(self, in_channels_left, out_channels_left, in_channels_right,
out_channels_right):
super(NormalCell, self).__init__()
self.conv_prev_1x1 = nn.Sequential()
self.conv_prev_1x1.add_module('relu', nn.ReLU())
self.conv_prev_1x1.add_module(
'conv',
nn.Conv2d(in_channels_left,
out_channels_left,
1,
stride=1,
bias=False))
self.conv_prev_1x1.add_module(
'bn',
nn.BatchNorm2d(out_channels_left,
eps=0.001,
momentum=0.1,
affine=True))
self.conv_1x1 = nn.Sequential()
self.conv_1x1.add_module('relu', nn.ReLU())
self.conv_1x1.add_module(
'conv',
nn.Conv2d(in_channels_right,
out_channels_right,
1,
stride=1,
bias=False))
self.conv_1x1.add_module(
'bn',
nn.BatchNorm2d(out_channels_right,
eps=0.001,
momentum=0.1,
affine=True))
self.comb_iter_0_left = BranchSeparables(out_channels_right,
out_channels_right,
5,
1,
2,
bias=False)
self.comb_iter_0_right = BranchSeparables(out_channels_left,
out_channels_left,
3,
1,
1,
bias=False)
self.comb_iter_1_left = BranchSeparables(out_channels_left,
out_channels_left,
5,
1,
2,
bias=False)
self.comb_iter_1_right = BranchSeparables(out_channels_left,
out_channels_left,
3,
1,
1,
bias=False)
self.comb_iter_2_left = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_3_left = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_3_right = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_4_left = BranchSeparables(out_channels_right,
out_channels_right,
3,
1,
1,
bias=False)
def forward(self, x, x_prev):
x_left = self.conv_prev_1x1(x_prev)
x_right = self.conv_1x1(x)
x_comb_iter_0_left = self.comb_iter_0_left(x_right)
x_comb_iter_0_right = self.comb_iter_0_right(x_left)
x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
x_comb_iter_1_left = self.comb_iter_1_left(x_left)
x_comb_iter_1_right = self.comb_iter_1_right(x_left)
x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
x_comb_iter_2_left = self.comb_iter_2_left(x_right)
x_comb_iter_2 = x_comb_iter_2_left + x_left
x_comb_iter_3_left = self.comb_iter_3_left(x_left)
x_comb_iter_3_right = self.comb_iter_3_right(x_left)
x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right
x_comb_iter_4_left = self.comb_iter_4_left(x_right)
x_comb_iter_4 = x_comb_iter_4_left + x_right
x_out = torch.cat([
x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3,
x_comb_iter_4
], 1)
return x_out
class ReductionCell0(nn.Module):
def __init__(self, in_channels_left, out_channels_left, in_channels_right,
out_channels_right):
super(ReductionCell0, self).__init__()
self.conv_prev_1x1 = nn.Sequential()
self.conv_prev_1x1.add_module('relu', nn.ReLU())
self.conv_prev_1x1.add_module(
'conv',
nn.Conv2d(in_channels_left,
out_channels_left,
1,
stride=1,
bias=False))
self.conv_prev_1x1.add_module(
'bn',
nn.BatchNorm2d(out_channels_left,
eps=0.001,
momentum=0.1,
affine=True))
self.conv_1x1 = nn.Sequential()
self.conv_1x1.add_module('relu', nn.ReLU())
self.conv_1x1.add_module(
'conv',
nn.Conv2d(in_channels_right,
out_channels_right,
1,
stride=1,
bias=False))
self.conv_1x1.add_module(
'bn',
nn.BatchNorm2d(out_channels_right,
eps=0.001,
momentum=0.1,
affine=True))
self.comb_iter_0_left = BranchSeparablesReduction(out_channels_right,
out_channels_right,
5,
2,
2,
bias=False)
self.comb_iter_0_right = BranchSeparablesReduction(out_channels_right,
out_channels_right,
7,
2,
3,
bias=False)
self.comb_iter_1_left = MaxPoolPad()
self.comb_iter_1_right = BranchSeparablesReduction(out_channels_right,
out_channels_right,
7,
2,
3,
bias=False)
self.comb_iter_2_left = AvgPoolPad()
self.comb_iter_2_right = BranchSeparablesReduction(out_channels_right,
out_channels_right,
5,
2,
2,
bias=False)
self.comb_iter_3_right = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_4_left = BranchSeparablesReduction(out_channels_right,
out_channels_right,
3,
1,
1,
bias=False)
self.comb_iter_4_right = MaxPoolPad()
def forward(self, x, x_prev):
x_left = self.conv_prev_1x1(x_prev)
x_right = self.conv_1x1(x)
x_comb_iter_0_left = self.comb_iter_0_left(x_right)
x_comb_iter_0_right = self.comb_iter_0_right(x_left)
x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
x_comb_iter_1_left = self.comb_iter_1_left(x_right)
x_comb_iter_1_right = self.comb_iter_1_right(x_left)
x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
x_comb_iter_2_left = self.comb_iter_2_left(x_right)
x_comb_iter_2_right = self.comb_iter_2_right(x_left)
x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right
x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1
x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
x_comb_iter_4_right = self.comb_iter_4_right(x_right)
x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
x_out = torch.cat(
[x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
return x_out
class ReductionCell1(nn.Module):
def __init__(self, in_channels_left, out_channels_left, in_channels_right,
out_channels_right):
super(ReductionCell1, self).__init__()
self.conv_prev_1x1 = nn.Sequential()
self.conv_prev_1x1.add_module('relu', nn.ReLU())
self.conv_prev_1x1.add_module(
'conv',
nn.Conv2d(in_channels_left,
out_channels_left,
1,
stride=1,
bias=False))
self.conv_prev_1x1.add_module(
'bn',
nn.BatchNorm2d(out_channels_left,
eps=0.001,
momentum=0.1,
affine=True))
self.conv_1x1 = nn.Sequential()
self.conv_1x1.add_module('relu', nn.ReLU())
self.conv_1x1.add_module(
'conv',
nn.Conv2d(in_channels_right,
out_channels_right,
1,
stride=1,
bias=False))
self.conv_1x1.add_module(
'bn',
nn.BatchNorm2d(out_channels_right,
eps=0.001,
momentum=0.1,
affine=True))
self.comb_iter_0_left = BranchSeparables(out_channels_right,
out_channels_right,
5,
2,
2,
bias=False)
self.comb_iter_0_right = BranchSeparables(out_channels_right,
out_channels_right,
7,
2,
3,
bias=False)
self.comb_iter_1_left = nn.MaxPool2d(3, stride=2, padding=1)
self.comb_iter_1_right = BranchSeparables(out_channels_right,
out_channels_right,
7,
2,
3,
bias=False)
self.comb_iter_2_left = nn.AvgPool2d(3,
stride=2,
padding=1,
count_include_pad=False)
self.comb_iter_2_right = BranchSeparables(out_channels_right,
out_channels_right,
5,
2,
2,
bias=False)
self.comb_iter_3_right = nn.AvgPool2d(3,
stride=1,
padding=1,
count_include_pad=False)
self.comb_iter_4_left = BranchSeparables(out_channels_right,
out_channels_right,
3,
1,
1,
bias=False)
self.comb_iter_4_right = nn.MaxPool2d(3, stride=2, padding=1)
def forward(self, x, x_prev):
x_left = self.conv_prev_1x1(x_prev)
x_right = self.conv_1x1(x)
x_comb_iter_0_left = self.comb_iter_0_left(x_right)
x_comb_iter_0_right = self.comb_iter_0_right(x_left)
x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
x_comb_iter_1_left = self.comb_iter_1_left(x_right)
x_comb_iter_1_right = self.comb_iter_1_right(x_left)
x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
x_comb_iter_2_left = self.comb_iter_2_left(x_right)
x_comb_iter_2_right = self.comb_iter_2_right(x_left)
x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right
x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1
x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
x_comb_iter_4_right = self.comb_iter_4_right(x_right)
x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
x_out = torch.cat(
[x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
return x_out
class NASNetAMobile(nn.Module):
"""NASNetAMobile (4 @ 1056) """
def __init__(self,
num_classes=1000,
stem_filters=32,
penultimate_filters=1056,
filters_multiplier=2):
super(NASNetAMobile, self).__init__()
self.num_classes = num_classes
self.stem_filters = stem_filters
self.penultimate_filters = penultimate_filters
self.filters_multiplier = filters_multiplier
filters = self.penultimate_filters // 24
# 24 is the default value for this architecture
self.conv0 = nn.Sequential()
self.conv0.add_module(
'conv',
nn.Conv2d(in_channels=3,
out_channels=self.stem_filters,
kernel_size=3,
padding=0,
stride=2,
bias=False))
self.conv0.add_module(
'bn',
nn.BatchNorm2d(self.stem_filters,
eps=0.001,
momentum=0.1,
affine=True))
self.cell_stem_0 = CellStem0(self.stem_filters,
num_filters=filters //
(filters_multiplier**2))
self.cell_stem_1 = CellStem1(self.stem_filters,
num_filters=filters // filters_multiplier)
self.cell_0 = FirstCell(
in_channels_left=filters,
out_channels_left=filters // 2, # 1, 0.5
in_channels_right=2 * filters,
out_channels_right=filters) # 2, 1
self.cell_1 = NormalCell(
in_channels_left=2 * filters,
out_channels_left=filters, # 2, 1
in_channels_right=6 * filters,
out_channels_right=filters) # 6, 1
self.cell_2 = NormalCell(
in_channels_left=6 * filters,
out_channels_left=filters, # 6, 1
in_channels_right=6 * filters,
out_channels_right=filters) # 6, 1
self.cell_3 = NormalCell(
in_channels_left=6 * filters,
out_channels_left=filters, # 6, 1
in_channels_right=6 * filters,
out_channels_right=filters) # 6, 1
self.reduction_cell_0 = ReductionCell0(
in_channels_left=6 * filters,
out_channels_left=2 * filters, # 6, 2
in_channels_right=6 * filters,
out_channels_right=2 * filters) # 6, 2
self.cell_6 = FirstCell(
in_channels_left=6 * filters,
out_channels_left=filters, # 6, 1
in_channels_right=8 * filters,
out_channels_right=2 * filters) # 8, 2
self.cell_7 = NormalCell(
in_channels_left=8 * filters,
out_channels_left=2 * filters, # 8, 2
in_channels_right=12 * filters,
out_channels_right=2 * filters) # 12, 2
self.cell_8 = NormalCell(
in_channels_left=12 * filters,
out_channels_left=2 * filters, # 12, 2
in_channels_right=12 * filters,
out_channels_right=2 * filters) # 12, 2
self.cell_9 = NormalCell(
in_channels_left=12 * filters,
out_channels_left=2 * filters, # 12, 2
in_channels_right=12 * filters,
out_channels_right=2 * filters) # 12, 2
self.reduction_cell_1 = ReductionCell1(
in_channels_left=12 * filters,
out_channels_left=4 * filters, # 12, 4
in_channels_right=12 * filters,
out_channels_right=4 * filters) # 12, 4
self.cell_12 = FirstCell(
in_channels_left=12 * filters,
out_channels_left=2 * filters, # 12, 2
in_channels_right=16 * filters,
out_channels_right=4 * filters) # 16, 4
self.cell_13 = NormalCell(
in_channels_left=16 * filters,
out_channels_left=4 * filters, # 16, 4
in_channels_right=24 * filters,
out_channels_right=4 * filters) # 24, 4
self.cell_14 = NormalCell(
in_channels_left=24 * filters,
out_channels_left=4 * filters, # 24, 4
in_channels_right=24 * filters,
out_channels_right=4 * filters) # 24, 4
self.cell_15 = NormalCell(
in_channels_left=24 * filters,
out_channels_left=4 * filters, # 24, 4
in_channels_right=24 * filters,
out_channels_right=4 * filters) # 24, 4
self.relu = nn.ReLU()
self.avg_pool = nn.AvgPool2d(7, stride=1, padding=0)
self.dropout = nn.Dropout()
self.last_linear = nn.Linear(24 * filters, self.num_classes)
def features(self, input):
x_conv0 = self.conv0(input)
x_stem_0 = self.cell_stem_0(x_conv0)
x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)
x_cell_0 = self.cell_0(x_stem_1, x_stem_0)
x_cell_1 = self.cell_1(x_cell_0, x_stem_1)
x_cell_2 = self.cell_2(x_cell_1, x_cell_0)
x_cell_3 = self.cell_3(x_cell_2, x_cell_1)
x_reduction_cell_0 = self.reduction_cell_0(x_cell_3, x_cell_2)
x_cell_6 = self.cell_6(x_reduction_cell_0, x_cell_3)
x_cell_7 = self.cell_7(x_cell_6, x_reduction_cell_0)
x_cell_8 = self.cell_8(x_cell_7, x_cell_6)
x_cell_9 = self.cell_9(x_cell_8, x_cell_7)
x_reduction_cell_1 = self.reduction_cell_1(x_cell_9, x_cell_8)
x_cell_12 = self.cell_12(x_reduction_cell_1, x_cell_9)
x_cell_13 = self.cell_13(x_cell_12, x_reduction_cell_1)
x_cell_14 = self.cell_14(x_cell_13, x_cell_12)
x_cell_15 = self.cell_15(x_cell_14, x_cell_13)
return x_cell_15
def logits(self, features):
x = self.relu(features)
x = self.avg_pool(x)
x = x.view(x.size(0), -1)
x = self.dropout(x)
x = self.last_linear(x)
return x
def forward(self, input):
x = self.features(input)
x = self.logits(x)
return x
def nasnetamobile(num_classes=1000):
model = NASNetAMobile(num_classes=num_classes)
return model
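# Hedged smoke test, not part of the original file: NASNet-A Mobile ends with a
# fixed 7x7 average pool, which corresponds to a 224x224 input (assumed here).
if __name__ == '__main__':
    net = nasnetamobile(num_classes=1000)
    with torch.no_grad():
        out = net(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])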
class NASNetALarge(nn.Module):
"""NASNetALarge (6 @ 4032) """
def __init__(self,
num_classes=1001,
stem_filters=96,
penultimate_filters=4032,
filters_multiplier=2):
super(NASNetALarge, self).__init__()
self.num_classes = num_classes
self.stem_filters = stem_filters
self.penultimate_filters = penultimate_filters
self.filters_multiplier = filters_multiplier
filters = self.penultimate_filters // 24
# 24 is the default value for this architecture
self.conv0 = nn.Sequential()
self.conv0.add_module(
'conv',
nn.Conv2d(in_channels=3,
out_channels=self.stem_filters,
kernel_size=3,
padding=0,
stride=2,
bias=False))
self.conv0.add_module(
'bn',
nn.BatchNorm2d(self.stem_filters,
eps=0.001,
momentum=0.1,
affine=True))
self.cell_stem_0 = CellStem0(self.stem_filters,
num_filters=filters //
(filters_multiplier**2))
self.cell_stem_1 = CellStem1(self.stem_filters,
num_filters=filters // filters_multiplier)
self.cell_0 = FirstCell(in_channels_left=filters,
out_channels_left=filters // 2,
in_channels_right=2 * filters,
out_channels_right=filters)
self.cell_1 = NormalCell(in_channels_left=2 * filters,
out_channels_left=filters,
in_channels_right=6 * filters,
out_channels_right=filters)
self.cell_2 = NormalCell(in_channels_left=6 * filters,
out_channels_left=filters,
in_channels_right=6 * filters,
out_channels_right=filters)
self.cell_3 = NormalCell(in_channels_left=6 * filters,
out_channels_left=filters,
in_channels_right=6 * filters,
out_channels_right=filters)
self.cell_4 = NormalCell(in_channels_left=6 * filters,
out_channels_left=filters,
in_channels_right=6 * filters,
out_channels_right=filters)
self.cell_5 = NormalCell(in_channels_left=6 * filters,
out_channels_left=filters,
in_channels_right=6 * filters,
out_channels_right=filters)
self.reduction_cell_0 = ReductionCell0(in_channels_left=6 * filters,
out_channels_left=2 * filters,
in_channels_right=6 * filters,
out_channels_right=2 * filters)
self.cell_6 = FirstCell(in_channels_left=6 * filters,
out_channels_left=filters,
in_channels_right=8 * filters,
out_channels_right=2 * filters)
self.cell_7 = NormalCell(in_channels_left=8 * filters,
out_channels_left=2 * filters,
in_channels_right=12 * filters,
out_channels_right=2 * filters)
self.cell_8 = NormalCell(in_channels_left=12 * filters,
out_channels_left=2 * filters,
in_channels_right=12 * filters,
out_channels_right=2 * filters)
self.cell_9 = NormalCell(in_channels_left=12 * filters,
out_channels_left=2 * filters,
in_channels_right=12 * filters,
out_channels_right=2 * filters)
self.cell_10 = NormalCell(in_channels_left=12 * filters,
out_channels_left=2 * filters,
in_channels_right=12 * filters,
out_channels_right=2 * filters)
self.cell_11 = NormalCell(in_channels_left=12 * filters,
out_channels_left=2 * filters,
in_channels_right=12 * filters,
out_channels_right=2 * filters)
self.reduction_cell_1 = ReductionCell1(in_channels_left=12 * filters,
out_channels_left=4 * filters,
in_channels_right=12 * filters,
out_channels_right=4 * filters)
self.cell_12 = FirstCell(in_channels_left=12 * filters,
out_channels_left=2 * filters,
in_channels_right=16 * filters,
out_channels_right=4 * filters)
self.cell_13 = NormalCell(in_channels_left=16 * filters,
out_channels_left=4 * filters,
in_channels_right=24 * filters,
out_channels_right=4 * filters)
self.cell_14 = NormalCell(in_channels_left=24 * filters,
out_channels_left=4 * filters,
in_channels_right=24 * filters,
out_channels_right=4 * filters)
self.cell_15 = NormalCell(in_channels_left=24 * filters,
out_channels_left=4 * filters,
in_channels_right=24 * filters,
out_channels_right=4 * filters)
self.cell_16 = NormalCell(in_channels_left=24 * filters,
out_channels_left=4 * filters,
in_channels_right=24 * filters,
out_channels_right=4 * filters)
self.cell_17 = NormalCell(in_channels_left=24 * filters,
out_channels_left=4 * filters,
in_channels_right=24 * filters,
out_channels_right=4 * filters)
self.relu = nn.ReLU()
self.avg_pool = nn.AvgPool2d(11, stride=1, padding=0)
self.dropout = nn.Dropout()
self.last_linear = nn.Linear(24 * filters, self.num_classes)
def features(self, input):
x_conv0 = self.conv0(input)
x_stem_0 = self.cell_stem_0(x_conv0)
x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)
x_cell_0 = self.cell_0(x_stem_1, x_stem_0)
x_cell_1 = self.cell_1(x_cell_0, x_stem_1)
x_cell_2 = self.cell_2(x_cell_1, x_cell_0)
x_cell_3 = self.cell_3(x_cell_2, x_cell_1)
x_cell_4 = self.cell_4(x_cell_3, x_cell_2)
x_cell_5 = self.cell_5(x_cell_4, x_cell_3)
x_reduction_cell_0 = self.reduction_cell_0(x_cell_5, x_cell_4)
x_cell_6 = self.cell_6(x_reduction_cell_0, x_cell_4)
x_cell_7 = self.cell_7(x_cell_6, x_reduction_cell_0)
x_cell_8 = self.cell_8(x_cell_7, x_cell_6)
x_cell_9 = self.cell_9(x_cell_8, x_cell_7)
x_cell_10 = self.cell_10(x_cell_9, x_cell_8)
x_cell_11 = self.cell_11(x_cell_10, x_cell_9)
x_reduction_cell_1 = self.reduction_cell_1(x_cell_11, x_cell_10)
x_cell_12 = self.cell_12(x_reduction_cell_1, x_cell_10)
x_cell_13 = self.cell_13(x_cell_12, x_reduction_cell_1)
x_cell_14 = self.cell_14(x_cell_13, x_cell_12)
x_cell_15 = self.cell_15(x_cell_14, x_cell_13)
x_cell_16 = self.cell_16(x_cell_15, x_cell_14)
x_cell_17 = self.cell_17(x_cell_16, x_cell_15)
return x_cell_17
def logits(self, features):
x = self.relu(features)
x = self.avg_pool(x)
x = x.view(x.size(0), -1)
x = self.dropout(x)
x = self.last_linear(x)
return x
def forward(self, input):
x = self.features(input)
x = self.logits(x)
return x
def nasnetalarge(num_classes=1000):
model = NASNetALarge(num_classes=num_classes)
return model
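# Hedged smoke test, not part of the original file: NASNet-A Large ends with a
# fixed 11x11 average pool, which corresponds to a 331x331 input (assumed here;
# the model is large, so this allocates a substantial amount of memory).
if __name__ == '__main__':
    net = nasnetalarge(num_classes=1000)
    with torch.no_grad():
        out = net(torch.randn(1, 3, 331, 331))
    print(out.shape)  # expected: torch.Size([1, 1000])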
import torch.nn as nn
import math
__all__ = [
'preact_resnet18', 'preact_resnet34', 'preact_resnet50',
'preact_resnet101', 'preact_resnet152'
]
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
class PreactBasicBlock(nn.Module):
expansion = 1
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
preactivate=True):
super(PreactBasicBlock, self).__init__()
self.pre_bn = self.pre_relu = None
if preactivate:
self.pre_bn = nn.BatchNorm2d(inplanes)
self.pre_relu = nn.ReLU(inplace=True)
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.downsample = downsample
self.stride = stride
self.preactivate = preactivate
def forward(self, x):
if self.preactivate:
preact = self.pre_bn(x)
preact = self.pre_relu(preact)
else:
preact = x
out = self.conv1(preact)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
if self.downsample is not None:
residual = self.downsample(preact)
else:
residual = x
out += residual
return out
class PreactBottleneck(nn.Module):
expansion = 4
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
preactivate=True):
super(PreactBottleneck, self).__init__()
self.pre_bn = self.pre_relu = None
if preactivate:
self.pre_bn = nn.BatchNorm2d(inplanes)
self.pre_relu = nn.ReLU(inplace=True)
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes,
planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.relu1 = nn.ReLU(inplace=True)
self.relu2 = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.preactivate = preactivate
def forward(self, x):
if self.preactivate:
preact = self.pre_bn(x)
preact = self.pre_relu(preact)
else:
preact = x
out = self.conv1(preact)
out = self.bn1(out)
out = self.relu1(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu2(out)
out = self.conv3(out)
if self.downsample is not None:
residual = self.downsample(preact)
else:
residual = x
out += residual
return out
class PreactResNet(nn.Module):
def __init__(self,
block,
layers,
num_classes=1000,
deep_stem=False,
avg_down=False,
bypass_last_bn=False,
bn=None):
super(PreactResNet, self).__init__()
global bypass_bn_weight_list
bypass_bn_weight_list = []
self.inplanes = 64
self.deep_stem = deep_stem
self.avg_down = avg_down
if self.deep_stem:
self.conv1 = nn.Sequential(
nn.Conv2d(3,
32,
kernel_size=3,
stride=2,
padding=1,
bias=False),
nn.BatchNorm2d(32),
nn.ReLU(inplace=True),
nn.Conv2d(32,
32,
kernel_size=3,
stride=1,
padding=1,
bias=False),
nn.BatchNorm2d(32),
nn.ReLU(inplace=True),
nn.Conv2d(32,
64,
kernel_size=3,
stride=1,
padding=1,
bias=False),
)
else:
self.conv1 = nn.Conv2d(3,
64,
kernel_size=7,
stride=2,
padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.final_bn = nn.BatchNorm2d(512 * block.expansion)
self.final_relu = nn.ReLU(inplace=True)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
n = m.weight.size(1)
m.weight.data.normal_(0, 1.0 / float(n))
m.bias.data.zero_()
if bypass_last_bn:
for param in bypass_bn_weight_list:
param.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, avg_down=False):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
if self.avg_down:
downsample = nn.Sequential(
nn.AvgPool2d(stride,
stride=stride,
ceil_mode=True,
count_include_pad=False),
nn.Conv2d(self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=1,
bias=False),
# BN(planes * block.expansion),
)
else:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False),
# BN(planes * block.expansion),
)
# On the first residual block in the first residual layer we don't pre-activate,
# because we take care of that (+ maxpool) after the initial conv layer
preactivate_first = stride != 1
layers = []
layers.append(
block(self.inplanes, planes, stride, downsample,
preactivate_first))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.final_bn(x)
x = self.final_relu(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def preact_resnet18(**kwargs):
model = PreactResNet(PreactBasicBlock, [2, 2, 2, 2], **kwargs)
return model
def preact_resnet34(**kwargs):
model = PreactResNet(PreactBasicBlock, [3, 4, 6, 3], **kwargs)
return model
def preact_resnet50(**kwargs):
model = PreactResNet(PreactBottleneck, [3, 4, 6, 3], **kwargs)
return model
def preact_resnet101(**kwargs):
model = PreactResNet(PreactBottleneck, [3, 4, 23, 3], **kwargs)
return model
def preact_resnet152(**kwargs):
model = PreactResNet(PreactBottleneck, [3, 8, 36, 3], **kwargs)
return model
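# Hedged smoke test, not part of the original file: the final 7x7 average pool
# assumes the standard 224x224 ImageNet input size.
if __name__ == '__main__':
    import torch
    net = preact_resnet18(num_classes=1000)
    with torch.no_grad():
        out = net(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])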
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.utils import _pair
__all__ = ['ResNest', 'resnest50', 'resnest101', 'resnest200', 'resnest269']
class SplAtConv2d(nn.Module):
"""Split-Attention Conv2d
"""
def __init__(self,
in_channels,
channels,
kernel_size,
stride=(1, 1),
padding=(0, 0),
dilation=(1, 1),
groups=1,
bias=True,
radix=2,
reduction_factor=4,
rectify=False,
rectify_avg=False,
norm_layer=None,
dropblock_prob=0.0,
**kwargs):
super(SplAtConv2d, self).__init__()
padding = _pair(padding)
self.rectify = rectify and (padding[0] > 0 or padding[1] > 0)
self.rectify_avg = rectify_avg
inter_channels = max(in_channels * radix // reduction_factor, 32)
self.radix = radix
self.cardinality = groups
self.channels = channels
self.dropblock_prob = dropblock_prob
if self.rectify:
from rfconv import RFConv2d
self.conv = RFConv2d(in_channels,
channels * radix,
kernel_size,
stride,
padding,
dilation,
groups=groups * radix,
bias=bias,
average_mode=rectify_avg,
**kwargs)
else:
self.conv = nn.Conv2d(in_channels,
channels * radix,
kernel_size,
stride,
padding,
dilation,
groups=groups * radix,
bias=bias,
**kwargs)
self.use_bn = norm_layer is not None
if self.use_bn:
self.bn0 = norm_layer(channels * radix)
self.relu = nn.ReLU(inplace=True)
self.fc1 = nn.Conv2d(channels,
inter_channels,
1,
groups=self.cardinality)
if self.use_bn:
self.bn1 = norm_layer(inter_channels)
self.fc2 = nn.Conv2d(inter_channels,
channels * radix,
1,
groups=self.cardinality)
if dropblock_prob > 0.0:
self.dropblock = DropBlock2D(dropblock_prob, 3)
self.rsoftmax = rSoftMax(radix, groups)
def forward(self, x):
x = self.conv(x)
if self.use_bn:
x = self.bn0(x)
if self.dropblock_prob > 0.0:
x = self.dropblock(x)
x = self.relu(x)
batch, rchannel = x.shape[:2]
if self.radix > 1:
if torch.__version__ < '1.5':
splited = torch.split(x, int(rchannel // self.radix), dim=1)
else:
splited = torch.split(x, rchannel // self.radix, dim=1)
gap = sum(splited)
else:
gap = x
gap = F.adaptive_avg_pool2d(gap, 1)
gap = self.fc1(gap)
if self.use_bn:
gap = self.bn1(gap)
gap = self.relu(gap)
atten = self.fc2(gap)
atten = self.rsoftmax(atten).view(batch, -1, 1, 1)
if self.radix > 1:
if torch.__version__ < '1.5':
attens = torch.split(atten, int(rchannel // self.radix), dim=1)
else:
attens = torch.split(atten, rchannel // self.radix, dim=1)
out = sum([att * split for (att, split) in zip(attens, splited)])
else:
out = atten * x
return out.contiguous()
class rSoftMax(nn.Module):
def __init__(self, radix, cardinality):
super().__init__()
self.radix = radix
self.cardinality = cardinality
def forward(self, x):
batch = x.size(0)
if self.radix > 1:
x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2)
x = F.softmax(x, dim=1)
x = x.reshape(batch, -1)
else:
x = torch.sigmoid(x)
return x
class DropBlock2D(object):
def __init__(self, *args, **kwargs):
raise NotImplementedError
class GlobalAvgPool2d(nn.Module):
def __init__(self):
"""Global average pooling over the input's spatial dimensions"""
super(GlobalAvgPool2d, self).__init__()
def forward(self, inputs):
return nn.functional.adaptive_avg_pool2d(inputs,
1).view(inputs.size(0), -1)
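# Hedged shape check, not part of the original file: a single split-attention
# convolution with radix=2 should keep the spatial size and map in_channels to
# channels; the 64-channel 32x32 input here is an arbitrary assumption.
if __name__ == '__main__':
    conv = SplAtConv2d(64, 64, kernel_size=3, padding=1, radix=2,
                       norm_layer=nn.BatchNorm2d)
    with torch.no_grad():
        out = conv(torch.randn(2, 64, 32, 32))
    print(out.shape)  # expected: torch.Size([2, 64, 32, 32])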
class Bottleneck(nn.Module):
"""ResNet Bottleneck
"""
# pylint: disable=unused-argument
expansion = 4
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
radix=1,
cardinality=1,
bottleneck_width=64,
avd=False,
avd_first=False,
dilation=1,
is_first=False,
rectified_conv=False,
rectify_avg=False,
norm_layer=None,
dropblock_prob=0.0,
last_gamma=False):
super(Bottleneck, self).__init__()
group_width = int(planes * (bottleneck_width / 64.)) * cardinality
self.conv1 = nn.Conv2d(inplanes,
group_width,
kernel_size=1,
bias=False)
self.bn1 = norm_layer(group_width)
self.dropblock_prob = dropblock_prob
self.radix = radix
self.avd = avd and (stride > 1 or is_first)
self.avd_first = avd_first
if self.avd:
self.avd_layer = nn.AvgPool2d(3, stride, padding=1)
stride = 1
if dropblock_prob > 0.0:
self.dropblock1 = DropBlock2D(dropblock_prob, 3)
if radix == 1:
self.dropblock2 = DropBlock2D(dropblock_prob, 3)
self.dropblock3 = DropBlock2D(dropblock_prob, 3)
if radix >= 1:
self.conv2 = SplAtConv2d(group_width,
group_width,
kernel_size=3,
stride=stride,
padding=dilation,
dilation=dilation,
groups=cardinality,
bias=False,
radix=radix,
rectify=rectified_conv,
rectify_avg=rectify_avg,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
elif rectified_conv:
from rfconv import RFConv2d
self.conv2 = RFConv2d(group_width,
group_width,
kernel_size=3,
stride=stride,
padding=dilation,
dilation=dilation,
groups=cardinality,
bias=False,
average_mode=rectify_avg)
self.bn2 = norm_layer(group_width)
else:
self.conv2 = nn.Conv2d(group_width,
group_width,
kernel_size=3,
stride=stride,
padding=dilation,
dilation=dilation,
groups=cardinality,
bias=False)
self.bn2 = norm_layer(group_width)
self.conv3 = nn.Conv2d(group_width,
planes * 4,
kernel_size=1,
bias=False)
self.bn3 = norm_layer(planes * 4)
if last_gamma:
from torch.nn.init import zeros_
zeros_(self.bn3.weight)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.dilation = dilation
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
if self.dropblock_prob > 0.0:
out = self.dropblock1(out)
out = self.relu(out)
if self.avd and self.avd_first:
out = self.avd_layer(out)
out = self.conv2(out)
if self.radix == 0:
out = self.bn2(out)
if self.dropblock_prob > 0.0:
out = self.dropblock2(out)
out = self.relu(out)
if self.avd and not self.avd_first:
out = self.avd_layer(out)
out = self.conv3(out)
out = self.bn3(out)
if self.dropblock_prob > 0.0:
out = self.dropblock3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNest(nn.Module):
def __init__(self,
block,
layers,
radix=1,
groups=1,
bottleneck_width=64,
num_classes=1000,
dilated=False,
dilation=1,
deep_stem=False,
stem_width=64,
avg_down=False,
rectified_conv=False,
rectify_avg=False,
avd=False,
avd_first=False,
final_drop=0.0,
dropblock_prob=0,
last_gamma=False,
norm_layer=nn.BatchNorm2d):
self.cardinality = groups
self.bottleneck_width = bottleneck_width
# ResNet-D params
self.inplanes = stem_width * 2 if deep_stem else 64
self.avg_down = avg_down
self.last_gamma = last_gamma
# ResNeSt params
self.radix = radix
self.avd = avd
self.avd_first = avd_first
super(ResNest, self).__init__()
self.rectified_conv = rectified_conv
self.rectify_avg = rectify_avg
if rectified_conv:
from rfconv import RFConv2d
conv_layer = RFConv2d
else:
conv_layer = nn.Conv2d
conv_kwargs = {'average_mode': rectify_avg} if rectified_conv else {}
if deep_stem:
self.conv1 = nn.Sequential(
conv_layer(3,
stem_width,
kernel_size=3,
stride=2,
padding=1,
bias=False,
**conv_kwargs),
norm_layer(stem_width),
nn.ReLU(inplace=True),
conv_layer(stem_width,
stem_width,
kernel_size=3,
stride=1,
padding=1,
bias=False,
**conv_kwargs),
norm_layer(stem_width),
nn.ReLU(inplace=True),
conv_layer(stem_width,
stem_width * 2,
kernel_size=3,
stride=1,
padding=1,
bias=False,
**conv_kwargs),
)
else:
self.conv1 = conv_layer(3,
64,
kernel_size=7,
stride=2,
padding=3,
bias=False,
**conv_kwargs)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block,
64,
layers[0],
norm_layer=norm_layer,
is_first=False)
self.layer2 = self._make_layer(block,
128,
layers[1],
stride=2,
norm_layer=norm_layer)
if dilated or dilation == 4:
self.layer3 = self._make_layer(block,
256,
layers[2],
stride=1,
dilation=2,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.layer4 = self._make_layer(block,
512,
layers[3],
stride=1,
dilation=4,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
elif dilation == 2:
self.layer3 = self._make_layer(block,
256,
layers[2],
stride=2,
dilation=1,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.layer4 = self._make_layer(block,
512,
layers[3],
stride=1,
dilation=2,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
else:
self.layer3 = self._make_layer(block,
256,
layers[2],
stride=2,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.layer4 = self._make_layer(block,
512,
layers[3],
stride=2,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.avgpool = GlobalAvgPool2d()
self.drop = nn.Dropout(final_drop) if final_drop > 0.0 else None
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, norm_layer):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self,
block,
planes,
blocks,
stride=1,
dilation=1,
norm_layer=None,
dropblock_prob=0.0,
is_first=True):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
down_layers = []
if self.avg_down:
if dilation == 1:
down_layers.append(
nn.AvgPool2d(kernel_size=stride,
stride=stride,
ceil_mode=True,
count_include_pad=False))
else:
down_layers.append(
nn.AvgPool2d(kernel_size=1,
stride=1,
ceil_mode=True,
count_include_pad=False))
down_layers.append(
nn.Conv2d(self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=1,
bias=False))
else:
down_layers.append(
nn.Conv2d(self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False))
down_layers.append(norm_layer(planes * block.expansion))
downsample = nn.Sequential(*down_layers)
layers = []
if dilation == 1 or dilation == 2:
layers.append(
block(self.inplanes,
planes,
stride,
downsample=downsample,
radix=self.radix,
cardinality=self.cardinality,
bottleneck_width=self.bottleneck_width,
avd=self.avd,
avd_first=self.avd_first,
dilation=1,
is_first=is_first,
rectified_conv=self.rectified_conv,
rectify_avg=self.rectify_avg,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob,
last_gamma=self.last_gamma))
elif dilation == 4:
layers.append(
block(self.inplanes,
planes,
stride,
downsample=downsample,
radix=self.radix,
cardinality=self.cardinality,
bottleneck_width=self.bottleneck_width,
avd=self.avd,
avd_first=self.avd_first,
dilation=2,
is_first=is_first,
rectified_conv=self.rectified_conv,
rectify_avg=self.rectify_avg,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob,
last_gamma=self.last_gamma))
else:
raise RuntimeError("=> unknown dilation size: {}".format(dilation))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(
block(self.inplanes,
planes,
radix=self.radix,
cardinality=self.cardinality,
bottleneck_width=self.bottleneck_width,
avd=self.avd,
avd_first=self.avd_first,
dilation=dilation,
rectified_conv=self.rectified_conv,
rectify_avg=self.rectify_avg,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob,
last_gamma=self.last_gamma))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
# x = x.view(x.size(0), -1)
x = torch.flatten(x, 1)
if self.drop:
x = self.drop(x)
x = self.fc(x)
return x
def resnest50(pretrained=False, **kwargs):
model = ResNest(Bottleneck, [3, 4, 6, 3],
radix=2,
groups=1,
bottleneck_width=64,
deep_stem=True,
stem_width=32,
avg_down=True,
avd=True,
avd_first=False,
**kwargs)
return model
def resnest101(pretrained=False, **kwargs):
model = ResNest(Bottleneck, [3, 4, 23, 3],
radix=2,
groups=1,
bottleneck_width=64,
deep_stem=True,
stem_width=64,
avg_down=True,
avd=True,
avd_first=False,
**kwargs)
return model
def resnest200(pretrained=False, **kwargs):
model = ResNest(Bottleneck, [3, 24, 36, 3],
radix=2,
groups=1,
bottleneck_width=64,
deep_stem=True,
stem_width=64,
avg_down=True,
avd=True,
avd_first=False,
**kwargs)
return model
def resnest269(pretrained=False, **kwargs):
model = ResNest(Bottleneck, [3, 30, 48, 8],
radix=2,
groups=1,
bottleneck_width=64,
deep_stem=True,
stem_width=64,
avg_down=True,
avd=True,
avd_first=False,
**kwargs)
return model
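# Usage sketch: build ResNeSt-50 and check the classifier output on a dummy
# ImageNet-sized batch (assumes SplAtConv2d, GlobalAvgPool2d and the
# split-attention Bottleneck defined earlier in this file are in scope).
if __name__ == '__main__':
    net = resnest50(num_classes=1000)
    out = net(torch.randn(2, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([2, 1000])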
import torch.nn as nn
import math
__all__ = [
'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet50c', 'resnet50d',
'resnet101', 'resnet101d', 'resnet152', 'resnet152d'
]
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes,
planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
bypass_bn_weight_list.append(self.bn3.weight)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self,
block,
layers,
num_classes=1000,
deep_stem=False,
avg_down=False,
bypass_last_bn=False):
        # bn3 weights of Bottleneck blocks are collected in this global list so
        # they can optionally be zero-initialized below when bypass_last_bn is set
        global bypass_bn_weight_list
        bypass_bn_weight_list = []
self.inplanes = 64
super(ResNet, self).__init__()
self.deep_stem = deep_stem
self.avg_down = avg_down
if self.deep_stem:
self.conv1 = nn.Sequential(
nn.Conv2d(3,
32,
kernel_size=3,
stride=2,
padding=1,
bias=False),
nn.BatchNorm2d(32),
nn.ReLU(inplace=True),
nn.Conv2d(32,
32,
kernel_size=3,
stride=1,
padding=1,
bias=False),
nn.BatchNorm2d(32),
nn.ReLU(inplace=True),
nn.Conv2d(32,
64,
kernel_size=3,
stride=1,
padding=1,
bias=False),
)
else:
self.conv1 = nn.Conv2d(3,
64,
kernel_size=7,
stride=2,
padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
if bypass_last_bn:
for param in bypass_bn_weight_list:
param.data.zero_()
print('bypass {} bn.weight in BottleneckBlocks'.format(
len(bypass_bn_weight_list)))
def _make_layer(self, block, planes, blocks, stride=1, avg_down=False):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
if self.avg_down:
downsample = nn.Sequential(
nn.AvgPool2d(stride, stride=stride, ceil_mode=True),
nn.Conv2d(self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=1,
bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
else:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet18(pretrained=False, **kwargs):
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
return model
def resnet34(pretrained=False, **kwargs):
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
return model
def resnet50(pretrained=False, **kwargs):
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
return model
def resnet50c(pretrained=False, **kwargs):
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs, deep_stem=True)
return model
def resnet50d(pretrained=False, **kwargs):
model = ResNet(Bottleneck, [3, 4, 6, 3],
**kwargs,
deep_stem=True,
avg_down=True)
return model
def resnet101(pretrained=False, **kwargs):
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
return model
def resnet101d(pretrained=False, **kwargs):
model = ResNet(Bottleneck, [3, 4, 23, 3],
**kwargs,
deep_stem=True,
avg_down=True)
return model
def resnet152(pretrained=False, **kwargs):
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
return model
def resnet152d(pretrained=False, **kwargs):
model = ResNet(Bottleneck, [3, 8, 36, 3],
**kwargs,
deep_stem=True,
avg_down=True)
return model
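# Usage sketch: the 'c'/'d' suffixes only toggle the stem and shortcut style —
# resnet50c uses the three-conv deep stem, resnet50d additionally places an
# AvgPool2d before the 1x1 projection in the shortcut (avg_down=True).
if __name__ == '__main__':
    for builder in (resnet50, resnet50c, resnet50d):
        net = builder(num_classes=1000)
        print(builder.__name__, sum(p.numel() for p in net.parameters()))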
import torch
import torch.nn as nn
__all__ = [
'resnext50_32x4d', 'resnext101_32x8d', 'resnext50d_32x4d',
    'resnext101d_32x8d', 'wide_resnet50_2', 'wide_resnet101_2',
'wide_resnet50d_2', 'wide_resnet101d_2'
]
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
bias=False,
dilation=dilation)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes,
out_planes,
kernel_size=1,
stride=stride,
bias=False)
class BasicBlock(nn.Module):
expansion = 1
__constants__ = ['downsample']
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None):
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError(
'BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError(
"Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
__constants__ = ['downsample']
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None):
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self,
block,
layers,
num_classes=1000,
zero_init_residual=False,
groups=1,
width_per_group=64,
replace_stride_with_dilation=None,
norm_layer=None,
deep_stem=False,
avg_down=False,
bn=None):
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
self.inplanes = 64
self.dilation = 1
self.deep_stem = deep_stem
self.avg_down = avg_down
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError("replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(
replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
if self.deep_stem:
self.conv1 = nn.Sequential(
nn.Conv2d(3,
32,
kernel_size=3,
stride=2,
padding=1,
bias=False),
norm_layer(32),
nn.ReLU(inplace=True),
nn.Conv2d(32,
32,
kernel_size=3,
stride=1,
padding=1,
bias=False),
norm_layer(32),
nn.ReLU(inplace=True),
nn.Conv2d(32,
64,
kernel_size=3,
stride=1,
padding=1,
bias=False),
)
else:
self.conv1 = nn.Conv2d(3,
64,
kernel_size=7,
stride=2,
padding=3,
bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block,
128,
layers[1],
stride=2,
dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(block,
256,
layers[2],
stride=2,
dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(block,
512,
layers[3],
stride=2,
dilate=replace_stride_with_dilation[2])
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight,
mode='fan_out',
nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0)
elif isinstance(m, BasicBlock):
nn.init.constant_(m.bn2.weight, 0)
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
if self.avg_down:
downsample = nn.Sequential(
nn.AvgPool2d(stride,
stride=stride,
ceil_mode=True,
count_include_pad=False),
conv1x1(self.inplanes, planes * block.expansion),
norm_layer(planes * block.expansion),
)
else:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(
block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(
block(self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def _forward_impl(self, x):
# See note [TorchScript super()]
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.fc(x)
return x
def forward(self, x):
return self._forward_impl(x)
def resnext50_32x4d(**kwargs):
kwargs['groups'] = 32
kwargs['width_per_group'] = 4
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
return model
def resnext101_32x8d(**kwargs):
kwargs['groups'] = 32
kwargs['width_per_group'] = 8
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
return model
def resnext50d_32x4d(**kwargs):
kwargs['groups'] = 32
kwargs['width_per_group'] = 4
model = ResNet(Bottleneck, [3, 4, 6, 3],
**kwargs,
deep_stem=True,
avg_down=True)
return model
def resnext101d_32x8d(**kwargs):
kwargs['groups'] = 32
kwargs['width_per_group'] = 8
model = ResNet(Bottleneck, [3, 4, 23, 3],
**kwargs,
deep_stem=True,
avg_down=True)
return model
def wide_resnet50_2(**kwargs):
kwargs['width_per_group'] = 64 * 2
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
return model
def wide_resnet101_2(**kwargs):
kwargs['width_per_group'] = 64 * 2
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
return model
def wide_resnet50d_2(**kwargs):
kwargs['width_per_group'] = 64 * 2
model = ResNet(Bottleneck, [3, 4, 6, 3],
**kwargs,
deep_stem=True,
avg_down=True)
return model
def wide_resnet101d_2(**kwargs):
kwargs['width_per_group'] = 64 * 2
model = ResNet(Bottleneck, [3, 4, 23, 3],
**kwargs,
deep_stem=True,
avg_down=True)
return model
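# Worked example of the bottleneck width rule in Bottleneck.__init__
# (width = int(planes * base_width / 64) * groups):
#   resnext50_32x4d at planes=64  -> int(64 * 4 / 64) * 32 = 128 grouped channels
#   wide_resnet50_2 at planes=64  -> int(64 * 128 / 64) * 1 = 128 plain channels
if __name__ == '__main__':
    net = resnext50_32x4d(num_classes=1000)
    print(net(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])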
import torch.nn as nn
import math
__all__ = [
'ResNetV2', 'resnet18_v2', 'resnet34_v2', 'resnet50_v2', 'resnet50c_v2',
'resnet50d_v2', 'resnet101_v2', 'resnet152_v2', 'resnet200_v2'
]
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.bn1 = nn.BatchNorm2d(inplanes)
self.relu = nn.ReLU(inplace=True)
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn2 = nn.BatchNorm2d(planes)
self.conv2 = conv3x3(planes, planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.bn1(x)
out = self.relu(out)
out = self.conv1(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.bn1 = nn.BatchNorm2d(inplanes)
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes,
planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
self.conv3 = nn.Conv2d(planes,
planes * self.expansion,
kernel_size=1,
bias=False)
self.bn3 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
bypass_bn_weight_list.append(self.bn3.weight)
def forward(self, x):
residual = x
out = self.bn1(x)
out = self.relu(out)
out = self.conv1(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn3(out)
out = self.relu(out)
out = self.conv3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
return out
class ResNetV2(nn.Module):
def __init__(self,
block,
layers,
num_classes=1000,
deep_stem=False,
avg_down=False,
bypass_last_bn=False):
global bypass_bn_weight_list
bypass_bn_weight_list = []
self.inplanes = 64
super(ResNetV2, self).__init__()
self.deep_stem = deep_stem
self.avg_down = avg_down
if self.deep_stem:
self.conv1 = nn.Sequential(
nn.Conv2d(3,
32,
kernel_size=3,
stride=2,
padding=1,
bias=False),
nn.BatchNorm2d(32),
nn.ReLU(inplace=True),
nn.Conv2d(32,
32,
kernel_size=3,
stride=1,
padding=1,
bias=False),
nn.BatchNorm2d(32),
nn.ReLU(inplace=True),
nn.Conv2d(32,
64,
kernel_size=3,
stride=1,
padding=1,
bias=False),
)
else:
self.conv1 = nn.Conv2d(3,
64,
kernel_size=7,
stride=2,
padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.bn2 = nn.BatchNorm2d(512 * block.expansion)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
if bypass_last_bn:
for param in bypass_bn_weight_list:
param.data.zero_()
print('bypass {} bn.weight in BottleneckBlocks'.format(
len(bypass_bn_weight_list)))
def _make_layer(self, block, planes, blocks, stride=1, avg_down=False):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
if self.avg_down:
downsample = nn.Sequential(
nn.AvgPool2d(stride, stride=stride, ceil_mode=True),
nn.Conv2d(self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=1,
bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
else:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.bn2(x)
x = self.relu(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet18_v2(pretrained=False, **kwargs):
model = ResNetV2(BasicBlock, [2, 2, 2, 2], **kwargs)
return model
def resnet34_v2(pretrained=False, **kwargs):
model = ResNetV2(BasicBlock, [3, 4, 6, 3], **kwargs)
return model
def resnet50_v2(pretrained=False, **kwargs):
model = ResNetV2(Bottleneck, [3, 4, 6, 3], **kwargs)
return model
def resnet50c_v2(pretrained=False, **kwargs):
model = ResNetV2(Bottleneck, [3, 4, 6, 3], **kwargs, deep_stem=True)
return model
def resnet50d_v2(pretrained=False, **kwargs):
model = ResNetV2(Bottleneck, [3, 4, 6, 3],
**kwargs,
deep_stem=True,
avg_down=True)
return model
def resnet101_v2(pretrained=False, **kwargs):
model = ResNetV2(Bottleneck, [3, 4, 23, 3], **kwargs)
return model
def resnet152_v2(pretrained=False, **kwargs):
model = ResNetV2(Bottleneck, [3, 8, 36, 3], **kwargs)
return model
def resnet200_v2(pretrained=False, **kwargs):
model = ResNetV2(Bottleneck, [3, 24, 36, 3], **kwargs)
return model
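# Usage sketch: the *_v2 builders reuse the ResNet layer layouts but with
# pre-activation blocks (BN -> ReLU -> conv inside each block) and a final
# bn2/relu before pooling in ResNetV2.forward.
if __name__ == '__main__':
    net = resnet50_v2(num_classes=1000)
    print(sum(p.numel() for p in net.parameters()))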
from collections import OrderedDict
import math
import torch.nn as nn
__all__ = [
'SENet', 'senet154', 'se_resnet50', 'se_resnet101', 'se_resnet152',
'se_resnext50_32x4d', 'se_resnext101_32x4d', 'se_resnext101_64x4d'
]
class SEModule(nn.Module):
def __init__(self, channels, reduction):
super(SEModule, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc1 = nn.Conv2d(channels,
channels // reduction,
kernel_size=1,
padding=0)
self.relu = nn.ReLU(inplace=True)
self.fc2 = nn.Conv2d(channels // reduction,
channels,
kernel_size=1,
padding=0)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
module_input = x
x = self.avg_pool(x)
x = self.fc1(x)
x = self.relu(x)
x = self.fc2(x)
x = self.sigmoid(x)
return module_input * x
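# Worked example of the squeeze-and-excitation gating above: with channels=64 and
# reduction=16, fc1 maps 64 -> 4 and fc2 maps 4 -> 64, so a (N, 64, H, W) input is
# re-weighted per channel by a sigmoid gate of shape (N, 64, 1, 1) that broadcasts
# over H and W.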
class Bottleneck(nn.Module):
    """Base bottleneck for SENet blocks; subclasses define the conv/bn/SE layers."""
    def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out = self.se_module(out) + residual
out = self.relu(out)
return out
class SEBottleneck(Bottleneck):
expansion = 4
def __init__(self,
inplanes,
planes,
groups,
reduction,
stride=1,
downsample=None):
super(SEBottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes * 2, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes * 2)
self.conv2 = nn.Conv2d(planes * 2,
planes * 4,
kernel_size=3,
stride=stride,
padding=1,
groups=groups,
bias=False)
self.bn2 = nn.BatchNorm2d(planes * 4)
self.conv3 = nn.Conv2d(planes * 4,
planes * 4,
kernel_size=1,
bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.se_module = SEModule(planes * 4, reduction=reduction)
self.downsample = downsample
self.stride = stride
class SEResNetBottleneck(Bottleneck):
expansion = 4
def __init__(self,
inplanes,
planes,
groups,
reduction,
stride=1,
downsample=None):
super(SEResNetBottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes,
planes,
kernel_size=1,
bias=False,
stride=stride)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes,
planes,
kernel_size=3,
padding=1,
groups=groups,
bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.se_module = SEModule(planes * 4, reduction=reduction)
self.downsample = downsample
self.stride = stride
class SEResNeXtBottleneck(Bottleneck):
expansion = 4
def __init__(self,
inplanes,
planes,
groups,
reduction,
stride=1,
downsample=None,
base_width=4):
super(SEResNeXtBottleneck, self).__init__()
width = math.floor(planes * (base_width / 64)) * groups
self.conv1 = nn.Conv2d(inplanes,
width,
kernel_size=1,
bias=False,
stride=1)
self.bn1 = nn.BatchNorm2d(width)
self.conv2 = nn.Conv2d(width,
width,
kernel_size=3,
stride=stride,
padding=1,
groups=groups,
bias=False)
self.bn2 = nn.BatchNorm2d(width)
self.conv3 = nn.Conv2d(width, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.se_module = SEModule(planes * 4, reduction=reduction)
self.downsample = downsample
self.stride = stride
class SENet(nn.Module):
def __init__(self,
block,
layers,
groups,
reduction,
dropout_p=0.2,
inplanes=128,
input_3x3=True,
downsample_kernel_size=3,
downsample_padding=1,
num_classes=1000):
super(SENet, self).__init__()
self.inplanes = inplanes
if input_3x3:
layer0_modules = [
('conv1', nn.Conv2d(3, 64, 3, stride=2, padding=1,
bias=False)),
('bn1', nn.BatchNorm2d(64)),
('relu1', nn.ReLU(inplace=True)),
('conv2', nn.Conv2d(64, 64, 3, stride=1, padding=1,
bias=False)),
('bn2', nn.BatchNorm2d(64)),
('relu2', nn.ReLU(inplace=True)),
('conv3',
nn.Conv2d(64, inplanes, 3, stride=1, padding=1, bias=False)),
('bn3', nn.BatchNorm2d(inplanes)),
('relu3', nn.ReLU(inplace=True)),
]
else:
layer0_modules = [
('conv1',
nn.Conv2d(3,
inplanes,
kernel_size=7,
stride=2,
padding=3,
bias=False)),
('bn1', nn.BatchNorm2d(inplanes)),
('relu1', nn.ReLU(inplace=True)),
]
layer0_modules.append(('pool', nn.MaxPool2d(3,
stride=2,
ceil_mode=True)))
self.layer0 = nn.Sequential(OrderedDict(layer0_modules))
self.layer1 = self._make_layer(block,
planes=64,
blocks=layers[0],
groups=groups,
reduction=reduction,
downsample_kernel_size=1,
downsample_padding=0)
self.layer2 = self._make_layer(
block,
planes=128,
blocks=layers[1],
stride=2,
groups=groups,
reduction=reduction,
downsample_kernel_size=downsample_kernel_size,
downsample_padding=downsample_padding)
self.layer3 = self._make_layer(
block,
planes=256,
blocks=layers[2],
stride=2,
groups=groups,
reduction=reduction,
downsample_kernel_size=downsample_kernel_size,
downsample_padding=downsample_padding)
self.layer4 = self._make_layer(
block,
planes=512,
blocks=layers[3],
stride=2,
groups=groups,
reduction=reduction,
downsample_kernel_size=downsample_kernel_size,
downsample_padding=downsample_padding)
self.avg_pool = nn.AvgPool2d(7, stride=1)
self.dropout = nn.Dropout(dropout_p) if dropout_p is not None else None
self.last_linear = nn.Linear(512 * block.expansion, num_classes)
def _make_layer(self,
block,
planes,
blocks,
groups,
reduction,
stride=1,
downsample_kernel_size=1,
downsample_padding=0):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes,
planes * block.expansion,
kernel_size=downsample_kernel_size,
stride=stride,
padding=downsample_padding,
bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(
block(self.inplanes, planes, groups, reduction, stride,
downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, groups, reduction))
return nn.Sequential(*layers)
def features(self, x):
x = self.layer0(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
return x
def logits(self, x):
x = self.avg_pool(x)
if self.dropout is not None:
x = self.dropout(x)
x = x.view(x.size(0), -1)
x = self.last_linear(x)
return x
def forward(self, x):
x = self.features(x)
x = self.logits(x)
return x
def senet154(**kwargs):
model = SENet(SEBottleneck, [3, 8, 36, 3],
groups=64,
reduction=16,
dropout_p=0.2,
**kwargs)
return model
def se_resnet50(**kwargs):
model = SENet(SEResNetBottleneck, [3, 4, 6, 3],
groups=1,
reduction=16,
dropout_p=None,
inplanes=64,
input_3x3=False,
downsample_kernel_size=1,
downsample_padding=0,
**kwargs)
return model
def se_resnet101(**kwargs):
model = SENet(SEResNetBottleneck, [3, 4, 23, 3],
groups=1,
reduction=16,
dropout_p=None,
inplanes=64,
input_3x3=False,
downsample_kernel_size=1,
downsample_padding=0,
**kwargs)
return model
def se_resnet152(**kwargs):
model = SENet(SEResNetBottleneck, [3, 8, 36, 3],
groups=1,
reduction=16,
dropout_p=None,
inplanes=64,
input_3x3=False,
downsample_kernel_size=1,
downsample_padding=0,
**kwargs)
return model
def se_resnext50_32x4d(**kwargs):
model = SENet(SEResNeXtBottleneck, [3, 4, 6, 3],
groups=32,
reduction=16,
dropout_p=None,
inplanes=64,
input_3x3=False,
downsample_kernel_size=1,
downsample_padding=0,
**kwargs)
return model
def se_resnext101_32x4d(**kwargs):
model = SENet(SEResNeXtBottleneck, [3, 4, 23, 3],
groups=32,
reduction=16,
dropout_p=None,
inplanes=64,
input_3x3=False,
downsample_kernel_size=1,
downsample_padding=0,
**kwargs)
return model
def se_resnext101_64x4d(**kwargs):
model = SENet(SEResNeXtBottleneck, [3, 4, 23, 3],
groups=64,
reduction=16,
dropout_p=None,
inplanes=64,
input_3x3=False,
downsample_kernel_size=1,
downsample_padding=0,
**kwargs)
return model
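# Usage sketch: senet154 keeps the original SENet stem (input_3x3=True,
# inplanes=128, downsample_kernel_size=3 by default), while the se_resnet* and
# se_resnext* builders switch to a ResNet-style 7x7 stem with inplanes=64 and
# 1x1 downsampling.
if __name__ == '__main__':
    net = se_resnext50_32x4d(num_classes=1000)
    print(sum(p.numel() for p in net.parameters()))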
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch.nn import init
__all__ = ["shuffle_v1"]
def conv3x3(in_channels,
out_channels,
stride=1,
padding=1,
bias=True,
groups=1):
"""3x3 convolution with padding
"""
return nn.Conv2d(in_channels,
out_channels,
kernel_size=3,
stride=stride,
padding=padding,
bias=bias,
groups=groups)
def conv1x1(in_channels, out_channels, groups=1):
"""1x1 convolution with padding
- Normal pointwise convolution When groups == 1
- Grouped pointwise convolution when groups > 1
"""
return nn.Conv2d(in_channels,
out_channels,
kernel_size=1,
groups=groups,
stride=1)
def channel_shuffle(x, groups):
batchsize, num_channels, height, width = x.data.size()
channels_per_group = num_channels // groups
# reshape
x = x.view(batchsize, groups, channels_per_group, height, width)
# transpose
# - contiguous() required if transpose() is used before view().
# See https://github.com/pytorch/pytorch/issues/764
x = torch.transpose(x, 1, 2).contiguous()
# flatten
x = x.view(batchsize, -1, height, width)
return x
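# Worked example of channel_shuffle: with num_channels=6 and groups=3 the
# view/transpose/flatten above reorders channels [0, 1, 2, 3, 4, 5] into
# [0, 2, 4, 1, 3, 5], i.e. one channel is drawn from each group in turn so the
# next grouped convolution mixes information across all groups.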
class ShuffleUnit(nn.Module):
def __init__(self,
in_channels,
out_channels,
groups=3,
grouped_conv=True,
combine='add'):
super(ShuffleUnit, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.grouped_conv = grouped_conv
self.combine = combine
self.groups = groups
self.bottleneck_channels = self.out_channels // 4
# define the type of ShuffleUnit
if self.combine == 'add':
# ShuffleUnit Figure 2b
self.depthwise_stride = 1
self._combine_func = self._add
elif self.combine == 'concat':
# ShuffleUnit Figure 2c
self.depthwise_stride = 2
self._combine_func = self._concat
# ensure output of concat has the same channels as
# original output channels.
self.out_channels -= self.in_channels
else:
raise ValueError("Cannot combine tensors with \"{}\""
"Only \"add\" and \"concat\" are"
"supported".format(self.combine))
# Use a 1x1 grouped or non-grouped convolution to reduce input channels
# to bottleneck channels, as in a ResNet bottleneck module.
# NOTE: Do not use group convolution for the first conv1x1 in Stage 2.
self.first_1x1_groups = self.groups if grouped_conv else 1
self.g_conv_1x1_compress = self._make_grouped_conv1x1(
self.in_channels,
self.bottleneck_channels,
self.first_1x1_groups,
batch_norm=True,
relu=True)
# 3x3 depthwise convolution followed by batch normalization
self.depthwise_conv3x3 = conv3x3(self.bottleneck_channels,
self.bottleneck_channels,
stride=self.depthwise_stride,
groups=self.bottleneck_channels)
self.bn_after_depthwise = BN(self.bottleneck_channels)
# Use 1x1 grouped convolution to expand from
# bottleneck_channels to out_channels
self.g_conv_1x1_expand = self._make_grouped_conv1x1(
self.bottleneck_channels,
self.out_channels,
self.groups,
batch_norm=True,
relu=False)
@staticmethod
def _add(x, out):
# residual connection
return x + out
@staticmethod
def _concat(x, out):
# concatenate along channel axis
return torch.cat((x, out), 1)
def _make_grouped_conv1x1(self,
in_channels,
out_channels,
groups,
batch_norm=True,
relu=False):
modules = OrderedDict()
conv = conv1x1(in_channels, out_channels, groups=groups)
modules['conv1x1'] = conv
if batch_norm:
modules['batch_norm'] = BN(out_channels)
if relu:
modules['relu'] = nn.ReLU()
if len(modules) > 1:
return nn.Sequential(modules)
else:
return conv
def forward(self, x):
# save for combining later with output
residual = x
if self.combine == 'concat':
residual = F.avg_pool2d(residual,
kernel_size=3,
stride=2,
padding=1)
out = self.g_conv_1x1_compress(x)
out = channel_shuffle(out, self.groups)
out = self.depthwise_conv3x3(out)
out = self.bn_after_depthwise(out)
out = self.g_conv_1x1_expand(out)
out = self._combine_func(residual, out)
return F.relu(out)
class ShuffleNetV1(nn.Module):
"""ShuffleNet implementation.
"""
def __init__(self,
groups=3,
in_channels=3,
num_classes=1000,
width_mult=1):
"""ShuffleNet constructor.
Arguments:
groups (int, optional): number of groups to be used in grouped
1x1 convolutions in each ShuffleUnit. Default is 3 for best
performance according to original paper.
in_channels (int, optional): number of channels in the input tensor.
Default is 3 for RGB image inputs.
            num_classes (int, optional): number of classes to predict. Default
                is 1000 for ImageNet.
            width_mult (int, optional): multiplier applied to the per-stage
                output channel counts. Default is 1.
        """
super(ShuffleNetV1, self).__init__()
self.groups = groups
self.stage_repeats = [3, 7, 3]
self.in_channels = in_channels
self.num_classes = num_classes
global BN
BN = nn.BatchNorm2d
# index 0 is invalid and should never be called.
# only used for indexing convenience.
if groups == 1:
self.stage_out_channels = list(
map(lambda a: a * width_mult
if a != -1 else a, [-1, 24, 144, 288, 576]))
elif groups == 2:
self.stage_out_channels = list(
map(lambda a: a * width_mult
if a != -1 else a, [-1, 24, 200, 400, 800]))
elif groups == 3:
self.stage_out_channels = list(
map(lambda a: a * width_mult
if a != -1 else a, [-1, 24, 240, 480, 960]))
elif groups == 4:
self.stage_out_channels = list(
map(lambda a: a * width_mult
if a != -1 else a, [-1, 24, 272, 544, 1088]))
elif groups == 8:
self.stage_out_channels = list(
map(lambda a: a * width_mult
if a != -1 else a, [-1, 24, 384, 768, 1536]))
else:
raise ValueError("""{} groups is not supported for
1x1 Grouped Convolutions""".format(groups))
# Stage 1 always has 24 output channels
self.conv1 = conv3x3(
self.in_channels,
self.stage_out_channels[1], # stage 1
stride=2)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
# Stage 2
self.stage2 = self._make_stage(2)
# Stage 3
self.stage3 = self._make_stage(3)
# Stage 4
self.stage4 = self._make_stage(4)
        # Global pooling:
        # not defined as a module here; forward() uses the functional API so the
        # pooling kernel size adapts to inputs other than ImageNet's 224x224
        # Fully-connected classification layer
num_inputs = self.stage_out_channels[-1]
self.fc = nn.Linear(num_inputs, self.num_classes)
self.init_params()
def init_params(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
init.constant_(m.weight, 1)
init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
init.normal_(m.weight, std=0.01)
if m.bias is not None:
init.constant_(m.bias, 0)
def _make_stage(self, stage):
modules = OrderedDict()
stage_name = "ShuffleUnit_Stage{}".format(stage)
# First ShuffleUnit in the stage
# 1. non-grouped 1x1 convolution (i.e. pointwise convolution)
# is used in Stage 2. Group convolutions used everywhere else.
grouped_conv = stage > 2
# 2. concatenation unit is always used.
first_module = ShuffleUnit(self.stage_out_channels[stage - 1],
self.stage_out_channels[stage],
groups=self.groups,
grouped_conv=grouped_conv,
combine='concat')
modules[stage_name + "_0"] = first_module
# add more ShuffleUnits depending on pre-defined number of repeats
for i in range(self.stage_repeats[stage - 2]):
name = stage_name + "_{}".format(i + 1)
module = ShuffleUnit(self.stage_out_channels[stage],
self.stage_out_channels[stage],
groups=self.groups,
grouped_conv=True,
combine='add')
modules[name] = module
return nn.Sequential(modules)
def forward(self, x):
x = self.conv1(x)
x = self.maxpool(x)
x = self.stage2(x)
x = self.stage3(x)
x = self.stage4(x)
# global average pooling layer
x = F.avg_pool2d(x, x.data.size()[-2:])
# flatten for input to fully-connected layer
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def shuffle_v1(**kwargs):
model = ShuffleNetV1(**kwargs)
return model
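# Usage sketch: groups controls both the grouped 1x1 convolutions and the
# per-stage channel table chosen in ShuffleNetV1.__init__ (e.g. groups=3 gives
# [240, 480, 960] stage channels at width_mult=1).
if __name__ == '__main__':
    net = shuffle_v1(groups=3, num_classes=1000)
    print(net(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])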
import torch as torch
import torch.nn as nn
from torch.nn import init
__all__ = ["shuffle_v2"]
def conv3x3(in_channels,
out_channels,
stride=1,
padding=1,
bias=True,
groups=1):
return nn.Conv2d(in_channels,
out_channels,
kernel_size=3,
stride=stride,
padding=padding,
bias=bias,
groups=groups)
def conv1x1(in_channels, out_channels, bias=True, groups=1):
return nn.Conv2d(in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
bias=bias,
groups=groups)
def channel_shuffle(x, groups):
batchsize, num_channels, height, width = x.data.size()
channels_per_group = num_channels // groups
x = x.view(batchsize, groups, channels_per_group, height, width)
x = torch.transpose(x, 1, 2).contiguous()
x = x.view(batchsize, -1, height, width)
return x
def channel_split(x, splits=[24, 24]):
return torch.split(x, splits, dim=1)
class ParimaryModule(nn.Module):
def __init__(self, in_channels=3, out_channels=24):
super(ParimaryModule, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.ParimaryModule = nn.Sequential(
conv3x3(in_channels, out_channels, 2, 1, True, 1),
nn.BatchNorm2d(out_channels),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.kaiming_uniform_(m.weight)
if m.bias is not None:
init.constant_(m.bias, 0)
def forward(self, x):
x = self.ParimaryModule(x)
return x
class FinalModule(nn.Module):
def __init__(self, in_channels=464, out_channels=1024, num_classes=1000):
super(FinalModule, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.num_classes = num_classes
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(out_channels, num_classes)
self.FinalConv = nn.Sequential(
conv1x1(in_channels, out_channels, True, 1),
nn.BatchNorm2d(out_channels), nn.ReLU())
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.kaiming_uniform_(m.weight)
if m.bias is not None:
init.constant_(m.bias, 0)
def forward(self, x):
x = self.FinalConv(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
class ShuffleNetV2Block(nn.Module):
def __init__(self, in_channels, out_channels, stride=1, splits_left=2):
super(ShuffleNetV2Block, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = stride
self.splits_left = splits_left
if stride == 2:
self.Left = nn.Sequential(
conv3x3(in_channels, in_channels, stride, 1, True,
in_channels), nn.BatchNorm2d(in_channels),
conv1x1(in_channels, out_channels // 2, True, 1),
nn.BatchNorm2d(out_channels // 2), nn.ReLU())
self.Right = nn.Sequential(
conv1x1(in_channels, in_channels, True, 1),
nn.BatchNorm2d(in_channels), nn.ReLU(),
conv3x3(in_channels, in_channels, stride, 1, True,
in_channels), nn.BatchNorm2d(in_channels),
conv1x1(in_channels, out_channels // 2, True, 1),
nn.BatchNorm2d(out_channels // 2), nn.ReLU())
elif stride == 1:
in_channels = in_channels - in_channels // splits_left
self.Right = nn.Sequential(
conv1x1(in_channels, in_channels, True, 1),
nn.BatchNorm2d(in_channels), nn.ReLU(),
conv3x3(in_channels, in_channels, stride, 1, True,
in_channels), nn.BatchNorm2d(in_channels),
conv1x1(in_channels, in_channels, True, 1),
nn.BatchNorm2d(in_channels), nn.ReLU())
else:
raise ValueError('stride must be 1 or 2')
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.kaiming_uniform_(m.weight)
if m.bias is not None:
init.constant_(m.bias, 0)
def forward(self, x):
if self.stride == 2:
x_left, x_right = x, x
x_left = self.Left(x_left)
x_right = self.Right(x_right)
elif self.stride == 1:
x_split = channel_split(x, [
self.in_channels // self.splits_left,
self.in_channels - self.in_channels // self.splits_left
])
x_left, x_right = x_split[0], x_split[1]
x_right = self.Right(x_right)
x = torch.cat((x_left, x_right), dim=1)
x = channel_shuffle(x, 2)
return x
class ShuffleNetV2(nn.Module):
def __init__(self,
in_channels=3,
num_classes=1000,
net_scale=1.0,
stage_repeat=1,
splits_left=2):
super(ShuffleNetV2, self).__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.net_scale = net_scale
self.splits_left = splits_left
if net_scale == 0.5:
self.out_channels = [24, 48, 96, 192, 1024]
elif net_scale == 1.0:
self.out_channels = [24, 116, 232, 464, 1024]
elif net_scale == 1.5:
self.out_channels = [24, 176, 352, 704, 1024]
elif net_scale == 2.0:
self.out_channels = [24, 244, 488, 976, 2048]
else:
raise ValueError('net_scale must be 0.5,1.0,1.5 or 2.0')
self.ParimaryModule = ParimaryModule(in_channels, self.out_channels[0])
if stage_repeat == 1:
self.Stage1 = self.Stage(1, [1, 3])
self.Stage2 = self.Stage(2, [1, 7])
self.Stage3 = self.Stage(3, [1, 3])
        elif stage_repeat == 2:
            self.Stage1 = self.Stage(1, [1, 7])
            self.Stage2 = self.Stage(2, [1, 15])
            self.Stage3 = self.Stage(3, [1, 7])
        else:
            raise ValueError('stage_repeat must be 1 or 2')
self.FinalModule = FinalModule(self.out_channels[3],
self.out_channels[4], num_classes)
def Stage(self, stage=1, BlockRepeat=[1, 3]):
modules = []
if BlockRepeat[0] == 1:
modules.append(
ShuffleNetV2Block(self.out_channels[stage - 1],
self.out_channels[stage], 2,
self.splits_left))
else:
raise ValueError('stage first block must only repeat 1 time')
for i in range(BlockRepeat[1]):
modules.append(
ShuffleNetV2Block(self.out_channels[stage],
self.out_channels[stage], 1,
self.splits_left))
return nn.Sequential(*modules)
def forward(self, x):
x = self.ParimaryModule(x)
x = self.Stage1(x)
x = self.Stage2(x)
x = self.Stage3(x)
x = self.FinalModule(x)
return x
def shuffle_v2(**kwargs):
model = ShuffleNetV2(**kwargs)
return model
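# Usage sketch: net_scale selects the channel table (e.g. 1.0 -> [24, 116, 232,
# 464, 1024]); with the default stage_repeat=1 each stage has one stride-2 block
# plus 3/7/3 stride-1 blocks, i.e. the standard (4, 8, 4) layout.
if __name__ == '__main__':
    net = shuffle_v2(net_scale=1.0, num_classes=1000)
    print(net(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])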
import torch
import torch.nn as nn
import torch.nn.functional as F
def conv3x3(in_planes, out_planes, stride=1, groups=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False,
groups=groups)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes,
out_planes,
kernel_size=1,
stride=stride,
bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = conv1x1(inplanes, planes)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = conv3x3(planes, planes, stride)
self.bn2 = nn.BatchNorm2d(planes)
self.conv2g = conv3x3(planes, planes, stride, groups=32)
self.bn2g = nn.BatchNorm2d(planes)
self.conv3 = conv1x1(planes, planes * self.expansion)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.conv_fc1 = nn.Conv2d(planes, planes // 16, 1, bias=False)
self.bn_fc1 = nn.BatchNorm2d(planes // 16)
self.conv_fc2 = nn.Conv2d(planes // 16, 2 * planes, 1, bias=False)
self.D = planes
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
d1 = self.conv2(out)
d1 = self.bn2(d1)
d1 = self.relu(d1)
d2 = self.conv2g(out)
d2 = self.bn2g(d2)
d2 = self.relu(d2)
d = self.avg_pool(d1) + self.avg_pool(d2)
d = F.relu(self.bn_fc1(self.conv_fc1(d)))
d = self.conv_fc2(d)
d = torch.unsqueeze(d, 1).view(-1, 2, self.D, 1, 1)
d = F.softmax(d, 1)
d1 = d1 * d[:, 0, :, :, :].squeeze(1)
d2 = d2 * d[:, 1, :, :, :].squeeze(1)
d = d1 + d2
out = self.conv3(d)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self,
block,
layers,
num_classes=1000,
zero_init_residual=False):
super(ResNet, self).__init__()
self.inplanes = 64
self.conv1 = nn.Conv2d(3,
64,
kernel_size=7,
stride=2,
padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight,
mode='fan_out',
nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0)
elif isinstance(m, BasicBlock):
nn.init.constant_(m.bn2.weight, 0)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def sk_resnet18(pretrained=False, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
return model
def sk_resnet34(pretrained=False, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
return model
def sk_resnet50(pretrained=False, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
return model
def sk_resnet101(pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
return model
def sk_resnet152(pretrained=False, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
return model
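# Usage sketch of the selective-kernel Bottleneck above: each block runs the 3x3
# stage twice (plain conv2 and 32-group conv2g), pools both branches, and mixes
# them with a per-channel softmax before conv3.
if __name__ == '__main__':
    net = sk_resnet50(num_classes=1000)
    print(net(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])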
import torch.nn as nn
__all__ = [
'VGG',
'vgg11',
'vgg11_bn',
'vgg13',
'vgg13_bn',
'vgg16',
'vgg16_bn',
'vgg19_bn',
'vgg19',
]
class VGG(nn.Module):
def __init__(self, features, num_classes=1000, init_weights=True):
super(VGG, self).__init__()
self.features = features
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(True),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.ReLU(True),
nn.Dropout(),
nn.Linear(4096, num_classes),
)
if init_weights:
self._initialize_weights()
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
return x
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight,
mode='fan_out',
nonlinearity='relu')
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.normal_(m.weight, 0, 0.01)
nn.init.constant_(m.bias, 0)
def make_layers(cfg, batch_norm=False):
layers = []
in_channels = 3
for v in cfg:
if v == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
else:
conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
if batch_norm:
layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
else:
layers += [conv2d, nn.ReLU(inplace=True)]
in_channels = v
return nn.Sequential(*layers)
cfg = {
'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'B':
[64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'D': [
64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M',
512, 512, 512, 'M'
],
'E': [
64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512,
512, 'M', 512, 512, 512, 512, 'M'
],
}
def vgg11(**kwargs):
model = VGG(make_layers(cfg['A']), **kwargs)
return model
def vgg11_bn(**kwargs):
model = VGG(make_layers(cfg['A'], batch_norm=True), **kwargs)
return model
def vgg13(**kwargs):
model = VGG(make_layers(cfg['B']), **kwargs)
return model
def vgg13_bn(**kwargs):
model = VGG(make_layers(cfg['B'], batch_norm=True), **kwargs)
return model
def vgg16(**kwargs):
model = VGG(make_layers(cfg['D']), **kwargs)
return model
def vgg16_bn(**kwargs):
model = VGG(make_layers(cfg['D'], batch_norm=True), **kwargs)
return model
def vgg19(**kwargs):
model = VGG(make_layers(cfg['E']), **kwargs)
return model
def vgg19_bn(**kwargs):
model = VGG(make_layers(cfg['E'], batch_norm=True), **kwargs)
return model
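# Usage sketch: the letters in cfg follow the VGG paper's configurations
# (A=VGG11, B=VGG13, D=VGG16, E=VGG19); the *_bn builders insert BatchNorm after
# every convolution via make_layers(batch_norm=True).
if __name__ == '__main__':
    net = vgg16_bn(num_classes=1000)
    print(sum(p.numel() for p in net.parameters()))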
import os
import shutil
import argparse
import random
import re
import time
import yaml
import json
import socket
import logging
from addict import Dict
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim
from torch.backends import cudnn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import models
from utils.dataloader import build_dataloader
from utils.misc import accuracy, check_keys, AverageMeter, ProgressMeter
from utils.loss import LabelSmoothLoss
parser = argparse.ArgumentParser(description='ImageNet Training Example')
parser.add_argument('--config',
default='configs/resnet50.yaml',
type=str,
help='path to config file')
parser.add_argument('--test',
dest='test',
action='store_true',
help='evaluate model on validation set')
parser.add_argument('--output',
dest='output',
default='inception_result.json',
help='output json file to hold perf results')
parser.add_argument('--port',
default=12345,
type=int,
metavar='P',
help='master port')
parser.add_argument('--rank', default=0, type=int,
help='node rank for distributed training')
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger()
logger_all = logging.getLogger('all')
def main():
args = parser.parse_args()
args.config = yaml.load(open(args.config, 'r'), Loader=yaml.Loader)
cfgs = Dict(args.config)
# args.rank = int(os.environ['SLURM_PROCID'])
# args.world_size = int(os.environ['SLURM_NTASKS'])
# args.local_rank = int(os.environ['SLURM_LOCALID'])
args.world_size = int(os.environ["WORLD_SIZE"])
args.local_rank = int(os.environ['LOCAL_RANK'])
# node_list = str(os.environ['SLURM_NODELIST'])
# node_parts = re.findall('[0-9]+', node_list)
# os.environ[
# 'MASTER_ADDR'] = f'{node_parts[1]}.{node_parts[2]}.{node_parts[3]}.{node_parts[4]}'
# os.environ['MASTER_PORT'] = str(args.port)
# os.environ['WORLD_SIZE'] = str(args.world_size)
# os.environ['RANK'] = str(args.rank)
dist.init_process_group(backend="nccl")
torch.cuda.set_device(args.local_rank)
if args.local_rank == 0:
logger.setLevel(logging.INFO)
else:
logger.setLevel(logging.ERROR)
logger_all.setLevel(logging.INFO)
logger_all.info("rank {} of {} jobs, in {}".format(args.local_rank,
args.world_size,
socket.gethostname()))
dist.barrier()
logger.info("config\n{}".format(
json.dumps(cfgs, indent=2, ensure_ascii=False)))
if cfgs.get('seed', None):
random.seed(cfgs.seed)
torch.manual_seed(cfgs.seed)
torch.cuda.manual_seed(cfgs.seed)
cudnn.deterministic = True
model = models.__dict__[cfgs.net.arch](**cfgs.net.kwargs)
model.cuda()
logger.info("creating model '{}'".format(cfgs.net.arch))
model = DDP(model, device_ids=[args.local_rank])
logger.info("model\n{}".format(model))
    if cfgs.trainer.get('label_smooth', None):
criterion = LabelSmoothLoss(cfgs.trainer.label_smooth,
cfgs.net.kwargs.num_classes).cuda()
else:
criterion = nn.CrossEntropyLoss().cuda()
logger.info("loss\n{}".format(criterion))
optimizer = torch.optim.SGD(model.parameters(),
**cfgs.trainer.optimizer.kwargs)
logger.info("optimizer\n{}".format(optimizer))
cudnn.benchmark = True
args.start_epoch = -cfgs.trainer.lr_scheduler.get('warmup_epochs', 0)
args.max_epoch = cfgs.trainer.max_epoch
args.test_freq = cfgs.trainer.test_freq
args.log_freq = cfgs.trainer.log_freq
best_acc1 = 0.0
if cfgs.saver.resume_model:
assert os.path.isfile(
cfgs.saver.resume_model), 'Not found resume model: {}'.format(
cfgs.saver.resume_model)
checkpoint = torch.load(cfgs.saver.resume_model)
check_keys(model=model, checkpoint=checkpoint)
model.load_state_dict(checkpoint['state_dict'])
args.start_epoch = checkpoint['epoch']
best_acc1 = checkpoint['best_acc1']
optimizer.load_state_dict(checkpoint['optimizer'])
logger.info("resume training from '{}' at epoch {}".format(
cfgs.saver.resume_model, checkpoint['epoch']))
elif cfgs.saver.pretrain_model:
assert os.path.isfile(
cfgs.saver.pretrain_model), 'Not found pretrain model: {}'.format(
cfgs.saver.pretrain_model)
checkpoint = torch.load(cfgs.saver.pretrain_model)
check_keys(model=model, checkpoint=checkpoint)
model.load_state_dict(checkpoint['state_dict'])
logger.info("pretrain training from '{}'".format(
cfgs.saver.pretrain_model))
if args.local_rank == 0 and cfgs.saver.get('save_dir', None):
if not os.path.exists(cfgs.saver.save_dir):
os.makedirs(cfgs.saver.save_dir)
logger.info("create checkpoint folder {}".format(
cfgs.saver.save_dir))
# Data loading code
train_loader, train_sampler, test_loader, _ = build_dataloader(
cfgs.dataset, args.world_size)
# test mode
if args.test:
return
# choose scheduler
lr_scheduler = torch.optim.lr_scheduler.__dict__[
cfgs.trainer.lr_scheduler.type](optimizer if isinstance(
optimizer, torch.optim.Optimizer) else optimizer.optimizer,
**cfgs.trainer.lr_scheduler.kwargs,
last_epoch=args.start_epoch - 1)
monitor_writer = None
if args.local_rank == 0 and cfgs.get('monitor', None):
if cfgs.monitor.get('type', None) == 'pavi':
from pavi import SummaryWriter
if cfgs.monitor.get("_taskid", None):
monitor_writer = SummaryWriter(session_text=yaml.dump(
args.config),
**cfgs.monitor.kwargs,
taskid=cfgs.monitor._taskid)
else:
monitor_writer = SummaryWriter(session_text=yaml.dump(
args.config),
**cfgs.monitor.kwargs)
    # training (max_epoch is forced to 1 so the perf test only times a single epoch)
    args.max_epoch = 1
for epoch in range(args.start_epoch, args.max_epoch):
train_sampler.set_epoch(epoch)
# train for one epoch
avg_time = train(train_loader, model, criterion, optimizer, epoch,
args, monitor_writer)
avg_time = avg_time.avg
if (epoch + 1) % args.test_freq == 0 or epoch + 1 == args.max_epoch:
# evaluate on validation set
if args.local_rank == 0:
results = {}
if os.path.exists(args.output):
with open(args.output, 'r') as f:
try:
results = json.load(f)
                        except json.JSONDecodeError:
                            # ignore a corrupt or partial results file
                            pass
if results.get('inceptionv3', None) is None:
results['inceptionv3'] = {}
results['inceptionv3']['perf' + str(
args.world_size
)] = cfgs.dataset.batch_size * args.world_size / avg_time
with open(args.output, 'w') as f:
json.dump(results, f)
lr_scheduler.step()
def train(train_loader, model, criterion, optimizer, epoch, args,
monitor_writer):
batch_time = AverageMeter('Time', ':.3f', -1)
data_time = AverageMeter('Data', ':.3f', 200)
losses = AverageMeter('Loss', ':.4f', 50)
top1 = AverageMeter('Acc@1', ':.2f', 50)
top5 = AverageMeter('Acc@5', ':.2f', 50)
memory = AverageMeter('Memory(MB)', ':.0f')
progress = ProgressMeter(len(train_loader),
batch_time,
data_time,
losses,
top1,
top5,
memory,
prefix="Epoch: [{}/{}]".format(
epoch + 1, args.max_epoch))
# switch to train mode
model.train()
end = time.time()
for i, (input, target) in enumerate(train_loader):
# measure data loading time
data_time.update(time.time() - end)
input = input.cuda()
target = target.cuda()
# compute output
output = model(input)
loss = criterion(output, target)
# measure accuracy and record loss
acc1, acc5 = accuracy(output, target, topk=(1, 5))
stats_all = torch.tensor([loss.item(), acc1[0].item(),
acc5[0].item()]).float().cuda()
dist.all_reduce(stats_all)
stats_all /= args.world_size
losses.update(stats_all[0].item())
top1.update(stats_all[1].item())
top5.update(stats_all[2].item())
memory.update(torch.cuda.max_memory_allocated() / 1024 / 1024)
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()
        # measure elapsed time (skip the first iterations to exclude warm-up)
        if i >= 3:
batch_time.update(time.time() - end)
end = time.time()
if i % args.log_freq == 0:
progress.display(i)
if args.local_rank == 0 and monitor_writer:
cur_iter = epoch * len(train_loader) + i
monitor_writer.add_scalar('Train_Loss', losses.avg, cur_iter)
monitor_writer.add_scalar('Accuracy_train_top1', top1.avg,
cur_iter)
monitor_writer.add_scalar('Accuracy_train_top5', top5.avg,
cur_iter)
return batch_time
if __name__ == '__main__':
main()
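# Launch sketch: the script reads WORLD_SIZE / LOCAL_RANK from the environment and
# calls dist.init_process_group("nccl"), which matches what torchrun provides
# (the file name main.py below is only a placeholder):
#   torchrun --nproc_per_node=8 main.py --config configs/resnet50.yaml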