Commit 1e2486af authored by sunxx1

Add inception_v3 test code

import torch.nn as nn
import math
__all__ = [
'round_channels', 'conv1x1', 'ConvBlock', 'conv1x1_block', 'conv7x7_block',
'SEBlock', 'ResNeXtBottleneck', 'ResInitBlock'
]
def round_channels(channels, divisor=8):
rounded_channels = max(
int(channels + divisor / 2.0) // divisor * divisor, divisor)
if float(rounded_channels) < 0.9 * channels:
rounded_channels += divisor
return rounded_channels
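# Illustrative note (added, not part of the committed file): round_channels snaps a
# channel count to the nearest multiple of `divisor`, never below `divisor` itself,
# and bumps up by one divisor if rounding would shrink the count by more than ~10%.
# For example, round_channels(37) -> 40 and round_channels(17) -> 16.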
def conv1x1(in_channels, out_channels, stride=1, groups=1, bias=False):
return nn.Conv2d(in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
groups=groups,
bias=bias)
class ConvBlock(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation=1,
groups=1,
bias=False,
use_bn=True,
bn_eps=1e-5):
super(ConvBlock, self).__init__()
self.use_bn = use_bn
self.conv = nn.Conv2d(in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias)
if self.use_bn:
self.bn = nn.BatchNorm2d(num_features=out_channels, eps=bn_eps)
self.activ = nn.ReLU(inplace=True)
def forward(self, x):
x = self.conv(x)
if self.use_bn:
x = self.bn(x)
x = self.activ(x)
return x
def conv1x1_block(in_channels,
out_channels,
stride=1,
padding=0,
groups=1,
bias=False,
use_bn=True,
bn_eps=1e-5):
return ConvBlock(in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
padding=padding,
groups=groups,
bias=bias,
use_bn=use_bn,
bn_eps=bn_eps)
def conv3x3_block(in_channels,
out_channels,
stride=1,
padding=1,
dilation=1,
groups=1,
bias=False,
use_bn=True,
bn_eps=1e-5):
return ConvBlock(in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
use_bn=use_bn,
bn_eps=bn_eps)
def conv7x7_block(in_channels,
out_channels,
stride=1,
padding=3,
bias=False,
use_bn=True):
return ConvBlock(in_channels=in_channels,
out_channels=out_channels,
kernel_size=7,
stride=stride,
padding=padding,
bias=bias,
use_bn=use_bn)
class SEBlock(nn.Module):
def __init__(self, channels, reduction=16, round_mid=False):
super(SEBlock, self).__init__()
mid_channels = channels // reduction if not round_mid else round_channels(
float(channels) / reduction)
self.pool = nn.AdaptiveAvgPool2d(output_size=1)
self.conv1 = conv1x1(in_channels=channels,
out_channels=mid_channels,
bias=True)
self.activ = nn.ReLU(inplace=True)
self.conv2 = conv1x1(in_channels=mid_channels,
out_channels=channels,
bias=True)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
w = self.pool(x)
w = self.conv1(w)
w = self.activ(w)
w = self.conv2(w)
w = self.sigmoid(w)
x = x * w
return x
class ResInitBlock(nn.Module):
def __init__(self, in_channels, out_channels):
super(ResInitBlock, self).__init__()
self.conv = conv7x7_block(in_channels=in_channels,
out_channels=out_channels,
stride=2)
self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
def forward(self, x):
x = self.conv(x)
x = self.pool(x)
return x
class ResNeXtBottleneck(nn.Module):
def __init__(self,
in_channels,
out_channels,
stride,
cardinality,
bottleneck_width,
bottleneck_factor=4):
super(ResNeXtBottleneck, self).__init__()
mid_channels = out_channels // bottleneck_factor
D = int(math.floor(mid_channels * (bottleneck_width / 64.0)))
group_width = cardinality * D
self.conv1 = conv1x1_block(in_channels=in_channels,
out_channels=group_width)
self.conv2 = conv3x3_block(in_channels=group_width,
out_channels=group_width,
stride=stride,
groups=cardinality)
self.conv3 = conv1x1_block(in_channels=group_width,
out_channels=out_channels)
def forward(self, x):
x = self.conv1(x)
x = self.conv2(x)
x = self.conv3(x)
return x
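# Minimal usage sketch (added for illustration, not part of the committed file):
# wires the helper blocks above together with example channel counts and runs a
# dummy forward pass to check shapes. Guarded so importing the module has no
# side effects.
if __name__ == '__main__':
    import torch
    x = torch.randn(1, 64, 56, 56)
    block = ResNeXtBottleneck(in_channels=64, out_channels=256, stride=1,
                              cardinality=32, bottleneck_width=4)
    se = SEBlock(channels=256)
    y = se(block(x))
    print(y.shape)  # expected: torch.Size([1, 256, 56, 56])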
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from collections import OrderedDict
from torch import Tensor
__all__ = [
'DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161'
]
class _DenseLayer(nn.Module):
def __init__(self,
num_input_features,
growth_rate,
bn_size,
drop_rate,
memory_efficient=False):
super(_DenseLayer, self).__init__()
self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
self.add_module('relu1', nn.ReLU(inplace=True)),
self.add_module(
'conv1',
nn.Conv2d(num_input_features,
bn_size * growth_rate,
kernel_size=1,
stride=1,
bias=False)),
self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
self.add_module('relu2', nn.ReLU(inplace=True)),
self.add_module(
'conv2',
nn.Conv2d(bn_size * growth_rate,
growth_rate,
kernel_size=3,
stride=1,
padding=1,
bias=False)),
self.drop_rate = float(drop_rate)
self.memory_efficient = memory_efficient
def bn_function(self, inputs):
concated_features = torch.cat(inputs, 1)
bottleneck_output = self.conv1(
self.relu1(self.norm1(concated_features))) # noqa: T484
return bottleneck_output
# todo: rewrite when torchscript supports any
def any_requires_grad(self, input):
for tensor in input:
if tensor.requires_grad:
return True
return False
def call_checkpoint_bottleneck(self, input):
def closure(*inputs):
return self.bn_function(*inputs)
return cp.checkpoint(closure, input)
# allowing it to take either a List[Tensor] or single Tensor
def forward(self, input): # noqa: F811
if isinstance(input, Tensor):
prev_features = [input]
else:
prev_features = input
if self.memory_efficient and self.any_requires_grad(prev_features):
if torch.jit.is_scripting():
raise Exception("Memory Efficient not supported in JIT")
bottleneck_output = self.call_checkpoint_bottleneck(prev_features)
else:
bottleneck_output = self.bn_function(prev_features)
new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))
if self.drop_rate > 0:
new_features = F.dropout(new_features,
p=self.drop_rate,
training=self.training)
return new_features
class _DenseBlock(nn.ModuleDict):
_version = 2
__constants__ = ['layers']
def __init__(self,
num_layers,
num_input_features,
bn_size,
growth_rate,
drop_rate,
memory_efficient=False):
super(_DenseBlock, self).__init__()
for i in range(num_layers):
layer = _DenseLayer(
num_input_features + i * growth_rate,
growth_rate=growth_rate,
bn_size=bn_size,
drop_rate=drop_rate,
memory_efficient=memory_efficient,
)
self.add_module('denselayer%d' % (i + 1), layer)
def forward(self, init_features):
features = [init_features]
for name, layer in self.items():
new_features = layer(features)
features.append(new_features)
return torch.cat(features, 1)
class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features):
super(_Transition, self).__init__()
self.add_module('norm', nn.BatchNorm2d(num_input_features))
self.add_module('relu', nn.ReLU(inplace=True))
self.add_module(
'conv',
nn.Conv2d(num_input_features,
num_output_features,
kernel_size=1,
stride=1,
bias=False))
self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
class DenseNet(nn.Module):
r"""Densenet-BC model class, based on
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
growth_rate (int) - how many filters to add each layer (`k` in paper)
block_config (list of 4 ints) - how many layers in each pooling block
num_init_features (int) - the number of filters to learn in the first convolution layer
bn_size (int) - multiplicative factor for number of bottle neck layers
(i.e. bn_size * k features in the bottleneck layer)
drop_rate (float) - dropout rate after each dense layer
num_classes (int) - number of classification classes
memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
"""
__constants__ = ['features']
def __init__(self,
growth_rate=32,
block_config=(6, 12, 24, 16),
num_init_features=64,
bn_size=4,
drop_rate=0,
num_classes=1000,
memory_efficient=False):
super(DenseNet, self).__init__()
# First convolution
self.features = nn.Sequential(
OrderedDict([
('conv0',
nn.Conv2d(3,
num_init_features,
kernel_size=7,
stride=2,
padding=3,
bias=False)),
('norm0', nn.BatchNorm2d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
]))
# Each denseblock
num_features = num_init_features
for i, num_layers in enumerate(block_config):
block = _DenseBlock(num_layers=num_layers,
num_input_features=num_features,
bn_size=bn_size,
growth_rate=growth_rate,
drop_rate=drop_rate,
memory_efficient=memory_efficient)
self.features.add_module('denseblock%d' % (i + 1), block)
num_features = num_features + num_layers * growth_rate
if i != len(block_config) - 1:
trans = _Transition(num_input_features=num_features,
num_output_features=num_features // 2)
self.features.add_module('transition%d' % (i + 1), trans)
num_features = num_features // 2
# Final batch norm
self.features.add_module('norm5', nn.BatchNorm2d(num_features))
# Linear layer
self.classifier = nn.Linear(num_features, num_classes)
# Official init from torch repo.
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.constant_(m.bias, 0)
def forward(self, x):
features = self.features(x)
out = F.relu(features, inplace=True)
out = F.adaptive_avg_pool2d(out, (1, 1))
out = torch.flatten(out, 1)
out = self.classifier(out)
return out
def _densenet(arch, growth_rate, block_config, num_init_features, **kwargs):
model = DenseNet(growth_rate, block_config, num_init_features, **kwargs)
return model
def densenet121(**kwargs):
r"""Densenet-121 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
"""
return _densenet('densenet121', 32, (6, 12, 24, 16), 64, **kwargs)
def densenet161(**kwargs):
r"""Densenet-161 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
"""
return _densenet('densenet161', 48, (6, 12, 36, 24), 96, **kwargs)
def densenet169(**kwargs):
r"""Densenet-169 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
"""
return _densenet('densenet169', 32, (6, 12, 32, 32), 64, **kwargs)
def densenet201(**kwargs):
r"""Densenet-201 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
"""
return _densenet('densenet201', 32, (6, 12, 48, 32), 64, **kwargs)
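# Usage sketch (added for illustration, not part of the committed file): builds
# DenseNet-121 and runs a single dummy 224x224 batch through it.
if __name__ == '__main__':
    model = densenet121(num_classes=1000)
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])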
import torch as torch
import torch.nn as nn
import torch.nn.functional as F
__all__ = ['DPN', 'dpn68', 'dpn68b', 'dpn92', 'dpn98', 'dpn131', 'dpn107']
def dpn68(**kwargs):
model = DPN(small=True,
num_init_features=10,
k_r=128,
groups=32,
k_sec=(3, 4, 12, 3),
inc_sec=(16, 32, 32, 64),
test_time_pool=True,
**kwargs)
return model
def dpn68b(**kwargs):
model = DPN(small=True,
num_init_features=10,
k_r=128,
groups=32,
b=True,
k_sec=(3, 4, 12, 3),
inc_sec=(16, 32, 32, 64),
test_time_pool=True,
**kwargs)
return model
def dpn92(**kwargs):
model = DPN(num_init_features=64,
k_r=96,
groups=32,
k_sec=(3, 4, 20, 3),
inc_sec=(16, 32, 24, 128),
test_time_pool=True,
**kwargs)
return model
def dpn98(**kwargs):
model = DPN(num_init_features=96,
k_r=160,
groups=40,
k_sec=(3, 6, 20, 3),
inc_sec=(16, 32, 32, 128),
test_time_pool=True,
**kwargs)
return model
def dpn131(**kwargs):
model = DPN(num_init_features=128,
k_r=160,
groups=40,
k_sec=(4, 8, 28, 3),
inc_sec=(16, 32, 32, 128),
test_time_pool=True,
**kwargs)
return model
def dpn107(**kwargs):
model = DPN(num_init_features=128,
k_r=200,
groups=50,
k_sec=(4, 8, 20, 3),
inc_sec=(20, 64, 64, 128),
test_time_pool=True,
**kwargs)
return model
class CatBnAct(nn.Module):
def __init__(self, in_chs, activation_fn=nn.ReLU(inplace=True)):
super(CatBnAct, self).__init__()
self.bn = nn.BatchNorm2d(in_chs, eps=0.001)
self.act = activation_fn
def forward(self, x):
x = torch.cat(x, dim=1) if isinstance(x, tuple) else x
return self.act(self.bn(x))
class BnActConv2d(nn.Module):
def __init__(self,
in_chs,
out_chs,
kernel_size,
stride,
padding=0,
groups=1,
activation_fn=nn.ReLU(inplace=True)):
super(BnActConv2d, self).__init__()
self.bn = nn.BatchNorm2d(in_chs, eps=0.001)
self.act = activation_fn
self.conv = nn.Conv2d(in_chs,
out_chs,
kernel_size,
stride,
padding,
groups=groups,
bias=False)
def forward(self, x):
return self.conv(self.act(self.bn(x)))
class InputBlock(nn.Module):
def __init__(self,
num_init_features,
kernel_size=7,
padding=3,
activation_fn=nn.ReLU(inplace=True)):
super(InputBlock, self).__init__()
self.conv = nn.Conv2d(3,
num_init_features,
kernel_size=kernel_size,
stride=2,
padding=padding,
bias=False)
self.bn = nn.BatchNorm2d(num_init_features, eps=0.001)
self.act = activation_fn
self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.act(x)
x = self.pool(x)
return x
class DualPathBlock(nn.Module):
def __init__(self,
in_chs,
num_1x1_a,
num_3x3_b,
num_1x1_c,
inc,
groups,
block_type='normal',
b=False):
super(DualPathBlock, self).__init__()
self.num_1x1_c = num_1x1_c
self.inc = inc
self.b = b
if block_type == 'proj':
self.key_stride = 1
self.has_proj = True
elif block_type == 'down':
self.key_stride = 2
self.has_proj = True
else:
assert block_type == 'normal'
self.key_stride = 1
self.has_proj = False
if self.has_proj:
if self.key_stride == 2:
self.c1x1_w_s2 = BnActConv2d(in_chs=in_chs,
out_chs=num_1x1_c + 2 * inc,
kernel_size=1,
stride=2)
else:
self.c1x1_w_s1 = BnActConv2d(in_chs=in_chs,
out_chs=num_1x1_c + 2 * inc,
kernel_size=1,
stride=1)
self.c1x1_a = BnActConv2d(in_chs=in_chs,
out_chs=num_1x1_a,
kernel_size=1,
stride=1)
self.c3x3_b = BnActConv2d(in_chs=num_1x1_a,
out_chs=num_3x3_b,
kernel_size=3,
stride=self.key_stride,
padding=1,
groups=groups)
if b:
self.c1x1_c = CatBnAct(in_chs=num_3x3_b)
self.c1x1_c1 = nn.Conv2d(num_3x3_b,
num_1x1_c,
kernel_size=1,
bias=False)
self.c1x1_c2 = nn.Conv2d(num_3x3_b, inc, kernel_size=1, bias=False)
else:
self.c1x1_c = BnActConv2d(in_chs=num_3x3_b,
out_chs=num_1x1_c + inc,
kernel_size=1,
stride=1)
def forward(self, x):
x_in = torch.cat(x, dim=1) if isinstance(x, tuple) else x
if self.has_proj:
if self.key_stride == 2:
x_s = self.c1x1_w_s2(x_in)
else:
x_s = self.c1x1_w_s1(x_in)
x_s1 = x_s[:, :self.num_1x1_c, :, :]
x_s2 = x_s[:, self.num_1x1_c:, :, :]
else:
x_s1 = x[0]
x_s2 = x[1]
x_in = self.c1x1_a(x_in)
x_in = self.c3x3_b(x_in)
if self.b:
x_in = self.c1x1_c(x_in)
out1 = self.c1x1_c1(x_in)
out2 = self.c1x1_c2(x_in)
else:
x_in = self.c1x1_c(x_in)
out1 = x_in[:, :self.num_1x1_c, :, :]
out2 = x_in[:, self.num_1x1_c:, :, :]
resid = x_s1 + out1
dense = torch.cat([x_s2, out2], dim=1)
return resid, dense
class DPN(nn.Module):
def __init__(self,
small=False,
num_init_features=64,
k_r=96,
groups=32,
b=False,
k_sec=(3, 4, 20, 3),
inc_sec=(16, 32, 24, 128),
num_classes=1000,
test_time_pool=False):
super(DPN, self).__init__()
self.test_time_pool = test_time_pool
self.b = b
bw_factor = 1 if small else 4
blocks = []
# conv1
if small:
blocks.append(
InputBlock(num_init_features, kernel_size=3, padding=1))
else:
blocks.append(
InputBlock(num_init_features, kernel_size=7, padding=3))
# conv2
bw = 64 * bw_factor
inc = inc_sec[0]
r = (k_r * bw) // (64 * bw_factor)
blocks.append(
DualPathBlock(num_init_features, r, r, bw, inc, groups, 'proj', b))
in_chs = bw + 3 * inc
for i in range(2, k_sec[0] + 1):
blocks.append(
DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b))
in_chs += inc
# conv3
bw = 128 * bw_factor
inc = inc_sec[1]
r = (k_r * bw) // (64 * bw_factor)
blocks.append(DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b))
in_chs = bw + 3 * inc
for i in range(2, k_sec[1] + 1):
blocks.append(
DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b))
in_chs += inc
# conv4
bw = 256 * bw_factor
inc = inc_sec[2]
r = (k_r * bw) // (64 * bw_factor)
blocks.append(DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b))
in_chs = bw + 3 * inc
for i in range(2, k_sec[2] + 1):
blocks.append(
DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b))
in_chs += inc
# conv5
bw = 512 * bw_factor
inc = inc_sec[3]
r = (k_r * bw) // (64 * bw_factor)
blocks.append(DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b))
in_chs = bw + 3 * inc
for i in range(2, k_sec[3] + 1):
blocks.append(
DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b))
in_chs += inc
blocks.append(CatBnAct(in_chs))
self.features = nn.Sequential(*blocks)
self.classifier = nn.Conv2d(in_chs,
num_classes,
kernel_size=1,
bias=True)
def logits(self, features):
if not self.training and self.test_time_pool:
x = F.avg_pool2d(features, kernel_size=7, stride=1)
out = self.classifier(x)
out = adaptive_avgmax_pool2d(out, pool_type='avgmax')
else:
x = adaptive_avgmax_pool2d(features, pool_type='avg')
out = self.classifier(x)
return out.view(out.size(0), -1)
def forward(self, input):
x = self.features(input)
x = self.logits(x)
return x
def pooling_factor(pool_type='avg'):
return 2 if pool_type == 'avgmaxc' else 1
def adaptive_avgmax_pool2d(x, pool_type='avg', padding=0):
if pool_type == 'avgmaxc':
x = torch.cat([
F.avg_pool2d(
x, kernel_size=(x.size(2), x.size(3)), padding=padding),
F.max_pool2d(
x, kernel_size=(x.size(2), x.size(3)), padding=padding)
],
dim=1)
elif pool_type == 'avgmax':
x_avg = F.avg_pool2d(x,
kernel_size=(x.size(2), x.size(3)),
padding=padding)
x_max = F.max_pool2d(x,
kernel_size=(x.size(2), x.size(3)),
padding=padding)
x = 0.5 * (x_avg + x_max)
elif pool_type == 'max':
x = F.max_pool2d(x,
kernel_size=(x.size(2), x.size(3)),
padding=padding)
else:
if pool_type != 'avg':
print(
'Invalid pool type %s specified. Defaulting to average pooling.'
% pool_type)
x = F.avg_pool2d(x,
kernel_size=(x.size(2), x.size(3)),
padding=padding)
return x
class AdaptiveAvgMaxPool2d(torch.nn.Module):
def __init__(self, output_size=1, pool_type='avg'):
super(AdaptiveAvgMaxPool2d, self).__init__()
self.output_size = output_size
self.pool_type = pool_type
if pool_type == 'avgmaxc' or pool_type == 'avgmax':
self.pool = nn.ModuleList([
nn.AdaptiveAvgPool2d(output_size),
nn.AdaptiveMaxPool2d(output_size)
])
elif pool_type == 'max':
self.pool = nn.AdaptiveMaxPool2d(output_size)
else:
if pool_type != 'avg':
print(
'Invalid pool type %s specified. Defaulting to average pooling.'
% pool_type)
self.pool = nn.AdaptiveAvgPool2d(output_size)
def forward(self, x):
if self.pool_type == 'avgmaxc':
x = torch.cat([p(x) for p in self.pool], dim=1)
elif self.pool_type == 'avgmax':
x = 0.5 * torch.sum(torch.stack([p(x) for p in self.pool]),
0).squeeze(dim=0)
else:
x = self.pool(x)
return x
def factor(self):
return pooling_factor(self.pool_type)
def __repr__(self):
return self.__class__.__name__ + ' (' \
+ 'output_size=' + str(self.output_size) \
+ ', pool_type=' + self.pool_type + ')'
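# Usage sketch (added for illustration, not part of the committed file): builds
# DPN-92 and checks the output shape on a dummy 224x224 batch. eval() is used
# because the `test_time_pool` path only takes effect outside training.
if __name__ == '__main__':
    model = dpn92(num_classes=1000)
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])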
import torch.nn as nn
import torch
import math
import torch.nn.functional as F
from torch.nn import init
import re
import collections
from collections import OrderedDict
__all__ = [
'efficientnet_b0', 'efficientnet_b1', 'efficientnet_b2', 'efficientnet_b3',
'efficientnet_b4', 'efficientnet_b5', 'efficientnet_b6', 'efficientnet_b7'
]
GlobalParams = collections.namedtuple('GlobalParams', [
'dropout_rate',
'data_format',
'num_classes',
'width_coefficient',
'depth_coefficient',
'depth_divisor',
'min_depth',
'drop_connect_rate',
])
GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields)
BlockArgs = collections.namedtuple('BlockArgs', [
'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
'expand_ratio', 'id_skip', 'strides', 'se_ratio'
])
BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields)
def efficientnet_params(model_name):
"""Get efficientnet params based on model name."""
params_dict = {
# (width_coefficient, depth_coefficient, resolution, dropout_rate)
'efficientnet_b0': (1.0, 1.0, 224, 0.2),
'efficientnet_b1': (1.0, 1.1, 240, 0.2),
'efficientnet_b2': (1.1, 1.2, 260, 0.3),
'efficientnet_b3': (1.2, 1.4, 300, 0.3),
'efficientnet_b4': (1.4, 1.8, 380, 0.4),
'efficientnet_b5': (1.6, 2.2, 456, 0.4),
'efficientnet_b6': (1.8, 2.6, 528, 0.5),
'efficientnet_b7': (2.0, 3.1, 600, 0.5),
}
return params_dict[model_name]
def efficientnet(width_coefficient=None,
depth_coefficient=None,
dropout_rate=0.2,
drop_connect_rate=0.3,
override_block=None):
"""Creates a efficientnet model."""
blocks_args = [
'r1_k3_s11_e1_i32_o16_se0.25',
'r2_k3_s22_e6_i16_o24_se0.25',
'r2_k5_s22_e6_i24_o40_se0.25',
'r3_k3_s22_e6_i40_o80_se0.25',
'r3_k5_s11_e6_i80_o112_se0.25',
'r4_k5_s22_e6_i112_o192_se0.25',
'r1_k3_s11_e6_i192_o320_se0.25',
]
if override_block is not None:
assert isinstance(override_block, dict)
for k, v in override_block.items():
blocks_args[int(k)] = v
global_params = GlobalParams(dropout_rate=dropout_rate,
drop_connect_rate=drop_connect_rate,
data_format='channels_last',
num_classes=1000,
width_coefficient=width_coefficient,
depth_coefficient=depth_coefficient,
depth_divisor=8,
min_depth=None)
decoder = BlockDecoder()
return decoder.decode(blocks_args), global_params
class BlockDecoder(object):
"""Block Decoder for readability."""
def _decode_block_string(self, block_string):
"""Gets a block through a string notation of arguments."""
assert isinstance(block_string, str)
ops = block_string.split('_')
options = {}
for op in ops:
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value
if 's' not in options or len(options['s']) != 2:
raise ValueError('Strides options should be a pair of integers.')
return BlockArgs(
kernel_size=int(options['k']),
num_repeat=int(options['r']),
input_filters=int(options['i']),
output_filters=int(options['o']),
expand_ratio=int(options['e']),
id_skip=('noskip' not in block_string),
se_ratio=float(options['se']) if 'se' in options else None,
strides=[int(options['s'][0]),
int(options['s'][1])])
def _encode_block_string(self, block):
"""Encodes a block to a string."""
args = [
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
's%d%d' % (block.strides[0], block.strides[1]),
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
        if block.se_ratio is not None and 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
if block.id_skip is False:
args.append('noskip')
return '_'.join(args)
def decode(self, string_list):
"""Decodes a list of string notations to specify blocks inside the network.
Args:
string_list: a list of strings, each string is a notation of block.
Returns:
A list of namedtuples to represent blocks arguments.
"""
assert isinstance(string_list, list)
blocks_args = []
for block_string in string_list:
blocks_args.append(self._decode_block_string(block_string))
return blocks_args
def encode(self, blocks_args):
"""Encodes a list of Blocks to a list of strings.
Args:
blocks_args: A list of namedtuples to represent blocks arguments.
Returns:
a list of strings, each string is a notation of block.
"""
block_strings = []
for block in blocks_args:
block_strings.append(self._encode_block_string(block))
return block_strings
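# Decoding example (illustrative comment, not executed): the block string
# 'r2_k5_s22_e6_i24_o40_se0.25' parses to BlockArgs(kernel_size=5, num_repeat=2,
# input_filters=24, output_filters=40, expand_ratio=6, id_skip=True,
# strides=[2, 2], se_ratio=0.25).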
def get_model_params(model_name, override_params=None, override_block=None):
"""Get the block args and global params for a given model."""
if model_name.startswith('efficientnet'):
width_coefficient, depth_coefficient, _, dropout_rate = (
efficientnet_params(model_name))
blocks_args, global_params = efficientnet(
width_coefficient,
depth_coefficient,
dropout_rate,
override_block=override_block)
else:
raise NotImplementedError('model name is not pre-defined: %s' %
model_name)
if override_params is not None:
# ValueError will be raised here if override_params has fields not included
# in global_params.
global_params = global_params._replace(**override_params)
return blocks_args, global_params
def round_filters(filters, global_params):
"""Round number of filters based on depth multiplier."""
# orig_f = filters
multiplier = global_params.width_coefficient
divisor = global_params.depth_divisor
min_depth = global_params.min_depth
if not multiplier:
return filters
filters *= multiplier
min_depth = min_depth or divisor
new_filters = max(min_depth,
int(filters + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_filters < 0.9 * filters:
new_filters += divisor
return int(new_filters)
def round_repeats(repeats, global_params):
"""Round number of filters based on depth multiplier."""
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))
def drop_connect(x, training=False, drop_connect_rate=None):
if drop_connect_rate is None:
raise RuntimeError("drop_connect_rate not given")
if not training:
return x
else:
keep_prob = 1.0 - drop_connect_rate
n = x.size(0)
random_tensor = torch.rand([n, 1, 1, 1],
dtype=x.dtype,
device=x.device)
random_tensor = random_tensor + keep_prob
binary_mask = torch.floor(random_tensor)
x = (x / keep_prob) * binary_mask
return x
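# Note (added): drop_connect implements stochastic depth on the residual branch.
# During training each sample in the batch is zeroed with probability
# drop_connect_rate and survivors are rescaled by 1/keep_prob, so the expected
# value is unchanged; at inference the input passes through untouched.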
class swish(nn.Module):
def __init__(self):
super(swish, self).__init__()
def forward(self, x):
x = x * torch.sigmoid(x)
return x
def activation(act_type='swish'):
if act_type == 'swish':
act = swish()
return act
else:
act = nn.ReLU(inplace=True)
return act
class MBConvBlock(nn.Module):
def __init__(self, block_args):
super(MBConvBlock, self).__init__()
self._block_args = block_args
self.has_se = (self._block_args.se_ratio is not None) and \
(self._block_args.se_ratio > 0) and \
(self._block_args.se_ratio <= 1)
self._build(inp=self._block_args.input_filters,
oup=self._block_args.output_filters,
expand_ratio=self._block_args.expand_ratio,
kernel_size=self._block_args.kernel_size,
stride=self._block_args.strides)
def block_args(self):
return self._block_args
def _build(self, inp, oup, expand_ratio, kernel_size, stride):
module_lists = []
self.use_res_connect = all([s == 1 for s in stride]) and inp == oup
if expand_ratio != 1:
module_lists.append(
nn.Conv2d(inp, inp * expand_ratio, 1, 1, 0, bias=False))
module_lists.append(nn.BatchNorm2d(inp * expand_ratio))
module_lists.append(activation())
module_lists.append(
nn.Conv2d(inp * expand_ratio,
inp * expand_ratio,
kernel_size,
stride,
kernel_size // 2,
groups=inp * expand_ratio,
bias=False))
module_lists.append(nn.BatchNorm2d(inp * expand_ratio))
module_lists.append(activation())
self.in_conv = nn.Sequential(*module_lists)
if self.has_se:
se_size = max(1, int(inp * self._block_args.se_ratio))
s = OrderedDict()
s['conv1'] = nn.Conv2d(inp * expand_ratio,
se_size,
kernel_size=1,
stride=1,
padding=0)
s['act1'] = activation()
s['conv2'] = nn.Conv2d(se_size,
inp * expand_ratio,
kernel_size=1,
stride=1,
padding=0)
s['act2'] = nn.Sigmoid()
self.se_block = nn.Sequential(s)
self.out_conv = nn.Sequential(
nn.Conv2d(inp * expand_ratio, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup))
def forward(self, x, drop_connect_rate=None):
out = self.in_conv(x)
if self.has_se:
weight = F.adaptive_avg_pool2d(out, output_size=1)
weight = self.se_block(weight)
out = out * weight
out = self.out_conv(out)
if self._block_args.id_skip:
if self.use_res_connect:
if drop_connect_rate is not None:
out = drop_connect(out, self.training, drop_connect_rate)
out = out + x
return out
class EfficientNet(nn.Module):
def __init__(self,
blocks_args=None,
global_params=None,
use_fc_bn=False,
fc_bn_init_scale=1.0,
bn=None):
super(EfficientNet, self).__init__()
if not isinstance(blocks_args, list):
raise ValueError('blocks_args should be a list.')
self._global_params = global_params
self._blocks_args = blocks_args
self.use_fc_bn = use_fc_bn
self.fc_bn_init_scale = fc_bn_init_scale
self._build()
def _build(self):
blocks = []
for block_args in self._blocks_args:
assert block_args.num_repeat > 0
block_args = block_args._replace(
input_filters=round_filters(block_args.input_filters,
self._global_params),
output_filters=round_filters(block_args.output_filters,
self._global_params),
num_repeat=round_repeats(block_args.num_repeat,
self._global_params))
blocks.append(MBConvBlock(block_args))
if block_args.num_repeat > 1:
block_args = block_args._replace(
input_filters=block_args.output_filters, strides=[1, 1])
for _ in range(block_args.num_repeat - 1):
blocks.append(MBConvBlock(block_args))
self.blocks = nn.ModuleList(blocks)
c_in = round_filters(32, self._global_params)
self.stem = nn.Sequential(
nn.Conv2d(3, c_in, kernel_size=3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(c_in),
activation(),
)
c_in = round_filters(320, self._global_params)
c_final = round_filters(1280, self._global_params)
self.head = nn.Sequential(
nn.Conv2d(c_in,
c_final,
kernel_size=1,
stride=1,
padding=0,
bias=False),
nn.BatchNorm2d(c_final),
activation(),
)
self.avgpool = torch.nn.AdaptiveAvgPool2d(output_size=1)
self.fc = torch.nn.Linear(c_final, self._global_params.num_classes)
if self._global_params.dropout_rate > 0:
self.dropout = nn.Dropout2d(p=self._global_params.dropout_rate,
inplace=True)
else:
self.dropout = None
self._initialize_weights()
if self.use_fc_bn:
self.fc_bn = nn.BatchNorm2d(self._global_params.num_classes)
init.constant_(self.fc_bn.weight, self.fc_bn_init_scale)
init.constant_(self.fc_bn.bias, 0)
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
n = m.weight.size(1)
m.weight.data.normal_(0, 1.0 / float(n))
m.bias.data.zero_()
def forward(self, x):
x = self.stem(x)
for idx in range(len(self.blocks)):
drop_rate = self._global_params.drop_connect_rate
if drop_rate:
drop_rate *= float(idx) / len(self.blocks)
x = self.blocks[idx](x, drop_rate)
x = self.head(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
if self.dropout is not None:
x = self.dropout(x)
x = self.fc(x)
if self.use_fc_bn and x.size(0) > 1:
x = self.fc_bn(x.view(x.size(0), -1, 1, 1))
x = x.view(x.size(0), -1)
return x
def efficientnet_b0(override_params=None, override_block=None, **kwargs):
model_name = 'efficientnet_b0'
blocks_args, global_params = get_model_params(model_name, override_params,
override_block)
model = EfficientNet(blocks_args, global_params, **kwargs)
return model
def efficientnet_b1(override_params=None, **kwargs):
model_name = 'efficientnet_b1'
blocks_args, global_params = get_model_params(model_name, override_params)
model = EfficientNet(blocks_args, global_params, **kwargs)
return model
def efficientnet_b2(override_params=None, **kwargs):
model_name = 'efficientnet_b2'
blocks_args, global_params = get_model_params(model_name, override_params)
model = EfficientNet(blocks_args, global_params, **kwargs)
return model
def efficientnet_b3(override_params=None, **kwargs):
model_name = 'efficientnet_b3'
blocks_args, global_params = get_model_params(model_name, override_params)
model = EfficientNet(blocks_args, global_params, **kwargs)
return model
def efficientnet_b4(override_params=None, **kwargs):
model_name = 'efficientnet_b4'
blocks_args, global_params = get_model_params(model_name, override_params)
model = EfficientNet(blocks_args, global_params, **kwargs)
return model
def efficientnet_b5(override_params=None, **kwargs):
model_name = 'efficientnet_b5'
blocks_args, global_params = get_model_params(model_name, override_params)
model = EfficientNet(blocks_args, global_params, **kwargs)
return model
def efficientnet_b6(override_params=None, **kwargs):
model_name = 'efficientnet_b6'
blocks_args, global_params = get_model_params(model_name, override_params)
model = EfficientNet(blocks_args, global_params, **kwargs)
return model
def efficientnet_b7(override_params=None, **kwargs):
model_name = 'efficientnet_b7'
blocks_args, global_params = get_model_params(model_name, override_params)
model = EfficientNet(blocks_args, global_params, **kwargs)
return model
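# Usage sketch (added for illustration, not part of the committed file):
# EfficientNet-B0 at its nominal 224x224 resolution; the resolutions for the
# other variants are listed in efficientnet_params above.
if __name__ == '__main__':
    model = efficientnet_b0()
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])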
import math
import torch
import torch.nn as nn
import torch._utils
import torch.nn.functional as F
__all__ = ['HRNet']
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes,
planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes,
planes * self.expansion,
kernel_size=1,
bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class HighResolutionModule(nn.Module):
def __init__(self,
num_branches,
blocks,
num_blocks,
num_inchannels,
num_channels,
fuse_method,
multi_scale_output=True):
super(HighResolutionModule, self).__init__()
self._check_branches(num_branches, blocks, num_blocks, num_inchannels,
num_channels)
self.num_inchannels = num_inchannels
self.fuse_method = fuse_method
self.num_branches = num_branches
self.multi_scale_output = multi_scale_output
self.branches = self._make_branches(num_branches, blocks, num_blocks,
num_channels)
self.fuse_layers = self._make_fuse_layers()
self.relu = nn.ReLU(False)
def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels,
num_channels):
if num_branches != len(num_blocks):
error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
num_branches, len(num_blocks))
raise ValueError(error_msg)
if num_branches != len(num_channels):
error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
num_branches, len(num_channels))
raise ValueError(error_msg)
if num_branches != len(num_inchannels):
error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
num_branches, len(num_inchannels))
raise ValueError(error_msg)
def _make_one_branch(self,
branch_index,
block,
num_blocks,
num_channels,
stride=1):
downsample = None
if stride != 1 or \
self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.num_inchannels[branch_index],
num_channels[branch_index] * block.expansion,
kernel_size=1,
stride=stride,
bias=False),
nn.BatchNorm2d(num_channels[branch_index] * block.expansion),
)
layers = []
layers.append(
block(self.num_inchannels[branch_index],
num_channels[branch_index], stride, downsample))
self.num_inchannels[branch_index] = \
num_channels[branch_index] * block.expansion
for i in range(1, num_blocks[branch_index]):
layers.append(
block(self.num_inchannels[branch_index],
num_channels[branch_index]))
return nn.Sequential(*layers)
def _make_branches(self, num_branches, block, num_blocks, num_channels):
branches = []
for i in range(num_branches):
branches.append(
self._make_one_branch(i, block, num_blocks, num_channels))
return nn.ModuleList(branches)
def _make_fuse_layers(self):
if self.num_branches == 1:
return None
num_branches = self.num_branches
num_inchannels = self.num_inchannels
fuse_layers = []
for i in range(num_branches if self.multi_scale_output else 1):
fuse_layer = []
for j in range(num_branches):
if j > i:
fuse_layer.append(
nn.Sequential(
nn.Conv2d(num_inchannels[j],
num_inchannels[i],
1,
1,
0,
bias=False),
nn.BatchNorm2d(num_inchannels[i]),
nn.Upsample(scale_factor=2**(j - i),
mode='nearest')))
elif j == i:
fuse_layer.append(None)
else:
conv3x3s = []
for k in range(i - j):
if k == i - j - 1:
num_outchannels_conv3x3 = num_inchannels[i]
conv3x3s.append(
nn.Sequential(
nn.Conv2d(num_inchannels[j],
num_outchannels_conv3x3,
3,
2,
1,
bias=False),
nn.BatchNorm2d(num_outchannels_conv3x3)))
else:
num_outchannels_conv3x3 = num_inchannels[j]
conv3x3s.append(
nn.Sequential(
nn.Conv2d(num_inchannels[j],
num_outchannels_conv3x3,
3,
2,
1,
bias=False),
nn.BatchNorm2d(num_outchannels_conv3x3),
nn.ReLU(False)))
fuse_layer.append(nn.Sequential(*conv3x3s))
fuse_layers.append(nn.ModuleList(fuse_layer))
return nn.ModuleList(fuse_layers)
def get_num_inchannels(self):
return self.num_inchannels
def forward(self, x):
if self.num_branches == 1:
return [self.branches[0](x[0])]
for i in range(self.num_branches):
x[i] = self.branches[i](x[i])
x_fuse = []
for i in range(len(self.fuse_layers)):
y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
for j in range(1, self.num_branches):
if i == j:
y = y + x[j]
else:
y = y + self.fuse_layers[i][j](x[j])
x_fuse.append(self.relu(y))
return x_fuse
blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck}
class HighResolutionNet(nn.Module):
def __init__(self, stages, bn=None):
super(HighResolutionNet, self).__init__()
self.conv1 = nn.Conv2d(3,
64,
kernel_size=3,
stride=2,
padding=1,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.conv2 = nn.Conv2d(64,
64,
kernel_size=3,
stride=2,
padding=1,
bias=False)
self.bn2 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.stage1_cfg = stages['STAGE1']
num_channels = self.stage1_cfg['NUM_CHANNELS'][0]
block = blocks_dict[self.stage1_cfg['BLOCK']]
num_blocks = self.stage1_cfg['NUM_BLOCKS'][0]
self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)
stage1_out_channel = block.expansion * num_channels
self.stage2_cfg = stages['STAGE2']
num_channels = self.stage2_cfg['NUM_CHANNELS']
block = blocks_dict[self.stage2_cfg['BLOCK']]
num_channels = [
num_channels[i] * block.expansion for i in range(len(num_channels))
]
self.transition1 = self._make_transition_layer([stage1_out_channel],
num_channels)
self.stage2, pre_stage_channels = self._make_stage(
self.stage2_cfg, num_channels)
self.stage3_cfg = stages['STAGE3']
num_channels = self.stage3_cfg['NUM_CHANNELS']
block = blocks_dict[self.stage3_cfg['BLOCK']]
num_channels = [
num_channels[i] * block.expansion for i in range(len(num_channels))
]
self.transition2 = self._make_transition_layer(pre_stage_channels,
num_channels)
self.stage3, pre_stage_channels = self._make_stage(
self.stage3_cfg, num_channels)
self.stage4_cfg = stages['STAGE4']
num_channels = self.stage4_cfg['NUM_CHANNELS']
block = blocks_dict[self.stage4_cfg['BLOCK']]
num_channels = [
num_channels[i] * block.expansion for i in range(len(num_channels))
]
self.transition3 = self._make_transition_layer(pre_stage_channels,
num_channels)
self.stage4, pre_stage_channels = self._make_stage(
self.stage4_cfg, num_channels, multi_scale_output=True)
# Classification Head
self.incre_modules, self.downsamp_modules, \
self.final_layer = self._make_head(pre_stage_channels)
self.classifier = nn.Linear(2048, 1000)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
n = m.weight.size(1)
m.weight.data.normal_(0, 1.0 / float(n))
m.bias.data.zero_()
def _make_head(self, pre_stage_channels):
head_block = Bottleneck
head_channels = [32, 64, 128, 256]
# Increasing the #channels on each resolution
# from C, 2C, 4C, 8C to 128, 256, 512, 1024
incre_modules = []
for i, channels in enumerate(pre_stage_channels):
incre_module = self._make_layer(head_block,
channels,
head_channels[i],
1,
stride=1)
incre_modules.append(incre_module)
incre_modules = nn.ModuleList(incre_modules)
# downsampling modules
downsamp_modules = []
for i in range(len(pre_stage_channels) - 1):
in_channels = head_channels[i] * head_block.expansion
out_channels = head_channels[i + 1] * head_block.expansion
downsamp_module = nn.Sequential(
nn.Conv2d(in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=2,
padding=1), nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True))
downsamp_modules.append(downsamp_module)
downsamp_modules = nn.ModuleList(downsamp_modules)
final_layer = nn.Sequential(
nn.Conv2d(in_channels=head_channels[3] * head_block.expansion,
out_channels=2048,
kernel_size=1,
stride=1,
padding=0), nn.BatchNorm2d(2048), nn.ReLU(inplace=True))
return incre_modules, downsamp_modules, final_layer
def _make_transition_layer(self, num_channels_pre_layer,
num_channels_cur_layer):
num_branches_cur = len(num_channels_cur_layer)
num_branches_pre = len(num_channels_pre_layer)
transition_layers = []
for i in range(num_branches_cur):
if i < num_branches_pre:
if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
transition_layers.append(
nn.Sequential(
nn.Conv2d(num_channels_pre_layer[i],
num_channels_cur_layer[i],
3,
1,
1,
bias=False),
nn.BatchNorm2d(num_channels_cur_layer[i]),
nn.ReLU(inplace=True)))
else:
transition_layers.append(None)
else:
conv3x3s = []
for j in range(i + 1 - num_branches_pre):
inchannels = num_channels_pre_layer[-1]
outchannels = num_channels_cur_layer[i] \
if j == i-num_branches_pre else inchannels
conv3x3s.append(
nn.Sequential(
nn.Conv2d(inchannels,
outchannels,
3,
2,
1,
bias=False), nn.BatchNorm2d(outchannels),
nn.ReLU(inplace=True)))
transition_layers.append(nn.Sequential(*conv3x3s))
return nn.ModuleList(transition_layers)
def _make_layer(self, block, inplanes, planes, blocks, stride=1):
downsample = None
if stride != 1 or inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(inplanes, planes, stride, downsample))
inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(inplanes, planes))
return nn.Sequential(*layers)
def _make_stage(self,
layer_config,
num_inchannels,
multi_scale_output=True):
num_modules = layer_config['NUM_MODULES']
num_branches = layer_config['NUM_BRANCHES']
num_blocks = layer_config['NUM_BLOCKS']
num_channels = layer_config['NUM_CHANNELS']
block = blocks_dict[layer_config['BLOCK']]
fuse_method = layer_config['FUSE_METHOD']
modules = []
for i in range(num_modules):
            # multi_scale_output is only used in the last module
if not multi_scale_output and i == num_modules - 1:
reset_multi_scale_output = False
else:
reset_multi_scale_output = True
modules.append(
HighResolutionModule(num_branches, block, num_blocks,
num_inchannels, num_channels, fuse_method,
reset_multi_scale_output))
num_inchannels = modules[-1].get_num_inchannels()
return nn.Sequential(*modules), num_inchannels
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
x = self.layer1(x)
x_list = []
for i in range(self.stage2_cfg['NUM_BRANCHES']):
if self.transition1[i] is not None:
x_list.append(self.transition1[i](x))
else:
x_list.append(x)
y_list = self.stage2(x_list)
x_list = []
for i in range(self.stage3_cfg['NUM_BRANCHES']):
if self.transition2[i] is not None:
x_list.append(self.transition2[i](y_list[-1]))
else:
x_list.append(y_list[i])
y_list = self.stage3(x_list)
x_list = []
for i in range(self.stage4_cfg['NUM_BRANCHES']):
if self.transition3[i] is not None:
x_list.append(self.transition3[i](y_list[-1]))
else:
x_list.append(y_list[i])
y_list = self.stage4(x_list)
# Classification Head
y = self.incre_modules[0](y_list[0])
for i in range(len(self.downsamp_modules)):
y = self.incre_modules[i+1](y_list[i+1]) + \
self.downsamp_modules[i](y)
y = self.final_layer(y)
if torch._C._get_tracing_state():
y = y.flatten(start_dim=2).mean(dim=2)
else:
y = F.avg_pool2d(y, kernel_size=y.size()[2:]).view(y.size(0), -1)
y = self.classifier(y)
return y
def HRNet(**kwargs):
model = HighResolutionNet(**kwargs)
return model
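# Usage sketch (added for illustration, not part of the committed file): the stage
# dictionary below follows the usual HRNet-W18 classification settings; these
# numbers are an assumption on my part and are not defined anywhere in this
# commit, so adjust them to whatever config the training scripts actually pass.
if __name__ == '__main__':
    hrnet_w18_stages = {
        'STAGE1': {'NUM_MODULES': 1, 'NUM_BRANCHES': 1, 'BLOCK': 'BOTTLENECK',
                   'NUM_BLOCKS': [4], 'NUM_CHANNELS': [64], 'FUSE_METHOD': 'SUM'},
        'STAGE2': {'NUM_MODULES': 1, 'NUM_BRANCHES': 2, 'BLOCK': 'BASIC',
                   'NUM_BLOCKS': [4, 4], 'NUM_CHANNELS': [18, 36], 'FUSE_METHOD': 'SUM'},
        'STAGE3': {'NUM_MODULES': 4, 'NUM_BRANCHES': 3, 'BLOCK': 'BASIC',
                   'NUM_BLOCKS': [4, 4, 4], 'NUM_CHANNELS': [18, 36, 72], 'FUSE_METHOD': 'SUM'},
        'STAGE4': {'NUM_MODULES': 3, 'NUM_BRANCHES': 4, 'BLOCK': 'BASIC',
                   'NUM_BLOCKS': [4, 4, 4, 4], 'NUM_CHANNELS': [18, 36, 72, 144], 'FUSE_METHOD': 'SUM'},
    }
    model = HRNet(stages=hrnet_w18_stages)
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])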
import torch
import torch.nn as nn
__all__ = ['inception_resnet_v1', 'inception_resnet_v2']
class BasicConv2d(nn.Module):
def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0):
super(BasicConv2d, self).__init__()
self.conv = nn.Conv2d(in_planes,
out_planes,
kernel_size=kernel_size,
stride=stride,
padding=padding,
bias=False)
self.bn = nn.BatchNorm2d(out_planes,
eps=0.001,
momentum=0.1,
affine=True)
self.relu = nn.ReLU(inplace=False)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.relu(x)
return x
class Stem_tail_v1(nn.Module):
def __init__(self):
super(Stem_tail_v1, self).__init__()
self.maxpool = nn.MaxPool2d(3, stride=2)
self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1)
self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1)
self.conv2d_4b = BasicConv2d(192, 256, kernel_size=3, stride=2)
def forward(self, x):
x = self.maxpool(x)
x = self.conv2d_3b(x)
x = self.conv2d_4a(x)
x = self.conv2d_4b(x)
return x
class Stem_tail_v2(nn.Module):
def __init__(self):
super(Stem_tail_v2, self).__init__()
self.branch0_0 = nn.MaxPool2d(3, stride=2)
self.branch0_1 = BasicConv2d(64, 96, kernel_size=3, stride=2)
self.branch1_0 = nn.Sequential(
BasicConv2d(160, 64, kernel_size=1, stride=1),
BasicConv2d(64, 96, kernel_size=3, stride=1))
self.branch1_1 = nn.Sequential(
BasicConv2d(160, 64, kernel_size=1, stride=1),
BasicConv2d(64, 64, kernel_size=(1, 7), stride=1, padding=(0, 3)),
BasicConv2d(64, 64, kernel_size=(7, 1), stride=1, padding=(3, 0)),
BasicConv2d(64, 96, kernel_size=(3, 3), stride=1))
self.branch2_0 = BasicConv2d(192, 128, kernel_size=3, stride=2)
self.branch2_1 = nn.MaxPool2d(3, stride=2)
def forward(self, x):
x0_0 = self.branch0_0(x)
x0_1 = self.branch0_1(x)
x0 = torch.cat((x0_0, x0_1), 1)
x1_0 = self.branch1_0(x0)
x1_1 = self.branch1_1(x0)
x1 = torch.cat((x1_0, x1_1), 1)
x2_0 = self.branch2_0(x1)
x2_1 = self.branch2_1(x1)
out = torch.cat((x2_0, x2_1), 1)
return out
class Block35(nn.Module):
def __init__(self, scale, input_channels):
super(Block35, self).__init__()
self.scale = scale
self.branch0 = BasicConv2d(input_channels, 32, kernel_size=1, stride=1)
self.branch1 = nn.Sequential(
BasicConv2d(input_channels, 32, kernel_size=1, stride=1),
BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1))
if input_channels == 256:
conv3_1 = 32
conv3_2 = 32
else:
conv3_1 = 48
conv3_2 = 64
self.branch2 = nn.Sequential(
BasicConv2d(input_channels, 32, kernel_size=1, stride=1),
BasicConv2d(32, conv3_1, kernel_size=3, stride=1, padding=1),
BasicConv2d(conv3_1, conv3_2, kernel_size=3, stride=1, padding=1))
self.conv2d = nn.Conv2d(conv3_2 + 64,
input_channels,
kernel_size=1,
stride=1)
self.relu = nn.ReLU(inplace=False)
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
x2 = self.branch2(x)
out = torch.cat((x0, x1, x2), 1)
out = self.conv2d(out)
out = out * self.scale + x
out = self.relu(out)
return out
class Mixed_6a(nn.Module):
def __init__(self, input_channels, k, ll, m, n):
super(Mixed_6a, self).__init__()
self.branch0 = BasicConv2d(input_channels, n, kernel_size=3, stride=2)
self.branch1 = nn.Sequential(
BasicConv2d(input_channels, k, kernel_size=1, stride=1),
BasicConv2d(k, ll, kernel_size=3, stride=1, padding=1),
BasicConv2d(ll, m, kernel_size=3, stride=2))
self.branch2 = nn.MaxPool2d(3, stride=2)
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
x2 = self.branch2(x)
out = torch.cat((x0, x1, x2), 1)
return out
class Block17(nn.Module):
def __init__(self, scale, input_channels, m):
super(Block17, self).__init__()
self.scale = scale
self.branch0 = BasicConv2d(input_channels,
m // 2,
kernel_size=1,
stride=1)
step1 = (m // 2 + 128) // 2
self.branch1 = nn.Sequential(
BasicConv2d(input_channels, 128, kernel_size=1, stride=1),
BasicConv2d(128,
step1,
kernel_size=(1, 7),
stride=1,
padding=(0, 3)),
BasicConv2d(step1,
m // 2,
kernel_size=(7, 1),
stride=1,
padding=(3, 0)))
self.conv2d = nn.Conv2d(m, input_channels, kernel_size=1, stride=1)
self.relu = nn.ReLU(inplace=False)
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
out = torch.cat((x0, x1), 1)
out = self.conv2d(out)
out = out * self.scale + x
out = self.relu(out)
return out
class Mixed_7a(nn.Module):
def __init__(self, input_channels, output_channels):
super(Mixed_7a, self).__init__()
self.branch0 = nn.Sequential(
BasicConv2d(input_channels, 256, kernel_size=1, stride=1),
BasicConv2d(256, 384, kernel_size=3, stride=2))
channels_middle = (output_channels + 256) // 2
self.branch1 = nn.Sequential(
BasicConv2d(input_channels, 256, kernel_size=1, stride=1),
BasicConv2d(256, channels_middle, kernel_size=3, stride=2))
self.branch2 = nn.Sequential(
BasicConv2d(input_channels, 256, kernel_size=1, stride=1),
BasicConv2d(256,
channels_middle,
kernel_size=3,
stride=1,
padding=1),
BasicConv2d(channels_middle,
output_channels,
kernel_size=3,
stride=2))
self.branch3 = nn.MaxPool2d(3, stride=2)
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
x2 = self.branch2(x)
x3 = self.branch3(x)
out = torch.cat((x0, x1, x2, x3), 1)
return out
class Block8(nn.Module):
def __init__(self, scale, input_channels, ll, noReLU=False):
super(Block8, self).__init__()
self.scale = scale
self.noReLU = noReLU
self.branch0 = BasicConv2d(input_channels,
192,
kernel_size=1,
stride=1)
step1 = (ll + 192) // 2
self.branch1 = nn.Sequential(
BasicConv2d(input_channels, 192, kernel_size=1, stride=1),
BasicConv2d(192,
step1,
kernel_size=(1, 3),
stride=1,
padding=(0, 1)),
BasicConv2d(step1,
ll,
kernel_size=(3, 1),
stride=1,
padding=(1, 0)))
self.conv2d = nn.Conv2d(192 + ll,
input_channels,
kernel_size=1,
stride=1)
if not self.noReLU:
self.relu = nn.ReLU(inplace=False)
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
out = torch.cat((x0, x1), 1)
out = self.conv2d(out)
out = out * self.scale + x
if not self.noReLU:
out = self.relu(out)
return out
class InceptionResNet(nn.Module):
def __init__(self,
channel35,
k,
l_index,
m,
n,
num_classes=1000,
num_repeat=[10, 20, 9],
scale=[0.17, 0.10, 0.20]):
super(InceptionResNet, self).__init__()
        # Special attributes
self.input_space = None
self.input_size = (299, 299, 3)
self.mean = None
self.std = None
# Modules
self.stem_head = nn.Sequential(
BasicConv2d(3, 32, kernel_size=3, stride=2),
BasicConv2d(32, 32, kernel_size=3, stride=1),
BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1),
)
if channel35 == 256: # inception_resnet_v1
self.stem_tail = Stem_tail_v1()
else: # inception_resnet_v2 320
self.stem_tail = Stem_tail_v2()
sequence_list0 = []
for j in range(num_repeat[0]):
sequence_list0.append(
Block35(scale=scale[0], input_channels=channel35))
self.repeat0 = nn.Sequential(*sequence_list0)
self.mixed_6a = Mixed_6a(input_channels=channel35,
k=k,
ll=l_index,
m=m,
n=n)
repeat1_input = channel35 + m + n
sequence_list1 = []
for j in range(num_repeat[1]):
sequence_list1.append(
Block17(scale=scale[1], input_channels=repeat1_input, m=m))
self.repeat1 = nn.Sequential(*sequence_list1)
self.mixed_7a = Mixed_7a(input_channels=repeat1_input,
output_channels=channel35)
repeat2_input = repeat1_input + channel35 * 3 // 2 + 512
sequence_list2 = []
for j in range(num_repeat[2]):
sequence_list2.append(
Block8(scale=scale[2],
input_channels=repeat2_input,
ll=l_index))
self.repeat2 = nn.Sequential(*sequence_list2)
self.block8 = Block8(scale=scale[2],
input_channels=repeat2_input,
ll=l_index,
noReLU=True)
self.conv2d_7b = BasicConv2d(repeat2_input,
1536,
kernel_size=1,
stride=1)
self.avgpool_1a = nn.AvgPool2d(8)
self.drop = nn.Dropout(p=0.2)
self.last_linear = nn.Linear(1536, num_classes)
def features(self, input):
x = self.stem_head(input)
x = self.stem_tail(x)
x = self.repeat0(x)
x = self.mixed_6a(x)
x = self.repeat1(x)
x = self.mixed_7a(x)
x = self.repeat2(x)
x = self.block8(x)
x = self.conv2d_7b(x)
return x
def logits(self, features):
x = self.avgpool_1a(features)
        x = self.drop(x)
x = x.view(x.size(0), -1)
x = self.last_linear(x)
return x
def forward(self, input):
x = self.features(input)
x = self.logits(x)
return x
def inception_resnet_v1(**kwargs):
model = InceptionResNet(channel35=256,
k=192,
l_index=192,
m=256,
n=384,
**kwargs)
return model
def inception_resnet_v2(**kwargs):
model = InceptionResNet(channel35=320,
k=256,
l_index=256,
m=384,
n=384,
**kwargs)
return model
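# Usage sketch (added for illustration, not part of the committed file):
# Inception-ResNet-v2 expects 299x299 inputs (see self.input_size above); the
# same call also works for inception_resnet_v1.
if __name__ == '__main__':
    model = inception_resnet_v2(num_classes=1000)
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 299, 299))
    print(out.shape)  # expected: torch.Size([1, 1000])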
import torch
import torch.nn as nn
__all__ = ['InceptionV1', 'inception_v1']
# modified according to https://github.com/minghao-wu/DeepLearningFromScratch/blob/master/GoogLeNet/GoogLeNet.py
# adds auxiliary classifiers (aux_classifier) and dropout
def inception_v1(**kwargs):
return InceptionV1(**kwargs)
class Inception(nn.Module):
def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5,
pool_planes):
super(Inception, self).__init__()
# 1x1 conv branch
self.b1 = nn.Sequential(
nn.Conv2d(in_planes, n1x1, kernel_size=1),
nn.ReLU(True),
)
# 1x1 conv -> 3x3 conv branch
self.b2 = nn.Sequential(
nn.Conv2d(in_planes, n3x3red, kernel_size=1),
nn.ReLU(True),
nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1),
nn.ReLU(True),
)
# 1x1 conv -> 5x5 conv branch
self.b3 = nn.Sequential(
nn.Conv2d(in_planes, n5x5red, kernel_size=1),
nn.ReLU(True),
nn.Conv2d(n5x5red, n5x5, kernel_size=5, padding=2),
nn.ReLU(True),
)
# 3x3 pool -> 1x1 conv branch
self.b4 = nn.Sequential(
nn.MaxPool2d(3, stride=1, padding=1),
nn.Conv2d(in_planes, pool_planes, kernel_size=1),
nn.ReLU(True),
)
def forward(self, x):
y1 = self.b1(x)
y2 = self.b2(x)
y3 = self.b3(x)
y4 = self.b4(x)
return torch.cat([y1, y2, y3, y4], 1)
class AuxClassifier(nn.Module):
def __init__(self, in_channels, num_classes):
super(AuxClassifier, self).__init__()
self.pool1 = nn.AvgPool2d(kernel_size=5, stride=3)
self.conv1 = nn.Sequential(
nn.Conv2d(in_channels=in_channels, out_channels=128,
kernel_size=1), nn.ReLU(inplace=True))
self.fc1 = nn.Sequential(
nn.Linear(in_features=4 * 4 * 128, out_features=1024),
nn.ReLU(inplace=True))
self.drop = nn.Dropout(p=0.3)
self.fc2 = nn.Linear(in_features=1024, out_features=num_classes)
def forward(self, x):
x = self.pool1(x)
x = self.conv1(x)
x = x.view(x.size(0), -1)
x = self.fc1(x)
x = self.drop(x)
x = self.fc2(x)
        return x
class InceptionV1(nn.Module):
def __init__(self, num_classes=1000, aux_classifier=True):
super(InceptionV1, self).__init__()
self.aux_classifier = aux_classifier
self.c1 = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
nn.ReLU(True),
)
self.c2 = nn.Sequential(
nn.Conv2d(64, 64, kernel_size=1, stride=1),
nn.ReLU(True),
)
self.c3 = nn.Sequential(
nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1),
nn.ReLU(True),
)
self.a3 = Inception(192, 64, 96, 128, 16, 32, 32)
self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)
self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
self.lrn = nn.LocalResponseNorm(2)
self.a4 = Inception(480, 192, 96, 208, 16, 48, 64)
if aux_classifier:
self.aux0 = AuxClassifier(in_channels=512, num_classes=num_classes)
self.b4 = Inception(512, 160, 112, 224, 24, 64, 64)
self.c4 = Inception(512, 128, 128, 256, 24, 64, 64)
self.d4 = Inception(512, 112, 144, 288, 32, 64, 64)
if aux_classifier:
self.aux1 = AuxClassifier(in_channels=528, num_classes=num_classes)
self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)
self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.drop = nn.Dropout(p=0.4)
self.linear = nn.Linear(1024, num_classes)
def forward(self, x):
out = self.c1(x)
out = self.maxpool(out)
out = self.lrn(out)
out = self.c2(out)
out = self.c3(out)
out = self.lrn(out)
out = self.maxpool(out)
out = self.a3(out)
out = self.b3(out)
out = self.maxpool(out)
out = self.a4(out)
if self.training and self.aux_classifier:
output0 = self.aux0(out)
out = self.b4(out)
out = self.c4(out)
out = self.d4(out)
if self.training and self.aux_classifier:
output1 = self.aux1(out)
out = self.e4(out)
out = self.maxpool(out)
out = self.a5(out)
out = self.b5(out)
out = self.avgpool(out)
out = out.view(out.size(0), -1)
out = self.drop(out)
out = self.linear(out)
if self.training and self.aux_classifier:
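            # Fold both auxiliary outputs into the main logits with weight 0.3;
            # the GoogLeNet paper instead weights the auxiliary losses by 0.3.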
out += (output0 + output1) * 0.3
return out
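# A minimal smoke test (an addition mirroring the __main__ check used for
# inception_v3 below); it assumes a 224x224 RGB input and only verifies that a
# forward pass produces [batch, num_classes] logits.
if __name__ == '__main__':
    import torch
    model = inception_v1(num_classes=1000)
    model.eval()  # skip the training-only auxiliary-classifier branches
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])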
import torch
import torch.nn as nn
__all__ = ['InceptionV2', 'inception_v2']
# Modified from
# https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/bninception.py
# Adds batch normalization, factorizes each 5x5 conv into two stacked 3x3 convs,
# and drops the standalone max-pool at 3c and 4e (those blocks downsample
# internally via their stride-2 branches).
def inception_v2(**kwargs):
return InceptionV2(**kwargs)
class Inception_2(nn.Module):
def __init__(self,
in_planes,
n1x1,
n3x3red,
n3x3,
n5x5red,
n5x5,
pool_planes,
pool_type='avg'):
super(Inception_2, self).__init__()
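        # Output channels after concatenation: n1x1 + n3x3 + n5x5 + pool_planes
        # (the "5x5" branch is implemented as two stacked 3x3 convolutions).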
# 1x1 conv branch
self.b1 = nn.Sequential(
nn.Conv2d(in_planes, n1x1, kernel_size=1),
nn.BatchNorm2d(n1x1, affine=True),
nn.ReLU(True),
)
# 1x1 conv -> 3x3 conv branch
self.b2 = nn.Sequential(
nn.Conv2d(in_planes, n3x3red, kernel_size=1),
nn.BatchNorm2d(n3x3red, affine=True),
nn.ReLU(True),
nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1),
nn.BatchNorm2d(n3x3, affine=True),
nn.ReLU(True),
)
        # 1x1 conv -> two 3x3 convs (factorized 5x5) branch
self.b3 = nn.Sequential(
nn.Conv2d(in_planes, n5x5red, kernel_size=1),
nn.BatchNorm2d(n5x5red, affine=True),
nn.ReLU(True),
nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1),
nn.BatchNorm2d(n5x5, affine=True),
nn.ReLU(True),
nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1),
nn.BatchNorm2d(n5x5, affine=True),
nn.ReLU(True),
)
# 3x3 pool
if pool_type == 'avg':
self.b4 = nn.Sequential(nn.AvgPool2d(3, stride=1, padding=1), )
else:
self.b4 = nn.Sequential(nn.MaxPool2d(3, stride=1, padding=1), )
        # 1x1 conv projection applied to the pooled branch
self.b5 = nn.Sequential(
nn.Conv2d(in_planes, pool_planes, kernel_size=1),
nn.BatchNorm2d(pool_planes, affine=True),
nn.ReLU(True),
)
def forward(self, x):
y1 = self.b1(x)
y2 = self.b2(x)
y3 = self.b3(x)
y4_pool = self.b4(x)
y4 = self.b5(y4_pool)
return torch.cat([y1, y2, y3, y4], 1)
class Inception_through(nn.Module):
def __init__(self, in_planes, n3x3red, n3x3, n3x3red_double, n3x3_double):
super(Inception_through, self).__init__()
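        # Downsampling block: every branch halves the spatial size (stride 2),
        # so the output has n3x3 + n3x3_double + in_planes channels.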
        # 1x1 conv -> 3x3 conv branch (stride 2)
self.b2 = nn.Sequential(
nn.Conv2d(in_planes, n3x3red, kernel_size=1),
nn.BatchNorm2d(n3x3red, affine=True),
nn.ReLU(True),
nn.Conv2d(n3x3red, n3x3, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(n3x3, affine=True),
nn.ReLU(True),
)
        # 1x1 conv -> two 3x3 convs (factorized 5x5) branch; the second 3x3 has stride 2
self.b3 = nn.Sequential(
nn.Conv2d(in_planes, n3x3red_double, kernel_size=1),
nn.BatchNorm2d(n3x3red_double, affine=True),
nn.ReLU(True),
nn.Conv2d(n3x3red_double, n3x3_double, kernel_size=3, padding=1),
nn.BatchNorm2d(n3x3_double, affine=True),
nn.ReLU(True),
nn.Conv2d(n3x3_double,
n3x3_double,
kernel_size=3,
stride=2,
padding=1),
nn.BatchNorm2d(n3x3_double, affine=True),
nn.ReLU(True),
)
        # 3x3 max-pool branch (stride 2)
self.b4 = nn.Sequential(nn.MaxPool2d(3, stride=2, padding=1), )
def forward(self, x):
y2 = self.b2(x)
y3 = self.b3(x)
y4 = self.b4(x)
return torch.cat([y2, y3, y4], 1)
class InceptionV2(nn.Module):
def __init__(self, num_classes=1000):
super(InceptionV2, self).__init__()
self.c1 = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
nn.ReLU(True),
)
self.c2 = nn.Sequential(
nn.Conv2d(64, 64, kernel_size=1, stride=1),
nn.ReLU(True),
nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1),
nn.ReLU(True),
)
self.a3 = Inception_2(192, 64, 64, 64, 64, 96, 32)
self.b3 = Inception_2(256, 64, 64, 96, 64, 96, 64)
self.c3 = Inception_through(320, 128, 160, 64, 96)
self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
self.lrn = nn.LocalResponseNorm(2)
self.a4 = Inception_2(576, 224, 64, 96, 96, 128, 128)
self.b4 = Inception_2(576, 192, 96, 128, 96, 128, 128)
self.c4 = Inception_2(576, 160, 128, 160, 128, 160, 128)
self.d4 = Inception_2(608, 96, 128, 192, 160, 192, 128)
self.e4 = Inception_through(608, 128, 192, 192, 256)
self.a5 = Inception_2(1056, 352, 192, 320, 160, 224, 128)
self.b5 = Inception_2(1024, 352, 192, 320, 192, 224, 128, 'max')
self.avgpool = nn.AvgPool2d(7, stride=1)
self.linear = nn.Linear(1024, num_classes)
def forward(self, x):
out = self.c1(x)
out = self.maxpool(out)
out = self.lrn(out)
out = self.c2(out)
out = self.lrn(out)
out = self.maxpool(out)
out = self.a3(out)
out = self.b3(out)
out = self.c3(out)
out = self.a4(out)
out = self.b4(out)
out = self.c4(out)
out = self.d4(out)
out = self.e4(out)
out = self.a5(out)
out = self.b5(out)
out = self.avgpool(out)
out = out.view(out.size(0), -1)
out = self.linear(out)
return out
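# A minimal smoke test (an addition in the same style as the inception_v3
# check further below); the 224x224 input is an assumption consistent with the
# final 7x7 average pool.
if __name__ == '__main__':
    model = inception_v2(num_classes=1000)
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 1000])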
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
__all__ = ['InceptionV3', 'inception_v3']
# Modified from https://github.com/JJBOY/CNN-repository/blob/master/model/Inception_v3.py
def inception_v3(**kwargs):
return InceptionV3(aux_logits=False, **kwargs)
class InceptionV3(nn.Module):
def __init__(self, num_classes=1000, aux_logits=True):
super(InceptionV3, self).__init__()
self.aux_logits = aux_logits
self.Conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, stride=2)
self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3)
self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1)
self.Conv2d_3a_3x3 = BasicConv2d(64, 80, kernel_size=3)
self.Conv2d_3b_3x3 = BasicConv2d(80, 192, kernel_size=3, stride=2)
self.Conv2d_3c_3x3 = BasicConv2d(192, 288, kernel_size=3, padding=1)
self.Mixed_5b = InceptionA(288, pool_features=64)
self.Mixed_5c = InceptionA(288, pool_features=64)
self.Mixed_5d = InceptionA(288, pool_features=64)
self.Mixed_5to6 = InceptionB(288)
self.Mixed_6a = InceptionC(768, channels_7x7=128)
self.Mixed_6b = InceptionC(768, channels_7x7=160)
self.Mixed_6c = InceptionC(768, channels_7x7=160)
self.Mixed_6d = InceptionC(768, channels_7x7=160)
self.Mixed_6e = InceptionC(768, channels_7x7=192)
if aux_logits:
self.AuxLogits = InceptionAux(768, num_classes)
self.Mixed_7a = InceptionD(768)
self.Mixed_7b = InceptionE(1280)
self.Mixed_7c = InceptionE(2048)
self.avg_pool = nn.AvgPool2d(8)
self.fc = nn.Linear(2048, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
'''
if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
import scipy.stats as stats
stddev = m.stddev if hasattr(m, 'stddev') else 0.1
X = stats.truncnorm(-2, 2, scale=stddev)
values = torch.Tensor(X.rvs(m.weight.numel()))
values = values.view(m.weight.size())
m.weight.data = values
#m.weight.data.copy_(values)
'''
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def forward(self, x):
# 299 x 299 x 3
x = self.Conv2d_1a_3x3(x)
# 149 x 149 x 32
x = self.Conv2d_2a_3x3(x)
# 147 x 147 x 32
x = self.Conv2d_2b_3x3(x)
# 147 x 147 x 64
x = F.max_pool2d(x, kernel_size=3, stride=2)
# 73 x 73 x 64
x = self.Conv2d_3a_3x3(x)
# 71 x 71 x 80
x = self.Conv2d_3b_3x3(x)
# 35 x 35 x 192
x = self.Conv2d_3c_3x3(x)
# 35 x 35 x 288
x = self.Mixed_5b(x)
# 35 x 35 x 288
x = self.Mixed_5c(x)
# 35 x 35 x 288
x = self.Mixed_5d(x)
# 35 x 35 x 288
x = self.Mixed_5to6(x)
# 17 x 17 x 768
x = self.Mixed_6a(x)
# 17 x 17 x 768
x = self.Mixed_6b(x)
# 17 x 17 x 768
x = self.Mixed_6c(x)
# 17 x 17 x 768
x = self.Mixed_6d(x)
# 17 x 17 x 768
x = self.Mixed_6e(x)
# 17 x 17 x 768
if self.training and self.aux_logits:
aux = self.AuxLogits(x)
# 17 x 17 x 768
x = self.Mixed_7a(x)
# 8 x 8 x 1280
x = self.Mixed_7b(x)
# 8 x 8 x 2048
x = self.Mixed_7c(x)
# 8 x 8 x 2048
x = self.avg_pool(x)
# x = F.avg_pool2d(x, kernel_size=8)
# 1 x 1 x 2048
# x = F.dropout(x, training=self.training)
# 1 x 1 x 2048
x = x.view(x.size(0), -1)
# 2048
x = self.fc(x)
# 1000 (num_classes)
if self.training and self.aux_logits:
return x, aux
return x
class InceptionA(nn.Module):
def __init__(self, in_channels, pool_features):
super(InceptionA, self).__init__()
self.branch1x1 = BasicConv2d(in_channels, 64, kernel_size=1)
self.branch5x5_1 = BasicConv2d(in_channels, 48, kernel_size=1)
self.branch5x5_2 = BasicConv2d(48, 64, kernel_size=5, padding=2)
self.branch3x3dbl_1 = BasicConv2d(in_channels, 64, kernel_size=1)
self.branch3x3dbl_2 = BasicConv2d(64, 96, kernel_size=3, padding=1)
self.branch3x3dbl_3 = BasicConv2d(96, 96, kernel_size=3, padding=1)
self.branch_pool = BasicConv2d(in_channels,
pool_features,
kernel_size=1)
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch5x5 = self.branch5x5_1(x)
branch5x5 = self.branch5x5_2(branch5x5)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
return torch.cat(outputs, 1)
class InceptionB(nn.Module):
def __init__(self, in_channels):
super(InceptionB, self).__init__()
self.branch3x3_1 = BasicConv2d(in_channels, 64, kernel_size=1)
self.branch3x3_2 = BasicConv2d(64, 384, kernel_size=3, stride=2)
self.branch3x3dbl_1 = BasicConv2d(in_channels, 64, kernel_size=1)
self.branch3x3dbl_2 = BasicConv2d(64, 96, kernel_size=3, padding=1)
self.branch3x3dbl_3 = BasicConv2d(96, 96, kernel_size=3, stride=2)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2)
def forward(self, x):
branch3x3 = self.branch3x3_1(x)
branch3x3 = self.branch3x3_2(branch3x3)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
branch_pool = self.maxpool(x)
outputs = [branch3x3, branch3x3dbl, branch_pool]
return torch.cat(outputs, 1)
class InceptionC(nn.Module):
def __init__(self, in_channels, channels_7x7):
super(InceptionC, self).__init__()
self.branch1x1 = BasicConv2d(in_channels, 192, kernel_size=1)
c7 = channels_7x7
self.branch7x7_1 = BasicConv2d(in_channels, c7, kernel_size=1)
self.branch7x7_2 = BasicConv2d(c7,
c7,
kernel_size=(1, 7),
padding=(0, 3))
self.branch7x7_3 = BasicConv2d(c7,
192,
kernel_size=(7, 1),
padding=(3, 0))
self.branch7x7dbl_1 = BasicConv2d(in_channels, c7, kernel_size=1)
self.branch7x7dbl_2 = BasicConv2d(c7,
c7,
kernel_size=(7, 1),
padding=(3, 0))
self.branch7x7dbl_3 = BasicConv2d(c7,
c7,
kernel_size=(1, 7),
padding=(0, 3))
self.branch7x7dbl_4 = BasicConv2d(c7,
c7,
kernel_size=(7, 1),
padding=(3, 0))
self.branch7x7dbl_5 = BasicConv2d(c7,
192,
kernel_size=(1, 7),
padding=(0, 3))
self.branch_pool = BasicConv2d(in_channels, 192, kernel_size=1)
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch7x7 = self.branch7x7_1(x)
branch7x7 = self.branch7x7_2(branch7x7)
branch7x7 = self.branch7x7_3(branch7x7)
branch7x7dbl = self.branch7x7dbl_1(x)
branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
return torch.cat(outputs, 1)
class InceptionD(nn.Module):
def __init__(self, in_channels):
super(InceptionD, self).__init__()
self.branch3x3_1 = BasicConv2d(in_channels, 192, kernel_size=1)
self.branch3x3_2 = BasicConv2d(192, 320, kernel_size=3, stride=2)
self.branch7x7x3_1 = BasicConv2d(in_channels, 192, kernel_size=1)
self.branch7x7x3_2 = BasicConv2d(192,
192,
kernel_size=(1, 7),
padding=(0, 3))
self.branch7x7x3_3 = BasicConv2d(192,
192,
kernel_size=(7, 1),
padding=(3, 0))
self.branch7x7x3_4 = BasicConv2d(192, 192, kernel_size=3, stride=2)
def forward(self, x):
branch3x3 = self.branch3x3_1(x)
branch3x3 = self.branch3x3_2(branch3x3)
branch7x7x3 = self.branch7x7x3_1(x)
branch7x7x3 = self.branch7x7x3_2(branch7x7x3)
branch7x7x3 = self.branch7x7x3_3(branch7x7x3)
branch7x7x3 = self.branch7x7x3_4(branch7x7x3)
branch_pool = F.max_pool2d(x, kernel_size=3, stride=2)
outputs = [branch3x3, branch7x7x3, branch_pool]
return torch.cat(outputs, 1)
class InceptionE(nn.Module):
def __init__(self, in_channels):
super(InceptionE, self).__init__()
self.branch1x1 = BasicConv2d(in_channels, 320, kernel_size=1)
self.branch3x3_1 = BasicConv2d(in_channels, 384, kernel_size=1)
self.branch3x3_2a = BasicConv2d(384,
384,
kernel_size=(1, 3),
padding=(0, 1))
self.branch3x3_2b = BasicConv2d(384,
384,
kernel_size=(3, 1),
padding=(1, 0))
self.branch3x3dbl_1 = BasicConv2d(in_channels, 448, kernel_size=1)
self.branch3x3dbl_2 = BasicConv2d(448, 384, kernel_size=3, padding=1)
self.branch3x3dbl_3a = BasicConv2d(384,
384,
kernel_size=(1, 3),
padding=(0, 1))
self.branch3x3dbl_3b = BasicConv2d(384,
384,
kernel_size=(3, 1),
padding=(1, 0))
self.branch_pool = BasicConv2d(in_channels, 192, kernel_size=1)
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch3x3 = self.branch3x3_1(x)
branch3x3 = [
self.branch3x3_2a(branch3x3),
self.branch3x3_2b(branch3x3),
]
branch3x3 = torch.cat(branch3x3, 1)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = [
self.branch3x3dbl_3a(branch3x3dbl),
self.branch3x3dbl_3b(branch3x3dbl),
]
branch3x3dbl = torch.cat(branch3x3dbl, 1)
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
return torch.cat(outputs, 1)
class InceptionAux(nn.Module):
def __init__(self, in_channels, num_classes):
super(InceptionAux, self).__init__()
self.conv0 = BasicConv2d(in_channels, 128, kernel_size=1)
self.conv1 = BasicConv2d(128, 768, kernel_size=5)
self.conv1.stddev = 0.01
self.fc = nn.Linear(768, num_classes)
self.fc.stddev = 0.001
def forward(self, x):
# 17 x 17 x 768
x = F.avg_pool2d(x, kernel_size=5, stride=3)
# 5 x 5 x 768
x = self.conv0(x)
# 5 x 5 x 128
x = self.conv1(x)
# 1 x 1 x 768
x = x.view(x.size(0), -1)
# 768
x = self.fc(x)
# 1000
return x
class BasicConv2d(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
super(BasicConv2d, self).__init__()
self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
return F.relu(x, inplace=True)
if __name__ == '__main__':
model = inception_v3()
print(model)
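    # Added forward-pass check (an assumption, not part of the original test):
    # the stem expects 299x299 inputs and the factory above builds the model
    # with aux_logits=False, so a single logits tensor is returned.
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 299, 299))
    print(out.shape)  # expected: torch.Size([1, 1000])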
import math
import torch
import torch.nn as nn
__all__ = ['InceptionV4', 'inception_v4']
class BasicConv2d(nn.Module):
def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0):
super(BasicConv2d, self).__init__()
self.conv = nn.Conv2d(in_planes,
out_planes,
kernel_size=kernel_size,
stride=stride,
padding=padding,
                              bias=False)  # bias is redundant: BatchNorm follows
self.bn = nn.BatchNorm2d(
out_planes,
eps=0.001, # value found in tensorflow
momentum=0.1, # default pytorch value
affine=True)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.relu(x)
return x
class Mixed_3a(nn.Module):
def __init__(self):
super(Mixed_3a, self).__init__()
self.maxpool = nn.MaxPool2d(3, stride=2)
self.conv = BasicConv2d(64, 96, kernel_size=3, stride=2)
def forward(self, x):
x0 = self.maxpool(x)
x1 = self.conv(x)
out = torch.cat((x0, x1), 1)
return out
class Mixed_4a(nn.Module):
def __init__(self):
super(Mixed_4a, self).__init__()
self.branch0 = nn.Sequential(
BasicConv2d(160, 64, kernel_size=1, stride=1),
BasicConv2d(64, 96, kernel_size=3, stride=1))
self.branch1 = nn.Sequential(
BasicConv2d(160, 64, kernel_size=1, stride=1),
BasicConv2d(64, 64, kernel_size=(1, 7), stride=1, padding=(0, 3)),
BasicConv2d(64, 64, kernel_size=(7, 1), stride=1, padding=(3, 0)),
BasicConv2d(64, 96, kernel_size=(3, 3), stride=1))
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
out = torch.cat((x0, x1), 1)
return out
class Mixed_5a(nn.Module):
def __init__(self):
super(Mixed_5a, self).__init__()
self.conv = BasicConv2d(192, 192, kernel_size=3, stride=2)
self.maxpool = nn.MaxPool2d(3, stride=2)
def forward(self, x):
x0 = self.conv(x)
x1 = self.maxpool(x)
out = torch.cat((x0, x1), 1)
return out
class Inception_A(nn.Module):
def __init__(self):
super(Inception_A, self).__init__()
self.branch0 = BasicConv2d(384, 96, kernel_size=1, stride=1)
self.branch1 = nn.Sequential(
BasicConv2d(384, 64, kernel_size=1, stride=1),
BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1))
self.branch2 = nn.Sequential(
BasicConv2d(384, 64, kernel_size=1, stride=1),
BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1),
BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1))
self.branch3 = nn.Sequential(
nn.AvgPool2d(3, stride=1, padding=1),
BasicConv2d(384, 96, kernel_size=1, stride=1))
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
x2 = self.branch2(x)
x3 = self.branch3(x)
out = torch.cat((x0, x1, x2, x3), 1)
return out
class Reduction_A(nn.Module):
def __init__(self):
super(Reduction_A, self).__init__()
self.branch0 = BasicConv2d(384, 384, kernel_size=3, stride=2)
self.branch1 = nn.Sequential(
BasicConv2d(384, 192, kernel_size=1, stride=1),
BasicConv2d(192, 224, kernel_size=3, stride=1, padding=1),
BasicConv2d(224, 256, kernel_size=3, stride=2))
self.branch2 = nn.MaxPool2d(3, stride=2)
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
x2 = self.branch2(x)
out = torch.cat((x0, x1, x2), 1)
return out
class Inception_B(nn.Module):
def __init__(self):
super(Inception_B, self).__init__()
self.branch0 = BasicConv2d(1024, 384, kernel_size=1, stride=1)
self.branch1 = nn.Sequential(
BasicConv2d(1024, 192, kernel_size=1, stride=1),
BasicConv2d(192, 224, kernel_size=(1, 7), stride=1,
padding=(0, 3)),
BasicConv2d(224, 256, kernel_size=(7, 1), stride=1,
padding=(3, 0)))
self.branch2 = nn.Sequential(
BasicConv2d(1024, 192, kernel_size=1, stride=1),
BasicConv2d(192, 192, kernel_size=(1, 7), stride=1,
padding=(0, 3)),
BasicConv2d(192, 224, kernel_size=(7, 1), stride=1,
padding=(3, 0)),
BasicConv2d(224, 224, kernel_size=(1, 7), stride=1,
padding=(0, 3)),
BasicConv2d(224, 256, kernel_size=(7, 1), stride=1,
padding=(3, 0)))
self.branch3 = nn.Sequential(
nn.AvgPool2d(3, stride=1, padding=1),
BasicConv2d(1024, 128, kernel_size=1, stride=1))
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
x2 = self.branch2(x)
x3 = self.branch3(x)
out = torch.cat((x0, x1, x2, x3), 1)
return out
class Reduction_B(nn.Module):
def __init__(self):
super(Reduction_B, self).__init__()
self.branch0 = nn.Sequential(
BasicConv2d(1024, 192, kernel_size=1, stride=1),
BasicConv2d(192, 192, kernel_size=3, stride=2))
self.branch1 = nn.Sequential(
BasicConv2d(1024, 256, kernel_size=1, stride=1),
BasicConv2d(256, 256, kernel_size=(1, 7), stride=1,
padding=(0, 3)),
BasicConv2d(256, 320, kernel_size=(7, 1), stride=1,
padding=(3, 0)),
BasicConv2d(320, 320, kernel_size=3, stride=2))
self.branch2 = nn.MaxPool2d(3, stride=2)
def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
x2 = self.branch2(x)
out = torch.cat((x0, x1, x2), 1)
return out
class Inception_C(nn.Module):
def __init__(self):
super(Inception_C, self).__init__()
self.branch0 = BasicConv2d(1536, 256, kernel_size=1, stride=1)
self.branch1_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1)
self.branch1_1a = BasicConv2d(384,
256,
kernel_size=(1, 3),
stride=1,
padding=(0, 1))
self.branch1_1b = BasicConv2d(384,
256,
kernel_size=(3, 1),
stride=1,
padding=(1, 0))
self.branch2_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1)
self.branch2_1 = BasicConv2d(384,
448,
kernel_size=(1, 3),
stride=1,
padding=(0, 1))
self.branch2_2 = BasicConv2d(448,
512,
kernel_size=(3, 1),
stride=1,
padding=(1, 0))
self.branch2_3a = BasicConv2d(512,
256,
kernel_size=(1, 3),
stride=1,
padding=(0, 1))
self.branch2_3b = BasicConv2d(512,
256,
kernel_size=(3, 1),
stride=1,
padding=(1, 0))
self.branch3 = nn.Sequential(
nn.AvgPool2d(3, stride=1, padding=1),
BasicConv2d(1536, 256, kernel_size=1, stride=1))
def forward(self, x):
x0 = self.branch0(x)
x1_0 = self.branch1_0(x)
x1_1a = self.branch1_1a(x1_0)
x1_1b = self.branch1_1b(x1_0)
x1 = torch.cat((x1_1a, x1_1b), 1)
x2_0 = self.branch2_0(x)
x2_1 = self.branch2_1(x2_0)
x2_2 = self.branch2_2(x2_1)
x2_3a = self.branch2_3a(x2_2)
x2_3b = self.branch2_3b(x2_2)
x2 = torch.cat((x2_3a, x2_3b), 1)
x3 = self.branch3(x)
out = torch.cat((x0, x1, x2, x3), 1)
return out
class InceptionV4(nn.Module):
def __init__(self, num_classes=1000):
super(InceptionV4, self).__init__()
        # Special attributes
self.input_space = None
self.input_size = (299, 299, 3)
self.mean = None
self.std = None
# Modules
self.features = nn.Sequential(
BasicConv2d(3, 32, kernel_size=3, stride=2),
BasicConv2d(32, 32, kernel_size=3, stride=1),
BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1),
Mixed_3a(),
Mixed_4a(),
Mixed_5a(),
Inception_A(),
Inception_A(),
Inception_A(),
Inception_A(),
Reduction_A(), # Mixed_6a
Inception_B(),
Inception_B(),
Inception_B(),
Inception_B(),
Inception_B(),
Inception_B(),
Inception_B(),
Reduction_B(), # Mixed_7a
Inception_C(),
Inception_C(),
Inception_C())
self.avg_pool = nn.AvgPool2d(8)
self.drop = nn.Dropout(p=0.2)
self.last_linear = nn.Linear(1536, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def logits(self, features):
x = self.avg_pool(features)
x = self.drop(x)
x = x.view(x.size(0), -1)
x = self.last_linear(x)
return x
def forward(self, input):
x = self.features(input)
x = self.logits(x)
return x
def inception_v4(**kwargs):
model = InceptionV4(**kwargs)
return model
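# A minimal smoke test (an addition following the inception_v3 check above);
# the 299x299 input matches self.input_size and the test only verifies the
# output shape.
if __name__ == '__main__':
    model = inception_v4(num_classes=1000)
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 299, 299))
    print(out.shape)  # expected: torch.Size([1, 1000])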