Commit b952e97b authored by chenych

First Commit.
from functools import partial

import numpy as np
import torch.nn as nn
from six.moves import map, zip

from .conv_module import ConvModule, bias_init_with_prob, normal_init
def multi_apply(func, *args, **kwargs):
    """Apply `func` across the argument lists in parallel and regroup the
    per-call results into per-field lists."""
    pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
return tuple(map(list, zip(*map_results)))
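# Usage sketch for multi_apply: the results of each call are transposed into
# per-field lists, e.g.:
# >>> def f(a, b):
# ...     return a + b, a * b
# >>> multi_apply(f, [1, 2], [3, 4])
# ([4, 6], [3, 8])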
class RetinaHead(nn.Module):
"""
An anchor-based head used in [1]_.
    The original head contains two subnetworks: the first classifies anchor
    boxes and the second regresses deltas for the anchors. In this version
    the regression branch is disabled, so only classification scores are
    produced.
References:
.. [1] https://arxiv.org/pdf/1708.02002.pdf
Example:
>>> import torch
>>> self = RetinaHead(11, 7)
>>> x = torch.rand(1, 7, 32, 32)
    >>> cls_score = self.forward_single(x)[0]
    >>> # One score map per class at every spatial location
    >>> assert cls_score.shape[1] == self.cls_out_channels
"""
def __init__(self,
num_classes,
in_channels,
feat_channels=64,
stacked_convs=4,
octave_base_scale=4,
scales_per_octave=3,
conv_cfg=None,
norm_cfg=None,
**kwargs):
super(RetinaHead, self).__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.feat_channels = feat_channels
self.stacked_convs = stacked_convs
self.octave_base_scale = octave_base_scale
self.scales_per_octave = scales_per_octave
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
        # note: octave_scales is computed here but not used by this head
        octave_scales = np.array(
            [2**(i / scales_per_octave) for i in range(scales_per_octave)])
self.cls_out_channels = num_classes
self._init_layers()
def _init_layers(self):
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
#self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.retina_cls = nn.Conv2d(
self.feat_channels,
self.cls_out_channels,
3,
padding=1)
#self.output_act = nn.Sigmoid()
def init_weights(self):
for m in self.cls_convs:
normal_init(m.conv, std=0.01)
        # the regression branch is disabled in _init_layers, so there is no
        # self.reg_convs to initialize here
        #for m in self.reg_convs:
        #    normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.retina_cls, std=0.01, bias=bias_cls)
#normal_init(self.retina_reg, std=0.01)
def forward_single(self, x):
cls_feat = x
#reg_feat = x
for cls_conv in self.cls_convs:
cls_feat = cls_conv(cls_feat)
#for reg_conv in self.reg_convs:
# reg_feat = reg_conv(reg_feat)
cls_score = self.retina_cls(cls_feat)
        # out is B x C x H x W, with C = cls_out_channels (= num_classes)
#cls_score = cls_score.permute(0, 2, 3, 1)
#batch_size, width, height, channels = cls_score.shape
#cls_score = cls_score.view(batch_size, width, height, self.num_anchors, self.num_classes)
#cls_score = cls_score.contiguous().view(x.size(0), -1, self.num_classes)
#bbox_pred = self.retina_reg(reg_feat)
#bbox_pred = bbox_pred.permute(0, 2, 3, 1)
#bbox_pred = bbox_pred.contiguous().view(bbox_pred.size(0), -1, 4)
return [cls_score]
def forward(self, feats):
return multi_apply(self.forward_single, feats)
import re
import math
import collections
from functools import partial
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import model_zoo
########################################################################
############### HELPERS FUNCTIONS FOR MODEL ARCHITECTURE ###############
########################################################################
# Parameters for the entire model (stem, all blocks, and head)
GlobalParams = collections.namedtuple('GlobalParams', [
'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate',
'num_classes', 'width_coefficient', 'depth_coefficient',
'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size'])
# Parameters for an individual model block
BlockArgs = collections.namedtuple('BlockArgs', [
'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
'expand_ratio', 'id_skip', 'stride', 'se_ratio'])
# Change namedtuple defaults
GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields)
BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields)
class SwishImplementation(torch.autograd.Function):
@staticmethod
def forward(ctx, i):
result = i * torch.sigmoid(i)
ctx.save_for_backward(i)
return result
@staticmethod
def backward(ctx, grad_output):
        i = ctx.saved_tensors[0]
sigmoid_i = torch.sigmoid(i)
return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
class MemoryEfficientSwish(nn.Module):
def forward(self, x):
return SwishImplementation.apply(x)
class Swish(nn.Module):
def forward(self, x):
return x * torch.sigmoid(x)
def round_filters(filters, global_params):
""" Calculate and round number of filters based on depth multiplier. """
multiplier = global_params.width_coefficient
if not multiplier:
return filters
divisor = global_params.depth_divisor
min_depth = global_params.min_depth
filters *= multiplier
min_depth = min_depth or divisor
new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
new_filters += divisor
return int(new_filters)
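# Worked example (sketch; GlobalParams fields default to None, so a partial
# construction is enough):
# >>> gp = GlobalParams(width_coefficient=1.2, depth_divisor=8, min_depth=None)
# >>> round_filters(32, gp)   # 32 * 1.2 = 38.4 -> nearest multiple of 8
# 40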
def round_repeats(repeats, global_params):
""" Round number of filters based on depth multiplier. """
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))
def drop_connect(inputs, p, training):
""" Drop connect. """
if not training: return inputs
batch_size = inputs.shape[0]
keep_prob = 1 - p
random_tensor = keep_prob
random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
binary_tensor = torch.floor(random_tensor)
output = inputs / keep_prob * binary_tensor
return output
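# Note: drop_connect zeroes each sample in the batch with probability p and
# rescales survivors by 1 / (1 - p), leaving the expected activation
# unchanged; with training=False it is the identity.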
def get_same_padding_conv2d(image_size=None):
""" Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models. """
if image_size is None:
return Conv2dDynamicSamePadding
else:
return partial(Conv2dStaticSamePadding, image_size=image_size)
class Conv2dDynamicSamePadding(nn.Conv2d):
""" 2D Convolutions like TensorFlow, for a dynamic image size """
def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2
def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
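# Note: this reproduces TensorFlow 'SAME' padding: the output size is
# ceil(input / stride), and any required padding is computed per forward pass
# and split so the right/bottom sides receive the extra pixel.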
class Conv2dStaticSamePadding(nn.Conv2d):
""" 2D Convolutions like TensorFlow, for a fixed image size"""
def __init__(self, in_channels, out_channels, kernel_size, image_size=None, **kwargs):
super().__init__(in_channels, out_channels, kernel_size, **kwargs)
self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2
# Calculate padding based on image size and save it
assert image_size is not None
        ih, iw = image_size if isinstance(image_size, (list, tuple)) else [image_size, image_size]
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.ZeroPad2d((pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
else:
self.static_padding = Identity()
def forward(self, x):
x = self.static_padding(x)
x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
return x
class Identity(nn.Module):
def __init__(self, ):
super(Identity, self).__init__()
def forward(self, input):
return input
########################################################################
############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ##############
########################################################################
def efficientnet_params(model_name):
""" Map EfficientNet model name to parameter coefficients. """
params_dict = {
# Coefficients: width,depth,res,dropout
'efficientnet-b0': (1.0, 1.0, 224, 0.2),
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
}
return params_dict[model_name]
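# Example (sketch): the returned tuple is (width_coefficient,
# depth_coefficient, resolution, dropout_rate).
# >>> efficientnet_params('efficientnet-b0')
# (1.0, 1.0, 224, 0.2)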
class BlockDecoder(object):
""" Block Decoder for readability, straight from the official TensorFlow repository """
@staticmethod
def _decode_block_string(block_string):
""" Gets a block through a string notation of arguments. """
assert isinstance(block_string, str)
ops = block_string.split('_')
options = {}
for op in ops:
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value
        # Check stride: it must be present, and a two-digit stride must be square
        assert 's' in options and (len(options['s']) == 1 or
                                   (len(options['s']) == 2 and
                                    options['s'][0] == options['s'][1]))
return BlockArgs(
kernel_size=int(options['k']),
num_repeat=int(options['r']),
input_filters=int(options['i']),
output_filters=int(options['o']),
expand_ratio=int(options['e']),
id_skip=('noskip' not in block_string),
se_ratio=float(options['se']) if 'se' in options else None,
stride=[int(options['s'][0])])
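    # Decoding sketch for the notation used in `efficientnet` below:
    # >>> BlockDecoder._decode_block_string('r1_k3_s11_e1_i32_o16_se0.25')
    # BlockArgs(kernel_size=3, num_repeat=1, input_filters=32,
    #           output_filters=16, expand_ratio=1, id_skip=True,
    #           stride=[1], se_ratio=0.25)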
@staticmethod
def _encode_block_string(block):
"""Encodes a block to a string."""
args = [
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
            's%d%d' % (block.stride[0], block.stride[0]),  # stride is a one-element list
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
        if block.se_ratio is not None and 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
if block.id_skip is False:
args.append('noskip')
return '_'.join(args)
@staticmethod
def decode(string_list):
"""
Decodes a list of string notations to specify blocks inside the network.
:param string_list: a list of strings, each string is a notation of block
:return: a list of BlockArgs namedtuples of block args
"""
assert isinstance(string_list, list)
blocks_args = []
for block_string in string_list:
blocks_args.append(BlockDecoder._decode_block_string(block_string))
return blocks_args
@staticmethod
def encode(blocks_args):
"""
Encodes a list of BlockArgs to a list of strings.
:param blocks_args: a list of BlockArgs namedtuples of block args
:return: a list of strings, each string is a notation of block
"""
block_strings = []
for block in blocks_args:
block_strings.append(BlockDecoder._encode_block_string(block))
return block_strings
def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2,
                 drop_connect_rate=0.2, image_size=None, num_classes=1000):
    """ Creates an EfficientNet model. """
blocks_args = [
'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25',
'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25',
'r3_k5_s22_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25',
'r1_k3_s22_e6_i192_o320_se0.25',
]
blocks_args = BlockDecoder.decode(blocks_args)
global_params = GlobalParams(
batch_norm_momentum=0.99,
batch_norm_epsilon=1e-3,
dropout_rate=dropout_rate,
drop_connect_rate=drop_connect_rate,
# data_format='channels_last', # removed, this is always true in PyTorch
num_classes=num_classes,
width_coefficient=width_coefficient,
depth_coefficient=depth_coefficient,
depth_divisor=8,
min_depth=None,
image_size=image_size,
)
return blocks_args, global_params
def get_model_params(model_name, override_params):
""" Get the block args and global params for a given model """
if model_name.startswith('efficientnet'):
w, d, s, p = efficientnet_params(model_name)
# note: all models have drop connect rate = 0.2
blocks_args, global_params = efficientnet(
width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s)
else:
raise NotImplementedError('model name is not pre-defined: %s' % model_name)
if override_params:
# ValueError will be raised here if override_params has fields not included in global_params.
global_params = global_params._replace(**override_params)
return blocks_args, global_params
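# Example (sketch): fetch the B0 configuration and override a global field.
# >>> blocks_args, global_params = get_model_params('efficientnet-b0',
# ...                                               {'num_classes': 10})
# >>> global_params.num_classes
# 10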
url_map = {
'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b0-355c32eb.pth',
'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b1-f1951068.pth',
'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b2-8bb594d6.pth',
'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b3-5fb5a3c3.pth',
'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b4-6ed6700e.pth',
'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b5-b6417697.pth',
'efficientnet-b6': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b6-c76e70fd.pth',
'efficientnet-b7': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b7-dcc49843.pth',
}
def load_pretrained_weights(model, model_name, load_fc=True):
""" Loads pretrained weights, and downloads if loading for the first time. """
state_dict = model_zoo.load_url(url_map[model_name], map_location=lambda storage, loc: storage)
if load_fc:
model.load_state_dict(state_dict)
else:
state_dict.pop('_fc.weight')
state_dict.pop('_fc.bias')
res = model.load_state_dict(state_dict, strict=False)
assert set(res.missing_keys) == set(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights'
print('Loaded pretrained weights for {}'.format(model_name))
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class ConvLayer(nn.Sequential):
    def __init__(self, in_channels, out_channels, kernel=3, stride=1, dropout=0.1):
        super().__init__()
        self.add_module('conv', nn.Conv2d(in_channels, out_channels, kernel_size=kernel,
                                          stride=stride, padding=kernel//2, bias=False))
        self.add_module('norm', nn.BatchNorm2d(out_channels))
        self.add_module('relu', nn.ReLU(inplace=True))
        # nn.Sequential already provides forward
class HarDBlock(nn.Module):
def get_link(self, layer, base_ch, growth_rate, grmul):
if layer == 0:
return base_ch, 0, []
out_channels = growth_rate
link = []
for i in range(10):
dv = 2 ** i
if layer % dv == 0:
k = layer - dv
link.append(k)
if i > 0:
out_channels *= grmul
out_channels = int(int(out_channels + 1) / 2) * 2
in_channels = 0
for i in link:
ch,_,_ = self.get_link(i, base_ch, growth_rate, grmul)
in_channels += ch
return out_channels, in_channels, link
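    # Link sketch: layer L connects back to layer L - 2**i for every power of
    # two dividing L, and each extra hop scales the growth rate by grmul
    # (rounded down to an even channel count). With growth_rate=10, grmul=1.7:
    # >>> blk = HarDBlock(64, 10, 1.7, 4)
    # >>> blk.get_link(4, 64, 10, 1.7)
    # (28, 92, [3, 2, 0])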
def get_out_ch(self):
return self.out_channels
def __init__(self, in_channels, growth_rate, grmul, n_layers, keepBase=False, residual_out=False):
super().__init__()
self.keepBase = keepBase
self.links = []
layers_ = []
self.out_channels = 0 # if upsample else in_channels
for i in range(n_layers):
outch, inch, link = self.get_link(i+1, in_channels, growth_rate, grmul)
self.links.append(link)
layers_.append(ConvLayer(inch, outch))
if (i % 2 == 0) or (i == n_layers - 1):
self.out_channels += outch
#print("Blk out =",self.out_channels)
self.layers = nn.ModuleList(layers_)
def forward(self, x):
layers_ = [x]
for layer in range(len(self.layers)):
link = self.links[layer]
tin = []
for i in link:
tin.append(layers_[i])
if len(tin) > 1:
x = torch.cat(tin, 1)
else:
x = tin[0]
out = self.layers[layer](x)
layers_.append(out)
t = len(layers_)
out_ = []
for i in range(t):
if (i == 0 and self.keepBase) or \
(i == t-1) or (i%2 == 1):
out_.append(layers_[i])
out = torch.cat(out_, 1)
return out
class TransitionUp(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
#print("upsample",in_channels, out_channels)
def forward(self, x, skip, concat=True):
out = F.interpolate(
x,
size=(skip.size(2), skip.size(3)),
mode="bilinear",
align_corners=True,
)
if concat:
out = torch.cat([out, skip], 1)
return out
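# Note: TransitionUp ignores its constructor arguments; it only resizes x to
# the skip tensor's spatial size with bilinear interpolation and optionally
# concatenates the two along the channel dimension.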
class hardnet(nn.Module):
def __init__(self):
super(hardnet, self).__init__()
first_ch = [16,24,32,48]
ch_list = [ 64, 96, 160, 224, 320]
grmul = 1.7
gr = [ 10,16,18,24,32]
n_layers = [ 4, 4, 8, 8, 8]
blks = len(n_layers)
self.shortcut_layers = []
self.base = nn.ModuleList([])
        self.base.append(ConvLayer(in_channels=3, out_channels=first_ch[0],
                                   kernel=3, stride=2))
        self.base.append(ConvLayer(first_ch[0], first_ch[1], kernel=3))
        self.base.append(ConvLayer(first_ch[1], first_ch[2], kernel=3, stride=2))
        self.base.append(ConvLayer(first_ch[2], first_ch[3], kernel=3))
skip_connection_channel_counts = []
ch = first_ch[3]
for i in range(blks):
blk = HarDBlock(ch, gr[i], grmul, n_layers[i])
ch = blk.get_out_ch()
skip_connection_channel_counts.append(ch)
            self.base.append(blk)
            if i < blks-1:
                self.shortcut_layers.append(len(self.base)-1)
            self.base.append(ConvLayer(ch, ch_list[i], kernel=1))
            ch = ch_list[i]
            if i < blks-1:
                self.base.append(nn.AvgPool2d(kernel_size=2, stride=2))
cur_channels_count = ch
prev_block_channels = ch
n_blocks = blks-1
self.n_blocks = n_blocks
#######################
# Upsampling path #
#######################
self.transUpBlocks = nn.ModuleList([])
self.denseBlocksUp = nn.ModuleList([])
self.conv1x1_up = nn.ModuleList([])
for i in range(n_blocks-1,-1,-1):
self.transUpBlocks.append(TransitionUp(prev_block_channels, prev_block_channels))
cur_channels_count = prev_block_channels + skip_connection_channel_counts[i]
self.conv1x1_up.append(ConvLayer(cur_channels_count, cur_channels_count//2, kernel=1))
cur_channels_count = cur_channels_count//2
blk = HarDBlock(cur_channels_count, gr[i], grmul, n_layers[i])
self.denseBlocksUp.append(blk)
prev_block_channels = blk.get_out_ch()
cur_channels_count = prev_block_channels
def forward(self, x):
skip_connections = []
size_in = x.size()
for i in range(len(self.base)):
x = self.base[i](x)
if i in self.shortcut_layers:
skip_connections.append(x)
out = x
for i in range(self.n_blocks):
skip = skip_connections.pop()
out = self.transUpBlocks[i](out, skip, True)
out = self.conv1x1_up[i](out)
out = self.denseBlocksUp[i](out)
return out
def get_hard_net(num_layers, cfg):
model = hardnet()
return model
# ------------------------------------------------------------------------------
# This code is base on
# CornerNet (https://github.com/princeton-vl/CornerNet)
# Copyright (c) 2018, University of Michigan
# Licensed under the BSD 3-Clause License
# ------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
import numpy as np
import torch
import torch.nn as nn
class convolution(nn.Module):
def __init__(self, k, inp_dim, out_dim, stride=1, with_bn=True):
super(convolution, self).__init__()
pad = (k - 1) // 2
self.conv = nn.Conv2d(inp_dim, out_dim, (k, k), padding=(pad, pad), stride=(stride, stride), bias=not with_bn)
self.bn = nn.BatchNorm2d(out_dim) if with_bn else nn.Sequential()
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
conv = self.conv(x)
bn = self.bn(conv)
relu = self.relu(bn)
return relu
class fully_connected(nn.Module):
def __init__(self, inp_dim, out_dim, with_bn=True):
super(fully_connected, self).__init__()
self.with_bn = with_bn
self.linear = nn.Linear(inp_dim, out_dim)
if self.with_bn:
self.bn = nn.BatchNorm1d(out_dim)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
linear = self.linear(x)
bn = self.bn(linear) if self.with_bn else linear
relu = self.relu(bn)
return relu
class residual(nn.Module):
def __init__(self, k, inp_dim, out_dim, stride=1, with_bn=True):
super(residual, self).__init__()
self.conv1 = nn.Conv2d(inp_dim, out_dim, (3, 3), padding=(1, 1), stride=(stride, stride), bias=False)
self.bn1 = nn.BatchNorm2d(out_dim)
self.relu1 = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(out_dim, out_dim, (3, 3), padding=(1, 1), bias=False)
self.bn2 = nn.BatchNorm2d(out_dim)
self.skip = nn.Sequential(
nn.Conv2d(inp_dim, out_dim, (1, 1), stride=(stride, stride), bias=False),
nn.BatchNorm2d(out_dim)
) if stride != 1 or inp_dim != out_dim else nn.Sequential()
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
conv1 = self.conv1(x)
bn1 = self.bn1(conv1)
relu1 = self.relu1(bn1)
conv2 = self.conv2(relu1)
bn2 = self.bn2(conv2)
skip = self.skip(x)
return self.relu(bn2 + skip)
def make_layer(k, inp_dim, out_dim, modules, layer=convolution, **kwargs):
layers = [layer(k, inp_dim, out_dim, **kwargs)]
for _ in range(1, modules):
layers.append(layer(k, out_dim, out_dim, **kwargs))
return nn.Sequential(*layers)
def make_layer_revr(k, inp_dim, out_dim, modules, layer=convolution, **kwargs):
layers = []
for _ in range(modules - 1):
layers.append(layer(k, inp_dim, inp_dim, **kwargs))
layers.append(layer(k, inp_dim, out_dim, **kwargs))
return nn.Sequential(*layers)
class MergeUp(nn.Module):
def forward(self, up1, up2):
return up1 + up2
def make_merge_layer(dim):
return MergeUp()
# def make_pool_layer(dim):
# return nn.MaxPool2d(kernel_size=2, stride=2)
def make_pool_layer(dim):
return nn.Sequential()
def make_unpool_layer(dim):
return nn.Upsample(scale_factor=2)
def make_kp_layer(cnv_dim, curr_dim, out_dim):
return nn.Sequential(
convolution(3, cnv_dim, curr_dim, with_bn=False),
nn.Conv2d(curr_dim, out_dim, (1, 1))
)
def make_inter_layer(dim):
return residual(3, dim, dim)
def make_cnv_layer(inp_dim, out_dim):
return convolution(3, inp_dim, out_dim)
class kp_module(nn.Module):
def __init__(
self, n, dims, modules, layer=residual,
make_up_layer=make_layer, make_low_layer=make_layer,
make_hg_layer=make_layer, make_hg_layer_revr=make_layer_revr,
make_pool_layer=make_pool_layer, make_unpool_layer=make_unpool_layer,
make_merge_layer=make_merge_layer, **kwargs
):
super(kp_module, self).__init__()
self.n = n
curr_mod = modules[0]
next_mod = modules[1]
curr_dim = dims[0]
next_dim = dims[1]
self.up1 = make_up_layer(
3, curr_dim, curr_dim, curr_mod,
layer=layer, **kwargs
)
self.max1 = make_pool_layer(curr_dim)
self.low1 = make_hg_layer(
3, curr_dim, next_dim, curr_mod,
layer=layer, **kwargs
)
self.low2 = kp_module(
n - 1, dims[1:], modules[1:], layer=layer,
make_up_layer=make_up_layer,
make_low_layer=make_low_layer,
make_hg_layer=make_hg_layer,
make_hg_layer_revr=make_hg_layer_revr,
make_pool_layer=make_pool_layer,
make_unpool_layer=make_unpool_layer,
make_merge_layer=make_merge_layer,
**kwargs
) if self.n > 1 else \
make_low_layer(
3, next_dim, next_dim, next_mod,
layer=layer, **kwargs
)
self.low3 = make_hg_layer_revr(
3, next_dim, curr_dim, curr_mod,
layer=layer, **kwargs
)
self.up2 = make_unpool_layer(curr_dim)
self.merge = make_merge_layer(curr_dim)
def forward(self, x):
up1 = self.up1(x)
max1 = self.max1(x)
low1 = self.low1(max1)
low2 = self.low2(low1)
low3 = self.low3(low2)
up2 = self.up2(low3)
return self.merge(up1, up2)
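# Note: each kp_module level downsamples via the stride-2 convolution built by
# make_hg_layer, recurses n - 1 more levels, then upsamples (up2) and merges
# with the full-resolution branch (up1), i.e. a standard hourglass module.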
class exkp(nn.Module):
def __init__(
self, n, nstack, dims, modules, heads, pre=None, cnv_dim=256,
make_tl_layer=None, make_br_layer=None,
make_cnv_layer=make_cnv_layer, make_heat_layer=make_kp_layer,
make_tag_layer=make_kp_layer, make_regr_layer=make_kp_layer,
make_up_layer=make_layer, make_low_layer=make_layer,
make_hg_layer=make_layer, make_hg_layer_revr=make_layer_revr,
make_pool_layer=make_pool_layer, make_unpool_layer=make_unpool_layer,
make_merge_layer=make_merge_layer, make_inter_layer=make_inter_layer,
kp_layer=residual
):
super(exkp, self).__init__()
self.nstack = nstack
self.heads = heads
curr_dim = dims[0]
self.pre = nn.Sequential(
convolution(7, 3, 128, stride=2),
residual(3, 128, 256, stride=2)
) if pre is None else pre
self.kps = nn.ModuleList([
kp_module(
n, dims, modules, layer=kp_layer,
make_up_layer=make_up_layer,
make_low_layer=make_low_layer,
make_hg_layer=make_hg_layer,
make_hg_layer_revr=make_hg_layer_revr,
make_pool_layer=make_pool_layer,
make_unpool_layer=make_unpool_layer,
make_merge_layer=make_merge_layer
) for _ in range(nstack)
])
self.cnvs = nn.ModuleList([
make_cnv_layer(curr_dim, cnv_dim) for _ in range(nstack)
])
self.inters = nn.ModuleList([
make_inter_layer(curr_dim) for _ in range(nstack - 1)
])
self.inters_ = nn.ModuleList([
nn.Sequential(
nn.Conv2d(curr_dim, curr_dim, (1, 1), bias=False),
nn.BatchNorm2d(curr_dim)
) for _ in range(nstack - 1)
])
self.cnvs_ = nn.ModuleList([
nn.Sequential(
nn.Conv2d(cnv_dim, curr_dim, (1, 1), bias=False),
nn.BatchNorm2d(curr_dim)
) for _ in range(nstack - 1)
])
## keypoint heatmaps
for head in heads.keys():
if 'hm' in head:
module = nn.ModuleList([
make_heat_layer(
cnv_dim, curr_dim, heads[head]) for _ in range(nstack)
])
self.__setattr__(head, module)
for heat in self.__getattr__(head):
heat[-1].bias.data.fill_(-2.19)
else:
module = nn.ModuleList([
make_regr_layer(
cnv_dim, curr_dim, heads[head]) for _ in range(nstack)
])
self.__setattr__(head, module)
self.relu = nn.ReLU(inplace=True)
def forward(self, image):
# print('image shape', image.shape)
inter = self.pre(image)
outs = []
for ind in range(self.nstack):
kp_, cnv_ = self.kps[ind], self.cnvs[ind]
kp = kp_(inter)
cnv = cnv_(kp)
out = {}
for head in self.heads:
layer = self.__getattr__(head)[ind]
y = layer(cnv)
out[head] = y
outs.append(out)
if ind < self.nstack - 1:
inter = self.inters_[ind](inter) + self.cnvs_[ind](cnv)
inter = self.relu(inter)
inter = self.inters[ind](inter)
return outs
def make_hg_layer(kernel, dim0, dim1, mod, layer=convolution, **kwargs):
layers = [layer(kernel, dim0, dim1, stride=2)]
layers += [layer(kernel, dim1, dim1) for _ in range(mod - 1)]
return nn.Sequential(*layers)
class HourglassNet(exkp):
def __init__(self, heads, num_stacks=2):
n = 5
dims = [256, 256, 384, 384, 384, 512]
modules = [2, 2, 2, 2, 2, 4]
super(HourglassNet, self).__init__(
n, num_stacks, dims, modules, heads,
make_tl_layer=None,
make_br_layer=None,
make_pool_layer=make_pool_layer,
make_hg_layer=make_hg_layer,
kp_layer=residual, cnv_dim=256
)
def get_large_hourglass_net(num_layers, heads, head_conv):
model = HourglassNet(heads, 2)
return model
from torch import nn
import torch.utils.model_zoo as model_zoo
from collections import OrderedDict
import math
__all__ = ['MobileNetV2']
model_urls = {
'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
}
def _make_divisible(v, divisor, min_value=None):
"""
This function is taken from the original tf repo.
It ensures that all layers have a channel number that is divisible by 8
It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
:param v:
:param divisor:
:param min_value:
:return:
"""
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
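# Worked example (sketch): values round to the nearest multiple of the
# divisor, but never down by more than 10%:
# >>> _make_divisible(17, 8)   # 16 is within 10% of 17, so no bump to 24
# 16
# >>> _make_divisible(32 * 0.5, 8)
# 16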
class ConvBNReLU(nn.Sequential):
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
padding = (kernel_size - 1) // 2
super(ConvBNReLU, self).__init__(
nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
nn.BatchNorm2d(out_planes),
nn.ReLU6(inplace=True)
)
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride, expand_ratio):
super(InvertedResidual, self).__init__()
self.stride = stride
assert stride in [1, 2]
hidden_dim = int(round(inp * expand_ratio))
self.use_res_connect = self.stride == 1 and inp == oup
layers = []
if expand_ratio != 1:
# pw
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
layers.extend([
# dw
ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
])
self.conv = nn.Sequential(*layers)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
class MobileNetV2(nn.Module):
    def __init__(self, width_mult=1.0, round_nearest=8):
super(MobileNetV2, self).__init__()
block = InvertedResidual
input_channel = 32
inverted_residual_setting = [
# t, c, n, s
[1, 16, 1, 1], # 0
[6, 24, 2, 2], # 1
[6, 32, 3, 2], # 2
[6, 64, 4, 2], # 3
[6, 96, 3, 1], # 4
[6, 160, 3, 2],# 5
[6, 320, 1, 1],# 6
]
self.feat_id = [1,2,4,6]
self.feat_channel = []
# only check the first element, assuming user knows t,c,n,s are required
if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
raise ValueError("inverted_residual_setting should be non-empty "
"or a 4-element list, got {}".format(inverted_residual_setting))
# building first layer
input_channel = _make_divisible(input_channel * width_mult, round_nearest)
features = [ConvBNReLU(3, input_channel, stride=2)]
# building inverted residual blocks
for id,(t, c, n, s) in enumerate(inverted_residual_setting):
output_channel = _make_divisible(c * width_mult, round_nearest)
for i in range(n):
stride = s if i == 0 else 1
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
input_channel = output_channel
if id in self.feat_id :
self.__setattr__("feature_%d"%id,nn.Sequential(*features))
self.feat_channel.append(output_channel)
features = []
# weight initialization
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
nn.init.ones_(m.weight)
nn.init.zeros_(m.bias)
def forward(self, x):
y = []
for id in self.feat_id:
x = self.__getattr__("feature_%d"%id)(x)
y.append(x)
return y
def load_model(model, state_dict):
    # maps the pretrained weights onto the renamed modules purely by key
    # order, so both state dicts must enumerate parameters in the same sequence
    new_model = model.state_dict()
new_keys = list(new_model.keys())
old_keys = list(state_dict.keys())
restore_dict = OrderedDict()
for id in range(len(new_keys)):
restore_dict[new_keys[id]] = state_dict[old_keys[id]]
model.load_state_dict(restore_dict)
def dict2list(func):
    # decorator that flattens a head-dict forward output into a list of
    # tensors (used when exporting to ONNX; see MobileNetSeg.forward below)
    def wrap(*args, **kwargs):
self = args[0]
x = args[1]
ret_list = []
ret = func(self, x)
for k, v in ret[0].items():
ret_list.append(v)
return ret_list
return wrap
def fill_up_weights(up):
w = up.weight.data
f = math.ceil(w.size(2) / 2)
c = (2 * f - 1 - f % 2) / (2. * f)
for i in range(w.size(2)):
for j in range(w.size(3)):
w[0, 0, i, j] = \
(1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
for c in range(1, w.size(0)):
w[c, 0, :, :] = w[0, 0, :, :]
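# Note: fill_up_weights initializes a grouped (depthwise) transposed
# convolution as a bilinear upsampling kernel, replicated across channels,
# the usual starting point for learnable upsampling.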
def fill_fc_weights(layers):
for m in layers.modules():
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, std=0.001)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
class IDAUp(nn.Module):
def __init__(self, out_dim, channel):
super(IDAUp, self).__init__()
self.out_dim = out_dim
self.up = nn.Sequential(
nn.ConvTranspose2d(
out_dim, out_dim, kernel_size=2, stride=2, padding=0,
output_padding=0, groups=out_dim, bias=False),
nn.BatchNorm2d(out_dim,eps=0.001,momentum=0.1),
nn.ReLU())
self.conv = nn.Sequential(
nn.Conv2d(channel, out_dim,
kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(out_dim,eps=0.001,momentum=0.1),
nn.ReLU(inplace=True))
def forward(self, layers):
layers = list(layers)
x = self.up(layers[0])
y = self.conv(layers[1])
out = x + y
return out
class MobileNetUp(nn.Module):
def __init__(self, channels, out_dim = 24):
super(MobileNetUp, self).__init__()
channels = channels[::-1]
self.conv = nn.Sequential(
nn.Conv2d(channels[0], out_dim,
kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(out_dim,eps=0.001,momentum=0.1),
nn.ReLU(inplace=True))
self.conv_last = nn.Sequential(
nn.Conv2d(out_dim,out_dim,
kernel_size=3, stride=1, padding=1 ,bias=False),
nn.BatchNorm2d(out_dim,eps=1e-5,momentum=0.01),
nn.ReLU(inplace=True))
for i,channel in enumerate(channels[1:]):
setattr(self,'up_%d'%(i),IDAUp(out_dim,channel))
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m,nn.ConvTranspose2d):
fill_up_weights(m)
def forward(self, layers):
layers = list(layers)
assert len(layers) > 1
x = self.conv(layers[-1])
for i in range(0,len(layers)-1):
up = getattr(self, 'up_{}'.format(i))
x = up([x,layers[len(layers)-2-i]])
x = self.conv_last(x)
return x
class MobileNetSeg(nn.Module):
    def __init__(self, base_name, heads, head_conv=24, pretrained=True):
super(MobileNetSeg, self).__init__()
self.heads = heads
self.base = globals()[base_name](
pretrained=pretrained)
channels = self.base.feat_channel
self.dla_up = MobileNetUp(channels, out_dim=head_conv)
for head in self.heads:
classes = self.heads[head]
            fc = nn.Conv2d(head_conv, classes,
                           kernel_size=1, stride=1,
                           padding=0, bias=True)
if 'hm' in head:
fc.bias.data.fill_(-2.19)
else:
nn.init.normal_(fc.weight, std=0.001)
nn.init.constant_(fc.bias, 0)
self.__setattr__(head, fc)
    # @dict2list  # when exporting to ONNX, the dict output must be converted to a list
def forward(self, x):
x = self.base(x)
x = self.dla_up(x)
ret = {}
for head in self.heads:
ret[head] = self.__getattr__(head)(x)
return [ret]
def mobilenetv2_10(pretrained=True, **kwargs):
model = MobileNetV2(width_mult=1.0)
if pretrained:
state_dict = model_zoo.load_url(model_urls['mobilenet_v2'],
progress=True)
load_model(model,state_dict)
return model
def mobilenetv2_5(pretrained=False, **kwargs):
model = MobileNetV2(width_mult=0.5)
if pretrained:
print('This version does not have pretrain weights.')
return model
# num_layers : [10 , 5]
def get_mobile_net(num_layers, heads, head_conv=24):
model = MobileNetSeg('mobilenetv2_{}'.format(num_layers), heads,
pretrained=True,
head_conv=head_conv)
return model
if __name__ == '__main__':
    import torch
    input = torch.zeros([1, 3, 416, 416])
    # 'hm' carries one heatmap channel per object class; this head only
    # supports rectangular box detection
    model = get_mobile_net(5, {'hm': 1, 'reg': 2, 'wh': 2}, head_conv=24)
    res = model(input)
    # forward returns a single-element list holding the head dict
    for head, out in res[0].items():
        print(head, out.shape)
from torch import nn
import torch.utils.model_zoo as model_zoo
from collections import OrderedDict
import math
__all__ = ['MobileNetV2']
model_urls = {
'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
}
def _make_divisible(v, divisor, min_value=None):
"""
This function is taken from the original tf repo.
It ensures that all layers have a channel number that is divisible by 8
It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
:param v:
:param divisor:
:param min_value:
:return:
"""
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
class ConvBNReLU(nn.Sequential):
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
padding = (kernel_size - 1) // 2
super(ConvBNReLU, self).__init__(
nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
nn.BatchNorm2d(out_planes),
            nn.ReLU(inplace=True)  # ReLU used in place of ReLU6 in this variant
)
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride, expand_ratio):
super(InvertedResidual, self).__init__()
self.stride = stride
assert stride in [1, 2]
hidden_dim = int(round(inp * expand_ratio))
self.use_res_connect = self.stride == 1 and inp == oup
layers = []
if expand_ratio != 1:
# pw
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
layers.extend([
# dw
ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
])
self.conv = nn.Sequential(*layers)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
class MobileNetV2(nn.Module):
def __init__(self, width_mult=1.0, round_nearest=8, ):
super(MobileNetV2, self).__init__()
block = InvertedResidual
input_channel = 32
inverted_residual_setting = [
# t, c, n, s
[1, 16, 1, 1], # 0
[6, 24, 2, 2], # 1
[6, 32, 3, 2], # 2
[6, 64, 4, 2], # 3
[6, 96, 3, 1], # 4
[6, 160, 3, 2], # 5
[6, 320, 1, 1], # 6
]
self.feat_id = [1, 2, 4, 6]
self.feat_channel = []
# only check the first element, assuming user knows t,c,n,s are required
if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
raise ValueError("inverted_residual_setting should be non-empty "
"or a 4-element list, got {}".format(inverted_residual_setting))
# building first layer
input_channel = _make_divisible(input_channel * width_mult, round_nearest)
features = [ConvBNReLU(3, input_channel, stride=2)]
# building inverted residual blocks
for id, (t, c, n, s) in enumerate(inverted_residual_setting):
output_channel = _make_divisible(c * width_mult, round_nearest)
for i in range(n):
stride = s if i == 0 else 1
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
input_channel = output_channel
if id in self.feat_id:
self.__setattr__("feature_%d" % id, nn.Sequential(*features))
self.feat_channel.append(output_channel)
features = []
# weight initialization
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
nn.init.ones_(m.weight)
nn.init.zeros_(m.bias)
def forward(self, x):
y = []
for id in self.feat_id:
x = self.__getattr__("feature_%d" % id)(x)
y.append(x)
return y
def load_model(model, state_dict):
    # as above: pretrained weights are mapped onto the modules purely by key order
    new_model = model.state_dict()
new_keys = list(new_model.keys())
old_keys = list(state_dict.keys())
restore_dict = OrderedDict()
for id in range(len(new_keys)):
restore_dict[new_keys[id]] = state_dict[old_keys[id]]
model.load_state_dict(restore_dict)
def fill_up_weights(up):
w = up.weight.data
f = math.ceil(w.size(2) / 2)
c = (2 * f - 1 - f % 2) / (2. * f)
for i in range(w.size(2)):
for j in range(w.size(3)):
w[0, 0, i, j] = \
(1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
for c in range(1, w.size(0)):
w[c, 0, :, :] = w[0, 0, :, :]
def fill_fc_weights(layers):
for m in layers.modules():
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, std=0.001)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
class IDAUp(nn.Module):
def __init__(self, out_dim, channel):
super(IDAUp, self).__init__()
self.out_dim = out_dim
self.up = nn.Sequential(
nn.ConvTranspose2d(
out_dim, out_dim, kernel_size=2, stride=2, padding=0,
output_padding=0, groups=out_dim, bias=False),
nn.BatchNorm2d(out_dim, eps=0.001, momentum=0.1),
nn.ReLU())
self.conv = nn.Sequential(
nn.Conv2d(channel, out_dim,
kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(out_dim, eps=0.001, momentum=0.1),
nn.ReLU(inplace=True))
def forward(self, layers):
layers = list(layers)
x = self.up(layers[0])
y = self.conv(layers[1])
out = x + y
return out
class MobileNetUp(nn.Module):
def __init__(self, channels, out_dim=24):
super(MobileNetUp, self).__init__()
channels = channels[::-1]
self.conv = nn.Sequential(
nn.Conv2d(channels[0], out_dim,
kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(out_dim, eps=0.001, momentum=0.1),
nn.ReLU(inplace=True))
self.conv_last = nn.Sequential(
nn.Conv2d(out_dim, out_dim,
kernel_size=3, stride=1, padding=1, bias=False),
nn.BatchNorm2d(out_dim, eps=1e-5, momentum=0.01),
nn.ReLU(inplace=True))
for i, channel in enumerate(channels[1:]):
setattr(self, 'up_%d' % (i), IDAUp(out_dim, channel))
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.ConvTranspose2d):
fill_up_weights(m)
def forward(self, layers):
layers = list(layers)
assert len(layers) > 1
x = self.conv(layers[-1])
for i in range(0, len(layers) - 1):
up = getattr(self, 'up_{}'.format(i))
x = up([x, layers[len(layers) - 2 - i]])
x = self.conv_last(x)
return x
class MobileNetSeg(nn.Module):
def __init__(self, base_name, head_conv=24, pretrained=True):
super(MobileNetSeg, self).__init__()
# self.heads = {'hm':1,'reg':2,'wh':2}
self.base = globals()[base_name](
pretrained=pretrained)
channels = self.base.feat_channel
self.dla_up = MobileNetUp(channels, out_dim=head_conv)
def forward(self, x):
x = self.base(x)
x = self.dla_up(x)
return x
def mobilenetv2_10(pretrained=True, **kwargs):
model = MobileNetV2(width_mult=1.0)
if pretrained:
state_dict = model_zoo.load_url(model_urls['mobilenet_v2'],
progress=True)
load_model(model, state_dict)
return model
def mobilenetv2_5(pretrained=False, **kwargs):
model = MobileNetV2(width_mult=0.5)
if pretrained:
print('This version does not have pretrain weights.')
return model
# num_layers : [10 , 5]
def get_mobile_pose_netv2(num_layers, cfg):
    # num_layers is forced to 10: only mobilenetv2_10 has pretrained weights
    num_layers = 10
model = MobileNetSeg('mobilenetv2_{}'.format(num_layers),
pretrained=True,
head_conv=cfg.MODEL.INTERMEDIATE_CHANNEL)
return model
from __future__ import absolute_import, division, print_function
import math
import torch.nn.functional as F
from torch import nn
from torch.nn import init
from .DCNv2.dcn_v2 import DCN
class DeformConv(nn.Module):
def __init__(self, chi, cho):
super(DeformConv, self).__init__()
self.actf = nn.Sequential(
nn.BatchNorm2d(cho, momentum=0.1),
nn.ReLU(inplace=True)
)
self.conv = DCN(chi, cho, kernel_size=(3, 3), stride=1, padding=1, dilation=1, deformable_groups=1)
def forward(self, x):
x = self.conv(x)
x = self.actf(x)
return x
class IDAUp(nn.Module):
def __init__(self, o, channels, up_f):
super(IDAUp, self).__init__()
for i in range(1, len(channels)):
c = channels[i]
f = int(up_f[i])
proj = DeformConv(c, o)
node = DeformConv(o, o)
up = nn.ConvTranspose2d(o, o, f * 2, stride=f,
padding=f // 2, output_padding=0,
groups=o, bias=False)
fill_up_weights(up)
setattr(self, 'proj_' + str(i), proj)
setattr(self, 'up_' + str(i), up)
setattr(self, 'node_' + str(i), node)
def forward(self, layers, startp, endp):
for i in range(startp + 1, endp):
upsample = getattr(self, 'up_' + str(i - startp))
project = getattr(self, 'proj_' + str(i - startp))
layers[i] = upsample(project(layers[i]))
node = getattr(self, 'node_' + str(i - startp))
layers[i] = node(layers[i] + layers[i - 1])
class hswish(nn.Module):
def forward(self, x):
out = x * F.relu6(x + 3, inplace=True) / 6
return out
class hsigmoid(nn.Module):
def forward(self, x):
out = F.relu6(x + 3, inplace=True) / 6
return out
class SeModule(nn.Module):
def __init__(self, in_size, reduction=4):
super(SeModule, self).__init__()
self.se = nn.Sequential(
nn.AdaptiveAvgPool2d(1),
nn.Conv2d(in_size, in_size // reduction, kernel_size=1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(in_size // reduction),
nn.ReLU(inplace=True),
nn.Conv2d(in_size // reduction, in_size, kernel_size=1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(in_size),
hsigmoid()
)
def forward(self, x):
return x * self.se(x)
class Block(nn.Module):
'''expand + depthwise + pointwise'''
def __init__(self, kernel_size, in_size, expand_size, out_size, nolinear, semodule, stride):
super(Block, self).__init__()
self.stride = stride
self.se = semodule
self.conv1 = nn.Conv2d(in_size, expand_size, kernel_size=1, stride=1, padding=0, bias=False)
self.bn1 = nn.BatchNorm2d(expand_size)
self.nolinear1 = nolinear
self.conv2 = nn.Conv2d(expand_size, expand_size, kernel_size=kernel_size, stride=stride,
padding=kernel_size // 2, groups=expand_size, bias=False)
self.bn2 = nn.BatchNorm2d(expand_size)
self.nolinear2 = nolinear
self.conv3 = nn.Conv2d(expand_size, out_size, kernel_size=1, stride=1, padding=0, bias=False)
self.bn3 = nn.BatchNorm2d(out_size)
self.shortcut = nn.Sequential()
if stride == 1 and in_size != out_size:
self.shortcut = nn.Sequential(
nn.Conv2d(in_size, out_size, kernel_size=1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(out_size),
)
def forward(self, x):
out = self.nolinear1(self.bn1(self.conv1(x)))
out = self.nolinear2(self.bn2(self.conv2(out)))
out = self.bn3(self.conv3(out))
        if self.se is not None:
out = self.se(out)
out = out + self.shortcut(x) if self.stride == 1 else out
return out
def fill_up_weights(up):
w = up.weight.data
f = math.ceil(w.size(2) / 2)
c = (2 * f - 1 - f % 2) / (2. * f)
for i in range(w.size(2)):
for j in range(w.size(3)):
w[0, 0, i, j] = \
(1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
for c in range(1, w.size(0)):
w[c, 0, :, :] = w[0, 0, :, :]
class MobileNetV3(nn.Module):
def __init__(self, final_kernel):
super(MobileNetV3, self).__init__()
self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(16)
self.hs1 = hswish()
self.bneck0 = nn.Sequential(
Block(3, 16, 16, 16, nn.ReLU(inplace=True), None, 1),
Block(3, 16, 64, 24, nn.ReLU(inplace=True), None, 2),
Block(3, 24, 72, 24, nn.ReLU(inplace=True), None, 1),
)
self.bneck1 = nn.Sequential(
Block(5, 24, 72, 40, nn.ReLU(inplace=True), SeModule(40), 2),
Block(5, 40, 120, 40, nn.ReLU(inplace=True), SeModule(40), 1),
Block(5, 40, 120, 40, nn.ReLU(inplace=True), SeModule(40), 1),
)
self.bneck2 = nn.Sequential(
Block(3, 40, 240, 80, hswish(), None, 2),
Block(3, 80, 200, 80, hswish(), None, 1),
Block(3, 80, 184, 80, hswish(), None, 1),
Block(3, 80, 184, 80, hswish(), None, 1),
Block(3, 80, 480, 112, hswish(), SeModule(112), 1),
Block(3, 112, 672, 112, hswish(), SeModule(112), 1),
Block(5, 112, 672, 160, hswish(), SeModule(160), 1),
)
self.bneck3 = nn.Sequential(
Block(5, 160, 672, 160, hswish(), SeModule(160), 2),
Block(5, 160, 960, 160, hswish(), SeModule(160), 1),
)
self.conv2 = nn.Conv2d(160, 960, kernel_size=1, stride=1, padding=0, bias=False)
self.bn2 = nn.BatchNorm2d(960)
self.hs2 = hswish()
self.ida_up = IDAUp(24, [24, 40, 160, 960],
[2 ** i for i in range(4)])
self.init_params()
def init_params(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
init.constant_(m.weight, 1)
init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
init.normal_(m.weight, std=0.001)
if m.bias is not None:
init.constant_(m.bias, 0)
def forward(self, x):
out = self.hs1(self.bn1(self.conv1(x)))
out0 = self.bneck0(out)
out1 = self.bneck1(out0)
out2 = self.bneck2(out1)
out3 = self.bneck3(out2)
out3 = self.hs2(self.bn2(self.conv2(out3)))
out = [out0, out1, out2, out3]
y = []
for i in range(4):
y.append(out[i].clone())
self.ida_up(y, 0, len(y))
return y[-1]
def get_mobilev3_pose_net(num_layers, cfg):
model = MobileNetV3(final_kernel=1)
return model
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# Modified by Xingyi Zhou
# ------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
import os
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
BN_MOMENTUM = 0.1
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion,
momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
def fill_fc_weights(layers):
for m in layers.modules():
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, std=0.001)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
class PoseResNet(nn.Module):
def __init__(self, block, layers, **kwargs):
self.inplanes = 64
self.deconv_with_bias = False
super(PoseResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
# used for deconv layers
self.deconv_layers = self._make_deconv_layer(
3,
[256, 256, 256],
[4, 4, 4],
)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def _get_deconv_cfg(self, deconv_kernel, index):
if deconv_kernel == 4:
padding = 1
output_padding = 0
elif deconv_kernel == 3:
padding = 1
output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0
        else:
            raise ValueError('unsupported deconv kernel size: {}'.format(deconv_kernel))
        return deconv_kernel, padding, output_padding
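    # With stride 2, a 4x4 kernel with padding 1 and no output padding exactly
    # doubles the spatial size: out = (in - 1) * 2 - 2 * 1 + 4 + 0 = 2 * in.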
def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_deconv_layers is different from len(num_deconv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_deconv_layers is different from len(num_deconv_kernels)'
layers = []
for i in range(num_layers):
kernel, padding, output_padding = \
self._get_deconv_cfg(num_kernels[i], i)
planes = num_filters[i]
layers.append(
nn.ConvTranspose2d(
in_channels=self.inplanes,
out_channels=planes,
kernel_size=kernel,
stride=2,
padding=padding,
output_padding=output_padding,
bias=self.deconv_with_bias))
layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
layers.append(nn.ReLU(inplace=True))
self.inplanes = planes
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.deconv_layers(x)
return x
def init_weights(self, num_layers, pretrained=True):
if pretrained:
# print('=> init resnet deconv weights from normal distribution')
for _, m in self.deconv_layers.named_modules():
if isinstance(m, nn.ConvTranspose2d):
# print('=> init {}.weight as normal(0, 0.001)'.format(name))
# print('=> init {}.bias as 0'.format(name))
nn.init.normal_(m.weight, std=0.001)
if self.deconv_with_bias:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
# print('=> init {}.weight as 1'.format(name))
# print('=> init {}.bias as 0'.format(name))
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
#pretrained_state_dict = torch.load(pretrained)
url = model_urls['resnet{}'.format(num_layers)]
pretrained_state_dict = model_zoo.load_url(url)
print('=> loading pretrained model {}'.format(url))
self.load_state_dict(pretrained_state_dict, strict=False)
        else:
            print('=> imagenet pretrained model does not exist')
            print('=> please download it first')
            raise ValueError('imagenet pretrained model does not exist')
resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]),
34: (BasicBlock, [3, 4, 6, 3]),
50: (Bottleneck, [3, 4, 6, 3]),
101: (Bottleneck, [3, 4, 23, 3]),
152: (Bottleneck, [3, 8, 36, 3])}
def get_resnet(num_layers, cfg):
block_class, layers = resnet_spec[num_layers]
model = PoseResNet(block_class, layers)
model.init_weights(num_layers, pretrained=True)
return model
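# Usage sketch (assumes torch is imported; downloads ImageNet weights on
# first use). The /32 backbone plus three stride-2 deconv stages gives an
# output stride of 4:
# >>> net = get_resnet(18, cfg=None)  # cfg is unused here
# >>> net(torch.zeros(1, 3, 512, 512)).shape
# torch.Size([1, 256, 128, 128])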
from __future__ import absolute_import, division, print_function
import logging
import math
import os
from os.path import join
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from torch import nn
from .DCNv2.dcn_v2 import DCN
BN_MOMENTUM = 0.1
logger = logging.getLogger(__name__)
def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
def __init__(self, inplanes, planes, stride=1, dilation=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,
stride=stride, padding=dilation,
bias=False, dilation=dilation)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=1, padding=dilation,
bias=False, dilation=dilation)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.stride = stride
def forward(self, x, residual=None):
if residual is None:
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 2
def __init__(self, inplanes, planes, stride=1, dilation=1):
super(Bottleneck, self).__init__()
expansion = Bottleneck.expansion
bottle_planes = planes // expansion
self.conv1 = nn.Conv2d(inplanes, bottle_planes,
kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
stride=stride, padding=dilation,
bias=False, dilation=dilation)
self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
self.conv3 = nn.Conv2d(bottle_planes, planes,
kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.stride = stride
def forward(self, x, residual=None):
if residual is None:
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
out += residual
out = self.relu(out)
return out
class BottleneckX(nn.Module):
expansion = 2
cardinality = 32
def __init__(self, inplanes, planes, stride=1, dilation=1):
super(BottleneckX, self).__init__()
cardinality = BottleneckX.cardinality
# dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0)))
# bottle_planes = dim * cardinality
bottle_planes = planes * cardinality // 32
self.conv1 = nn.Conv2d(inplanes, bottle_planes,
kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
stride=stride, padding=dilation, bias=False,
dilation=dilation, groups=cardinality)
self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
self.conv3 = nn.Conv2d(bottle_planes, planes,
kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.stride = stride
def forward(self, x, residual=None):
if residual is None:
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
out += residual
out = self.relu(out)
return out
class Root(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, residual):
super(Root, self).__init__()
self.conv = nn.Conv2d(
in_channels, out_channels, 1,
stride=1, bias=False, padding=(kernel_size - 1) // 2)
self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.residual = residual
def forward(self, *x):
children = x
x = self.conv(torch.cat(x, 1))
x = self.bn(x)
if self.residual:
x += children[0]
x = self.relu(x)
return x
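# Illustrative sketch (hypothetical helper, not part of the model code):
# `Root` fuses any number of children by channel-concatenation followed by a
# 1x1 conv + BN (+ optional residual from the first child) + ReLU.
def _demo_root():
    root = Root(in_channels=32, out_channels=16, kernel_size=1, residual=False)
    a = torch.randn(1, 16, 8, 8)
    b = torch.randn(1, 16, 8, 8)
    return root(a, b).shape  # torch.Size([1, 16, 8, 8])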
class Tree(nn.Module):
def __init__(self, levels, block, in_channels, out_channels, stride=1,
level_root=False, root_dim=0, root_kernel_size=1,
dilation=1, root_residual=False):
super(Tree, self).__init__()
if root_dim == 0:
root_dim = 2 * out_channels
if level_root:
root_dim += in_channels
if levels == 1:
self.tree1 = block(in_channels, out_channels, stride,
dilation=dilation)
self.tree2 = block(out_channels, out_channels, 1,
dilation=dilation)
else:
self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
stride, root_dim=0,
root_kernel_size=root_kernel_size,
dilation=dilation, root_residual=root_residual)
self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
root_dim=root_dim + out_channels,
root_kernel_size=root_kernel_size,
dilation=dilation, root_residual=root_residual)
if levels == 1:
self.root = Root(root_dim, out_channels, root_kernel_size,
root_residual)
self.level_root = level_root
self.root_dim = root_dim
self.downsample = None
self.project = None
self.levels = levels
if stride > 1:
self.downsample = nn.MaxPool2d(stride, stride=stride)
if in_channels != out_channels:
self.project = nn.Sequential(
nn.Conv2d(in_channels, out_channels,
kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
)
def forward(self, x, residual=None, children=None):
children = [] if children is None else children
bottom = self.downsample(x) if self.downsample else x
residual = self.project(bottom) if self.project else bottom
if self.level_root:
children.append(bottom)
x1 = self.tree1(x, residual)
if self.levels == 1:
x2 = self.tree2(x1)
x = self.root(x2, x1, *children)
else:
children.append(x1)
x = self.tree2(x1, children=children)
return x
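# Illustrative sketch (hypothetical helper): a one-level `Tree` stacks two
# blocks and fuses their outputs through a `Root`; with stride=2 the first
# block and the MaxPool shortcut both halve the spatial resolution.
def _demo_tree():
    tree = Tree(levels=1, block=BasicBlock, in_channels=16, out_channels=32,
                stride=2)
    return tree(torch.randn(1, 16, 16, 16)).shape  # torch.Size([1, 32, 8, 8])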
class DLA(nn.Module):
def __init__(self, levels, channels, num_classes=1000,
block=BasicBlock, residual_root=False, linear_root=False):
super(DLA, self).__init__()
self.channels = channels
self.num_classes = num_classes
self.base_layer = nn.Sequential(
nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
padding=3, bias=False),
nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
nn.ReLU(inplace=True))
self.level0 = self._make_conv_level(
channels[0], channels[0], levels[0])
self.level1 = self._make_conv_level(
channels[0], channels[1], levels[1], stride=2)
self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
level_root=False,
root_residual=residual_root)
self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
level_root=True, root_residual=residual_root)
self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
level_root=True, root_residual=residual_root)
self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
level_root=True, root_residual=residual_root)
# for m in self.modules():
# if isinstance(m, nn.Conv2d):
# n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
# m.weight.data.normal_(0, math.sqrt(2. / n))
# elif isinstance(m, nn.BatchNorm2d):
# m.weight.data.fill_(1)
# m.bias.data.zero_()
    # NOTE: unused in this file; the `BasicBlock`/`Bottleneck` defined above
    # take no `downsample` argument, so this helper would fail if it were called.
    def _make_level(self, block, inplanes, planes, blocks, stride=1):
downsample = None
if stride != 1 or inplanes != planes:
downsample = nn.Sequential(
nn.MaxPool2d(stride, stride=stride),
nn.Conv2d(inplanes, planes,
kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
)
layers = []
layers.append(block(inplanes, planes, stride, downsample=downsample))
for i in range(1, blocks):
layers.append(block(inplanes, planes))
return nn.Sequential(*layers)
def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
modules = []
for i in range(convs):
modules.extend([
nn.Conv2d(inplanes, planes, kernel_size=3,
stride=stride if i == 0 else 1,
padding=dilation, bias=False, dilation=dilation),
nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True)])
inplanes = planes
return nn.Sequential(*modules)
def forward(self, x):
y = []
x = self.base_layer(x)
for i in range(6):
x = getattr(self, 'level{}'.format(i))(x)
y.append(x)
return y
def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'):
# fc = self.fc
if name.endswith('.pth'):
model_weights = torch.load(data + name)
else:
model_url = get_model_url(data, name, hash)
model_weights = model_zoo.load_url(model_url)
num_classes = len(model_weights[list(model_weights.keys())[-1]])
self.fc = nn.Conv2d(
self.channels[-1], num_classes,
kernel_size=1, stride=1, padding=0, bias=True)
self.load_state_dict(model_weights)
# self.fc = fc
def dla34(pretrained=True, **kwargs): # DLA-34
model = DLA([1, 1, 1, 2, 2, 1],
[16, 32, 64, 128, 256, 512],
block=BasicBlock, **kwargs)
if pretrained:
model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86')
return model
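# Illustrative sketch (hypothetical helper): `dla34` returns six feature maps
# at strides 1, 2, 4, 8, 16 and 32; `pretrained=False` here only avoids the
# weight download in this example.
def _demo_dla34():
    model = dla34(pretrained=False)
    feats = model(torch.randn(1, 3, 64, 64))
    return [f.shape for f in feats]  # channels 16, 32, 64, 128, 256, 512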
class Identity(nn.Module):
def __init__(self):
super(Identity, self).__init__()
def forward(self, x):
return x
def fill_up_weights(up):
w = up.weight.data
f = math.ceil(w.size(2) / 2)
c = (2 * f - 1 - f % 2) / (2. * f)
for i in range(w.size(2)):
for j in range(w.size(3)):
w[0, 0, i, j] = \
(1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
for c in range(1, w.size(0)):
w[c, 0, :, :] = w[0, 0, :, :]
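# Illustrative sketch (hypothetical helper): `fill_up_weights` initialises a
# depthwise ConvTranspose2d as a fixed bilinear upsampler, so a constant map
# stays (approximately) constant after 2x upsampling.
def _demo_fill_up_weights():
    up = nn.ConvTranspose2d(4, 4, 4, stride=2, padding=1, groups=4, bias=False)
    fill_up_weights(up)
    return up(torch.ones(1, 4, 8, 8)).shape  # torch.Size([1, 4, 16, 16])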
class DeformConv(nn.Module):
def __init__(self, chi, cho):
super(DeformConv, self).__init__()
self.actf = nn.Sequential(
nn.BatchNorm2d(cho, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True)
)
self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1)
def forward(self, x):
x = self.conv(x)
x = self.actf(x)
return x
class IDAUp(nn.Module):
def __init__(self, o, channels, up_f):
super(IDAUp, self).__init__()
for i in range(1, len(channels)):
c = channels[i]
f = int(up_f[i])
proj = DeformConv(c, o)
node = DeformConv(o, o)
up = nn.ConvTranspose2d(o, o, f * 2, stride=f,
padding=f // 2, output_padding=0,
groups=o, bias=False)
fill_up_weights(up)
setattr(self, 'proj_' + str(i), proj)
setattr(self, 'up_' + str(i), up)
setattr(self, 'node_' + str(i), node)
def forward(self, layers, startp, endp):
for i in range(startp + 1, endp):
upsample = getattr(self, 'up_' + str(i - startp))
project = getattr(self, 'proj_' + str(i - startp))
layers[i] = upsample(project(layers[i]))
node = getattr(self, 'node_' + str(i - startp))
layers[i] = node(layers[i] + layers[i - 1])
class DLAUp(nn.Module):
def __init__(self, startp, channels, scales, in_channels=None):
super(DLAUp, self).__init__()
self.startp = startp
if in_channels is None:
in_channels = channels
self.channels = channels
channels = list(channels)
scales = np.array(scales, dtype=int)
for i in range(len(channels) - 1):
j = -i - 2
setattr(self, 'ida_{}'.format(i),
IDAUp(channels[j], in_channels[j:],
scales[j:] // scales[j]))
scales[j + 1:] = scales[j]
in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]
def forward(self, layers):
out = [layers[-1]] # start with 32
for i in range(len(layers) - self.startp - 1):
ida = getattr(self, 'ida_{}'.format(i))
            ida(layers, len(layers) - i - 2, len(layers))
out.insert(0, layers[-1])
return out
class Interpolate(nn.Module):
def __init__(self, scale, mode):
super(Interpolate, self).__init__()
self.scale = scale
self.mode = mode
def forward(self, x):
x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False)
return x
class DLASeg(nn.Module):
def __init__(self, base_name, pretrained, down_ratio, final_kernel,
last_level, out_channel=0):
super(DLASeg, self).__init__()
assert down_ratio in [2, 4, 8, 16]
self.first_level = int(np.log2(down_ratio))
self.last_level = last_level
self.base = globals()[base_name](pretrained=pretrained)
channels = self.base.channels
scales = [2 ** i for i in range(len(channels[self.first_level:]))]
self.dla_up = DLAUp(self.first_level, channels[self.first_level:], scales)
if out_channel == 0:
out_channel = channels[self.first_level]
self.ida_up = IDAUp(out_channel, channels[self.first_level:self.last_level],
[2 ** i for i in range(self.last_level - self.first_level)])
def forward(self, x):
x = self.base(x)
x = self.dla_up(x)
y = []
for i in range(self.last_level - self.first_level):
y.append(x[i].clone())
self.ida_up(y, 0, len(y))
x = y[-1]
return x
def get_pose_net(num_layers, cfg=None, down_ratio=4):
model = DLASeg('dla{}'.format(num_layers),
pretrained=True,
down_ratio=down_ratio,
final_kernel=1,
last_level=5)
return model
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (leoxiaobin@gmail.com)
# Modified by Bowen Cheng (bcheng9@illinois.edu)
# ------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
import logging
import os
import torch
import torch.nn as nn
BN_MOMENTUM = 0.1
logger = logging.getLogger(__name__)
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion,
momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class HighResolutionModule(nn.Module):
def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
num_channels, fuse_method, multi_scale_output=True):
super(HighResolutionModule, self).__init__()
self._check_branches(
num_branches, blocks, num_blocks, num_inchannels, num_channels)
self.num_inchannels = num_inchannels
self.fuse_method = fuse_method
self.num_branches = num_branches
self.multi_scale_output = multi_scale_output
self.branches = self._make_branches(
num_branches, blocks, num_blocks, num_channels)
self.fuse_layers = self._make_fuse_layers()
self.relu = nn.ReLU(True)
def _check_branches(self, num_branches, blocks, num_blocks,
num_inchannels, num_channels):
if num_branches != len(num_blocks):
error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
num_branches, len(num_blocks))
logger.error(error_msg)
raise ValueError(error_msg)
if num_branches != len(num_channels):
error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
num_branches, len(num_channels))
logger.error(error_msg)
raise ValueError(error_msg)
if num_branches != len(num_inchannels):
error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
num_branches, len(num_inchannels))
logger.error(error_msg)
raise ValueError(error_msg)
def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
stride=1):
downsample = None
if stride != 1 or \
self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.num_inchannels[branch_index],
num_channels[branch_index] * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(num_channels[branch_index] * block.expansion,
momentum=BN_MOMENTUM),
)
layers = []
layers.append(block(self.num_inchannels[branch_index],
num_channels[branch_index], stride, downsample))
self.num_inchannels[branch_index] = \
num_channels[branch_index] * block.expansion
for i in range(1, num_blocks[branch_index]):
layers.append(block(self.num_inchannels[branch_index],
num_channels[branch_index]))
return nn.Sequential(*layers)
def _make_branches(self, num_branches, block, num_blocks, num_channels):
branches = []
for i in range(num_branches):
branches.append(
self._make_one_branch(i, block, num_blocks, num_channels))
return nn.ModuleList(branches)
def _make_fuse_layers(self):
if self.num_branches == 1:
return None
num_branches = self.num_branches
num_inchannels = self.num_inchannels
fuse_layers = []
for i in range(num_branches if self.multi_scale_output else 1):
fuse_layer = []
for j in range(num_branches):
if j > i:
fuse_layer.append(nn.Sequential(
nn.Conv2d(num_inchannels[j],
num_inchannels[i],
1,
1,
0,
bias=False),
nn.BatchNorm2d(num_inchannels[i]),
nn.Upsample(scale_factor=2**(j-i), mode='nearest')))
elif j == i:
fuse_layer.append(None)
else:
conv3x3s = []
for k in range(i-j):
if k == i - j - 1:
num_outchannels_conv3x3 = num_inchannels[i]
conv3x3s.append(nn.Sequential(
nn.Conv2d(num_inchannels[j],
num_outchannels_conv3x3,
3, 2, 1, bias=False),
nn.BatchNorm2d(num_outchannels_conv3x3)))
else:
num_outchannels_conv3x3 = num_inchannels[j]
conv3x3s.append(nn.Sequential(
nn.Conv2d(num_inchannels[j],
num_outchannels_conv3x3,
3, 2, 1, bias=False),
nn.BatchNorm2d(num_outchannels_conv3x3),
nn.ReLU(True)))
fuse_layer.append(nn.Sequential(*conv3x3s))
fuse_layers.append(nn.ModuleList(fuse_layer))
return nn.ModuleList(fuse_layers)
def get_num_inchannels(self):
return self.num_inchannels
def forward(self, x):
if self.num_branches == 1:
return [self.branches[0](x[0])]
for i in range(self.num_branches):
x[i] = self.branches[i](x[i])
x_fuse = []
for i in range(len(self.fuse_layers)):
y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
for j in range(1, self.num_branches):
if i == j:
y = y + x[j]
else:
y = y + self.fuse_layers[i][j](x[j])
x_fuse.append(self.relu(y))
return x_fuse
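# Illustrative sketch (hypothetical helper): a two-branch module keeps one
# path at full resolution and one at half resolution, then fuses them with a
# 1x1 conv + upsample (low -> high) and a strided 3x3 conv (high -> low).
def _demo_hr_module():
    m = HighResolutionModule(
        num_branches=2, blocks=BasicBlock, num_blocks=[2, 2],
        num_inchannels=[32, 64], num_channels=[32, 64], fuse_method='SUM')
    xs = [torch.randn(1, 32, 16, 16), torch.randn(1, 64, 8, 8)]
    return [y.shape for y in m(xs)]  # [(1, 32, 16, 16), (1, 64, 8, 8)]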
blocks_dict = {
'BASIC': BasicBlock,
'BOTTLENECK': Bottleneck
}
class PoseHigherResolutionNet(nn.Module):
def __init__(self, cfg, **kwargs):
self.inplanes = 64
extra = cfg.MODEL.EXTRA
super(PoseHigherResolutionNet, self).__init__()
# stem net
self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1,
bias=False)
self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1,
bias=False)
self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.layer1 = self._make_layer(Bottleneck, 64, 4)
self.stage2_cfg = cfg['MODEL']['EXTRA']['STAGE2']
num_channels = self.stage2_cfg['NUM_CHANNELS']
block = blocks_dict[self.stage2_cfg['BLOCK']]
num_channels = [
num_channels[i] * block.expansion for i in range(len(num_channels))
]
self.transition1 = self._make_transition_layer([256], num_channels)
self.stage2, pre_stage_channels = self._make_stage(
self.stage2_cfg, num_channels)
self.stage3_cfg = cfg['MODEL']['EXTRA']['STAGE3']
num_channels = self.stage3_cfg['NUM_CHANNELS']
block = blocks_dict[self.stage3_cfg['BLOCK']]
num_channels = [
num_channels[i] * block.expansion for i in range(len(num_channels))
]
self.transition2 = self._make_transition_layer(
pre_stage_channels, num_channels)
self.stage3, pre_stage_channels = self._make_stage(
self.stage3_cfg, num_channels)
self.stage4_cfg = cfg['MODEL']['EXTRA']['STAGE4']
num_channels = self.stage4_cfg['NUM_CHANNELS']
block = blocks_dict[self.stage4_cfg['BLOCK']]
num_channels = [
num_channels[i] * block.expansion for i in range(len(num_channels))
]
self.transition3 = self._make_transition_layer(
pre_stage_channels, num_channels)
self.stage4, pre_stage_channels = self._make_stage(
self.stage4_cfg, num_channels, multi_scale_output=False)
#self.final_layers = self._make_final_layers(cfg, pre_stage_channels[0])
#self.deconv_layers = self._make_deconv_layers(
# cfg, pre_stage_channels[0])
self.num_deconvs = extra.DECONV.NUM_DECONVS
self.deconv_config = cfg.MODEL.EXTRA.DECONV
self.loss_config = cfg.LOSS
self.pretrained_layers = cfg['MODEL']['EXTRA']['PRETRAINED_LAYERS']
def _make_final_layers(self, cfg, input_channels):
dim_tag = cfg.MODEL.NUM_JOINTS if cfg.MODEL.TAG_PER_JOINT else 1
extra = cfg.MODEL.EXTRA
final_layers = []
output_channels = cfg.MODEL.NUM_JOINTS + dim_tag \
if cfg.LOSS.WITH_AE_LOSS[0] else cfg.MODEL.NUM_JOINTS
final_layers.append(nn.Conv2d(
in_channels=input_channels,
out_channels=output_channels,
kernel_size=extra.FINAL_CONV_KERNEL,
stride=1,
padding=1 if extra.FINAL_CONV_KERNEL == 3 else 0
))
deconv_cfg = extra.DECONV
for i in range(deconv_cfg.NUM_DECONVS):
input_channels = deconv_cfg.NUM_CHANNELS[i]
output_channels = cfg.MODEL.NUM_JOINTS + dim_tag \
if cfg.LOSS.WITH_AE_LOSS[i+1] else cfg.MODEL.NUM_JOINTS
final_layers.append(nn.Conv2d(
in_channels=input_channels,
out_channels=output_channels,
kernel_size=extra.FINAL_CONV_KERNEL,
stride=1,
padding=1 if extra.FINAL_CONV_KERNEL == 3 else 0
))
return nn.ModuleList(final_layers)
def _make_deconv_layers(self, cfg, input_channels):
dim_tag = cfg.MODEL.NUM_JOINTS if cfg.MODEL.TAG_PER_JOINT else 1
extra = cfg.MODEL.EXTRA
deconv_cfg = extra.DECONV
deconv_layers = []
for i in range(deconv_cfg.NUM_DECONVS):
if deconv_cfg.CAT_OUTPUT[i]:
final_output_channels = cfg.MODEL.NUM_JOINTS + dim_tag \
if cfg.LOSS.WITH_AE_LOSS[i] else cfg.MODEL.NUM_JOINTS
input_channels += final_output_channels
output_channels = deconv_cfg.NUM_CHANNELS[i]
deconv_kernel, padding, output_padding = \
self._get_deconv_cfg(deconv_cfg.KERNEL_SIZE[i])
layers = []
layers.append(nn.Sequential(
nn.ConvTranspose2d(
in_channels=input_channels,
out_channels=output_channels,
kernel_size=deconv_kernel,
stride=2,
padding=padding,
output_padding=output_padding,
bias=False),
nn.BatchNorm2d(output_channels, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True)
))
for _ in range(cfg.MODEL.EXTRA.DECONV.NUM_BASIC_BLOCKS):
layers.append(nn.Sequential(
BasicBlock(output_channels, output_channels),
))
deconv_layers.append(nn.Sequential(*layers))
input_channels = output_channels
return nn.ModuleList(deconv_layers)
def _get_deconv_cfg(self, deconv_kernel):
if deconv_kernel == 4:
padding = 1
output_padding = 0
elif deconv_kernel == 3:
padding = 1
output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0
        else:
            raise ValueError(
                'deconv kernel size {} is not supported'.format(deconv_kernel))
        return deconv_kernel, padding, output_padding
def _make_transition_layer(
self, num_channels_pre_layer, num_channels_cur_layer):
num_branches_cur = len(num_channels_cur_layer)
num_branches_pre = len(num_channels_pre_layer)
transition_layers = []
for i in range(num_branches_cur):
if i < num_branches_pre:
if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
transition_layers.append(nn.Sequential(
nn.Conv2d(num_channels_pre_layer[i],
num_channels_cur_layer[i],
3,
1,
1,
bias=False),
nn.BatchNorm2d(num_channels_cur_layer[i]),
nn.ReLU(inplace=True)))
else:
transition_layers.append(None)
else:
conv3x3s = []
for j in range(i+1-num_branches_pre):
inchannels = num_channels_pre_layer[-1]
outchannels = num_channels_cur_layer[i] \
if j == i-num_branches_pre else inchannels
conv3x3s.append(nn.Sequential(
nn.Conv2d(
inchannels, outchannels, 3, 2, 1, bias=False),
nn.BatchNorm2d(outchannels),
nn.ReLU(inplace=True)))
transition_layers.append(nn.Sequential(*conv3x3s))
return nn.ModuleList(transition_layers)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def _make_stage(self, layer_config, num_inchannels,
multi_scale_output=True):
num_modules = layer_config['NUM_MODULES']
num_branches = layer_config['NUM_BRANCHES']
num_blocks = layer_config['NUM_BLOCKS']
num_channels = layer_config['NUM_CHANNELS']
block = blocks_dict[layer_config['BLOCK']]
fuse_method = layer_config['FUSE_METHOD']
modules = []
for i in range(num_modules):
            # multi_scale_output is only used by the last module
if not multi_scale_output and i == num_modules - 1:
reset_multi_scale_output = False
else:
reset_multi_scale_output = True
modules.append(
HighResolutionModule(
num_branches,
block,
num_blocks,
num_inchannels,
num_channels,
fuse_method,
reset_multi_scale_output)
)
num_inchannels = modules[-1].get_num_inchannels()
return nn.Sequential(*modules), num_inchannels
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
x = self.layer1(x)
x_list = []
for i in range(self.stage2_cfg['NUM_BRANCHES']):
if self.transition1[i] is not None:
x_list.append(self.transition1[i](x))
else:
x_list.append(x)
y_list = self.stage2(x_list)
x_list = []
for i in range(self.stage3_cfg['NUM_BRANCHES']):
if self.transition2[i] is not None:
x_list.append(self.transition2[i](y_list[-1]))
else:
x_list.append(y_list[i])
y_list = self.stage3(x_list)
x_list = []
for i in range(self.stage4_cfg['NUM_BRANCHES']):
if self.transition3[i] is not None:
x_list.append(self.transition3[i](y_list[-1]))
else:
x_list.append(y_list[i])
y_list = self.stage4(x_list)
x = y_list[0]
return x
def init_weights(self, pretrained='', verbose=True):
logger.info('=> init weights from normal distribution')
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, std=0.001)
for name, _ in m.named_parameters():
if name in ['bias']:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.ConvTranspose2d):
nn.init.normal_(m.weight, std=0.001)
for name, _ in m.named_parameters():
if name in ['bias']:
nn.init.constant_(m.bias, 0)
parameters_names = set()
for name, _ in self.named_parameters():
parameters_names.add(name)
buffers_names = set()
for name, _ in self.named_buffers():
buffers_names.add(name)
if os.path.isfile(pretrained):
pretrained_state_dict = torch.load(pretrained)
logger.info('=> loading pretrained model {}'.format(pretrained))
need_init_state_dict = {}
for name, m in pretrained_state_dict.items():
                if name.split('.')[0] in self.pretrained_layers \
                        or self.pretrained_layers[0] == '*':
                    if name in parameters_names or name in buffers_names:
                        logger.info('=> init {} from {}'.format(name, pretrained))
need_init_state_dict[name] = m
self.load_state_dict(need_init_state_dict, strict=False)
print('High Resolution Network Trained on ImageNet loaded')
def get_hrpose_net(num_layers, cfg, **kwargs):
model = PoseHigherResolutionNet(cfg, **kwargs)
if cfg.MODEL.INIT_WEIGHTS:
model.init_weights(cfg.MODEL.PRETRAINED)
return model
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# Modified by Dequan Wang and Xingyi Zhou
# ------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
import logging
import math
import os
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
from .DCNv2.dcn_v2 import DCN
BN_MOMENTUM = 0.1
logger = logging.getLogger(__name__)
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion,
momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
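# Illustrative sketch (hypothetical helper): `Bottleneck` expands channels by
# `expansion`=4, so the shortcut needs a 1x1 projection when dims change.
def _demo_bottleneck():
    down = nn.Sequential(
        nn.Conv2d(64, 64 * Bottleneck.expansion, kernel_size=1, bias=False),
        nn.BatchNorm2d(64 * Bottleneck.expansion, momentum=BN_MOMENTUM))
    block = Bottleneck(64, 64, stride=1, downsample=down)
    return block(torch.randn(1, 64, 16, 16)).shape  # torch.Size([1, 256, 16, 16])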
def fill_up_weights(up):
w = up.weight.data
f = math.ceil(w.size(2) / 2)
c = (2 * f - 1 - f % 2) / (2. * f)
for i in range(w.size(2)):
for j in range(w.size(3)):
w[0, 0, i, j] = \
(1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
for c in range(1, w.size(0)):
w[c, 0, :, :] = w[0, 0, :, :]
def fill_fc_weights(layers):
for m in layers.modules():
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, std=0.001)
# torch.nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
# torch.nn.init.xavier_normal_(m.weight.data)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
class PoseResNet(nn.Module):
def __init__(self, block, layers, heads, head_conv):
self.inplanes = 64
self.heads = heads
self.deconv_with_bias = False
super(PoseResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
# used for deconv layers
self.deconv_layers = self._make_deconv_layer(
3,
[256, 128, 64],
[4, 4, 4],
)
for head in self.heads:
classes = self.heads[head]
if head_conv > 0:
fc = nn.Sequential(
nn.Conv2d(64, head_conv,
kernel_size=3, padding=1, bias=True),
nn.ReLU(inplace=True),
nn.Conv2d(head_conv, classes,
kernel_size=1, stride=1,
padding=0, bias=True))
if 'hm' in head:
fc[-1].bias.data.fill_(-2.19)
else:
fill_fc_weights(fc)
else:
fc = nn.Conv2d(64, classes,
kernel_size=1, stride=1,
padding=0, bias=True)
if 'hm' in head:
fc.bias.data.fill_(-2.19)
else:
fill_fc_weights(fc)
self.__setattr__(head, fc)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def _get_deconv_cfg(self, deconv_kernel, index):
if deconv_kernel == 4:
padding = 1
output_padding = 0
elif deconv_kernel == 3:
padding = 1
output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0
        else:
            raise ValueError(
                'deconv kernel size {} is not supported'.format(deconv_kernel))
        return deconv_kernel, padding, output_padding
def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_deconv_layers is different from len(num_deconv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_deconv_layers is different from len(num_deconv_kernels)'
layers = []
for i in range(num_layers):
kernel, padding, output_padding = \
self._get_deconv_cfg(num_kernels[i], i)
planes = num_filters[i]
fc = DCN(self.inplanes, planes,
kernel_size=(3,3), stride=1,
padding=1, dilation=1, deformable_groups=1)
# fc = nn.Conv2d(self.inplanes, planes,
# kernel_size=3, stride=1,
# padding=1, dilation=1, bias=False)
# fill_fc_weights(fc)
up = nn.ConvTranspose2d(
in_channels=planes,
out_channels=planes,
kernel_size=kernel,
stride=2,
padding=padding,
output_padding=output_padding,
bias=self.deconv_with_bias)
fill_up_weights(up)
layers.append(fc)
layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
layers.append(nn.ReLU(inplace=True))
layers.append(up)
layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
layers.append(nn.ReLU(inplace=True))
self.inplanes = planes
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.deconv_layers(x)
ret = {}
for head in self.heads:
ret[head] = self.__getattr__(head)(x)
return [ret]
    def init_weights(self, num_layers):
        url = model_urls['resnet{}'.format(num_layers)]
        pretrained_state_dict = model_zoo.load_url(url)
        print('=> loading pretrained model {}'.format(url))
        self.load_state_dict(pretrained_state_dict, strict=False)
        print('=> init deconv weights from normal distribution')
        for name, m in self.deconv_layers.named_modules():
            if isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]),
34: (BasicBlock, [3, 4, 6, 3]),
50: (Bottleneck, [3, 4, 6, 3]),
101: (Bottleneck, [3, 4, 23, 3]),
152: (Bottleneck, [3, 8, 36, 3])}
def get_pose_net(num_layers, heads, head_conv=256):
block_class, layers = resnet_spec[num_layers]
model = PoseResNet(block_class, layers, heads, head_conv=head_conv)
model.init_weights(num_layers)
return model
import math
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import init
from .DCNv2.dcn_v2 import DCN
BN_MOMENTUM = 0.1
def conv_bn(inp, oup, stride):
return nn.Sequential(
nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
nn.BatchNorm2d(oup),
nn.ReLU(inplace=True)
)
def conv_1x1_bn(inp, oup):
return nn.Sequential(
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
nn.ReLU(inplace=True)
)
def channel_shuffle(x, groups):
batchsize, num_channels, height, width = x.data.size()
channels_per_group = num_channels // groups
# reshape
x = x.view(batchsize, groups,
channels_per_group, height, width)
x = torch.transpose(x, 1, 2).contiguous()
# flatten
x = x.view(batchsize, -1, height, width)
return x
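# Illustrative sketch (hypothetical helper): with groups=2 the channels
# [0..7] are interleaved as [0, 4, 1, 5, 2, 6, 3, 7], mixing information
# between the two ShuffleNet branches.
def _demo_channel_shuffle():
    x = torch.arange(8.).view(1, 8, 1, 1)
    return channel_shuffle(x, 2).view(-1).tolist()  # [0, 4, 1, 5, 2, 6, 3, 7]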
def fill_up_weights(up):
w = up.weight.data
f = math.ceil(w.size(2) / 2)
c = (2 * f - 1 - f % 2) / (2. * f)
for i in range(w.size(2)):
for j in range(w.size(3)):
w[0, 0, i, j] = \
(1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
for c in range(1, w.size(0)):
w[c, 0, :, :] = w[0, 0, :, :]
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride, benchmodel):
super(InvertedResidual, self).__init__()
self.benchmodel = benchmodel
self.stride = stride
assert stride in [1, 2]
oup_inc = oup//2
if self.benchmodel == 1:
#assert inp == oup_inc
self.banch2 = nn.Sequential(
# pw
nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup_inc),
nn.ReLU(inplace=True),
# dw
nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
nn.BatchNorm2d(oup_inc),
# pw-linear
nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup_inc),
nn.ReLU(inplace=True),
)
else:
self.banch1 = nn.Sequential(
# dw
nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
nn.BatchNorm2d(inp),
# pw-linear
nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup_inc),
nn.ReLU(inplace=True),
)
self.banch2 = nn.Sequential(
# pw
nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup_inc),
nn.ReLU(inplace=True),
# dw
nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
nn.BatchNorm2d(oup_inc),
# pw-linear
nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup_inc),
nn.ReLU(inplace=True),
)
@staticmethod
def _concat(x, out):
# concatenate along channel axis
return torch.cat((x, out), 1)
def forward(self, x):
if 1==self.benchmodel:
x1 = x[:, :(x.shape[1]//2), :, :]
x2 = x[:, (x.shape[1]//2):, :, :]
out = self._concat(x1, self.banch2(x2))
elif 2==self.benchmodel:
out = self._concat(self.banch1(x), self.banch2(x))
return channel_shuffle(out, 2)
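# Illustrative sketch (hypothetical helper): benchmodel=1 keeps the spatial
# size and channel count, splitting the input in half, transforming one half
# and shuffling the concatenated result.
def _demo_inverted_residual():
    blk = InvertedResidual(116, 116, stride=1, benchmodel=1)
    return blk(torch.randn(1, 116, 16, 16)).shape  # torch.Size([1, 116, 16, 16])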
class ShuffleNetV2(nn.Module):
def __init__(self, input_size=512, width_mult=1.):
super(ShuffleNetV2, self).__init__()
self.inplanes = 24
self.deconv_with_bias = False
assert input_size % 32 == 0
self.stage_repeats = [4, 8, 4]
#self.stage_repeats = [2, 3, 2]
# index 0 is invalid and should never be called.
# only used for indexing convenience.
if width_mult == 0.5:
self.stage_out_channels = [-1, 24, 48, 96, 192, 1024]
elif width_mult == 1.0:
self.stage_out_channels = [-1, 24, 116, 232, 464, 1024]
elif width_mult == 1.5:
self.stage_out_channels = [-1, 24, 176, 352, 704, 1024]
elif width_mult == 2.0:
self.stage_out_channels = [-1, 24, 224, 488, 976, 2048]
        else:
            raise ValueError(
                'width_mult {} is not supported for '
                '1x1 Grouped Convolutions'.format(width_mult))
# building first layer
input_channel = self.stage_out_channels[1]
self.conv1 = conv_bn(3, input_channel, 2)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.features = []
# building inverted residual blocks
for idxstage in range(len(self.stage_repeats)):
numrepeat = self.stage_repeats[idxstage]
output_channel = self.stage_out_channels[idxstage+2]
for i in range(numrepeat):
if i == 0:
#inp, oup, stride, benchmodel):
self.features.append(InvertedResidual(input_channel, output_channel, 2, 2))
else:
self.features.append(InvertedResidual(input_channel, output_channel, 1, 1))
input_channel = output_channel
self.inplanes = output_channel
# make it nn.Sequential
self.features = nn.Sequential(*self.features)
        # consider adding the last several layers here
# building last several layers
# self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1])
# self.globalpool = nn.Sequential(nn.AvgPool2d(int(input_size/32)))
# used for deconv layers
self.deconv_layers = self._make_deconv_layer(
3,
[256, 256, 256],
[4, 4, 4],
)
def _get_deconv_cfg(self, deconv_kernel, index):
if deconv_kernel == 4:
padding = 1
output_padding = 0
elif deconv_kernel == 3:
padding = 1
output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0
        else:
            raise ValueError(
                'deconv kernel size {} is not supported'.format(deconv_kernel))
        return deconv_kernel, padding, output_padding
def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_deconv_layers is different from len(num_deconv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_deconv_layers is different from len(num_deconv_kernels)'
layers = []
for i in range(num_layers):
kernel, padding, output_padding = \
self._get_deconv_cfg(num_kernels[i], i)
planes = num_filters[i]
fc = DCN(self.inplanes, planes,
kernel_size=(3,3), stride=1,
padding=1, dilation=1, deformable_groups=1)
# fc = nn.Conv2d(self.inplanes, planes,
# kernel_size=3, stride=1,
# padding=1, dilation=1, bias=False)
# fill_fc_weights(fc)
up = nn.ConvTranspose2d(
in_channels=planes,
out_channels=planes,
kernel_size=kernel,
stride=2,
padding=padding,
output_padding=output_padding,
bias=self.deconv_with_bias)
fill_up_weights(up)
layers.append(fc)
layers.append(nn.BatchNorm2d(planes))
layers.append(nn.ReLU(inplace=True))
layers.append(up)
layers.append(nn.BatchNorm2d(planes))
layers.append(nn.ReLU(inplace=True))
self.inplanes = planes
return nn.Sequential(*layers)
def init_weights(self, pretrained=True):
if pretrained:
# print('=> init resnet deconv weights from normal distribution')
print('=> init deconv weights from normal distribution')
for name, m in self.deconv_layers.named_modules():
if isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
#pretrained_state_dict = torch.load(pretrained)
#address = "/data/pretrained_model/shufflenetv2_x1_69.390_88.412.pth.tar"
#pretrained_state_dict = torch.load(address)
#self.load_state_dict(pretrained_state_dict, strict=False)
def forward(self, x):
#import pdb; pdb.set_trace()
x = self.conv1(x)
x = self.maxpool(x)
x = self.features(x)
x = self.deconv_layers(x)
return x
def shufflenetv2(width_mult=1.):
model = ShuffleNetV2(width_mult=width_mult)
return model
def get_shufflev2_net(num_layers, cfg):
model = ShuffleNetV2()
    model.init_weights(pretrained=True)
return model
import torch
from torch.nn.modules import Module
from torch.nn.parallel.scatter_gather import gather
from torch.nn.parallel.replicate import replicate
from torch.nn.parallel.parallel_apply import parallel_apply
from .scatter_gather import scatter_kwargs
class _DataParallel(Module):
r"""Implements data parallelism at the module level.
This container parallelizes the application of the given module by
splitting the input across the specified devices by chunking in the batch
dimension. In the forward pass, the module is replicated on each device,
and each replica handles a portion of the input. During the backwards
pass, gradients from each replica are summed into the original module.
The batch size should be larger than the number of GPUs used. It should
also be an integer multiple of the number of GPUs so that each chunk is the
same size (so that each GPU processes the same number of samples).
See also: :ref:`cuda-nn-dataparallel-instead`
Arbitrary positional and keyword inputs are allowed to be passed into
DataParallel EXCEPT Tensors. All variables will be scattered on dim
specified (default 0). Primitive types will be broadcasted, but all
other types will be a shallow copy and can be corrupted if written to in
the model's forward pass.
Args:
module: module to be parallelized
device_ids: CUDA devices (default: all devices)
        output_device: device location of output (default: device_ids[0])
        chunk_sizes: optional per-device batch sizes for an uneven split
            (default: equal chunks across devices)
Example::
>>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
>>> output = net(input_var)
"""
# TODO: update notes/cuda.rst when this class handles 8+ GPUs well
def __init__(self, module, device_ids=None, output_device=None, dim=0, chunk_sizes=None):
super(_DataParallel, self).__init__()
if not torch.cuda.is_available():
self.module = module
self.device_ids = []
return
if device_ids is None:
device_ids = list(range(torch.cuda.device_count()))
if output_device is None:
output_device = device_ids[0]
self.dim = dim
self.module = module
self.device_ids = device_ids
self.chunk_sizes = chunk_sizes
self.output_device = output_device
if len(self.device_ids) == 1:
self.module.cuda(device_ids[0])
def forward(self, *inputs, **kwargs):
if not self.device_ids:
return self.module(*inputs, **kwargs)
inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids, self.chunk_sizes)
if len(self.device_ids) == 1:
return self.module(*inputs[0], **kwargs[0])
replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
outputs = self.parallel_apply(replicas, inputs, kwargs)
return self.gather(outputs, self.output_device)
def replicate(self, module, device_ids):
return replicate(module, device_ids)
def scatter(self, inputs, kwargs, device_ids, chunk_sizes):
return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim, chunk_sizes=self.chunk_sizes)
def parallel_apply(self, replicas, inputs, kwargs):
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
def gather(self, outputs, output_device):
return gather(outputs, output_device, dim=self.dim)
def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None):
r"""Evaluates module(input) in parallel across the GPUs given in device_ids.
This is the functional version of the DataParallel module.
Args:
module: the module to evaluate in parallel
inputs: inputs to the module
device_ids: GPU ids on which to replicate module
        output_device: GPU location of the output. Use -1 to indicate the CPU.
            (default: device_ids[0])
Returns:
a Variable containing the result of module(input) located on
output_device
"""
if not isinstance(inputs, tuple):
inputs = (inputs,)
if device_ids is None:
device_ids = list(range(torch.cuda.device_count()))
if output_device is None:
output_device = device_ids[0]
inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim)
if len(device_ids) == 1:
return module(*inputs[0], **module_kwargs[0])
used_device_ids = device_ids[:len(inputs)]
replicas = replicate(module, used_device_ids)
outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids)
return gather(outputs, output_device, dim)
def DataParallel(module, device_ids=None, output_device=None, dim=0, chunk_sizes=None):
if chunk_sizes is None:
return torch.nn.DataParallel(module, device_ids, output_device, dim)
standard_size = True
for i in range(1, len(chunk_sizes)):
if chunk_sizes[i] != chunk_sizes[0]:
standard_size = False
if standard_size:
return torch.nn.DataParallel(module, device_ids, output_device, dim)
return _DataParallel(module, device_ids, output_device, dim, chunk_sizes)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import torch.nn as nn
from .utils import _gather_feat, _tranpose_and_gather_feat
import numpy as np
def _nms(heat, kernel=3):
pad = (kernel - 1) // 2
hmax = nn.functional.max_pool2d(
heat, (kernel, kernel), stride=1, padding=pad)
keep = (hmax == heat).float()
return heat * keep
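# Illustrative sketch (hypothetical helper): the max-pool "NMS" zeroes every
# heatmap value that is not the maximum of its 3x3 neighbourhood.
def _demo_nms():
    heat = torch.zeros(1, 1, 5, 5)
    heat[0, 0, 2, 2] = 0.9  # local peak: kept
    heat[0, 0, 2, 3] = 0.8  # next to a larger value: suppressed to 0
    return _nms(heat)[0, 0]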
def _left_aggregate(heat):
'''
heat: batchsize x channels x h x w
'''
shape = heat.shape
heat = heat.reshape(-1, heat.shape[3])
heat = heat.transpose(1, 0).contiguous()
ret = heat.clone()
for i in range(1, heat.shape[0]):
inds = (heat[i] >= heat[i - 1])
ret[i] += ret[i - 1] * inds.float()
return (ret - heat).transpose(1, 0).reshape(shape)
def _right_aggregate(heat):
'''
heat: batchsize x channels x h x w
'''
shape = heat.shape
heat = heat.reshape(-1, heat.shape[3])
heat = heat.transpose(1, 0).contiguous()
ret = heat.clone()
for i in range(heat.shape[0] - 2, -1, -1):
        inds = (heat[i] >= heat[i + 1])
ret[i] += ret[i + 1] * inds.float()
return (ret - heat).transpose(1, 0).reshape(shape)
def _top_aggregate(heat):
'''
heat: batchsize x channels x h x w
'''
heat = heat.transpose(3, 2)
shape = heat.shape
heat = heat.reshape(-1, heat.shape[3])
heat = heat.transpose(1, 0).contiguous()
ret = heat.clone()
for i in range(1, heat.shape[0]):
inds = (heat[i] >= heat[i - 1])
ret[i] += ret[i - 1] * inds.float()
return (ret - heat).transpose(1, 0).reshape(shape).transpose(3, 2)
def _bottom_aggregate(heat):
'''
heat: batchsize x channels x h x w
'''
heat = heat.transpose(3, 2)
shape = heat.shape
heat = heat.reshape(-1, heat.shape[3])
heat = heat.transpose(1, 0).contiguous()
ret = heat.clone()
for i in range(heat.shape[0] - 2, -1, -1):
inds = (heat[i] >= heat[i + 1])
ret[i] += ret[i + 1] * inds.float()
return (ret - heat).transpose(1, 0).reshape(shape).transpose(3, 2)
def _h_aggregate(heat, aggr_weight=0.1):
return aggr_weight * _left_aggregate(heat) + \
aggr_weight * _right_aggregate(heat) + heat
def _v_aggregate(heat, aggr_weight=0.1):
return aggr_weight * _top_aggregate(heat) + \
aggr_weight * _bottom_aggregate(heat) + heat
'''
# Slow for large number of categories
def _topk(scores, K=40):
batch, cat, height, width = scores.size()
topk_scores, topk_inds = torch.topk(scores.view(batch, -1), K)
topk_clses = (topk_inds / (height * width)).int()
topk_inds = topk_inds % (height * width)
topk_ys = (topk_inds / width).int().float()
topk_xs = (topk_inds % width).int().float()
return topk_scores, topk_inds, topk_clses, topk_ys, topk_xs
'''
def _topk_channel(scores, K=40):
batch, cat, height, width = scores.size()
topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)
topk_inds = topk_inds % (height * width)
topk_ys = (topk_inds / width).int().float()
topk_xs = (topk_inds % width).int().float()
return topk_scores, topk_inds, topk_ys, topk_xs
def _topk(scores, K=40):
batch, cat, height, width = scores.size()
    topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)  # top K points per class
topk_inds = topk_inds % (height * width)
topk_ys = (topk_inds / width).int().float()
topk_xs = (topk_inds % width).int().float()
topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K)
topk_clses = (topk_ind / K).int()
topk_inds = _gather_feat(
topk_inds.view(batch, -1, 1), topk_ind).view(batch, K)
topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K)
topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K)
return topk_score, topk_inds, topk_clses, topk_ys, topk_xs
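# Illustrative sketch (hypothetical helper): `_topk` first takes the top K
# peaks per class, then the top K overall, and decodes each flat index into
# (class, y, x) coordinates on the feature map.
def _demo_topk():
    scores = torch.rand(1, 3, 8, 8)
    score, inds, clses, ys, xs = _topk(scores, K=5)
    return score.shape, clses, ys, xs  # each of shape (1, 5)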
def agnex_ct_decode(
t_heat, l_heat, b_heat, r_heat, ct_heat,
t_regr=None, l_regr=None, b_regr=None, r_regr=None,
K=40, scores_thresh=0.1, center_thresh=0.1, aggr_weight=0.0, num_dets=1000
):
batch, cat, height, width = t_heat.size()
'''
t_heat = torch.sigmoid(t_heat)
l_heat = torch.sigmoid(l_heat)
b_heat = torch.sigmoid(b_heat)
r_heat = torch.sigmoid(r_heat)
ct_heat = torch.sigmoid(ct_heat)
'''
if aggr_weight > 0:
t_heat = _h_aggregate(t_heat, aggr_weight=aggr_weight)
l_heat = _v_aggregate(l_heat, aggr_weight=aggr_weight)
b_heat = _h_aggregate(b_heat, aggr_weight=aggr_weight)
r_heat = _v_aggregate(r_heat, aggr_weight=aggr_weight)
# perform nms on heatmaps
t_heat = _nms(t_heat)
l_heat = _nms(l_heat)
b_heat = _nms(b_heat)
r_heat = _nms(r_heat)
t_heat[t_heat > 1] = 1
l_heat[l_heat > 1] = 1
b_heat[b_heat > 1] = 1
r_heat[r_heat > 1] = 1
t_scores, t_inds, _, t_ys, t_xs = _topk(t_heat, K=K)
l_scores, l_inds, _, l_ys, l_xs = _topk(l_heat, K=K)
b_scores, b_inds, _, b_ys, b_xs = _topk(b_heat, K=K)
r_scores, r_inds, _, r_ys, r_xs = _topk(r_heat, K=K)
ct_heat_agn, ct_clses = torch.max(ct_heat, dim=1, keepdim=True)
# import pdb; pdb.set_trace()
t_ys = t_ys.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
t_xs = t_xs.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
l_ys = l_ys.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
l_xs = l_xs.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
b_ys = b_ys.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
b_xs = b_xs.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
r_ys = r_ys.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
r_xs = r_xs.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
box_ct_xs = ((l_xs + r_xs + 0.5) / 2).long()
box_ct_ys = ((t_ys + b_ys + 0.5) / 2).long()
ct_inds = box_ct_ys * width + box_ct_xs
ct_inds = ct_inds.view(batch, -1)
ct_heat_agn = ct_heat_agn.view(batch, -1, 1)
ct_clses = ct_clses.view(batch, -1, 1)
ct_scores = _gather_feat(ct_heat_agn, ct_inds)
clses = _gather_feat(ct_clses, ct_inds)
t_scores = t_scores.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
l_scores = l_scores.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
b_scores = b_scores.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
r_scores = r_scores.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
ct_scores = ct_scores.view(batch, K, K, K, K)
scores = (t_scores + l_scores + b_scores + r_scores + 2 * ct_scores) / 6
    # reject boxes that violate extreme-point geometry or the score thresholds
top_inds = (t_ys > l_ys) + (t_ys > b_ys) + (t_ys > r_ys)
top_inds = (top_inds > 0)
left_inds = (l_xs > t_xs) + (l_xs > b_xs) + (l_xs > r_xs)
left_inds = (left_inds > 0)
bottom_inds = (b_ys < t_ys) + (b_ys < l_ys) + (b_ys < r_ys)
bottom_inds = (bottom_inds > 0)
right_inds = (r_xs < t_xs) + (r_xs < l_xs) + (r_xs < b_xs)
right_inds = (right_inds > 0)
sc_inds = (t_scores < scores_thresh) + (l_scores < scores_thresh) + \
(b_scores < scores_thresh) + (r_scores < scores_thresh) + \
(ct_scores < center_thresh)
sc_inds = (sc_inds > 0)
scores = scores - sc_inds.float()
scores = scores - top_inds.float()
scores = scores - left_inds.float()
scores = scores - bottom_inds.float()
scores = scores - right_inds.float()
scores = scores.view(batch, -1)
scores, inds = torch.topk(scores, num_dets)
scores = scores.unsqueeze(2)
if t_regr is not None and l_regr is not None \
and b_regr is not None and r_regr is not None:
t_regr = _tranpose_and_gather_feat(t_regr, t_inds)
t_regr = t_regr.view(batch, K, 1, 1, 1, 2)
l_regr = _tranpose_and_gather_feat(l_regr, l_inds)
l_regr = l_regr.view(batch, 1, K, 1, 1, 2)
b_regr = _tranpose_and_gather_feat(b_regr, b_inds)
b_regr = b_regr.view(batch, 1, 1, K, 1, 2)
r_regr = _tranpose_and_gather_feat(r_regr, r_inds)
r_regr = r_regr.view(batch, 1, 1, 1, K, 2)
t_xs = t_xs + t_regr[..., 0]
t_ys = t_ys + t_regr[..., 1]
l_xs = l_xs + l_regr[..., 0]
l_ys = l_ys + l_regr[..., 1]
b_xs = b_xs + b_regr[..., 0]
b_ys = b_ys + b_regr[..., 1]
r_xs = r_xs + r_regr[..., 0]
r_ys = r_ys + r_regr[..., 1]
else:
t_xs = t_xs + 0.5
t_ys = t_ys + 0.5
l_xs = l_xs + 0.5
l_ys = l_ys + 0.5
b_xs = b_xs + 0.5
b_ys = b_ys + 0.5
r_xs = r_xs + 0.5
r_ys = r_ys + 0.5
bboxes = torch.stack((l_xs, t_ys, r_xs, b_ys), dim=5)
bboxes = bboxes.view(batch, -1, 4)
bboxes = _gather_feat(bboxes, inds)
clses = clses.contiguous().view(batch, -1, 1)
clses = _gather_feat(clses, inds).float()
t_xs = t_xs.contiguous().view(batch, -1, 1)
t_xs = _gather_feat(t_xs, inds).float()
t_ys = t_ys.contiguous().view(batch, -1, 1)
t_ys = _gather_feat(t_ys, inds).float()
l_xs = l_xs.contiguous().view(batch, -1, 1)
l_xs = _gather_feat(l_xs, inds).float()
l_ys = l_ys.contiguous().view(batch, -1, 1)
l_ys = _gather_feat(l_ys, inds).float()
b_xs = b_xs.contiguous().view(batch, -1, 1)
b_xs = _gather_feat(b_xs, inds).float()
b_ys = b_ys.contiguous().view(batch, -1, 1)
b_ys = _gather_feat(b_ys, inds).float()
r_xs = r_xs.contiguous().view(batch, -1, 1)
r_xs = _gather_feat(r_xs, inds).float()
r_ys = r_ys.contiguous().view(batch, -1, 1)
r_ys = _gather_feat(r_ys, inds).float()
detections = torch.cat([bboxes, scores, t_xs, t_ys, l_xs, l_ys,
b_xs, b_ys, r_xs, r_ys, clses], dim=2)
return detections
def exct_decode(
t_heat, l_heat, b_heat, r_heat, ct_heat,
t_regr=None, l_regr=None, b_regr=None, r_regr=None,
K=40, scores_thresh=0.1, center_thresh=0.1, aggr_weight=0.0, num_dets=1000
):
batch, cat, height, width = t_heat.size()
'''
t_heat = torch.sigmoid(t_heat)
l_heat = torch.sigmoid(l_heat)
b_heat = torch.sigmoid(b_heat)
r_heat = torch.sigmoid(r_heat)
ct_heat = torch.sigmoid(ct_heat)
'''
if aggr_weight > 0:
t_heat = _h_aggregate(t_heat, aggr_weight=aggr_weight)
l_heat = _v_aggregate(l_heat, aggr_weight=aggr_weight)
b_heat = _h_aggregate(b_heat, aggr_weight=aggr_weight)
r_heat = _v_aggregate(r_heat, aggr_weight=aggr_weight)
# perform nms on heatmaps
t_heat = _nms(t_heat)
l_heat = _nms(l_heat)
b_heat = _nms(b_heat)
r_heat = _nms(r_heat)
t_heat[t_heat > 1] = 1
l_heat[l_heat > 1] = 1
b_heat[b_heat > 1] = 1
r_heat[r_heat > 1] = 1
t_scores, t_inds, t_clses, t_ys, t_xs = _topk(t_heat, K=K)
l_scores, l_inds, l_clses, l_ys, l_xs = _topk(l_heat, K=K)
b_scores, b_inds, b_clses, b_ys, b_xs = _topk(b_heat, K=K)
r_scores, r_inds, r_clses, r_ys, r_xs = _topk(r_heat, K=K)
t_ys = t_ys.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
t_xs = t_xs.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
l_ys = l_ys.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
l_xs = l_xs.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
b_ys = b_ys.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
b_xs = b_xs.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
r_ys = r_ys.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
r_xs = r_xs.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
t_clses = t_clses.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
l_clses = l_clses.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
b_clses = b_clses.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
r_clses = r_clses.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
box_ct_xs = ((l_xs + r_xs + 0.5) / 2).long()
box_ct_ys = ((t_ys + b_ys + 0.5) / 2).long()
ct_inds = t_clses.long() * (height * width) + box_ct_ys * width + box_ct_xs
ct_inds = ct_inds.view(batch, -1)
ct_heat = ct_heat.view(batch, -1, 1)
ct_scores = _gather_feat(ct_heat, ct_inds)
t_scores = t_scores.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
l_scores = l_scores.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
b_scores = b_scores.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
r_scores = r_scores.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
ct_scores = ct_scores.view(batch, K, K, K, K)
scores = (t_scores + l_scores + b_scores + r_scores + 2 * ct_scores) / 6
# reject boxes based on classes
cls_inds = (t_clses != l_clses) + (t_clses != b_clses) + \
(t_clses != r_clses)
cls_inds = (cls_inds > 0)
top_inds = (t_ys > l_ys) + (t_ys > b_ys) + (t_ys > r_ys)
top_inds = (top_inds > 0)
left_inds = (l_xs > t_xs) + (l_xs > b_xs) + (l_xs > r_xs)
left_inds = (left_inds > 0)
bottom_inds = (b_ys < t_ys) + (b_ys < l_ys) + (b_ys < r_ys)
bottom_inds = (bottom_inds > 0)
right_inds = (r_xs < t_xs) + (r_xs < l_xs) + (r_xs < b_xs)
right_inds = (right_inds > 0)
sc_inds = (t_scores < scores_thresh) + (l_scores < scores_thresh) + \
(b_scores < scores_thresh) + (r_scores < scores_thresh) + \
(ct_scores < center_thresh)
sc_inds = (sc_inds > 0)
scores = scores - sc_inds.float()
scores = scores - cls_inds.float()
scores = scores - top_inds.float()
scores = scores - left_inds.float()
scores = scores - bottom_inds.float()
scores = scores - right_inds.float()
scores = scores.view(batch, -1)
scores, inds = torch.topk(scores, num_dets)
scores = scores.unsqueeze(2)
if t_regr is not None and l_regr is not None \
and b_regr is not None and r_regr is not None:
t_regr = _tranpose_and_gather_feat(t_regr, t_inds)
t_regr = t_regr.view(batch, K, 1, 1, 1, 2)
l_regr = _tranpose_and_gather_feat(l_regr, l_inds)
l_regr = l_regr.view(batch, 1, K, 1, 1, 2)
b_regr = _tranpose_and_gather_feat(b_regr, b_inds)
b_regr = b_regr.view(batch, 1, 1, K, 1, 2)
r_regr = _tranpose_and_gather_feat(r_regr, r_inds)
r_regr = r_regr.view(batch, 1, 1, 1, K, 2)
t_xs = t_xs + t_regr[..., 0]
t_ys = t_ys + t_regr[..., 1]
l_xs = l_xs + l_regr[..., 0]
l_ys = l_ys + l_regr[..., 1]
b_xs = b_xs + b_regr[..., 0]
b_ys = b_ys + b_regr[..., 1]
r_xs = r_xs + r_regr[..., 0]
r_ys = r_ys + r_regr[..., 1]
else:
t_xs = t_xs + 0.5
t_ys = t_ys + 0.5
l_xs = l_xs + 0.5
l_ys = l_ys + 0.5
b_xs = b_xs + 0.5
b_ys = b_ys + 0.5
r_xs = r_xs + 0.5
r_ys = r_ys + 0.5
bboxes = torch.stack((l_xs, t_ys, r_xs, b_ys), dim=5)
bboxes = bboxes.view(batch, -1, 4)
bboxes = _gather_feat(bboxes, inds)
clses = t_clses.contiguous().view(batch, -1, 1)
clses = _gather_feat(clses, inds).float()
t_xs = t_xs.contiguous().view(batch, -1, 1)
t_xs = _gather_feat(t_xs, inds).float()
t_ys = t_ys.contiguous().view(batch, -1, 1)
t_ys = _gather_feat(t_ys, inds).float()
l_xs = l_xs.contiguous().view(batch, -1, 1)
l_xs = _gather_feat(l_xs, inds).float()
l_ys = l_ys.contiguous().view(batch, -1, 1)
l_ys = _gather_feat(l_ys, inds).float()
b_xs = b_xs.contiguous().view(batch, -1, 1)
b_xs = _gather_feat(b_xs, inds).float()
b_ys = b_ys.contiguous().view(batch, -1, 1)
b_ys = _gather_feat(b_ys, inds).float()
r_xs = r_xs.contiguous().view(batch, -1, 1)
r_xs = _gather_feat(r_xs, inds).float()
r_ys = r_ys.contiguous().view(batch, -1, 1)
r_ys = _gather_feat(r_ys, inds).float()
detections = torch.cat([bboxes, scores, t_xs, t_ys, l_xs, l_ys,
b_xs, b_ys, r_xs, r_ys, clses], dim=2)
return detections
def ddd_decode(heat, rot, depth, dim, wh=None, reg=None, K=40):
batch, cat, height, width = heat.size()
# heat = torch.sigmoid(heat)
# perform nms on heatmaps
heat = _nms(heat)
scores, inds, clses, ys, xs = _topk(heat, K=K)
if reg is not None:
reg = _tranpose_and_gather_feat(reg, inds)
reg = reg.view(batch, K, 2)
xs = xs.view(batch, K, 1) + reg[:, :, 0:1]
ys = ys.view(batch, K, 1) + reg[:, :, 1:2]
else:
xs = xs.view(batch, K, 1) + 0.5
ys = ys.view(batch, K, 1) + 0.5
rot = _tranpose_and_gather_feat(rot, inds)
rot = rot.view(batch, K, 8)
depth = _tranpose_and_gather_feat(depth, inds)
depth = depth.view(batch, K, 1)
dim = _tranpose_and_gather_feat(dim, inds)
dim = dim.view(batch, K, 3)
clses = clses.view(batch, K, 1).float()
scores = scores.view(batch, K, 1)
xs = xs.view(batch, K, 1)
ys = ys.view(batch, K, 1)
if wh is not None:
wh = _tranpose_and_gather_feat(wh, inds)
wh = wh.view(batch, K, 2)
detections = torch.cat(
[xs, ys, scores, rot, depth, dim, wh, clses], dim=2)
else:
detections = torch.cat(
[xs, ys, scores, rot, depth, dim, clses], dim=2)
return detections
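# Usage sketch for ddd_decode (illustrative KITTI-style shapes: 3 classes,
# 8-channel rotation encoding, scalar depth, 3-channel object size; heat is
# assumed to be sigmoid-activated already):
#   >>> import torch
#   >>> heat = torch.rand(1, 3, 96, 320)
#   >>> rot = torch.rand(1, 8, 96, 320)
#   >>> depth = torch.rand(1, 1, 96, 320)
#   >>> dim = torch.rand(1, 3, 96, 320)
#   >>> dets = ddd_decode(heat, rot, depth, dim, K=40)
#   >>> dets.shape  # (1, 40, 16): x, y, score, rot(8), depth, dim(3), class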
def ctdet_decode(heat, wh, reg=None, cat_spec_wh=False, K=100):
batch, cat, height, width = heat.size()
# heat = torch.sigmoid(heat)
# perform nms on heatmaps
heat = _nms(heat)  # 3x3 max-pool filtering: keep only local maxima
scores, inds, clses, ys, xs = _topk(heat, K=K)
if reg is not None:
reg = _tranpose_and_gather_feat(reg, inds)
reg = reg.view(batch, K, 2)
xs = xs.view(batch, K, 1) + reg[:, :, 0:1]
ys = ys.view(batch, K, 1) + reg[:, :, 1:2]
else:
xs = xs.view(batch, K, 1) + 0.5
ys = ys.view(batch, K, 1) + 0.5
wh = _tranpose_and_gather_feat(wh, inds)
if cat_spec_wh:
wh = wh.view(batch, K, cat, 2)
clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 2).long()
wh = wh.gather(2, clses_ind).view(batch, K, 2)
else:
wh = wh.view(batch, K, 2)
clses = clses.view(batch, K, 1).float()
scores = scores.view(batch, K, 1)
bboxes = torch.cat([xs - wh[..., 0:1] / 2,
ys - wh[..., 1:2] / 2,
xs + wh[..., 0:1] / 2,
ys + wh[..., 1:2] / 2], dim=2)
detections = torch.cat([bboxes, scores, clses], dim=2)
return detections
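# Usage sketch for ctdet_decode (illustrative 80-class shapes; heat is assumed
# to be sigmoid-activated already):
#   >>> import torch
#   >>> heat = torch.rand(1, 80, 128, 128)
#   >>> wh = torch.rand(1, 2, 128, 128)
#   >>> reg = torch.rand(1, 2, 128, 128)
#   >>> dets = ctdet_decode(heat, wh, reg=reg, K=100)
#   >>> dets.shape  # (1, 100, 6): x1, y1, x2, y2, score, class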
def multi_pose_decode(
heat, wh, kps, reg=None, hm_hp=None, hp_offset=None, K=100):
batch, cat, height, width = heat.size()
num_joints = kps.shape[1] // 2
# heat = torch.sigmoid(heat)
# perform nms on heatmaps
heat = _nms(heat)
scores, inds, clses, ys, xs = _topk(heat, K=K)
kps = _tranpose_and_gather_feat(kps, inds)
kps = kps.view(batch, K, num_joints * 2)
kps[..., ::2] += xs.view(batch, K, 1).expand(batch, K, num_joints)  # first pass: joint coordinates from the center-relative offsets
kps[..., 1::2] += ys.view(batch, K, 1).expand(batch, K, num_joints)
if reg is not None:  # regressed sub-pixel offset of the center point
reg = _tranpose_and_gather_feat(reg, inds)
reg = reg.view(batch, K, 2)
xs = xs.view(batch, K, 1) + reg[:, :, 0:1]
ys = ys.view(batch, K, 1) + reg[:, :, 1:2]
else:
xs = xs.view(batch, K, 1) + 0.5
ys = ys.view(batch, K, 1) + 0.5
wh = _tranpose_and_gather_feat(wh, inds)  # width and height of the box
wh = wh.view(batch, K, 2)
clses = clses.view(batch, K, 1).float()
scores = scores.view(batch, K, 1)
bboxes = torch.cat([xs - wh[..., 0:1] / 2,
ys - wh[..., 1:2] / 2,
xs + wh[..., 0:1] / 2,
ys + wh[..., 1:2] / 2], dim=2)
if hm_hp is not None:
hm_hp = _nms(hm_hp)  # second pass: joint locations taken from the joint heatmaps
thresh = 0.1
kps = kps.view(batch, K, num_joints, 2).permute(
0, 2, 1, 3).contiguous() # b x J x K x 2
reg_kps = kps.unsqueeze(3).expand(batch, num_joints, K, K, 2)
hm_score, hm_inds, hm_ys, hm_xs = _topk_channel(hm_hp, K=K) # b x J x K
if hp_offset is not None:  # sub-pixel offsets of the joint centers
hp_offset = _tranpose_and_gather_feat(
hp_offset, hm_inds.view(batch, -1))
hp_offset = hp_offset.view(batch, num_joints, K, 2)
hm_xs = hm_xs + hp_offset[:, :, :, 0]
hm_ys = hm_ys + hp_offset[:, :, :, 1]
else:
hm_xs = hm_xs + 0.5
hm_ys = hm_ys + 0.5
mask = (hm_score > thresh).float()  # keep joints with confidence above 0.1
hm_score = (1 - mask) * -1 + mask * hm_score
hm_ys = (1 - mask) * (-10000) + mask * hm_ys
hm_xs = (1 - mask) * (-10000) + mask * hm_xs
hm_kps = torch.stack([hm_xs, hm_ys], dim=-1).unsqueeze(
2).expand(batch, num_joints, K, K, 2)
dist = (((reg_kps - hm_kps) ** 2).sum(dim=4) ** 0.5)  # distance between the joints from the two passes
min_dist, min_ind = dist.min(dim=3) # b x J x K
hm_score = hm_score.gather(2, min_ind).unsqueeze(-1) # b x J x K x 1
min_dist = min_dist.unsqueeze(-1)
min_ind = min_ind.view(batch, num_joints, K, 1, 1).expand(
batch, num_joints, K, 1, 2)
hm_kps = hm_kps.gather(3, min_ind)
hm_kps = hm_kps.view(batch, num_joints, K, 2)
# prefer the heatmap joints: use them when they fall inside the bbox, and fall back to the regressed joints otherwise
l = bboxes[:, :, 0].view(batch, 1, K, 1).expand(batch, num_joints, K, 1)
t = bboxes[:, :, 1].view(batch, 1, K, 1).expand(batch, num_joints, K, 1)
r = bboxes[:, :, 2].view(batch, 1, K, 1).expand(batch, num_joints, K, 1)
b = bboxes[:, :, 3].view(batch, 1, K, 1).expand(batch, num_joints, K, 1)
mask = (hm_kps[..., 0:1] < l) + (hm_kps[..., 0:1] > r) + \
(hm_kps[..., 1:2] < t) + (hm_kps[..., 1:2] > b) + \
(hm_score < thresh) + (min_dist > (torch.max(b - t, r - l) * 0.3))
mask = (mask > 0).float().expand(batch, num_joints, K, 2)
kps = (1 - mask) * hm_kps + mask * kps
kps = kps.permute(0, 2, 1, 3).contiguous().view(
batch, K, num_joints * 2)
detections = torch.cat([bboxes, scores, kps, clses], dim=2)  # bbox:4 + score:1 + kpoints:2*num_joints (10 for 5 landmarks) + class:1
return detections
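# Usage sketch for multi_pose_decode (illustrative shapes for 5 face landmarks,
# i.e. kps has 2 * num_joints channels; heat is assumed to be sigmoid-activated
# already):
#   >>> import torch
#   >>> heat = torch.rand(1, 1, 128, 128)
#   >>> wh = torch.rand(1, 2, 128, 128)
#   >>> kps = torch.rand(1, 10, 128, 128)
#   >>> dets = multi_pose_decode(heat, wh, kps, K=100)
#   >>> dets.shape  # (1, 100, 16): bbox(4) + score(1) + 5 landmarks x 2 + class(1)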
def threshold_choose(scores, threshold):
# select every peak above a fixed score threshold instead of a fixed top-K
mask = scores.gt(threshold)
topk_scores = scores[mask]
topk_inds = torch.arange(scores.numel(), device=scores.device)[mask.flatten()]
batch, cat, height, width = scores.size()
# topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)  # top-100 points
topk_inds = topk_inds % (height * width)
topk_ys = (topk_inds // width).int().float()
topk_xs = (topk_inds % width).int().float()
K = topk_inds.numel()
topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K)
topk_clses = (topk_ind // K).int()
topk_inds = _gather_feat(
topk_inds.view(batch, -1, 1), topk_ind).view(batch, K)
topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K)
topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K)
return topk_score, topk_inds, topk_clses, topk_ys, topk_xs, K
def centerface_decode(
heat, wh, kps, reg=None, hm_hp=None, hp_offset=None, K=100):
batch, cat, height, width = heat.size()
num_joints = kps.shape[1] // 2
# heat = torch.sigmoid(heat)
# perform nms on heatmaps
heat = _nms(heat)
scores, inds, clses, ys_int, xs_int = _topk(heat, K=K)
# scores, inds, clses, ys_int, xs_int, K = threshold_choose(heat, threshold=0.05)
if reg is not None:  # regressed sub-pixel offset of the center point
reg = _tranpose_and_gather_feat(reg, inds)
reg = reg.view(batch, K, 2)
xs = xs_int.view(batch, K, 1) + reg[:, :, 0:1]  # 1. center point; scaled by the output stride (4) downstream
ys = ys_int.view(batch, K, 1) + reg[:, :, 1:2]
# xs = (xs_int.view(batch, K, 1) + reg[:, :, 0:1] + 0.5)
# ys = (ys_int.view(batch, K, 1) + reg[:, :, 1:2] + 0.5)  # 1. center point, computed the CenterFace way
else:
xs = xs_int.view(batch, K, 1) + 0.5
ys = ys_int.view(batch, K, 1) + 0.5
wh = _tranpose_and_gather_feat(wh, inds)  # width and height of the face bbox
wh = wh.view(batch, K, 2)  # 2. wh, option 1: use the regressed values directly
wh = wh.exp() * 4.  # 2. wh, option 2 (active): exponentiate, then scale by the output stride
clses = clses.view(batch, K, 1).float()
scores = scores.view(batch, K, 1)
bboxes = torch.cat([xs - wh[..., 0:1] / 2,
ys - wh[..., 1:2] / 2,
xs + wh[..., 0:1] / 2,
ys + wh[..., 1:2] / 2], dim=2)
kps = _tranpose_and_gather_feat(kps, inds)  # 3. face landmarks
kps = kps.view(batch, K, num_joints * 2)
kps[..., ::2] += xs.view(batch, K, 1).expand(batch, K, num_joints)  # first pass: landmark coordinates from the center-relative offsets
kps[..., 1::2] += ys.view(batch, K, 1).expand(batch, K, num_joints)
if hm_hp is not None:
hm_hp = _nms(hm_hp)  # second pass: landmark locations taken from the landmark heatmaps
thresh = 0.1
kps = kps.view(batch, K, num_joints, 2).permute(
0, 2, 1, 3).contiguous() # b x J x K x 2
reg_kps = kps.unsqueeze(3).expand(batch, num_joints, K, K, 2)
hm_score, hm_inds, hm_ys, hm_xs = _topk_channel(hm_hp, K=K) # b x J x K
if hp_offset is not None:  # sub-pixel offsets of the landmark centers
hp_offset = _tranpose_and_gather_feat(
hp_offset, hm_inds.view(batch, -1))
hp_offset = hp_offset.view(batch, num_joints, K, 2)
hm_xs = hm_xs + hp_offset[:, :, :, 0]
hm_ys = hm_ys + hp_offset[:, :, :, 1]
else:
hm_xs = hm_xs + 0.5
hm_ys = hm_ys + 0.5
mask = (hm_score > thresh).float()  # keep landmarks with confidence above 0.1
hm_score = (1 - mask) * -1 + mask * hm_score
hm_ys = (1 - mask) * (-10000) + mask * hm_ys
hm_xs = (1 - mask) * (-10000) + mask * hm_xs
hm_kps = torch.stack([hm_xs, hm_ys], dim=-1).unsqueeze(
2).expand(batch, num_joints, K, K, 2)
dist = (((reg_kps - hm_kps) ** 2).sum(dim=4) ** 0.5)  # distance between the landmarks from the two passes
min_dist, min_ind = dist.min(dim=3) # b x J x K
hm_score = hm_score.gather(2, min_ind).unsqueeze(-1) # b x J x K x 1
min_dist = min_dist.unsqueeze(-1)
min_ind = min_ind.view(batch, num_joints, K, 1, 1).expand(
batch, num_joints, K, 1, 2)
hm_kps = hm_kps.gather(3, min_ind)
hm_kps = hm_kps.view(batch, num_joints, K, 2)
# prefer the heatmap landmarks: use them when they fall inside the bbox, and fall back to the regressed landmarks otherwise
l = bboxes[:, :, 0].view(batch, 1, K, 1).expand(batch, num_joints, K, 1)
t = bboxes[:, :, 1].view(batch, 1, K, 1).expand(batch, num_joints, K, 1)
r = bboxes[:, :, 2].view(batch, 1, K, 1).expand(batch, num_joints, K, 1)
b = bboxes[:, :, 3].view(batch, 1, K, 1).expand(batch, num_joints, K, 1)
mask = (hm_kps[..., 0:1] < l) + (hm_kps[..., 0:1] > r) + \
(hm_kps[..., 1:2] < t) + (hm_kps[..., 1:2] > b) + \
(hm_score < thresh) + (min_dist > (torch.max(b - t, r - l) * 0.3))
mask = (mask > 0).float().expand(batch, num_joints, K, 2)
kps = (1 - mask) * hm_kps + mask * kps
kps = kps.permute(0, 2, 1, 3).contiguous().view(
batch, K, num_joints * 2)
detections = torch.cat([bboxes, scores, kps, clses], dim=2) # box:4+score:1+kpoints:10+class:1=16
return detections
# ------------------------------------------------------------------------------
# Portions of this code are from
# CornerNet (https://github.com/princeton-vl/CornerNet)
# Copyright (c) 2018, University of Michigan
# Licensed under the BSD 3-Clause License
# ------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import torch.nn as nn
from .utils import _tranpose_and_gather_feat
import torch.nn.functional as F
def _slow_neg_loss(pred, gt):
'''focal loss from CornerNet'''
pos_inds = gt.eq(1)
neg_inds = gt.lt(1)
neg_weights = torch.pow(1 - gt[neg_inds], 4)
loss = 0
pos_pred = pred[pos_inds]
neg_pred = pred[neg_inds]
pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, 2)
neg_loss = torch.log(1 - neg_pred) * torch.pow(neg_pred, 2) * neg_weights
num_pos = pos_inds.float().sum()
pos_loss = pos_loss.sum()
neg_loss = neg_loss.sum()
if pos_pred.nelement() == 0:
loss = loss - neg_loss
else:
loss = loss - (pos_loss + neg_loss) / num_pos
return loss
def _neg_loss(pred, gt):
''' Modified focal loss. Exactly the same as CornerNet.
Runs faster but costs a little more memory.
Arguments:
pred (batch x c x h x w)
gt (batch x c x h x w)
'''
pos_inds = gt.eq(1).float()
neg_inds = gt.lt(1).float()
neg_weights = torch.pow(1 - gt, 4)
loss = 0
pos_loss = torch.log(pred) * torch.pow(1 - pred, 2) * pos_inds
neg_loss = torch.log(1 - pred) * torch.pow(pred, 2) * neg_weights * neg_inds
num_pos = pos_inds.float().sum()
pos_loss = pos_loss.sum()
neg_loss = neg_loss.sum()
if num_pos == 0:
loss = loss - neg_loss
else:
loss = loss - (pos_loss + neg_loss) / num_pos
return loss
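# In formula form (a restatement of _neg_loss above, with p the predicted
# score, y the Gaussian-splatted ground truth, and N the number of y == 1
# locations):
#   L = -1/N * sum_{x,y,c} [ (1 - p)^2 * log(p)            if y == 1
#                            (1 - y)^4 * p^2 * log(1 - p)  otherwise ]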
def _not_faster_neg_loss(pred, gt):
pos_inds = gt.eq(1).float()
neg_inds = gt.lt(1).float()
num_pos = pos_inds.float().sum()
neg_weights = torch.pow(1 - gt, 4)
loss = 0
trans_pred = pred * neg_inds + (1 - pred) * pos_inds
weight = neg_weights * neg_inds + pos_inds
all_loss = torch.log(1 - trans_pred) * torch.pow(trans_pred, 2) * weight
all_loss = all_loss.sum()
if num_pos > 0:
all_loss /= num_pos
loss -= all_loss
return loss
def _slow_reg_loss(regr, gt_regr, mask):
num = mask.float().sum()
mask = mask.unsqueeze(2).expand_as(gt_regr)
regr = regr[mask]
gt_regr = gt_regr[mask]
# regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, size_average=False)
regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, reduction='sum')
regr_loss = regr_loss / (num + 1e-4)
return regr_loss
def _reg_loss(regr, gt_regr, mask, wight_=None):
''' L1 regression loss
Arguments:
regr (batch x max_objects x dim)
gt_regr (batch x max_objects x dim)
mask (batch x max_objects)
'''
num = mask.float().sum()
mask = mask.unsqueeze(2).expand_as(gt_regr).float()
regr = regr * mask
gt_regr = gt_regr * mask
if wight_ is not None:
wight_ = wight_.unsqueeze(2).expand_as(gt_regr).float()
# regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, reduce=False)
regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, reduction='none')
regr_loss *= wight_
regr_loss = regr_loss.sum()
else:
regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, reduction='sum')
# regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, size_average=False)
regr_loss = regr_loss / (num + 1e-4)
return regr_loss
class FocalLoss(nn.Module):
'''nn.Module wrapper for focal loss'''
def __init__(self):
super(FocalLoss, self).__init__()
self.neg_loss = _neg_loss
def forward(self, out, target):
return self.neg_loss(out, target)
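# Usage sketch (illustrative shapes; pred is assumed to be sigmoid-activated
# and clamped away from exact 0/1, gt is a Gaussian heatmap with 1s at centers):
#   >>> import torch
#   >>> crit = FocalLoss()
#   >>> pred = torch.rand(1, 3, 64, 64).clamp(1e-4, 1 - 1e-4)
#   >>> gt = torch.zeros(1, 3, 64, 64)
#   >>> gt[0, 0, 10, 10] = 1.
#   >>> loss = crit(pred, gt)  # scalar tensor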
class RegLoss(nn.Module):
'''Regression loss for an output tensor
Arguments:
output (batch x dim x h x w)
mask (batch x max_objects)
ind (batch x max_objects)
target (batch x max_objects x dim)
'''
def __init__(self):
super(RegLoss, self).__init__()
def forward(self, output, mask, ind, target, wight_=None):
pred = _tranpose_and_gather_feat(output, ind)
loss = _reg_loss(pred, target, mask, wight_)
return loss
class RegL1Loss(nn.Module):
def __init__(self):
super(RegL1Loss, self).__init__()
def forward(self, output, mask, ind, target):
pred = _tranpose_and_gather_feat(output, ind)
mask = mask.unsqueeze(2).expand_as(pred).float()
# loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean')
loss = F.l1_loss(pred * mask, target * mask, reduction='sum')
# loss = F.l1_loss(pred * mask, target * mask, size_average=False)
loss = loss / (mask.sum() + 1e-4)
return loss
class NormRegL1Loss(nn.Module):
def __init__(self):
super(NormRegL1Loss, self).__init__()
def forward(self, output, mask, ind, target):
pred = _tranpose_and_gather_feat(output, ind)
mask = mask.unsqueeze(2).expand_as(pred).float()
# loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean')
pred = pred / (target + 1e-4)
target = target * 0 + 1
loss = F.l1_loss(pred * mask, target * mask, reduction='sum')
# loss = F.l1_loss(pred * mask, target * mask, size_average=False)
loss = loss / (mask.sum() + 1e-4)
return loss
class RegWeightedL1Loss(nn.Module):
def __init__(self):
super(RegWeightedL1Loss, self).__init__()
def forward(self, output, mask, ind, target):
pred = _tranpose_and_gather_feat(output, ind)
mask = mask.float()
# loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean')
loss = F.l1_loss(pred * mask, target * mask, reduction='sum')
# loss = F.l1_loss(pred * mask, target * mask, size_average=False)
loss = loss / (mask.sum() + 1e-4)
return loss
class L1Loss(nn.Module):
def __init__(self):
super(L1Loss, self).__init__()
def forward(self, output, mask, ind, target):
pred = _tranpose_and_gather_feat(output, ind)
mask = mask.unsqueeze(2).expand_as(pred).float()
loss = F.l1_loss(pred * mask, target * mask, reduction='mean')
return loss
class BinRotLoss(nn.Module):
def __init__(self):
super(BinRotLoss, self).__init__()
def forward(self, output, mask, ind, rotbin, rotres):
pred = _tranpose_and_gather_feat(output, ind)
loss = compute_rot_loss(pred, rotbin, rotres, mask)
return loss
def compute_res_loss(output, target):
return F.smooth_l1_loss(output, target, reduction='mean')
# TODO: weight
def compute_bin_loss(output, target, mask):
mask = mask.expand_as(output)
output = output * mask.float()
return F.cross_entropy(output, target, reduction='mean')
def compute_rot_loss(output, target_bin, target_res, mask):
# output: (B, 128, 8) [bin1_cls[0], bin1_cls[1], bin1_sin, bin1_cos,
# bin2_cls[0], bin2_cls[1], bin2_sin, bin2_cos]
# target_bin: (B, 128, 2) [bin1_cls, bin2_cls]
# target_res: (B, 128, 2) [bin1_res, bin2_res]
# mask: (B, 128, 1)
# import pdb; pdb.set_trace()
output = output.view(-1, 8)
target_bin = target_bin.view(-1, 2)
target_res = target_res.view(-1, 2)
mask = mask.view(-1, 1)
loss_bin1 = compute_bin_loss(output[:, 0:2], target_bin[:, 0], mask)
loss_bin2 = compute_bin_loss(output[:, 4:6], target_bin[:, 1], mask)
loss_res = torch.zeros_like(loss_bin1)
if target_bin[:, 0].nonzero().shape[0] > 0:
idx1 = target_bin[:, 0].nonzero()[:, 0]
valid_output1 = torch.index_select(output, 0, idx1.long())
valid_target_res1 = torch.index_select(target_res, 0, idx1.long())
loss_sin1 = compute_res_loss(
valid_output1[:, 2], torch.sin(valid_target_res1[:, 0]))
loss_cos1 = compute_res_loss(
valid_output1[:, 3], torch.cos(valid_target_res1[:, 0]))
loss_res += loss_sin1 + loss_cos1
if target_bin[:, 1].nonzero().shape[0] > 0:
idx2 = target_bin[:, 1].nonzero()[:, 0]
valid_output2 = torch.index_select(output, 0, idx2.long())
valid_target_res2 = torch.index_select(target_res, 0, idx2.long())
loss_sin2 = compute_res_loss(
valid_output2[:, 6], torch.sin(valid_target_res2[:, 1]))
loss_cos2 = compute_res_loss(
valid_output2[:, 7], torch.cos(valid_target_res2[:, 1]))
loss_res += loss_sin2 + loss_cos2
return loss_bin1 + loss_bin2 + loss_res
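# Usage sketch for BinRotLoss / compute_rot_loss (illustrative shapes only;
# ind, mask, rotbin and rotres would normally come from the dataloader):
#   >>> import torch
#   >>> crit = BinRotLoss()
#   >>> output = torch.randn(2, 8, 96, 320)                     # 'rot' head map
#   >>> ind = torch.zeros(2, 128, dtype=torch.long)             # object indices
#   >>> mask = torch.zeros(2, 128, 1); mask[:, 0] = 1
#   >>> rotbin = torch.zeros(2, 128, 2, dtype=torch.long); rotbin[:, :, 0] = 1
#   >>> rotres = torch.zeros(2, 128, 2)
#   >>> loss = crit(output, mask, ind, rotbin, rotres)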
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torchvision.models as models
import torch
import torch.nn as nn
import os
from .networks.msra_resnet import get_pose_net
# from .networks.dlav0 import get_pose_net as get_dlav0
# from .networks.pose_dla_dcn import get_pose_net as get_dla_dcn
# from .networks.resnet_dcn import get_pose_net as get_pose_net_dcn
from .networks.large_hourglass import get_large_hourglass_net
# from .Backbone.mobilenetv2 import get_mobile_pose_netv2
# from .Backbone.mobilenet_v2 import get_mobile_net
# from .Backbone.centerface_mobilenet_v2 import get_mobile_net
from .Backbone.centerface_mobilenet_v2_fpn import get_mobile_net
_model_factory = {
'res': get_pose_net, # default Resnet with deconv
# 'dlav0': get_dlav0, # default DLAup
# 'dla': get_dla_dcn,
# 'resdcn': get_pose_net_dcn,
'hourglass': get_large_hourglass_net,
'mobilev2': get_mobile_net,
}
def create_model(arch, heads, head_conv):
num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0
arch = arch[:arch.find('_')] if '_' in arch else arch
get_model = _model_factory[arch]
model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv)
return model
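# Usage sketch (the head names and channel counts here are assumptions for
# illustration only; 'res_18' selects get_pose_net with num_layers=18):
#   >>> heads = {'hm': 1, 'wh': 2, 'hm_offset': 2}
#   >>> model = create_model('res_18', heads=heads, head_conv=64)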
def load_model(model, model_path, optimizer=None, resume=False,
lr=None, lr_step=None):
start_epoch = 0
checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch']))
state_dict_ = checkpoint['state_dict']
state_dict = {}
# convert data_parallal to model
for k in state_dict_:
if k.startswith('module') and not k.startswith('module_list'):
state_dict[k[7:]] = state_dict_[k]
else:
state_dict[k] = state_dict_[k]
model_state_dict = model.state_dict()
# check loaded parameters and created model parameters
for k in state_dict:
if k in model_state_dict:
if state_dict[k].shape != model_state_dict[k].shape:
print('Skip loading parameter {}, required shape {}, '
'loaded shape {}.'.format(
k, model_state_dict[k].shape, state_dict[k].shape))
state_dict[k] = model_state_dict[k]
else:
print('Drop parameter {}.'.format(k))
for k in model_state_dict:
if not (k in state_dict):
print('No param {}.'.format(k))
state_dict[k] = model_state_dict[k]
model.load_state_dict(state_dict, strict=False)
# resume optimizer parameters
if optimizer is not None and resume:
if 'optimizer' in checkpoint:
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch']
start_lr = lr
for step in lr_step:
if start_epoch >= step:
start_lr *= 0.1
for param_group in optimizer.param_groups:
param_group['lr'] = start_lr
print('Resumed optimizer with start lr', start_lr)
else:
print('No optimizer parameters in checkpoint.')
if optimizer is not None:
return model, optimizer, start_epoch
else:
return model
def save_model(path, epoch, model, optimizer=None):
if isinstance(model, torch.nn.DataParallel):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
data = {'epoch': epoch,
'state_dict': state_dict}
if not (optimizer is None):
data['optimizer'] = optimizer.state_dict()
torch.save(data, path)
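# Round-trip sketch (the file name, lr and lr_step values are illustrative):
#   >>> save_model('model_last.pth', epoch, model, optimizer)
#   >>> model = load_model(model, 'model_last.pth')
#   >>> # or, resuming the optimizer state as well:
#   >>> model, optimizer, start_epoch = load_model(
#   ...     model, 'model_last.pth', optimizer, resume=True,
#   ...     lr=1.25e-4, lr_step=[90, 120])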
.vscode
.idea
*.so
*.o
*pyc
_ext
BSD 3-Clause License
Copyright (c) 2019, Charles Shang
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
## Deformable Convolutional Networks V2 with Pytorch
### Build
```bash
./make.sh # build
python test.py # run examples and gradient check
```
### An Example
- deformable conv
```python
import torch
from dcn_v2 import DCN
input = torch.randn(2, 64, 128, 128).cuda()
# wrap all things (offset and mask) in DCN
dcn = DCN(64, 64, kernel_size=(3,3), stride=1, padding=1, deformable_groups=2).cuda()
output = dcn(input)
print(output.shape)
```
- deformable roi pooling
```python
import torch
from dcn_v2 import DCNPooling
input = torch.randn(2, 32, 64, 64).cuda()
batch_inds = torch.randint(2, (20, 1)).cuda().float()
x = torch.randint(256, (20, 1)).cuda().float()
y = torch.randint(256, (20, 1)).cuda().float()
w = torch.randint(64, (20, 1)).cuda().float()
h = torch.randint(64, (20, 1)).cuda().float()
rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
# modulated deformable pooling (V2)
# wrap all things (offset and mask) in DCNPooling
dpooling = DCNPooling(spatial_scale=1.0 / 4,
pooled_size=7,
output_dim=32,
no_trans=False,
group_size=1,
trans_std=0.1).cuda()
dout = dpooling(input, rois)
```
### Known Issues:
- [x] Gradient check w.r.t offset (solved)
- [ ] Backward is not reentrant (minor)
This is an adaptation of the official [Deformable-ConvNets](https://github.com/msracver/Deformable-ConvNets/tree/master/DCNv2_op).
<s>I have run the gradient check many times with DOUBLE type. Every tensor **except offset** passes.
However, when I set the offset to 0.5, it passes. I'm still wondering what causes this problem. Is it
because of some non-differentiable points?</s>
Update: all gradient checks pass with double precision.
Another issue is that it raises `RuntimeError: Backward is not reentrant`. However, the error is very small
(`<1e-7` for float, `<1e-15` for double), so it may not be a serious problem (?)
Please post an issue or PR if you have any comments.