First Commit.

b952e97b · chenych · b952e97b · b952e97b · b952e97b · b952e97b
Commit b952e97b authored Nov 03, 2023 by chenych
20 changed files
--- a/src/lib/models/networks/DCNv2/src/dcn_v2_double.h
+++ b/src/lib/models/networks/DCNv2/src/dcn_v2_double.h
+// Forward entry point of DCNv2 operating on THDoubleTensor arguments.
+// Declaration only; takes padding, stride, dilation and the deformable group
+// count, but (unlike dcn_v2_backward) no kernel_h/kernel_w parameters.
+void dcn_v2_forward(THDoubleTensor *input, THDoubleTensor *weight,
+                    THDoubleTensor *bias, THDoubleTensor *ones,
+                    THDoubleTensor *offset, THDoubleTensor *mask,
+                    THDoubleTensor *output, THDoubleTensor *columns,
+                    const int pad_h, const int pad_w,
+                    const int stride_h, const int stride_w,
+                    const int dilation_h, const int dilation_w,
+                    const int deformable_group);
+// Backward entry point of DCNv2 operating on THDoubleTensor arguments.
+// Declaration only; besides the forward tensors it takes one grad_* tensor for
+// input, weight, bias, offset, mask and output, plus the kernel size.
+// NOTE(review): stride precedes pad here, the reverse of dcn_v2_forward.
+void dcn_v2_backward(THDoubleTensor *input, THDoubleTensor *weight,
+                     THDoubleTensor *bias, THDoubleTensor *ones,
+                     THDoubleTensor *offset, THDoubleTensor *mask,
+                     THDoubleTensor *output, THDoubleTensor *columns,
+                     THDoubleTensor *grad_input, THDoubleTensor *grad_weight,
+                     THDoubleTensor *grad_bias, THDoubleTensor *grad_offset,
+                     THDoubleTensor *grad_mask, THDoubleTensor *grad_output,
+                     int kernel_h, int kernel_w,
+                     int stride_h, int stride_w,
+                     int pad_h, int pad_w,
+                     int dilation_h, int dilation_w,
+                     int deformable_group);
\ No newline at end of file
--- a/src/lib/models/networks/DCNv2/test.py
+++ b/src/lib/models/networks/DCNv2/test.py
+#!/usr/bin/env python
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import gradcheck
+
+from dcn_v2 import DCNv2
+from dcn_v2_func import DCNv2Function
+from dcn_v2 import DCNv2Pooling
+from dcn_v2_func import DCNv2PoolingFunction
+
+# Problem sizes shared by the convolution checks in this script.
+deformable_groups = 1
+# Dimensions of the random input tensor built as torch.randn(N, inC, inH, inW).
+N, inC, inH, inW = 2, 2, 4, 4
+# Number of output channels of the layer / weight under test.
+outC = 2
+# Kernel height and width.
+kH, kW = 3, 3
+
+def conv_identify(weight, bias):
+    """Overwrite ``weight``/``bias`` in place with an identity-style kernel.
+
+    Both tensors are zeroed, then the centre tap of every kernel whose
+    output index equals its input index is set to 1.0.
+    """
+    weight.data.zero_()
+    bias.data.zero_()
+    out_ch, in_ch, k_h, k_w = weight.shape
+    center_y, center_x = k_h // 2, k_w // 2
+    # only the (k, k) channel pairs lie on the diagonal
+    for diag in range(min(in_ch, out_ch)):
+        weight.data[diag, diag, center_y, center_x] = 1.0
+
+def check_zero_offset():
+    """Print whether DCNv2 with all-zero offsets reproduces its input.
+
+    The offset and mask convolutions are zeroed and the DCNv2 weight is set
+    with ``conv_identify``; the doubled output is then compared with the input.
+    Requires CUDA.
+    """
+    conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW,
+        kernel_size=(kH, kW),
+        stride=(1, 1),
+        padding=(1, 1),
+        bias=True).cuda()
+
+    conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW,
+        kernel_size=(kH, kW),
+        stride=(1, 1),
+        padding=(1, 1),
+        bias=True).cuda()
+
+    dcn_v2 = DCNv2(inC, outC, (kH, kW),
+                   stride=1, padding=1, dilation=1,
+                   deformable_groups=deformable_groups).cuda()
+
+    # zero weights and biases make both convs output exactly 0
+    conv_offset.weight.data.zero_()
+    conv_offset.bias.data.zero_()
+    conv_mask.weight.data.zero_()
+    conv_mask.bias.data.zero_()
+    conv_identify(dcn_v2.weight, dcn_v2.bias)
+
+    input = torch.randn(N, inC, inH, inW).cuda()
+    offset = conv_offset(input)
+    mask = conv_mask(input)
+    # sigmoid(0) == 0.5, so every mask value is 0.5 here
+    mask = torch.sigmoid(mask)
+    output = dcn_v2(input, offset, mask)
+    # presumably undoes the 0.5 mask scaling applied inside DCNv2 -- TODO confirm
+    output *= 2
+    d = (input - output).abs().max()
+    if d < 1e-10:
+        print('Zero offset passed')
+    else:
+        print('Zero offset failed')
+
+def check_gradient_dconv_double():
+    """Run ``gradcheck`` on ``DCNv2Function`` with float64 CUDA tensors and print the result."""
+
+    input = torch.randn(N, inC, inH, inW, dtype=torch.float64).cuda()
+    input.requires_grad = True
+
+    offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW, dtype=torch.float64).cuda()
+    # offset.data.zero_()
+    # offset.data -= 0.00001
+    offset.requires_grad = True
+
+    mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW, dtype=torch.float64).cuda()
+    # mask.data.zero_()
+    mask.requires_grad = True
+    # after this rebinding, ``mask`` is the sigmoid output rather than the leaf tensor
+    mask = torch.sigmoid(mask)
+
+    weight = torch.randn(outC, inC, kH, kW, dtype=torch.float64).cuda()
+    weight.requires_grad = True
+
+    bias = torch.rand(outC, dtype=torch.float64).cuda()
+    bias.requires_grad = True
+
+    func = DCNv2Function(stride=1, padding=1, dilation=1, deformable_groups=deformable_groups)
+
+    print(gradcheck(func, (input, offset, mask, weight, bias), eps=1e-6, atol=1e-5, rtol=1e-3))
+
+def check_gradient_dconv():
+    """Run ``gradcheck`` on ``DCNv2Function`` with default-dtype CUDA tensors and print the result.
+
+    Uses looser ``eps``/``atol``/``rtol`` values than ``check_gradient_dconv_double``.
+    """
+
+    input = torch.randn(N, inC, inH, inW).cuda()
+    input.requires_grad = True
+
+    offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda()
+    # offset.data.zero_()
+    # offset.data -= 0.5
+    offset.requires_grad = True
+
+    mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW).cuda()
+    # mask.data.zero_()
+    mask.requires_grad = True
+    # after this rebinding, ``mask`` is the sigmoid output rather than the leaf tensor
+    mask = torch.sigmoid(mask)
+
+    weight = torch.randn(outC, inC, kH, kW).cuda()
+    weight.requires_grad = True
+
+    bias = torch.rand(outC).cuda()
+    bias.requires_grad = True
+
+    func = DCNv2Function(stride=1, padding=1, dilation=1, deformable_groups=deformable_groups)
+
+    print(gradcheck(func, (input, offset, mask, weight, bias), eps=1e-3, atol=1e-3, rtol=1e-2))
+
+def check_pooling_zero_offset():
+    """Print per-ROI mean outputs of ``DCNv2Pooling`` with and without offsets.
+
+    The input is all zeros except one constant patch per image; the same two
+    ROIs are pooled once with ``no_trans=True`` and once with ``no_trans=False``
+    and an all-zero offset tensor. Requires CUDA.
+    """
+    from dcn_v2 import DCNv2Pooling
+    input = torch.randn(2, 16, 64, 64).cuda().zero_()
+    input[0, :, 16:26, 16:26] = 1.
+    input[1, :, 10:20, 20:30] = 2.
+    # each row has five values; the first is 0 or 1 -- presumably
+    # (batch index, x1, y1, x2, y2), TODO confirm against DCNv2Pooling
+    rois = torch.tensor([
+        [0, 65, 65, 103, 103],
+        [1, 81, 41, 119, 79],
+    ]).cuda().float()
+    pooling = DCNv2Pooling(spatial_scale=1.0 / 4,
+                           pooled_size=7,
+                           output_dim=16,
+                           no_trans=True,
+                           group_size=1,
+                           trans_std=0.1).cuda()
+
+    # an empty tensor is passed in the offset position for the no_trans=True layer
+    out = pooling(input, rois, input.new())
+    s = ', '.join(['%f' % out[i, :, :, :].mean().item() for i in range(rois.shape[0])])
+    print(s)
+
+    dpooling = DCNv2Pooling(spatial_scale=1.0 / 4,
+                            pooled_size=7,
+                            output_dim=16,
+                            no_trans=False,
+                            group_size=1,
+                            trans_std=0.1).cuda()
+    # NOTE(review): offset has 20 entries along dim 0 while rois has 2 rows -- confirm intended
+    offset = torch.randn(20, 2, 7, 7).cuda().zero_()
+    dout = dpooling(input, rois, offset)
+    s = ', '.join(['%f' % dout[i, :, :, :].mean().item() for i in range(rois.shape[0])])
+    print(s)
+
+def check_gradient_dpooling():
+    """Run ``gradcheck`` on a ``DCNv2Pooling`` layer with random ROIs and print the result.
+
+    Gradients are requested for ``input`` and ``offset`` only. Requires CUDA.
+    """
+    input = torch.randn(2, 3, 5, 5).cuda() * 0.01
+    # local N (number of ROIs) shadows the module-level N inside this function
+    N = 4
+    batch_inds = torch.randint(2, (N, 1)).cuda().float()
+    x = torch.rand((N, 1)).cuda().float() * 15
+    y = torch.rand((N, 1)).cuda().float() * 15
+    w = torch.rand((N, 1)).cuda().float() * 10
+    h = torch.rand((N, 1)).cuda().float() * 10
+    # one row per ROI: (batch_inds, x, y, x + w, y + h)
+    rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
+    offset = torch.randn(N, 2, 3, 3).cuda()
+    dpooling = DCNv2Pooling(spatial_scale=1.0 / 4,
+                            pooled_size=3,
+                            output_dim=3,
+                            no_trans=False,
+                            group_size=1,
+                            trans_std=0.0).cuda()
+    input.requires_grad = True
+    offset.requires_grad = True
+    print('check_gradient_dpooling', gradcheck(dpooling, (input, rois, offset), eps=1e-4))
+
+
+def example_dconv():
+    """Run one forward/backward pass through a ``DCN`` layer and print the output shape.
+
+    Requires CUDA.
+    """
+    from dcn_v2 import DCN
+    input = torch.randn(2, 64, 128, 128).cuda()
+    # DCN wraps the offset and mask handling, so only the input is passed in
+    layer = DCN(64, 64, kernel_size=(3,3), stride=1, padding=1, deformable_groups=2)
+    layer = layer.cuda()
+    output = layer(input)
+    # random target of the same type and size as the output
+    target = output.new(*output.size())
+    target.data.uniform_(-0.01, 0.01)
+    loss = (target - output).mean()
+    loss.backward()
+    print(output.shape)
+
+def example_dpooling():
+    """Run forward and backward through two ``DCNv2Pooling`` layers on random ROIs.
+
+    One layer is built with ``no_trans=True`` and one with ``no_trans=False``;
+    both output shapes are printed. Requires CUDA.
+    """
+    from dcn_v2 import DCNv2Pooling
+    input = torch.randn(2, 32, 64, 64).cuda()
+    batch_inds = torch.randint(2, (20, 1)).cuda().float()
+    x = torch.randint(256, (20, 1)).cuda().float()
+    y = torch.randint(256, (20, 1)).cuda().float()
+    w = torch.randint(64, (20, 1)).cuda().float()
+    h = torch.randint(64, (20, 1)).cuda().float()
+    # one row per ROI: (batch_inds, x, y, x + w, y + h)
+    rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
+    offset = torch.randn(20, 2, 7, 7).cuda()
+    input.requires_grad = True
+    offset.requires_grad = True
+
+    # normal roi_align
+    pooling = DCNv2Pooling(spatial_scale=1.0 / 4,
+                           pooled_size=7,
+                           output_dim=32,
+                           no_trans=True,
+                           group_size=1,
+                           trans_std=0.1).cuda()
+
+    # deformable pooling
+    dpooling = DCNv2Pooling(spatial_scale=1.0 / 4,
+                            pooled_size=7,
+                            output_dim=32,
+                            no_trans=False,
+                            group_size=1,
+                            trans_std=0.1).cuda()
+
+    out = pooling(input, rois, offset)
+    dout = dpooling(input, rois, offset)
+    print(out.shape)
+    print(dout.shape)
+
+    # random targets of the same type and size as each output
+    target_out = out.new(*out.size())
+    target_out.data.uniform_(-0.01, 0.01)
+    target_dout = dout.new(*dout.size())
+    target_dout.data.uniform_(-0.01, 0.01)
+    # back-propagate each mean error in turn
+    e = (target_out - out).mean()
+    e.backward()
+    e = (target_dout - dout).mean()
+    e.backward()
+
+def example_mdpooling():
+    """Run one forward/backward pass through a ``DCNPooling`` layer and print the output shape.
+
+    Unlike the ``DCNv2Pooling`` example, no offset tensor is passed to the layer.
+    Requires CUDA.
+    """
+    from dcn_v2 import DCNPooling
+    input = torch.randn(2, 32, 64, 64).cuda()
+    input.requires_grad = True
+    batch_inds = torch.randint(2, (20, 1)).cuda().float()
+    x = torch.randint(256, (20, 1)).cuda().float()
+    y = torch.randint(256, (20, 1)).cuda().float()
+    w = torch.randint(64, (20, 1)).cuda().float()
+    h = torch.randint(64, (20, 1)).cuda().float()
+    # one row per ROI: (batch_inds, x, y, x + w, y + h)
+    rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
+
+    # deformable pooling (V2) through DCNPooling; called with input and rois only
+    dpooling = DCNPooling(spatial_scale=1.0 / 4,
+                         pooled_size=7,
+                         output_dim=32,
+                         no_trans=False,
+                         group_size=1,
+                         trans_std=0.1).cuda()
+
+    dout = dpooling(input, rois)
+    # random target of the same type and size as the output
+    target = dout.new(*dout.size())
+    target.data.uniform_(-0.1, 0.1)
+    error = (target - dout).mean()
+    error.backward()
+    print(dout.shape)
+
+# Script entry point: run the examples first, then the checks.
+if __name__ == '__main__':
+
+    example_dconv()
+    example_dpooling()
+    example_mdpooling()
+
+    check_pooling_zero_offset()
+    # zero offset check (only run when input and output channel counts match)
+    if inC == outC:
+        check_zero_offset()
+
+    check_gradient_dpooling()
+
+    # Disabled gradient checks for the convolution. The original text called
+    # check_gradient_double() / check_gradient(), which are not defined in this
+    # file; the names below are the functions that are defined here.
+    # # gradient check
+    # try:
+    #     check_gradient_dconv_double()
+    # except TypeError:
+    #     print('''****** You can switch to double precision in dcn_v2_func.py by (un)commenting these two lines:
+    #              ****** from _ext import dcn_v2 as _backend
+    #              ****** from _ext import dcn_v2_double as _backend''')
+    #     print('****** Your tensor may not be **double** type')
+    #     print('****** Switching to **float** type')
+    #
+    #     check_gradient_dconv()
+    # finally:
+    #     print('****** Note: backward is not reentrant error may not be a serious problem, '
+    #           '****** since the max error is less than 1e-7\n'
+    #           '****** Still looking for what trigger this problem')
\ No newline at end of file
--- a/src/lib/models/networks/dlav0.py
+++ b/src/lib/models/networks/dlav0.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from os.path import join
+
+import torch
+from torch import nn
+import torch.utils.model_zoo as model_zoo
+
+import numpy as np
+
+BatchNorm = nn.BatchNorm2d
+
+def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
+    """Return the download URL of a pretrained DLA checkpoint.
+
+    The URL has the form
+    ``http://dl.yf.io/dla/models/<data>/<name>-<hash>.pth``.
+
+    Args:
+        data: dataset sub-directory on the model server.
+        name: model name.
+        hash: hash suffix of the checkpoint file name.
+    """
+    # URLs always use '/', so join with posixpath instead of the
+    # platform-dependent os.path (which yields backslashes on Windows).
+    import posixpath
+    filename = '{}-{}.pth'.format(name, hash)
+    return posixpath.join('http://dl.yf.io/dla/models', data, filename)
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
+    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **conv_options)
+
+
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 conv + BatchNorm stages.
+
+    Only the first conv applies ``stride``; both convs use ``dilation`` as
+    their dilation and padding, and neither has a bias.
+    """
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BasicBlock, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,
+                               stride=stride, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn1 = BatchNorm(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
+                               stride=1, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn2 = BatchNorm(planes)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        """Apply the block; ``residual`` defaults to ``x`` when omitted."""
+        if residual is None:
+            residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        # add the skip connection in place, then apply the final ReLU
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    """Residual bottleneck block: 1x1 conv, 3x3 conv, 1x1 conv, each followed by BatchNorm.
+
+    The inner width is ``planes // expansion``; only the 3x3 conv applies
+    ``stride`` and ``dilation``.
+    """
+    # divisor applied to ``planes`` to get the inner width; read at construction time
+    expansion = 2
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(Bottleneck, self).__init__()
+        expansion = Bottleneck.expansion
+        bottle_planes = planes // expansion
+        self.conv1 = nn.Conv2d(inplanes, bottle_planes,
+                               kernel_size=1, bias=False)
+        self.bn1 = BatchNorm(bottle_planes)
+        self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
+                               stride=stride, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn2 = BatchNorm(bottle_planes)
+        self.conv3 = nn.Conv2d(bottle_planes, planes,
+                               kernel_size=1, bias=False)
+        self.bn3 = BatchNorm(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        """Apply the block; ``residual`` defaults to ``x`` when omitted."""
+        if residual is None:
+            residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        # add the skip connection in place, then apply the final ReLU
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class BottleneckX(nn.Module):
+    """Residual bottleneck block whose 3x3 conv is grouped (``groups=cardinality``).
+
+    The inner width is ``planes * cardinality // 32``; only the 3x3 conv
+    applies ``stride`` and ``dilation``.
+    """
+    # not read inside this class; the factory functions in this file assign to it
+    expansion = 2
+    # number of groups of the 3x3 conv; read at construction time
+    cardinality = 32
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BottleneckX, self).__init__()
+        cardinality = BottleneckX.cardinality
+        # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0)))
+        # bottle_planes = dim * cardinality
+        bottle_planes = planes * cardinality // 32
+        self.conv1 = nn.Conv2d(inplanes, bottle_planes,
+                               kernel_size=1, bias=False)
+        self.bn1 = BatchNorm(bottle_planes)
+        self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
+                               stride=stride, padding=dilation, bias=False,
+                               dilation=dilation, groups=cardinality)
+        self.bn2 = BatchNorm(bottle_planes)
+        self.conv3 = nn.Conv2d(bottle_planes, planes,
+                               kernel_size=1, bias=False)
+        self.bn3 = BatchNorm(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        """Apply the block; ``residual`` defaults to ``x`` when omitted."""
+        if residual is None:
+            residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        # add the skip connection in place, then apply the final ReLU
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Root(nn.Module):
+    """Fuse several feature maps: concatenate on dim 1, then conv, BatchNorm and ReLU.
+
+    When ``residual`` is true the first input is added back before the ReLU.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, residual):
+        super(Root, self).__init__()
+        # NOTE(review): the conv kernel is fixed at 1 while the padding is derived
+        # from ``kernel_size``; the two only agree when kernel_size == 1 -- confirm.
+        self.conv = nn.Conv2d(
+            in_channels, out_channels, 1,
+            stride=1, bias=False, padding=(kernel_size - 1) // 2)
+        self.bn = BatchNorm(out_channels)
+        self.relu = nn.ReLU(inplace=True)
+        self.residual = residual
+
+    def forward(self, *x):
+        """Aggregate the positional inputs ``x`` into one tensor."""
+        children = x
+        x = self.conv(torch.cat(x, 1))
+        x = self.bn(x)
+        if self.residual:
+            # skip connection from the first input
+            x += children[0]
+        x = self.relu(x)
+
+        return x
+
+
+class Tree(nn.Module):
+    """Recursive aggregation node.
+
+    With ``levels == 1`` the two children are ``block`` instances and a
+    ``Root`` fuses their outputs; otherwise the children are ``Tree`` nodes of
+    ``levels - 1`` and no ``Root`` is created on this node.
+    """
+    def __init__(self, levels, block, in_channels, out_channels, stride=1,
+                 level_root=False, root_dim=0, root_kernel_size=1,
+                 dilation=1, root_residual=False):
+        super(Tree, self).__init__()
+        # default root width: the outputs of both children concatenated
+        if root_dim == 0:
+            root_dim = 2 * out_channels
+        # a level root also feeds its (downsampled) input into the root
+        if level_root:
+            root_dim += in_channels
+        if levels == 1:
+            self.tree1 = block(in_channels, out_channels, stride,
+                               dilation=dilation)
+            self.tree2 = block(out_channels, out_channels, 1,
+                               dilation=dilation)
+        else:
+            self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
+                              stride, root_dim=0,
+                              root_kernel_size=root_kernel_size,
+                              dilation=dilation, root_residual=root_residual)
+            # tree2 receives tree1's output as an extra child, hence + out_channels
+            self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
+                              root_dim=root_dim + out_channels,
+                              root_kernel_size=root_kernel_size,
+                              dilation=dilation, root_residual=root_residual)
+        if levels == 1:
+            self.root = Root(root_dim, out_channels, root_kernel_size,
+                             root_residual)
+        self.level_root = level_root
+        self.root_dim = root_dim
+        self.downsample = None
+        self.project = None
+        self.levels = levels
+        # max-pool the input when this node strides
+        if stride > 1:
+            self.downsample = nn.MaxPool2d(stride, stride=stride)
+        # 1x1 conv + BatchNorm projection when the channel count changes
+        if in_channels != out_channels:
+            self.project = nn.Sequential(
+                nn.Conv2d(in_channels, out_channels,
+                          kernel_size=1, stride=1, bias=False),
+                BatchNorm(out_channels)
+            )
+
+    def forward(self, x, residual=None, children=None):
+        """Apply the node.
+
+        ``children`` accumulates feature maps for the root of the leaf-level
+        node. The ``residual`` argument is always overwritten below, so the
+        value passed in is not used.
+        """
+        children = [] if children is None else children
+        bottom = self.downsample(x) if self.downsample else x
+        residual = self.project(bottom) if self.project else bottom
+        if self.level_root:
+            children.append(bottom)
+        x1 = self.tree1(x, residual)
+        if self.levels == 1:
+            x2 = self.tree2(x1)
+            x = self.root(x2, x1, *children)
+        else:
+            children.append(x1)
+            x = self.tree2(x1, children=children)
+        return x
+
+
+class DLA(nn.Module):
+    """Backbone with a base layer, six levels, average pooling and a 1x1 conv head.
+
+    ``level0`` and ``level1`` are plain conv stacks; ``level2`` to ``level5``
+    are ``Tree`` modules with stride 2. With ``return_levels=True`` the
+    forward pass returns the six level outputs instead of the head output.
+    """
+    def __init__(self, levels, channels, num_classes=1000,
+                 block=BasicBlock, residual_root=False, return_levels=False,
+                 pool_size=7, linear_root=False):
+        # NOTE(review): ``linear_root`` is accepted but never read in this constructor.
+        super(DLA, self).__init__()
+        self.channels = channels
+        self.return_levels = return_levels
+        self.num_classes = num_classes
+        self.base_layer = nn.Sequential(
+            nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
+                      padding=3, bias=False),
+            BatchNorm(channels[0]),
+            nn.ReLU(inplace=True))
+        self.level0 = self._make_conv_level(
+            channels[0], channels[0], levels[0])
+        self.level1 = self._make_conv_level(
+            channels[0], channels[1], levels[1], stride=2)
+        self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
+                           level_root=False,
+                           root_residual=residual_root)
+        self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
+                           level_root=True, root_residual=residual_root)
+        self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
+                           level_root=True, root_residual=residual_root)
+        self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
+                           level_root=True, root_residual=residual_root)
+
+        self.avgpool = nn.AvgPool2d(pool_size)
+        self.fc = nn.Conv2d(channels[-1], num_classes, kernel_size=1,
+                            stride=1, padding=0, bias=True)
+
+        # conv weights ~ N(0, sqrt(2 / (kh * kw * out_channels)));
+        # BatchNorm weight set to 1 and bias to 0
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_level(self, block, inplanes, planes, blocks, stride=1):
+        """Stack ``blocks`` instances of ``block`` into an ``nn.Sequential``.
+
+        NOTE(review): not called anywhere in this class. It passes a
+        ``downsample=`` keyword that BasicBlock, Bottleneck and BottleneckX in
+        this file do not accept, and the follow-up blocks are built with
+        ``inplanes`` rather than ``planes`` inputs -- confirm before using.
+        """
+        downsample = None
+        if stride != 1 or inplanes != planes:
+            downsample = nn.Sequential(
+                nn.MaxPool2d(stride, stride=stride),
+                nn.Conv2d(inplanes, planes,
+                          kernel_size=1, stride=1, bias=False),
+                BatchNorm(planes),
+            )
+
+        layers = []
+        layers.append(block(inplanes, planes, stride, downsample=downsample))
+        for i in range(1, blocks):
+            layers.append(block(inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
+        """Build ``convs`` repetitions of 3x3 conv + BatchNorm + ReLU.
+
+        Only the first conv applies ``stride``; later convs use stride 1 and
+        take ``planes`` input channels.
+        """
+        modules = []
+        for i in range(convs):
+            modules.extend([
+                nn.Conv2d(inplanes, planes, kernel_size=3,
+                          stride=stride if i == 0 else 1,
+                          padding=dilation, bias=False, dilation=dilation),
+                BatchNorm(planes),
+                nn.ReLU(inplace=True)])
+            inplanes = planes
+        return nn.Sequential(*modules)
+
+    def forward(self, x):
+        """Run the network.
+
+        Returns the list of the six level outputs when ``return_levels`` is
+        set; otherwise the pooled head output flattened to (batch, -1).
+        """
+        y = []
+        x = self.base_layer(x)
+        for i in range(6):
+            x = getattr(self, 'level{}'.format(i))(x)
+            y.append(x)
+        if self.return_levels:
+            return y
+        else:
+            x = self.avgpool(x)
+            x = self.fc(x)
+            x = x.view(x.size(0), -1)
+
+            return x
+
+    def load_pretrained_model(self,  data='imagenet', name='dla34', hash='ba72cf86'):
+        """Load pretrained weights into this model, keeping the current ``fc`` head.
+
+        If ``name`` ends with '.pth' the weights are read with
+        ``torch.load(data + name)`` (plain string concatenation, no path
+        separator is inserted); otherwise they are downloaded from the URL
+        built by ``get_model_url(data, name, hash)``.
+        """
+        fc = self.fc
+        if name.endswith('.pth'):
+            model_weights = torch.load(data + name)
+        else:
+            model_url = get_model_url(data, name, hash)
+            model_weights = model_zoo.load_url(model_url)
+        # length of the last checkpoint entry is taken as the class count
+        num_classes = len(model_weights[list(model_weights.keys())[-1]])
+        # temporary head sized like the checkpoint so load_state_dict matches
+        self.fc = nn.Conv2d(
+            self.channels[-1], num_classes,
+            kernel_size=1, stride=1, padding=0, bias=True)
+        self.load_state_dict(model_weights)
+        # restore the head this model was built with
+        self.fc = fc
+
+
+def dla34(pretrained, **kwargs):  # DLA-34
+    """Build a DLA network from ``BasicBlock`` units with the DLA-34 layout.
+
+    When ``pretrained`` is truthy the 'imagenet' checkpoint 'dla34' with hash
+    'ba72cf86' is loaded. Extra keyword arguments go to the ``DLA`` constructor.
+    """
+    level_depths = [1, 1, 1, 2, 2, 1]
+    level_channels = [16, 32, 64, 128, 256, 512]
+    net = DLA(level_depths, level_channels, block=BasicBlock, **kwargs)
+    if pretrained:
+        net.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86')
+    return net
+
+
+def dla46_c(pretrained=None, **kwargs):  # DLA-46-C
+    """Build a DLA network from ``Bottleneck`` units with the DLA-46-C layout.
+
+    Sets ``Bottleneck.expansion`` to 2 as a side effect. When ``pretrained``
+    is not None it is passed as the ``data`` argument of
+    ``load_pretrained_model`` together with the name 'dla46_c'.
+    """
+    Bottleneck.expansion = 2
+    level_depths = [1, 1, 1, 2, 2, 1]
+    level_channels = [16, 32, 64, 64, 128, 256]
+    net = DLA(level_depths, level_channels, block=Bottleneck, **kwargs)
+    if pretrained is None:
+        return net
+    net.load_pretrained_model(pretrained, 'dla46_c')
+    return net
+
+
+def dla46x_c(pretrained=None, **kwargs):  # DLA-X-46-C
+    """Build a DLA network from ``BottleneckX`` units with the DLA-X-46-C layout.
+
+    Sets ``BottleneckX.expansion`` to 2 as a side effect. When ``pretrained``
+    is not None it is passed as the ``data`` argument of
+    ``load_pretrained_model`` together with the name 'dla46x_c'.
+    """
+    BottleneckX.expansion = 2
+    level_depths = [1, 1, 1, 2, 2, 1]
+    level_channels = [16, 32, 64, 64, 128, 256]
+    net = DLA(level_depths, level_channels, block=BottleneckX, **kwargs)
+    if pretrained is None:
+        return net
+    net.load_pretrained_model(pretrained, 'dla46x_c')
+    return net
+
+
+def dla60x_c(pretrained, **kwargs):  # DLA-X-60-C
+    """Build a DLA network from ``BottleneckX`` units with the DLA-X-60-C layout.
+
+    Sets ``BottleneckX.expansion`` to 2 as a side effect. When ``pretrained``
+    is truthy the 'imagenet' checkpoint 'dla60x_c' with hash 'b870c45c' is loaded.
+    """
+    BottleneckX.expansion = 2
+    level_depths = [1, 1, 1, 2, 3, 1]
+    level_channels = [16, 32, 64, 64, 128, 256]
+    net = DLA(level_depths, level_channels, block=BottleneckX, **kwargs)
+    if pretrained:
+        net.load_pretrained_model(data='imagenet', name='dla60x_c', hash='b870c45c')
+    return net
+
+
+def dla60(pretrained=None, **kwargs):  # DLA-60
+    """Build a DLA network from ``Bottleneck`` units with the DLA-60 layout.
+
+    Sets ``Bottleneck.expansion`` to 2 as a side effect. When ``pretrained``
+    is not None it is passed as the ``data`` argument of
+    ``load_pretrained_model`` together with the name 'dla60'.
+    """
+    Bottleneck.expansion = 2
+    level_depths = [1, 1, 1, 2, 3, 1]
+    level_channels = [16, 32, 128, 256, 512, 1024]
+    net = DLA(level_depths, level_channels, block=Bottleneck, **kwargs)
+    if pretrained is None:
+        return net
+    net.load_pretrained_model(pretrained, 'dla60')
+    return net
+
+
+def dla60x(pretrained=None, **kwargs):  # DLA-X-60
+    """Build a DLA network from ``BottleneckX`` units with the DLA-X-60 layout.
+
+    Sets ``BottleneckX.expansion`` to 2 as a side effect. When ``pretrained``
+    is not None it is passed as the ``data`` argument of
+    ``load_pretrained_model`` together with the name 'dla60x'.
+    """
+    BottleneckX.expansion = 2
+    level_depths = [1, 1, 1, 2, 3, 1]
+    level_channels = [16, 32, 128, 256, 512, 1024]
+    net = DLA(level_depths, level_channels, block=BottleneckX, **kwargs)
+    if pretrained is None:
+        return net
+    net.load_pretrained_model(pretrained, 'dla60x')
+    return net
+
+
+def dla102(pretrained=None, **kwargs):  # DLA-102
+    """Build a DLA network from ``Bottleneck`` units with the DLA-102 layout.
+
+    Uses ``residual_root=True`` and sets ``Bottleneck.expansion`` to 2 as a
+    side effect. When ``pretrained`` is not None it is passed as the ``data``
+    argument of ``load_pretrained_model`` together with the name 'dla102'.
+    """
+    Bottleneck.expansion = 2
+    level_depths = [1, 1, 1, 3, 4, 1]
+    level_channels = [16, 32, 128, 256, 512, 1024]
+    net = DLA(level_depths, level_channels,
+              block=Bottleneck, residual_root=True, **kwargs)
+    if pretrained is None:
+        return net
+    net.load_pretrained_model(pretrained, 'dla102')
+    return net
+
+
+def dla102x(pretrained=None, **kwargs):  # DLA-X-102
+    """Build a DLA network from ``BottleneckX`` units with the DLA-X-102 layout.
+
+    Uses ``residual_root=True`` and sets ``BottleneckX.expansion`` to 2 as a
+    side effect. When ``pretrained`` is not None it is passed as the ``data``
+    argument of ``load_pretrained_model`` together with the name 'dla102x'.
+    """
+    BottleneckX.expansion = 2
+    level_depths = [1, 1, 1, 3, 4, 1]
+    level_channels = [16, 32, 128, 256, 512, 1024]
+    net = DLA(level_depths, level_channels,
+              block=BottleneckX, residual_root=True, **kwargs)
+    if pretrained is None:
+        return net
+    net.load_pretrained_model(pretrained, 'dla102x')
+    return net
+
+
+def dla102x2(pretrained=None, **kwargs):  # DLA-X-102 64
+    """Build a DLA network from ``BottleneckX`` units with cardinality 64.
+
+    Uses ``residual_root=True`` and sets ``BottleneckX.cardinality`` to 64 as a
+    side effect. When ``pretrained`` is not None it is passed as the ``data``
+    argument of ``load_pretrained_model`` together with the name 'dla102x2'.
+    """
+    BottleneckX.cardinality = 64
+    level_depths = [1, 1, 1, 3, 4, 1]
+    level_channels = [16, 32, 128, 256, 512, 1024]
+    net = DLA(level_depths, level_channels,
+              block=BottleneckX, residual_root=True, **kwargs)
+    if pretrained is None:
+        return net
+    net.load_pretrained_model(pretrained, 'dla102x2')
+    return net
+
+
+def dla169(pretrained=None, **kwargs):  # DLA-169
+    """Build a DLA network from ``Bottleneck`` units with the DLA-169 layout.
+
+    Uses ``residual_root=True`` and sets ``Bottleneck.expansion`` to 2 as a
+    side effect. When ``pretrained`` is not None it is passed as the ``data``
+    argument of ``load_pretrained_model`` together with the name 'dla169'.
+    """
+    Bottleneck.expansion = 2
+    level_depths = [1, 1, 2, 3, 5, 1]
+    level_channels = [16, 32, 128, 256, 512, 1024]
+    net = DLA(level_depths, level_channels,
+              block=Bottleneck, residual_root=True, **kwargs)
+    if pretrained is None:
+        return net
+    net.load_pretrained_model(pretrained, 'dla169')
+    return net
+
+
+def set_bn(bn):
+    """Replace the module-level ``BatchNorm`` factory with ``bn``.
+
+    Only modules constructed after this call pick up the new factory, since
+    the classes in this file look up ``BatchNorm`` when they are built.
+
+    Args:
+        bn: callable used in place of ``nn.BatchNorm2d``.
+    """
+    global BatchNorm
+    BatchNorm = bn
+    # The former ``dla.BatchNorm = bn`` line referenced a name ``dla`` that is
+    # never imported or defined in this module, so every call raised NameError.
+
+
+class Identity(nn.Module):
+    """Pass-through module: ``forward`` hands back its argument unchanged."""
+
+    def __init__(self):
+        nn.Module.__init__(self)
+
+    def forward(self, x):
+        """Return ``x`` itself."""
+        return x
+
+
+def fill_up_weights(up):
+    """Fill ``up.weight`` in place with a separable triangular kernel.
+
+    The kernel is computed once for slot [0, 0] and then copied into slot
+    [c, 0] for every other index c along dimension 0.
+    """
+    kernel = up.weight.data
+    half = math.ceil(kernel.size(2) / 2)
+    center = (2 * half - 1 - half % 2) / (2. * half)
+    for row in range(kernel.size(2)):
+        row_weight = 1 - math.fabs(row / half - center)
+        for col in range(kernel.size(3)):
+            col_weight = 1 - math.fabs(col / half - center)
+            kernel[0, 0, row, col] = row_weight * col_weight
+    # replicate the first kernel across the remaining slots of dimension 0
+    for ch in range(1, kernel.size(0)):
+        kernel[ch, 0, :, :] = kernel[0, 0, :, :]
+
+
+class IDAUp(nn.Module):
+    """Project, upsample and progressively merge a list of feature maps.
+
+    For input i a ``proj_i`` and an ``up_i`` module are registered; for every
+    input after the first a ``node_i`` module merges it with the running result.
+    """
+    def __init__(self, node_kernel, out_dim, channels, up_factors):
+        super(IDAUp, self).__init__()
+        self.channels = channels
+        self.out_dim = out_dim
+        for i, c in enumerate(channels):
+            # projection to out_dim channels; skipped when they already match
+            if c == out_dim:
+                proj = Identity()
+            else:
+                proj = nn.Sequential(
+                    nn.Conv2d(c, out_dim,
+                              kernel_size=1, stride=1, bias=False),
+                    BatchNorm(out_dim),
+                    nn.ReLU(inplace=True))
+            f = int(up_factors[i])
+            # upsampling by factor f; skipped when f == 1
+            if f == 1:
+                up = Identity()
+            else:
+                # per-channel (groups=out_dim) transposed conv, initialised by fill_up_weights
+                up = nn.ConvTranspose2d(
+                    out_dim, out_dim, f * 2, stride=f, padding=f // 2,
+                    output_padding=0, groups=out_dim, bias=False)
+                fill_up_weights(up)
+            setattr(self, 'proj_' + str(i), proj)
+            setattr(self, 'up_' + str(i), up)
+
+        # each node takes two concatenated out_dim maps and returns one
+        for i in range(1, len(channels)):
+            node = nn.Sequential(
+                nn.Conv2d(out_dim * 2, out_dim,
+                          kernel_size=node_kernel, stride=1,
+                          padding=node_kernel // 2, bias=False),
+                BatchNorm(out_dim),
+                nn.ReLU(inplace=True))
+            setattr(self, 'node_' + str(i), node)
+
+        # re-initialise nn.Conv2d weights and BatchNorm parameters; ConvTranspose2d
+        # matches neither isinstance check here
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def forward(self, layers):
+        """Merge ``layers`` (one entry per configured channel).
+
+        Returns ``(x, y)`` where ``x`` is the final merged map and ``y`` the
+        list of every intermediate merge result, the last of which is ``x``.
+        """
+        assert len(self.channels) == len(layers), \
+            '{} vs {} layers'.format(len(self.channels), len(layers))
+        # copy so the caller's sequence is not modified
+        layers = list(layers)
+        for i, l in enumerate(layers):
+            upsample = getattr(self, 'up_' + str(i))
+            project = getattr(self, 'proj_' + str(i))
+            layers[i] = upsample(project(l))
+        x = layers[0]
+        y = []
+        for i in range(1, len(layers)):
+            node = getattr(self, 'node_' + str(i))
+            x = node(torch.cat([x, layers[i]], 1))
+            y.append(x)
+        return x, y
+
+
+class DLAUp(nn.Module):
+    """Chain of ``IDAUp`` stages that merges a feature pyramid into one map.
+
+    Stage ``ida_i`` merges the last ``i + 2`` entries of the pyramid.
+
+    Args:
+        channels: channel count of each pyramid level.
+        scales: scale of each level; stage i uses ``scales[j:] // scales[j]``
+            with ``j = -i - 2`` as its upsampling factors.
+        in_channels: input channel count of each level; defaults to ``channels``.
+    """
+    def __init__(self, channels, scales=(1, 2, 4, 8, 16), in_channels=None):
+        super(DLAUp, self).__init__()
+        self.channels = channels
+        channels = list(channels)
+        # Work on a private copy: the loop below rewrites in_channels in place,
+        # which used to modify the caller's sequence and failed for tuples.
+        in_channels = list(channels if in_channels is None else in_channels)
+        scales = np.array(scales, dtype=int)
+        for i in range(len(channels) - 1):
+            j = -i - 2
+            setattr(self, 'ida_{}'.format(i),
+                    IDAUp(3, channels[j], in_channels[j:],
+                          scales[j:] // scales[j]))
+            # after this stage the merged levels share level j's scale and width
+            scales[j + 1:] = scales[j]
+            in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]
+
+    def forward(self, layers):
+        """Merge ``layers`` (at least two entries) and return the last stage's output."""
+        layers = list(layers)
+        assert len(layers) > 1
+        for i in range(len(layers) - 1):
+            ida = getattr(self, 'ida_{}'.format(i))
+            x, y = ida(layers[-i - 2:])
+            # replace the merged tail with this stage's intermediate results
+            layers[-i - 1:] = y
+        return x
+
+def fill_fc_weights(layers):
+    """Initialise every ``nn.Conv2d`` found in ``layers.modules()``.
+
+    Weights are drawn from a normal distribution with std 0.001; a bias,
+    when present, is set to 0.
+    """
+    conv_layers = (mod for mod in layers.modules() if isinstance(mod, nn.Conv2d))
+    for conv in conv_layers:
+        nn.init.normal_(conv.weight, std=0.001)
+        if conv.bias is None:
+            continue
+        nn.init.constant_(conv.bias, 0)
+
+class DLASeg(nn.Module):
+    """Backbone + ``DLAUp`` with one output head per entry of ``heads``.
+
+    ``heads`` maps a head name to its number of output channels; each head is
+    stored as an attribute named after its key. ``forward`` returns a
+    one-element list holding a dict of head outputs.
+    """
+    def __init__(self, base_name, heads,
+                 pretrained=True, down_ratio=4, head_conv=256):
+        super(DLASeg, self).__init__()
+        assert down_ratio in [2, 4, 8, 16]
+        self.heads = heads
+        # index of the first backbone level that is used: log2(down_ratio)
+        self.first_level = int(np.log2(down_ratio))
+        # look up the factory function named ``base_name`` in this module
+        self.base = globals()[base_name](
+          pretrained=pretrained, return_levels=True)
+        channels = self.base.channels
+        scales = [2 ** i for i in range(len(channels[self.first_level:]))]
+        self.dla_up = DLAUp(channels[self.first_level:], scales=scales)
+        '''
+        self.fc = nn.Sequential(
+            nn.Conv2d(channels[self.first_level], classes, kernel_size=1,
+                      stride=1, padding=0, bias=True)
+        )
+        '''
+
+        for head in self.heads:
+            classes = self.heads[head]
+            if head_conv > 0:
+                # 3x3 conv + ReLU + 1x1 conv head
+                fc = nn.Sequential(
+                  nn.Conv2d(channels[self.first_level], head_conv,
+                    kernel_size=3, padding=1, bias=True),
+                  nn.ReLU(inplace=True),
+                  nn.Conv2d(head_conv, classes, 
+                    kernel_size=1, stride=1, 
+                    padding=0, bias=True))
+                # heads whose name contains 'hm' only get their last bias set to
+                # -2.19; all other heads are initialised by fill_fc_weights
+                if 'hm' in head:
+                    fc[-1].bias.data.fill_(-2.19)
+                else:
+                    fill_fc_weights(fc)
+            else:
+                # single 1x1 conv head
+                fc = nn.Conv2d(channels[self.first_level], classes, 
+                  kernel_size=1, stride=1, 
+                  padding=0, bias=True)
+                if 'hm' in head:
+                    fc.bias.data.fill_(-2.19)
+                else:
+                    fill_fc_weights(fc)
+            self.__setattr__(head, fc)
+
+        '''
+        up_factor = 2 ** self.first_level
+        if up_factor > 1:
+            up = nn.ConvTranspose2d(classes, classes, up_factor * 2,
+                                    stride=up_factor, padding=up_factor // 2,
+                                    output_padding=0, groups=classes,
+                                    bias=False)
+            fill_up_weights(up)
+            up.weight.requires_grad = False
+        else:
+            up = Identity()
+        self.up = up
+        self.softmax = nn.LogSoftmax(dim=1)
+        
+
+        for m in self.fc.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+        '''
+
+    def forward(self, x):
+        """Return ``[ret]`` where ``ret`` maps each head name to that head's output."""
+        x = self.base(x)
+        # only the levels from first_level onwards are merged
+        x = self.dla_up(x[self.first_level:])
+        # x = self.fc(x)
+        # y = self.softmax(self.up(x))
+        ret = {}
+        for head in self.heads:
+            ret[head] = self.__getattr__(head)(x)
+        return [ret]
+
+    '''
+    def optim_parameters(self, memo=None):
+        for param in self.base.parameters():
+            yield param
+        for param in self.dla_up.parameters():
+            yield param
+        for param in self.fc.parameters():
+            yield param
+    '''
+'''
+def dla34up(classes, pretrained_base=None, **kwargs):
+    model = DLASeg('dla34', classes, pretrained_base=pretrained_base, **kwargs)
+    return model
+
+
+def dla60up(classes, pretrained_base=None, **kwargs):
+    model = DLASeg('dla60', classes, pretrained_base=pretrained_base, **kwargs)
+    return model
+
+
+def dla102up(classes, pretrained_base=None, **kwargs):
+    model = DLASeg('dla102', classes,
+                   pretrained_base=pretrained_base, **kwargs)
+    return model
+
+
+def dla169up(classes, pretrained_base=None, **kwargs):
+    model = DLASeg('dla169', classes,
+                   pretrained_base=pretrained_base, **kwargs)
+    return model
+'''
+
+def get_pose_net(num_layers, heads, add_conv=256, down_ratio=4, head_conv=None):
+  """Build a ``DLASeg`` model on top of the ``dla<num_layers>`` backbone.
+
+  Args:
+    num_layers: Backbone depth; used to form the backbone name
+      ``'dla{num_layers}'``.
+    heads: Mapping of head name to number of output channels.
+    add_conv: Width of the intermediate conv in each head (original
+      positional parameter, kept for backward compatibility).
+    down_ratio: Output stride forwarded to ``DLASeg``.
+    head_conv: Keyword alias for ``add_conv``; takes precedence when given.
+
+  Returns:
+    The constructed ``DLASeg`` instance (pretrained backbone requested).
+  """
+  # The body previously read an undefined name ``head_conv``; resolve it
+  # from whichever of the two spellings the caller supplied.
+  if head_conv is None:
+    head_conv = add_conv
+  model = DLASeg('dla{}'.format(num_layers), heads,
+                 pretrained=True,
+                 down_ratio=down_ratio,
+                 head_conv=head_conv)
+  return model
--- a/src/lib/models/networks/large_hourglass.py
+++ b/src/lib/models/networks/large_hourglass.py
+# ------------------------------------------------------------------------------
+# This code is based on
+# CornerNet (https://github.com/princeton-vl/CornerNet)
+# Copyright (c) 2018, University of Michigan
+# Licensed under the BSD 3-Clause License
+# ------------------------------------------------------------------------------
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+class convolution(nn.Module):
+    """Conv2d -> optional BatchNorm2d -> ReLU.
+
+    The convolution uses a ``k x k`` kernel with padding ``(k - 1) // 2`` on
+    both axes and carries a bias only when batch norm is disabled.
+    """
+    def __init__(self, k, inp_dim, out_dim, stride=1, with_bn=True):
+        super(convolution, self).__init__()
+
+        half = (k - 1) // 2
+        self.conv = nn.Conv2d(
+            inp_dim, out_dim, (k, k),
+            padding=(half, half), stride=(stride, stride),
+            bias=not with_bn,
+        )
+        if with_bn:
+            self.bn = nn.BatchNorm2d(out_dim)
+        else:
+            # Empty Sequential stands in when normalisation is disabled.
+            self.bn = nn.Sequential()
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        return self.relu(self.bn(self.conv(x)))
+
+class fully_connected(nn.Module):
+    """Linear -> optional BatchNorm1d -> ReLU."""
+    def __init__(self, inp_dim, out_dim, with_bn=True):
+        super(fully_connected, self).__init__()
+        self.with_bn = with_bn
+
+        self.linear = nn.Linear(inp_dim, out_dim)
+        # The norm layer only exists as an attribute when it is enabled.
+        if with_bn:
+            self.bn = nn.BatchNorm1d(out_dim)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        hidden = self.linear(x)
+        if self.with_bn:
+            hidden = self.bn(hidden)
+        return self.relu(hidden)
+
+class residual(nn.Module):
+    """Two 3x3 conv/BN stages with a skip connection and a final ReLU.
+
+    The skip path is a 1x1 conv + BatchNorm2d when the stride or channel
+    count changes, and an empty ``nn.Sequential`` otherwise.  The ``k`` and
+    ``with_bn`` arguments are accepted but not used by this implementation.
+    """
+    def __init__(self, k, inp_dim, out_dim, stride=1, with_bn=True):
+        super(residual, self).__init__()
+
+        strides = (stride, stride)
+        self.conv1 = nn.Conv2d(inp_dim, out_dim, (3, 3), padding=(1, 1), stride=strides, bias=False)
+        self.bn1   = nn.BatchNorm2d(out_dim)
+        self.relu1 = nn.ReLU(inplace=True)
+
+        self.conv2 = nn.Conv2d(out_dim, out_dim, (3, 3), padding=(1, 1), bias=False)
+        self.bn2   = nn.BatchNorm2d(out_dim)
+
+        needs_projection = stride != 1 or inp_dim != out_dim
+        if needs_projection:
+            self.skip = nn.Sequential(
+                nn.Conv2d(inp_dim, out_dim, (1, 1), stride=strides, bias=False),
+                nn.BatchNorm2d(out_dim)
+            )
+        else:
+            self.skip = nn.Sequential()
+        self.relu  = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        main = self.relu1(self.bn1(self.conv1(x)))
+        main = self.bn2(self.conv2(main))
+
+        shortcut = self.skip(x)
+        return self.relu(main + shortcut)
+
+def make_layer(k, inp_dim, out_dim, modules, layer=convolution, **kwargs):
+    """Stack ``modules`` layers: the first maps ``inp_dim -> out_dim``, the
+    remaining ones keep ``out_dim``.  At least one layer is always built."""
+    first = layer(k, inp_dim, out_dim, **kwargs)
+    rest = [layer(k, out_dim, out_dim, **kwargs) for _ in range(1, modules)]
+    return nn.Sequential(first, *rest)
+
+def make_layer_revr(k, inp_dim, out_dim, modules, layer=convolution, **kwargs):
+    """Stack ``modules`` layers in "reverse" order: all but the last keep
+    ``inp_dim`` and the last one maps ``inp_dim -> out_dim``."""
+    stack = [layer(k, inp_dim, inp_dim, **kwargs) for _ in range(modules - 1)]
+    stack.append(layer(k, inp_dim, out_dim, **kwargs))
+    return nn.Sequential(*stack)
+
+class MergeUp(nn.Module):
+    """Merge two branches by adding them together."""
+    def forward(self, up1, up2):
+        merged = up1 + up2
+        return merged
+
+def make_merge_layer(dim):
+    """Return a new ``MergeUp`` module; ``dim`` is accepted but unused."""
+    merge = MergeUp()
+    return merge
+
+# def make_pool_layer(dim):
+#     return nn.MaxPool2d(kernel_size=2, stride=2)
+
+def make_pool_layer(dim):
+    """Return an empty ``nn.Sequential``; ``dim`` is accepted but unused."""
+    passthrough = nn.Sequential()
+    return passthrough
+
+def make_unpool_layer(dim):
+    """Return an ``nn.Upsample`` with scale factor 2; ``dim`` is unused."""
+    upsample = nn.Upsample(scale_factor=2)
+    return upsample
+
+def make_kp_layer(cnv_dim, curr_dim, out_dim):
+    """Build a prediction head: a 3x3 ``convolution`` without batch norm
+    (``cnv_dim -> curr_dim``) followed by a 1x1 ``nn.Conv2d``
+    (``curr_dim -> out_dim``)."""
+    feature = convolution(3, cnv_dim, curr_dim, with_bn=False)
+    projection = nn.Conv2d(curr_dim, out_dim, (1, 1))
+    return nn.Sequential(feature, projection)
+
+def make_inter_layer(dim):
+    """Return a ``residual`` block that maps ``dim -> dim``."""
+    inter = residual(3, dim, dim)
+    return inter
+
+def make_cnv_layer(inp_dim, out_dim):
+    """Return a 3x3 ``convolution`` block mapping ``inp_dim -> out_dim``."""
+    cnv = convolution(3, inp_dim, out_dim)
+    return cnv
+
+class kp_module(nn.Module):
+    # Recursive hourglass module of depth `n`.  forward() merges an upper
+    # branch (up1) with a lower branch (max1 -> low1 -> low2 -> low3 -> up2).
+    # `dims` and `modules` are indexed per level: index 0 is used for this
+    # level and index 1 for the next; the tails are passed to the nested module.
+    # The make_* arguments are factories so callers can swap building blocks.
+    def __init__(
+        self, n, dims, modules, layer=residual,
+        make_up_layer=make_layer, make_low_layer=make_layer,
+        make_hg_layer=make_layer, make_hg_layer_revr=make_layer_revr,
+        make_pool_layer=make_pool_layer, make_unpool_layer=make_unpool_layer,
+        make_merge_layer=make_merge_layer, **kwargs
+    ):
+        super(kp_module, self).__init__()
+
+        self.n   = n
+
+        # Number of layers per stack at this level and at the next level.
+        curr_mod = modules[0]
+        next_mod = modules[1]
+
+        # Channel widths at this level and at the next level.
+        curr_dim = dims[0]
+        next_dim = dims[1]
+
+        # Upper branch: keeps curr_dim channels.
+        self.up1  = make_up_layer(
+            3, curr_dim, curr_dim, curr_mod, 
+            layer=layer, **kwargs
+        )  
+        # Lower branch entry: pool, then map curr_dim -> next_dim.
+        self.max1 = make_pool_layer(curr_dim)
+        self.low1 = make_hg_layer(
+            3, curr_dim, next_dim, curr_mod,
+            layer=layer, **kwargs
+        )
+        # Recurse while n > 1; at the innermost level use a plain layer stack.
+        self.low2 = kp_module(
+            n - 1, dims[1:], modules[1:], layer=layer, 
+            make_up_layer=make_up_layer, 
+            make_low_layer=make_low_layer,
+            make_hg_layer=make_hg_layer,
+            make_hg_layer_revr=make_hg_layer_revr,
+            make_pool_layer=make_pool_layer,
+            make_unpool_layer=make_unpool_layer,
+            make_merge_layer=make_merge_layer,
+            **kwargs
+        ) if self.n > 1 else \
+        make_low_layer(
+            3, next_dim, next_dim, next_mod,
+            layer=layer, **kwargs
+        )
+        # Lower branch exit: map next_dim back to curr_dim, then unpool.
+        self.low3 = make_hg_layer_revr(
+            3, next_dim, curr_dim, curr_mod,
+            layer=layer, **kwargs
+        )
+        self.up2  = make_unpool_layer(curr_dim)
+
+        self.merge = make_merge_layer(curr_dim)
+
+    def forward(self, x):
+        # Both branches start from the same input x.
+        up1  = self.up1(x)
+        max1 = self.max1(x)
+        low1 = self.low1(max1)
+        low2 = self.low2(low1)
+        low3 = self.low3(low2)
+        up2  = self.up2(low3)
+        return self.merge(up1, up2)
+
+class exkp(nn.Module):
+    # Stack of `nstack` hourglass modules (kp_module) sharing one stem (`pre`).
+    # Each stack has its own feature conv and its own set of output heads;
+    # forward() returns a list with one dict of head outputs per stack.
+    # NOTE: make_tl_layer, make_br_layer and make_tag_layer are accepted but
+    # not referenced anywhere in this class body.
+    def __init__(
+        self, n, nstack, dims, modules, heads, pre=None, cnv_dim=256, 
+        make_tl_layer=None, make_br_layer=None,
+        make_cnv_layer=make_cnv_layer, make_heat_layer=make_kp_layer,
+        make_tag_layer=make_kp_layer, make_regr_layer=make_kp_layer,
+        make_up_layer=make_layer, make_low_layer=make_layer, 
+        make_hg_layer=make_layer, make_hg_layer_revr=make_layer_revr,
+        make_pool_layer=make_pool_layer, make_unpool_layer=make_unpool_layer,
+        make_merge_layer=make_merge_layer, make_inter_layer=make_inter_layer, 
+        kp_layer=residual
+    ):
+        super(exkp, self).__init__()
+
+        self.nstack    = nstack
+        self.heads     = heads
+
+        curr_dim = dims[0]
+
+        # Default stem: 7x7 stride-2 convolution followed by a stride-2
+        # residual block (3 -> 128 -> 256 channels); a custom stem may be given.
+        self.pre = nn.Sequential(
+            convolution(7, 3, 128, stride=2),
+            residual(3, 128, 256, stride=2)
+        ) if pre is None else pre
+
+        # One hourglass per stack.
+        self.kps  = nn.ModuleList([
+            kp_module(
+                n, dims, modules, layer=kp_layer,
+                make_up_layer=make_up_layer,
+                make_low_layer=make_low_layer,
+                make_hg_layer=make_hg_layer,
+                make_hg_layer_revr=make_hg_layer_revr,
+                make_pool_layer=make_pool_layer,
+                make_unpool_layer=make_unpool_layer,
+                make_merge_layer=make_merge_layer
+            ) for _ in range(nstack)
+        ])
+        # One feature conv (curr_dim -> cnv_dim) per stack.
+        self.cnvs = nn.ModuleList([
+            make_cnv_layer(curr_dim, cnv_dim) for _ in range(nstack)
+        ])
+
+        # Modules used only between consecutive stacks, hence nstack - 1 each.
+        self.inters = nn.ModuleList([
+            make_inter_layer(curr_dim) for _ in range(nstack - 1)
+        ])
+
+        self.inters_ = nn.ModuleList([
+            nn.Sequential(
+                nn.Conv2d(curr_dim, curr_dim, (1, 1), bias=False),
+                nn.BatchNorm2d(curr_dim)
+            ) for _ in range(nstack - 1)
+        ])
+        self.cnvs_   = nn.ModuleList([
+            nn.Sequential(
+                nn.Conv2d(cnv_dim, curr_dim, (1, 1), bias=False),
+                nn.BatchNorm2d(curr_dim)
+            ) for _ in range(nstack - 1)
+        ])
+
+        ## keypoint heatmaps
+        # Each head is registered under its own name as a ModuleList with one
+        # module per stack.  Heads whose name contains 'hm' are built with
+        # make_heat_layer and get their final bias set to -2.19; all other
+        # heads are built with make_regr_layer.
+        for head in heads.keys():
+            if 'hm' in head:
+                module =  nn.ModuleList([
+                    make_heat_layer(
+                        cnv_dim, curr_dim, heads[head]) for _ in range(nstack)
+                ])
+                self.__setattr__(head, module)
+                for heat in self.__getattr__(head):
+                    heat[-1].bias.data.fill_(-2.19)
+            else:
+                module = nn.ModuleList([
+                    make_regr_layer(
+                        cnv_dim, curr_dim, heads[head]) for _ in range(nstack)
+                ])
+                self.__setattr__(head, module)
+
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, image):
+        # print('image shape', image.shape)
+        inter = self.pre(image)
+        outs  = []
+
+        for ind in range(self.nstack):
+            kp_, cnv_  = self.kps[ind], self.cnvs[ind]
+            kp  = kp_(inter)
+            cnv = cnv_(kp)
+
+            # Evaluate every head of this stack on the stack's features.
+            out = {}
+            for head in self.heads:
+                layer = self.__getattr__(head)[ind]
+                y = layer(cnv)
+                out[head] = y
+            
+            outs.append(out)
+            # For all but the last stack, form the next stack's input from the
+            # projected previous input plus the projected features.
+            if ind < self.nstack - 1:
+                inter = self.inters_[ind](inter) + self.cnvs_[ind](cnv)
+                inter = self.relu(inter)
+                inter = self.inters[ind](inter)
+        return outs
+
+
+def make_hg_layer(kernel, dim0, dim1, mod, layer=convolution, **kwargs):
+    """Build a stack whose first layer maps ``dim0 -> dim1`` with stride 2,
+    followed by ``mod - 1`` layers that keep ``dim1``.
+
+    Extra keyword arguments are accepted but not forwarded to ``layer``.
+    """
+    head = layer(kernel, dim0, dim1, stride=2)
+    tail = [layer(kernel, dim1, dim1) for _ in range(mod - 1)]
+    return nn.Sequential(head, *tail)
+
+
+class HourglassNet(exkp):
+    """``exkp`` preset: depth-5 hourglasses with fixed per-level channel
+    widths and layer counts, ``residual`` blocks and 256 feature channels."""
+    def __init__(self, heads, num_stacks=2):
+        depth = 5
+        level_dims = [256, 256, 384, 384, 384, 512]
+        level_modules = [2, 2, 2, 2, 2, 4]
+
+        hourglass_options = dict(
+            make_tl_layer=None,
+            make_br_layer=None,
+            make_pool_layer=make_pool_layer,
+            make_hg_layer=make_hg_layer,
+            kp_layer=residual,
+            cnv_dim=256,
+        )
+        super(HourglassNet, self).__init__(
+            depth, num_stacks, level_dims, level_modules, heads,
+            **hourglass_options
+        )
+
+def get_large_hourglass_net(num_layers, heads, head_conv):
+  """Return a two-stack ``HourglassNet`` for ``heads``.
+
+  ``num_layers`` and ``head_conv`` are accepted but not used here.
+  """
+  return HourglassNet(heads, 2)
--- a/src/lib/models/networks/msra_resnet.py
+++ b/src/lib/models/networks/msra_resnet.py
+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Written by Bin Xiao (Bin.Xiao@microsoft.com)
+# Modified by Xingyi Zhou
+# ------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import torch
+import torch.nn as nn
+import torch.utils.model_zoo as model_zoo
+
+BN_MOMENTUM = 0.1
+
+model_urls = {
+    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+}
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1."""
+    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **conv_options)
+
+
+class BasicBlock(nn.Module):
+    """Two 3x3 conv/BN stages with an additive shortcut and a final ReLU.
+
+    When ``downsample`` is given it is applied to the block input to produce
+    the shortcut; otherwise the input itself is used.
+    """
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+
+        # The shortcut is computed after the main path, as in the original.
+        if self.downsample is None:
+            shortcut = x
+        else:
+            shortcut = self.downsample(x)
+
+        out += shortcut
+        return self.relu(out)
+
+
+class Bottleneck(nn.Module):
+    """1x1 -> 3x3 -> 1x1 conv/BN bottleneck with an additive shortcut.
+
+    The last conv widens to ``planes * expansion`` channels.  When
+    ``downsample`` is given it is applied to the block input to produce the
+    shortcut; otherwise the input itself is used.
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        wide_planes = planes * self.expansion
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(planes, wide_planes, kernel_size=1,
+                               bias=False)
+        self.bn3 = nn.BatchNorm2d(wide_planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+
+        # The shortcut is computed after the main path, as in the original.
+        if self.downsample is None:
+            shortcut = x
+        else:
+            shortcut = self.downsample(x)
+
+        out += shortcut
+        return self.relu(out)
+
+
+class PoseResNet(nn.Module):
+    """ResNet backbone with a deconvolution decoder and per-head output convs.
+
+    The stem and four residual stages are followed by three stride-2
+    transposed-convolution stages of 256 channels each.  For every entry in
+    ``heads`` an output module is registered as an attribute named after the
+    head.
+
+    Args:
+        block: Residual block class; must expose an ``expansion`` attribute
+            and accept ``(inplanes, planes, stride, downsample)``.
+        layers: Number of blocks in each of the four residual stages.
+        heads: Mapping of head name to number of output channels.
+        head_conv: Width of the intermediate 3x3 conv in each head; when it
+            is not positive, each head is a single 1x1 conv.
+        **kwargs: Accepted and ignored.
+    """
+
+    def __init__(self, block, layers, heads, head_conv, **kwargs):
+        self.inplanes = 64
+        self.deconv_with_bias = False
+        self.heads = heads
+
+        super(PoseResNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
+                               bias=False)
+        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+
+        # used for deconv layers
+        self.deconv_layers = self._make_deconv_layer(
+            3,
+            [256, 256, 256],
+            [4, 4, 4],
+        )
+
+        # Heads are created in sorted name order so module registration (and
+        # therefore random initialisation) does not depend on dict ordering.
+        for head in sorted(self.heads):
+            num_output = self.heads[head]
+            if head_conv > 0:
+                fc = nn.Sequential(
+                    nn.Conv2d(256, head_conv,
+                              kernel_size=3, padding=1, bias=True),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(head_conv, num_output,
+                              kernel_size=1, stride=1, padding=0))
+            else:
+                fc = nn.Conv2d(
+                    in_channels=256,
+                    out_channels=num_output,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0
+                )
+            self.__setattr__(head, fc)
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        """Build one residual stage of ``blocks`` blocks and update ``self.inplanes``.
+
+        A 1x1 conv + BatchNorm shortcut is attached to the first block when
+        the stride or the channel count changes.
+        """
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def _get_deconv_cfg(self, deconv_kernel, index):
+        """Return ``(kernel, padding, output_padding)`` for a deconv kernel size.
+
+        Only kernel sizes 4, 3 and 2 are supported; ``index`` is unused.
+
+        Raises:
+            ValueError: If ``deconv_kernel`` is not 2, 3 or 4.
+        """
+        if deconv_kernel == 4:
+            padding = 1
+            output_padding = 0
+        elif deconv_kernel == 3:
+            padding = 1
+            output_padding = 1
+        elif deconv_kernel == 2:
+            padding = 0
+            output_padding = 0
+        else:
+            # Previously this fell through and failed with UnboundLocalError.
+            raise ValueError(
+                'unsupported deconv kernel size: {}'.format(deconv_kernel))
+
+        return deconv_kernel, padding, output_padding
+
+    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
+        """Build ``num_layers`` stages of ConvTranspose2d (stride 2) -> BatchNorm -> ReLU.
+
+        ``num_filters[i]`` and ``num_kernels[i]`` give the output channels and
+        kernel size of stage ``i``.  ``self.inplanes`` is updated as stages
+        are added.
+        """
+        assert num_layers == len(num_filters), \
+            'ERROR: num_deconv_layers is different len(num_deconv_filters)'
+        assert num_layers == len(num_kernels), \
+            'ERROR: num_deconv_layers is different len(num_deconv_kernels)'
+
+        layers = []
+        for i in range(num_layers):
+            kernel, padding, output_padding = \
+                self._get_deconv_cfg(num_kernels[i], i)
+
+            planes = num_filters[i]
+            layers.append(
+                nn.ConvTranspose2d(
+                    in_channels=self.inplanes,
+                    out_channels=planes,
+                    kernel_size=kernel,
+                    stride=2,
+                    padding=padding,
+                    output_padding=output_padding,
+                    bias=self.deconv_with_bias))
+            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
+            layers.append(nn.ReLU(inplace=True))
+            self.inplanes = planes
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Run the network and return a one-element list with a dict of head outputs."""
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.deconv_layers(x)
+        ret = {}
+        for head in self.heads:
+            ret[head] = self.__getattr__(head)(x)
+        return [ret]
+
+    def init_weights(self, num_layers, pretrained=True):
+        """Initialise decoder and heads, then load backbone weights from the model zoo.
+
+        Args:
+            num_layers: ResNet depth; selects the entry of ``model_urls``.
+            pretrained: Must be truthy; otherwise a ``ValueError`` is raised.
+
+        Raises:
+            ValueError: If ``pretrained`` is falsy.
+        """
+        if pretrained:
+            for _, m in self.deconv_layers.named_modules():
+                if isinstance(m, nn.ConvTranspose2d):
+                    nn.init.normal_(m.weight, std=0.001)
+                    if self.deconv_with_bias:
+                        nn.init.constant_(m.bias, 0)
+                elif isinstance(m, nn.BatchNorm2d):
+                    nn.init.constant_(m.weight, 1)
+                    nn.init.constant_(m.bias, 0)
+            for head in self.heads:
+                final_layer = self.__getattr__(head)
+                for m in final_layer.modules():
+                    if isinstance(m, nn.Conv2d):
+                        # Only the conv whose output width equals the head's
+                        # channel count is touched.
+                        if m.weight.shape[0] == self.heads[head]:
+                            if 'hm' in head:
+                                nn.init.constant_(m.bias, -2.19)
+                            else:
+                                nn.init.normal_(m.weight, std=0.001)
+                                nn.init.constant_(m.bias, 0)
+            url = model_urls['resnet{}'.format(num_layers)]
+            pretrained_state_dict = model_zoo.load_url(url)
+            print('=> loading pretrained model {}'.format(url))
+            # strict=False: the checkpoint has no decoder/head weights.
+            self.load_state_dict(pretrained_state_dict, strict=False)
+        else:
+            print('=> imagenet pretrained model does not exist')
+            print('=> please download it first')
+            raise ValueError('imagenet pretrained model does not exist')
+
+
+resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]),
+               34: (BasicBlock, [3, 4, 6, 3]),
+               50: (Bottleneck, [3, 4, 6, 3]),
+               101: (Bottleneck, [3, 4, 23, 3]),
+               152: (Bottleneck, [3, 8, 36, 3])}
+
+
+def get_pose_net(num_layers, heads, head_conv):
+  """Build a ``PoseResNet`` of depth ``num_layers`` and initialise its weights.
+
+  ``num_layers`` must be a key of ``resnet_spec``.
+  """
+  spec = resnet_spec[num_layers]
+  block_class, layers = spec
+
+  net = PoseResNet(block_class, layers, heads, head_conv=head_conv)
+  net.init_weights(num_layers, pretrained=True)
+  return net
--- a/src/lib/models/networks/pose_dla_dcn.py
+++ b/src/lib/models/networks/pose_dla_dcn.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import math
+import logging
+import numpy as np
+from os.path import join
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+
+from .DCNv2.dcn_v2 import DCN
+
+BN_MOMENTUM = 0.1
+logger = logging.getLogger(__name__)
+
+def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
+    """Return the download URL ``<base>/<data>/<name>-<hash>.pth`` for a DLA checkpoint.
+
+    The URL is assembled with ``posixpath.join`` so the separator is always
+    ``/``; the platform-dependent ``os.path.join`` would emit backslashes on
+    Windows and produce an invalid URL.
+    """
+    import posixpath
+
+    filename = '{}-{}.pth'.format(name, hash)
+    return posixpath.join('http://dl.yf.io/dla/models', data, filename)
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1."""
+    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **conv_options)
+
+
+class BasicBlock(nn.Module):
+    """Two dilated 3x3 conv/BN stages with an additive shortcut and final ReLU.
+
+    ``forward`` takes an optional ``residual`` tensor to add; when it is
+    ``None`` the block input is used as the shortcut.
+    """
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BasicBlock, self).__init__()
+        shared = dict(kernel_size=3, padding=dilation,
+                      bias=False, dilation=dilation)
+        self.conv1 = nn.Conv2d(inplanes, planes, stride=stride, **shared)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(planes, planes, stride=1, **shared)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        shortcut = x if residual is None else residual
+
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+
+        out += shortcut
+        return self.relu(out)
+
+
+class Bottleneck(nn.Module):
+    """1x1 -> dilated 3x3 -> 1x1 conv/BN bottleneck with an additive shortcut.
+
+    The inner width is ``planes // expansion``.  ``forward`` takes an
+    optional ``residual`` tensor to add; when it is ``None`` the block input
+    is used as the shortcut.
+    """
+    expansion = 2
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(Bottleneck, self).__init__()
+        inner = planes // Bottleneck.expansion
+        self.conv1 = nn.Conv2d(inplanes, inner, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(inner, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(inner, inner, kernel_size=3,
+                               stride=stride, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn2 = nn.BatchNorm2d(inner, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(inner, planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        shortcut = x if residual is None else residual
+
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+
+        out += shortcut
+        return self.relu(out)
+
+
+class BottleneckX(nn.Module):
+    """Grouped-convolution bottleneck: 1x1 -> grouped dilated 3x3 -> 1x1.
+
+    The 3x3 conv uses ``cardinality`` groups and the inner width is
+    ``planes * cardinality // 32``.  ``forward`` takes an optional
+    ``residual`` tensor to add; when it is ``None`` the block input is used
+    as the shortcut.
+    """
+    expansion = 2
+    cardinality = 32
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BottleneckX, self).__init__()
+        groups = BottleneckX.cardinality
+        inner = planes * groups // 32
+        self.conv1 = nn.Conv2d(inplanes, inner, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(inner, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(inner, inner, kernel_size=3,
+                               stride=stride, padding=dilation, bias=False,
+                               dilation=dilation, groups=groups)
+        self.bn2 = nn.BatchNorm2d(inner, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(inner, planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        shortcut = x if residual is None else residual
+
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+
+        out += shortcut
+        return self.relu(out)
+
+
+class Root(nn.Module):
+    """Fuse several feature maps: concatenate on dim 1, 1x1 conv, BN, ReLU.
+
+    When ``residual`` is truthy the first input is added to the
+    normalised result before the ReLU.  The conv kernel is always 1x1;
+    ``kernel_size`` only determines the padding, ``(kernel_size - 1) // 2``.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, residual):
+        super(Root, self).__init__()
+        pad = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(
+            in_channels, out_channels, 1,
+            stride=1, bias=False, padding=pad)
+        self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.residual = residual
+
+    def forward(self, *x):
+        inputs = x
+        fused = self.bn(self.conv(torch.cat(inputs, 1)))
+        if self.residual:
+            fused += inputs[0]
+        return self.relu(fused)
+
+
+class Tree(nn.Module):
+    # Recursive aggregation tree.  With levels == 1 the two children are
+    # `block` instances joined by a Root; with more levels the children are
+    # themselves Trees and the intermediate outputs are collected in a
+    # `children` list that is handed down to the innermost Root.
+    def __init__(self, levels, block, in_channels, out_channels, stride=1,
+                 level_root=False, root_dim=0, root_kernel_size=1,
+                 dilation=1, root_residual=False):
+        super(Tree, self).__init__()
+        # Default root input width: the two child outputs concatenated.
+        if root_dim == 0:
+            root_dim = 2 * out_channels
+        # A level root additionally feeds its (downsampled) input to the root.
+        if level_root:
+            root_dim += in_channels
+        if levels == 1:
+            self.tree1 = block(in_channels, out_channels, stride,
+                               dilation=dilation)
+            self.tree2 = block(out_channels, out_channels, 1,
+                               dilation=dilation)
+        else:
+            self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
+                              stride, root_dim=0,
+                              root_kernel_size=root_kernel_size,
+                              dilation=dilation, root_residual=root_residual)
+            # The second subtree's root also receives tree1's output, hence
+            # the extra out_channels in its root_dim.
+            self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
+                              root_dim=root_dim + out_channels,
+                              root_kernel_size=root_kernel_size,
+                              dilation=dilation, root_residual=root_residual)
+        # Only leaf-level trees own a Root module.
+        if levels == 1:
+            self.root = Root(root_dim, out_channels, root_kernel_size,
+                             root_residual)
+        self.level_root = level_root
+        self.root_dim = root_dim
+        self.downsample = None
+        self.project = None
+        self.levels = levels
+        # Max-pool the input for the shortcut when the tree is strided.
+        if stride > 1:
+            self.downsample = nn.MaxPool2d(stride, stride=stride)
+        # 1x1 conv + BN projection when the channel count changes.
+        if in_channels != out_channels:
+            self.project = nn.Sequential(
+                nn.Conv2d(in_channels, out_channels,
+                          kernel_size=1, stride=1, bias=False),
+                nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
+            )
+
+    def forward(self, x, residual=None, children=None):
+        # NOTE: the incoming `residual` argument is overwritten below before
+        # it is read; the shortcut is always derived from x.
+        children = [] if children is None else children
+        bottom = self.downsample(x) if self.downsample else x
+        residual = self.project(bottom) if self.project else bottom
+        if self.level_root:
+            children.append(bottom)
+        x1 = self.tree1(x, residual)
+        if self.levels == 1:
+            x2 = self.tree2(x1)
+            # Root input order: second child, first child, then the
+            # collected children.
+            x = self.root(x2, x1, *children)
+        else:
+            # The caller-visible list is extended in place and passed down.
+            children.append(x1)
+            x = self.tree2(x1, children=children)
+        return x
+
+
+class DLA(nn.Module):
+    """Backbone made of a base conv layer, two plain conv levels and four ``Tree`` levels.
+
+    ``forward`` returns the outputs of ``level0`` .. ``level5`` as a list.
+    """
+    def __init__(self, levels, channels, num_classes=1000,
+                 block=BasicBlock, residual_root=False, linear_root=False):
+        """Build the base layer and the six levels.
+
+        Args:
+            levels: sequence indexed 0..5. Entries 0 and 1 are the number of
+                convolutions in the two plain conv levels; entries 2..5 are
+                passed to ``Tree`` as its first argument.
+            channels: sequence indexed 0..5 with the output width of each level.
+            num_classes: stored on the instance; not otherwise used here.
+            block: block class handed to every ``Tree``.
+            residual_root: forwarded to every ``Tree`` as ``root_residual``.
+            linear_root: accepted but not used by this constructor.
+        """
+        super(DLA, self).__init__()
+        self.channels = channels
+        self.num_classes = num_classes
+        # 7x7 stem applied to a 3-channel input at stride 1.
+        self.base_layer = nn.Sequential(
+            nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
+                      padding=3, bias=False),
+            nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
+            nn.ReLU(inplace=True))
+        self.level0 = self._make_conv_level(
+            channels[0], channels[0], levels[0])
+        self.level1 = self._make_conv_level(
+            channels[0], channels[1], levels[1], stride=2)
+        # The positional ``2`` passed to each Tree is presumably its stride
+        # (Tree's signature is outside this view) - TODO confirm.
+        self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
+                           level_root=False,
+                           root_residual=residual_root)
+        self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
+                           level_root=True, root_residual=residual_root)
+        self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
+                           level_root=True, root_residual=residual_root)
+        self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
+                           level_root=True, root_residual=residual_root)
+
+        # for m in self.modules():
+        #     if isinstance(m, nn.Conv2d):
+        #         n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        #         m.weight.data.normal_(0, math.sqrt(2. / n))
+        #     elif isinstance(m, nn.BatchNorm2d):
+        #         m.weight.data.fill_(1)
+        #         m.bias.data.zero_()
+
+    def _make_level(self, block, inplanes, planes, blocks, stride=1):
+        """Stack ``blocks`` instances of ``block`` into an ``nn.Sequential``.
+
+        Not called anywhere in this class.
+        NOTE(review): every block after the first is still built with
+        ``inplanes`` input channels rather than ``planes`` - confirm whether
+        that is intended before using this helper.
+        """
+        downsample = None
+        # A projection shortcut is only needed when the shape changes.
+        if stride != 1 or inplanes != planes:
+            downsample = nn.Sequential(
+                nn.MaxPool2d(stride, stride=stride),
+                nn.Conv2d(inplanes, planes,
+                          kernel_size=1, stride=1, bias=False),
+                nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
+            )
+
+        layers = []
+        layers.append(block(inplanes, planes, stride, downsample=downsample))
+        for i in range(1, blocks):
+            layers.append(block(inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
+        """Return ``convs`` repetitions of 3x3 conv + batch norm + ReLU.
+
+        Only the first convolution uses ``stride``; the rest use stride 1.
+        Padding equals ``dilation``.
+        """
+        modules = []
+        for i in range(convs):
+            modules.extend([
+                nn.Conv2d(inplanes, planes, kernel_size=3,
+                          stride=stride if i == 0 else 1,
+                          padding=dilation, bias=False, dilation=dilation),
+                nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
+                nn.ReLU(inplace=True)])
+            # Every convolution after the first maps planes -> planes.
+            inplanes = planes
+        return nn.Sequential(*modules)
+
+    def forward(self, x):
+        """Apply the base layer, then level0..level5 in sequence.
+
+        Returns:
+            list with the output of each of the six levels, in order.
+        """
+        y = []
+        x = self.base_layer(x)
+        for i in range(6):
+            x = getattr(self, 'level{}'.format(i))(x)
+            y.append(x)
+        return y
+
+    def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'):
+        """Load a checkpoint into this model.
+
+        When ``name`` ends with ``.pth`` the weights are read with
+        ``torch.load(data + name)``; otherwise they are downloaded from the
+        URL produced by ``get_model_url(data, name, hash)``.
+
+        A 1x1 ``self.fc`` convolution is attached before loading. Its output
+        width is the length of the last entry in the checkpoint, so that the
+        state dict's keys match the module.
+        """
+        # fc = self.fc
+        if name.endswith('.pth'):
+            model_weights = torch.load(data + name)
+        else:
+            model_url = get_model_url(data, name, hash)
+            model_weights = model_zoo.load_url(model_url)
+        # Relies on the checkpoint's key order: the last tensor's length is
+        # taken as the class count (presumably the classifier bias - verify).
+        num_classes = len(model_weights[list(model_weights.keys())[-1]])
+        self.fc = nn.Conv2d(
+            self.channels[-1], num_classes,
+            kernel_size=1, stride=1, padding=0, bias=True)
+        self.load_state_dict(model_weights)
+        # self.fc = fc
+
+
+def dla34(pretrained=True, **kwargs):  # DLA-34
+    """Build the DLA-34 backbone, optionally loading its ImageNet weights.
+
+    Extra keyword arguments are forwarded to the ``DLA`` constructor.
+    """
+    level_depths = [1, 1, 1, 2, 2, 1]
+    level_widths = [16, 32, 64, 128, 256, 512]
+    net = DLA(level_depths, level_widths, block=BasicBlock, **kwargs)
+    if not pretrained:
+        return net
+    net.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86')
+    return net
+
+class Identity(nn.Module):
+    """Module whose forward pass hands back its input untouched."""
+
+    def __init__(self):
+        super(Identity, self).__init__()
+
+    def forward(self, x):
+        """Return ``x`` itself (no copy is made)."""
+        return x
+
+
+def fill_fc_weights(layers):
+    """Zero the bias of every ``nn.Conv2d`` inside ``layers``.
+
+    Convolution weights are left untouched; convolutions without a bias are
+    skipped.
+    """
+    convs = (m for m in layers.modules() if isinstance(m, nn.Conv2d))
+    for conv in convs:
+        if conv.bias is None:
+            continue
+        nn.init.constant_(conv.bias, 0)
+
+
+def fill_up_weights(up):
+    """Overwrite ``up.weight`` in place with a fixed separable triangular kernel.
+
+    The kernel is computed once for output channel 0 (input slot 0) and then
+    copied to slot 0 of every other output channel.
+    """
+    w = up.weight.data
+    rows, cols = w.size(2), w.size(3)
+    f = math.ceil(rows / 2)
+    center = (2 * f - 1 - f % 2) / (2. * f)
+
+    def tap(k):
+        # 1-D profile; the 2-D kernel is the product of two of these.
+        return 1 - math.fabs(k / f - center)
+
+    for i in range(rows):
+        for j in range(cols):
+            w[0, 0, i, j] = tap(i) * tap(j)
+    for ch in range(1, w.size(0)):
+        w[ch, 0, :, :] = w[0, 0, :, :]
+
+
+class DeformConv(nn.Module):
+    """3x3 ``DCN`` convolution followed by batch norm and ReLU."""
+
+    def __init__(self, chi, cho):
+        """``chi`` input channels are mapped to ``cho`` output channels."""
+        super(DeformConv, self).__init__()
+        norm = nn.BatchNorm2d(cho, momentum=BN_MOMENTUM)
+        act = nn.ReLU(inplace=True)
+        self.actf = nn.Sequential(norm, act)
+        self.conv = DCN(chi, cho, kernel_size=(3, 3), stride=1, padding=1,
+                        dilation=1, deformable_groups=1)
+
+    def forward(self, x):
+        return self.actf(self.conv(x))
+
+
+class IDAUp(nn.Module):
+    """Project, upsample and merge a run of feature maps.
+
+    For every index ``i >= 1`` of ``channels`` three submodules are registered:
+    ``proj_i`` (``channels[i]`` -> ``o``), ``up_i`` (a depthwise transposed
+    convolution with stride ``up_f[i]``) and ``node_i`` (``o`` -> ``o``).
+    """
+
+    def __init__(self, o, channels, up_f):
+        super(IDAUp, self).__init__()
+        for idx in range(1, len(channels)):
+            factor = int(up_f[idx])
+            proj = DeformConv(channels[idx], o)
+            node = DeformConv(o, o)
+            up = nn.ConvTranspose2d(o, o, factor * 2, stride=factor,
+                                    padding=factor // 2, output_padding=0,
+                                    groups=o, bias=False)
+            fill_up_weights(up)
+
+            suffix = str(idx)
+            setattr(self, 'proj_' + suffix, proj)
+            setattr(self, 'up_' + suffix, up)
+            setattr(self, 'node_' + suffix, node)
+
+    def forward(self, layers, startp, endp):
+        """Update ``layers[startp + 1:endp]`` in place; nothing is returned.
+
+        Each entry is projected and upsampled, then added to the entry before
+        it and passed through the matching node module.
+        """
+        for pos in range(startp + 1, endp):
+            tag = str(pos - startp)
+            upsample = getattr(self, 'up_' + tag)
+            project = getattr(self, 'proj_' + tag)
+            node = getattr(self, 'node_' + tag)
+            layers[pos] = upsample(project(layers[pos]))
+            layers[pos] = node(layers[pos] + layers[pos - 1])
+
+
+
+class DLAUp(nn.Module):
+    """Chain of ``IDAUp`` modules (``ida_0``, ``ida_1``, ...) applied from the
+    last feature map backwards.
+    """
+
+    def __init__(self, startp, channels, scales, in_channels=None):
+        """Register one ``IDAUp`` per channel entry except the last.
+
+        Args:
+            startp: index of the first layer that takes part in upsampling.
+            channels: per-level channel widths; stored as ``self.channels``.
+            scales: per-level scale factors, converted to an integer array.
+            in_channels: per-level input widths; defaults to ``channels``.
+        """
+        super(DLAUp, self).__init__()
+        self.startp = startp
+        self.channels = channels
+        # Work on private copies. The loop below rewrites the tail of
+        # ``in_channels``; without a copy that overwrote the caller's list
+        # (and ``self.channels`` whenever ``in_channels`` was left as None).
+        in_channels = list(channels if in_channels is None else in_channels)
+        channels = list(channels)
+        scales = np.array(scales, dtype=int)
+        for i in range(len(channels) - 1):
+            j = -i - 2
+            setattr(self, 'ida_{}'.format(i),
+                    IDAUp(channels[j], in_channels[j:],
+                          scales[j:] // scales[j]))
+            # Everything after position j now has level j's scale and width.
+            scales[j + 1:] = scales[j]
+            in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]
+
+    def forward(self, layers):
+        """Run the ``ida_*`` modules over ``layers`` (modified in place).
+
+        Returns:
+            list that starts as ``[layers[-1]]``; after each IDA step the
+            refreshed ``layers[-1]`` is inserted at the front.
+        """
+        out = [layers[-1]] # start with 16 * 16
+        for i in range(len(layers) - self.startp - 1):
+            ida = getattr(self, 'ida_{}'.format(i))
+            ida(layers, len(layers) -i - 2, len(layers))
+            out.insert(0, layers[-1])
+        return out
+
+
+class Interpolate(nn.Module):
+    """Module wrapper around ``F.interpolate`` with a fixed scale and mode."""
+
+    def __init__(self, scale, mode):
+        super(Interpolate, self).__init__()
+        self.mode = mode
+        self.scale = scale
+
+    def forward(self, x):
+        """Resize ``x`` by ``self.scale`` using ``self.mode`` (align_corners=False)."""
+        return F.interpolate(x, scale_factor=self.scale, mode=self.mode,
+                             align_corners=False)
+
+
+class DLASeg(nn.Module):
+    """Backbone + ``DLAUp`` + ``IDAUp`` with one convolutional head per entry of ``heads``."""
+    def __init__(self, base_name, heads, pretrained, down_ratio, final_kernel,
+                 last_level, head_conv, out_channel=0):
+        """Assemble the network.
+
+        Args:
+            base_name: name of a module-level factory (looked up through
+                ``globals()``) that builds the backbone.
+            heads: mapping of head name -> number of output channels.
+            pretrained: forwarded to the backbone factory.
+            down_ratio: one of 2, 4, 8, 16; its log2 selects the first
+                backbone level that is used.
+            final_kernel: kernel size of each head's last convolution.
+            last_level: exclusive upper bound of the backbone levels fed to
+                the final ``IDAUp``.
+            head_conv: width of the hidden 3x3 conv in each head; values
+                <= 0 give a single-convolution head.
+            out_channel: width of the final ``IDAUp``; 0 means the width of
+                the first used level.
+        """
+        super(DLASeg, self).__init__()
+        assert down_ratio in [2, 4, 8, 16]
+        self.first_level = int(np.log2(down_ratio))
+        self.last_level = last_level
+        self.base = globals()[base_name](pretrained=pretrained)
+        channels = self.base.channels
+        # Level k (counted from first_level) is upsampled by 2**k.
+        scales = [2 ** i for i in range(len(channels[self.first_level:]))]
+        self.dla_up = DLAUp(self.first_level, channels[self.first_level:], scales)
+
+        if out_channel == 0:
+            out_channel = channels[self.first_level]
+
+        self.ida_up = IDAUp(out_channel, channels[self.first_level:self.last_level], 
+                            [2 ** i for i in range(self.last_level - self.first_level)])
+        
+        self.heads = heads
+        for head in self.heads:
+            classes = self.heads[head]
+            if head_conv > 0:
+              fc = nn.Sequential(
+                  nn.Conv2d(channels[self.first_level], head_conv,
+                    kernel_size=3, padding=1, bias=True),
+                  nn.ReLU(inplace=True),
+                  nn.Conv2d(head_conv, classes, 
+                    kernel_size=final_kernel, stride=1, 
+                    padding=final_kernel // 2, bias=True))
+              # Heads whose name contains 'hm' get a constant negative bias
+              # on the last conv (presumably a low initial heatmap score -
+              # confirm); all other heads have their conv biases zeroed.
+              if 'hm' in head:
+                fc[-1].bias.data.fill_(-2.19)
+              else:
+                fill_fc_weights(fc)
+            else:
+              fc = nn.Conv2d(channels[self.first_level], classes, 
+                  kernel_size=final_kernel, stride=1, 
+                  padding=final_kernel // 2, bias=True)
+              if 'hm' in head:
+                fc.bias.data.fill_(-2.19)
+              else:
+                fill_fc_weights(fc)
+            # Register the head as a submodule under its own name.
+            self.__setattr__(head, fc)
+
+    def forward(self, x):
+        """Return a one-element list holding a dict of head name -> head output."""
+        x = self.base(x)
+        x = self.dla_up(x)
+
+        y = []
+        # Clone because ida_up overwrites the list entries in place.
+        for i in range(self.last_level - self.first_level):
+            y.append(x[i].clone())
+        self.ida_up(y, 0, len(y))
+
+        z = {}
+        for head in self.heads:
+            z[head] = self.__getattr__(head)(y[-1])
+        return [z]
+    
+
+def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4):
+  """Build a pretrained ``DLASeg`` on the ``dla<num_layers>`` backbone.
+
+  The final head kernel is fixed to 1 and the last level to 5.
+  """
+  base_name = 'dla{}'.format(num_layers)
+  return DLASeg(base_name, heads,
+                pretrained=True,
+                down_ratio=down_ratio,
+                final_kernel=1,
+                last_level=5,
+                head_conv=head_conv)
+
--- a/src/lib/models/networks/resnet_dcn.py
+++ b/src/lib/models/networks/resnet_dcn.py
+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Written by Bin Xiao (Bin.Xiao@microsoft.com)
+# Modified by Dequan Wang and Xingyi Zhou
+# ------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import math
+import logging
+
+import torch
+import torch.nn as nn
+from .DCNv2.dcn_v2 import DCN
+import torch.utils.model_zoo as model_zoo
+
+# Momentum passed to every nn.BatchNorm2d built in this module.
+BN_MOMENTUM = 0.1
+logger = logging.getLogger(__name__)
+
+# Checkpoint download URLs, keyed by 'resnet<depth>'; read by
+# PoseResNet.init_weights.
+model_urls = {
+    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+}
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
+    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
+
+
+class BasicBlock(nn.Module):
+    """Residual block of two 3x3 convolutions, each followed by batch norm."""
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        """``downsample``, when given, is applied to the input to form the shortcut."""
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+
+        # Identity shortcut unless a projection module was supplied.
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out += shortcut
+        return self.relu(out)
+
+
+class Bottleneck(nn.Module):
+    """Residual block of 1x1 -> 3x3 -> 1x1 convolutions.
+
+    The last convolution widens the output to ``planes * expansion`` channels.
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        """``stride`` is applied by the 3x3 convolution only."""
+        super(Bottleneck, self).__init__()
+        wide = planes * self.expansion
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(planes, wide, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(wide, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+
+        # Identity shortcut unless a projection module was supplied.
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out += shortcut
+        return self.relu(out)
+
+def fill_up_weights(up):
+    """Overwrite ``up.weight`` in place with a fixed separable triangular kernel.
+
+    The kernel is written to ``weight[0, 0]`` and then copied to
+    ``weight[c, 0]`` for every other output channel ``c``.
+    """
+    w = up.weight.data
+    height, width = w.size(2), w.size(3)
+    f = math.ceil(height / 2)
+    center = (2 * f - 1 - f % 2) / (2. * f)
+
+    def profile(k):
+        # 1-D falloff around ``center``; the kernel is its outer product.
+        return 1 - math.fabs(k / f - center)
+
+    for i in range(height):
+        for j in range(width):
+            w[0, 0, i, j] = profile(i) * profile(j)
+    for ch in range(1, w.size(0)):
+        w[ch, 0, :, :] = w[0, 0, :, :]
+
+def fill_fc_weights(layers):
+    """Re-initialise every ``nn.Conv2d`` inside ``layers``.
+
+    Weights are drawn from a zero-mean normal with std 0.001; biases, where
+    present, are set to 0.
+    """
+    convs = [m for m in layers.modules() if isinstance(m, nn.Conv2d)]
+    for conv in convs:
+        nn.init.normal_(conv.weight, std=0.001)
+        if conv.bias is None:
+            continue
+        nn.init.constant_(conv.bias, 0)
+
+class PoseResNet(nn.Module):
+    """ResNet trunk, a DCN + transposed-conv upsampling stack, and one
+    convolutional output head per entry of ``heads``.
+    """
+
+    def __init__(self, block, layers, heads, head_conv):
+        """Build trunk, upsampling stack and heads.
+
+        Args:
+            block: residual block class; must expose ``expansion``.
+            layers: number of blocks in each of the four trunk stages.
+            heads: mapping of head name -> number of output channels.
+            head_conv: width of the hidden 3x3 conv in each head; values
+                <= 0 give a single 1x1 convolution head.
+        """
+        self.inplanes = 64
+        self.heads = heads
+        self.deconv_with_bias = False
+
+        super(PoseResNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
+                               bias=False)
+        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+
+        # used for deconv layers
+        self.deconv_layers = self._make_deconv_layer(
+            3,
+            [256, 128, 64],
+            [4, 4, 4],
+        )
+
+        for head in self.heads:
+            classes = self.heads[head]
+            if head_conv > 0:
+                fc = nn.Sequential(
+                  nn.Conv2d(64, head_conv,
+                    kernel_size=3, padding=1, bias=True),
+                  nn.ReLU(inplace=True),
+                  nn.Conv2d(head_conv, classes,
+                    kernel_size=1, stride=1,
+                    padding=0, bias=True))
+                # Heads whose name contains 'hm' only get a constant bias on
+                # the last conv; every other head is fully re-initialised.
+                if 'hm' in head:
+                    fc[-1].bias.data.fill_(-2.19)
+                else:
+                    fill_fc_weights(fc)
+            else:
+                fc = nn.Conv2d(64, classes,
+                  kernel_size=1, stride=1,
+                  padding=0, bias=True)
+                if 'hm' in head:
+                    fc.bias.data.fill_(-2.19)
+                else:
+                    fill_fc_weights(fc)
+            # Register the head as a submodule under its own name.
+            self.__setattr__(head, fc)
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        """Stack ``blocks`` residual blocks and advance ``self.inplanes``.
+
+        The first block gets ``stride`` and, when the shape changes, a
+        1x1 conv + batch norm projection shortcut.
+        """
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def _get_deconv_cfg(self, deconv_kernel, index):
+        """Return ``(kernel, padding, output_padding)`` for a transposed conv.
+
+        ``index`` is accepted but not used.
+
+        Raises:
+            ValueError: if ``deconv_kernel`` is not 2, 3 or 4.
+        """
+        if deconv_kernel == 4:
+            padding = 1
+            output_padding = 0
+        elif deconv_kernel == 3:
+            padding = 1
+            output_padding = 1
+        elif deconv_kernel == 2:
+            padding = 0
+            output_padding = 0
+        else:
+            # Previously this fell through and failed with UnboundLocalError.
+            raise ValueError(
+                'unsupported deconv kernel size: {}'.format(deconv_kernel))
+
+        return deconv_kernel, padding, output_padding
+
+    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
+        """Build ``num_layers`` stages of DCN -> BN -> ReLU -> ConvTranspose2d
+        (stride 2) -> BN -> ReLU and advance ``self.inplanes`` to the last width.
+        """
+        assert num_layers == len(num_filters), \
+            'ERROR: num_deconv_layers is different len(num_deconv_filters)'
+        assert num_layers == len(num_kernels), \
+            'ERROR: num_deconv_layers is different len(num_deconv_kernels)'
+
+        layers = []
+        for i in range(num_layers):
+            kernel, padding, output_padding = \
+                self._get_deconv_cfg(num_kernels[i], i)
+
+            planes = num_filters[i]
+            fc = DCN(self.inplanes, planes,
+                    kernel_size=(3,3), stride=1,
+                    padding=1, dilation=1, deformable_groups=1)
+            up = nn.ConvTranspose2d(
+                    in_channels=planes,
+                    out_channels=planes,
+                    kernel_size=kernel,
+                    stride=2,
+                    padding=padding,
+                    output_padding=output_padding,
+                    bias=self.deconv_with_bias)
+            fill_up_weights(up)
+
+            layers.append(fc)
+            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
+            layers.append(nn.ReLU(inplace=True))
+            layers.append(up)
+            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
+            layers.append(nn.ReLU(inplace=True))
+            self.inplanes = planes
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Return a one-element list holding a dict of head name -> head output."""
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.deconv_layers(x)
+        ret = {}
+        for head in self.heads:
+            ret[head] = self.__getattr__(head)(x)
+        return [ret]
+
+    def init_weights(self, num_layers):
+        """Load the ``resnet<num_layers>`` checkpoint and reset deconv batch norms.
+
+        The checkpoint is loaded with ``strict=False``, so keys that do not
+        match this module are ignored. Every ``nn.BatchNorm2d`` inside
+        ``self.deconv_layers`` then gets weight 1 and bias 0.
+        """
+        url = model_urls['resnet{}'.format(num_layers)]
+        pretrained_state_dict = model_zoo.load_url(url)
+        print('=> loading pretrained model {}'.format(url))
+        self.load_state_dict(pretrained_state_dict, strict=False)
+        print('=> init deconv weights from normal distribution')
+        for name, m in self.deconv_layers.named_modules():
+            if isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+
+# ResNet depth -> (residual block class, number of blocks for layer1..layer4).
+resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]),
+               34: (BasicBlock, [3, 4, 6, 3]),
+               50: (Bottleneck, [3, 4, 6, 3]),
+               101: (Bottleneck, [3, 4, 23, 3]),
+               152: (Bottleneck, [3, 8, 36, 3])}
+
+
+def get_pose_net(num_layers, heads, head_conv=256):
+  """Build a ``PoseResNet`` of the given depth and load its pretrained trunk.
+
+  ``num_layers`` must be a key of ``resnet_spec``.
+  """
+  spec = resnet_spec[num_layers]
+  net = PoseResNet(spec[0], spec[1], heads, head_conv=head_conv)
+  net.init_weights(num_layers)
+  return net
--- a/src/lib/models/scatter_gather.py
+++ b/src/lib/models/scatter_gather.py
+import torch
+from torch.autograd import Variable
+from torch.nn.parallel._functions import Scatter, Gather
+
+
+def scatter(inputs, target_gpus, dim=0, chunk_sizes=None):
+    r"""
+    Split variables into chunks along ``dim`` and distribute them over
+    ``target_gpus``. Tuples, lists and dicts are walked recursively; any
+    other object is repeated once per target GPU. Plain tensors that are
+    not variables are rejected.
+    """
+    def _spread(item):
+        if isinstance(item, Variable):
+            return Scatter.apply(target_gpus, chunk_sizes, dim, item)
+        assert not torch.is_tensor(item), "Tensors not supported in scatter."
+        if isinstance(item, tuple):
+            return list(zip(*[_spread(part) for part in item]))
+        if isinstance(item, list):
+            return [list(group)
+                    for group in zip(*[_spread(part) for part in item])]
+        if isinstance(item, dict):
+            make = type(item)
+            return [make(pairs)
+                    for pairs in zip(*[_spread(kv) for kv in item.items()])]
+        # Anything else is shared by reference across all targets.
+        return [item for _ in target_gpus]
+
+    return _spread(inputs)
+
+
+def scatter_kwargs(inputs, kwargs, target_gpus, dim=0, chunk_sizes=None):
+    r"""Scatter positional and keyword arguments, padding the shorter result
+    with empty tuples / dicts so both have one entry per chunk."""
+    scattered_args = scatter(inputs, target_gpus, dim, chunk_sizes) if inputs else []
+    scattered_kwargs = scatter(kwargs, target_gpus, dim, chunk_sizes) if kwargs else []
+    shortfall = len(scattered_kwargs) - len(scattered_args)
+    if shortfall > 0:
+        scattered_args.extend([() for _ in range(shortfall)])
+    elif shortfall < 0:
+        scattered_kwargs.extend([{} for _ in range(-shortfall)])
+    return tuple(scattered_args), tuple(scattered_kwargs)
--- a/src/lib/models/utils.py
+++ b/src/lib/models/utils.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import torch.nn as nn
+
+def _sigmoid(x):
+  """Apply sigmoid to ``x`` in place and return it clamped to [1e-4, 1 - 1e-4]."""
+  squashed = x.sigmoid_()
+  return torch.clamp(squashed, min=1e-4, max=1-1e-4)
+
+def _gather_feat(feat, ind, mask=None):
+    """Pick rows of ``feat`` along dim 1 at the positions listed in ``ind``.
+
+    ``ind`` is broadcast over the last dimension of ``feat``. When ``mask``
+    is given, only the selected rows are kept and the result is flattened to
+    two dimensions.
+    """
+    channels = feat.size(2)
+    batch, count = ind.size(0), ind.size(1)
+    index = ind.unsqueeze(2).expand(batch, count, channels)
+    gathered = feat.gather(1, index)
+    if mask is None:
+        return gathered
+    keep = mask.unsqueeze(2).expand_as(gathered)
+    return gathered[keep].view(-1, channels)
+
+def _tranpose_and_gather_feat(feat, ind):
+    """Move dim 1 of a 4-D ``feat`` last, flatten dims 1-2, then gather at ``ind``."""
+    channels_last = feat.permute(0, 2, 3, 1).contiguous()
+    batch, channels = channels_last.size(0), channels_last.size(3)
+    flat = channels_last.view(batch, -1, channels)
+    return _gather_feat(flat, ind)
+
+def flip_tensor(x):
+    """Return ``x`` reversed along dimension 3."""
+    flipped = torch.flip(x, [3])
+    return flipped
+
+def flip_lr(x, flip_idx):
+  """Reverse ``x`` along its last axis and swap the channel pairs in ``flip_idx``.
+
+  The work is done on a detached NumPy copy; the result is returned as a new
+  tensor on ``x``'s device.
+  """
+  mirrored = x.detach().cpu().numpy()[..., ::-1].copy()
+  out_shape = mirrored.shape
+  for pair in flip_idx:
+    first, second = pair[0], pair[1]
+    from_second = mirrored[:, second, ...].copy()
+    from_first = mirrored[:, first, ...].copy()
+    mirrored[:, first, ...] = from_second
+    mirrored[:, second, ...] = from_first
+  return torch.from_numpy(mirrored.reshape(out_shape)).to(x.device)
+
+def flip_lr_off(x, flip_idx):
+  """Mirror a 4-D offset tensor left-right.
+
+  ``x`` is reversed along its last axis, its channel axis is read as
+  consecutive pairs, the first component of every pair is negated, and the
+  pairs listed in ``flip_idx`` are swapped. The number of pairs is derived
+  from the channel axis (previously hard-coded to 17, which made the reshape
+  fail for any tensor that did not have exactly 34 channels).
+
+  Args:
+    x: tensor indexed as ``x[batch, 2 * pair + component, ...]``.
+    flip_idx: iterable of ``(a, b)`` pair indices to exchange.
+
+  Returns:
+    A new tensor with ``x``'s shape on ``x``'s device.
+  """
+  tmp = x.detach().cpu().numpy()[..., ::-1].copy()
+  shape = tmp.shape
+  # Two channels per pair; an odd channel count still fails in reshape.
+  num_pairs = shape[1] // 2
+  tmp = tmp.reshape(shape[0], num_pairs, 2,
+                    shape[2], shape[3])
+  # The first component of each pair changes sign under mirroring.
+  tmp[:, :, 0, :, :] *= -1
+  for e in flip_idx:
+    tmp[:, e[0], ...], tmp[:, e[1], ...] = \
+      tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy()
+  return torch.from_numpy(tmp.reshape(shape)).to(x.device)
\ No newline at end of file
--- a/src/lib/opts_pose.py
+++ b/src/lib/opts_pose.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+
+class opts(object):
+    def __init__(self):
+        self.parser = argparse.ArgumentParser()
+        # basic experiment setting
+        self.parser.add_argument('--task', default='multi_pose',
+                                 help='ctdet | ddd | multi_pose | exdet')
+        self.parser.add_argument('--dataset', default='facehp',
+                                 help='coco | kitti | coco_hp | pascal | pig | face | facehp')
+        self.parser.add_argument('--exp_id', default='dla')
+        self.parser.add_argument('--test', action='store_true')
+        self.parser.add_argument('--debug', type=int, default=0,
+                                 help='level of visualization.'
+                                      '-1: return the result image'
+                                      '1: only show the final detection results'
+                                      '2: show the network output features'
+                                      '3: use matplot to display'  # useful when lunching training with ipython notebook
+                                      '4: save all visualizations to disk')
+        self.parser.add_argument('--demo', default='/path/WIDER_train/images/0--Parade/0_Parade_marchingband_1_80.jpg',
+                                 help='path to image/ image folders/ video. '
+                                      'or "webcam"')
+        self.parser.add_argument(
+            '--data_dir', type=str, default='')
+        self.parser.add_argument('--load_model', default='',
+                                 help='path to pretrained model')
+        self.parser.add_argument('--resume', action='store_true',
+                                 help='resume an experiment. '
+                                      'Reloaded the optimizer parameter and '
+                                      'set load_model to model_last.pth '
+                                      'in the exp dir if load_model is empty.')
+        self.parser.add_argument(
+            '--train_json', default=None, help='the train file labels')
+        self.parser.add_argument(
+            '--val_json', default=None, help='the test file labels')
+        self.parser.add_argument(
+            '--output_video', type=str, default='../output/res_3.mp4')
+
+        # system
+        self.parser.add_argument('--gpus', default='0',
+                                 help='-1 for CPU, use comma for multiple gpus, such as 1,2,3')
+        self.parser.add_argument('--num_workers', type=int, default=4,
+                                 help='dataloader threads. 0 for single-thread.')
+        self.parser.add_argument('--not_cuda_benchmark', action='store_true',
+                                 help='disable when the input size is not fixed.')
+        self.parser.add_argument('--seed', type=int, default=317,
+                                 help='random seed')  # from CornerNet
+
+        # log
+        self.parser.add_argument('--print_iter', type=int, default=0,
+                                 help='disable progress bar and print to screen.')
+        self.parser.add_argument('--hide_data_time', action='store_true',
+                                 help='not display time during training.')
+        self.parser.add_argument('--save_all', action='store_true', default=True,
+                                 help='save model to disk every 5 epochs.')
+        self.parser.add_argument('--metric', default='loss',
+                                 help='main metric to save best model')
+        self.parser.add_argument('--vis_thresh', type=float, default=0.4,
+                                 help='visualization threshold.')
+        self.parser.add_argument('--debugger_theme', default='white',
+                                 choices=['white', 'black'])
+
+        # model
+        self.parser.add_argument('--arch', default='mobilev2_10',
+                                 help='model architecture. Currently tested'
+                                      'res_18 | res_101 | resdcn_18 | resdcn_101 |'
+                                      'dlav0_34 | dla_34 | hourglass | mobilev2_10')
+        self.parser.add_argument('--head_conv', type=int, default=-1,
+                                 help='conv layer channels for output head'
+                                      '0 for no conv layer'
+                                      '-1 for default setting'
+                                      '64 for resnets and 256 for dla.')
+        self.parser.add_argument('--down_ratio', type=int, default=4,
+                                 help='output stride. Currently only supports 4.')
+
+        # input
+        self.parser.add_argument('--input_res', type=int, default=-1,
+                                 help='input height and width. -1 for default from '
+                                 'dataset. Will be overriden by input_h | input_w')
+        self.parser.add_argument('--input_h', type=int, default=-1,
+                                 help='input height. -1 for default from dataset.')
+        self.parser.add_argument('--input_w', type=int, default=-1,
+                                 help='input width. -1 for default from dataset.')
+
+        # train
+        self.parser.add_argument('--lr', type=float, default=1.25e-4,
+                                 help='learning rate for batch size 32.')
+        self.parser.add_argument('--lr_step', type=str, default='30,80',
+                                 help='drop learning rate by 10.')
+        self.parser.add_argument('--num_epochs', type=int, default=140,
+                                 help='total training epochs.')
+        self.parser.add_argument('--batch_size', type=int, default=32,
+                                 help='batch size')
+        self.parser.add_argument('--master_batch_size', type=int, default=15,
+                                 help='batch size on the master gpu.')
+        self.parser.add_argument('--num_iters', type=int, default=-1,
+                                 help='default: #samples / batch_size.')
+        self.parser.add_argument('--val_intervals', type=int, default=5,
+                                 help='number of epochs to run validation.')
+        self.parser.add_argument('--trainval', action='store_true',
+                                 help='include validation in training and '
+                                      'test on test set')
+
+        # test
+        self.parser.add_argument('--flip_test', action='store_true',
+                                 help='flip data augmentation.')
+        self.parser.add_argument('--test_scales', type=str, default='1',
+                                 help='multi scale test augmentation.')
+        self.parser.add_argument('--nms', action='store_true',
+                                 help='run nms in testing.')
+        self.parser.add_argument('--K', type=int, default=200,
+                                 help='max number of output objects.')
+        self.parser.add_argument('--not_prefetch_test', action='store_true',
+                                 help='not use parallal data pre-processing.')
+        self.parser.add_argument('--fix_res', action='store_true',
+                                 help='fix testing resolution or keep '
+                                      'the original resolution')
+        self.parser.add_argument('--keep_res', action='store_true',
+                                 help='keep the original resolution'
+                                      ' during validation.')
+
+        # dataset
+        self.parser.add_argument('--not_rand_crop', action='store_true',
+                                 help='not use the random crop data augmentation'
+                                      'from CornerNet.')
+        self.parser.add_argument('--shift', type=float, default=0.1,
+                                 help='when not using random crop'
+                                      'apply shift augmentation.')
+        self.parser.add_argument('--scale', type=float, default=0.4,
+                                 help='when not using random crop'
+                                      'apply scale augmentation.')
+        self.parser.add_argument('--rotate', type=float, default=0,
+                                 help='when not using random crop'
+                                      'apply rotation augmentation.')
+        self.parser.add_argument('--flip', type=float, default=0.5,
+                                 help='probability of applying flip augmentation.')
+        self.parser.add_argument('--no_color_aug', action='store_true',
+                                 help='not use the color augmenation '
+                                      'from CornerNet')
+        # multi_pose
+        self.parser.add_argument('--aug_rot', type=float, default=0,
+                                 help='probability of applying '
+                                      'rotation augmentation.')
+        # ddd
+        self.parser.add_argument('--aug_ddd', type=float, default=0.5,
+                                 help='probability of applying crop augmentation.')
+        self.parser.add_argument('--rect_mask', action='store_true',
+                                 help='for ignored object, apply mask on the '
+                                      'rectangular region or just center point.')
+        self.parser.add_argument('--kitti_split', default='3dop',
+                                 help='different validation split for kitti: '
+                                      '3dop | subcnn')
+
+        # loss
+        self.parser.add_argument('--mse_loss', action='store_true',
+                                 help='use mse loss or focal loss to train '
+                                      'keypoint heatmaps.')
+        # ctdet
+        self.parser.add_argument('--reg_loss', default='sl1',
+                                 help='regression loss: sl1 | l1 | l2')
+        self.parser.add_argument('--hm_weight', type=float, default=1,
+                                 help='loss weight for keypoint heatmaps.')
+        self.parser.add_argument('--off_weight', type=float, default=1,
+                                 help='loss weight for keypoint local offsets.')
+        self.parser.add_argument('--wh_weight', type=float, default=0.1,
+                                 help='loss weight for bounding box size.')
+        # multi_pose
+        self.parser.add_argument('--lm_weight', type=float, default=0.1,
+                                 help='loss weight for human pose offset.')
+        self.parser.add_argument('--hm_hp_weight', type=float, default=1,
+                                 help='loss weight for human keypoint heatmap.')
+        # ddd
+        self.parser.add_argument('--dep_weight', type=float, default=1,
+                                 help='loss weight for depth.')
+        self.parser.add_argument('--dim_weight', type=float, default=1,
+                                 help='loss weight for 3d bounding box size.')
+        self.parser.add_argument('--rot_weight', type=float, default=1,
+                                 help='loss weight for orientation.')
+        self.parser.add_argument('--peak_thresh', type=float, default=0.2)
+
+        # task
+        # ctdet
+        self.parser.add_argument('--norm_wh', action='store_true',
+                                 help='L1(\hat(y) / y, 1) or L1(\hat(y), y)')
+        self.parser.add_argument('--dense_wh', action='store_true',
+                                 help='apply weighted regression near center or '
+                                      'just apply regression on center point.')
+        self.parser.add_argument('--cat_spec_wh', action='store_true',
+                                 help='category specific bounding box size.')
+        self.parser.add_argument('--not_reg_offset', action='store_true',
+                                 help='not regress local offset.')
+        # exdet
+        self.parser.add_argument('--agnostic_ex', action='store_true',
+                                 help='use category agnostic extreme points.')
+        self.parser.add_argument('--scores_thresh', type=float, default=0.1,
+                                 help='threshold for extreme point heatmap.')
+        self.parser.add_argument('--center_thresh', type=float, default=0.1,
+                                 help='threshold for centermap.')
+        self.parser.add_argument('--aggr_weight', type=float, default=0.0,
+                                 help='edge aggregation weight.')
+        # multi_pose
+        self.parser.add_argument('--dense_hp', action='store_true',
+                                 help='apply weighted pose regression near center '
+                                      'or just apply regression on center point.')
+        self.parser.add_argument('--not_hm_hp', action='store_true',
+                                 help='not estimate human joint heatmap, '
+                                      'directly use the joint offset from center.')
+        self.parser.add_argument('--not_reg_hp_offset', action='store_true',
+                                 help='not regress local offset for '
+                                      'human joint heatmaps.')
+        self.parser.add_argument('--not_reg_bbox', action='store_true',
+                                 help='not regression bounding box size.')
+
+        # ground truth validation
+        self.parser.add_argument('--eval_oracle_hm', action='store_true',
+                                 help='use ground center heatmap.')
+        self.parser.add_argument('--eval_oracle_wh', action='store_true',
+                                 help='use ground truth bounding box size.')
+        self.parser.add_argument('--eval_oracle_offset', action='store_true',
+                                 help='use ground truth local heatmap offset.')
+        self.parser.add_argument('--eval_oracle_kps', action='store_true',
+                                 help='use ground truth human pose offset.')
+        self.parser.add_argument('--eval_oracle_hmhp', action='store_true',
+                                 help='use ground truth human joint heatmaps.')
+        self.parser.add_argument('--eval_oracle_hp_offset', action='store_true',
+                                 help='use ground truth human joint local offset.')
+        self.parser.add_argument('--eval_oracle_dep', action='store_true',
+                                 help='use ground truth depth.')
+
+    def parse(self, args=''):
+        """Parse the options and fill in every setting derived from them.
+
+        Args:
+            args: Argument list forwarded to ``parse_args``. The default
+                ``''`` means "no explicit list", in which case
+                ``parse_args()`` is called without arguments.
+
+        Returns:
+            The parsed namespace, extended with derived fields such as
+            ``gpus``, ``chunk_sizes``, ``save_dir`` and ``load_model``.
+        """
+        opt = self.parser.parse_args() if args == '' \
+            else self.parser.parse_args(args)
+
+        # Keep the raw string, then renumber the ids from zero; a negative
+        # first id collapses the list to [-1].
+        opt.gpus_str = opt.gpus
+        requested = [int(token) for token in opt.gpus.split(',')]
+        opt.gpus = list(range(len(requested))) if requested[0] >= 0 else [-1]
+        opt.lr_step = [int(token) for token in opt.lr_step.split(',')]
+        opt.test_scales = [float(token)
+                           for token in opt.test_scales.split(',')]
+
+        # Positive switches are the negation of the "keep_*"/"not_*" flags.
+        opt.fix_res = not opt.keep_res
+        if opt.fix_res:
+            print('Fix size testing.')
+        else:
+            print('Keep resolution testing.')
+        opt.reg_offset = not opt.not_reg_offset
+        opt.reg_bbox = not opt.not_reg_bbox
+        opt.hm_hp = not opt.not_hm_hp
+        opt.reg_hp_offset = opt.hm_hp and not opt.not_reg_hp_offset
+
+        # -1 asks for the architecture-dependent default head width.
+        if opt.head_conv == -1:
+            opt.head_conv = 256 if 'dla' in opt.arch else 64
+        opt.pad = 127 if 'hourglass' in opt.arch else 31
+        opt.num_stacks = 2 if opt.arch == 'hourglass' else 1
+
+        if opt.trainval:
+            opt.val_intervals = 100000000
+
+        # Debug runs use a single sample on a single device, no workers.
+        if opt.debug > 0:
+            opt.num_workers = 0
+            opt.batch_size = 1
+            opt.gpus = opt.gpus[:1]
+            opt.master_batch_size = -1
+
+        # Split the batch: the first device gets master_batch_size, the
+        # remainder is spread as evenly as possible over the other devices.
+        if opt.master_batch_size == -1:
+            opt.master_batch_size = opt.batch_size // len(opt.gpus)
+        opt.chunk_sizes = [opt.master_batch_size]
+        num_slaves = len(opt.gpus) - 1
+        if num_slaves > 0:
+            remaining = opt.batch_size - opt.master_batch_size
+            base, extra = divmod(remaining, num_slaves)
+            opt.chunk_sizes.extend(
+                base + 1 if idx < extra else base
+                for idx in range(num_slaves))
+        print('training chunk_sizes:', opt.chunk_sizes)
+
+        # Output layout: <root>/exp/<task>/<arch>[/debug]
+        here = os.path.dirname(__file__)
+        opt.root_dir = os.path.join(here, '..', '..')
+        opt.exp_dir = os.path.join(opt.root_dir, 'exp', opt.task)
+        opt.save_dir = os.path.join(opt.exp_dir, opt.arch)
+        opt.debug_dir = os.path.join(opt.save_dir, 'debug')
+        print('The output will be saved to ', opt.save_dir)
+
+        # When resuming without an explicit checkpoint, fall back to
+        # model_last.pth in the save directory (minus a trailing 'TEST').
+        if opt.resume and opt.load_model == '':
+            model_path = opt.save_dir
+            if model_path.endswith('TEST'):
+                model_path = model_path[:-4]
+            opt.load_model = os.path.join(model_path, 'model_last.pth')
+        return opt
+
+    def update_dataset_info_and_set_heads(self, opt, dataset):
+        """Copy dataset statistics into ``opt`` and choose the output heads.
+
+        The input size is resolved with the precedence
+        ``opt.input_h``/``opt.input_w`` > ``opt.input_res`` > dataset
+        default; the output size is the input size divided by
+        ``opt.down_ratio``. ``opt.heads`` maps each head name to its number
+        of channels and depends on ``opt.task``.
+
+        Args:
+            opt: Option namespace; modified in place.
+            dataset: Object exposing ``default_resolution``, ``mean``,
+                ``std`` and ``num_classes`` (plus ``flip_idx`` and
+                ``num_joints`` for the 'multi_pose' task).
+
+        Returns:
+            The same ``opt`` object.
+
+        Raises:
+            AssertionError: If ``opt.task`` is not a known task.
+        """
+        fallback_h, fallback_w = dataset.default_resolution
+        opt.mean, opt.std = dataset.mean, dataset.std
+        opt.num_classes = dataset.num_classes
+
+        # A positive input_res replaces the dataset default for both sides.
+        if opt.input_res > 0:
+            fallback_h = opt.input_res
+            fallback_w = opt.input_res
+        # Explicit per-side sizes win over everything else.
+        if not opt.input_h > 0:
+            opt.input_h = fallback_h
+        if not opt.input_w > 0:
+            opt.input_w = fallback_w
+        opt.output_h = opt.input_h // opt.down_ratio
+        opt.output_w = opt.input_w // opt.down_ratio
+        opt.input_res = max(opt.input_h, opt.input_w)
+        opt.output_res = max(opt.output_h, opt.output_w)
+
+        task = opt.task
+        if task == 'exdet':
+            # One heatmap per extreme point (top/left/bottom/right) plus a
+            # per-class centre heatmap.
+            per_point = 1 if opt.agnostic_ex else opt.num_classes
+            opt.heads = {'hm_t': per_point, 'hm_l': per_point,
+                         'hm_b': per_point, 'hm_r': per_point,
+                         'hm_c': opt.num_classes}
+            if opt.reg_offset:
+                opt.heads.update(
+                    {'reg_t': 2, 'reg_l': 2, 'reg_b': 2, 'reg_r': 2})
+        elif task == 'ddd':
+            opt.heads = {'hm': opt.num_classes, 'dep': 1, 'rot': 8, 'dim': 3}
+            if opt.reg_bbox:
+                opt.heads.update({'wh': 2})
+            if opt.reg_offset:
+                opt.heads.update({'reg': 2})
+        elif task == 'ctdet':
+            # Category-specific sizes need two channels per class.
+            wh_channels = 2 * opt.num_classes if opt.cat_spec_wh else 2
+            opt.heads = {'hm': opt.num_classes, 'wh': wh_channels}
+            if opt.reg_offset:
+                opt.heads.update({'reg': 2})
+        elif task == 'multi_pose':
+            # This variant uses fixed 'hm_offset' and 'landmarks' heads.
+            opt.flip_idx = dataset.flip_idx
+            opt.heads = {'hm': opt.num_classes, 'wh': 2,
+                         'hm_offset': 2, 'landmarks': dataset.num_joints * 2}
+        else:
+            assert 0, 'task not defined!'
+        print('heads', opt.heads)
+        return opt
+
+    def init(self, args=''):
+        """Parse options and configure them from built-in dataset defaults.
+
+        The task named in the parsed options selects one entry of a fixed
+        table of dataset descriptions; that entry is wrapped in an
+        attribute-style object and passed to
+        ``update_dataset_info_and_set_heads``.
+
+        Args:
+            args: Forwarded unchanged to ``parse``.
+
+        Returns:
+            The fully initialised option namespace.
+
+        Raises:
+            KeyError: If the parsed task has no entry in the table.
+        """
+        default_dataset_info = {
+            'ctdet': {
+                'default_resolution': [512, 512],
+                'num_classes': 1,
+                'mean': [0.485, 0.456, 0.406],
+                'std': [0.229, 0.224, 0.225],
+                'dataset': 'pig',
+            },
+            # Alternative 'ctdet' entry kept for reference:
+            #   default_resolution [512, 512], num_classes 1,
+            #   mean [0.408, 0.447, 0.470], std [0.289, 0.274, 0.278],
+            #   dataset 'coco'
+            'exdet': {
+                'default_resolution': [512, 512],
+                'num_classes': 80,
+                'mean': [0.408, 0.447, 0.470],
+                'std': [0.289, 0.274, 0.278],
+                'dataset': 'coco',
+            },
+            'multi_pose': {
+                'default_resolution': [512, 512],
+                'num_classes': 1,
+                'mean': [0.408, 0.447, 0.470],
+                'std': [0.289, 0.274, 0.278],
+                'dataset': 'facehp',
+                'num_joints': 5,
+                'flip_idx': [[0, 1], [3, 4]],
+            },
+            'ddd': {
+                'default_resolution': [384, 1280],
+                'num_classes': 3,
+                'mean': [0.485, 0.456, 0.406],
+                'std': [0.229, 0.224, 0.225],
+                'dataset': 'kitti',
+            },
+        }
+
+        class Struct:
+            """Expose the keys of a dict as instance attributes."""
+
+            def __init__(self, entries):
+                for key in entries:
+                    setattr(self, key, entries[key])
+
+        opt = self.parse(args)
+        dataset = Struct(default_dataset_info[opt.task])
+        opt.dataset = dataset.dataset
+        return self.update_dataset_info_and_set_heads(opt, dataset)
--- a/src/lib/trains/base_trainer.py
+++ b/src/lib/trains/base_trainer.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import torch
+from progress.bar import Bar
+from models.data_parallel import DataParallel
+from utils.utils import AverageMeter
+
+
+class ModleWithLoss(torch.nn.Module):
+    """Bundle a network with its criterion so one call yields the loss."""
+
+    def __init__(self, model, loss):
+        """Store the wrapped ``model`` and the ``loss`` callable."""
+        super(ModleWithLoss, self).__init__()
+        self.model, self.loss = model, loss
+
+    def forward(self, batch):
+        """Run the model on ``batch['input']`` and evaluate the loss.
+
+        Returns:
+            A tuple ``(last_output, loss, loss_stats)`` where
+            ``last_output`` is the final element of the model outputs and
+            the other two values come straight from the loss callable.
+        """
+        predictions = self.model(batch['input'])
+        # The criterion receives every model output together with the batch.
+        total, stats = self.loss(predictions, batch)
+        return predictions[-1], total, stats
+
+
+class BaseTrainer(object):
+    """Common training/validation loop shared by the task trainers.
+
+    Subclasses must implement ``_get_losses`` and, when the matching
+    options are enabled, ``debug`` and ``save_result``.
+    """
+
+    def __init__(self, opt, model, optimizer=None):
+        """Wrap ``model`` together with the loss returned by ``_get_losses``.
+
+        Args:
+            opt: Option namespace read throughout the trainer.
+            model: Network to train or evaluate.
+            optimizer: Optimizer stepped during the 'train' phase; may be
+                ``None`` when only validation is run.
+        """
+        self.opt = opt
+        self.optimizer = optimizer
+        self.loss_stats, self.loss = self._get_losses(opt)
+        self.model_with_loss = ModleWithLoss(model, self.loss)
+
+    def set_device(self, gpus, chunk_sizes, device):
+        """Move the model (and any optimizer state) to ``device``.
+
+        With more than one entry in ``gpus`` the model is wrapped in
+        ``DataParallel`` using the given ``chunk_sizes``.
+        """
+        if len(gpus) > 1:
+            self.model_with_loss = DataParallel(
+                self.model_with_loss, device_ids=gpus,
+                chunk_sizes=chunk_sizes).to(device)
+
+        else:
+            self.model_with_loss = self.model_with_loss.to(device)
+
+        # The optimizer is optional (defaults to None in __init__); without
+        # one there is no state to relocate.
+        if self.optimizer is None:
+            return
+        for state in self.optimizer.state.values():
+            for k, v in state.items():
+                if isinstance(v, torch.Tensor):
+                    state[k] = v.to(device=device, non_blocking=True)
+
+    def run_epoch(self, phase, epoch, data_loader):
+        """Run one pass over ``data_loader``.
+
+        Args:
+            phase: ``'train'`` enables training mode and optimizer steps;
+                any other value runs the model in eval mode.
+            epoch: Epoch number, used only in the progress text.
+            data_loader: Iterable of batches (dicts); every entry except
+                ``'meta'`` is moved to ``opt.device``.
+
+        Returns:
+            ``(ret, results)`` where ``ret`` maps each tracked loss name to
+            its running average plus ``'time'`` (elapsed minutes), and
+            ``results`` is the dict filled by ``save_result`` when
+            ``opt.test`` is set.
+        """
+        model_with_loss = self.model_with_loss
+        if phase == 'train':
+            model_with_loss.train()
+        else:
+            # Evaluate on the unwrapped module when DataParallel is in use.
+            if len(self.opt.gpus) > 1:
+                model_with_loss = self.model_with_loss.module
+            model_with_loss.eval()
+            torch.cuda.empty_cache()
+
+        opt = self.opt
+        results = {}
+        data_time, batch_time = AverageMeter(), AverageMeter()
+        avg_loss_stats = {name: AverageMeter() for name in self.loss_stats}
+        # A non-negative opt.num_iters caps the number of batches processed.
+        num_iters = len(data_loader) if opt.num_iters < 0 else opt.num_iters
+        bar = Bar('{}/{}'.format(opt.task, opt.exp_id), max=num_iters)
+        end = time.time()
+        for iter_id, batch in enumerate(data_loader):
+            if iter_id >= num_iters:
+                break
+            data_time.update(time.time() - end)
+
+            for k in batch:
+                if k != 'meta':
+                    batch[k] = batch[k].to(
+                        device=opt.device, non_blocking=True)
+            output, loss, loss_stats = model_with_loss(batch)
+            loss = loss.mean()
+            if phase == 'train':
+                self.optimizer.zero_grad()
+                loss.backward()
+                self.optimizer.step()
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            # NOTE: the suffix is assigned on the Bar class, not on the
+            # ``bar`` instance; kept as in the original.
+            Bar.suffix = '{phase}: [{0}]|Tot: {total:} |ETA: {eta:} '.format(
+                epoch, phase=phase, total=bar.elapsed_td, eta=bar.eta_td)
+
+            for name in avg_loss_stats:
+                avg_loss_stats[name].update(
+                    loss_stats[name].mean().item(), batch['input'].size(0))
+                Bar.suffix = Bar.suffix + \
+                    '|{} {:.4f} '.format(name, avg_loss_stats[name].avg)
+            if not opt.hide_data_time:
+                Bar.suffix = Bar.suffix + '|Data {dt.val:.3f}s({dt.avg:.3f}s) ' \
+                    '|Net {bt.avg:.3f}s'.format(dt=data_time, bt=batch_time)
+            # Either print a line every print_iter batches or advance the bar.
+            if opt.print_iter > 0:
+                if iter_id % opt.print_iter == 0:
+                    print('{}/{}| {}'.format(opt.task, opt.exp_id, Bar.suffix))
+            else:
+                bar.next()
+
+            if opt.debug > 0:
+                self.debug(batch, output, iter_id)
+
+            if opt.test:
+                self.save_result(output, batch, results)
+            # Drop references to this iteration's tensors before the next one.
+            del output, loss, loss_stats
+
+        bar.finish()
+        ret = {k: v.avg for k, v in avg_loss_stats.items()}
+        ret['time'] = bar.elapsed_td.total_seconds() / 60.
+        return ret, results
+
+    def debug(self, batch, output, iter_id):
+        """Visualise one batch; must be provided by subclasses."""
+        raise NotImplementedError
+
+    def save_result(self, output, batch, results):
+        """Store predictions into ``results``; provided by subclasses."""
+        raise NotImplementedError
+
+    def _get_losses(self, opt):
+        """Return ``(loss_names, loss_callable)``; provided by subclasses."""
+        raise NotImplementedError
+
+    def val(self, epoch, data_loader):
+        """Run one validation epoch; see ``run_epoch``."""
+        return self.run_epoch('val', epoch, data_loader)
+
+    def train(self, epoch, data_loader):
+        """Run one training epoch; see ``run_epoch``."""
+        return self.run_epoch('train', epoch, data_loader)
--- a/src/lib/trains/ctdet.py
+++ b/src/lib/trains/ctdet.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import numpy as np
+
+from models.losses import FocalLoss
+from models.losses import RegL1Loss, RegLoss, NormRegL1Loss, RegWeightedL1Loss
+from models.decode import ctdet_decode
+from models.utils import _sigmoid
+from utils.debugger import Debugger
+from utils.post_process import ctdet_post_process
+from utils.oracle_utils import gen_oracle_map
+from .base_trainer import BaseTrainer
+
+
+class CtdetLoss(torch.nn.Module):
+    """Combined heatmap, box-size and offset loss for the 'ctdet' task."""
+
+    def __init__(self, opt):
+        """Select the criteria according to the options.
+
+        ``crit`` scores the heatmap, ``crit_reg`` the local offsets and
+        ``crit_wh`` the box size. ``crit_wh`` falls back to ``crit_reg``
+        when none of ``dense_wh``/``norm_wh``/``cat_spec_wh`` is set.
+        """
+        super(CtdetLoss, self).__init__()
+        self.crit = torch.nn.MSELoss() if opt.mse_loss else FocalLoss()
+        # NOTE(review): any reg_loss other than 'l1'/'sl1' leaves crit_reg
+        # as None, which fails as soon as a regression term is evaluated.
+        self.crit_reg = RegL1Loss() if opt.reg_loss == 'l1' else \
+            RegLoss() if opt.reg_loss == 'sl1' else None
+        self.crit_wh = torch.nn.L1Loss(reduction='sum') if opt.dense_wh else \
+            NormRegL1Loss() if opt.norm_wh else \
+            RegWeightedL1Loss() if opt.cat_spec_wh else self.crit_reg
+        self.opt = opt
+
+    def forward(self, outputs, batch):
+        """Compute the weighted total loss over all stacks.
+
+        Args:
+            outputs: Sequence of per-stack output dicts; ``'hm'`` (and the
+                oracle-replaced entries) are overwritten in place.
+            batch: Dict with the ground-truth tensors.
+
+        Returns:
+            ``(loss, loss_stats)`` where ``loss_stats`` holds ``'loss'``,
+            ``'hm_loss'``, ``'wh_loss'`` and ``'off_loss'``.
+        """
+        opt = self.opt
+        hm_loss, wh_loss, off_loss = 0, 0, 0
+        for s in range(opt.num_stacks):
+            output = outputs[s]
+            if not opt.mse_loss:
+                output['hm'] = _sigmoid(output['hm'])
+
+            # Oracle options replace predictions by ground truth.
+            if opt.eval_oracle_hm:
+                output['hm'] = batch['hm']
+            if opt.eval_oracle_wh:
+                output['wh'] = torch.from_numpy(gen_oracle_map(
+                    batch['wh'].detach().cpu().numpy(),
+                    batch['ind'].detach().cpu().numpy(),
+                    output['wh'].shape[3], output['wh'].shape[2])).to(opt.device)
+            if opt.eval_oracle_offset:
+                output['reg'] = torch.from_numpy(gen_oracle_map(
+                    batch['reg'].detach().cpu().numpy(),
+                    batch['ind'].detach().cpu().numpy(),
+                    output['reg'].shape[3], output['reg'].shape[2])).to(opt.device)
+
+            # Heatmap loss, averaged over the stacks.
+            hm_loss += self.crit(output['hm'], batch['hm']) / \
+                opt.num_stacks
+            if opt.wh_weight > 0:
+                if opt.dense_wh:
+                    # Summed L1 over the masked map, normalised by mask mass.
+                    mask_weight = batch['dense_wh_mask'].sum() + 1e-4
+                    wh_loss += (
+                        self.crit_wh(output['wh'] * batch['dense_wh_mask'],
+                                     batch['dense_wh'] * batch['dense_wh_mask']) /
+                        mask_weight) / opt.num_stacks
+                elif opt.cat_spec_wh:
+                    wh_loss += self.crit_wh(
+                        output['wh'], batch['cat_spec_mask'],
+                        batch['ind'], batch['cat_spec_wh']) / opt.num_stacks
+                else:
+                    # Use the size criterion chosen in __init__ so that
+                    # norm_wh takes effect; without a special mode crit_wh
+                    # is the same object as crit_reg.
+                    wh_loss += self.crit_wh(
+                        output['wh'], batch['reg_mask'],
+                        batch['ind'], batch['wh']) / opt.num_stacks
+
+            if opt.reg_offset and opt.off_weight > 0:
+                off_loss += self.crit_reg(output['reg'], batch['reg_mask'],
+                                          batch['ind'], batch['reg']) / opt.num_stacks
+
+        loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + \
+            opt.off_weight * off_loss
+        loss_stats = {'loss': loss, 'hm_loss': hm_loss,
+                      'wh_loss': wh_loss, 'off_loss': off_loss}
+        return loss, loss_stats
+
+
+class CtdetTrainer(BaseTrainer):
+    """Trainer specialisation for the 'ctdet' task."""
+
+    def __init__(self, opt, model, optimizer=None):
+        super(CtdetTrainer, self).__init__(opt, model, optimizer=optimizer)
+
+    def _get_losses(self, opt):
+        """Return the tracked loss names and a ``CtdetLoss`` instance."""
+        return ['loss', 'hm_loss', 'wh_loss', 'off_loss'], CtdetLoss(opt)
+
+    def debug(self, batch, output, iter_id):
+        """Render predicted and ground-truth heatmaps and boxes.
+
+        Only the first sample of the batch is drawn. With
+        ``opt.debug == 4`` the images are saved to ``opt.debug_dir``
+        (prefixed by ``iter_id``); otherwise they are shown.
+        """
+        opt = self.opt
+        offsets = output['reg'] if opt.reg_offset else None
+        decoded = ctdet_decode(
+            output['hm'], output['wh'], reg=offsets,
+            cat_spec_wh=opt.cat_spec_wh, K=opt.K)
+        num_fields = decoded.shape[2]
+        dets = decoded.detach().cpu().numpy().reshape(1, -1, num_fields)
+        # Scale the first four columns by the down-sampling ratio.
+        dets[:, :, :4] *= opt.down_ratio
+        dets_gt = batch['meta']['gt_det'].numpy().reshape(1, -1, num_fields)
+        dets_gt[:, :, :4] *= opt.down_ratio
+        for sample in range(1):
+            debugger = Debugger(
+                dataset=opt.dataset, ipynb=(opt.debug == 3),
+                theme=opt.debugger_theme)
+            # Undo the input normalisation and convert to an 8-bit image.
+            chw = batch['input'][sample].detach().cpu().numpy()
+            img = chw.transpose(1, 2, 0)
+            img = (img * opt.std + opt.mean) * 255.
+            img = np.clip(img, 0, 255).astype(np.uint8)
+            pred = debugger.gen_colormap(
+                output['hm'][sample].detach().cpu().numpy())
+            gt = debugger.gen_colormap(
+                batch['hm'][sample].detach().cpu().numpy())
+            debugger.add_blend_img(img, pred, 'pred_hm')
+            debugger.add_blend_img(img, gt, 'gt_hm')
+
+            # Column 4 is compared against center_thresh; the last column
+            # is passed as the second argument of add_coco_bbox.
+            debugger.add_img(img, img_id='out_pred')
+            for det in dets[sample]:
+                if det[4] > opt.center_thresh:
+                    debugger.add_coco_bbox(det[:4], det[-1],
+                                           det[4], img_id='out_pred')
+
+            debugger.add_img(img, img_id='out_gt')
+            for det in dets_gt[sample]:
+                if det[4] > opt.center_thresh:
+                    debugger.add_coco_bbox(det[:4], det[-1],
+                                           det[4], img_id='out_gt')
+
+            if opt.debug == 4:
+                debugger.save_all_imgs(
+                    opt.debug_dir, prefix='{}'.format(iter_id))
+            else:
+                debugger.show_all_imgs(pause=True)
+
+    def save_result(self, output, batch, results):
+        """Decode, post-process and store the detections of one batch.
+
+        The result is stored in ``results`` under the first image id found
+        in ``batch['meta']['img_id']``.
+        """
+        opt = self.opt
+        offsets = output['reg'] if opt.reg_offset else None
+        decoded = ctdet_decode(
+            output['hm'], output['wh'], reg=offsets,
+            cat_spec_wh=opt.cat_spec_wh, K=opt.K)
+        dets = decoded.detach().cpu().numpy().reshape(
+            1, -1, decoded.shape[2])
+        meta = batch['meta']
+        heatmap = output['hm']
+        dets_out = ctdet_post_process(
+            dets.copy(), meta['c'].cpu().numpy(),
+            meta['s'].cpu().numpy(),
+            heatmap.shape[2], heatmap.shape[3], heatmap.shape[1])
+        results[meta['img_id'].cpu().numpy()[0]] = dets_out[0]
--- a/src/lib/trains/ddd.py
+++ b/src/lib/trains/ddd.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import numpy as np
+
+from models.losses import FocalLoss, L1Loss, BinRotLoss
+from models.decode import ddd_decode
+from models.utils import _sigmoid
+from utils.debugger import Debugger
+from utils.post_process import ddd_post_process
+from utils.oracle_utils import gen_oracle_map
+from .base_trainer import BaseTrainer
+
+
+class DddLoss(torch.nn.Module):
+    """Combined loss for the 'ddd' task.
+
+    Sums weighted heatmap, depth, dimension, rotation, box-size and offset
+    terms, each averaged over the stacks.
+    """
+
+    def __init__(self, opt):
+        super(DddLoss, self).__init__()
+        if opt.mse_loss:
+            self.crit = torch.nn.MSELoss()
+        else:
+            self.crit = FocalLoss()
+        self.crit_reg = L1Loss()
+        self.crit_rot = BinRotLoss()
+        self.opt = opt
+
+    def forward(self, outputs, batch):
+        """Return ``(loss, loss_stats)`` for the given outputs and batch.
+
+        ``outputs`` is indexed per stack; its ``'hm'`` and ``'dep'``
+        entries are overwritten in place with their transformed values.
+        """
+        opt = self.opt
+        stacks = opt.num_stacks
+
+        hm_loss = dep_loss = rot_loss = dim_loss = 0
+        wh_loss = off_loss = 0
+        for stack in range(stacks):
+            output = outputs[stack]
+            output['hm'] = _sigmoid(output['hm'])
+            # Map the raw depth channel through 1 / sigmoid(x) - 1.
+            output['dep'] = 1. / (output['dep'].sigmoid() + 1e-6) - 1.
+
+            if opt.eval_oracle_dep:
+                # Oracle mode: replace the prediction by ground truth.
+                oracle = gen_oracle_map(
+                    batch['dep'].detach().cpu().numpy(),
+                    batch['ind'].detach().cpu().numpy(),
+                    opt.output_w, opt.output_h)
+                output['dep'] = torch.from_numpy(oracle).to(opt.device)
+
+            hm_loss += self.crit(output['hm'], batch['hm']) / stacks
+            ind = batch['ind']
+            if opt.dep_weight > 0:
+                dep_loss += self.crit_reg(
+                    output['dep'], batch['reg_mask'],
+                    ind, batch['dep']) / stacks
+            if opt.dim_weight > 0:
+                dim_loss += self.crit_reg(
+                    output['dim'], batch['reg_mask'],
+                    ind, batch['dim']) / stacks
+            if opt.rot_weight > 0:
+                rot_loss += self.crit_rot(
+                    output['rot'], batch['rot_mask'],
+                    ind, batch['rotbin'], batch['rotres']) / stacks
+            # Size and offset terms are masked with 'rot_mask' here.
+            if opt.reg_bbox and opt.wh_weight > 0:
+                wh_loss += self.crit_reg(
+                    output['wh'], batch['rot_mask'],
+                    ind, batch['wh']) / stacks
+            if opt.reg_offset and opt.off_weight > 0:
+                off_loss += self.crit_reg(
+                    output['reg'], batch['rot_mask'],
+                    ind, batch['reg']) / stacks
+
+        loss = opt.hm_weight * hm_loss + opt.dep_weight * dep_loss + \
+            opt.dim_weight * dim_loss + opt.rot_weight * rot_loss + \
+            opt.wh_weight * wh_loss + opt.off_weight * off_loss
+
+        loss_stats = {
+            'loss': loss,
+            'hm_loss': hm_loss,
+            'dep_loss': dep_loss,
+            'dim_loss': dim_loss,
+            'rot_loss': rot_loss,
+            'wh_loss': wh_loss,
+            'off_loss': off_loss,
+        }
+        return loss, loss_stats
+
+
+class DddTrainer(BaseTrainer):
+    """Trainer specialisation for the 'ddd' task."""
+
+    def __init__(self, opt, model, optimizer=None):
+        super(DddTrainer, self).__init__(opt, model, optimizer=optimizer)
+
+    def _get_losses(self, opt):
+        """Return the tracked loss names and a ``DddLoss`` instance."""
+        loss_states = ['loss', 'hm_loss', 'dep_loss', 'dim_loss', 'rot_loss',
+                       'wh_loss', 'off_loss']
+        loss = DddLoss(opt)
+        return loss_states, loss
+
+    def debug(self, batch, output, iter_id):
+        """Visualise predictions and ground truth for the first sample.
+
+        With ``opt.debug == 4`` the images are saved to ``opt.debug_dir``
+        (prefixed by ``iter_id``); otherwise they are shown.
+        """
+        opt = self.opt
+        wh = output['wh'] if opt.reg_bbox else None
+        reg = output['reg'] if opt.reg_offset else None
+        dets = ddd_decode(output['hm'], output['rot'], output['dep'],
+                          output['dim'], wh=wh, reg=reg, K=opt.K)
+
+        # x, y, score, r1-r8, depth, dim1-dim3, cls
+        dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+        calib = batch['meta']['calib'].detach().numpy()
+        # x, y, score, rot, depth, dim1, dim2, dim3
+        # if opt.dataset == 'gta':
+        #   dets[:, 12:15] /= 3
+        dets_pred = ddd_post_process(
+            dets.copy(), batch['meta']['c'].detach().numpy(),
+            batch['meta']['s'].detach().numpy(), calib, opt)
+        dets_gt = ddd_post_process(
+            batch['meta']['gt_det'].detach().numpy().copy(),
+            batch['meta']['c'].detach().numpy(),
+            batch['meta']['s'].detach().numpy(), calib, opt)
+        # for i in range(input.size(0)):
+        for i in range(1):
+            debugger = Debugger(dataset=opt.dataset, ipynb=(opt.debug == 3),
+                                theme=opt.debugger_theme)
+            img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
+            # Clip before the uint8 cast so out-of-range values saturate
+            # instead of wrapping around (same handling as the ctdet trainer).
+            img = np.clip(
+                (img * self.opt.std + self.opt.mean) * 255., 0, 255).astype(np.uint8)
+            pred = debugger.gen_colormap(
+                output['hm'][i].detach().cpu().numpy())
+            gt = debugger.gen_colormap(batch['hm'][i].detach().cpu().numpy())
+            debugger.add_blend_img(img, pred, 'hm_pred')
+            debugger.add_blend_img(img, gt, 'hm_gt')
+            # decode
+            debugger.add_ct_detection(
+                img, dets[i], show_box=opt.reg_bbox, center_thresh=opt.center_thresh,
+                img_id='det_pred')
+            debugger.add_ct_detection(
+                img, batch['meta']['gt_det'][i].cpu().numpy().copy(),
+                show_box=opt.reg_bbox, img_id='det_gt')
+            debugger.add_3d_detection(
+                batch['meta']['image_path'][i], dets_pred[i], calib[i],
+                center_thresh=opt.center_thresh, img_id='add_pred')
+            debugger.add_3d_detection(
+                batch['meta']['image_path'][i], dets_gt[i], calib[i],
+                center_thresh=opt.center_thresh, img_id='add_gt')
+            # debugger.add_bird_view(
+            #   dets_pred[i], center_thresh=opt.center_thresh, img_id='bird_pred')
+            # debugger.add_bird_view(dets_gt[i], img_id='bird_gt')
+            debugger.add_bird_views(
+                dets_pred[i], dets_gt[i],
+                center_thresh=opt.center_thresh, img_id='bird_pred_gt')
+
+            # debugger.add_blend_img(img, pred, 'out', white=True)
+            debugger.compose_vis_add(
+                batch['meta']['image_path'][i], dets_pred[i], calib[i],
+                opt.center_thresh, pred, 'bird_pred_gt', img_id='out')
+            # debugger.add_img(img, img_id='out')
+            if opt.debug == 4:
+                debugger.save_all_imgs(
+                    opt.debug_dir, prefix='{}'.format(iter_id))
+            else:
+                debugger.show_all_imgs(pause=True)
+
+    def save_result(self, output, batch, results):
+        """Decode, post-process and store the detections of one batch.
+
+        The post-processed detections of the first image are stored in
+        ``results`` under its image id; for every class index from 1 to
+        ``opt.num_classes`` only rows whose last column exceeds
+        ``opt.center_thresh`` are kept.
+        """
+        opt = self.opt
+        wh = output['wh'] if opt.reg_bbox else None
+        reg = output['reg'] if opt.reg_offset else None
+        dets = ddd_decode(output['hm'], output['rot'], output['dep'],
+                          output['dim'], wh=wh, reg=reg, K=opt.K)
+
+        # x, y, score, r1-r8, depth, dim1-dim3, cls
+        dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+        calib = batch['meta']['calib'].detach().numpy()
+        # x, y, score, rot, depth, dim1, dim2, dim3
+        dets_pred = ddd_post_process(
+            dets.copy(), batch['meta']['c'].detach().numpy(),
+            batch['meta']['s'].detach().numpy(), calib, opt)
+        img_id = batch['meta']['img_id'].detach().numpy()[0]
+        results[img_id] = dets_pred[0]
+        for j in range(1, opt.num_classes + 1):
+            keep_inds = (results[img_id][j][:, -1] > opt.center_thresh)
+            results[img_id][j] = results[img_id][j][keep_inds]
--- a/src/lib/trains/exdet.py
+++ b/src/lib/trains/exdet.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import numpy as np
+import cv2
+import sys
+import time
+from utils.debugger import Debugger
+from models.data_parallel import DataParallel
+from models.losses import FocalLoss, RegL1Loss
+from models.decode import agnex_ct_decode, exct_decode
+from models.utils import _sigmoid
+from .base_trainer import BaseTrainer
+
+
+class ExdetLoss(torch.nn.Module):
+    """Heatmap plus offset-regression loss for the extreme-point detector."""
+
+    def __init__(self, opt):
+        super(ExdetLoss, self).__init__()
+        if opt.mse_loss:
+            self.crit = torch.nn.MSELoss()
+        else:
+            self.crit = FocalLoss()
+        self.crit_reg = RegL1Loss()
+        self.opt = opt
+        # Heatmap suffixes: 't', 'l', 'b', 'r' and 'c'.
+        self.parts = ['t', 'l', 'b', 'r', 'c']
+
+    def forward(self, outputs, batch):
+        """Return ``(loss, loss_stats)`` averaged over ``opt.num_stacks``.
+
+        Note that ``outputs[s]['hm_*']`` is replaced in place by its
+        ``_sigmoid`` value.
+        """
+        opt = self.opt
+        hm_loss = 0
+        reg_loss = 0
+        for stack in range(opt.num_stacks):
+            output = outputs[stack]
+            for part in self.parts:
+                hm_key = 'hm_{}'.format(part)
+                output[hm_key] = _sigmoid(output[hm_key])
+                hm_loss += self.crit(output[hm_key],
+                                     batch[hm_key]) / opt.num_stacks
+                # The 'c' part has no offset head.
+                if part == 'c' or not opt.reg_offset \
+                        or not opt.off_weight > 0:
+                    continue
+                reg_key = 'reg_{}'.format(part)
+                reg_loss += self.crit_reg(
+                    output[reg_key], batch['reg_mask'],
+                    batch['ind_{}'.format(part)],
+                    batch[reg_key]) / opt.num_stacks
+        loss = opt.hm_weight * hm_loss + opt.off_weight * reg_loss
+        loss_stats = {'loss': loss, 'off_loss': reg_loss, 'hm_loss': hm_loss}
+        return loss, loss_stats
+
+
+class ExdetTrainer(BaseTrainer):
+    """Trainer for the extreme-point detection task ('exdet')."""
+
+    def __init__(self, opt, model, optimizer=None):
+        super(ExdetTrainer, self).__init__(opt, model, optimizer=optimizer)
+        self.decode = agnex_ct_decode if opt.agnostic_ex else exct_decode
+        # debug() iterates over self.parts, which this class never assigned
+        # (only ExdetLoss did); use the same heatmap suffixes here.
+        self.parts = ['t', 'l', 'b', 'r', 'c']
+
+    def _get_losses(self, opt):
+        """Return the tracked loss names and the loss module."""
+        loss_states = ['loss', 'hm_loss', 'off_loss']
+        loss = ExdetLoss(opt)
+        return loss_states, loss
+
+    def debug(self, batch, output, iter_id):
+        """Visualise predicted vs. ground-truth heatmaps and decoded boxes.
+
+        Only the first sample of the batch is rendered. Images are saved to
+        ``opt.debug_dir`` when ``opt.debug == 4`` and shown otherwise.
+        """
+        opt = self.opt
+        detections = self.decode(output['hm_t'], output['hm_l'],
+                                 output['hm_b'], output['hm_r'],
+                                 output['hm_c']).detach().cpu().numpy()
+        # Scale box coordinates from output resolution to input resolution.
+        detections[:, :, :4] *= opt.input_res / opt.output_res
+        for i in range(1):
+            debugger = Debugger(
+                dataset=opt.dataset, ipynb=(opt.debug == 3),
+                theme=opt.debugger_theme)
+            pred_hm = np.zeros(
+                (opt.input_res, opt.input_res, 3), dtype=np.uint8)
+            gt_hm = np.zeros((opt.input_res, opt.input_res, 3), dtype=np.uint8)
+            img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
+            # Clip before the uint8 cast so out-of-range values saturate
+            # instead of wrapping around (as MultiPoseTrainer.debug does).
+            img = np.clip(
+                (img * opt.std + opt.mean) * 255., 0, 255).astype(np.uint8)
+            for p in self.parts:
+                tag = 'hm_{}'.format(p)
+                pred = debugger.gen_colormap(
+                    output[tag][i].detach().cpu().numpy())
+                gt = debugger.gen_colormap(
+                    batch[tag][i].detach().cpu().numpy())
+                if p != 'c':
+                    # Merge every part except 'c' into one overview map.
+                    pred_hm = np.maximum(pred_hm, pred)
+                    gt_hm = np.maximum(gt_hm, gt)
+                if p == 'c' or opt.debug > 2:
+                    debugger.add_blend_img(img, pred, 'pred_{}'.format(p))
+                    debugger.add_blend_img(img, gt, 'gt_{}'.format(p))
+            debugger.add_blend_img(img, pred_hm, 'pred')
+            debugger.add_blend_img(img, gt_hm, 'gt')
+            debugger.add_img(img, img_id='out')
+            for k in range(len(detections[i])):
+                # Column 4 is used as the score, the last column as the class.
+                if detections[i, k, 4] > 0.1:
+                    debugger.add_coco_bbox(detections[i, k, :4],
+                                           detections[i, k, -1],
+                                           detections[i, k, 4], img_id='out')
+            if opt.debug == 4:
+                debugger.save_all_imgs(
+                    opt.debug_dir, prefix='{}'.format(iter_id))
+            else:
+                debugger.show_all_imgs(pause=True)
--- a/src/lib/trains/multi_pose.py
+++ b/src/lib/trains/multi_pose.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import numpy as np
+
+from models.losses import FocalLoss, RegL1Loss, RegLoss, RegWeightedL1Loss
+from models.decode import multi_pose_decode
+from models.utils import _sigmoid, flip_tensor, flip_lr_off, flip_lr
+from utils.debugger import Debugger
+from utils.post_process import multi_pose_post_process
+from utils.oracle_utils import gen_oracle_map
+from .base_trainer import BaseTrainer
+
+
+class MultiPoseLoss(torch.nn.Module):
+    """Weighted sum of centre-heatmap, box-size, centre-offset and landmark losses."""
+
+    def __init__(self, opt):
+        super(MultiPoseLoss, self).__init__()
+        self.crit = FocalLoss()
+        # Kept for the keypoint-heatmap term, which forward() does not use.
+        self.crit_hm_hp = torch.nn.MSELoss() if opt.mse_loss else FocalLoss()
+        self.crit_kp = RegWeightedL1Loss() if not opt.dense_hp else \
+            torch.nn.L1Loss(reduction='sum')
+        # None when opt.reg_loss is neither 'l1' nor 'sl1'; forward() then
+        # fails as soon as a size/offset term is requested.
+        self.crit_reg = RegL1Loss() if opt.reg_loss == 'l1' else \
+            RegLoss() if opt.reg_loss == 'sl1' else None
+        self.opt = opt
+
+    def forward(self, outputs, batch):
+        """Return ``(loss, loss_stats)`` averaged over ``opt.num_stacks``.
+
+        ``outputs`` is indexed by stack; each entry is a dict of head outputs.
+        With the ``opt.eval_oracle_*`` flags set, the corresponding outputs
+        are overwritten in place by ground truth from ``batch``.
+        """
+        opt = self.opt
+        hm_loss, wh_loss, off_loss, lm_loss = 0, 0, 0, 0
+        for s in range(opt.num_stacks):
+            output = outputs[s]
+
+            if opt.eval_oracle_hmhp:
+                output['hm_hp'] = batch['hm_hp']
+            if opt.eval_oracle_hm:
+                output['hm'] = batch['hm']
+            if opt.eval_oracle_kps:
+                if opt.dense_hp:
+                    output['hps'] = batch['dense_hps']
+                else:
+                    output['hps'] = torch.from_numpy(gen_oracle_map(
+                        batch['hps'].detach().cpu().numpy(),
+                        batch['ind'].detach().cpu().numpy(),
+                        opt.output_res, opt.output_res)).to(opt.device)
+            if opt.eval_oracle_hp_offset:
+                output['hp_offset'] = torch.from_numpy(gen_oracle_map(
+                    batch['hp_offset'].detach().cpu().numpy(),
+                    batch['hp_ind'].detach().cpu().numpy(),
+                    opt.output_res, opt.output_res)).to(opt.device)
+
+            # 1. Focal loss on the object-centre heatmap.
+            hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks
+            # 2. Loss on the face bbox height and width.
+            if opt.wh_weight > 0:
+                wh_loss += self.crit_reg(
+                    output['wh'], batch['reg_mask'],
+                    batch['ind'], batch['wh'],
+                    batch['wight_mask']) / opt.num_stacks
+            # 3. Offset compensating for the down-sampling of the face bbox
+            #    centre point.
+            if opt.reg_offset and opt.off_weight > 0:
+                off_loss += self.crit_reg(
+                    output['hm_offset'], batch['reg_mask'],
+                    batch['ind'], batch['hm_offset'],
+                    batch['wight_mask']) / opt.num_stacks
+
+            # 4. Keypoint (landmark) offsets.
+            if opt.dense_hp:
+                mask_weight = batch['dense_hps_mask'].sum() + 1e-4
+                lm_loss += (self.crit_kp(
+                    output['hps'] * batch['dense_hps_mask'],
+                    batch['dense_hps'] * batch['dense_hps_mask']) /
+                            mask_weight) / opt.num_stacks
+            else:
+                lm_loss += self.crit_kp(
+                    output['landmarks'], batch['hps_mask'],
+                    batch['ind'], batch['landmarks']) / opt.num_stacks
+
+            # NOTE: the keypoint centre-offset term ('hp_offset') and the
+            # keypoint heatmap term ('hm_hp') are disabled in this variant,
+            # so no accumulators are kept for them.
+
+        loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + \
+            opt.off_weight * off_loss + opt.lm_weight * lm_loss
+
+        loss_stats = {'loss': loss, 'hm_loss': hm_loss, 'lm_loss': lm_loss,
+                      'wh_loss': wh_loss, 'off_loss': off_loss}
+        return loss, loss_stats
+
+
+class MultiPoseTrainer(BaseTrainer):
+    """Trainer for the 'multi_pose' task (box + keypoint detection)."""
+
+    def __init__(self, opt, model, optimizer=None):
+        super(MultiPoseTrainer, self).__init__(opt, model, optimizer=optimizer)
+
+    def _get_losses(self, opt):
+        """Return the tracked loss names and the loss module."""
+        loss_states = ['loss', 'hm_loss', 'lm_loss', 'wh_loss', 'off_loss']
+        loss = MultiPoseLoss(opt)
+        return loss_states, loss
+
+    def debug(self, batch, output, iter_id):
+        """Render predicted and ground-truth detections for the first sample.
+
+        Images are saved to ``opt.debug_dir`` when ``opt.debug == 4`` and
+        shown interactively otherwise.
+        """
+        opt = self.opt
+        # NOTE(review): MultiPoseLoss reads output['hm_offset'] and
+        # output['landmarks'], while this method reads output['reg'] and
+        # output['hps'] -- confirm the model actually emits these keys.
+        reg = output['reg'] if opt.reg_offset else None
+        hm_hp = output['hm_hp'] if opt.hm_hp else None
+        hp_offset = output['hp_offset'] if opt.reg_hp_offset else None
+        dets = multi_pose_decode(
+            output['hm'], output['wh'], output['hps'],
+            reg=reg, hm_hp=hm_hp, hp_offset=hp_offset, K=opt.K)
+        dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+
+        # Scale coordinates from output resolution to input resolution.
+        # NOTE(review): the 5:39 slice covers 34 columns (presumably 17
+        # keypoints x 2) -- confirm it matches the dataset's joint count.
+        dets[:, :, :4] *= opt.input_res / opt.output_res
+        dets[:, :, 5:39] *= opt.input_res / opt.output_res
+        dets_gt = batch['meta']['gt_det'].numpy().reshape(1, -1, dets.shape[2])
+        dets_gt[:, :, :4] *= opt.input_res / opt.output_res
+        dets_gt[:, :, 5:39] *= opt.input_res / opt.output_res
+        for i in range(1):
+            debugger = Debugger(
+                dataset=opt.dataset, ipynb=(opt.debug == 3), theme=opt.debugger_theme)
+            img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
+            # Undo the input normalisation and clip to the uint8 range.
+            img = np.clip(((
+                img * opt.std + opt.mean) * 255.), 0, 255).astype(np.uint8)
+            pred = debugger.gen_colormap(
+                output['hm'][i].detach().cpu().numpy())
+            gt = debugger.gen_colormap(batch['hm'][i].detach().cpu().numpy())
+            debugger.add_blend_img(img, pred, 'pred_hm')
+            debugger.add_blend_img(img, gt, 'gt_hm')
+
+            # Predictions: column 4 is used as the score, the last column as
+            # the class id.
+            debugger.add_img(img, img_id='out_pred')
+            for k in range(len(dets[i])):
+                if dets[i, k, 4] > opt.center_thresh:
+                    debugger.add_coco_bbox(dets[i, k, :4], dets[i, k, -1],
+                                           dets[i, k, 4], img_id='out_pred')
+                    debugger.add_coco_hp(dets[i, k, 5:39], img_id='out_pred')
+
+            # Ground truth, drawn the same way on a separate image.
+            debugger.add_img(img, img_id='out_gt')
+            for k in range(len(dets_gt[i])):
+                if dets_gt[i, k, 4] > opt.center_thresh:
+                    debugger.add_coco_bbox(dets_gt[i, k, :4], dets_gt[i, k, -1],
+                                           dets_gt[i, k, 4], img_id='out_gt')
+                    debugger.add_coco_hp(dets_gt[i, k, 5:39], img_id='out_gt')
+
+            if opt.hm_hp:
+                pred = debugger.gen_colormap_hp(
+                    output['hm_hp'][i].detach().cpu().numpy())
+                gt = debugger.gen_colormap_hp(
+                    batch['hm_hp'][i].detach().cpu().numpy())
+                debugger.add_blend_img(img, pred, 'pred_hmhp')
+                debugger.add_blend_img(img, gt, 'gt_hmhp')
+
+            if opt.debug == 4:
+                debugger.save_all_imgs(
+                    opt.debug_dir, prefix='{}'.format(iter_id))
+            else:
+                debugger.show_all_imgs(pause=True)
+
+    def save_result(self, output, batch, results):
+        """Decode ``output`` and store the first image's detections.
+
+        The post-processed detections are written to ``results`` under the
+        first entry of ``batch['meta']['img_id']``.
+        """
+        # NOTE(review): same 'reg'/'hps' key question as in debug().
+        reg = output['reg'] if self.opt.reg_offset else None
+        hm_hp = output['hm_hp'] if self.opt.hm_hp else None
+        hp_offset = output['hp_offset'] if self.opt.reg_hp_offset else None
+        dets = multi_pose_decode(
+            output['hm'], output['wh'], output['hps'],
+            reg=reg, hm_hp=hm_hp, hp_offset=hp_offset, K=self.opt.K)
+        dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+
+        dets_out = multi_pose_post_process(
+            dets.copy(), batch['meta']['c'].cpu().numpy(),
+            batch['meta']['s'].cpu().numpy(),
+            output['hm'].shape[2], output['hm'].shape[3])
+        results[batch['meta']['img_id'].cpu().numpy()[0]] = dets_out[0]
--- a/src/lib/trains/train_factory.py
+++ b/src/lib/trains/train_factory.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from .ctdet import CtdetTrainer
+from .ddd import DddTrainer
+from .exdet import ExdetTrainer
+from .multi_pose import MultiPoseTrainer
+
+# Maps a task name to the trainer class that handles it.
+train_factory = dict(
+    exdet=ExdetTrainer,
+    ddd=DddTrainer,
+    ctdet=CtdetTrainer,
+    multi_pose=MultiPoseTrainer,
+)
--- a/src/lib/utils/Randaugmentations.py
+++ b/src/lib/utils/Randaugmentations.py
+# The code in this file is adapted from rpmcruz/autoaugment
+# https://github.com/rpmcruz/autoaugment/blob/master/transformations.py
+import random
+
+import PIL, PIL.ImageOps, PIL.ImageEnhance, PIL.ImageDraw
+import numpy as np
+import torch
+from PIL import Image
+
+
+def ShearX(img, v):  # [-0.3, 0.3]
+    """Shear ``img`` along x by ``v``; the sign of ``v`` is flipped at random."""
+    assert -0.3 <= v <= 0.3
+    shear = -v if random.random() > 0.5 else v
+    coeffs = (1, shear, 0, 0, 1, 0)
+    return img.transform(img.size, PIL.Image.AFFINE, coeffs)
+
+
+def ShearY(img, v):  # [-0.3, 0.3]
+    """Shear ``img`` along y by ``v``; the sign of ``v`` is flipped at random."""
+    assert -0.3 <= v <= 0.3
+    shear = -v if random.random() > 0.5 else v
+    coeffs = (1, 0, 0, shear, 1, 0)
+    return img.transform(img.size, PIL.Image.AFFINE, coeffs)
+
+
+def TranslateX(img, v):  # fraction of the image width, [-0.45, 0.45]
+    """Translate ``img`` along x by ``v`` times its width, sign chosen at random."""
+    assert -0.45 <= v <= 0.45
+    signed = -v if random.random() > 0.5 else v
+    offset = signed * img.size[0]
+    return img.transform(img.size, PIL.Image.AFFINE, (1, 0, offset, 0, 1, 0))
+
+
+def TranslateXabs(img, v):  # non-negative offset, used as-is (not a fraction)
+    """Translate ``img`` along x by ``v``, sign chosen at random."""
+    assert 0 <= v
+    offset = -v if random.random() > 0.5 else v
+    return img.transform(img.size, PIL.Image.AFFINE, (1, 0, offset, 0, 1, 0))
+
+
+def TranslateY(img, v):  # fraction of the image height, [-0.45, 0.45]
+    """Translate ``img`` along y by ``v`` times its height, sign chosen at random."""
+    assert -0.45 <= v <= 0.45
+    signed = -v if random.random() > 0.5 else v
+    offset = signed * img.size[1]
+    return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, 0, 1, offset))
+
+
+def TranslateYabs(img, v):  # non-negative offset, used as-is (not a fraction)
+    """Translate ``img`` along y by ``v``, sign chosen at random."""
+    assert 0 <= v
+    offset = -v if random.random() > 0.5 else v
+    return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, 0, 1, offset))
+
+
+def Rotate(img, v):  # [-30, 30]
+    """Rotate ``img`` by ``v`` (passed to ``img.rotate``), sign chosen at random."""
+    assert -30 <= v <= 30
+    angle = -v if random.random() > 0.5 else v
+    return img.rotate(angle)
+
+
+def AutoContrast(img, _):
+    """Apply ``PIL.ImageOps.autocontrast``; the magnitude argument is ignored."""
+    result = PIL.ImageOps.autocontrast(img)
+    return result
+
+
+def Invert(img, _):
+    """Apply ``PIL.ImageOps.invert``; the magnitude argument is ignored."""
+    result = PIL.ImageOps.invert(img)
+    return result
+
+
+def Equalize(img, _):
+    """Apply ``PIL.ImageOps.equalize``; the magnitude argument is ignored."""
+    result = PIL.ImageOps.equalize(img)
+    return result
+
+
+def Flip(img, _):  # not from the paper
+    """Apply ``PIL.ImageOps.mirror``; the magnitude argument is ignored."""
+    result = PIL.ImageOps.mirror(img)
+    return result
+
+
+def Solarize(img, v):  # [0, 256]
+    """Apply ``PIL.ImageOps.solarize`` to ``img`` with threshold ``v``."""
+    assert 0 <= v <= 256
+    threshold = v
+    return PIL.ImageOps.solarize(img, threshold)
+
+
+def SolarizeAdd(img, addition=0, threshold=128):
+    """Add ``addition`` to every pixel, clip to [0, 255], then solarize.
+
+    img: PIL image.
+    addition: value added to every pixel before clipping.
+    threshold: threshold passed to ``PIL.ImageOps.solarize``.
+    Returns a new PIL image.
+    """
+    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
+    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
+    img_np = np.array(img).astype(int)
+    img_np = img_np + addition
+    img_np = np.clip(img_np, 0, 255)
+    img_np = img_np.astype(np.uint8)
+    img = Image.fromarray(img_np)
+    return PIL.ImageOps.solarize(img, threshold)
+
+
+def Posterize(img, v):  # [4, 8]
+    """Apply ``PIL.ImageOps.posterize`` with ``int(v)`` bits, at least 1."""
+    bits = max(1, int(v))
+    return PIL.ImageOps.posterize(img, bits)
+
+
+def Contrast(img, v):  # [0.1,1.9]
+    """Enhance the contrast of ``img`` by factor ``v``."""
+    assert 0.1 <= v <= 1.9
+    enhancer = PIL.ImageEnhance.Contrast(img)
+    return enhancer.enhance(v)
+
+
+def Color(img, v):  # [0.1,1.9]
+    """Enhance the colour of ``img`` by factor ``v``."""
+    assert 0.1 <= v <= 1.9
+    enhancer = PIL.ImageEnhance.Color(img)
+    return enhancer.enhance(v)
+
+
+def Brightness(img, v):  # [0.1,1.9]
+    """Enhance the brightness of ``img`` by factor ``v``."""
+    assert 0.1 <= v <= 1.9
+    enhancer = PIL.ImageEnhance.Brightness(img)
+    return enhancer.enhance(v)
+
+
+def Sharpness(img, v):  # [0.1,1.9]
+    """Enhance the sharpness of ``img`` by factor ``v``."""
+    assert 0.1 <= v <= 1.9
+    enhancer = PIL.ImageEnhance.Sharpness(img)
+    return enhancer.enhance(v)
+
+
+def Cutout(img, v):  # fraction of the image width, [0, 0.2]
+    """Cut out a patch whose side is ``v`` times the image width.
+
+    Returns ``img`` itself when ``v`` is not positive.
+    """
+    assert 0.0 <= v <= 0.2
+    if v > 0.:
+        return CutoutAbs(img, v * img.size[0])
+    return img
+
+
+def CutoutAbs(img, v):  # v is the side of the patch, not a fraction
+    """Return a copy of ``img`` with a filled rectangle drawn at a random spot.
+
+    img: PIL image; it is copied, not modified.
+    v: side length of the patch. A negative ``v`` returns ``img`` unchanged.
+    """
+    # assert 0 <= v <= 20
+    if v < 0:
+        return img
+    w, h = img.size
+    # The first positional argument of np.random.uniform is ``low``, so the
+    # previous ``uniform(w)`` requested the inverted interval [w, 1.0), which
+    # NumPy documents as undefined. Sample the centre from [0, size) instead.
+    cx = np.random.uniform(0, w)
+    cy = np.random.uniform(0, h)
+
+    x0 = int(max(0, cx - v / 2.))
+    y0 = int(max(0, cy - v / 2.))
+    x1 = min(w, x0 + v)
+    y1 = min(h, y0 + v)
+
+    xy = (x0, y0, x1, y1)
+    color = (125, 123, 114)  # fixed fill colour
+    img = img.copy()
+    PIL.ImageDraw.Draw(img).rectangle(xy, color)
+    return img
+
+
+def SamplePairing(imgs):  # [0, 0.4]
+    """Return an op that blends its input with a random image from ``imgs``."""
+    def blend_with_random(img1, v):
+        idx = np.random.choice(len(imgs))
+        partner = PIL.Image.fromarray(imgs[idx])
+        return PIL.Image.blend(img1, partner, v)
+
+    return blend_with_random
+
+
+def Identity(img, v):
+    """Return ``img`` unchanged; ``v`` is ignored."""
+    return img
+
+
+def augment_list():  # candidate operations and their magnitude ranges
+    """Return the ``(op, min_value, max_value)`` triples RandAugment samples from.
+
+    The active set follows
+    https://github.com/tensorflow/tpu/blob/8462d083dd89489a79e3200bcc8d4063bf362186/models/official/efficientnet/autoaugment.py#L505
+    An earlier list based on
+    https://github.com/google-research/uda/blob/master/image/randaugment/policies.py#L57
+    is no longer used.
+    """
+    return [
+        (Identity, 0., 1.0),
+        (AutoContrast, 0, 1),
+        (Equalize, 0, 1),
+        (Invert, 0, 1),
+        # (Rotate, 0, 30),
+        (Posterize, 0, 4),
+        (Solarize, 0, 256),
+        (SolarizeAdd, 0, 110),
+        (Color, 0.1, 1.9),
+        (Contrast, 0.1, 1.9),
+        (Brightness, 0.1, 1.9),
+        (Sharpness, 0.1, 1.9),
+        # (ShearX, 0., 0.3),            # shear along x
+        # (ShearY, 0., 0.3),            # shear along y
+        (CutoutAbs, 0, 40),
+        # (TranslateXabs, 0., 100),     # shift along x
+        # (TranslateYabs, 0., 100),
+    ]
+
+
+class Lighting(object):
+    """Lighting noise (AlexNet-style, PCA-based)."""
+
+    def __init__(self, alphastd, eigval, eigvec):
+        self.alphastd = alphastd
+        self.eigval = torch.Tensor(eigval)
+        self.eigvec = torch.Tensor(eigvec)
+
+    def __call__(self, img):
+        """Return ``img`` plus a random per-channel offset.
+
+        ``img`` itself is returned when ``alphastd`` is 0.
+        """
+        if self.alphastd == 0:
+            return img
+
+        alpha = img.new().resize_(3).normal_(0, self.alphastd)
+        alpha_rows = alpha.view(1, 3).expand(3, 3)
+        eigval_rows = self.eigval.view(1, 3).expand(3, 3)
+        weighted = self.eigvec.type_as(img).clone() \
+            .mul(alpha_rows).mul(eigval_rows)
+        rgb = weighted.sum(1).squeeze()
+
+        return img.add(rgb.view(3, 1, 1).expand_as(img))
+
+
+def lighting_(data_rng, image, alphastd, eigval, eigvec):
+    """Add PCA-based lighting noise to ``image`` in place (returns None)."""
+    weights = data_rng.normal(scale=alphastd, size=(3, ))
+    shift = np.dot(eigvec, eigval * weights)
+    image += shift
+
+
+class CutoutDefault(object):
+    """
+    Zero out a randomly placed square of side ``length`` in a tensor image.
+
+    Reference : https://github.com/quark0/darts/blob/master/cnn/utils.py
+    """
+    def __init__(self, length):
+        self.length = length
+
+    def __call__(self, img):
+        """Mask ``img`` in place and return it; dims 1 and 2 are the grid."""
+        height, width = img.size(1), img.size(2)
+        keep = np.ones((height, width), np.float32)
+        cy = np.random.randint(height)
+        cx = np.random.randint(width)
+        half = self.length // 2
+
+        top = np.clip(cy - half, 0, height)
+        bottom = np.clip(cy + half, 0, height)
+        left = np.clip(cx - half, 0, width)
+        right = np.clip(cx + half, 0, width)
+
+        keep[top: bottom, left: right] = 0.
+        img *= torch.from_numpy(keep).expand_as(img)
+        return img
+
+
+class RandAugment:
+    """Apply ``n`` randomly chosen augmentation ops to an image.
+
+    n: how many ops are drawn (with replacement) per call
+    m: upper bound of the random magnitude level, on a 0..30 scale
+    """
+    def __init__(self, n, m=30):
+        self.n = n
+        self.m = m      # [0, 30]
+        self.augment_list = augment_list()
+
+    def __call__(self, img):
+        chosen = random.choices(self.augment_list, k=self.n)
+        for op, minval, maxval in chosen:
+            level = random.randint(0, self.m)
+            # Map the level linearly onto [minval, maxval] (30 == maxval).
+            magnitude = (float(level) / 30) * float(maxval - minval) + minval
+            img = op(img, magnitude)
+
+        return img
+
+
+def Randaugment(data_rng, inp, eig_val, eig_vec):
+    """Run RandAugment(5, 30) on array ``inp`` and add lighting noise.
+
+    Returns a float32 array scaled by 1/255.
+    """
+    pil_img = Image.fromarray(inp)
+    policy = RandAugment(5, 30)
+    augmented = policy(pil_img)
+    scaled = np.asarray(augmented).astype(np.float32) / 255.
+    lighting_(data_rng, scaled, 0.1, eig_val, eig_vec)
+    return scaled
+
+
+if __name__ == '__main__':
+    # Manual smoke test: apply a single op to one image and display it.
+    source = Image.open("/path/0_Parade_marchingband_1_35.jpg")
+    # op = RandAugment(5, 20)
+    op = TranslateXabs
+    shifted = op(source, 100)
+    shifted.show()
--- a/src/lib/utils/__init__.py
+++ b/src/lib/utils/__init__.py
--- a/src/lib/utils/ddd_utils.py
+++ b/src/lib/utils/ddd_utils.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import cv2
+
+def compute_box_3d(dim, location, rotation_y):
+  """Return the 8 x 3 corners of a box rotated about the y axis.
+
+  dim: 3 values, read as (h, w, l)
+  location: 3 values added to every corner
+  rotation_y: rotation angle about the y axis
+  """
+  cos_r, sin_r = np.cos(rotation_y), np.sin(rotation_y)
+  rot = np.array([[cos_r, 0, sin_r], [0, 1, 0], [-sin_r, 0, cos_r]],
+                 dtype=np.float32)
+  length, width, height = dim[2], dim[1], dim[0]
+  half_l, half_w = length / 2, width / 2
+  xs = [half_l, half_l, -half_l, -half_l, half_l, half_l, -half_l, -half_l]
+  ys = [0, 0, 0, 0, -height, -height, -height, -height]
+  zs = [half_w, -half_w, -half_w, half_w, half_w, -half_w, -half_w, half_w]
+
+  local = np.array([xs, ys, zs], dtype=np.float32)
+  offset = np.array(location, dtype=np.float32).reshape(3, 1)
+  return (np.dot(rot, local) + offset).transpose(1, 0)
+
+def project_to_image(pts_3d, P):
+  """Project n x 3 points with the 3 x 4 matrix ``P``; returns n x 2."""
+  ones = np.ones((pts_3d.shape[0], 1), dtype=np.float32)
+  homo = np.concatenate([pts_3d, ones], axis=1)
+  projected = np.dot(P, homo.transpose(1, 0)).transpose(1, 0)
+  # Divide x and y by the third (homogeneous) coordinate.
+  return projected[:, :2] / projected[:, 2:]
+
+def compute_orientation_3d(dim, location, rotation_y):
+  """Return the 2 x 3 end points of the orientation segment.
+
+  The segment runs from ``location`` to ``location`` plus ``dim[2]`` along
+  the x axis rotated about y by ``rotation_y``.
+  """
+  cos_r, sin_r = np.cos(rotation_y), np.sin(rotation_y)
+  rot = np.array([[cos_r, 0, sin_r], [0, 1, 0], [-sin_r, 0, cos_r]],
+                 dtype=np.float32)
+  segment = np.array([[0, dim[2]], [0, 0], [0, 0]], dtype=np.float32)
+  offset = np.array(location, dtype=np.float32).reshape(3, 1)
+  return (np.dot(rot, segment) + offset).transpose(1, 0)
+
+def draw_box_3d(image, corners, c=(0, 0, 255)):
+  """Draw the faces listed in ``face_idx`` of a projected box onto ``image``.
+
+  image: array drawn on in place (and returned)
+  corners: 8 x 2 projected corner coordinates
+  c: line colour
+  """
+  face_idx = [[0, 1, 5, 4],
+              [1, 2, 6, 5],
+              [2, 3, 7, 6],
+              [3, 0, 4, 7]]
+  # cv2 drawing functions need integer pixel coordinates, while projected
+  # corners are usually floats; convert once up front.
+  pts = [(int(p[0]), int(p[1])) for p in corners]
+  for ind_f in range(3, -1, -1):
+    f = face_idx[ind_f]
+    for j in range(4):
+      cv2.line(image, pts[f[j]], pts[f[(j + 1) % 4]], c, 2,
+               lineType=cv2.LINE_AA)
+    if ind_f == 0:
+      # Face 0 additionally gets its two diagonals.
+      cv2.line(image, pts[f[0]], pts[f[2]], c, 1, lineType=cv2.LINE_AA)
+      cv2.line(image, pts[f[1]], pts[f[3]], c, 1, lineType=cv2.LINE_AA)
+  return image
+
+def unproject_2d_to_3d(pt_2d, depth, P):
+  """Back-project a 2D point with known depth using the 3 x 4 matrix ``P``.
+
+  Returns a float32 array (x, y, z).
+  """
+  z = depth - P[2, 3]
+  u_scaled = pt_2d[0] * depth
+  v_scaled = pt_2d[1] * depth
+  x = (u_scaled - P[0, 3] - P[0, 2] * z) / P[0, 0]
+  y = (v_scaled - P[1, 3] - P[1, 2] * z) / P[1, 1]
+  return np.array([x, y, z], dtype=np.float32)
+
+def alpha2rot_y(alpha, x, cx, fx):
+    """
+    Convert the observation angle alpha to rotation_y.
+    alpha : Observation angle of object, ranging [-pi..pi]
+    x : Object center x, in pixels
+    cx, fx : values subtracted from x / used as the arctan2 denominator
+    rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi]
+    """
+    two_pi = 2 * np.pi
+    rot_y = alpha + np.arctan2(x - cx, fx)
+    # Wrap once in each direction.
+    if rot_y > np.pi:
+        rot_y = rot_y - two_pi
+    if rot_y < -np.pi:
+        rot_y = rot_y + two_pi
+    return rot_y
+
+def rot_y2alpha(rot_y, x, cx, fx):
+    """
+    Convert rotation_y to the observation angle alpha.
+    rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi]
+    x : Object center x, in pixels
+    cx, fx : values subtracted from x / used as the arctan2 denominator
+    alpha : Observation angle of object, ranging [-pi..pi]
+    """
+    two_pi = 2 * np.pi
+    alpha = rot_y - np.arctan2(x - cx, fx)
+    # Wrap once in each direction.
+    if alpha > np.pi:
+        alpha = alpha - two_pi
+    if alpha < -np.pi:
+        alpha = alpha + two_pi
+    return alpha
+
+
+def ddd2locrot(center, alpha, dim, depth, calib):
+  """Return ``(location, rotation_y)`` for one detection of a single image.
+
+  The unprojected location has half of ``dim[0]`` added to its y component.
+  """
+  loc = unproject_2d_to_3d(center, depth, calib)
+  loc[1] += dim[0] / 2
+  rot_y = alpha2rot_y(alpha, center[0], calib[0, 2], calib[0, 0])
+  return loc, rot_y
+
+def project_3d_bbox(location, dim, rotation_y, calib):
+  """Build the 3D box corners and project them with ``calib``."""
+  corners_3d = compute_box_3d(dim, location, rotation_y)
+  return project_to_image(corners_3d, calib)
+
+
+if __name__ == '__main__':
+  # Manual check: print alpha2rot_y for one hard-coded box next to a
+  # reference rotation_y value.
+  calib = np.array(
+    [[7.070493000000e+02, 0.000000000000e+00, 6.040814000000e+02, 4.575831000000e+01],
+     [0.000000000000e+00, 7.070493000000e+02, 1.805066000000e+02, -3.454157000000e-01],
+     [0.000000000000e+00, 0.000000000000e+00, 1.000000000000e+00, 4.981016000000e-03]],
+    dtype=np.float32)
+  alpha = -0.20
+  rotation_y = 0.01
+  tl = np.array([712.40, 143.00], dtype=np.float32)
+  br = np.array([810.73, 307.92], dtype=np.float32)
+  ct = (tl + br) / 2  # box centre
+  converted = alpha2rot_y(alpha, ct[0], calib[0, 2], calib[0, 0])
+  print('alpha2rot_y', converted)
+  print('rotation_y', rotation_y)
\ No newline at end of file
--- a/src/lib/utils/debugger.py
+++ b/src/lib/utils/debugger.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import matplotlib.pyplot as plt
+import cv2
+from .ddd_utils import compute_box_3d, project_to_image, draw_box_3d
+
+class Debugger(object):
+  def __init__(self, ipynb=False, theme='black', 
+               num_classes=-1, dataset=None, down_ratio=4):
+    """Set up colours, class names and dataset-specific drawing tables.
+
+    ipynb: when False, matplotlib.pyplot is stored on ``self.plt``
+    theme: 'white' reverses and darkens the colour table
+    num_classes, dataset: select the class-name list (see the branch chain)
+    down_ratio: factor used by gen_colormap* for the default output size
+    """
+    self.ipynb = ipynb
+    if not self.ipynb:
+      # Re-imports the module already imported at file level.
+      import matplotlib.pyplot as plt
+      self.plt = plt
+    self.imgs = {}
+    self.theme = theme
+    # color_list is a module-level table defined elsewhere in this file.
+    colors = [(color_list[_]).astype(np.uint8) \
+            for _ in range(len(color_list))]
+    self.colors = np.array(colors, dtype=np.uint8).reshape(len(colors), 1, 1, 3)
+    if self.theme == 'white':
+      # Reversing the flattened table flips both the colour order and the
+      # channel order; values are then capped at 60% of 255.
+      self.colors = self.colors.reshape(-1)[::-1].reshape(len(colors), 1, 1, 3)
+      self.colors = np.clip(self.colors, 0., 0.6 * 255).astype(np.uint8)
+    self.dim_scale = 1
+    # This 'facehp' block is a separate ``if``; the chain below still runs
+    # afterwards and may overwrite self.names.
+    if dataset == 'facehp':
+        self.names = ['face']
+        self.num_class = 1
+        self.num_joints = 5
+        self.edges = [[0, 1], [0, 2], [1, 3], [2, 4]]
+        self.ec = [(0, 0, 255), (255, 0, 0), (0, 0, 255), (127, 255, 0),
+                   (127, 255, 0), (0, 0, 255), (255, 0, 255),
+                   (255, 0, 0), (255, 0, 0), (0, 0, 255), (0, 0, 255),
+                   (255, 0, 0), (0, 0, 255), (255, 0, 255),
+                   (255, 0, 0), (255, 0, 0), (0, 0, 255), (0, 0, 255)]
+        self.colors_hp = [(0, 0, 255), (255, 0, 0), (0, 0, 255),
+                          (127, 255, 0), (127, 255, 0), (255, 0, 0), (0, 0, 255),
+                          (255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255),
+                          (255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255),
+                          (255, 0, 0), (0, 0, 255)]
+    if dataset == 'coco_hp':
+      self.names = ['p']
+      self.num_class = 1
+      self.num_joints = 17
+      self.edges = [[0, 1], [0, 2], [1, 3], [2, 4], 
+                    [3, 5], [4, 6], [5, 6], 
+                    [5, 7], [7, 9], [6, 8], [8, 10], 
+                    [5, 11], [6, 12], [11, 12], 
+                    [11, 13], [13, 15], [12, 14], [14, 16]]
+      self.ec = [(255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255), 
+                 (255, 0, 0), (0, 0, 255), (255, 0, 255),
+                 (255, 0, 0), (255, 0, 0), (0, 0, 255), (0, 0, 255),
+                 (255, 0, 0), (0, 0, 255), (255, 0, 255),
+                 (255, 0, 0), (255, 0, 0), (0, 0, 255), (0, 0, 255)]
+      self.colors_hp = [(255, 0, 255), (255, 0, 0), (0, 0, 255), 
+        (255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255),
+        (255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255),
+        (255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255),
+        (255, 0, 0), (0, 0, 255)]
+    elif num_classes == 80 or dataset == 'coco':
+      self.names = coco_class_name
+    elif num_classes == 20 or dataset == 'pascal':
+      self.names = pascal_class_name
+    elif dataset == 'gta':
+      self.names = gta_class_name
+      self.focal_length = 935.3074360871937
+      self.W = 1920
+      self.H = 1080
+      self.dim_scale = 3
+    elif dataset == 'viper':
+      self.names = gta_class_name
+      self.focal_length = 1158
+      self.W = 1920
+      self.H = 1080
+      self.dim_scale = 3
+    elif num_classes == 3 or dataset == 'kitti':
+      self.names = kitti_class_name
+      self.focal_length = 721.5377
+      self.W = 1242
+      self.H = 375
+    elif num_classes == 1 or dataset == 'pig':              # custom (own) dataset
+      self.names = pig_class_name
+    # NOTE(review): with num_classes == 1 the 'pig' branch above always
+    # wins, so this branch is only reached through dataset == 'facehp'.
+    elif num_classes == 1 or dataset == 'facehp':
+      self.names = face_class_name
+
+    # NOTE(review): if no branch matched, self.names is unset and the next
+    # line raises AttributeError; the local it assigns is not used here.
+    num_classes = len(self.names)
+    self.down_ratio=down_ratio
+    # for bird view
+    self.world_size = 64
+    self.out_size = 384
+
+  def add_img(self, img, img_id='default', revert_color=False):
+    """Store a copy of ``img`` under ``img_id``, optionally as ``255 - img``."""
+    source = 255 - img if revert_color else img
+    self.imgs[img_id] = source.copy()
+  
+  def add_mask(self, mask, bg, imgId = 'default', trans = 0.8):
+    """Blend a 2D ``mask`` (scaled by 255) over ``bg`` and store it as uint8."""
+    rows, cols = mask.shape[0], mask.shape[1]
+    overlay = mask.reshape(rows, cols, 1) * 255 * trans
+    backdrop = bg * (1 - trans)
+    self.imgs[imgId] = (overlay + backdrop).astype(np.uint8)
+  
+  def show_img(self, pause = False, imgId = 'default'):
+    """Show the stored image ``imgId`` in a cv2 window; wait for a key if ``pause``."""
+    window = '{}'.format(imgId)
+    cv2.imshow(window, self.imgs[imgId])
+    if not pause:
+      return
+    cv2.waitKey()
+  
+  def add_blend_img(self, back, fore, img_id='blend', trans=0.7):
+    """Alpha-blend ``fore`` over ``back`` and store the result as uint8.
+
+    back: background image (H x W x C)
+    fore: overlay; resized to ``back``'s height and width when they differ
+    img_id: key under which the result is stored in ``self.imgs``
+    trans: weight of ``fore``; ``back`` gets ``1 - trans``
+    """
+    if self.theme == 'white':
+      fore = 255 - fore
+    # Compare height with height and width with width; the previous check
+    # compared fore's height against back's width, so a width mismatch on a
+    # square background went unnoticed.
+    if fore.shape[0] != back.shape[0] or fore.shape[1] != back.shape[1]:
+      fore = cv2.resize(fore, (back.shape[1], back.shape[0]))
+    if len(fore.shape) == 2:
+      fore = fore.reshape(fore.shape[0], fore.shape[1], 1)
+    blended = back * (1. - trans) + fore * trans
+    self.imgs[img_id] = np.clip(blended, 0, 255).astype(np.uint8)
+
+  '''
+  # slow version
+  def gen_colormap(self, img, output_res=None):
+    # num_classes = len(self.colors)
+    img[img < 0] = 0
+    h, w = img.shape[1], img.shape[2]
+    if output_res is None:
+      output_res = (h * self.down_ratio, w * self.down_ratio)
+    color_map = np.zeros((output_res[0], output_res[1], 3), dtype=np.uint8)
+    for i in range(img.shape[0]):
+      resized = cv2.resize(img[i], (output_res[1], output_res[0]))
+      resized = resized.reshape(output_res[0], output_res[1], 1)
+      cl = self.colors[i] if not (self.theme == 'white') \
+           else 255 - self.colors[i]
+      color_map = np.maximum(color_map, (resized * cl).astype(np.uint8))
+    return color_map
+    '''
+
+  
+  def gen_colormap(self, img, output_res=None):
+    """Turn a (c, h, w) heatmap into a colour image.
+
+    Each channel is multiplied by its colour from ``self.colors`` and the
+    per-pixel maximum over channels is kept.
+
+    output_res: (height, width) of the result; defaults to the heatmap size
+      times ``self.down_ratio``.
+    """
+    c, h, w = img.shape[0], img.shape[1], img.shape[2]
+    if output_res is None:
+      output_res = (h * self.down_ratio, w * self.down_ratio)
+    # astype() returns a new array, so the caller's heatmap is not modified.
+    img = img.transpose(1, 2, 0).reshape(h, w, c, 1).astype(np.float32)
+    colors = np.array(
+      self.colors, dtype=np.float32).reshape(-1, 3)[:c].reshape(1, 1, c, 3)
+    if self.theme == 'white':
+      colors = 255 - colors
+    color_map = (img * colors).max(axis=2).astype(np.uint8)
+    # cv2.resize takes dsize as (width, height); output_res is
+    # (height, width), so the two entries must be swapped.
+    color_map = cv2.resize(color_map, (output_res[1], output_res[0]))
+    return color_map
+    
+  '''
+  # slow
+  def gen_colormap_hp(self, img, output_res=None):
+    # num_classes = len(self.colors)
+    # img[img < 0] = 0
+    h, w = img.shape[1], img.shape[2]
+    if output_res is None:
+      output_res = (h * self.down_ratio, w * self.down_ratio)
+    color_map = np.zeros((output_res[0], output_res[1], 3), dtype=np.uint8)
+    for i in range(img.shape[0]):
+      resized = cv2.resize(img[i], (output_res[1], output_res[0]))
+      resized = resized.reshape(output_res[0], output_res[1], 1)
+      cl =  self.colors_hp[i] if not (self.theme == 'white') else \
+        (255 - np.array(self.colors_hp[i]))
+      color_map = np.maximum(color_map, (resized * cl).astype(np.uint8))
+    return color_map
+  '''
+  def gen_colormap_hp(self, img, output_res=None):
+    """Render a stack of keypoint maps as a single colour image.
+
+    Works like the class colour map but takes its palette from
+    ``self.colors_hp``: each channel is multiplied by its colour and the
+    per-pixel maximum over the channels is kept.
+
+    Args:
+      img: 3-D array indexed as (channel, height, width).
+      output_res: optional (height, width) of the result; defaults to the
+        input size multiplied by ``self.down_ratio``.
+
+    Returns:
+      uint8 colour image resized to ``output_res``.
+    """
+    c, h, w = img.shape[0], img.shape[1], img.shape[2]
+    if output_res is None:
+      output_res = (h * self.down_ratio, w * self.down_ratio)
+    img = img.transpose(1, 2, 0).reshape(h, w, c, 1).astype(np.float32)
+    colors = np.array(
+      self.colors_hp, dtype=np.float32).reshape(-1, 3)[:c].reshape(1, 1, c, 3)
+    if self.theme == 'white':
+      colors = 255 - colors
+    color_map = (img * colors).max(axis=2).astype(np.uint8)
+    # output_res is (height, width) but cv2.resize expects dsize as
+    # (width, height); the old order transposed non-square outputs.
+    color_map = cv2.resize(color_map, (output_res[1], output_res[0]))
+    return color_map
+
+
+  def add_rect(self, rect1, rect2, c, conf=1, img_id='default'):
+    """Draw a rectangle between two corner points on a stored image.
+
+    When ``conf`` is below 1, a circle of radius ``int(10 * conf)`` is also
+    drawn at each of the four corners.
+
+    Args:
+      rect1: first corner, indexed as (x, y).
+      rect2: opposite corner, indexed as (x, y).
+      c: colour passed straight to OpenCV.
+      conf: confidence value controlling the corner circles.
+      img_id: key of the image in ``self.imgs`` to draw on.
+    """
+    canvas = self.imgs[img_id]
+    x1, y1 = rect1[0], rect1[1]
+    x2, y2 = rect2[0], rect2[1]
+    cv2.rectangle(canvas, (x1, y1), (x2, y2), c, 2)
+    if not conf < 1:
+      return
+    radius = int(10 * conf)
+    for corner in ((x1, y1), (x2, y2), (x1, y2), (x2, y1)):
+      cv2.circle(canvas, corner, radius, c, 1)
+
+  def add_coco_bbox(self, bbox, cat, conf=1, show_txt=True, img_id='default'):
+    """Draw a class-coloured box, optionally with a name/confidence label.
+
+    Args:
+      bbox: four values indexed as (x1, y1, x2, y2); cast to int32.
+      cat: class index (cast to int) into ``self.colors`` and ``self.names``.
+      conf: confidence printed after the class name with one decimal.
+      show_txt: when true, draw a filled strip above the box with the label
+        written in black.
+      img_id: key of the image in ``self.imgs`` to draw on.
+    """
+    box = np.array(bbox, dtype=np.int32)
+    cat = int(cat)
+    color = self.colors[cat][0][0].tolist()
+    if self.theme == 'white':
+      color = (255 - np.array(color)).tolist()
+    label = '{}{:.1f}'.format(self.names[cat], conf)
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    text_size = cv2.getTextSize(label, font, 0.5, 2)[0]
+    canvas = self.imgs[img_id]
+    cv2.rectangle(canvas, (box[0], box[1]), (box[2], box[3]), color, 2)
+    if not show_txt:
+      return
+    # Filled background strip sitting just above the top-left corner.
+    strip_top_left = (box[0], box[1] - text_size[1] - 2)
+    strip_bottom_right = (box[0] + text_size[0], box[1] - 2)
+    cv2.rectangle(canvas, strip_top_left, strip_bottom_right, color, -1)
+    cv2.putText(canvas, label, (box[0], box[1] - 2),
+                font, 0.5, (0, 0, 0), thickness=1, lineType=cv2.LINE_AA)
+
+  def add_coco_hp(self, points, img_id='default'):
+    """Draw every keypoint as a filled circle in its ``colors_hp`` colour.
+
+    Args:
+      points: values reshaped to (``self.num_joints``, 2) int32 coordinates.
+      img_id: key of the image in ``self.imgs`` to draw on.
+    """
+    joints = np.array(points, dtype=np.int32).reshape(self.num_joints, 2)
+    for j, joint in enumerate(joints):
+      center = (joint[0], joint[1])
+      cv2.circle(self.imgs[img_id], center, 3, self.colors_hp[j], -1)
+    # Skeleton edge drawing for person pose (disabled):
+    # for j, e in enumerate(self.edges):
+    #   if points[e].min() > 0:
+    #     cv2.line(self.imgs[img_id], (points[e[0], 0], points[e[0], 1]),
+    #                   (points[e[1], 0], points[e[1], 1]), self.ec[j], 2,
+    #                   lineType=cv2.LINE_AA)
+
+  def add_points(self, points, img_id='default'):
+    """Draw per-class points as white-rimmed coloured dots.
+
+    Args:
+      points: sequence indexed by class; each entry is a sequence of points
+        indexed as (x, y). Coordinates are multiplied by ``self.down_ratio``.
+      img_id: key of the image in ``self.imgs`` to draw on.
+    """
+    for i in range(len(points)):
+      for pt in points[i]:
+        c = self.colors[i, 0, 0]
+        # OpenCV drawing functions need integer pixel coordinates; the scaled
+        # values may be floats, so truncate them explicitly.
+        center = (int(pt[0] * self.down_ratio), int(pt[1] * self.down_ratio))
+        # A larger white disc first, then the class colour on top.
+        cv2.circle(self.imgs[img_id], center, 5, (255, 255, 255), -1)
+        cv2.circle(self.imgs[img_id], center,
+                   3, (int(c[0]), int(c[1]), int(c[2])), -1)
+
+  def show_all_imgs(self, pause=False, time=0):
+    """Display every stored image.
+
+    Outside notebook mode (``self.ipynb`` false) each image is shown in its
+    own OpenCV window; the call then waits for a key (indefinitely when
+    ``pause`` is true, otherwise 2000 ms) and exits the process when key
+    code 27 is returned. In notebook mode the images are placed side by side
+    in one figure created through ``self.plt``.
+
+    Args:
+      pause: wait for a key press instead of timing out.
+      time: unused; kept so existing callers keep working.
+    """
+    if not self.ipynb:
+      for name, image in self.imgs.items():
+        cv2.imshow('{}'.format(name), image)
+      if cv2.waitKey(0 if pause else 2000) == 27:
+        import sys
+        sys.exit(0)
+      return
+    self.ax = None
+    n_imgs = len(self.imgs)
+    if n_imgs == 0:
+      # Nothing to plot. The old code computed an unused row count as
+      # n_imgs // n_imgs here and raised ZeroDivisionError.
+      return
+    fig = self.plt.figure(figsize=(n_imgs * 10, 10))
+    for i, image in enumerate(self.imgs.values()):
+      fig.add_subplot(1, n_imgs, i + 1)
+      if len(image.shape) == 3:
+        # Stored images are converted from BGR to RGB before plotting.
+        self.plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+      else:
+        self.plt.imshow(image)
+    self.plt.show()
+
+  def return_img(self, img_id='multi_pose'):
+    """Return the image stored in ``self.imgs`` under ``img_id``."""
+    image = self.imgs[img_id]
+    return image
+
+  def save_img(self, imgId='default', path='./cache/debug/'):
+    """Write one stored image to ``<path><imgId>.png``.
+
+    ``path`` is concatenated as-is, so it must already end with a separator.
+    """
+    target = path + '{}.png'.format(imgId)
+    cv2.imwrite(target, self.imgs[imgId])
+    
+  def save_all_imgs(self, path='./cache/debug/', prefix='', genID=False):
+    """Write every stored image to ``<path>/<prefix><key>.png``.
+
+    Args:
+      path: output directory.
+      prefix: text placed before each image key in the file name.
+      genID: when true, read a running counter from ``<path>/id.txt`` (0 if
+        it cannot be read), use it as the prefix and write back counter + 1.
+    """
+    if genID:
+      id_file = path + '/id.txt'
+      try:
+        idx = int(np.loadtxt(id_file))
+      except (OSError, ValueError, TypeError):
+        # Missing, unreadable or malformed counter file: start from 0.
+        # Deliberately narrower than the old bare ``except:`` so that
+        # KeyboardInterrupt and programming errors are no longer swallowed.
+        idx = 0
+      prefix = idx
+      np.savetxt(id_file, np.ones(1) * (idx + 1), fmt='%d')
+    for name, image in self.imgs.items():
+      cv2.imwrite(path + '/{}{}.png'.format(prefix, name), image)
+
+  def remove_side(self, img_id, img):
+    """Crop a stored image to the non-blank extent of ``img``.
+
+    Columns and rows of ``img`` whose sum over the other axes is zero are
+    treated as blank. The stored image ``self.imgs[img_id]`` is cropped to
+    the span between the first and last non-blank row and column.
+
+    Args:
+      img_id: key of the stored image to crop; nothing happens if absent.
+      img: 3-D reference array used to find the blank borders.
+    """
+    if img_id not in self.imgs:
+      return
+    flat = img.sum(axis=2)
+    cols = np.flatnonzero(flat.sum(axis=0))
+    rows = np.flatnonzero(flat.sum(axis=1))
+    if cols.size == 0 or rows.size == 0:
+      # Entirely blank reference: the old scan indexed past the end and
+      # raised IndexError. Leave the stored image untouched instead.
+      return
+    l, r = cols[0], cols[-1]
+    t, b = rows[0], rows[-1]
+    self.imgs[img_id] = self.imgs[img_id][t:b + 1, l:r + 1].copy()
+
+  def project_3d_to_bird(self, pt):
+    """Map a two-component point to integer bird's-eye-view coordinates.
+
+    The first component is shifted by half of ``self.world_size``, the second
+    is flipped (``world_size - value``), and both are scaled by
+    ``out_size / world_size``. Note that ``pt`` itself is modified in place
+    by the shift/flip step; the scaled int32 result is a new array.
+    """
+    half_world = self.world_size / 2
+    pt[0] = pt[0] + half_world
+    pt[1] = self.world_size - pt[1]
+    scaled = (pt * self.out_size) / self.world_size
+    return scaled.astype(np.int32)
+
+  def add_ct_detection(
+    self, img, dets, show_box=False, show_txt=True,
+    center_thresh=0.5, img_id='det'):
+    """Store a copy of ``img`` and draw centre-point detections on it.
+
+    Two input layouts are handled:
+
+    * dict: class key -> 2-D array whose rows are read as
+      (x, y, score, ..., w, h). Only boxes are drawn, and only when
+      ``show_box`` is true; ``cat - 1`` is passed as the class index.
+    * array: rows are read as (x, y, score, ..., w, h, class). Coordinates
+      and sizes are multiplied by ``self.down_ratio``; a filled dot marks the
+      centre and a box is added when ``show_box`` is true.
+
+    Args:
+      img: image to copy into ``self.imgs[img_id]``.
+      dets: detections in one of the layouts above.
+      show_box: draw bounding boxes.
+      show_txt: draw the class/score label on boxes.
+      center_thresh: rows with a score not above this are skipped.
+      img_id: key under which the image is stored.
+    """
+    self.imgs[img_id] = img.copy()
+    if isinstance(dets, dict):
+      for cat in dets:
+        for det in dets[cat]:
+          if det[2] > center_thresh and show_box:
+            w, h = det[-2], det[-1]
+            x, y = det[0], det[1]
+            bbox = np.array([x - w / 2, y - h / 2, x + w / 2, y + h / 2],
+                            dtype=np.float32)
+            self.add_coco_bbox(
+              bbox, cat - 1, det[2],
+              show_txt=show_txt, img_id=img_id)
+    else:
+      for det in dets:
+        if det[2] > center_thresh:
+          cat = int(det[-1])
+          cl = (self.colors[cat, 0, 0] if self.theme == 'black' else \
+                                       255 - self.colors[cat, 0, 0]).tolist()
+          ct = det[:2].astype(np.int32) * self.down_ratio
+          # OpenCV needs plain integer pixel coordinates.
+          cv2.circle(self.imgs[img_id], (int(ct[0]), int(ct[1])), 3, cl, -1)
+          if show_box:
+            w, h = det[-3] * self.down_ratio, det[-2] * self.down_ratio
+            x, y = det[0] * self.down_ratio, det[1] * self.down_ratio
+            bbox = np.array([x - w / 2, y - h / 2, x + w / 2, y + h / 2],
+                            dtype=np.float32)
+            # show_txt used to be dropped on this path, so labels could not
+            # be switched off for array-style detections.
+            self.add_coco_bbox(
+              bbox, cat, det[2], show_txt=show_txt, img_id=img_id)
+
+
+  def add_3d_detection(
+    self, image_or_path, dets, calib, show_txt=False,
+    center_thresh=0.5, img_id='det'):
+    """Draw projected 3-D boxes for detections on an image.
+
+    Args:
+      image_or_path: an ndarray used directly (not copied), or a path that
+        is loaded with ``cv2.imread``.
+      dets: mapping from class key to a 2-D array; per row, columns 5:8 are
+        read as dimensions, 8:11 as location, 11 as rotation and the last
+        column as the score.
+      calib: calibration passed to ``project_to_image``.
+      show_txt: accepted but not used by this method.
+      center_thresh: rows with a score not above this are skipped.
+      img_id: key under which the image is stored.
+    """
+    if isinstance(image_or_path, np.ndarray):
+      canvas = image_or_path
+    else:
+      canvas = cv2.imread(image_or_path)
+    self.imgs[img_id] = canvas
+    for cat in dets:
+      for i in range(len(dets[cat])):
+        det = dets[cat][i]
+        cl = (self.colors[cat - 1, 0, 0]).tolist()
+        if not det[-1] > center_thresh:
+          continue
+        dim, loc, rot_y = det[5:8], det[8:11], det[11]
+        # loc[1] = loc[1] - dim[0] / 2 + dim[0] / 2 / self.dim_scale
+        # dim = dim / self.dim_scale
+        if not loc[2] > 1:
+          # Only boxes whose third location component exceeds 1 are drawn.
+          continue
+        corners_3d = compute_box_3d(dim, loc, rot_y)
+        corners_2d = project_to_image(corners_3d, calib)
+        self.imgs[img_id] = draw_box_3d(self.imgs[img_id], corners_2d, cl)
+
+  def compose_vis_add(
+    self, img_path, dets, calib,
+    center_thresh, pred, bev, img_id='out'):
+    """Compose image + prediction blend + 3-D boxes next to a stored view.
+
+    The image at ``img_path`` is resized to the height/width of ``pred``,
+    blended with ``pred``, overlaid with projected 3-D boxes and finally
+    concatenated horizontally with ``self.imgs[bev]``.
+
+    Args:
+      img_path: path of the image to load with ``cv2.imread``.
+      dets: mapping from class key to a 2-D array; per row, columns 5:8 are
+        read as dimensions, 8:11 as location, 11 as rotation and the last
+        column as the score.
+      calib: calibration passed to ``project_to_image``.
+      center_thresh: rows with a score not above this are skipped.
+      pred: prediction image; its first two axes give the target size.
+      bev: key of the already stored image appended on the right.
+      img_id: key under which the composed image is stored.
+    """
+    self.imgs[img_id] = cv2.imread(img_path)
+    # h, w = self.imgs[img_id].shape[:2]
+    # pred = cv2.resize(pred, (h, w))
+    h, w = pred.shape[:2]
+    # Per-axis ratio of the loaded image size to the prediction size.
+    hs, ws = self.imgs[img_id].shape[0] / h, self.imgs[img_id].shape[1] / w
+    self.imgs[img_id] = cv2.resize(self.imgs[img_id], (w, h))
+    self.add_blend_img(self.imgs[img_id], pred, img_id)
+    for cat in dets:
+      for i in range(len(dets[cat])):
+        cl = (self.colors[cat - 1, 0, 0]).tolist()
+        if dets[cat][i, -1] > center_thresh:
+          dim = dets[cat][i, 5:8]
+          loc  = dets[cat][i, 8:11]
+          rot_y = dets[cat][i, 11]
+          # loc[1] = loc[1] - dim[0] / 2 + dim[0] / 2 / self.dim_scale
+          # dim = dim / self.dim_scale
+          if loc[2] > 1:
+            box_3d = compute_box_3d(dim, loc, rot_y)
+            box_2d = project_to_image(box_3d, calib)
+            # Rescale into the resized image. Column 0 must follow the width
+            # ratio and column 1 the height ratio; the two were swapped
+            # before. NOTE(review): assumes project_to_image returns (x, y)
+            # columns -- confirm against its definition.
+            box_2d[:, 0] /= ws
+            box_2d[:, 1] /= hs
+            self.imgs[img_id] = draw_box_3d(self.imgs[img_id], box_2d, cl)
+    self.imgs[img_id] = np.concatenate(
+      [self.imgs[img_id], self.imgs[bev]], axis=1)
+
+  def add_2d_detection(
+    self, img, dets, show_box=False, show_txt=True,
+    center_thresh=0.5, img_id='det'):
+    """Store ``img`` and draw a labelled box for every confident detection.
+
+    The image is stored without copying, so drawing also modifies the
+    caller's array.
+
+    Args:
+      img: image stored under ``img_id`` and drawn on.
+      dets: mapping from class key to a 2-D array; per row, columns 1:5 are
+        read as the box and the last column as the score.
+      show_box: accepted but not used; boxes are always drawn.
+      show_txt: draw the class/score label on boxes.
+      center_thresh: rows with a score not above this are skipped.
+      img_id: key under which the image is stored.
+    """
+    self.imgs[img_id] = img
+    for cat in dets:
+      for det in dets[cat]:
+        if det[-1] > center_thresh:
+          # The box helper looks up the colour itself, so none is computed
+          # here; ``cat - 1`` is the class index it receives.
+          self.add_coco_bbox(
+            det[1:5], cat - 1, det[-1],
+            show_txt=show_txt, img_id=img_id)
+
+  def add_bird_view(self, dets, center_thresh=0.3, img_id='bird'):
+    """Render detections as outlines on a light-grey top-down canvas.
+
+    For every confident detection, the first four corners of its 3-D box
+    (columns 0 and 2) are projected with ``project_3d_to_bird`` and drawn as
+    a closed polyline. The edge between corners 0 and 1 is drawn thicker.
+
+    Args:
+      dets: mapping from class key to a 2-D array; per row, columns 5:8 are
+        read as dimensions, 8:11 as location, 11 as rotation and the last
+        column as the score.
+      center_thresh: rows with a score not above this are skipped.
+      img_id: key under which the canvas is stored.
+    """
+    bird_view = np.ones((self.out_size, self.out_size, 3), dtype=np.uint8) * 230
+    lc = (250, 152, 12)
+    for cat in dets:
+      for det in dets[cat]:
+        if det[-1] > center_thresh:
+          dim = det[5:8]
+          loc = det[8:11]
+          rot_y = det[11]
+          rect = compute_box_3d(dim, loc, rot_y)[:4, [0, 2]]
+          # Keep the projected corners as int32. Writing them back into the
+          # float array handed float scalars to cv2.line, which requires
+          # integer pixel coordinates.
+          pts = np.array(
+            [self.project_3d_to_bird(corner) for corner in rect],
+            dtype=np.int32)
+          cv2.polylines(
+              bird_view, [pts.reshape(-1, 1, 2)],
+              True, lc, 2, lineType=cv2.LINE_AA)
+          # Thicker stroke on the edge between corners 0 and 1.
+          cv2.line(bird_view, (int(pts[0, 0]), int(pts[0, 1])),
+                   (int(pts[1, 0]), int(pts[1, 1])), lc, 4,
+                   lineType=cv2.LINE_AA)
+    self.imgs[img_id] = bird_view
+
+  def add_bird_views(self, dets_dt, dets_gt, center_thresh=0.3, img_id='bird'):
+    """Render ground truth and detections together on a top-down canvas.
+
+    ``dets_gt`` boxes are drawn first as filled polygons, then ``dets_dt``
+    boxes as outlines in a different colour. For every box the edge between
+    corners 0 and 1 is additionally drawn with a thicker stroke.
+
+    Args:
+      dets_dt: detections; mapping from class key to a 2-D array whose rows
+        use columns 5:8 as dimensions, 8:11 as location, 11 as rotation and
+        the last column as the score.
+      dets_gt: ground truth in the same layout.
+      center_thresh: rows with a score not above this are skipped.
+      img_id: key under which the canvas is stored.
+    """
+    bird_view = np.ones((self.out_size, self.out_size, 3), dtype=np.uint8) * 230
+    layers = (
+      (dets_gt, (12, 49, 250), True),
+      (dets_dt, (250, 152, 12), False),
+    )
+    for dets, lc, filled in layers:
+      for cat in dets:
+        for det in dets[cat]:
+          if det[-1] > center_thresh:
+            dim = det[5:8]
+            loc = det[8:11]
+            rot_y = det[11]
+            rect = compute_box_3d(dim, loc, rot_y)[:4, [0, 2]]
+            # Keep the projected corners as int32. Writing them back into
+            # the float array handed float scalars to cv2.line, which
+            # requires integer pixel coordinates.
+            pts = np.array(
+              [self.project_3d_to_bird(corner) for corner in rect],
+              dtype=np.int32)
+            poly = [pts.reshape(-1, 1, 2)]
+            if filled:
+              cv2.fillPoly(bird_view, poly, lc, lineType=cv2.LINE_AA)
+            else:
+              cv2.polylines(
+                bird_view, poly, True, lc, 2, lineType=cv2.LINE_AA)
+            # Thicker stroke on the edge between corners 0 and 1.
+            cv2.line(bird_view, (int(pts[0, 0]), int(pts[0, 1])),
+                     (int(pts[1, 0]), int(pts[1, 1])), lc, 4,
+                     lineType=cv2.LINE_AA)
+    self.imgs[img_id] = bird_view
+
+
+# Class-name tables; list position is the class index.
+kitti_class_name = ['p', 'v', 'b']
+
+gta_class_name = ['p', 'v']
+
+pascal_class_name = [
+  'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
+  'bus', 'car', 'cat', 'chair', 'cow',
+  'diningtable', 'dog', 'horse', 'motorbike', 'person',
+  'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
+]
+
+coco_class_name = [
+  'person', 'bicycle', 'car', 'motorcycle', 'airplane',
+  'bus', 'train', 'truck', 'boat', 'traffic light',
+  'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
+  'cat', 'dog', 'horse', 'sheep', 'cow',
+  'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
+  'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
+  'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
+  'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
+  'wine glass', 'cup', 'fork', 'knife', 'spoon',
+  'bowl', 'banana', 'apple', 'sandwich', 'orange',
+  'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
+  'cake', 'chair', 'couch', 'potted plant', 'bed',
+  'dining table', 'toilet', 'tv', 'laptop', 'mouse',
+  'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
+  'toaster', 'sink', 'refrigerator', 'book', 'clock',
+  'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
+]
+
+pig_class_name = ['pig']
+
+face_class_name = ['face']
+
+# Colour palette: one row of three components per entry, written in the
+# 0..1 range and scaled to 0..255 as float32.
+color_list = np.array(
+    [
+        [1.000, 1.000, 1.000],
+        [0.850, 0.325, 0.098],
+        [0.929, 0.694, 0.125],
+        [0.494, 0.184, 0.556],
+        [0.466, 0.674, 0.188],
+        [0.301, 0.745, 0.933],
+        [0.635, 0.078, 0.184],
+        [0.300, 0.300, 0.300],
+        [0.600, 0.600, 0.600],
+        [1.000, 0.000, 0.000],
+        [1.000, 0.500, 0.000],
+        [0.749, 0.749, 0.000],
+        [0.000, 1.000, 0.000],
+        [0.000, 0.000, 1.000],
+        [0.667, 0.000, 1.000],
+        [0.333, 0.333, 0.000],
+        [0.333, 0.667, 0.000],
+        [0.333, 1.000, 0.000],
+        [0.667, 0.333, 0.000],
+        [0.667, 0.667, 0.000],
+        [0.667, 1.000, 0.000],
+        [1.000, 0.333, 0.000],
+        [1.000, 0.667, 0.000],
+        [1.000, 1.000, 0.000],
+        [0.000, 0.333, 0.500],
+        [0.000, 0.667, 0.500],
+        [0.000, 1.000, 0.500],
+        [0.333, 0.000, 0.500],
+        [0.333, 0.333, 0.500],
+        [0.333, 0.667, 0.500],
+        [0.333, 1.000, 0.500],
+        [0.667, 0.000, 0.500],
+        [0.667, 0.333, 0.500],
+        [0.667, 0.667, 0.500],
+        [0.667, 1.000, 0.500],
+        [1.000, 0.000, 0.500],
+        [1.000, 0.333, 0.500],
+        [1.000, 0.667, 0.500],
+        [1.000, 1.000, 0.500],
+        [0.000, 0.333, 1.000],
+        [0.000, 0.667, 1.000],
+        [0.000, 1.000, 1.000],
+        [0.333, 0.000, 1.000],
+        [0.333, 0.333, 1.000],
+        [0.333, 0.667, 1.000],
+        [0.333, 1.000, 1.000],
+        [0.667, 0.000, 1.000],
+        [0.667, 0.333, 1.000],
+        [0.667, 0.667, 1.000],
+        [0.667, 1.000, 1.000],
+        [1.000, 0.000, 1.000],
+        [1.000, 0.333, 1.000],
+        [1.000, 0.667, 1.000],
+        [0.167, 0.000, 0.000],
+        [0.333, 0.000, 0.000],
+        [0.500, 0.000, 0.000],
+        [0.667, 0.000, 0.000],
+        [0.833, 0.000, 0.000],
+        [1.000, 0.000, 0.000],
+        [0.000, 0.167, 0.000],
+        [0.000, 0.333, 0.000],
+        [0.000, 0.500, 0.000],
+        [0.000, 0.667, 0.000],
+        [0.000, 0.833, 0.000],
+        [0.000, 1.000, 0.000],
+        [0.000, 0.000, 0.167],
+        [0.000, 0.000, 0.333],
+        [0.000, 0.000, 0.500],
+        [0.000, 0.000, 0.667],
+        [0.000, 0.000, 0.833],
+        [0.000, 0.000, 1.000],
+        [0.000, 0.000, 0.000],
+        [0.143, 0.143, 0.143],
+        [0.286, 0.286, 0.286],
+        [0.429, 0.429, 0.429],
+        [0.571, 0.571, 0.571],
+        [0.714, 0.714, 0.714],
+        [0.857, 0.857, 0.857],
+        [0.000, 0.447, 0.741],
+        [0.50, 0.5, 0],
+    ],
+    dtype=np.float32,
+) * 255