Unverified commit 66fa7039, authored by Yizhou Wang and committed by GitHub

Merge pull request #37 from yizhou-wang/v1.1-mnet-tdc

V1.1: add MNet TDC modules
parents 7c3fd6f9 d0140132
from .deform_conv_2d import DeformConv2D, DeformConvPack2D
from .deform_conv_2d import ModulatedDeformConv2D, ModulatedDeformConvPack2D
from .deform_pool_2d import DeformRoIPooling2D, DeformRoIPoolingPack2D
from .deform_pool_2d import ModulatedDeformRoIPoolingPack2D
from .deform_conv_3d import DeformConv3D, DeformConvPack3D
from .deform_conv_3d import ModulatedDeformConv3D, ModulatedDeformConvPack3D
# from .deform_pool_3d import DeformRoIPooling3D, DeformRoIPoolingPack3D
# from .deform_pool_3d import ModulatedDeformRoIPoolingPack3D
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair, _single
# from mmdet.utils import print_log
from . import deform_conv_2d_cuda
class DeformConvFunction2D(Function):
@staticmethod
def forward(ctx,
input,
offset,
weight,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
im2col_step=64):
if input is not None and input.dim() != 4:
raise ValueError(
'Expected 4D tensor as input, got {}D tensor instead.'.format(
input.dim()))
ctx.stride = _pair(stride)
ctx.padding = _pair(padding)
ctx.dilation = _pair(dilation)
ctx.groups = groups
ctx.deformable_groups = deformable_groups
ctx.im2col_step = im2col_step
ctx.save_for_backward(input, offset, weight)
output = input.new_empty(
DeformConvFunction2D._output_size(input, weight, ctx.padding,
ctx.dilation, ctx.stride))
ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones
if not input.is_cuda:
raise NotImplementedError
else:
cur_im2col_step = min(ctx.im2col_step, input.shape[0])
assert (input.shape[0] %
cur_im2col_step) == 0, 'im2col step must divide batchsize'
deform_conv_2d_cuda.deform_conv_forward_cuda(
input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1],
weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0],
ctx.padding[1], ctx.padding[0], ctx.dilation[1],
ctx.dilation[0], ctx.groups, ctx.deformable_groups,
cur_im2col_step)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input, offset, weight = ctx.saved_tensors
grad_input = grad_offset = grad_weight = None
if not grad_output.is_cuda:
raise NotImplementedError
else:
cur_im2col_step = min(ctx.im2col_step, input.shape[0])
assert (input.shape[0] %
cur_im2col_step) == 0, 'im2col step must divide batchsize'
if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
deform_conv_2d_cuda.deform_conv_backward_input_cuda(
input, offset, grad_output, grad_input,
grad_offset, weight, ctx.bufs_[0], weight.size(3),
weight.size(2), ctx.stride[1], ctx.stride[0],
ctx.padding[1], ctx.padding[0], ctx.dilation[1],
ctx.dilation[0], ctx.groups, ctx.deformable_groups,
cur_im2col_step)
if ctx.needs_input_grad[2]:
grad_weight = torch.zeros_like(weight)
deform_conv_2d_cuda.deform_conv_backward_parameters_cuda(
input, offset, grad_output,
grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3),
weight.size(2), ctx.stride[1], ctx.stride[0],
ctx.padding[1], ctx.padding[0], ctx.dilation[1],
ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1,
cur_im2col_step)
return (grad_input, grad_offset, grad_weight, None, None, None, None,
None)
@staticmethod
def _output_size(input, weight, padding, dilation, stride):
channels = weight.size(0)
output_size = (input.size(0), channels)
for d in range(input.dim() - 2):
in_size = input.size(d + 2)
pad = padding[d]
kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
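# effective extent of the dilated kernel, e.g. k=3 with dilation=2 spans 5 inputs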
stride_ = stride[d]
output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,)
if not all(map(lambda s: s > 0, output_size)):
raise ValueError(
'convolution input is too small (output would be {})'.format(
'x'.join(map(str, output_size))))
return output_size
class ModulatedDeformConvFunction2D(Function):
@staticmethod
def forward(ctx,
input,
offset,
mask,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1):
ctx.stride = stride
ctx.padding = padding
ctx.dilation = dilation
ctx.groups = groups
ctx.deformable_groups = deformable_groups
ctx.with_bias = bias is not None
if not ctx.with_bias:
bias = input.new_empty(1) # fake tensor
if not input.is_cuda:
raise NotImplementedError
if weight.requires_grad or mask.requires_grad or offset.requires_grad \
or input.requires_grad:
ctx.save_for_backward(input, offset, mask, weight, bias)
output = input.new_empty(
ModulatedDeformConvFunction2D._infer_shape(ctx, input, weight))
ctx._bufs = [input.new_empty(0), input.new_empty(0)]
deform_conv_2d_cuda.modulated_deform_conv_cuda_forward(
input, weight, bias, ctx._bufs[0], offset, mask, output,
ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride,
ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation,
ctx.groups, ctx.deformable_groups, ctx.with_bias)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
if not grad_output.is_cuda:
raise NotImplementedError
input, offset, mask, weight, bias = ctx.saved_tensors
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
grad_mask = torch.zeros_like(mask)
grad_weight = torch.zeros_like(weight)
grad_bias = torch.zeros_like(bias)
deform_conv_2d_cuda.modulated_deform_conv_cuda_backward(
input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1],
grad_input, grad_weight, grad_bias, grad_offset, grad_mask,
grad_output, weight.shape[2], weight.shape[3], ctx.stride,
ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation,
ctx.groups, ctx.deformable_groups, ctx.with_bias)
if not ctx.with_bias:
grad_bias = None
return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
None, None, None, None, None)
@staticmethod
def _infer_shape(ctx, input, weight):
n = input.size(0)
channels_out = weight.size(0)
height, width = input.shape[2:4]
kernel_h, kernel_w = weight.shape[2:4]
height_out = (height + 2 * ctx.padding -
(ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1
width_out = (width + 2 * ctx.padding -
(ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1
return n, channels_out, height_out, width_out
deform_conv = DeformConvFunction2D.apply
modulated_deform_conv = ModulatedDeformConvFunction2D.apply
class DeformConv2D(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
bias=False):
super(DeformConv2D, self).__init__()
assert not bias
assert in_channels % groups == 0, \
'in_channels {} is not divisible by groups {}'.format(
in_channels, groups)
assert out_channels % groups == 0, \
'out_channels {} is not divisible by groups {}'.format(
out_channels, groups)
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
self.stride = _pair(stride)
self.padding = _pair(padding)
self.dilation = _pair(dilation)
self.groups = groups
self.deformable_groups = deformable_groups
# enable compatibility with nn.Conv2d
self.transposed = False
self.output_padding = _single(0)
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // self.groups,
*self.kernel_size))
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n)
self.weight.data.uniform_(-stdv, stdv)
def forward(self, x, offset):
# Pad the input when it is smaller than the kernel, to avoid the
# assert error in deform_conv_cuda.cpp:128
input_pad = (
x.size(2) < self.kernel_size[0] or x.size(3) < self.kernel_size[1])
if input_pad:
pad_h = max(self.kernel_size[0] - x.size(2), 0)
pad_w = max(self.kernel_size[1] - x.size(3), 0)
x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous()
offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant',
0).contiguous()
out = deform_conv(x, offset, self.weight, self.stride, self.padding,
self.dilation, self.groups, self.deformable_groups)
if input_pad:
out = out[:, :, :out.size(2) - pad_h, :out.size(3) -
pad_w].contiguous()
return out
class DeformConvPack2D(DeformConv2D):
"""A Deformable Conv Encapsulation that acts as normal Conv layers.
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
bias (bool or str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
False.
"""
_version = 2
def __init__(self, *args, **kwargs):
super(DeformConvPack2D, self).__init__(*args, **kwargs)
self.conv_offset = nn.Conv2d(
self.in_channels,
self.deformable_groups * 2 * self.kernel_size[0] *
self.kernel_size[1],
kernel_size=self.kernel_size,
stride=_pair(self.stride),
padding=_pair(self.padding),
dilation=_pair(self.dilation),
bias=True)
self.init_offset()
def init_offset(self):
self.conv_offset.weight.data.zero_()
self.conv_offset.bias.data.zero_()
def forward(self, x):
offset = self.conv_offset(x)
return deform_conv(x, offset, self.weight, self.stride, self.padding,
self.dilation, self.groups, self.deformable_groups)
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
version = local_metadata.get('version', None)
if version is None or version < 2:
# the key is different in early versions
# In version < 2, DeformConvPack loads previous benchmark models.
if (prefix + 'conv_offset.weight' not in state_dict
and prefix[:-1] + '_offset.weight' in state_dict):
state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
prefix[:-1] + '_offset.weight')
if (prefix + 'conv_offset.bias' not in state_dict
and prefix[:-1] + '_offset.bias' in state_dict):
state_dict[prefix +
'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
'_offset.bias')
if version is not None and version > 1:
# print_log (from mmdet) is not imported in this file; use print instead
print('DeformConvPack {} is upgraded to version 2.'.format(
prefix.rstrip('.')))
super()._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys, unexpected_keys,
error_msgs)
class ModulatedDeformConv2D(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
bias=True):
super(ModulatedDeformConv2D, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
self.stride = stride
self.padding = padding
self.dilation = dilation
self.groups = groups
self.deformable_groups = deformable_groups
self.with_bias = bias
# enable compatibility with nn.Conv2d
self.transposed = False
self.output_padding = _single(0)
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // groups,
*self.kernel_size))
if bias:
self.bias = nn.Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n)
self.weight.data.uniform_(-stdv, stdv)
if self.bias is not None:
self.bias.data.zero_()
def forward(self, x, offset, mask):
return modulated_deform_conv(x, offset, mask, self.weight, self.bias,
self.stride, self.padding, self.dilation,
self.groups, self.deformable_groups)
class ModulatedDeformConvPack2D(ModulatedDeformConv2D):
"""A ModulatedDeformable Conv Encapsulation that acts as normal Conv layers.
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
bias (bool or str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
False.
"""
_version = 2
def __init__(self, *args, **kwargs):
super(ModulatedDeformConvPack2D, self).__init__(*args, **kwargs)
self.conv_offset = nn.Conv2d(
self.in_channels,
self.deformable_groups * 3 * self.kernel_size[0] *
self.kernel_size[1],
kernel_size=self.kernel_size,
stride=_pair(self.stride),
padding=_pair(self.padding),
dilation=_pair(self.dilation),
bias=True)
self.init_offset()
def init_offset(self):
self.conv_offset.weight.data.zero_()
self.conv_offset.bias.data.zero_()
def forward(self, x):
out = self.conv_offset(x)
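# conv_offset outputs deformable_groups * 3 * kh * kw channels: two
# thirds are offset components and the last third is mask logits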
o1, o2, mask = torch.chunk(out, 3, dim=1)
offset = torch.cat((o1, o2), dim=1)
mask = torch.sigmoid(mask)
return modulated_deform_conv(x, offset, mask, self.weight, self.bias,
self.stride, self.padding, self.dilation,
self.groups, self.deformable_groups)
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
version = local_metadata.get('version', None)
if version is None or version < 2:
# the key is different in early versions
# In version < 2, ModulatedDeformConvPack
# loads previous benchmark models.
if (prefix + 'conv_offset.weight' not in state_dict
and prefix[:-1] + '_offset.weight' in state_dict):
state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
prefix[:-1] + '_offset.weight')
if (prefix + 'conv_offset.bias' not in state_dict
and prefix[:-1] + '_offset.bias' in state_dict):
state_dict[prefix +
'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
'_offset.bias')
if version is not None and version > 1:
# print_log (from mmdet) is not imported in this file; use print instead
print('ModulatedDeformConvPack {} is upgraded to version 2.'.format(
prefix.rstrip('.')))
super()._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys, unexpected_keys,
error_msgs)
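For orientation, here is a minimal usage sketch of the 2D modules above. It assumes the `deform_conv_2d_cuda` extension has been compiled and a CUDA device is available; the tensor shapes are illustrative only.

```python
# Usage sketch for the 2D deformable conv modules (assumes compiled CUDA ops).
import torch
from rodnet.ops.dcn import DeformConv2D, DeformConvPack2D, ModulatedDeformConvPack2D

x = torch.randn(2, 16, 32, 32, device='cuda')

# Pack variants predict their own offsets (and masks) internally,
# so they behave as drop-in replacements for nn.Conv2d.
dcn = DeformConvPack2D(16, 32, kernel_size=3, padding=1).cuda()
mdcn = ModulatedDeformConvPack2D(16, 32, kernel_size=3, padding=1).cuda()
y1 = dcn(x)   # -> (2, 32, 32, 32)
y2 = mdcn(x)  # -> (2, 32, 32, 32)

# The plain DeformConv2D expects an external offset map with
# deformable_groups * 2 * kh * kw channels; all-zero offsets reduce it
# to an ordinary convolution.
conv = DeformConv2D(16, 32, kernel_size=3, padding=1).cuda()
offset = torch.zeros(2, 2 * 3 * 3, 32, 32, device='cuda')
y3 = conv(x, offset)
```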
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _triple, _pair, _single
from . import deform_conv_3d_cuda
class DeformConvFunction3D(Function):
@staticmethod
def forward(ctx,
input,
offset,
weight,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
im2col_step=64):
if input is not None and input.dim() != 5:
raise ValueError(
'Expected 5D tensor as input, got {}D tensor instead.'.format(
input.dim()))
ctx.stride = _triple(stride)
ctx.padding = _triple(padding)
ctx.dilation = _triple(dilation)
ctx.groups = groups
ctx.deformable_groups = deformable_groups
ctx.im2col_step = im2col_step
ctx.save_for_backward(input, offset, weight)
output = input.new_empty(
DeformConvFunction3D._output_size(input, weight, ctx.padding,
ctx.dilation, ctx.stride))
ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones
if not input.is_cuda:
raise NotImplementedError
else:
cur_im2col_step = min(ctx.im2col_step, input.shape[0])
assert (input.shape[0] %
cur_im2col_step) == 0, 'im2col step must divide batchsize'
deform_conv_3d_cuda.deform_conv_forward_cuda(
input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1],
weight.size(4), weight.size(3), weight.size(2),
ctx.stride[2], ctx.stride[1], ctx.stride[0],
ctx.padding[2], ctx.padding[1], ctx.padding[0],
ctx.dilation[2], ctx.dilation[1], ctx.dilation[0],
ctx.groups, ctx.deformable_groups,
cur_im2col_step)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input, offset, weight = ctx.saved_tensors
grad_input = grad_offset = grad_weight = None
if not grad_output.is_cuda:
raise NotImplementedError
else:
cur_im2col_step = min(ctx.im2col_step, input.shape[0])
assert (input.shape[0] %
cur_im2col_step) == 0, 'im2col step must divide batchsize'
# needs_input_grad[0] and [1] correspond to input and offset; [2] to the kernel weights
if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
deform_conv_3d_cuda.deform_conv_backward_input_cuda(
input, offset, grad_output, grad_input,
grad_offset, weight, ctx.bufs_[0],
weight.size(4), weight.size(3), weight.size(2),
ctx.stride[2], ctx.stride[1], ctx.stride[0],
ctx.padding[2], ctx.padding[1], ctx.padding[0],
ctx.dilation[2], ctx.dilation[1], ctx.dilation[0],
ctx.groups, ctx.deformable_groups,
cur_im2col_step)
if ctx.needs_input_grad[2]:
grad_weight = torch.zeros_like(weight)
deform_conv_3d_cuda.deform_conv_backward_parameters_cuda(
input, offset, grad_output,
grad_weight, ctx.bufs_[0], ctx.bufs_[1],
weight.size(4), weight.size(3), weight.size(2),
ctx.stride[2], ctx.stride[1], ctx.stride[0],
ctx.padding[2], ctx.padding[1], ctx.padding[0],
ctx.dilation[2], ctx.dilation[1], ctx.dilation[0],
ctx.groups, ctx.deformable_groups, 1,
cur_im2col_step)
return (grad_input, grad_offset, grad_weight, None, None, None, None,
None)
@staticmethod
def _output_size(input, weight, padding, dilation, stride):
channels = weight.size(0)
output_size = (input.size(0), channels)
for d in range(input.dim() - 2):
in_size = input.size(d + 2)
pad = padding[d]
kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
stride_ = stride[d]
output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,)
if not all(map(lambda s: s > 0, output_size)):
raise ValueError(
'convolution input is too small (output would be {})'.format(
'x'.join(map(str, output_size))))
return output_size
class ModulatedDeformConvFunction3D(Function):
@staticmethod
def forward(ctx,
input,
offset,
mask,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1):
ctx.stride = stride
ctx.padding = padding
ctx.dilation = dilation
ctx.groups = groups
ctx.deformable_groups = deformable_groups
ctx.with_bias = bias is not None
if not ctx.with_bias:
bias = input.new_empty(1) # fake tensor
if not input.is_cuda:
raise NotImplementedError
if weight.requires_grad or mask.requires_grad or offset.requires_grad \
or input.requires_grad:
ctx.save_for_backward(input, offset, mask, weight, bias)
output = input.new_empty(
ModulatedDeformConvFunction3D._infer_shape(ctx, input, weight))
ctx._bufs = [input.new_empty(0), input.new_empty(0)]
deform_conv_3d_cuda.modulated_deform_conv_cuda_forward(
input, weight, bias, ctx._bufs[0], offset, mask, output,
ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride,
ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation,
ctx.groups, ctx.deformable_groups, ctx.with_bias)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
if not grad_output.is_cuda:
raise NotImplementedError
input, offset, mask, weight, bias = ctx.saved_tensors
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
grad_mask = torch.zeros_like(mask)
grad_weight = torch.zeros_like(weight)
grad_bias = torch.zeros_like(bias)
deform_conv_3d_cuda.modulated_deform_conv_cuda_backward(
input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1],
grad_input, grad_weight, grad_bias, grad_offset, grad_mask,
grad_output, weight.shape[2], weight.shape[3], ctx.stride,
ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation,
ctx.groups, ctx.deformable_groups, ctx.with_bias)
if not ctx.with_bias:
grad_bias = None
return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
None, None, None, None, None)
@staticmethod
def _infer_shape(ctx, input, weight):
n = input.size(0)
channels_out = weight.size(0)
height, width = input.shape[2:4]
kernel_h, kernel_w = weight.shape[2:4]
height_out = (height + 2 * ctx.padding -
(ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1
width_out = (width + 2 * ctx.padding -
(ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1
return n, channels_out, height_out, width_out
deform_conv = DeformConvFunction3D.apply
modulated_deform_conv = ModulatedDeformConvFunction3D.apply
class DeformConv3D(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
bias=False):
super(DeformConv3D, self).__init__()
assert not bias
assert in_channels % groups == 0, \
'in_channels {} is not divisible by groups {}'.format(
in_channels, groups)
assert out_channels % groups == 0, \
'out_channels {} is not divisible by groups {}'.format(
out_channels, groups)
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _triple(kernel_size)
self.stride = _triple(stride)
self.padding = _triple(padding)
self.dilation = _triple(dilation)
self.groups = groups
self.deformable_groups = deformable_groups
# enable compatibility with nn.Conv2d
self.transposed = False
self.output_padding = _single(0)
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // self.groups,
*self.kernel_size))
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n)
self.weight.data.uniform_(-stdv, stdv)
def forward(self, x, offset):
# Pad the input when its spatial size is smaller than the kernel,
# to avoid the assert error in deform_conv_cuda.cpp:128.
# TODO: the temporal dimension is not yet checked or padded here
input_pad = (
x.size(3) < self.kernel_size[1] or x.size(4) < self.kernel_size[2])
if input_pad:
pad_h = max(self.kernel_size[1] - x.size(3), 0)
pad_w = max(self.kernel_size[2] - x.size(4), 0)
x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous()
offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant',
0).contiguous()
out = deform_conv(x, offset, self.weight, self.stride, self.padding,
self.dilation, self.groups, self.deformable_groups)
if input_pad:
out = out[:, :, :, :out.size(3) - pad_h, :out.size(4) -
pad_w].contiguous()
return out
class DeformConvPack3D(DeformConv3D):
"""A Deformable Conv Encapsulation that acts as normal Conv layers.
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
bias (bool or str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
False.
"""
_version = 2
def __init__(self, *args, **kwargs):
super(DeformConvPack3D, self).__init__(*args, **kwargs)
self.conv_offset = nn.Conv3d(
self.in_channels,
self.deformable_groups * 2 * self.kernel_size[0] *
self.kernel_size[1] * self.kernel_size[2],
kernel_size=self.kernel_size,
stride=_triple(self.stride),
padding=_triple(self.padding),
dilation=_triple(self.dilation),
bias=True)
self.init_offset()
def init_offset(self):
self.conv_offset.weight.data.zero_()
self.conv_offset.bias.data.zero_()
def forward(self, x):
offset = self.conv_offset(x)
return deform_conv(x, offset, self.weight, self.stride, self.padding,
self.dilation, self.groups, self.deformable_groups)
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
version = local_metadata.get('version', None)
if version is None or version < 2:
# the key is different in early versions
# In version < 2, DeformConvPack loads previous benchmark models.
# TODO: check here
if (prefix + 'conv_offset.weight' not in state_dict
and prefix[:-1] + '_offset.weight' in state_dict):
state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
prefix[:-1] + '_offset.weight')
if (prefix + 'conv_offset.bias' not in state_dict
and prefix[:-1] + '_offset.bias' in state_dict):
state_dict[prefix +
'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
'_offset.bias')
super()._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys, unexpected_keys,
error_msgs)
class ModulatedDeformConv3D(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
bias=True):
super(ModulatedDeformConv3D, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
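# NOTE: the modulated 3D variant keeps the 2D-style (h, w) kernel and
# offset handling; see ModulatedDeformConvFunction3D._infer_shape above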
self.stride = stride
self.padding = padding
self.dilation = dilation
self.groups = groups
self.deformable_groups = deformable_groups
self.with_bias = bias
# enable compatibility with nn.Conv2d
self.transposed = False
self.output_padding = _single(0)
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // groups,
*self.kernel_size))
if bias:
self.bias = nn.Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n)
self.weight.data.uniform_(-stdv, stdv)
if self.bias is not None:
self.bias.data.zero_()
def forward(self, x, offset, mask):
return modulated_deform_conv(x, offset, mask, self.weight, self.bias,
self.stride, self.padding, self.dilation,
self.groups, self.deformable_groups)
class ModulatedDeformConvPack3D(ModulatedDeformConv3D):
"""A ModulatedDeformable Conv Encapsulation that acts as normal Conv layers.
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
bias (bool or str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
False.
"""
_version = 2
def __init__(self, *args, **kwargs):
super(ModulatedDeformConvPack3D, self).__init__(*args, **kwargs)
self.conv_offset = nn.Conv2d(
self.in_channels,
self.deformable_groups * 3 * self.kernel_size[0] *
self.kernel_size[1],
kernel_size=self.kernel_size,
stride=_pair(self.stride),
padding=_pair(self.padding),
dilation=_pair(self.dilation),
bias=True)
self.init_offset()
def init_offset(self):
self.conv_offset.weight.data.zero_()
self.conv_offset.bias.data.zero_()
def forward(self, x):
out = self.conv_offset(x)
o1, o2, mask = torch.chunk(out, 3, dim=1)
offset = torch.cat((o1, o2), dim=1)
mask = torch.sigmoid(mask)
return modulated_deform_conv(x, offset, mask, self.weight, self.bias,
self.stride, self.padding, self.dilation,
self.groups, self.deformable_groups)
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
version = local_metadata.get('version', None)
if version is None or version < 2:
# the key is different in early versions
# In version < 2, ModulatedDeformConvPack
# loads previous benchmark models.
if (prefix + 'conv_offset.weight' not in state_dict
and prefix[:-1] + '_offset.weight' in state_dict):
state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
prefix[:-1] + '_offset.weight')
if (prefix + 'conv_offset.bias' not in state_dict
and prefix[:-1] + '_offset.bias' in state_dict):
state_dict[prefix +
'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
'_offset.bias')
if version is not None and version > 1:
# print_log (from mmdet) is not imported in this file; use print instead
print('ModulatedDeformConvPack {} is upgraded to version 2.'.format(
prefix.rstrip('.')))
super()._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys, unexpected_keys,
error_msgs)
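A corresponding sketch for the 3D (temporal deformable) module. Note that, per the `__init__` above, `DeformConvPack3D` predicts `deformable_groups * 2 * kt * kh * kw` offset channels with an `nn.Conv3d`; the shapes below are assumptions.

```python
# Usage sketch for DeformConvPack3D (assumes compiled deform_conv_3d_cuda + GPU).
import torch
from rodnet.ops.dcn import DeformConvPack3D

x = torch.randn(2, 16, 8, 32, 32, device='cuda')  # (N, C, T, H, W)
dcn3d = DeformConvPack3D(16, 32, kernel_size=3, padding=1).cuda()
y = dcn3d(x)  # -> (2, 32, 8, 32, 32) with stride 1 and padding 1
```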
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from . import deform_pool_2d_cuda
class DeformRoIPoolingFunction2D(Function):
@staticmethod
def forward(ctx,
data,
rois,
offset,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0):
# TODO: support non-square RoIs
out_h, out_w = _pair(out_size)
assert isinstance(out_h, int) and isinstance(out_w, int)
assert out_h == out_w
out_size = out_h # out_h and out_w must be equal
ctx.spatial_scale = spatial_scale
ctx.out_size = out_size
ctx.out_channels = out_channels
ctx.no_trans = no_trans
ctx.group_size = group_size
ctx.part_size = out_size if part_size is None else part_size
ctx.sample_per_part = sample_per_part
ctx.trans_std = trans_std
assert 0.0 <= ctx.trans_std <= 1.0
if not data.is_cuda:
raise NotImplementedError
n = rois.shape[0]
output = data.new_empty(n, out_channels, out_size, out_size)
output_count = data.new_empty(n, out_channels, out_size, out_size)
deform_pool_2d_cuda.deform_psroi_pooling_cuda_forward(
data, rois, offset, output, output_count, ctx.no_trans,
ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size,
ctx.part_size, ctx.sample_per_part, ctx.trans_std)
if data.requires_grad or rois.requires_grad or offset.requires_grad:
ctx.save_for_backward(data, rois, offset)
ctx.output_count = output_count
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
if not grad_output.is_cuda:
raise NotImplementedError
data, rois, offset = ctx.saved_tensors
output_count = ctx.output_count
grad_input = torch.zeros_like(data)
grad_rois = None
grad_offset = torch.zeros_like(offset)
deform_pool_2d_cuda.deform_psroi_pooling_cuda_backward(
grad_output, data, rois, offset, output_count, grad_input,
grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels,
ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part,
ctx.trans_std)
return (grad_input, grad_rois, grad_offset, None, None, None, None,
None, None, None, None)
deform_roi_pooling = DeformRoIPoolingFunction2D.apply
class DeformRoIPooling2D(nn.Module):
def __init__(self,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0):
super(DeformRoIPooling2D, self).__init__()
self.spatial_scale = spatial_scale
self.out_size = _pair(out_size)
self.out_channels = out_channels
self.no_trans = no_trans
self.group_size = group_size
self.part_size = out_size if part_size is None else part_size
self.sample_per_part = sample_per_part
self.trans_std = trans_std
def forward(self, data, rois, offset):
if self.no_trans:
offset = data.new_empty(0)
return deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels,
self.no_trans, self.group_size,
self.part_size, self.sample_per_part,
self.trans_std)
class DeformRoIPoolingPack2D(DeformRoIPooling2D):
def __init__(self,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0,
num_offset_fcs=3,
deform_fc_channels=1024):
super(DeformRoIPoolingPack2D,
self).__init__(spatial_scale, out_size, out_channels, no_trans,
group_size, part_size, sample_per_part, trans_std)
self.num_offset_fcs = num_offset_fcs
self.deform_fc_channels = deform_fc_channels
if not no_trans:
seq = []
ic = self.out_size[0] * self.out_size[1] * self.out_channels
for i in range(self.num_offset_fcs):
if i < self.num_offset_fcs - 1:
oc = self.deform_fc_channels
else:
oc = self.out_size[0] * self.out_size[1] * 2
seq.append(nn.Linear(ic, oc))
ic = oc
if i < self.num_offset_fcs - 1:
seq.append(nn.ReLU(inplace=True))
self.offset_fc = nn.Sequential(*seq)
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
def forward(self, data, rois):
assert data.size(1) == self.out_channels
n = rois.shape[0]
if n == 0:
return data.new_empty(n, self.out_channels, self.out_size[0],
self.out_size[1])
if self.no_trans:
offset = data.new_empty(0)
return deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels,
self.no_trans, self.group_size,
self.part_size, self.sample_per_part,
self.trans_std)
else:
offset = data.new_empty(0)
x = deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels, True,
self.group_size, self.part_size,
self.sample_per_part, self.trans_std)
offset = self.offset_fc(x.view(n, -1))
offset = offset.view(n, 2, self.out_size[0], self.out_size[1])
return deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels,
self.no_trans, self.group_size,
self.part_size, self.sample_per_part,
self.trans_std)
class ModulatedDeformRoIPoolingPack2D(DeformRoIPooling2D):
def __init__(self,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0,
num_offset_fcs=3,
num_mask_fcs=2,
deform_fc_channels=1024):
super(ModulatedDeformRoIPoolingPack2D,
self).__init__(spatial_scale, out_size, out_channels, no_trans,
group_size, part_size, sample_per_part, trans_std)
self.num_offset_fcs = num_offset_fcs
self.num_mask_fcs = num_mask_fcs
self.deform_fc_channels = deform_fc_channels
if not no_trans:
offset_fc_seq = []
ic = self.out_size[0] * self.out_size[1] * self.out_channels
for i in range(self.num_offset_fcs):
if i < self.num_offset_fcs - 1:
oc = self.deform_fc_channels
else:
oc = self.out_size[0] * self.out_size[1] * 2
offset_fc_seq.append(nn.Linear(ic, oc))
ic = oc
if i < self.num_offset_fcs - 1:
offset_fc_seq.append(nn.ReLU(inplace=True))
self.offset_fc = nn.Sequential(*offset_fc_seq)
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
mask_fc_seq = []
ic = self.out_size[0] * self.out_size[1] * self.out_channels
for i in range(self.num_mask_fcs):
if i < self.num_mask_fcs - 1:
oc = self.deform_fc_channels
else:
oc = self.out_size[0] * self.out_size[1]
mask_fc_seq.append(nn.Linear(ic, oc))
ic = oc
if i < self.num_mask_fcs - 1:
mask_fc_seq.append(nn.ReLU(inplace=True))
else:
mask_fc_seq.append(nn.Sigmoid())
self.mask_fc = nn.Sequential(*mask_fc_seq)
self.mask_fc[-2].weight.data.zero_()
self.mask_fc[-2].bias.data.zero_()
def forward(self, data, rois):
assert data.size(1) == self.out_channels
n = rois.shape[0]
if n == 0:
return data.new_empty(n, self.out_channels, self.out_size[0],
self.out_size[1])
if self.no_trans:
offset = data.new_empty(0)
return deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels,
self.no_trans, self.group_size,
self.part_size, self.sample_per_part,
self.trans_std)
else:
offset = data.new_empty(0)
x = deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels, True,
self.group_size, self.part_size,
self.sample_per_part, self.trans_std)
offset = self.offset_fc(x.view(n, -1))
offset = offset.view(n, 2, self.out_size[0], self.out_size[1])
mask = self.mask_fc(x.view(n, -1))
mask = mask.view(n, 1, self.out_size[0], self.out_size[1])
return deform_roi_pooling(
data, rois, offset, self.spatial_scale, self.out_size,
self.out_channels, self.no_trans, self.group_size,
self.part_size, self.sample_per_part, self.trans_std) * mask
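A hedged usage sketch for the 2D deformable RoI pooling pack. The RoI format `(batch_idx, x1, y1, x2, y2)` and the `spatial_scale` value are assumptions following the usual detection convention; note that `forward` above asserts the input channel count equals `out_channels`.

```python
# Usage sketch (assumes compiled deform_pool_2d_cuda extension and a GPU).
import torch
from rodnet.ops.dcn import ModulatedDeformRoIPoolingPack2D

pool = ModulatedDeformRoIPoolingPack2D(
    spatial_scale=1.0 / 16,  # assumed feature-map stride
    out_size=7,
    out_channels=256,
    no_trans=False).cuda()

feat = torch.randn(1, 256, 50, 50, device='cuda')
# rois: (batch_idx, x1, y1, x2, y2) in input-image coordinates (assumed format)
rois = torch.tensor([[0., 0., 0., 320., 320.]], device='cuda')
out = pool(feat, rois)  # -> (1, 256, 7, 7)
```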
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from . import deform_pool_3d_cuda
class DeformRoIPoolingFunction3D(Function):
@staticmethod
def forward(ctx,
data,
rois,
offset,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0):
# TODO: support non-square RoIs
out_h, out_w = _pair(out_size)
assert isinstance(out_h, int) and isinstance(out_w, int)
assert out_h == out_w
out_size = out_h # out_h and out_w must be equal
ctx.spatial_scale = spatial_scale
ctx.out_size = out_size
ctx.out_channels = out_channels
ctx.no_trans = no_trans
ctx.group_size = group_size
ctx.part_size = out_size if part_size is None else part_size
ctx.sample_per_part = sample_per_part
ctx.trans_std = trans_std
assert 0.0 <= ctx.trans_std <= 1.0
if not data.is_cuda:
raise NotImplementedError
n = rois.shape[0]
output = data.new_empty(n, out_channels, out_size, out_size)
output_count = data.new_empty(n, out_channels, out_size, out_size)
deform_pool_3d_cuda.deform_psroi_pooling_cuda_forward(
data, rois, offset, output, output_count, ctx.no_trans,
ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size,
ctx.part_size, ctx.sample_per_part, ctx.trans_std)
if data.requires_grad or rois.requires_grad or offset.requires_grad:
ctx.save_for_backward(data, rois, offset)
ctx.output_count = output_count
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
if not grad_output.is_cuda:
raise NotImplementedError
data, rois, offset = ctx.saved_tensors
output_count = ctx.output_count
grad_input = torch.zeros_like(data)
grad_rois = None
grad_offset = torch.zeros_like(offset)
deform_pool_3d_cuda.deform_psroi_pooling_cuda_backward(
grad_output, data, rois, offset, output_count, grad_input,
grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels,
ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part,
ctx.trans_std)
return (grad_input, grad_rois, grad_offset, None, None, None, None,
None, None, None, None)
deform_roi_pooling = DeformRoIPoolingFunction3D.apply
class DeformRoIPooling3D(nn.Module):
def __init__(self,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0):
super(DeformRoIPooling3D, self).__init__()
self.spatial_scale = spatial_scale
self.out_size = _pair(out_size)
self.out_channels = out_channels
self.no_trans = no_trans
self.group_size = group_size
self.part_size = out_size if part_size is None else part_size
self.sample_per_part = sample_per_part
self.trans_std = trans_std
def forward(self, data, rois, offset):
if self.no_trans:
offset = data.new_empty(0)
return deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels,
self.no_trans, self.group_size,
self.part_size, self.sample_per_part,
self.trans_std)
class DeformRoIPoolingPack3D(DeformRoIPooling3D):
def __init__(self,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0,
num_offset_fcs=3,
deform_fc_channels=1024):
super(DeformRoIPoolingPack3D,
self).__init__(spatial_scale, out_size, out_channels, no_trans,
group_size, part_size, sample_per_part, trans_std)
self.num_offset_fcs = num_offset_fcs
self.deform_fc_channels = deform_fc_channels
if not no_trans:
seq = []
ic = self.out_size[0] * self.out_size[1] * self.out_channels
for i in range(self.num_offset_fcs):
if i < self.num_offset_fcs - 1:
oc = self.deform_fc_channels
else:
oc = self.out_size[0] * self.out_size[1] * 2
seq.append(nn.Linear(ic, oc))
ic = oc
if i < self.num_offset_fcs - 1:
seq.append(nn.ReLU(inplace=True))
self.offset_fc = nn.Sequential(*seq)
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
def forward(self, data, rois):
assert data.size(1) == self.out_channels
n = rois.shape[0]
if n == 0:
return data.new_empty(n, self.out_channels, self.out_size[0],
self.out_size[1])
if self.no_trans:
offset = data.new_empty(0)
return deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels,
self.no_trans, self.group_size,
self.part_size, self.sample_per_part,
self.trans_std)
else:
offset = data.new_empty(0)
x = deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels, True,
self.group_size, self.part_size,
self.sample_per_part, self.trans_std)
offset = self.offset_fc(x.view(n, -1))
offset = offset.view(n, 2, self.out_size[0], self.out_size[1])
return deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels,
self.no_trans, self.group_size,
self.part_size, self.sample_per_part,
self.trans_std)
class ModulatedDeformRoIPoolingPack3D(DeformRoIPooling3D):
def __init__(self,
spatial_scale,
out_size,
out_channels,
no_trans,
group_size=1,
part_size=None,
sample_per_part=4,
trans_std=.0,
num_offset_fcs=3,
num_mask_fcs=2,
deform_fc_channels=1024):
super(ModulatedDeformRoIPoolingPack3D,
self).__init__(spatial_scale, out_size, out_channels, no_trans,
group_size, part_size, sample_per_part, trans_std)
self.num_offset_fcs = num_offset_fcs
self.num_mask_fcs = num_mask_fcs
self.deform_fc_channels = deform_fc_channels
if not no_trans:
offset_fc_seq = []
ic = self.out_size[0] * self.out_size[1] * self.out_channels
for i in range(self.num_offset_fcs):
if i < self.num_offset_fcs - 1:
oc = self.deform_fc_channels
else:
oc = self.out_size[0] * self.out_size[1] * 2
offset_fc_seq.append(nn.Linear(ic, oc))
ic = oc
if i < self.num_offset_fcs - 1:
offset_fc_seq.append(nn.ReLU(inplace=True))
self.offset_fc = nn.Sequential(*offset_fc_seq)
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
mask_fc_seq = []
ic = self.out_size[0] * self.out_size[1] * self.out_channels
for i in range(self.num_mask_fcs):
if i < self.num_mask_fcs - 1:
oc = self.deform_fc_channels
else:
oc = self.out_size[0] * self.out_size[1]
mask_fc_seq.append(nn.Linear(ic, oc))
ic = oc
if i < self.num_mask_fcs - 1:
mask_fc_seq.append(nn.ReLU(inplace=True))
else:
mask_fc_seq.append(nn.Sigmoid())
self.mask_fc = nn.Sequential(*mask_fc_seq)
self.mask_fc[-2].weight.data.zero_()
self.mask_fc[-2].bias.data.zero_()
def forward(self, data, rois):
assert data.size(1) == self.out_channels
n = rois.shape[0]
if n == 0:
return data.new_empty(n, self.out_channels, self.out_size[0],
self.out_size[1])
if self.no_trans:
offset = data.new_empty(0)
return deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels,
self.no_trans, self.group_size,
self.part_size, self.sample_per_part,
self.trans_std)
else:
offset = data.new_empty(0)
x = deform_roi_pooling(data, rois, offset, self.spatial_scale,
self.out_size, self.out_channels, True,
self.group_size, self.part_size,
self.sample_per_part, self.trans_std)
offset = self.offset_fc(x.view(n, -1))
offset = offset.view(n, 2, self.out_size[0], self.out_size[1])
mask = self.mask_fc(x.view(n, -1))
mask = mask.view(n, 1, self.out_size[0], self.out_size[1])
return deform_roi_pooling(
data, rois, offset, self.spatial_scale, self.out_size,
self.out_channels, self.no_trans, self.group_size,
self.part_size, self.sample_per_part, self.trans_std) * mask
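The 3D pooling file mirrors its 2D counterpart almost line for line, and its exports are still commented out in `__init__.py` above. For reference, a sketch of the offset/mask FC heads that both Pack classes construct, using the default `num_offset_fcs=3`, `num_mask_fcs=2` and assumed sizes:

```python
# Equivalent shape of the offset/mask heads built in the Pack classes above
# (out_size=7, out_channels=256, deform_fc_channels=1024 are assumptions).
import torch.nn as nn

out_size, out_channels, fc = 7, 256, 1024
ic = out_size * out_size * out_channels  # flattened pooled feature: 12544

offset_fc = nn.Sequential(                   # num_offset_fcs = 3
    nn.Linear(ic, fc), nn.ReLU(inplace=True),
    nn.Linear(fc, fc), nn.ReLU(inplace=True),
    nn.Linear(fc, out_size * out_size * 2))  # 2 offset values per output bin

mask_fc = nn.Sequential(                     # num_mask_fcs = 2
    nn.Linear(ic, fc), nn.ReLU(inplace=True),
    nn.Linear(fc, out_size * out_size),      # 1 mask value per bin
    nn.Sigmoid())
```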
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c
// based on
// author: Charles Shang
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
#include <torch/extension.h>
#include <ATen/DeviceGuard.h>
#include <cmath>
#include <vector>
void DeformablePSROIPoolForward(
const at::Tensor data, const at::Tensor bbox, const at::Tensor trans,
at::Tensor out, at::Tensor top_count, const int batch, const int channels,
const int height, const int width, const int num_bbox,
const int channels_trans, const int no_trans, const float spatial_scale,
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std);
void DeformablePSROIPoolBackwardAcc(
const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox,
const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad,
at::Tensor trans_grad, const int batch, const int channels,
const int height, const int width, const int num_bbox,
const int channels_trans, const int no_trans, const float spatial_scale,
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std);
void deform_psroi_pooling_cuda_forward(
at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out,
at::Tensor top_count, const int no_trans, const float spatial_scale,
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std) {
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_trans = no_trans ? 2 : trans.size(1);
const int num_bbox = bbox.size(0);
if (num_bbox != out.size(0))
AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
out.size(0), num_bbox);
DeformablePSROIPoolForward(
input, bbox, trans, out, top_count, batch, channels, height, width,
num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size,
pooled_size, part_size, sample_per_part, trans_std);
}
void deform_psroi_pooling_cuda_backward(
at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans,
at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad,
const int no_trans, const float spatial_scale, const int output_dim,
const int group_size, const int pooled_size, const int part_size,
const int sample_per_part, const float trans_std) {
AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_trans = no_trans ? 2 : trans.size(1);
const int num_bbox = bbox.size(0);
if (num_bbox != out_grad.size(0))
AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
out_grad.size(0), num_bbox);
DeformablePSROIPoolBackwardAcc(
out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch,
channels, height, width, num_bbox, channels_trans, no_trans,
spatial_scale, output_dim, group_size, pooled_size, part_size,
sample_per_part, trans_std);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward,
"deform psroi pooling forward(CUDA)");
m.def("deform_psroi_pooling_cuda_backward",
&deform_psroi_pooling_cuda_backward,
"deform psroi pooling backward(CUDA)");
}
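These bindings are normally built ahead of time by `setup.py` (shown later in this diff), but for quick experiments the same sources could be JIT-compiled; a sketch using `torch.utils.cpp_extension.load`, with the source paths assumed from the extension definitions below:

```python
# JIT-compile sketch (source paths are assumptions based on setup.py below).
from torch.utils.cpp_extension import load

deform_pool_2d_cuda = load(
    name='deform_pool_2d_cuda',
    sources=[
        'rodnet/ops/dcn/src/deform_pool_2d_cuda.cpp',
        'rodnet/ops/dcn/src/deform_pool_2d_cuda_kernel.cu',
    ],
    verbose=True)

# The module then exposes the two functions bound via PYBIND11_MODULE above:
#   deform_pool_2d_cuda.deform_psroi_pooling_cuda_forward(...)
#   deform_pool_2d_cuda.deform_psroi_pooling_cuda_backward(...)
```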
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c
// based on
// author: Charles Shang
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
#include <torch/extension.h>
#include <ATen/DeviceGuard.h>
#include <cmath>
#include <vector>
void DeformablePSROIPoolForward(
const at::Tensor data, const at::Tensor bbox, const at::Tensor trans,
at::Tensor out, at::Tensor top_count, const int batch, const int channels,
const int height, const int width, const int num_bbox,
const int channels_trans, const int no_trans, const float spatial_scale,
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std);
void DeformablePSROIPoolBackwardAcc(
const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox,
const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad,
at::Tensor trans_grad, const int batch, const int channels,
const int height, const int width, const int num_bbox,
const int channels_trans, const int no_trans, const float spatial_scale,
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std);
void deform_psroi_pooling_cuda_forward(
at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out,
at::Tensor top_count, const int no_trans, const float spatial_scale,
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std) {
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_trans = no_trans ? 2 : trans.size(1);
const int num_bbox = bbox.size(0);
if (num_bbox != out.size(0))
AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
out.size(0), num_bbox);
DeformablePSROIPoolForward(
input, bbox, trans, out, top_count, batch, channels, height, width,
num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size,
pooled_size, part_size, sample_per_part, trans_std);
}
void deform_psroi_pooling_cuda_backward(
at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans,
at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad,
const int no_trans, const float spatial_scale, const int output_dim,
const int group_size, const int pooled_size, const int part_size,
const int sample_per_part, const float trans_std) {
AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_trans = no_trans ? 2 : trans.size(1);
const int num_bbox = bbox.size(0);
if (num_bbox != out_grad.size(0))
AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
out_grad.size(0), num_bbox);
DeformablePSROIPoolBackwardAcc(
out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch,
channels, height, width, num_bbox, channels_trans, no_trans,
spatial_scale, output_dim, group_size, pooled_size, part_size,
sample_per_part, trans_std);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward,
"deform psroi pooling forward(CUDA)");
m.def("deform_psroi_pooling_cuda_backward",
&deform_psroi_pooling_cuda_backward,
"deform psroi pooling backward(CUDA)");
}
@@ -18,3 +18,13 @@ def load_configs_from_file(config_path):
if not name.startswith('__')
}
return cfg_dict
def update_config_dict(config_dict, args):
data_root_old = config_dict['dataset_cfg']['base_root']
config_dict['dataset_cfg']['base_root'] = args.data_root
config_dict['dataset_cfg']['data_root'] = config_dict['dataset_cfg']['data_root'].replace(data_root_old,
args.data_root)
config_dict['dataset_cfg']['anno_root'] = config_dict['dataset_cfg']['anno_root'].replace(data_root_old,
args.data_root)
return config_dict
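To illustrate, `update_config_dict` rewrites every path that starts with the old `base_root`; a small sketch with hypothetical values:

```python
# Hypothetical illustration of update_config_dict (values are made up).
from types import SimpleNamespace

config_dict = {'dataset_cfg': {
    'base_root': '/old/data',
    'data_root': '/old/data/sequences',
    'anno_root': '/old/data/annotations',
}}
args = SimpleNamespace(data_root='/new/data')

config_dict = update_config_dict(config_dict, args)
# dataset_cfg now reads:
#   base_root -> '/new/data'
#   data_root -> '/new/data/sequences'
#   anno_root -> '/new/data/annotations'
```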
@@ -6,6 +6,7 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
os.environ['CFLAGS'] = '-Wno-deprecated-declarations'  # suppress warnings in debug mode
def readme():
with open('README.md', encoding='utf-8') as f:
content = f.read()
@@ -44,7 +45,7 @@ def make_cuda_ext(name, module, sources):
if __name__ == '__main__':
setup(
name='rodnet',
version='1.1',
description='RODNet: Object Detection from Radar Data',
long_description=readme(),
long_description_content_type='text/markdown',
@@ -77,7 +78,39 @@ if __name__ == '__main__':
keywords='rodnet, object detection, radar, autonomous driving',
packages=find_packages(include=["rodnet.*"]),
package_data={'rodnet.ops': ['*/*.so']},
python_requires='>=3.6',
install_requires=get_requirements(),
ext_modules=[
make_cuda_ext(
name='deform_conv_2d_cuda',
module='rodnet.ops.dcn',
sources=[
'src/deform_conv_2d_cuda.cpp',
'src/deform_conv_2d_cuda_kernel.cu'
]),
make_cuda_ext(
name='deform_conv_3d_cuda',
module='rodnet.ops.dcn',
sources=[
'src/deform_conv_3d_cuda.cpp',
'src/deform_conv_3d_cuda_kernel.cu'
]),
make_cuda_ext(
name='deform_pool_2d_cuda',
module='rodnet.ops.dcn',
sources=[
'src/deform_pool_2d_cuda.cpp',
'src/deform_pool_2d_cuda_kernel.cu'
]),
make_cuda_ext(
name='deform_pool_3d_cuda',
module='rodnet.ops.dcn',
sources=[
'src/deform_pool_3d_cuda.cpp',
'src/deform_pool_3d_cuda_kernel.cu'
]),
],
cmdclass={'build_ext': BuildExtension},
zip_safe=False
)
import os
from setuptools import setup, find_packages
import torch
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
os.environ['CFLAGS'] = '-Wno-deprecated-declarations' # suppress warnings in debug mode
def readme():
with open('README.md', encoding='utf-8') as f:
content = f.read()
return content
def get_requirements(filename='requirements.txt'):
here = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(here, filename), 'r') as f:
requires = [line.replace('\n', '') for line in f.readlines()]
return requires
def make_cuda_ext(name, module, sources):
define_macros = []
if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
define_macros += [('WITH_CUDA', None)]
else:
raise EnvironmentError('CUDA is required to compile RODNet!')
return CUDAExtension(
name='{}.{}'.format(module, name),
sources=[os.path.join(*module.split('.'), p) for p in sources],
define_macros=define_macros,
extra_compile_args={
'cxx': [],
'nvcc': [
'-D__CUDA_NO_HALF_OPERATORS__',
'-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
]
})
if __name__ == '__main__':
setup(
name='rodnet',
version='1.1',
description='RODNet: Object Detection from Radar Data',
long_description=readme(),
long_description_content_type='text/markdown',
url='https://github.com/yizhou-wang/RODNet',
author='Yizhou Wang',
author_email='ywang26@uw.edu',
classifiers=[
# How mature is this project? Common values are
# 3 - Alpha
# 4 - Beta
# 5 - Production/Stable
'Development Status :: 3 - Alpha',
# Indicate who your project is intended for
'Intended Audience :: Developers',
'Topic :: Software Development :: Build Tools',
# Pick your license as you wish
'License :: OSI Approved :: MIT License',
# Specify the Python versions you support here. In particular, ensure
# that you indicate whether you support Python 2, Python 3 or both.
# These classifiers are *not* checked by 'pip install'. See instead
# 'python_requires' below.
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
],
keywords='rodnet, object detection, radar, autonomous driving',
packages=find_packages(include=["rodnet.*"]),
# package_data={'rodnet.ops': ['*/*.so']},
python_requires='>=3.6',
install_requires=get_requirements(),
# ext_modules=[
# make_cuda_ext(
# name='deform_conv_2d_cuda',
# module='rodnet.ops.dcn',
# sources=[
# 'src/deform_conv_2d_cuda.cpp',
# 'src/deform_conv_2d_cuda_kernel.cu'
# ]),
# make_cuda_ext(
# name='deform_conv_3d_cuda',
# module='rodnet.ops.dcn',
# sources=[
# 'src/deform_conv_3d_cuda.cpp',
# 'src/deform_conv_3d_cuda_kernel.cu'
# ]),
# make_cuda_ext(
# name='deform_pool_2d_cuda',
# module='rodnet.ops.dcn',
# sources=[
# 'src/deform_pool_2d_cuda.cpp',
# 'src/deform_pool_2d_cuda_kernel.cu'
# ]),
# make_cuda_ext(
# name='deform_pool_3d_cuda',
# module='rodnet.ops.dcn',
# sources=[
# 'src/deform_pool_3d_cuda.cpp',
# 'src/deform_pool_3d_cuda_kernel.cu'
# ]),
# ],
# cmdclass={'build_ext': BuildExtension},
zip_safe=False
)
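With the `ext_modules` list enabled (as in the diff above; the copy directly above keeps it commented out), the ops compile through the standard PyTorch extension flow. A small pre-build environment check, with the shell commands left as comments:

```python
# Sanity check before compiling the CUDA ops (a sketch; adjust as needed).
import torch

assert torch.cuda.is_available(), 'CUDA is required to compile RODNet!'
print('torch CUDA version:', torch.version.cuda)
print('device:', torch.cuda.get_device_name(0))
# Then, from the repository root:
#   pip install -e .        # or: python setup.py develop
```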
@@ -11,7 +11,7 @@ from cruw.annotation.init_json import init_meta_json
from cruw.mapping import ra2idx
from rodnet.core.confidence_map import generate_confmap, normalize_confmap, add_noise_channel
from rodnet.utils.load_configs import load_configs_from_file, update_config_dict
from rodnet.utils.visualization import visualize_confmap
SPLITS_LIST = ['train', 'valid', 'test', 'demo']
@@ -20,7 +20,8 @@ SPLITS_LIST = ['train', 'valid', 'test', 'demo']
def parse_args():
parser = argparse.ArgumentParser(description='Prepare RODNet data.')
parser.add_argument('--config', type=str, dest='config', help='configuration file path')
parser.add_argument('--data_root', type=str,
help='directory to the dataset (will overwrite data_root in config file)')
parser.add_argument('--sensor_config', type=str, default='sensor_config_rod2021')
parser.add_argument('--split', type=str, dest='split', default='',
help='choose from train, valid, test, supertest')
@@ -220,6 +221,7 @@ if __name__ == "__main__":
dataset = CRUW(data_root=data_root, sensor_config_name=args.sensor_config)
config_dict = load_configs_from_file(args.config)
config_dict = update_config_dict(config_dict, args) # update configs by args
radar_configs = dataset.sensor_cfg.radar_cfg
if splits == None:
...